# Use TTH_Template Custom Notebook template

In [3]:
import datetime, warnings
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, classification_report

pd.options.display.max_columns = 50
warnings.filterwarnings("ignore")

from fosforml.model_manager.snowflakesession import get_session
from fosforml import register_model

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


ModuleNotFoundError: No module named 'matplotlib'

# Read data from Snowflake

In [2]:
my_session = get_session()

In [3]:
table_name = 'FLIGHTS'

sf_df = my_session.sql("select * from {}".format(table_name))
df = sf_df.to_pandas()

In [4]:
flights = df.copy()
flights_needed_data = df.copy()

In [5]:
flights_needed_data.shape

(5819079, 31)

In [6]:
flights_needed_data = flights[0:100000]  # getting a segment 

In [7]:
flights_needed_data

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE__CODE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT_CODE,DESTINATION_AIRPORT_CODE,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,6,11,4,OO,6213,N727SK,CVG,ORD,747,742.0,-5.0,13.0,755.0,76.0,71.0,52.0,265,747.0,6.0,803,753.0,-10.0,0,0,,,,,,
1,2015,6,11,4,MQ,3487,N645MQ,RAP,DFW,748,740.0,-8.0,15.0,755.0,141.0,131.0,111.0,835,1046.0,5.0,1109,1051.0,-18.0,0,0,,,,,,
2,2015,6,11,4,EV,6017,N16546,ORD,FNT,749,815.0,26.0,19.0,834.0,66.0,64.0,41.0,223,1015.0,4.0,955,1019.0,24.0,0,0,,0.0,0.0,24.0,0.0,0.0
3,2015,6,11,4,UA,1548,N36469,SEA,IAH,749,755.0,6.0,22.0,817.0,256.0,247.0,221.0,1874,1358.0,4.0,1405,1402.0,-3.0,0,0,,,,,,
4,2015,6,11,4,AA,42,N3JTAA,SEA,ORD,750,756.0,6.0,16.0,812.0,242.0,235.0,206.0,1721,1338.0,13.0,1352,1351.0,-1.0,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2015,6,17,3,F9,915,N939FR,TTN,ORD,700,821.0,81.0,16.0,837.0,130.0,130.0,103.0,693,920.0,11.0,810,931.0,81.0,0,0,,0.0,0.0,81.0,0.0,0.0
99996,2015,6,17,3,HA,110,N481HA,LIH,HNL,700,653.0,-7.0,8.0,701.0,32.0,34.0,20.0,102,721.0,6.0,732,727.0,-5.0,0,0,,,,,,
99997,2015,6,17,3,MQ,3058,N642MQ,DFW,CVG,700,708.0,8.0,29.0,737.0,139.0,142.0,108.0,812,1025.0,5.0,1019,1030.0,11.0,0,0,,,,,,
99998,2015,6,17,3,MQ,3061,N621MQ,DFW,HSV,700,729.0,29.0,62.0,831.0,112.0,156.0,82.0,603,953.0,12.0,852,1005.0,73.0,0,0,,63.0,0.0,0.0,10.0,0.0


In [8]:
flights_needed_data.value_counts('DIVERTED')  # will tell us the no. of flights which were diverted

DIVERTED
0    99507
1      493
Name: count, dtype: int64

In [9]:
# filtering out unnecessary columns
flights_needed_data=flights_needed_data.drop(['YEAR','FLIGHT_NUMBER','AIRLINE__CODE','DISTANCE','TAIL_NUMBER','TAXI_OUT',
                                              'SCHEDULED_TIME','DEPARTURE_TIME','WHEELS_OFF','ELAPSED_TIME',
                                              'AIR_TIME','WHEELS_ON','DAY_OF_WEEK','TAXI_IN','CANCELLATION_REASON'],
                                             axis=1)

In [10]:
flights_needed_data.isnull().sum()

MONTH                           0
DAY                             0
ORIGIN_AIRPORT_CODE             0
DESTINATION_AIRPORT_CODE        0
SCHEDULED_DEPARTURE             0
DEPARTURE_DELAY              2756
SCHEDULED_ARRIVAL               0
ARRIVAL_TIME                 2988
ARRIVAL_DELAY                3386
DIVERTED                        0
CANCELLED                       0
AIR_SYSTEM_DELAY            73676
SECURITY_DELAY              73676
AIRLINE_DELAY               73676
LATE_AIRCRAFT_DELAY         73676
WEATHER_DELAY               73676
dtype: int64

In [11]:
flights_needed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   MONTH                     100000 non-null  int8   
 1   DAY                       100000 non-null  int8   
 2   ORIGIN_AIRPORT_CODE       100000 non-null  object 
 3   DESTINATION_AIRPORT_CODE  100000 non-null  object 
 4   SCHEDULED_DEPARTURE       100000 non-null  int16  
 5   DEPARTURE_DELAY           97244 non-null   float64
 6   SCHEDULED_ARRIVAL         100000 non-null  int16  
 7   ARRIVAL_TIME              97012 non-null   float64
 8   ARRIVAL_DELAY             96614 non-null   float64
 9   DIVERTED                  100000 non-null  int8   
 10  CANCELLED                 100000 non-null  int8   
 11  AIR_SYSTEM_DELAY          26324 non-null   float64
 12  SECURITY_DELAY            26324 non-null   float64
 13  AIRLINE_DELAY             26324 non-null   fl

In [12]:
#flights_needed_data = flights_needed_data.apply(lambda x: x.fillna(x.mean()), axis=0)

cols = ['DEPARTURE_DELAY','ARRIVAL_TIME','ARRIVAL_DELAY','AIR_SYSTEM_DELAY','SECURITY_DELAY','AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY']

for column in cols:
    print (column)
    flights_needed_data[column].fillna(flights_needed_data[column].mean(), inplace=True)

DEPARTURE_DELAY
ARRIVAL_TIME
ARRIVAL_DELAY
AIR_SYSTEM_DELAY
SECURITY_DELAY
AIRLINE_DELAY
LATE_AIRCRAFT_DELAY
WEATHER_DELAY


In [13]:
# replacing all NaN values with the mean of the attribute in which they are present
#flights_needed_data=flights_needed_data.fillna(flights_needed_data.mean())

In [14]:
flights_needed_data.head()

Unnamed: 0,MONTH,DAY,ORIGIN_AIRPORT_CODE,DESTINATION_AIRPORT_CODE,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,6,11,CVG,ORD,747,-5.0,803,753.0,-10.0,0,0,15.556374,0.055501,18.88596,26.817391,2.735185
1,6,11,RAP,DFW,748,-8.0,1109,1051.0,-18.0,0,0,15.556374,0.055501,18.88596,26.817391,2.735185
2,6,11,ORD,FNT,749,26.0,955,1019.0,24.0,0,0,0.0,0.0,24.0,0.0,0.0
3,6,11,SEA,IAH,749,6.0,1405,1402.0,-3.0,0,0,15.556374,0.055501,18.88596,26.817391,2.735185
4,6,11,SEA,ORD,750,6.0,1352,1351.0,-1.0,0,0,15.556374,0.055501,18.88596,26.817391,2.735185


In [15]:
# creating a new column; it will tell if the flight was delayed or not
result=[]

# Create Target/Result column for Classifier

In [16]:
for row in flights_needed_data['ARRIVAL_DELAY']:
  if row > 15:
    result.append(1)
  else:
    result.append(0) 

In [17]:
flights_needed_data['result'] = result

In [18]:
flights_needed_data.value_counts('result')

result
0    74524
1    25476
Name: count, dtype: int64

In [19]:
# removing some more columns
flights_needed_data=flights_needed_data.drop(['ORIGIN_AIRPORT_CODE', 'DESTINATION_AIRPORT_CODE', 'ARRIVAL_TIME', 'ARRIVAL_DELAY'],axis=1)
flights_needed_data

Unnamed: 0,MONTH,DAY,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_ARRIVAL,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,result
0,6,11,747,-5.0,803,0,0,15.556374,0.055501,18.88596,26.817391,2.735185,0
1,6,11,748,-8.0,1109,0,0,15.556374,0.055501,18.88596,26.817391,2.735185,0
2,6,11,749,26.0,955,0,0,0.000000,0.000000,24.00000,0.000000,0.000000,1
3,6,11,749,6.0,1405,0,0,15.556374,0.055501,18.88596,26.817391,2.735185,0
4,6,11,750,6.0,1352,0,0,15.556374,0.055501,18.88596,26.817391,2.735185,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,6,17,700,81.0,810,0,0,0.000000,0.000000,81.00000,0.000000,0.000000,1
99996,6,17,700,-7.0,732,0,0,15.556374,0.055501,18.88596,26.817391,2.735185,0
99997,6,17,700,8.0,1019,0,0,15.556374,0.055501,18.88596,26.817391,2.735185,0
99998,6,17,700,29.0,852,0,0,63.000000,0.000000,0.00000,10.000000,0.000000,1


# Train test split

In [20]:
data = flights_needed_data.values
X, y = data[:,:-1], data[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)  # splitting in the ratio 70:30

In [21]:
flights_needed_data.columns

Index(['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY',
       'SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED', 'AIR_SYSTEM_DELAY',
       'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY',
       'WEATHER_DELAY', 'result'],
      dtype='object')

# Scaling Input Feature

In [22]:
sc = StandardScaler()
X_train_1 = sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Model training and Prediction

In [23]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)

In [24]:
y_prob = clf.predict_proba(X_test)[:,1]

In [25]:
y_pred = clf.predict(X_test)

In [26]:
auc_score = roc_auc_score(y_test, y_pred)
auc_score

0.9972391962901147

In [27]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     22426
         1.0       1.00      1.00      1.00      7574

    accuracy                           1.00     30000
   macro avg       1.00      1.00      1.00     30000
weighted avg       1.00      1.00      1.00     30000



In [28]:
flights_needed_data.shape

(100000, 13)

In [29]:
flights_needed_df = flights_needed_data.drop(['result'],axis=1)

In [30]:
final_data = flights_needed_df.values

In [31]:
final_data = sc.transform(final_data)

In [32]:
y_pred = clf.predict(final_data)

In [33]:
y_prob = clf.predict_proba(final_data)[:,1]

In [34]:
len(y_pred)

100000

In [35]:
clf.feature_importances_

array([0.00000000e+00, 4.22499242e-04, 1.38155940e-03, 1.25888649e-02,
       1.12057210e-03, 0.00000000e+00, 0.00000000e+00, 9.43116643e-03,
       9.56779678e-01, 9.95294628e-03, 5.95440359e-03, 2.36830981e-03])

In [37]:
type(X_train),type(X_test),type(y_train),type(y_test),type(y_pred),type(y_prob)

(numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray)

In [43]:
column = ['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY','SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED', 
          'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY','WEATHER_DELAY']

In [44]:
X_train_df = pd.DataFrame(X_train, columns=column)
X_test_df = pd.DataFrame(X_test, columns=column)
y_train_df = pd.DataFrame(y_train, columns=['ACTUAL_DELAY'])
y_test_df = pd.DataFrame(y_test, columns=['ACTUAL_DELAY'])
y_pred_df = pd.DataFrame(y_pred, columns=['PREDICTED_DELAY'])
y_prob_df = pd.DataFrame(y_prob, columns=['PROBABILITY'])

# Model Registration

In [1]:
y_pred_df

NameError: name 'y_pred_df' is not defined

In [45]:
## registering the model in Fosfor Insight Designer.
register_model(
    model_obj=clf, 
    session=my_session,
    x_train=X_train_df,
    y_train=y_train_df,
    x_test=X_test_df,
    y_test=y_test_df,
    y_pred=y_pred_df,
    y_prob=y_prob_df,
    source="Notebook",
    dataset_name="FLIGHTS",
    dataset_source="Snowflake",
    #dataset_source="InMemory",
    name="Decision_Tree_Delay_Classifier",
    description="Decision tree model trained via Notebook to identify Flight Delay application",
    flavour="sklearn",
    model_type="classification",
    conda_dependencies=["scikit-learn==1.3.2"]
)

The version of package 'numpy' in the local environment is 1.23.0, which does not fit the criteria for the requirement 'numpy'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'numpy' in the local environment is 1.23.0, which does not fit the criteria for the requirement 'numpy'. Your UDF might not work when the package version is different between the server and your local environment.


Calculating build time metrics

Progress: ██████████████                                                         20.0%
Calculating build time metrics

Progress: ████████████████████████████                                           40.0%


The version of package 'numpy' in the local environment is 1.23.0, which does not fit the criteria for the requirement 'numpy'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn<1.4'. Your UDF might not work when the package version is different between the server and your local environment.
DataFrame.flatten() is deprecated since 0.7.0. Use `DataFrame.join_table_function()` instead.


(1300) (1304): 01b7ad6e-0411-c7b0-0072-f30312a21086: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/snowflake/ml/modeling/metrics/classification.py", line 1059, in end_partition
  File "/usr/lib/python_udf/14494b11cebb767d68275d470153ae1f7c3c0eb2b64723dc58cc329db2f80c38/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/14494b11cebb767d68275d470153ae1f7c3c0eb2b64723dc58cc329db2f80c38/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 505, in multilabel_confusion_matrix
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/usr/lib/python_udf/14494b11cebb767d68275d470153ae1f7c3c0eb2b64723dc58cc329db2f80c38/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 93, in _check_targets
    raise ValueError(
ValueError: Classification metrics can't handle a mix of unknown an

The version of package 'numpy' in the local environment is 1.23.0, which does not fit the criteria for the requirement 'numpy'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn<1.4'. Your UDF might not work when the package version is different between the server and your local environment.


(1300) (1304): 01b7ad6e-0411-c7b0-0072-f30312a210a6: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/snowflake/ml/modeling/metrics/classification.py", line 1059, in end_partition
  File "/usr/lib/python_udf/14494b11cebb767d68275d470153ae1f7c3c0eb2b64723dc58cc329db2f80c38/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/14494b11cebb767d68275d470153ae1f7c3c0eb2b64723dc58cc329db2f80c38/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 505, in multilabel_confusion_matrix
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/usr/lib/python_udf/14494b11cebb767d68275d470153ae1f7c3c0eb2b64723dc58cc329db2f80c38/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 93, in _check_targets
    raise ValueError(
ValueError: Classification metrics can't handle a mix of unknown an

The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn==1.3.*'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'snowflake-snowpark-python' in the local environment is 1.23.0, which does not fit the criteria for the requirement 'snowflake-snowpark-python'. Your UDF might not work when the package version is different between the server and your local environment.


Error in while calculating roc_auc: SnowparkSQLException('(1300) (1304): 01b7ad6e-0411-c7b0-0072-f30312a210ae: 100357 (P0000): Python Interpreter Error:\nTraceback (most recent call last):\n  File "/home/udf/32355341556637422/udf_py_1665680387.zip/udf_py_1665680387.py", line 56, in compute\n    return func(session)\n  File "/opt/conda/lib/python3.9/site-packages/snowflake/ml/modeling/metrics/ranking.py", line 263, in roc_auc_score_anon_sproc\n  File "/usr/lib/python_udf/663e987bb39f7f448dff8c02c1a4f15e961e0d191f349c610a304845487fbda7/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper\n    return func(*args, **kwargs)\n  File "/usr/lib/python_udf/663e987bb39f7f448dff8c02c1a4f15e961e0d191f349c610a304845487fbda7/lib/python3.9/site-packages/sklearn/metrics/_ranking.py", line 603, in roc_auc_score\n    y_type = type_of_target(y_true, input_name="y_true")\n  File "/usr/lib/python_udf/663e987bb39f7f448dff8c02c1a4f15e961e0d191f349c610a304845487fbda7/lib/pytho

"Model 'MODEL_4FD7D083_2A05_47E2_A1ED_023104009A08_FDC_DECISION_TREE_DELAY_CLASSIFIER' registered successfully."

In [58]:
y_train_df

Unnamed: 0,ACTUAL_DELAY
0,0.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
69995,0.0
69996,0.0
69997,0.0
69998,0.0


# Push Model Input on Snowflake

In [67]:
data_df = pd.concat([X_train_df,y_train_df],axis=1,ignore_index=True)

In [68]:
data_df.columns = ['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY','SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED', 
          'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY','WEATHER_DELAY','FLIGHT_DELAY']

In [69]:
data_df.head()

Unnamed: 0,MONTH,DAY,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_ARRIVAL,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,FLIGHT_DELAY
0,0.0,0.775843,0.804729,-0.322289,0.800724,-0.069137,-0.172027,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0
1,0.0,0.20501,1.037545,0.469695,0.982003,-0.069137,-0.172027,-0.860071,-0.067793,0.256177,-1.105922,-0.256964,1.0
2,0.0,-0.936654,0.776386,-0.410287,0.66715,-0.069137,-0.172027,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0
3,0.0,0.20501,-0.258126,-0.542284,0.025995,-0.069137,-0.172027,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0
4,0.0,0.775843,-1.300737,-0.002698,-1.288753,-0.069137,5.813024,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0


In [75]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   MONTH                70000 non-null  float64
 1   DAY                  70000 non-null  float64
 2   SCHEDULED_DEPARTURE  70000 non-null  float64
 3   DEPARTURE_DELAY      70000 non-null  float64
 4   SCHEDULED_ARRIVAL    70000 non-null  float64
 5   DIVERTED             70000 non-null  float64
 6   CANCELLED            70000 non-null  float64
 7   AIR_SYSTEM_DELAY     70000 non-null  float64
 8   SECURITY_DELAY       70000 non-null  float64
 9   AIRLINE_DELAY        70000 non-null  float64
 10  LATE_AIRCRAFT_DELAY  70000 non-null  float64
 11  WEATHER_DELAY        70000 non-null  float64
 12  FLIGHT_DELAY         70000 non-null  object 
dtypes: float64(12), object(1)
memory usage: 6.9+ MB


In [74]:
data_df['FLIGHT_DELAY'] = data_df['FLIGHT_DELAY'].astype('str')

In [76]:
data_df.head()

Unnamed: 0,MONTH,DAY,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_ARRIVAL,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,FLIGHT_DELAY
0,0.0,0.775843,0.804729,-0.322289,0.800724,-0.069137,-0.172027,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0
1,0.0,0.20501,1.037545,0.469695,0.982003,-0.069137,-0.172027,-0.860071,-0.067793,0.256177,-1.105922,-0.256964,1.0
2,0.0,-0.936654,0.776386,-0.410287,0.66715,-0.069137,-0.172027,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0
3,0.0,0.20501,-0.258126,-0.542284,0.025995,-0.069137,-0.172027,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0
4,0.0,0.775843,-1.300737,-0.002698,-1.288753,-0.069137,5.813024,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0


In [78]:
ins_train_sf=my_session.createDataFrame(
        data_df.values.tolist(),
        schema=data_df.columns.tolist())
ins_train_sf.write.mode("overwrite").save_as_table("TTH_DB.TTH_AIRLINE_SCHEMA.EXPERIMENT_INSIGHT_INPUT")

# Push Model Output on snowflake

In [84]:
test_df = pd.concat([X_test_df,y_test_df,y_pred_df, y_prob_df],axis=1,ignore_index=True)

In [86]:
test_df.dropna(inplace=True)

In [87]:
test_df.shape

(30000, 15)

In [88]:
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,0.775843,0.640746,8.74152,1.029708,-0.069137,-0.172027,-0.860071,-0.067793,-0.792759,12.724268,4.8116,1.0,0.0,0.0
1,0.0,0.775843,1.634769,0.997683,-2.595869,-0.069137,-0.172027,-0.860071,-0.067793,-0.582971,0.735249,-0.256964,1.0,0.0,0.0
2,0.0,-0.936654,-0.683269,-0.366288,-0.735376,-0.069137,-0.172027,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0,1.0,1.0
3,0.0,0.775843,0.824974,-0.366288,0.991544,-0.069137,-0.172027,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0,0.0,0.0
4,0.0,1.346675,0.72375,0.535693,0.800724,-0.069137,-0.172027,0.408834,-0.067793,-0.792759,-0.549289,-0.256964,1.0,0.0,0.0


In [89]:
test_df.columns = ['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY','SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED', 
          'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY','WEATHER_DELAY','FLIGHT_DELAY','PREDICTED_DELAY','PROBABILITY']

In [90]:
X1_df = test_df.head(15000)
X2_df = test_df.tail(15000)

In [91]:
ins_train_sf=my_session.createDataFrame(
        X1_df.values.tolist(),
        schema=X1_df.columns.tolist())
ins_train_sf.write.mode("overwrite").save_as_table("TTH_DB.TTH_AIRLINE_SCHEMA.DELAY_CLASSIFIER_OUTPUT_1")

In [92]:
ins_train_sf=my_session.createDataFrame(
        X2_df.values.tolist(),
        schema=X2_df.columns.tolist())
ins_train_sf.write.mode("overwrite").save_as_table("TTH_DB.TTH_AIRLINE_SCHEMA.DELAY_CLASSIFIER_OUTPUT_2")