# Use TTH_Template Custom Notebook template

In [46]:
import datetime, warnings, scipy 
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb

from sklearn.model_selection import train_test_split
# Importing Scikit-learn's preprocessing utilities for encoding categorical variables and scaling numerical data
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, classification_report

pd.options.display.max_columns = 50
warnings.filterwarnings("ignore")

from fosforml.model_manager.snowflakesession import get_session
from fosforml import register_model

In [2]:
## for no font error:

# Set Matplotlib's default font family to 'DeJavu Serif' to ensure a consistent font style across plots
plt.rcParams['font.family'] = 'DeJavu Serif'

# Read data from Snowflake

In [3]:
my_session = get_session()

In [4]:
## table_name = 'FLIGHTS'
table_name = 'FLIGHTS_FULL'

sf_df = my_session.sql("select * from {}".format(table_name))
df = sf_df.to_pandas()

In [5]:
## FILTERING DATA for just 2 airlines

options = ['Southwest Airlines Co.', 'Delta Air Lines Inc.'] 
  
# selecting rows based on condition 
flights = df.loc[df['AIRLINE'].isin(options)] 

## check
## flights['AIRLINE'].unique()

In [6]:
## flights = df.copy()
flights_needed_data = flights.copy()

In [7]:
flights_needed_data.shape

## OLD VALUE: (5819079, 31)

(2137736, 45)

In [None]:
flights_needed_data.info()

In [None]:
flights_needed_data.head

In [None]:
## flights_needed_data = df.loc[(df['fly_date'] <= '2024-10-31')]
## flights_needed_data = flights[0:100000]  # getting a segment 

In [8]:
def categorize_time(SCHEDULED_ARRIVAL):
    if SCHEDULED_ARRIVAL >= 500 and SCHEDULED_ARRIVAL< 800: return 'Early morning'
    elif SCHEDULED_ARRIVAL >= 800 and SCHEDULED_ARRIVAL < 1100: return 'Late morning'
    elif SCHEDULED_ARRIVAL >= 1100 and SCHEDULED_ARRIVAL < 1400: return 'Around noon'
    elif SCHEDULED_ARRIVAL >= 1400 and SCHEDULED_ARRIVAL < 1700: return 'Afternoon'
    elif SCHEDULED_ARRIVAL >= 1700 and SCHEDULED_ARRIVAL < 2000: return 'Evening'
    elif SCHEDULED_ARRIVAL >= 2000 and SCHEDULED_ARRIVAL < 2300: return 'Night'
    elif SCHEDULED_ARRIVAL >= 2300 or SCHEDULED_ARRIVAL < 200: return 'Late night'
    elif SCHEDULED_ARRIVAL >= 200 or SCHEDULED_ARRIVAL < 500: return 'Dawn'
#    else:
#        return 'Dawn'

# Apply the function to the Age column using the apply() function
flights_needed_data['ARRIVAL_TIME_SEGMENT'] = flights_needed_data['SCHEDULED_ARRIVAL'].apply(categorize_time)

In [None]:
flights_needed_data

In [9]:
 flights['AIRLINE__CODE'].unique()

array(['DL', 'WN'], dtype=object)

In [10]:
flights_needed_data.value_counts('DIVERTED')  # will tell us the no. of flights which were diverted

DIVERTED
0    2132545
1       5191
Name: count, dtype: int64

In [11]:
flights_needed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2137736 entries, 1 to 5819078
Data columns (total 46 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   YEAR                      int16  
 1   MONTH                     int8   
 2   DAY                       int8   
 3   DAY_OF_WEEK               int8   
 4   AIRLINE__CODE             object 
 5   FLIGHT_NUMBER             int16  
 6   TAIL_NUMBER               object 
 7   ORIGIN_AIRPORT_CODE       object 
 8   DESTINATION_AIRPORT_CODE  object 
 9   SCHEDULED_DEPARTURE       int16  
 10  DEPARTURE_TIME            float64
 11  DEPARTURE_DELAY           float64
 12  TAXI_OUT                  float64
 13  WHEELS_OFF                float64
 14  SCHEDULED_TIME            float64
 15  ELAPSED_TIME              float64
 16  AIR_TIME                  float64
 17  DISTANCE                  int16  
 18  WHEELS_ON                 float64
 19  TAXI_IN                   float64
 20  SCHEDULED_ARRIVAL         int

In [12]:
# filtering out unnecessary columns
flights_needed_data=flights_needed_data.drop(['YEAR','FLIGHT_NUMBER','AIRLINE__CODE','TAIL_NUMBER','TAXI_OUT',
                                              'SCHEDULED_TIME','WHEELS_OFF','ELAPSED_TIME',
                                              'AIR_TIME','WHEELS_ON','DAY_OF_WEEK','TAXI_IN','CANCELLATION_REASON', 'DEST_LATITUDE',
                                              'DEST_LONGITUDE','ORIGIN_LATITUDE', 'ORIGIN_LONGITUDE','ORIGIN_STATE', 'ORIGIN_COUNTRY', 
                                              'DEST_AIRPORT','DEST_CITY','DEST_STATE','DEST_COUNTRY', 'ORIGIN_CITY', 'ORIGIN_AIRPORT' ],
                                             axis=1)
# REMOVED DISTANCE FROM THIS LIST

In [14]:
## flights_needed_data=flights_needed_data.drop(['ORIGIN_AIRPORT' ],
#                                             axis=1)

In [18]:
flights_needed_data.isnull().sum()

MONTH                             0
DAY                               0
ORIGIN_AIRPORT_CODE               0
DESTINATION_AIRPORT_CODE          0
SCHEDULED_DEPARTURE               0
DEPARTURE_TIME                19430
DEPARTURE_DELAY               19430
DISTANCE                          0
SCHEDULED_ARRIVAL                 0
ARRIVAL_TIME                  20737
ARRIVAL_DELAY                 25058
DIVERTED                          0
CANCELLED                         0
AIR_SYSTEM_DELAY            1783087
SECURITY_DELAY              1783087
AIRLINE_DELAY               1783087
LATE_AIRCRAFT_DELAY         1783087
WEATHER_DELAY               1783087
FLY_DATE                          0
AIRLINE                           0
ARRIVAL_TIME_SEGMENT              0
dtype: int64

In [19]:
flights_needed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2137736 entries, 1 to 5819078
Data columns (total 21 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   MONTH                     int8   
 1   DAY                       int8   
 2   ORIGIN_AIRPORT_CODE       object 
 3   DESTINATION_AIRPORT_CODE  object 
 4   SCHEDULED_DEPARTURE       int16  
 5   DEPARTURE_TIME            float64
 6   DEPARTURE_DELAY           float64
 7   DISTANCE                  int16  
 8   SCHEDULED_ARRIVAL         int16  
 9   ARRIVAL_TIME              float64
 10  ARRIVAL_DELAY             float64
 11  DIVERTED                  int8   
 12  CANCELLED                 int8   
 13  AIR_SYSTEM_DELAY          float64
 14  SECURITY_DELAY            float64
 15  AIRLINE_DELAY             float64
 16  LATE_AIRCRAFT_DELAY       float64
 17  WEATHER_DELAY             float64
 18  FLY_DATE                  object 
 19  AIRLINE                   object 
 20  ARRIVAL_TIME_SEGMENT      obj

## DO NOT RUN THIS CODE
#flights_needed_data = flights_needed_data.apply(lambda x: x.fillna(x.mean()), axis=0)

cols = ['DEPARTURE_DELAY','ARRIVAL_DELAY','AIR_SYSTEM_DELAY',
        'SECURITY_DELAY','AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY']
cols1 = ['DEPARTURE_TIME']
cols2 = ['ARRIVAL_TIME']
#cols = ['DEPARTURE_TIME','DEPARTURE_DELAY','ARRIVAL_TIME','ARRIVAL_DELAY','AIR_SYSTEM_DELAY',
#        'SECURITY_DELAY','AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY']

for column in cols1:
    print (column)
    flights_needed_data[column].fillna(flights_needed_data[['SCHEDULED_DEPARTURE']]), inplace=True)

for column in cols2:
    print (column)
    flights_needed_data[column].fillna(flights_needed_data[['SCHEDULED_ARRIVAL']]), inplace=True)
    
for column in cols:
    print (column)
    flights_needed_data[column].fillna(flights_needed_data[column].mean(), inplace=True)

In [23]:
#flights_needed_data = flights_needed_data.apply(lambda x: x.fillna(x.mean()), axis=0)

cols = ['DEPARTURE_TIME', 'ARRIVAL_TIME','DEPARTURE_DELAY','ARRIVAL_DELAY','AIR_SYSTEM_DELAY',
        'SECURITY_DELAY','AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY']

for column in cols:
    print (column)
    flights_needed_data[column].fillna(flights_needed_data[column].mean(), inplace=True)

DEPARTURE_TIME
ARRIVAL_TIME
DEPARTURE_DELAY
ARRIVAL_DELAY
AIR_SYSTEM_DELAY
SECURITY_DELAY
AIRLINE_DELAY
LATE_AIRCRAFT_DELAY
WEATHER_DELAY


In [None]:
# replacing all NaN values with the mean of the attribute in which they are present
#flights_needed_data=flights_needed_data.fillna(flights_needed_data.mean())

In [24]:
flights_needed_data.head()

Unnamed: 0,MONTH,DAY,ORIGIN_AIRPORT_CODE,DESTINATION_AIRPORT_CODE,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,FLY_DATE,AIRLINE,ARRIVAL_TIME_SEGMENT
1,12,17,CHA,ATL,549,552.0,3.0,106,645,640.0,-5.0,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-12-17,Delta Air Lines Inc.,Early morning
6,12,17,BDL,DTW,550,549.0,-1.0,549,750,758.0,8.0,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-12-17,Delta Air Lines Inc.,Early morning
7,12,17,DSM,ATL,550,549.0,-1.0,743,901,910.0,9.0,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-12-17,Delta Air Lines Inc.,Late morning
14,12,17,CMH,BWI,550,548.0,-2.0,337,710,656.0,-14.0,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-12-17,Southwest Airlines Co.,Early morning
15,12,17,ABQ,MDW,550,557.0,7.0,1121,935,930.0,-5.0,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-12-17,Southwest Airlines Co.,Late morning


In [25]:
# creating a new column; it will tell if the flight was delayed or not
result=[]

# Create Target/Result column for Classifier

In [26]:
for row in flights_needed_data['ARRIVAL_DELAY']:
  if row > 5:
    result.append(1)
  else:
    result.append(0) 

In [27]:
flights_needed_data['result'] = result

In [28]:
flights_needed_data.value_counts('result')

result
0    1595430
1     542306
Name: count, dtype: int64

In [29]:
# removing some more columns
flights_needed_data=flights_needed_data.drop(['ORIGIN_AIRPORT_CODE', 'DESTINATION_AIRPORT_CODE', 'ARRIVAL_TIME', 'ARRIVAL_DELAY'],axis=1)
flights_needed_data

Unnamed: 0,MONTH,DAY,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,DISTANCE,SCHEDULED_ARRIVAL,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,FLY_DATE,AIRLINE,ARRIVAL_TIME_SEGMENT,result
1,12,17,549,552.0,3.0,106,645,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-12-17,Delta Air Lines Inc.,Early morning,0
6,12,17,550,549.0,-1.0,549,750,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-12-17,Delta Air Lines Inc.,Early morning,1
7,12,17,550,549.0,-1.0,743,901,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-12-17,Delta Air Lines Inc.,Late morning,1
14,12,17,550,548.0,-2.0,337,710,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-12-17,Southwest Airlines Co.,Early morning,0
15,12,17,550,557.0,7.0,1121,935,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-12-17,Southwest Airlines Co.,Late morning,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5819074,9,21,1820,1817.0,-3.0,189,1910,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-09-21,Southwest Airlines Co.,Evening,0
5819075,9,21,1820,1826.0,6.0,1209,2220,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-09-21,Southwest Airlines Co.,Night,1
5819076,9,21,1820,1817.0,-3.0,192,1910,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-09-21,Southwest Airlines Co.,Evening,0
5819077,9,21,1820,1825.0,5.0,2065,2030,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-09-21,Southwest Airlines Co.,Night,0


In [43]:
flights_needed_data=flights_needed_data.drop(['FLY_DATE'], axis=1)

In [44]:
#Get list of categorical variables
s = (flights_needed_data.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables in the dataset:", object_cols)

Categorical variables in the dataset: ['AIRLINE', 'ARRIVAL_TIME_SEGMENT']


In [47]:
#Label Encoding the object dtypes.
LE=LabelEncoder()
for i in object_cols:
    flights_needed_data[i]=flights_needed_data[[i]].apply(LE.fit_transform)
    
print("All features are now numerical")

All features are now numerical


In [48]:
print(flights_needed_data.max(axis=0)) # will return max value of each column
print(flights_needed_data.min(axis=0)) # will return min value of each column

MONTH                     12.0
DAY                       31.0
SCHEDULED_DEPARTURE     2359.0
DEPARTURE_TIME          2400.0
DEPARTURE_DELAY         1289.0
DISTANCE                4983.0
SCHEDULED_ARRIVAL       2359.0
DIVERTED                   1.0
CANCELLED                  1.0
AIR_SYSTEM_DELAY         991.0
SECURITY_DELAY           440.0
AIRLINE_DELAY           1274.0
LATE_AIRCRAFT_DELAY     1010.0
WEATHER_DELAY           1211.0
AIRLINE                    1.0
ARRIVAL_TIME_SEGMENT       7.0
result                     1.0
dtype: float64
MONTH                    1.0
DAY                      1.0
SCHEDULED_DEPARTURE      1.0
DEPARTURE_TIME           1.0
DEPARTURE_DELAY        -61.0
DISTANCE                74.0
SCHEDULED_ARRIVAL        1.0
DIVERTED                 0.0
CANCELLED                0.0
AIR_SYSTEM_DELAY         0.0
SECURITY_DELAY           0.0
AIRLINE_DELAY            0.0
LATE_AIRCRAFT_DELAY      0.0
WEATHER_DELAY            0.0
AIRLINE                  0.0
ARRIVAL_TIME_SEGMENT   

# Train test split

In [49]:
flights_needed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2137736 entries, 1 to 5819078
Data columns (total 17 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MONTH                 int8   
 1   DAY                   int8   
 2   SCHEDULED_DEPARTURE   int16  
 3   DEPARTURE_TIME        float64
 4   DEPARTURE_DELAY       float64
 5   DISTANCE              int16  
 6   SCHEDULED_ARRIVAL     int16  
 7   DIVERTED              int8   
 8   CANCELLED             int8   
 9   AIR_SYSTEM_DELAY      float64
 10  SECURITY_DELAY        float64
 11  AIRLINE_DELAY         float64
 12  LATE_AIRCRAFT_DELAY   float64
 13  WEATHER_DELAY         float64
 14  AIRLINE               int64  
 15  ARRIVAL_TIME_SEGMENT  int64  
 16  result                int64  
dtypes: float64(7), int16(3), int64(3), int8(4)
memory usage: 199.8 MB


In [50]:
test = flights_needed_data[flights_needed_data['MONTH'] >= 11] 
#test=test.drop(['FLY_DATE'], axis=1)
train = flights_needed_data[flights_needed_data['MONTH'] < 11]
#train=train.drop(['FLY_DATE'], axis=1)
test_data = test.values
train_data = train.values
X_train, y_train = train_data[:,:-1], train_data[:,-1]
X_test, y_test = test_data[:,:-1], test_data[:,-1]
#train_data = flights_needed_data[flights_needed_data['FLY_DATE'].apply(lambda x:x.date()) < datetime.date(2024, 11, 1)]
#test_data  = flights_needed_data[flights_needed_data['FLY_DATE'].apply(lambda x:x.date()) > datetime.date(2024, 10, 31)]

In [84]:
test.shape

(354388, 17)

In [85]:
train.shape

(1783348, 17)

In [41]:
X_train

array([[7, 9, 1552, ..., 0.0, 'Delta Air Lines Inc.', 'Evening'],
       [7, 9, 1553, ..., 0.0, 'Delta Air Lines Inc.', 'Afternoon'],
       [7, 9, 1555, ..., 3.237764663089421, 'Delta Air Lines Inc.',
        'Afternoon'],
       ...,
       [9, 21, 1820, ..., 3.237764663089421, 'Southwest Airlines Co.',
        'Evening'],
       [9, 21, 1820, ..., 3.237764663089421, 'Southwest Airlines Co.',
        'Night'],
       [9, 21, 1820, ..., 3.237764663089421, 'Southwest Airlines Co.',
        'Late night']], dtype=object)

In [51]:
sc = StandardScaler()
X_train_1 = sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Model training and Prediction

In [52]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)

In [53]:
y_prob = clf.predict_proba(X_test)[:,1]

In [54]:
y_pred = clf.predict(X_test)

In [55]:
auc_score = roc_auc_score(y_test, y_pred)
auc_score

0.8414290715982611

In [56]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.92      0.91      0.92    263912
         1.0       0.75      0.77      0.76     90476

    accuracy                           0.88    354388
   macro avg       0.84      0.84      0.84    354388
weighted avg       0.88      0.88      0.88    354388



In [57]:
flights_needed_data.shape

(2137736, 17)

In [58]:
flights_needed_df = flights_needed_data.drop(['result'],axis=1)

In [59]:
final_data = flights_needed_df.values

In [60]:
final_data = sc.transform(final_data)

In [61]:
y_pred = clf.predict(final_data)

In [62]:
final_data

array([[ 2.26781130e+00,  1.43291730e-01, -1.62725887e+00, ...,
        -4.10201150e-04, -1.19703701e+00, -1.89088769e-01],
       [ 2.26781130e+00,  1.43291730e-01, -1.62517973e+00, ...,
        -4.10201150e-04, -1.19703701e+00, -1.89088769e-01],
       [ 2.26781130e+00,  1.43291730e-01, -1.62517973e+00, ...,
        -4.10201150e-04, -1.19703701e+00,  6.06922520e-01],
       ...,
       [ 1.20403516e+00,  5.99215933e-01,  1.01532546e+00, ...,
        -4.10201150e-04,  8.35396060e-01,  2.08916875e-01],
       [ 1.20403516e+00,  5.99215933e-01,  1.01532546e+00, ...,
        -4.10201150e-04,  8.35396060e-01,  1.40293381e+00],
       [ 1.20403516e+00,  5.99215933e-01,  1.01532546e+00, ...,
        -4.10201150e-04,  8.35396060e-01,  1.00492816e+00]])

In [63]:
y_prob = clf.predict_proba(final_data)[:,1]

In [64]:
len(y_pred)

2137736

In [65]:
clf.feature_importances_

array([3.05461050e-02, 7.61999266e-02, 3.14243960e-02, 3.95948221e-02,
       1.02353132e-01, 6.48555744e-02, 5.76878262e-02, 2.00407377e-03,
       1.57177703e-03, 1.07571810e-05, 5.85308619e-01, 1.10105398e-05,
       2.21335565e-05, 5.42736065e-05, 4.73519823e-03, 3.62037436e-03])

In [66]:
type(X_train),type(X_test),type(y_train),type(y_test),type(y_pred),type(y_prob)

(numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray)

In [71]:
column = ['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME','DEPARTURE_DELAY','DISTANCE','SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED', 
          'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY','WEATHER_DELAY','AIRLINE', 'ARRIVAL_TIME_SEGMENT']


In [72]:
X_train_df = pd.DataFrame(X_train, columns=column)
X_test_df = pd.DataFrame(X_test, columns=column)
y_train_df = pd.DataFrame(y_train, columns=['ACTUAL_DELAY'])
y_test_df = pd.DataFrame(y_test, columns=['ACTUAL_DELAY'])
y_pred_df = pd.DataFrame(y_pred, columns=['PREDICTED_DELAY'])
y_prob_df = pd.DataFrame(y_prob, columns=['PROBABILITY'])

# Model Registration

In [74]:
## registering the model in Fosfor Insight Designer.
register_model(
    model_obj=clf, 
    session=my_session,
    x_train=X_train_df,
    y_train=y_train_df,
    x_test=X_test_df,
    y_test=y_test_df,
    y_pred=y_pred_df,
    y_prob=y_prob_df,
    source="Notebook",
    dataset_name="FLIGHTS_FULL",
    dataset_source="Snowflake",
    #dataset_source="InMemory",
    name="Decision_Tree_Delay_Classifier",
    description="Decision tree model trained via Notebook to identify Flight Delay application",
    flavour="sklearn",
    model_type="classification",
    conda_dependencies=["scikit-learn==1.3.2"]
)

The version of package 'numpy' in the local environment is 1.23.0, which does not fit the criteria for the requirement 'numpy'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'numpy' in the local environment is 1.23.0, which does not fit the criteria for the requirement 'numpy'. Your UDF might not work when the package version is different between the server and your local environment.


Calculating build time metrics

Progress: ██████████████                                                         20.0%
Calculating build time metrics

Progress: ████████████████████████████                                           40.0%


The version of package 'numpy' in the local environment is 1.23.0, which does not fit the criteria for the requirement 'numpy'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn<1.4'. Your UDF might not work when the package version is different between the server and your local environment.


(1300) (1304): 01b7dbd5-0511-d6db-0072-f3031301147a: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/snowflake/ml/modeling/metrics/classification.py", line 1059, in end_partition
  File "/usr/lib/python_udf/60d027bb89e79d3272b53d18af09089c8c0829c4d256cdefc695e6110f9272f1/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/60d027bb89e79d3272b53d18af09089c8c0829c4d256cdefc695e6110f9272f1/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 505, in multilabel_confusion_matrix
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/usr/lib/python_udf/60d027bb89e79d3272b53d18af09089c8c0829c4d256cdefc695e6110f9272f1/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 93, in _check_targets
    raise ValueError(
ValueError: Classification metrics can't handle a mix of unknown an

The version of package 'numpy' in the local environment is 1.23.0, which does not fit the criteria for the requirement 'numpy'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn<1.4'. Your UDF might not work when the package version is different between the server and your local environment.


(1300) (1304): 01b7dbd5-0511-d6db-0072-f30313011496: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/snowflake/ml/modeling/metrics/classification.py", line 1059, in end_partition
  File "/usr/lib/python_udf/60d027bb89e79d3272b53d18af09089c8c0829c4d256cdefc695e6110f9272f1/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/60d027bb89e79d3272b53d18af09089c8c0829c4d256cdefc695e6110f9272f1/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 505, in multilabel_confusion_matrix
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/usr/lib/python_udf/60d027bb89e79d3272b53d18af09089c8c0829c4d256cdefc695e6110f9272f1/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 93, in _check_targets
    raise ValueError(
ValueError: Classification metrics can't handle a mix of unknown an

The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn==1.3.*'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'snowflake-snowpark-python' in the local environment is 1.23.0, which does not fit the criteria for the requirement 'snowflake-snowpark-python'. Your UDF might not work when the package version is different between the server and your local environment.


Error in while calculating roc_auc: SnowparkSQLException('(1300) (1304): 01b7dbd6-0511-d749-0072-f3031301245a: 100357 (P0000): Python Interpreter Error:\nTraceback (most recent call last):\n  File "/home/udf/32355341556656106/udf_py_1746131121.zip/udf_py_1746131121.py", line 56, in compute\n    return func(session)\n  File "/opt/conda/lib/python3.9/site-packages/snowflake/ml/modeling/metrics/ranking.py", line 263, in roc_auc_score_anon_sproc\n  File "/usr/lib/python_udf/8555fbc9c36bcffcb868e1758df7efcc8c08a36721f54bbea8d245c9d48366ab/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper\n    return func(*args, **kwargs)\n  File "/usr/lib/python_udf/8555fbc9c36bcffcb868e1758df7efcc8c08a36721f54bbea8d245c9d48366ab/lib/python3.9/site-packages/sklearn/metrics/_ranking.py", line 603, in roc_auc_score\n    y_type = type_of_target(y_true, input_name="y_true")\n  File "/usr/lib/python_udf/8555fbc9c36bcffcb868e1758df7efcc8c08a36721f54bbea8d245c9d48366ab/lib/pytho

"Failed to save build time metrics for model 'MODEL_4FD7D083_2A05_47E2_A1ED_023104009A08_FDC_DECISION_TREE_DELAY_CLASSIFIER'. (1304): 01b7dbd7-0511-d6db-0072-f303130114e2: 100069 (22P02): Error parsing JSON: document is too large, max size 16777216 bytes, pos 16777216"

In [75]:
y_train_df

Unnamed: 0,ACTUAL_DELAY
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
1783343,0.0
1783344,1.0
1783345,0.0
1783346,0.0


# Push Model Input on Snowflake

In [82]:
X_train.shape

(1783348, 16)

In [83]:
y_train.shape

(1783348,)

In [76]:
data_df = pd.concat([X_train_df,y_train_df],axis=1,ignore_index=True)

In [77]:
data_df.columns = ['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME','DEPARTURE_DELAY','DISTANCE','SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED', 
          'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY','WEATHER_DELAY','AIRLINE', 'ARRIVAL_TIME_SEGMENT','FLIGHT_DELAY']

In [78]:
data_df.head()

Unnamed: 0,MONTH,DAY,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,DISTANCE,SCHEDULED_ARRIVAL,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,AIRLINE,ARRIVAL_TIME_SEGMENT,FLIGHT_DELAY
0,0.494851,-0.768557,0.458116,0.606529,1.113146,2.231401,0.449906,-0.050122,-0.098325,-1.050502,-0.065692,1.307518,-1.47725,-0.354656,-1.197037,0.208917,1.0
1,0.494851,-0.768557,0.460196,0.787425,2.567548,-0.302322,0.293479,-0.050122,-0.098325,-1.050502,-0.065692,3.75721,-1.47725,-0.354656,-1.197037,-1.383106,1.0
2,0.494851,-0.768557,0.464354,0.529292,-0.129155,-0.691831,0.257837,-0.050122,-0.098325,-0.001827,0.001476,-0.000523,0.000157,-0.00041,-1.197037,-1.383106,0.0
3,0.494851,-0.768557,0.464354,0.425632,-0.462456,-0.548629,0.477627,-0.050122,-0.098325,-0.001827,0.001476,-0.000523,0.000157,-0.00041,-1.197037,0.208917,0.0
4,0.494851,-0.768557,0.464354,0.433762,-0.341256,-0.050286,0.416244,-0.050122,-0.098325,-0.001827,0.001476,-0.000523,0.000157,-0.00041,-1.197037,0.208917,0.0


In [79]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1783348 entries, 0 to 1783347
Data columns (total 17 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MONTH                 float64
 1   DAY                   float64
 2   SCHEDULED_DEPARTURE   float64
 3   DEPARTURE_TIME        float64
 4   DEPARTURE_DELAY       float64
 5   DISTANCE              float64
 6   SCHEDULED_ARRIVAL     float64
 7   DIVERTED              float64
 8   CANCELLED             float64
 9   AIR_SYSTEM_DELAY      float64
 10  SECURITY_DELAY        float64
 11  AIRLINE_DELAY         float64
 12  LATE_AIRCRAFT_DELAY   float64
 13  WEATHER_DELAY         float64
 14  AIRLINE               float64
 15  ARRIVAL_TIME_SEGMENT  float64
 16  FLIGHT_DELAY          float64
dtypes: float64(17)
memory usage: 231.3 MB


In [80]:
data_df['FLIGHT_DELAY'] = data_df['FLIGHT_DELAY'].astype('str')

In [None]:
data_df.head()

In [None]:
ins_train_sf=my_session.createDataFrame(
        data_df.values.tolist(),
        schema=data_df.columns.tolist())
ins_train_sf.write.mode("overwrite").save_as_table("TTH_DB.TTH_AIRLINE_SCHEMA.FLIGHTS_TRAINDATA")

# Push Model Output on snowflake

In [None]:
test_df = pd.concat([X_test_df,y_test_df,y_pred_df, y_prob_df],axis=1,ignore_index=True)

In [None]:
test_df.dropna(inplace=True)

In [None]:
test_df.shape

In [None]:
test_df.head()

In [None]:
test_df.columns = ['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME','DEPARTURE_DELAY','DISTANCE','SCHEDULED_ARRIVAL', 
                   'DIVERTED', 'CANCELLED', 'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY',
                   'WEATHER_DELAY','AIRLINE', 'ARRIVAL_TIME_SEGMENT','FLIGHT_DELAY','PREDICTED_DELAY','PROBABILITY']

In [None]:
X1_df = test_df.head(15000)
X2_df = test_df.tail(15000)

In [None]:
ins_train_sf=my_session.createDataFrame(
        X1_df.values.tolist(),
        schema=X1_df.columns.tolist())
ins_train_sf.write.mode("overwrite").save_as_table("TTH_DB.TTH_AIRLINE_SCHEMA.DELAY_CLASSIFIER_OUTPUT_1")

In [None]:
ins_train_sf=my_session.createDataFrame(
        X2_df.values.tolist(),
        schema=X2_df.columns.tolist())
ins_train_sf.write.mode("overwrite").save_as_table("TTH_DB.TTH_AIRLINE_SCHEMA.DELAY_CLASSIFIER_OUTPUT_2")