# Use TTH_Template Custom Notebook template

In [1]:
import datetime, warnings, scipy 
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, classification_report

pd.options.display.max_columns = 50
warnings.filterwarnings("ignore")

from fosforml.model_manager.snowflakesession import get_session
from fosforml import register_model

In [4]:
## for no font error:

# Set Matplotlib's default font family to 'DeJavu Serif' to ensure a consistent font style across plots
plt.rcParams['font.family'] = 'DeJavu Serif'

# Read data from Snowflake

In [5]:
my_session = get_session()

In [12]:
## table_name = 'FLIGHTS'
table_name = 'FLIGHTS_FULL'

sf_df = my_session.sql("select * from {}".format(table_name))
df = sf_df.to_pandas()

In [13]:
## FILTERING DATA for just 2 airlines

options = ['Southwest Airlines Co.', 'Delta Air Lines Inc.'] 
  
# selecting rows based on condition 
flights = df.loc[df['AIRLINE'].isin(options)] 

## check
## flights['AIRLINE'].unique()

In [15]:
## flights = df.copy()
flights_needed_data = flights.copy()

In [16]:
flights_needed_data.shape

## OLD VALUE: (5819079, 31)

(2137736, 45)

In [17]:
flights_needed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2137736 entries, 1 to 5819078
Data columns (total 45 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   YEAR                      int16  
 1   MONTH                     int8   
 2   DAY                       int8   
 3   DAY_OF_WEEK               int8   
 4   AIRLINE__CODE             object 
 5   FLIGHT_NUMBER             int16  
 6   TAIL_NUMBER               object 
 7   ORIGIN_AIRPORT_CODE       object 
 8   DESTINATION_AIRPORT_CODE  object 
 9   SCHEDULED_DEPARTURE       int16  
 10  DEPARTURE_TIME            float64
 11  DEPARTURE_DELAY           float64
 12  TAXI_OUT                  float64
 13  WHEELS_OFF                float64
 14  SCHEDULED_TIME            float64
 15  ELAPSED_TIME              float64
 16  AIR_TIME                  float64
 17  DISTANCE                  int16  
 18  WHEELS_ON                 float64
 19  TAXI_IN                   float64
 20  SCHEDULED_ARRIVAL         int

In [18]:
flights_needed_data.head

<bound method NDFrame.head of          YEAR  MONTH  DAY  DAY_OF_WEEK AIRLINE__CODE  FLIGHT_NUMBER  \
1        2024     12   17            4            DL           1799   
6        2024     12   17            4            DL           1480   
7        2024     12   17            4            DL           1952   
14       2024     12   17            4            WN            558   
15       2024     12   17            4            WN           2360   
...       ...    ...  ...          ...           ...            ...   
5819074  2024      9   21            1            WN            714   
5819075  2024      9   21            1            WN           2364   
5819076  2024      9   21            1            WN             41   
5819077  2024      9   21            1            WN            151   
5819078  2024      9   21            1            WN           3412   

        TAIL_NUMBER ORIGIN_AIRPORT_CODE DESTINATION_AIRPORT_CODE  \
1            N603AT                 CHA          

In [6]:
## flights_needed_data = df.loc[(df['fly_date'] <= '2024-10-31')]
## flights_needed_data = flights[0:100000]  # getting a segment 

In [31]:
def categorize_time(SCHEDULED_ARRIVAL):
    if SCHEDULED_ARRIVAL >= 500 and SCHEDULED_ARRIVAL< 800: return 'Early morning'
    elif SCHEDULED_ARRIVAL >= 800 and SCHEDULED_ARRIVAL < 1100: return 'Late morning'
    elif SCHEDULED_ARRIVAL >= 1100 and SCHEDULED_ARRIVAL < 1400: return 'Around noon'
    elif SCHEDULED_ARRIVAL >= 1400 and SCHEDULED_ARRIVAL < 1700: return 'Afternoon'
    elif SCHEDULED_ARRIVAL >= 1700 and SCHEDULED_ARRIVAL < 2000: return 'Evening'
    elif SCHEDULED_ARRIVAL >= 2000 and SCHEDULED_ARRIVAL < 2300: return 'Night'
    elif SCHEDULED_ARRIVAL >= 2300 or SCHEDULED_ARRIVAL < 200: return 'Late night'
    elif SCHEDULED_ARRIVAL >= 200 or SCHEDULED_ARRIVAL < 500: return 'Dawn'
#    else:
#        return 'Dawn'

# Apply the function to the Age column using the apply() function
flights_needed_data['ARRIVAL_TIME_SEGMENT'] = flights_needed_data['SCHEDULED_ARRIVAL'].apply(categorize_time)

In [32]:
flights_needed_data

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE__CODE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT_CODE,DESTINATION_AIRPORT_CODE,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,FLY_DATE,AIRLINE,ORIGIN_AIRPORT,ORIGIN_CITY,ORIGIN_STATE,ORIGIN_COUNTRY,ORIGIN_LATITUDE,ORIGIN_LONGITUDE,DEST_AIRPORT,DEST_CITY,DEST_STATE,DEST_COUNTRY,DEST_LATITUDE,DEST_LONGITUDE,ARRIVAL_TIME_SEGMENT
1,2024,12,17,4,DL,1799,N603AT,CHA,ATL,549,552.0,3.0,15.0,607.0,56.0,48.0,25.0,106,632.0,8.0,645,640.0,-5.0,0,0,,,,,,,2024-12-17,Delta Air Lines Inc.,Chattanooga Metropolitan Airport (Lovell Field),Chattanooga,TN,USA,35.03527,-85.20379,Hartsfield-Jackson Atlanta International Airport,Atlanta,GA,USA,33.64044,-84.42694,Early morning
6,2024,12,17,4,DL,1480,N319US,BDL,DTW,550,549.0,-1.0,20.0,609.0,120.0,129.0,104.0,549,753.0,5.0,750,758.0,8.0,0,0,,,,,,,2024-12-17,Delta Air Lines Inc.,Bradley International Airport,Windsor Locks,CT,USA,41.93887,-72.68323,Detroit Metropolitan Airport,Detroit,MI,USA,42.21206,-83.34884,Early morning
7,2024,12,17,4,DL,1952,N939AT,DSM,ATL,550,549.0,-1.0,15.0,604.0,131.0,141.0,110.0,743,854.0,16.0,901,910.0,9.0,0,0,,,,,,,2024-12-17,Delta Air Lines Inc.,Des Moines International Airport,Des Moines,IA,USA,41.53493,-93.66068,Hartsfield-Jackson Atlanta International Airport,Atlanta,GA,USA,33.64044,-84.42694,Late morning
14,2024,12,17,4,WN,558,N263WN,CMH,BWI,550,548.0,-2.0,10.0,558.0,80.0,68.0,53.0,337,651.0,5.0,710,656.0,-14.0,0,0,,,,,,,2024-12-17,Southwest Airlines Co.,Port Columbus International Airport,Columbus,OH,USA,39.99799,-82.89188,Baltimore-Washington International Airport,Baltimore,MD,USA,39.17540,-76.66820,Early morning
15,2024,12,17,4,WN,2360,N7720F,ABQ,MDW,550,557.0,7.0,16.0,613.0,165.0,153.0,132.0,1121,925.0,5.0,935,930.0,-5.0,0,0,,,,,,,2024-12-17,Southwest Airlines Co.,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919,Chicago Midway International Airport,Chicago,IL,USA,41.78598,-87.75242,Late morning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5819074,2024,9,21,1,WN,714,N515SW,DAL,AUS,1820,1817.0,-3.0,9.0,1826.0,50.0,44.0,31.0,189,1857.0,4.0,1910,1901.0,-9.0,0,0,,,,,,,2024-09-21,Southwest Airlines Co.,Dallas Love Field,Dallas,TX,USA,32.84711,-96.85177,Austin-Bergstrom International Airport,Austin,TX,USA,30.19453,-97.66987,Evening
5819075,2024,9,21,1,WN,2364,N701GS,DAL,BWI,1820,1826.0,6.0,11.0,1837.0,180.0,187.0,147.0,1209,2204.0,29.0,2220,2233.0,13.0,0,0,,,,,,,2024-09-21,Southwest Airlines Co.,Dallas Love Field,Dallas,TX,USA,32.84711,-96.85177,Baltimore-Washington International Airport,Baltimore,MD,USA,39.17540,-76.66820,Night
5819076,2024,9,21,1,WN,41,N257WN,HOU,SAT,1820,1817.0,-3.0,6.0,1823.0,50.0,44.0,35.0,192,1858.0,3.0,1910,1901.0,-9.0,0,0,,,,,,,2024-09-21,Southwest Airlines Co.,William P. Hobby Airport,Houston,TX,USA,29.64542,-95.27889,San Antonio International Airport,San Antonio,TX,USA,29.53369,-98.46978,Evening
5819077,2024,9,21,1,WN,151,N480WN,IAD,LAS,1820,1825.0,5.0,16.0,1841.0,310.0,291.0,268.0,2065,2009.0,7.0,2030,2016.0,-14.0,0,0,,,,,,,2024-09-21,Southwest Airlines Co.,Washington Dulles International Airport,Chantilly,VA,USA,38.94453,-77.45581,McCarran International Airport,Las Vegas,NV,USA,36.08036,-115.15233,Night


In [33]:
flights_needed_data.value_counts('DIVERTED')  # will tell us the no. of flights which were diverted

DIVERTED
0    2132545
1       5191
Name: count, dtype: int64

In [34]:
flights_needed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2137736 entries, 1 to 5819078
Data columns (total 46 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   YEAR                      int16  
 1   MONTH                     int8   
 2   DAY                       int8   
 3   DAY_OF_WEEK               int8   
 4   AIRLINE__CODE             object 
 5   FLIGHT_NUMBER             int16  
 6   TAIL_NUMBER               object 
 7   ORIGIN_AIRPORT_CODE       object 
 8   DESTINATION_AIRPORT_CODE  object 
 9   SCHEDULED_DEPARTURE       int16  
 10  DEPARTURE_TIME            float64
 11  DEPARTURE_DELAY           float64
 12  TAXI_OUT                  float64
 13  WHEELS_OFF                float64
 14  SCHEDULED_TIME            float64
 15  ELAPSED_TIME              float64
 16  AIR_TIME                  float64
 17  DISTANCE                  int16  
 18  WHEELS_ON                 float64
 19  TAXI_IN                   float64
 20  SCHEDULED_ARRIVAL         int

In [9]:
# filtering out unnecessary columns
flights_needed_data=flights_needed_data.drop(['YEAR','FLIGHT_NUMBER','AIRLINE__CODE','TAIL_NUMBER','TAXI_OUT',
                                              'SCHEDULED_TIME','DEPARTURE_TIME','WHEELS_OFF','ELAPSED_TIME',
                                              'AIR_TIME','WHEELS_ON','DAY_OF_WEEK','TAXI_IN','CANCELLATION_REASON', 'DEST_LATITUDE',
                                              'DEST_LONGITUDE','ORIGIN_LATITUDE', 'ORIGIN_LONGITUDE' ],
                                             axis=1)
# REMOVED DISTANCE FROM THIS LIST

In [10]:
flights_needed_data.isnull().sum()

MONTH                           0
DAY                             0
ORIGIN_AIRPORT_CODE             0
DESTINATION_AIRPORT_CODE        0
SCHEDULED_DEPARTURE             0
DEPARTURE_DELAY              2756
SCHEDULED_ARRIVAL               0
ARRIVAL_TIME                 2988
ARRIVAL_DELAY                3386
DIVERTED                        0
CANCELLED                       0
AIR_SYSTEM_DELAY            73676
SECURITY_DELAY              73676
AIRLINE_DELAY               73676
LATE_AIRCRAFT_DELAY         73676
WEATHER_DELAY               73676
dtype: int64

In [11]:
flights_needed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   MONTH                     100000 non-null  int8   
 1   DAY                       100000 non-null  int8   
 2   ORIGIN_AIRPORT_CODE       100000 non-null  object 
 3   DESTINATION_AIRPORT_CODE  100000 non-null  object 
 4   SCHEDULED_DEPARTURE       100000 non-null  int16  
 5   DEPARTURE_DELAY           97244 non-null   float64
 6   SCHEDULED_ARRIVAL         100000 non-null  int16  
 7   ARRIVAL_TIME              97012 non-null   float64
 8   ARRIVAL_DELAY             96614 non-null   float64
 9   DIVERTED                  100000 non-null  int8   
 10  CANCELLED                 100000 non-null  int8   
 11  AIR_SYSTEM_DELAY          26324 non-null   float64
 12  SECURITY_DELAY            26324 non-null   float64
 13  AIRLINE_DELAY             26324 non-null   fl

In [12]:
#flights_needed_data = flights_needed_data.apply(lambda x: x.fillna(x.mean()), axis=0)

cols = ['DEPARTURE_DELAY','ARRIVAL_TIME','ARRIVAL_DELAY','AIR_SYSTEM_DELAY','SECURITY_DELAY','AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY']

for column in cols:
    print (column)
    flights_needed_data[column].fillna(flights_needed_data[column].mean(), inplace=True)

DEPARTURE_DELAY
ARRIVAL_TIME
ARRIVAL_DELAY
AIR_SYSTEM_DELAY
SECURITY_DELAY
AIRLINE_DELAY
LATE_AIRCRAFT_DELAY
WEATHER_DELAY


In [13]:
# replacing all NaN values with the mean of the attribute in which they are present
#flights_needed_data=flights_needed_data.fillna(flights_needed_data.mean())

In [14]:
flights_needed_data.head()

Unnamed: 0,MONTH,DAY,ORIGIN_AIRPORT_CODE,DESTINATION_AIRPORT_CODE,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,6,11,CVG,ORD,747,-5.0,803,753.0,-10.0,0,0,15.556374,0.055501,18.88596,26.817391,2.735185
1,6,11,RAP,DFW,748,-8.0,1109,1051.0,-18.0,0,0,15.556374,0.055501,18.88596,26.817391,2.735185
2,6,11,ORD,FNT,749,26.0,955,1019.0,24.0,0,0,0.0,0.0,24.0,0.0,0.0
3,6,11,SEA,IAH,749,6.0,1405,1402.0,-3.0,0,0,15.556374,0.055501,18.88596,26.817391,2.735185
4,6,11,SEA,ORD,750,6.0,1352,1351.0,-1.0,0,0,15.556374,0.055501,18.88596,26.817391,2.735185


In [15]:
# creating a new column; it will tell if the flight was delayed or not
result=[]

# Create Target/Result column for Classifier

In [16]:
for row in flights_needed_data['ARRIVAL_DELAY']:
  if row > 15:
    result.append(1)
  else:
    result.append(0) 

In [17]:
flights_needed_data['result'] = result

In [18]:
flights_needed_data.value_counts('result')

result
0    74524
1    25476
Name: count, dtype: int64

In [19]:
# removing some more columns
flights_needed_data=flights_needed_data.drop(['ORIGIN_AIRPORT_CODE', 'DESTINATION_AIRPORT_CODE', 'ARRIVAL_TIME', 'ARRIVAL_DELAY'],axis=1)
flights_needed_data

Unnamed: 0,MONTH,DAY,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_ARRIVAL,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,result
0,6,11,747,-5.0,803,0,0,15.556374,0.055501,18.88596,26.817391,2.735185,0
1,6,11,748,-8.0,1109,0,0,15.556374,0.055501,18.88596,26.817391,2.735185,0
2,6,11,749,26.0,955,0,0,0.000000,0.000000,24.00000,0.000000,0.000000,1
3,6,11,749,6.0,1405,0,0,15.556374,0.055501,18.88596,26.817391,2.735185,0
4,6,11,750,6.0,1352,0,0,15.556374,0.055501,18.88596,26.817391,2.735185,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,6,17,700,81.0,810,0,0,0.000000,0.000000,81.00000,0.000000,0.000000,1
99996,6,17,700,-7.0,732,0,0,15.556374,0.055501,18.88596,26.817391,2.735185,0
99997,6,17,700,8.0,1019,0,0,15.556374,0.055501,18.88596,26.817391,2.735185,0
99998,6,17,700,29.0,852,0,0,63.000000,0.000000,0.00000,10.000000,0.000000,1


# Train test split

In [20]:
data = flights_needed_data.values
X, y = data[:,:-1], data[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)  # splitting in the ratio 70:30

In [21]:
flights_needed_data.columns

Index(['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY',
       'SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED', 'AIR_SYSTEM_DELAY',
       'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY',
       'WEATHER_DELAY', 'result'],
      dtype='object')

# Scaling Input Feature

In [22]:
sc = StandardScaler()
X_train_1 = sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Model training and Prediction

In [23]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)

In [24]:
y_prob = clf.predict_proba(X_test)[:,1]

In [25]:
y_pred = clf.predict(X_test)

In [26]:
auc_score = roc_auc_score(y_test, y_pred)
auc_score

0.9972391962901147

In [27]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     22426
         1.0       1.00      1.00      1.00      7574

    accuracy                           1.00     30000
   macro avg       1.00      1.00      1.00     30000
weighted avg       1.00      1.00      1.00     30000



In [28]:
flights_needed_data.shape

(100000, 13)

In [29]:
flights_needed_df = flights_needed_data.drop(['result'],axis=1)

In [30]:
final_data = flights_needed_df.values

In [31]:
final_data = sc.transform(final_data)

In [32]:
y_pred = clf.predict(final_data)

In [33]:
y_prob = clf.predict_proba(final_data)[:,1]

In [34]:
len(y_pred)

100000

In [35]:
clf.feature_importances_

array([0.00000000e+00, 4.22499242e-04, 1.38155940e-03, 1.25888649e-02,
       1.12057210e-03, 0.00000000e+00, 0.00000000e+00, 9.43116643e-03,
       9.56779678e-01, 9.95294628e-03, 5.95440359e-03, 2.36830981e-03])

In [37]:
type(X_train),type(X_test),type(y_train),type(y_test),type(y_pred),type(y_prob)

(numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray)

In [43]:
column = ['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY','SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED', 
          'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY','WEATHER_DELAY']

In [44]:
X_train_df = pd.DataFrame(X_train, columns=column)
X_test_df = pd.DataFrame(X_test, columns=column)
y_train_df = pd.DataFrame(y_train, columns=['ACTUAL_DELAY'])
y_test_df = pd.DataFrame(y_test, columns=['ACTUAL_DELAY'])
y_pred_df = pd.DataFrame(y_pred, columns=['PREDICTED_DELAY'])
y_prob_df = pd.DataFrame(y_prob, columns=['PROBABILITY'])

# Model Registration

In [45]:
## registering the model in Fosfor Insight Designer.
register_model(
    model_obj=clf, 
    session=my_session,
    x_train=X_train_df,
    y_train=y_train_df,
    x_test=X_test_df,
    y_test=y_test_df,
    y_pred=y_pred_df,
    y_prob=y_prob_df,
    source="Notebook",
    dataset_name="FLIGHTS",
    dataset_source="Snowflake",
    #dataset_source="InMemory",
    name="Decision_Tree_Delay_Classifier",
    description="Decision tree model trained via Notebook to identify Flight Delay application",
    flavour="sklearn",
    model_type="classification",
    conda_dependencies=["scikit-learn==1.3.2"]
)

The version of package 'numpy' in the local environment is 1.23.0, which does not fit the criteria for the requirement 'numpy'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'numpy' in the local environment is 1.23.0, which does not fit the criteria for the requirement 'numpy'. Your UDF might not work when the package version is different between the server and your local environment.


Calculating build time metrics

Progress: ██████████████                                                         20.0%
Calculating build time metrics

Progress: ████████████████████████████                                           40.0%


The version of package 'numpy' in the local environment is 1.23.0, which does not fit the criteria for the requirement 'numpy'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn<1.4'. Your UDF might not work when the package version is different between the server and your local environment.
DataFrame.flatten() is deprecated since 0.7.0. Use `DataFrame.join_table_function()` instead.


(1300) (1304): 01b7ad6e-0411-c7b0-0072-f30312a21086: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/snowflake/ml/modeling/metrics/classification.py", line 1059, in end_partition
  File "/usr/lib/python_udf/14494b11cebb767d68275d470153ae1f7c3c0eb2b64723dc58cc329db2f80c38/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/14494b11cebb767d68275d470153ae1f7c3c0eb2b64723dc58cc329db2f80c38/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 505, in multilabel_confusion_matrix
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/usr/lib/python_udf/14494b11cebb767d68275d470153ae1f7c3c0eb2b64723dc58cc329db2f80c38/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 93, in _check_targets
    raise ValueError(
ValueError: Classification metrics can't handle a mix of unknown an

The version of package 'numpy' in the local environment is 1.23.0, which does not fit the criteria for the requirement 'numpy'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn<1.4'. Your UDF might not work when the package version is different between the server and your local environment.


(1300) (1304): 01b7ad6e-0411-c7b0-0072-f30312a210a6: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/snowflake/ml/modeling/metrics/classification.py", line 1059, in end_partition
  File "/usr/lib/python_udf/14494b11cebb767d68275d470153ae1f7c3c0eb2b64723dc58cc329db2f80c38/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/14494b11cebb767d68275d470153ae1f7c3c0eb2b64723dc58cc329db2f80c38/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 505, in multilabel_confusion_matrix
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/usr/lib/python_udf/14494b11cebb767d68275d470153ae1f7c3c0eb2b64723dc58cc329db2f80c38/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 93, in _check_targets
    raise ValueError(
ValueError: Classification metrics can't handle a mix of unknown an

The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn==1.3.*'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'snowflake-snowpark-python' in the local environment is 1.23.0, which does not fit the criteria for the requirement 'snowflake-snowpark-python'. Your UDF might not work when the package version is different between the server and your local environment.


Error in while calculating roc_auc: SnowparkSQLException('(1300) (1304): 01b7ad6e-0411-c7b0-0072-f30312a210ae: 100357 (P0000): Python Interpreter Error:\nTraceback (most recent call last):\n  File "/home/udf/32355341556637422/udf_py_1665680387.zip/udf_py_1665680387.py", line 56, in compute\n    return func(session)\n  File "/opt/conda/lib/python3.9/site-packages/snowflake/ml/modeling/metrics/ranking.py", line 263, in roc_auc_score_anon_sproc\n  File "/usr/lib/python_udf/663e987bb39f7f448dff8c02c1a4f15e961e0d191f349c610a304845487fbda7/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper\n    return func(*args, **kwargs)\n  File "/usr/lib/python_udf/663e987bb39f7f448dff8c02c1a4f15e961e0d191f349c610a304845487fbda7/lib/python3.9/site-packages/sklearn/metrics/_ranking.py", line 603, in roc_auc_score\n    y_type = type_of_target(y_true, input_name="y_true")\n  File "/usr/lib/python_udf/663e987bb39f7f448dff8c02c1a4f15e961e0d191f349c610a304845487fbda7/lib/pytho

"Model 'MODEL_4FD7D083_2A05_47E2_A1ED_023104009A08_FDC_DECISION_TREE_DELAY_CLASSIFIER' registered successfully."

In [58]:
y_train_df

Unnamed: 0,ACTUAL_DELAY
0,0.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
69995,0.0
69996,0.0
69997,0.0
69998,0.0


# Push Model Input on Snowflake

In [67]:
data_df = pd.concat([X_train_df,y_train_df],axis=1,ignore_index=True)

In [68]:
data_df.columns = ['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY','SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED', 
          'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY','WEATHER_DELAY','FLIGHT_DELAY']

In [69]:
data_df.head()

Unnamed: 0,MONTH,DAY,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_ARRIVAL,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,FLIGHT_DELAY
0,0.0,0.775843,0.804729,-0.322289,0.800724,-0.069137,-0.172027,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0
1,0.0,0.20501,1.037545,0.469695,0.982003,-0.069137,-0.172027,-0.860071,-0.067793,0.256177,-1.105922,-0.256964,1.0
2,0.0,-0.936654,0.776386,-0.410287,0.66715,-0.069137,-0.172027,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0
3,0.0,0.20501,-0.258126,-0.542284,0.025995,-0.069137,-0.172027,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0
4,0.0,0.775843,-1.300737,-0.002698,-1.288753,-0.069137,5.813024,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0


In [75]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   MONTH                70000 non-null  float64
 1   DAY                  70000 non-null  float64
 2   SCHEDULED_DEPARTURE  70000 non-null  float64
 3   DEPARTURE_DELAY      70000 non-null  float64
 4   SCHEDULED_ARRIVAL    70000 non-null  float64
 5   DIVERTED             70000 non-null  float64
 6   CANCELLED            70000 non-null  float64
 7   AIR_SYSTEM_DELAY     70000 non-null  float64
 8   SECURITY_DELAY       70000 non-null  float64
 9   AIRLINE_DELAY        70000 non-null  float64
 10  LATE_AIRCRAFT_DELAY  70000 non-null  float64
 11  WEATHER_DELAY        70000 non-null  float64
 12  FLIGHT_DELAY         70000 non-null  object 
dtypes: float64(12), object(1)
memory usage: 6.9+ MB


In [74]:
data_df['FLIGHT_DELAY'] = data_df['FLIGHT_DELAY'].astype('str')

In [76]:
data_df.head()

Unnamed: 0,MONTH,DAY,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_ARRIVAL,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,FLIGHT_DELAY
0,0.0,0.775843,0.804729,-0.322289,0.800724,-0.069137,-0.172027,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0
1,0.0,0.20501,1.037545,0.469695,0.982003,-0.069137,-0.172027,-0.860071,-0.067793,0.256177,-1.105922,-0.256964,1.0
2,0.0,-0.936654,0.776386,-0.410287,0.66715,-0.069137,-0.172027,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0
3,0.0,0.20501,-0.258126,-0.542284,0.025995,-0.069137,-0.172027,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0
4,0.0,0.775843,-1.300737,-0.002698,-1.288753,-0.069137,5.813024,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0


In [78]:
ins_train_sf=my_session.createDataFrame(
        data_df.values.tolist(),
        schema=data_df.columns.tolist())
ins_train_sf.write.mode("overwrite").save_as_table("TTH_DB.TTH_AIRLINE_SCHEMA.EXPERIMENT_INSIGHT_INPUT")

# Push Model Output on snowflake

In [84]:
test_df = pd.concat([X_test_df,y_test_df,y_pred_df, y_prob_df],axis=1,ignore_index=True)

In [86]:
test_df.dropna(inplace=True)

In [87]:
test_df.shape

(30000, 15)

In [88]:
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,0.775843,0.640746,8.74152,1.029708,-0.069137,-0.172027,-0.860071,-0.067793,-0.792759,12.724268,4.8116,1.0,0.0,0.0
1,0.0,0.775843,1.634769,0.997683,-2.595869,-0.069137,-0.172027,-0.860071,-0.067793,-0.582971,0.735249,-0.256964,1.0,0.0,0.0
2,0.0,-0.936654,-0.683269,-0.366288,-0.735376,-0.069137,-0.172027,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0,1.0,1.0
3,0.0,0.775843,0.824974,-0.366288,0.991544,-0.069137,-0.172027,-0.001829,-0.000646,-0.000352,-0.000475,-0.000234,0.0,0.0,0.0
4,0.0,1.346675,0.72375,0.535693,0.800724,-0.069137,-0.172027,0.408834,-0.067793,-0.792759,-0.549289,-0.256964,1.0,0.0,0.0


In [89]:
test_df.columns = ['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY','SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED', 
          'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY','WEATHER_DELAY','FLIGHT_DELAY','PREDICTED_DELAY','PROBABILITY']

In [90]:
X1_df = test_df.head(15000)
X2_df = test_df.tail(15000)

In [91]:
ins_train_sf=my_session.createDataFrame(
        X1_df.values.tolist(),
        schema=X1_df.columns.tolist())
ins_train_sf.write.mode("overwrite").save_as_table("TTH_DB.TTH_AIRLINE_SCHEMA.DELAY_CLASSIFIER_OUTPUT_1")

In [92]:
ins_train_sf=my_session.createDataFrame(
        X2_df.values.tolist(),
        schema=X2_df.columns.tolist())
ins_train_sf.write.mode("overwrite").save_as_table("TTH_DB.TTH_AIRLINE_SCHEMA.DELAY_CLASSIFIER_OUTPUT_2")