# Use TTH_Template Custom Notebook template

In [1]:
import datetime, warnings, scipy 
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb

from sklearn.model_selection import train_test_split
# Importing Scikit-learn's preprocessing utilities for encoding categorical variables and scaling numerical data
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, classification_report

pd.options.display.max_columns = 50
warnings.filterwarnings("ignore")

from fosforml.model_manager.snowflakesession import get_session
from fosforml import register_model

In [2]:
## for no font error:

# Set Matplotlib's default font family to 'DeJavu Serif' to ensure a consistent font style across plots
plt.rcParams['font.family'] = 'DeJavu Serif'

# Read data from Snowflake

In [3]:
my_session = get_session()

In [4]:
## table_name = 'FLIGHTS'
table_name = 'FLIGHTS_FULL'

sf_df = my_session.sql("select * from {}".format(table_name))
df = sf_df.to_pandas()

In [5]:
## FILTERING DATA for just 2 airlines

options = ['Southwest Airlines Co.', 'Delta Air Lines Inc.'] 
  
# selecting rows based on condition 
flights = df.loc[df['AIRLINE'].isin(options)] 

## check
## flights['AIRLINE'].unique()

In [6]:
## flights = df.copy()
flights_needed_data = flights.copy()

In [7]:
flights_needed_data.shape

## OLD VALUE: (5819079, 31)

(2137736, 45)

In [8]:
flights_needed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2137736 entries, 2 to 5819067
Data columns (total 45 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   YEAR                      int16  
 1   MONTH                     int8   
 2   DAY                       int8   
 3   DAY_OF_WEEK               int8   
 4   AIRLINE__CODE             object 
 5   FLIGHT_NUMBER             int16  
 6   TAIL_NUMBER               object 
 7   ORIGIN_AIRPORT_CODE       object 
 8   DESTINATION_AIRPORT_CODE  object 
 9   SCHEDULED_DEPARTURE       int16  
 10  DEPARTURE_TIME            float64
 11  DEPARTURE_DELAY           float64
 12  TAXI_OUT                  float64
 13  WHEELS_OFF                float64
 14  SCHEDULED_TIME            float64
 15  ELAPSED_TIME              float64
 16  AIR_TIME                  float64
 17  DISTANCE                  int16  
 18  WHEELS_ON                 float64
 19  TAXI_IN                   float64
 20  SCHEDULED_ARRIVAL         int

In [9]:
flights_needed_data.head

<bound method NDFrame.head of          YEAR  MONTH  DAY  DAY_OF_WEEK AIRLINE__CODE  FLIGHT_NUMBER  \
2        2024      7    9            4            DL            766   
9        2024      7    9            4            DL           1369   
30       2024      7    9            4            DL           1277   
31       2024      7    9            4            DL           1057   
32       2024      7    9            4            DL           2483   
...       ...    ...  ...          ...           ...            ...   
5819058  2024      2    1            7            DL           1412   
5819064  2024      2    1            7            DL           1367   
5819065  2024      2    1            7            DL           1767   
5819066  2024      2    1            7            DL           2047   
5819067  2024      2    1            7            DL           2600   

        TAIL_NUMBER ORIGIN_AIRPORT_CODE DESTINATION_AIRPORT_CODE  \
2            N752AT                 DTW          

In [10]:
## flights_needed_data = df.loc[(df['fly_date'] <= '2024-10-31')]
## flights_needed_data = flights[0:100000]  # getting a segment 

In [11]:
def categorize_time(SCHEDULED_ARRIVAL):
    if SCHEDULED_ARRIVAL >= 500 and SCHEDULED_ARRIVAL< 800: return 'Early morning'
    elif SCHEDULED_ARRIVAL >= 800 and SCHEDULED_ARRIVAL < 1100: return 'Late morning'
    elif SCHEDULED_ARRIVAL >= 1100 and SCHEDULED_ARRIVAL < 1400: return 'Around noon'
    elif SCHEDULED_ARRIVAL >= 1400 and SCHEDULED_ARRIVAL < 1700: return 'Afternoon'
    elif SCHEDULED_ARRIVAL >= 1700 and SCHEDULED_ARRIVAL < 2000: return 'Evening'
    elif SCHEDULED_ARRIVAL >= 2000 and SCHEDULED_ARRIVAL < 2300: return 'Night'
    elif SCHEDULED_ARRIVAL >= 2300 or SCHEDULED_ARRIVAL < 200: return 'Late night'
    elif SCHEDULED_ARRIVAL >= 200 or SCHEDULED_ARRIVAL < 500: return 'Dawn'
#    else:
#        return 'Dawn'

# Apply the function to the Age column using the apply() function
flights_needed_data['ARRIVAL_TIME_SEGMENT'] = flights_needed_data['SCHEDULED_ARRIVAL'].apply(categorize_time)

In [12]:
flights_needed_data

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE__CODE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT_CODE,DESTINATION_AIRPORT_CODE,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,FLY_DATE,AIRLINE,ORIGIN_AIRPORT,ORIGIN_CITY,ORIGIN_STATE,ORIGIN_COUNTRY,ORIGIN_LATITUDE,ORIGIN_LONGITUDE,DEST_AIRPORT,DEST_CITY,DEST_STATE,DEST_COUNTRY,DEST_LATITUDE,DEST_LONGITUDE,ARRIVAL_TIME_SEGMENT
2,2024,7,9,4,DL,766,N752AT,DTW,SAN,1552,1638.0,46.0,18.0,1656.0,278.0,275.0,253.0,1956,1809.0,4.0,1730,1813.0,43.0,0,0,,0.0,0.0,43.0,0.0,0.0,2024-07-09,Delta Air Lines Inc.,Detroit Metropolitan Airport,Detroit,MI,USA,42.21206,-83.34884,San Diego International Airport (Lindbergh Field),San Diego,CA,USA,32.73356,-117.18966,Evening
9,2024,7,9,4,DL,1369,N968AT,DTW,MCI,1553,1727.0,94.0,17.0,1744.0,118.0,113.0,91.0,629,1815.0,5.0,1651,1820.0,89.0,0,0,,0.0,0.0,89.0,0.0,0.0,2024-07-09,Delta Air Lines Inc.,Detroit Metropolitan Airport,Detroit,MI,USA,42.21206,-83.34884,Kansas City International Airport,Kansas City,MO,USA,39.29761,-94.71391,Afternoon
30,2024,7,9,4,DL,1277,N980DL,ATL,MSY,1555,1600.0,5.0,18.0,1618.0,98.0,86.0,59.0,425,1617.0,9.0,1633,1626.0,-7.0,0,0,,,,,,,2024-07-09,Delta Air Lines Inc.,Hartsfield-Jackson Atlanta International Airport,Atlanta,GA,USA,33.64044,-84.42694,Louis Armstrong New Orleans International Airport,New Orleans,LA,USA,29.99339,-90.25803,Afternoon
31,2024,7,9,4,DL,1057,N906DE,DTW,CLT,1555,1549.0,-6.0,24.0,1613.0,109.0,106.0,73.0,500,1726.0,9.0,1744,1735.0,-9.0,0,0,,,,,,,2024-07-09,Delta Air Lines Inc.,Detroit Metropolitan Airport,Detroit,MI,USA,42.21206,-83.34884,Charlotte Douglas International Airport,Charlotte,NC,USA,35.21401,-80.94313,Evening
32,2024,7,9,4,DL,2483,N953DL,ATL,OKC,1555,1553.0,-2.0,23.0,1616.0,138.0,128.0,100.0,761,1656.0,5.0,1713,1701.0,-12.0,0,0,,,,,,,2024-07-09,Delta Air Lines Inc.,Hartsfield-Jackson Atlanta International Airport,Atlanta,GA,USA,33.64044,-84.42694,Will Rogers World Airport,Oklahoma City,OK,USA,35.39309,-97.60073,Evening
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5819058,2024,2,1,7,DL,1412,N932DL,BNA,ATL,1158,1154.0,-4.0,29.0,1223.0,78.0,74.0,35.0,214,1358.0,10.0,1416,1408.0,-8.0,0,0,,,,,,,2024-02-01,Delta Air Lines Inc.,Nashville International Airport,Nashville,TN,USA,36.12448,-86.67818,Hartsfield-Jackson Atlanta International Airport,Atlanta,GA,USA,33.64044,-84.42694,Afternoon
5819064,2024,2,1,7,DL,1367,N920DN,ROC,ATL,1159,1148.0,-11.0,42.0,1230.0,145.0,169.0,119.0,749,1429.0,8.0,1424,1437.0,13.0,0,0,,,,,,,2024-02-01,Delta Air Lines Inc.,Greater Rochester International Airport,Rochester,NY,USA,43.11887,-77.67238,Hartsfield-Jackson Atlanta International Airport,Atlanta,GA,USA,33.64044,-84.42694,Afternoon
5819065,2024,2,1,7,DL,1767,N967DL,FNT,ATL,1159,1155.0,-4.0,29.0,1224.0,137.0,146.0,109.0,645,1413.0,8.0,1416,1421.0,5.0,0,0,,,,,,,2024-02-01,Delta Air Lines Inc.,Bishop International Airport,Flint,MI,USA,42.96550,-83.74346,Hartsfield-Jackson Atlanta International Airport,Atlanta,GA,USA,33.64044,-84.42694,Afternoon
5819066,2024,2,1,7,DL,2047,N554NW,LGA,ATL,1159,1216.0,17.0,32.0,1248.0,171.0,163.0,123.0,762,1451.0,8.0,1450,1459.0,9.0,0,0,,,,,,,2024-02-01,Delta Air Lines Inc.,LaGuardia Airport (Marine Air Terminal),New York,NY,USA,40.77724,-73.87261,Hartsfield-Jackson Atlanta International Airport,Atlanta,GA,USA,33.64044,-84.42694,Afternoon


In [13]:
 flights['AIRLINE__CODE'].unique()

array(['DL', 'WN'], dtype=object)

In [14]:
flights_needed_data.value_counts('DIVERTED')  # will tell us the no. of flights which were diverted

DIVERTED
0    2132545
1       5191
Name: count, dtype: int64

In [15]:
flights_needed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2137736 entries, 2 to 5819067
Data columns (total 46 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   YEAR                      int16  
 1   MONTH                     int8   
 2   DAY                       int8   
 3   DAY_OF_WEEK               int8   
 4   AIRLINE__CODE             object 
 5   FLIGHT_NUMBER             int16  
 6   TAIL_NUMBER               object 
 7   ORIGIN_AIRPORT_CODE       object 
 8   DESTINATION_AIRPORT_CODE  object 
 9   SCHEDULED_DEPARTURE       int16  
 10  DEPARTURE_TIME            float64
 11  DEPARTURE_DELAY           float64
 12  TAXI_OUT                  float64
 13  WHEELS_OFF                float64
 14  SCHEDULED_TIME            float64
 15  ELAPSED_TIME              float64
 16  AIR_TIME                  float64
 17  DISTANCE                  int16  
 18  WHEELS_ON                 float64
 19  TAXI_IN                   float64
 20  SCHEDULED_ARRIVAL         int

In [16]:
# filtering out unnecessary columns
flights_needed_data=flights_needed_data.drop(['YEAR','FLIGHT_NUMBER','AIRLINE__CODE','TAIL_NUMBER','TAXI_OUT',
                                              'SCHEDULED_TIME','WHEELS_OFF','ELAPSED_TIME',
                                              'AIR_TIME','WHEELS_ON','DAY_OF_WEEK','TAXI_IN','CANCELLATION_REASON', 'DEST_LATITUDE',
                                              'DEST_LONGITUDE','ORIGIN_LATITUDE', 'ORIGIN_LONGITUDE','ORIGIN_STATE', 'ORIGIN_COUNTRY', 
                                              'DEST_AIRPORT','DEST_CITY','DEST_STATE','DEST_COUNTRY', 'ORIGIN_CITY', 'ORIGIN_AIRPORT' ],
                                             axis=1)
# REMOVED DISTANCE FROM THIS LIST

In [17]:
## flights_needed_data=flights_needed_data.drop(['ORIGIN_AIRPORT' ],
#                                             axis=1)

In [18]:
flights_needed_data.isnull().sum()

MONTH                             0
DAY                               0
ORIGIN_AIRPORT_CODE               0
DESTINATION_AIRPORT_CODE          0
SCHEDULED_DEPARTURE               0
DEPARTURE_TIME                19430
DEPARTURE_DELAY               19430
DISTANCE                          0
SCHEDULED_ARRIVAL                 0
ARRIVAL_TIME                  20737
ARRIVAL_DELAY                 25058
DIVERTED                          0
CANCELLED                         0
AIR_SYSTEM_DELAY            1783087
SECURITY_DELAY              1783087
AIRLINE_DELAY               1783087
LATE_AIRCRAFT_DELAY         1783087
WEATHER_DELAY               1783087
FLY_DATE                          0
AIRLINE                           0
ARRIVAL_TIME_SEGMENT              0
dtype: int64

In [19]:
flights_needed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2137736 entries, 2 to 5819067
Data columns (total 21 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   MONTH                     int8   
 1   DAY                       int8   
 2   ORIGIN_AIRPORT_CODE       object 
 3   DESTINATION_AIRPORT_CODE  object 
 4   SCHEDULED_DEPARTURE       int16  
 5   DEPARTURE_TIME            float64
 6   DEPARTURE_DELAY           float64
 7   DISTANCE                  int16  
 8   SCHEDULED_ARRIVAL         int16  
 9   ARRIVAL_TIME              float64
 10  ARRIVAL_DELAY             float64
 11  DIVERTED                  int8   
 12  CANCELLED                 int8   
 13  AIR_SYSTEM_DELAY          float64
 14  SECURITY_DELAY            float64
 15  AIRLINE_DELAY             float64
 16  LATE_AIRCRAFT_DELAY       float64
 17  WEATHER_DELAY             float64
 18  FLY_DATE                  object 
 19  AIRLINE                   object 
 20  ARRIVAL_TIME_SEGMENT      obj

## DO NOT RUN THIS CODE
#flights_needed_data = flights_needed_data.apply(lambda x: x.fillna(x.mean()), axis=0)

cols = ['DEPARTURE_DELAY','ARRIVAL_DELAY','AIR_SYSTEM_DELAY',
        'SECURITY_DELAY','AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY']
cols1 = ['DEPARTURE_TIME']
cols2 = ['ARRIVAL_TIME']
#cols = ['DEPARTURE_TIME','DEPARTURE_DELAY','ARRIVAL_TIME','ARRIVAL_DELAY','AIR_SYSTEM_DELAY',
#        'SECURITY_DELAY','AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY']

for column in cols1:
    print (column)
    flights_needed_data[column].fillna(flights_needed_data[['SCHEDULED_DEPARTURE']]), inplace=True)

for column in cols2:
    print (column)
    flights_needed_data[column].fillna(flights_needed_data[['SCHEDULED_ARRIVAL']]), inplace=True)
    
for column in cols:
    print (column)
    flights_needed_data[column].fillna(flights_needed_data[column].mean(), inplace=True)

In [20]:
#flights_needed_data = flights_needed_data.apply(lambda x: x.fillna(x.mean()), axis=0)

cols = ['DEPARTURE_TIME', 'ARRIVAL_TIME','DEPARTURE_DELAY','ARRIVAL_DELAY','AIR_SYSTEM_DELAY',
        'SECURITY_DELAY','AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY']

for column in cols:
    print (column)
    flights_needed_data[column].fillna(flights_needed_data[column].mean(), inplace=True)

DEPARTURE_TIME
ARRIVAL_TIME
DEPARTURE_DELAY
ARRIVAL_DELAY
AIR_SYSTEM_DELAY
SECURITY_DELAY
AIRLINE_DELAY
LATE_AIRCRAFT_DELAY
WEATHER_DELAY


In [21]:
# replacing all NaN values with the mean of the attribute in which they are present
#flights_needed_data=flights_needed_data.fillna(flights_needed_data.mean())

In [22]:
flights_needed_data.head()

Unnamed: 0,MONTH,DAY,ORIGIN_AIRPORT_CODE,DESTINATION_AIRPORT_CODE,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,FLY_DATE,AIRLINE,ARRIVAL_TIME_SEGMENT
2,7,9,DTW,SAN,1552,1638.0,46.0,1956,1730,1813.0,43.0,0,0,0.0,0.0,43.0,0.0,0.0,2024-07-09,Delta Air Lines Inc.,Evening
9,7,9,DTW,MCI,1553,1727.0,94.0,629,1651,1820.0,89.0,0,0,0.0,0.0,89.0,0.0,0.0,2024-07-09,Delta Air Lines Inc.,Afternoon
30,7,9,ATL,MSY,1555,1600.0,5.0,425,1633,1626.0,-7.0,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-07-09,Delta Air Lines Inc.,Afternoon
31,7,9,DTW,CLT,1555,1549.0,-6.0,500,1744,1735.0,-9.0,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-07-09,Delta Air Lines Inc.,Evening
32,7,9,ATL,OKC,1555,1553.0,-2.0,761,1713,1701.0,-12.0,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-07-09,Delta Air Lines Inc.,Evening


In [23]:
# creating a new column; it will tell if the flight was delayed or not
result=[]

# Create Target/Result column for Classifier

In [24]:
for row in flights_needed_data['ARRIVAL_DELAY']:
  if row > 5:
    result.append(1)
  else:
    result.append(0) 

In [25]:
flights_needed_data['result'] = result

In [26]:
flights_needed_data.value_counts('result')

result
0    1595430
1     542306
Name: count, dtype: int64

In [27]:
# removing some more columns
flights_needed_data=flights_needed_data.drop(['ORIGIN_AIRPORT_CODE', 'DESTINATION_AIRPORT_CODE', 'ARRIVAL_TIME', 'ARRIVAL_DELAY'],axis=1)
flights_needed_data

Unnamed: 0,MONTH,DAY,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,DISTANCE,SCHEDULED_ARRIVAL,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,FLY_DATE,AIRLINE,ARRIVAL_TIME_SEGMENT,result
2,7,9,1552,1638.0,46.0,1956,1730,0,0,0.000000,0.000000,43.000000,0.000000,0.000000,2024-07-09,Delta Air Lines Inc.,Evening,1
9,7,9,1553,1727.0,94.0,629,1651,0,0,0.000000,0.000000,89.000000,0.000000,0.000000,2024-07-09,Delta Air Lines Inc.,Afternoon,1
30,7,9,1555,1600.0,5.0,425,1633,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-07-09,Delta Air Lines Inc.,Afternoon,0
31,7,9,1555,1549.0,-6.0,500,1744,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-07-09,Delta Air Lines Inc.,Evening,0
32,7,9,1555,1553.0,-2.0,761,1713,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-07-09,Delta Air Lines Inc.,Evening,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5819058,2,1,1158,1154.0,-4.0,214,1416,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-02-01,Delta Air Lines Inc.,Afternoon,0
5819064,2,1,1159,1148.0,-11.0,749,1424,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-02-01,Delta Air Lines Inc.,Afternoon,1
5819065,2,1,1159,1155.0,-4.0,645,1416,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-02-01,Delta Air Lines Inc.,Afternoon,0
5819066,2,1,1159,1216.0,17.0,762,1450,0,0,9.723927,0.044545,18.437779,23.825489,3.237765,2024-02-01,Delta Air Lines Inc.,Afternoon,1


In [28]:
flights_needed_data=flights_needed_data.drop(['FLY_DATE'], axis=1)

In [29]:
#Get list of categorical variables
s = (flights_needed_data.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables in the dataset:", object_cols)

Categorical variables in the dataset: ['AIRLINE', 'ARRIVAL_TIME_SEGMENT']


In [30]:
#Label Encoding the object dtypes.
LE=LabelEncoder()
for i in object_cols:
    flights_needed_data[i]=flights_needed_data[[i]].apply(LE.fit_transform)
    
print("All features are now numerical")

All features are now numerical


In [31]:
print(flights_needed_data.max(axis=0)) # will return max value of each column
print(flights_needed_data.min(axis=0)) # will return min value of each column

MONTH                     12.0
DAY                       31.0
SCHEDULED_DEPARTURE     2359.0
DEPARTURE_TIME          2400.0
DEPARTURE_DELAY         1289.0
DISTANCE                4983.0
SCHEDULED_ARRIVAL       2359.0
DIVERTED                   1.0
CANCELLED                  1.0
AIR_SYSTEM_DELAY         991.0
SECURITY_DELAY           440.0
AIRLINE_DELAY           1274.0
LATE_AIRCRAFT_DELAY     1010.0
WEATHER_DELAY           1211.0
AIRLINE                    1.0
ARRIVAL_TIME_SEGMENT       7.0
result                     1.0
dtype: float64
MONTH                    1.0
DAY                      1.0
SCHEDULED_DEPARTURE      1.0
DEPARTURE_TIME           1.0
DEPARTURE_DELAY        -61.0
DISTANCE                74.0
SCHEDULED_ARRIVAL        1.0
DIVERTED                 0.0
CANCELLED                0.0
AIR_SYSTEM_DELAY         0.0
SECURITY_DELAY           0.0
AIRLINE_DELAY            0.0
LATE_AIRCRAFT_DELAY      0.0
WEATHER_DELAY            0.0
AIRLINE                  0.0
ARRIVAL_TIME_SEGMENT   

# Train test split

In [32]:
flights_needed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2137736 entries, 2 to 5819067
Data columns (total 17 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MONTH                 int8   
 1   DAY                   int8   
 2   SCHEDULED_DEPARTURE   int16  
 3   DEPARTURE_TIME        float64
 4   DEPARTURE_DELAY       float64
 5   DISTANCE              int16  
 6   SCHEDULED_ARRIVAL     int16  
 7   DIVERTED              int8   
 8   CANCELLED             int8   
 9   AIR_SYSTEM_DELAY      float64
 10  SECURITY_DELAY        float64
 11  AIRLINE_DELAY         float64
 12  LATE_AIRCRAFT_DELAY   float64
 13  WEATHER_DELAY         float64
 14  AIRLINE               int64  
 15  ARRIVAL_TIME_SEGMENT  int64  
 16  result                int64  
dtypes: float64(7), int16(3), int64(3), int8(4)
memory usage: 199.8 MB


In [33]:
test = flights_needed_data[flights_needed_data['MONTH'] >= 11] 
#test=test.drop(['FLY_DATE'], axis=1)
train = flights_needed_data[flights_needed_data['MONTH'] < 11]
#train=train.drop(['FLY_DATE'], axis=1)
test_data = test.values
train_data = train.values
X_train, y_train = train_data[:,:-1], train_data[:,-1]
X_test, y_test = test_data[:,:-1], test_data[:,-1]
#train_data = flights_needed_data[flights_needed_data['FLY_DATE'].apply(lambda x:x.date()) < datetime.date(2024, 11, 1)]
#test_data  = flights_needed_data[flights_needed_data['FLY_DATE'].apply(lambda x:x.date()) > datetime.date(2024, 10, 31)]

In [34]:
test.shape

(354388, 17)

In [35]:
train.shape

(1783348, 17)

In [36]:
X_train

array([[7.00000000e+00, 9.00000000e+00, 1.55200000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 4.00000000e+00],
       [7.00000000e+00, 9.00000000e+00, 1.55300000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.00000000e+00, 9.00000000e+00, 1.55500000e+03, ...,
        3.23776466e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [2.00000000e+00, 1.00000000e+00, 1.15900000e+03, ...,
        3.23776466e+00, 0.00000000e+00, 0.00000000e+00],
       [2.00000000e+00, 1.00000000e+00, 1.15900000e+03, ...,
        3.23776466e+00, 0.00000000e+00, 0.00000000e+00],
       [2.00000000e+00, 1.00000000e+00, 1.15900000e+03, ...,
        3.23776466e+00, 0.00000000e+00, 0.00000000e+00]])

In [37]:
sc = StandardScaler()
X_train_1 = sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Model training and Prediction

In [38]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)

In [39]:
y_prob = clf.predict_proba(X_test)[:,1]

In [40]:
y_pred = clf.predict(X_test)

In [41]:
auc_score = roc_auc_score(y_test, y_pred)
auc_score

0.8414646022366795

In [45]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.92      0.91      0.92    263912
         1.0       0.75      0.77      0.76     90476

    accuracy                           0.88    354388
   macro avg       0.84      0.84      0.84    354388
weighted avg       0.88      0.88      0.88    354388



In [46]:
flights_needed_data.shape

(2137736, 17)

In [47]:
flights_needed_df = flights_needed_data.drop(['result'],axis=1)

In [48]:
final_data = flights_needed_df.values

In [49]:
final_data = sc.transform(final_data)

In [50]:
y_pred = clf.predict(final_data)

In [51]:
final_data

array([[ 4.94851068e-01, -7.68556678e-01,  4.58116494e-01, ...,
        -3.54655787e-01, -1.19703701e+00,  2.08916875e-01],
       [ 4.94851068e-01, -7.68556678e-01,  4.60195631e-01, ...,
        -3.54655787e-01, -1.19703701e+00, -1.38310570e+00],
       [ 4.94851068e-01, -7.68556678e-01,  4.64353907e-01, ...,
        -4.10201150e-04, -1.19703701e+00, -1.38310570e+00],
       ...,
       [-1.27810916e+00, -1.68040509e+00, -3.58984720e-01, ...,
        -4.10201150e-04, -1.19703701e+00, -1.38310570e+00],
       [-1.27810916e+00, -1.68040509e+00, -3.58984720e-01, ...,
        -4.10201150e-04, -1.19703701e+00, -1.38310570e+00],
       [-1.27810916e+00, -1.68040509e+00, -3.58984720e-01, ...,
        -4.10201150e-04, -1.19703701e+00, -1.38310570e+00]])

In [52]:
y_prob = clf.predict_proba(final_data)[:,1]

In [53]:
len(y_pred)

2137736

In [54]:
clf.feature_importances_

array([3.04968008e-02, 7.63155348e-02, 3.12607396e-02, 3.98420352e-02,
       1.02250458e-01, 6.45660769e-02, 5.78497868e-02, 2.00001539e-03,
       1.57139825e-03, 6.47466460e-06, 5.85322681e-01, 1.32422190e-05,
       2.98667482e-05, 3.23202395e-05, 4.76247403e-03, 3.68009525e-03])

In [55]:
type(X_train),type(X_test),type(y_train),type(y_test),type(y_pred),type(y_prob)

(numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray)

In [None]:
column = ['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME','DEPARTURE_DELAY','DISTANCE','SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED', 
          'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY','WEATHER_DELAY','AIRLINE', 'ARRIVAL_TIME_SEGMENT']


In [None]:
X_train_df = pd.DataFrame(X_train, columns=column)
X_test_df = pd.DataFrame(X_test, columns=column)
y_train_df = pd.DataFrame(y_train, columns=['ACTUAL_DELAY'])
y_test_df = pd.DataFrame(y_test, columns=['ACTUAL_DELAY'])
y_pred_df = pd.DataFrame(y_pred, columns=['PREDICTED_DELAY'])
y_prob_df = pd.DataFrame(y_prob, columns=['PROBABILITY'])

# Model Registration

In [None]:
## registering the model in Fosfor Insight Designer.
register_model(
    model_obj=clf, 
    session=my_session,
    x_train=X_train_df,
    y_train=y_train_df,
    x_test=X_test_df,
    y_test=y_test_df,
    y_pred=y_pred_df,
    y_prob=y_prob_df,
    source="Notebook",
    dataset_name="FLIGHTS_FULL",
    dataset_source="Snowflake",
    #dataset_source="InMemory",
    name="Decision_Tree_Delay_Classifier",
    description="Decision tree model trained via Notebook to identify Flight Delay application",
    flavour="sklearn",
    model_type="classification",
    conda_dependencies=["scikit-learn==1.3.2"]
)

In [None]:
y_train_df

# Push Model Input on Snowflake

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
data_df = pd.concat([X_train_df,y_train_df],axis=1,ignore_index=True)

In [None]:
data_df.columns = ['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME','DEPARTURE_DELAY','DISTANCE','SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED', 
          'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY','WEATHER_DELAY','AIRLINE', 'ARRIVAL_TIME_SEGMENT','FLIGHT_DELAY']

In [None]:
data_df.head()

In [None]:
data_df.info()

In [None]:
data_df['FLIGHT_DELAY'] = data_df['FLIGHT_DELAY'].astype('str')

In [None]:
data_df.head()

In [None]:
ins_train_sf=my_session.createDataFrame(
        data_df.values.tolist(),
        schema=data_df.columns.tolist())
ins_train_sf.write.mode("overwrite").save_as_table("TTH_DB.TTH_AIRLINE_SCHEMA.FLIGHTS_TRAINDATA")

# Push Model Output on snowflake

In [None]:
test_df = pd.concat([X_test_df,y_test_df,y_pred_df, y_prob_df],axis=1,ignore_index=True)

In [None]:
test_df.dropna(inplace=True)

In [None]:
test_df.shape

In [None]:
test_df.head()

In [None]:
test_df.columns = ['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME','DEPARTURE_DELAY','DISTANCE','SCHEDULED_ARRIVAL', 
                   'DIVERTED', 'CANCELLED', 'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY',
                   'WEATHER_DELAY','AIRLINE', 'ARRIVAL_TIME_SEGMENT','FLIGHT_DELAY','PREDICTED_DELAY','PROBABILITY']

In [None]:
X1_df = test_df.head(15000)
X2_df = test_df.tail(15000)

In [None]:
ins_train_sf=my_session.createDataFrame(
        X1_df.values.tolist(),
        schema=X1_df.columns.tolist())
ins_train_sf.write.mode("overwrite").save_as_table("TTH_DB.TTH_AIRLINE_SCHEMA.DELAY_CLASSIFIER_OUTPUT_1")

In [None]:
ins_train_sf=my_session.createDataFrame(
        X2_df.values.tolist(),
        schema=X2_df.columns.tolist())
ins_train_sf.write.mode("overwrite").save_as_table("TTH_DB.TTH_AIRLINE_SCHEMA.DELAY_CLASSIFIER_OUTPUT_2")