# Use TTH_Template Custom Notebook template

In [1]:
import datetime, warnings, scipy 
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, classification_report

pd.options.display.max_columns = 50
warnings.filterwarnings("ignore")

from fosforml.model_manager.snowflakesession import get_session
from fosforml import register_model

In [2]:
## for no font error:

# Set Matplotlib's default font family to 'DeJavu Serif' to ensure a consistent font style across plots
plt.rcParams['font.family'] = 'DeJavu Serif'

# Read data from Snowflake

In [3]:
my_session = get_session()

In [4]:
## table_name = 'FLIGHTS'
table_name = 'FLIGHTS_FULL'

sf_df = my_session.sql("select * from {}".format(table_name))
df = sf_df.to_pandas()

In [5]:
## FILTERING DATA for just 2 airlines

options = ['Southwest Airlines Co.', 'Delta Air Lines Inc.'] 
  
# selecting rows based on condition 
flights = df.loc[df['AIRLINE'].isin(options)] 

## check
## flights['AIRLINE'].unique()

In [6]:
## flights = df.copy()
flights_needed_data = flights.copy()

In [7]:
flights_needed_data.shape

## OLD VALUE: (5819079, 31)

(2137736, 45)

In [8]:
flights_needed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2137736 entries, 1 to 5819078
Data columns (total 45 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   YEAR                      int16  
 1   MONTH                     int8   
 2   DAY                       int8   
 3   DAY_OF_WEEK               int8   
 4   AIRLINE__CODE             object 
 5   FLIGHT_NUMBER             int16  
 6   TAIL_NUMBER               object 
 7   ORIGIN_AIRPORT_CODE       object 
 8   DESTINATION_AIRPORT_CODE  object 
 9   SCHEDULED_DEPARTURE       int16  
 10  DEPARTURE_TIME            float64
 11  DEPARTURE_DELAY           float64
 12  TAXI_OUT                  float64
 13  WHEELS_OFF                float64
 14  SCHEDULED_TIME            float64
 15  ELAPSED_TIME              float64
 16  AIR_TIME                  float64
 17  DISTANCE                  int16  
 18  WHEELS_ON                 float64
 19  TAXI_IN                   float64
 20  SCHEDULED_ARRIVAL         int

In [9]:
flights_needed_data.head

<bound method NDFrame.head of          YEAR  MONTH  DAY  DAY_OF_WEEK AIRLINE__CODE  FLIGHT_NUMBER  \
1        2024     12   17            4            DL           1799   
6        2024     12   17            4            DL           1480   
7        2024     12   17            4            DL           1952   
14       2024     12   17            4            WN            558   
15       2024     12   17            4            WN           2360   
...       ...    ...  ...          ...           ...            ...   
5819074  2024      9   21            1            WN            714   
5819075  2024      9   21            1            WN           2364   
5819076  2024      9   21            1            WN             41   
5819077  2024      9   21            1            WN            151   
5819078  2024      9   21            1            WN           3412   

        TAIL_NUMBER ORIGIN_AIRPORT_CODE DESTINATION_AIRPORT_CODE  \
1            N603AT                 CHA          

In [10]:
## flights_needed_data = df.loc[(df['fly_date'] <= '2024-10-31')]
## flights_needed_data = flights[0:100000]  # getting a segment 

In [11]:
def categorize_time(SCHEDULED_ARRIVAL):
    if SCHEDULED_ARRIVAL >= 500 and SCHEDULED_ARRIVAL< 800: return 'Early morning'
    elif SCHEDULED_ARRIVAL >= 800 and SCHEDULED_ARRIVAL < 1100: return 'Late morning'
    elif SCHEDULED_ARRIVAL >= 1100 and SCHEDULED_ARRIVAL < 1400: return 'Around noon'
    elif SCHEDULED_ARRIVAL >= 1400 and SCHEDULED_ARRIVAL < 1700: return 'Afternoon'
    elif SCHEDULED_ARRIVAL >= 1700 and SCHEDULED_ARRIVAL < 2000: return 'Evening'
    elif SCHEDULED_ARRIVAL >= 2000 and SCHEDULED_ARRIVAL < 2300: return 'Night'
    elif SCHEDULED_ARRIVAL >= 2300 or SCHEDULED_ARRIVAL < 200: return 'Late night'
    elif SCHEDULED_ARRIVAL >= 200 or SCHEDULED_ARRIVAL < 500: return 'Dawn'
#    else:
#        return 'Dawn'

# Apply the function to the Age column using the apply() function
flights_needed_data['ARRIVAL_TIME_SEGMENT'] = flights_needed_data['SCHEDULED_ARRIVAL'].apply(categorize_time)

In [12]:
flights_needed_data

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE__CODE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT_CODE,DESTINATION_AIRPORT_CODE,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,FLY_DATE,AIRLINE,ORIGIN_AIRPORT,ORIGIN_CITY,ORIGIN_STATE,ORIGIN_COUNTRY,ORIGIN_LATITUDE,ORIGIN_LONGITUDE,DEST_AIRPORT,DEST_CITY,DEST_STATE,DEST_COUNTRY,DEST_LATITUDE,DEST_LONGITUDE,ARRIVAL_TIME_SEGMENT
1,2024,12,17,4,DL,1799,N603AT,CHA,ATL,549,552.0,3.0,15.0,607.0,56.0,48.0,25.0,106,632.0,8.0,645,640.0,-5.0,0,0,,,,,,,2024-12-17,Delta Air Lines Inc.,Chattanooga Metropolitan Airport (Lovell Field),Chattanooga,TN,USA,35.03527,-85.20379,Hartsfield-Jackson Atlanta International Airport,Atlanta,GA,USA,33.64044,-84.42694,Early morning
6,2024,12,17,4,DL,1480,N319US,BDL,DTW,550,549.0,-1.0,20.0,609.0,120.0,129.0,104.0,549,753.0,5.0,750,758.0,8.0,0,0,,,,,,,2024-12-17,Delta Air Lines Inc.,Bradley International Airport,Windsor Locks,CT,USA,41.93887,-72.68323,Detroit Metropolitan Airport,Detroit,MI,USA,42.21206,-83.34884,Early morning
7,2024,12,17,4,DL,1952,N939AT,DSM,ATL,550,549.0,-1.0,15.0,604.0,131.0,141.0,110.0,743,854.0,16.0,901,910.0,9.0,0,0,,,,,,,2024-12-17,Delta Air Lines Inc.,Des Moines International Airport,Des Moines,IA,USA,41.53493,-93.66068,Hartsfield-Jackson Atlanta International Airport,Atlanta,GA,USA,33.64044,-84.42694,Late morning
14,2024,12,17,4,WN,558,N263WN,CMH,BWI,550,548.0,-2.0,10.0,558.0,80.0,68.0,53.0,337,651.0,5.0,710,656.0,-14.0,0,0,,,,,,,2024-12-17,Southwest Airlines Co.,Port Columbus International Airport,Columbus,OH,USA,39.99799,-82.89188,Baltimore-Washington International Airport,Baltimore,MD,USA,39.17540,-76.66820,Early morning
15,2024,12,17,4,WN,2360,N7720F,ABQ,MDW,550,557.0,7.0,16.0,613.0,165.0,153.0,132.0,1121,925.0,5.0,935,930.0,-5.0,0,0,,,,,,,2024-12-17,Southwest Airlines Co.,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919,Chicago Midway International Airport,Chicago,IL,USA,41.78598,-87.75242,Late morning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5819074,2024,9,21,1,WN,714,N515SW,DAL,AUS,1820,1817.0,-3.0,9.0,1826.0,50.0,44.0,31.0,189,1857.0,4.0,1910,1901.0,-9.0,0,0,,,,,,,2024-09-21,Southwest Airlines Co.,Dallas Love Field,Dallas,TX,USA,32.84711,-96.85177,Austin-Bergstrom International Airport,Austin,TX,USA,30.19453,-97.66987,Evening
5819075,2024,9,21,1,WN,2364,N701GS,DAL,BWI,1820,1826.0,6.0,11.0,1837.0,180.0,187.0,147.0,1209,2204.0,29.0,2220,2233.0,13.0,0,0,,,,,,,2024-09-21,Southwest Airlines Co.,Dallas Love Field,Dallas,TX,USA,32.84711,-96.85177,Baltimore-Washington International Airport,Baltimore,MD,USA,39.17540,-76.66820,Night
5819076,2024,9,21,1,WN,41,N257WN,HOU,SAT,1820,1817.0,-3.0,6.0,1823.0,50.0,44.0,35.0,192,1858.0,3.0,1910,1901.0,-9.0,0,0,,,,,,,2024-09-21,Southwest Airlines Co.,William P. Hobby Airport,Houston,TX,USA,29.64542,-95.27889,San Antonio International Airport,San Antonio,TX,USA,29.53369,-98.46978,Evening
5819077,2024,9,21,1,WN,151,N480WN,IAD,LAS,1820,1825.0,5.0,16.0,1841.0,310.0,291.0,268.0,2065,2009.0,7.0,2030,2016.0,-14.0,0,0,,,,,,,2024-09-21,Southwest Airlines Co.,Washington Dulles International Airport,Chantilly,VA,USA,38.94453,-77.45581,McCarran International Airport,Las Vegas,NV,USA,36.08036,-115.15233,Night


In [13]:
 flights['AIRLINE__CODE'].unique()

array(['DL', 'WN'], dtype=object)

In [14]:
flights_needed_data.value_counts('DIVERTED')  # will tell us the no. of flights which were diverted

DIVERTED
0    2132545
1       5191
Name: count, dtype: int64

In [15]:
flights_needed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2137736 entries, 1 to 5819078
Data columns (total 46 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   YEAR                      int16  
 1   MONTH                     int8   
 2   DAY                       int8   
 3   DAY_OF_WEEK               int8   
 4   AIRLINE__CODE             object 
 5   FLIGHT_NUMBER             int16  
 6   TAIL_NUMBER               object 
 7   ORIGIN_AIRPORT_CODE       object 
 8   DESTINATION_AIRPORT_CODE  object 
 9   SCHEDULED_DEPARTURE       int16  
 10  DEPARTURE_TIME            float64
 11  DEPARTURE_DELAY           float64
 12  TAXI_OUT                  float64
 13  WHEELS_OFF                float64
 14  SCHEDULED_TIME            float64
 15  ELAPSED_TIME              float64
 16  AIR_TIME                  float64
 17  DISTANCE                  int16  
 18  WHEELS_ON                 float64
 19  TAXI_IN                   float64
 20  SCHEDULED_ARRIVAL         int

In [16]:
# filtering out unnecessary columns
flights_needed_data=flights_needed_data.drop(['YEAR','FLIGHT_NUMBER','AIRLINE__CODE','TAIL_NUMBER','TAXI_OUT',
                                              'SCHEDULED_TIME','WHEELS_OFF','ELAPSED_TIME',
                                              'AIR_TIME','WHEELS_ON','DAY_OF_WEEK','TAXI_IN','CANCELLATION_REASON', 'DEST_LATITUDE',
                                              'DEST_LONGITUDE','ORIGIN_LATITUDE', 'ORIGIN_LONGITUDE', 'ORIGIN_AIRPORT_CODE', 
                                              'DESTINATION_AIRPORT_CODE'],
                                             axis=1)
# REMOVED DISTANCE FROM THIS LIST

In [17]:
flights_needed_data.isnull().sum()

MONTH                         0
DAY                           0
SCHEDULED_DEPARTURE           0
DEPARTURE_TIME            19430
DEPARTURE_DELAY           19430
DISTANCE                      0
SCHEDULED_ARRIVAL             0
ARRIVAL_TIME              20737
ARRIVAL_DELAY             25058
DIVERTED                      0
CANCELLED                     0
AIR_SYSTEM_DELAY        1783087
SECURITY_DELAY          1783087
AIRLINE_DELAY           1783087
LATE_AIRCRAFT_DELAY     1783087
WEATHER_DELAY           1783087
FLY_DATE                      0
AIRLINE                       0
ORIGIN_AIRPORT           180068
ORIGIN_CITY              180068
ORIGIN_STATE             180068
ORIGIN_COUNTRY           180068
DEST_AIRPORT             180068
DEST_CITY                180068
DEST_STATE               180068
DEST_COUNTRY             180068
ARRIVAL_TIME_SEGMENT          0
dtype: int64

In [18]:
flights.isnull().sum()

YEAR                              0
MONTH                             0
DAY                               0
DAY_OF_WEEK                       0
AIRLINE__CODE                     0
FLIGHT_NUMBER                     0
TAIL_NUMBER                    1431
ORIGIN_AIRPORT_CODE               0
DESTINATION_AIRPORT_CODE          0
SCHEDULED_DEPARTURE               0
DEPARTURE_TIME                19430
DEPARTURE_DELAY               19430
TAXI_OUT                      19744
WHEELS_OFF                    19744
SCHEDULED_TIME                    0
ELAPSED_TIME                  25058
AIR_TIME                      25058
DISTANCE                          0
WHEELS_ON                     20737
TAXI_IN                       20737
SCHEDULED_ARRIVAL                 0
ARRIVAL_TIME                  20737
ARRIVAL_DELAY                 25058
DIVERTED                          0
CANCELLED                         0
CANCELLATION_REASON         2117869
AIR_SYSTEM_DELAY            1783087
SECURITY_DELAY              

In [None]:
flights_needed_data.info()

In [None]:
#flights_needed_data = flights_needed_data.apply(lambda x: x.fillna(x.mean()), axis=0)

cols = ['DEPARTURE_TIME','DEPARTURE_DELAY','ARRIVAL_TIME','ARRIVAL_DELAY','AIR_SYSTEM_DELAY',
        'SECURITY_DELAY','AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY']

for column in cols:
    print (column)
    flights_needed_data[column].fillna(flights_needed_data[column].mean(), inplace=True)

In [None]:
# replacing all NaN values with the mean of the attribute in which they are present
#flights_needed_data=flights_needed_data.fillna(flights_needed_data.mean())

In [None]:
flights_needed_data.head()

In [None]:
# creating a new column; it will tell if the flight was delayed or not
result=[]

# Create Target/Result column for Classifier

In [None]:
for row in flights_needed_data['ARRIVAL_DELAY']:
  if row > 10:
    result.append(1)
  else:
    result.append(0) 

In [None]:
flights_needed_data['result'] = result

In [None]:
flights_needed_data.value_counts('result')

In [None]:
# removing some more columns
flights_needed_data=flights_needed_data.drop(['ORIGIN_AIRPORT_CODE', 'DESTINATION_AIRPORT_CODE', 'ARRIVAL_TIME', 'ARRIVAL_DELAY'],axis=1)
flights_needed_data

# Train test split

In [None]:
data = flights_needed_data.values
X, y = data[:,:-1], data[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)  # splitting in the ratio 70:30

In [None]:
flights_needed_data.columns

# Scaling Input Feature

In [None]:
sc = StandardScaler()
X_train_1 = sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Model training and Prediction

In [None]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)

In [None]:
y_prob = clf.predict_proba(X_test)[:,1]

In [None]:
y_pred = clf.predict(X_test)

In [None]:
auc_score = roc_auc_score(y_test, y_pred)
auc_score

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
flights_needed_data.shape

In [None]:
flights_needed_df = flights_needed_data.drop(['result'],axis=1)

In [None]:
final_data = flights_needed_df.values

In [None]:
final_data = sc.transform(final_data)

In [None]:
y_pred = clf.predict(final_data)

In [None]:
y_prob = clf.predict_proba(final_data)[:,1]

In [None]:
len(y_pred)

In [None]:
clf.feature_importances_

In [None]:
type(X_train),type(X_test),type(y_train),type(y_test),type(y_pred),type(y_prob)

In [None]:
column = ['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY','SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED', 
          'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY','WEATHER_DELAY']

In [None]:
X_train_df = pd.DataFrame(X_train, columns=column)
X_test_df = pd.DataFrame(X_test, columns=column)
y_train_df = pd.DataFrame(y_train, columns=['ACTUAL_DELAY'])
y_test_df = pd.DataFrame(y_test, columns=['ACTUAL_DELAY'])
y_pred_df = pd.DataFrame(y_pred, columns=['PREDICTED_DELAY'])
y_prob_df = pd.DataFrame(y_prob, columns=['PROBABILITY'])

# Model Registration

In [None]:
## registering the model in Fosfor Insight Designer.
register_model(
    model_obj=clf, 
    session=my_session,
    x_train=X_train_df,
    y_train=y_train_df,
    x_test=X_test_df,
    y_test=y_test_df,
    y_pred=y_pred_df,
    y_prob=y_prob_df,
    source="Notebook",
    dataset_name="FLIGHTS",
    dataset_source="Snowflake",
    #dataset_source="InMemory",
    name="Decision_Tree_Delay_Classifier",
    description="Decision tree model trained via Notebook to identify Flight Delay application",
    flavour="sklearn",
    model_type="classification",
    conda_dependencies=["scikit-learn==1.3.2"]
)

In [None]:
y_train_df

# Push Model Input on Snowflake

In [None]:
data_df = pd.concat([X_train_df,y_train_df],axis=1,ignore_index=True)

In [None]:
data_df.columns = ['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY','SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED', 
          'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY','WEATHER_DELAY','FLIGHT_DELAY']

In [None]:
data_df.head()

In [None]:
data_df.info()

In [None]:
data_df['FLIGHT_DELAY'] = data_df['FLIGHT_DELAY'].astype('str')

In [None]:
data_df.head()

In [None]:
ins_train_sf=my_session.createDataFrame(
        data_df.values.tolist(),
        schema=data_df.columns.tolist())
ins_train_sf.write.mode("overwrite").save_as_table("TTH_DB.TTH_AIRLINE_SCHEMA.EXPERIMENT_INSIGHT_INPUT")

# Push Model Output on snowflake

In [None]:
test_df = pd.concat([X_test_df,y_test_df,y_pred_df, y_prob_df],axis=1,ignore_index=True)

In [None]:
test_df.dropna(inplace=True)

In [None]:
test_df.shape

In [None]:
test_df.head()

In [None]:
test_df.columns = ['MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY','SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED', 
          'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY','WEATHER_DELAY','FLIGHT_DELAY','PREDICTED_DELAY','PROBABILITY']

In [None]:
X1_df = test_df.head(15000)
X2_df = test_df.tail(15000)

In [None]:
ins_train_sf=my_session.createDataFrame(
        X1_df.values.tolist(),
        schema=X1_df.columns.tolist())
ins_train_sf.write.mode("overwrite").save_as_table("TTH_DB.TTH_AIRLINE_SCHEMA.DELAY_CLASSIFIER_OUTPUT_1")

In [None]:
ins_train_sf=my_session.createDataFrame(
        X2_df.values.tolist(),
        schema=X2_df.columns.tolist())
ins_train_sf.write.mode("overwrite").save_as_table("TTH_DB.TTH_AIRLINE_SCHEMA.DELAY_CLASSIFIER_OUTPUT_2")