# Use TTH_Template Custom Notebook template

In [46]:
import datetime, warnings, scipy 
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, classification_report

pd.options.display.max_columns = 50
warnings.filterwarnings("ignore")

from fosforml.model_manager.snowflakesession import get_session
from fosforml import register_model

# Read data from Snowflake

In [18]:
my_session = get_session()

In [19]:
table_name = 'FLIGHTS'

sf_df = my_session.sql("select * from {}".format(table_name))
df = sf_df.to_pandas()

In [20]:
flights = df.copy()
flights_needed_data = df.copy()

In [21]:
flights_needed_data.shape

(5819079, 31)

In [22]:
flights_needed_data = flights[0:100000]  # getting a segment 

In [23]:
flights_needed_data

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE__CODE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT_CODE,DESTINATION_AIRPORT_CODE,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,2,12,4,WN,697,N399WN,BWI,CLT,840,910.0,30.0,13.0,923.0,95.0,90.0,73.0,361,1036.0,4.0,1015,1040.0,25.0,0,0,,0.0,0.0,12.0,13.0,0.0
1,2015,2,12,4,WN,523,N232WN,BWI,MCI,840,839.0,-1.0,14.0,853.0,185.0,164.0,146.0,967,1019.0,4.0,1045,1023.0,-22.0,0,0,,,,,,
2,2015,2,12,4,WN,2513,N8600F,BWI,RSW,840,909.0,29.0,17.0,926.0,175.0,160.0,139.0,919,1145.0,4.0,1135,1149.0,14.0,0,0,,,,,,
3,2015,2,12,4,WN,451,N8647A,BWI,SJU,840,840.0,0.0,36.0,916.0,255.0,260.0,219.0,1565,1355.0,5.0,1355,1400.0,5.0,0,0,,,,,,
4,2015,2,12,4,WN,3497,N441WN,SFO,ATL,840,839.0,-1.0,9.0,848.0,270.0,275.0,261.0,2139,1609.0,5.0,1610,1614.0,4.0,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2015,2,18,3,OO,6535,N760SK,OKC,LAX,1626,1723.0,57.0,14.0,1737.0,201.0,183.0,158.0,1187,1815.0,11.0,1747,1826.0,39.0,0,0,,0.0,0.0,0.0,39.0,0.0
99996,2015,2,18,3,UA,3,N66825,MCO,EWR,1626,1633.0,7.0,14.0,1647.0,164.0,137.0,114.0,937,1841.0,9.0,1910,1850.0,-20.0,0,0,,,,,,
99997,2015,2,18,3,DL,1178,N950DN,RIC,ATL,1627,1703.0,36.0,16.0,1719.0,113.0,106.0,84.0,481,1843.0,6.0,1820,1849.0,29.0,0,0,,0.0,0.0,4.0,25.0,0.0
99998,2015,2,18,3,UA,507,N412UA,ORD,DEN,1627,1712.0,45.0,18.0,1730.0,163.0,151.0,128.0,888,1838.0,5.0,1810,1843.0,33.0,0,0,,0.0,0.0,33.0,0.0,0.0


In [24]:
flights_needed_data.value_counts('DIVERTED')  # will tell us the no. of flights which were diverted

DIVERTED
0    99800
1      200
Name: count, dtype: int64

In [25]:
# filtering out unnecessary columns
flights_needed_data=flights_needed_data.drop(['YEAR','FLIGHT_NUMBER','AIRLINE__CODE','DISTANCE','TAIL_NUMBER','TAXI_OUT',
                                              'SCHEDULED_TIME','DEPARTURE_TIME','WHEELS_OFF','ELAPSED_TIME',
                                              'AIR_TIME','WHEELS_ON','DAY_OF_WEEK','TAXI_IN','CANCELLATION_REASON'],
                                             axis=1)

In [26]:
flights_needed_data.isnull().sum()

MONTH                           0
DAY                             0
ORIGIN_AIRPORT_CODE             0
DESTINATION_AIRPORT_CODE        0
SCHEDULED_DEPARTURE             0
DEPARTURE_DELAY              3965
SCHEDULED_ARRIVAL               0
ARRIVAL_TIME                 4123
ARRIVAL_DELAY                4259
DIVERTED                        0
CANCELLED                       0
AIR_SYSTEM_DELAY            76103
SECURITY_DELAY              76103
AIRLINE_DELAY               76103
LATE_AIRCRAFT_DELAY         76103
WEATHER_DELAY               76103
dtype: int64

In [28]:
flights_needed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   MONTH                     100000 non-null  int8   
 1   DAY                       100000 non-null  int8   
 2   ORIGIN_AIRPORT_CODE       100000 non-null  object 
 3   DESTINATION_AIRPORT_CODE  100000 non-null  object 
 4   SCHEDULED_DEPARTURE       100000 non-null  int16  
 5   DEPARTURE_DELAY           96035 non-null   float64
 6   SCHEDULED_ARRIVAL         100000 non-null  int16  
 7   ARRIVAL_TIME              95877 non-null   float64
 8   ARRIVAL_DELAY             95741 non-null   float64
 9   DIVERTED                  100000 non-null  int8   
 10  CANCELLED                 100000 non-null  int8   
 11  AIR_SYSTEM_DELAY          23897 non-null   float64
 12  SECURITY_DELAY            23897 non-null   float64
 13  AIRLINE_DELAY             23897 non-null   fl

In [29]:
#flights_needed_data = flights_needed_data.apply(lambda x: x.fillna(x.mean()), axis=0)

cols = ['DEPARTURE_DELAY','ARRIVAL_TIME','ARRIVAL_DELAY','AIR_SYSTEM_DELAY','SECURITY_DELAY','AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY']

for column in cols:
    print (column)
    flights_needed_data[column].fillna(flights_needed_data[column].mean(), inplace=True)

DEPARTURE_DELAY
ARRIVAL_TIME
ARRIVAL_DELAY
AIR_SYSTEM_DELAY
SECURITY_DELAY
AIRLINE_DELAY
LATE_AIRCRAFT_DELAY
WEATHER_DELAY


In [None]:
# replacing all NaN values with the mean of the attribute in which they are present
#flights_needed_data=flights_needed_data.fillna(flights_needed_data.mean())

In [30]:
flights_needed_data.head()

Unnamed: 0,MONTH,DAY,ORIGIN_AIRPORT_CODE,DESTINATION_AIRPORT_CODE,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2,12,BWI,CLT,840,30.0,1015,1040.0,25.0,0,0,0.0,0.0,12.0,13.0,0.0
1,2,12,BWI,MCI,840,-1.0,1045,1023.0,-22.0,0,0,12.529564,0.046449,17.951124,21.499435,3.645353
2,2,12,BWI,RSW,840,29.0,1135,1149.0,14.0,0,0,12.529564,0.046449,17.951124,21.499435,3.645353
3,2,12,BWI,SJU,840,0.0,1355,1400.0,5.0,0,0,12.529564,0.046449,17.951124,21.499435,3.645353
4,2,12,SFO,ATL,840,-1.0,1610,1614.0,4.0,0,0,12.529564,0.046449,17.951124,21.499435,3.645353


In [31]:
# creating a new column; it will tell if the flight was delayed or not
result=[]

In [32]:
for row in flights_needed_data['ARRIVAL_DELAY']:
  if row > 15:
    result.append(1)
  else:
    result.append(0) 

In [33]:
flights_needed_data['result'] = result

In [34]:
flights_needed_data.value_counts('result')

result
0    76999
1    23001
Name: count, dtype: int64

In [35]:
# removing some more columns
flights_needed_data=flights_needed_data.drop(['ORIGIN_AIRPORT_CODE', 'DESTINATION_AIRPORT_CODE', 'ARRIVAL_TIME', 'ARRIVAL_DELAY'],axis=1)
flights_needed_data

Unnamed: 0,MONTH,DAY,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,SCHEDULED_ARRIVAL,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,result
0,2,12,840,30.0,1015,0,0,0.000000,0.000000,12.000000,13.000000,0.000000,1
1,2,12,840,-1.0,1045,0,0,12.529564,0.046449,17.951124,21.499435,3.645353,0
2,2,12,840,29.0,1135,0,0,12.529564,0.046449,17.951124,21.499435,3.645353,0
3,2,12,840,0.0,1355,0,0,12.529564,0.046449,17.951124,21.499435,3.645353,0
4,2,12,840,-1.0,1610,0,0,12.529564,0.046449,17.951124,21.499435,3.645353,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2,18,1626,57.0,1747,0,0,0.000000,0.000000,0.000000,39.000000,0.000000,1
99996,2,18,1626,7.0,1910,0,0,12.529564,0.046449,17.951124,21.499435,3.645353,0
99997,2,18,1627,36.0,1820,0,0,0.000000,0.000000,4.000000,25.000000,0.000000,1
99998,2,18,1627,45.0,1810,0,0,0.000000,0.000000,33.000000,0.000000,0.000000,1


In [53]:
data = flights_needed_data.values
X, y = data[:,:-1], data[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)  # splitting in the ratio 70:30

In [None]:
X_train.columns

In [54]:
sc = StandardScaler()
X_train_1 = sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [55]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)

In [56]:
y_prob = clf.predict_proba(X_test)[:,1]

In [57]:
y_pred = clf.predict(X_test)

In [58]:
auc_score = roc_auc_score(y_test, y_pred)
auc_score

0.9955238365044715

In [59]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     23118
         1.0       0.99      0.99      0.99      6882

    accuracy                           1.00     30000
   macro avg       1.00      1.00      1.00     30000
weighted avg       1.00      1.00      1.00     30000



In [69]:
flights_needed_data.shape

(100000, 12)

In [64]:
flights_needed_df = flights_needed_data.drop(['result'],axis=1)

In [70]:
final_data = flights_needed_df.values

In [71]:
final_data = sc.transform(final_data)

In [72]:
y_pred = clf.predict(final_data)

In [73]:
y_prob = clf.predict_proba(final_data)[:,1]

22990.0