# Use TTH_Template Custom Notebook template

In [17]:
import datetime, warnings, scipy 
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# Notebook-wide settings: render up to 50 columns when a frame is displayed,
# and silence every warning for the rest of the session.
pd.set_option("display.max_columns", 50)
warnings.simplefilter("ignore")

from fosforml.model_manager.snowflakesession import get_session
from fosforml import register_model

# Read data from Snowflake

In [18]:
# Obtain the session object from fosforml's helper; it is used below to run SQL.
my_session = get_session()

In [None]:
# Name of the source table; every column of it is pulled and then
# materialised as a pandas DataFrame.
table_name = 'FLIGHTS'

sf_df = my_session.sql(f"select * from {table_name}")
df = sf_df.to_pandas()

In [None]:
# Two independent copies of the loaded frame: `flights` is the reference the
# sample is sliced from later, `flights_needed_data` is the working frame.
flights, flights_needed_data = df.copy(), df.copy()

In [None]:
# (rows, columns) of the working frame before any sampling or column filtering
flights_needed_data.shape

In [None]:
# Keep only the first 100000 rows (selected by position) as the working segment.
flights_needed_data = flights.iloc[:100000]

In [None]:
# Display the sampled working frame.
flights_needed_data

In [None]:
flights_needed_data.value_counts('DIVERTED')  # number of rows per DIVERTED value, i.e. how many flights were diverted

In [None]:
# Discard the columns that are not wanted for the rest of the analysis.
flights_needed_data = flights_needed_data.drop(
    columns=[
        'YEAR', 'FLIGHT_NUMBER', 'AIRLINE__CODE', 'DISTANCE', 'TAIL_NUMBER',
        'TAXI_OUT', 'SCHEDULED_TIME', 'DEPARTURE_TIME', 'WHEELS_OFF',
        'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 'DAY_OF_WEEK', 'TAXI_IN',
        'CANCELLATION_REASON',
    ]
)

In [None]:
# Number of missing values in each remaining column
flights_needed_data.isnull().sum()

In [None]:
# Replace every NaN in a numeric column with the mean of that column.
# numeric_only=True is needed because the frame may still hold non-numeric
# columns at this point (e.g. the airport-code columns dropped later, presumably
# strings); pandas >= 2.0 raises TypeError when asked for the mean of such
# columns instead of silently skipping them. NaNs in non-numeric columns are
# left as they are.
flights_needed_data = flights_needed_data.fillna(
    flights_needed_data.mean(numeric_only=True)
)

In [None]:
# Preview the first rows of the working frame
flights_needed_data.head()

In [None]:
# creating a new column; it will tell if the flight was delayed or not
# `result` holds one 0/1 label per row of the working frame, in row order
result=[]

In [None]:
# Label each row: 1 when ARRIVAL_DELAY is greater than 15, otherwise 0
# (a NaN delay compares False and therefore also yields 0).
# The labels are computed in one vectorised step and *assigned* rather than
# appended, so re-running this cell cannot grow `result` past the number of
# rows in the frame.
result = (flights_needed_data['ARRIVAL_DELAY'] > 15).astype(int).tolist()

In [None]:
# Store the labels as the `result` column; `result` must contain exactly one entry per row
flights_needed_data['result'] = result

In [None]:
# Class balance: number of rows for each value of the `result` label
flights_needed_data.value_counts('result')

In [None]:
# Drop four more columns. ARRIVAL_DELAY is the column `result` was derived
# from, so it must not stay in the frame alongside the label.
flights_needed_data = flights_needed_data.drop(
    columns=['ORIGIN_AIRPORT_CODE', 'DESTINATION_AIRPORT_CODE', 'ARRIVAL_TIME', 'ARRIVAL_DELAY']
)
flights_needed_data

In [None]:
# Full frame as one array; kept in case a later cell still reads `data`.
data = flights_needed_data.values

# Select features and label by column name instead of by position in `data`:
# slicing the label out of the combined matrix gives it the matrix's common
# dtype (float, or object if any column is non-numeric) and silently depends
# on `result` being the last column.
X = flights_needed_data.drop(columns=['result']).to_numpy()
y = flights_needed_data['result'].to_numpy().astype(int)

# 70/30 train/test split; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
# Fit the scaler on the training features only and keep the fitted object so
# the identical transform can be applied to the test features.
# The second positional argument of fit_transform is `y`, not a second matrix
# to scale, so the test features are transformed with a separate call.
# `scaled_features` holds the scaled training features.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train a decision tree on the training split.
# random_state is fixed because the tree permutes features at each split even
# with the default splitter; without a seed the fitted tree, and therefore the
# reported score, can change from run to run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Score the held-out split: column 1 of the predicted probabilities is used as
# the score for the positive class when computing ROC AUC.
pred_prob = clf.predict_proba(X_test)
positive_class_prob = pred_prob[:, 1]
auc_score = roc_auc_score(y_test, positive_class_prob)
auc_score