In [559]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [528]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import seaborn as sn
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

### Get onehot encoded flight data

In [529]:
en_data = pd.read_csv('data/hot_encoded_flights.csv')

In [530]:
en_data.head()

Unnamed: 0.1,Unnamed: 0,distance,air_time,actual_elapsed_time,taxi_in,taxi_out,arr_delay,dep_time_format,arrival_time_format,mkt_unique_carrier_AS,...,dest_VEL,dest_VLD,dest_VPS,dest_WRG,dest_WYS,dest_XNA,dest_XWA,dest_YAK,dest_YKM,dest_YUM
0,0,632,82.0,101.0,8.0,11.0,-16.0,3,3,0,...,0,0,0,0,0,0,0,0,0,0
1,1,544,81.0,119.0,8.0,30.0,-12.0,3,3,0,...,0,0,0,0,0,0,0,0,0,0
2,2,649,105.0,120.0,3.0,12.0,-17.0,3,3,0,...,0,0,0,0,0,0,0,0,0,0
3,3,314,58.0,74.0,5.0,11.0,17.0,2,2,0,...,0,0,0,0,0,0,0,0,0,0
4,4,221,56.0,90.0,17.0,17.0,-4.0,3,3,0,...,0,0,0,0,0,0,0,0,0,0


### Get cleaned delay_data data with delays andno outliers

In [531]:
delay_data = pd.read_csv('data/flights_cleaned_no_outlier_iqr_with_delays.csv')

In [532]:
delay_data = delay_data[['Unnamed: 0','late_aircraft_delay','security_delay',
                         'nas_delay','weather_delay','carrier_delay','fl_date']]

In [533]:
delay_data['fl_date'] = pd.to_datetime(delay_data['fl_date'],format='%Y-%m-%d')
delay_data['year'] = pd.DatetimeIndex(delay_data['fl_date']).year
delay_data['month'] = pd.DatetimeIndex(delay_data['fl_date']).month
delay_data['day'] = pd.DatetimeIndex(delay_data['fl_date']).day

In [534]:
delay_data.head()

Unnamed: 0.1,Unnamed: 0,late_aircraft_delay,security_delay,nas_delay,weather_delay,carrier_delay,fl_date,year,month,day
0,0,0.0,0.0,0.0,0.0,0.0,2019-05-22,2019,5,22
1,2,0.0,0.0,0.0,0.0,0.0,2019-06-27,2019,6,27
2,3,0.0,0.0,0.0,0.0,0.0,2018-04-05,2018,4,5
3,5,0.0,0.0,0.0,0.0,0.0,2019-02-23,2019,2,23
4,6,0.0,0.0,0.0,0.0,0.0,2019-10-09,2019,10,9


In [535]:
delay_data['Max'] = delay_data[['late_aircraft_delay','security_delay',
                         'nas_delay','weather_delay','carrier_delay']].idxmax(axis=1)

In [536]:
delay_data.head()

Unnamed: 0.1,Unnamed: 0,late_aircraft_delay,security_delay,nas_delay,weather_delay,carrier_delay,fl_date,year,month,day,Max
0,0,0.0,0.0,0.0,0.0,0.0,2019-05-22,2019,5,22,late_aircraft_delay
1,2,0.0,0.0,0.0,0.0,0.0,2019-06-27,2019,6,27,late_aircraft_delay
2,3,0.0,0.0,0.0,0.0,0.0,2018-04-05,2018,4,5,late_aircraft_delay
3,5,0.0,0.0,0.0,0.0,0.0,2019-02-23,2019,2,23,late_aircraft_delay
4,6,0.0,0.0,0.0,0.0,0.0,2019-10-09,2019,10,9,late_aircraft_delay


In [537]:
weight = delay_data.groupby("Max").count().transform(lambda x: x/x.sum()).rename(columns={'Unnamed: 0':'weight'})[['weight']]

In [538]:
cols = weight.reset_index()['Max'].values.tolist()
cols

['carrier_delay',
 'late_aircraft_delay',
 'nas_delay',
 'security_delay',
 'weather_delay']

In [539]:
weight = weight.reset_index()[['weight']].to_dict()

In [540]:
class_weight = []
for val in weight['weight'].values():
    class_weight.append({0: 1, 1: val})

In [541]:
delay_data = delay_data.drop(columns = ['late_aircraft_delay','security_delay',
                                        'nas_delay','weather_delay','carrier_delay'])

In [542]:
delay_data

Unnamed: 0.1,Unnamed: 0,fl_date,year,month,day,Max
0,0,2019-05-22,2019,5,22,late_aircraft_delay
1,2,2019-06-27,2019,6,27,late_aircraft_delay
2,3,2018-04-05,2018,4,5,late_aircraft_delay
3,5,2019-02-23,2019,2,23,late_aircraft_delay
4,6,2019-10-09,2019,10,9,late_aircraft_delay
...,...,...,...,...,...,...
237184,299994,2019-07-27,2019,7,27,late_aircraft_delay
237185,299995,2019-08-13,2019,8,13,late_aircraft_delay
237186,299996,2018-03-21,2018,3,21,late_aircraft_delay
237187,299998,2019-04-26,2019,4,26,late_aircraft_delay


In [543]:
delay_data = pd.get_dummies(delay_data, prefix='', prefix_sep='')

In [544]:
df_all = pd.merge(en_data, delay_data, on = 'Unnamed: 0').drop(columns = ['Unnamed: 0'])

In [545]:
# df_all.to_csv('data/merged_delay_data_data_randomForest.csv')

In [546]:
df_all.head()

Unnamed: 0,distance,air_time,actual_elapsed_time,taxi_in,taxi_out,arr_delay,dep_time_format,arrival_time_format,mkt_unique_carrier_AS,mkt_unique_carrier_B6,...,dest_YUM,fl_date,year,month,day,carrier_delay,late_aircraft_delay,nas_delay,security_delay,weather_delay
0,632,82.0,101.0,8.0,11.0,-16.0,3,3,0,0,...,0,2019-05-22,2019,5,22,0,1,0,0,0
1,649,105.0,120.0,3.0,12.0,-17.0,3,3,0,0,...,0,2019-06-27,2019,6,27,0,1,0,0,0
2,314,58.0,74.0,5.0,11.0,17.0,2,2,0,0,...,0,2018-04-05,2018,4,5,0,1,0,0,0
3,1242,178.0,196.0,7.0,11.0,2.0,3,2,0,0,...,0,2019-02-23,2019,2,23,0,1,0,0,0
4,683,117.0,159.0,5.0,37.0,3.0,3,3,0,0,...,0,2019-10-09,2019,10,9,0,1,0,0,0


In [547]:
X = df_all.drop(columns =['late_aircraft_delay','security_delay',
                          'nas_delay','weather_delay','carrier_delay',
                          'arr_delay','taxi_in','taxi_out','fl_date'])
y = df_all[cols]

In [554]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=123,shuffle=True)

In [555]:
X_train.columns

Index(['distance', 'air_time', 'actual_elapsed_time', 'dep_time_format',
       'arrival_time_format', 'mkt_unique_carrier_AS', 'mkt_unique_carrier_B6',
       'mkt_unique_carrier_DL', 'mkt_unique_carrier_F9',
       'mkt_unique_carrier_G4',
       ...
       'dest_WRG', 'dest_WYS', 'dest_XNA', 'dest_XWA', 'dest_YAK', 'dest_YKM',
       'dest_YUM', 'year', 'month', 'day'],
      dtype='object', length=763)

In [556]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100,n_jobs=-1,class_weight=class_weight)
clf.fit(X_train,y_train)
y_pred_rndm = clf.predict(X_test)

print('Accuracy: ',accuracy_score(y_test,y_pred_rndm)*100)
cnf_matrix = confusion_matrix(y_test.values.argmax(axis=1), y_pred_rndm.argmax(axis=1))
cnf_matrix

Accuracy:  95.45094021613404


array([[    0,   218,     1,     0,     0],
       [    1, 57764,    13,     0,     0],
       [    0,  2500,     1,     0,     0],
       [    0,     2,     0,     0,     0],
       [    0,    18,     0,     0,     0]], dtype=int64)

In [557]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score


print(roc_auc_score(y_test,y_pred_rndm))
f1_score(y_test, y_pred_rndm, average='weighted',zero_division=1, labels=np.unique(y_pred_rndm))
# print(roc_curve(y_test.values.argmax(axis=1), y_pred_rndm.argmax(axis=1)))

# _, recall, _, _ = precision_recall_fscore_support(y_test,y_pred_rndm)
# recall

0.5000646151728947


0.973044974997803

In [558]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred_rndm))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       219
           1       0.95      1.00      0.98     57778
           2       0.07      0.00      0.00      2501
           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00        18

   micro avg       0.95      0.95      0.95     60518
   macro avg       0.20      0.20      0.20     60518
weighted avg       0.91      0.95      0.93     60518
 samples avg       0.95      0.95      0.95     60518



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [553]:
#smote
#ramdomsearch