# Multiclass Classification

The target variables are **CARRIER_DELAY, WEATHER_DELAY, NAS_DELAY, SECURITY_DELAY, LATE_AIRCRAFT_DELAY**. We need to do additional transformations because these variables are not binary but continuos. For each flight that was delayed, we need to have one of these variables as 1 and others 0.

It can happen that we have two types of delays with more than 0 minutes. In this case, take the bigger one as 1 and others as 0.

In [160]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [161]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import seaborn as sn
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

### Get onehot encoded flight data

In [162]:
en_data = pd.read_csv('data/hot_encoded_flights.csv')

In [163]:
en_data.head()

Unnamed: 0.1,Unnamed: 0,distance,air_time,actual_elapsed_time,taxi_in,taxi_out,arr_delay,dep_time_format,arrival_time_format,mkt_unique_carrier_AS,...,dest_VEL,dest_VLD,dest_VPS,dest_WRG,dest_WYS,dest_XNA,dest_XWA,dest_YAK,dest_YKM,dest_YUM
0,0,632,82.0,101.0,8.0,11.0,-16.0,3,3,0,...,0,0,0,0,0,0,0,0,0,0
1,1,544,81.0,119.0,8.0,30.0,-12.0,3,3,0,...,0,0,0,0,0,0,0,0,0,0
2,2,649,105.0,120.0,3.0,12.0,-17.0,3,3,0,...,0,0,0,0,0,0,0,0,0,0
3,3,314,58.0,74.0,5.0,11.0,17.0,2,2,0,...,0,0,0,0,0,0,0,0,0,0
4,4,221,56.0,90.0,17.0,17.0,-4.0,3,3,0,...,0,0,0,0,0,0,0,0,0,0


### Get cleaned delay_data data with delays andno outliers

In [164]:
delay_data = pd.read_csv('data/flights_cleaned_no_outlier_iqr_with_delays.csv')

In [165]:
delay_data = delay_data[['Unnamed: 0','late_aircraft_delay','security_delay',
                         'nas_delay','weather_delay','carrier_delay','fl_date','arr_delay']]

In [166]:
# delay_data['fl_date'] = pd.to_datetime(delay_data['fl_date'],format='%Y-%m-%d')
# delay_data['year'] = pd.DatetimeIndex(delay_data['fl_date']).year
# delay_data['month'] = pd.DatetimeIndex(delay_data['fl_date']).month
# delay_data['day'] = pd.DatetimeIndex(delay_data['fl_date']).day

In [167]:
delay_data.head()

Unnamed: 0.1,Unnamed: 0,late_aircraft_delay,security_delay,nas_delay,weather_delay,carrier_delay,fl_date,arr_delay
0,0,0.0,0.0,0.0,0.0,0.0,2019-05-22,-16.0
1,2,0.0,0.0,0.0,0.0,0.0,2019-06-27,-12.0
2,3,0.0,0.0,0.0,0.0,0.0,2018-04-05,-17.0
3,5,0.0,0.0,0.0,0.0,0.0,2019-02-23,-4.0
4,6,0.0,0.0,0.0,0.0,0.0,2019-10-09,2.0


In [168]:
delay_data['Max'] = delay_data[delay_data['arr_delay']>0][['late_aircraft_delay','security_delay',
                         'nas_delay','weather_delay','carrier_delay']].idxmax(axis=1)

In [169]:
delay_data.head()

Unnamed: 0.1,Unnamed: 0,late_aircraft_delay,security_delay,nas_delay,weather_delay,carrier_delay,fl_date,arr_delay,Max
0,0,0.0,0.0,0.0,0.0,0.0,2019-05-22,-16.0,
1,2,0.0,0.0,0.0,0.0,0.0,2019-06-27,-12.0,
2,3,0.0,0.0,0.0,0.0,0.0,2018-04-05,-17.0,
3,5,0.0,0.0,0.0,0.0,0.0,2019-02-23,-4.0,
4,6,0.0,0.0,0.0,0.0,0.0,2019-10-09,2.0,late_aircraft_delay


In [170]:
weight = delay_data.groupby("Max").count().transform(lambda x: x/x.sum()).rename(columns={'Unnamed: 0':'weight'})[['weight']]

In [171]:
cols = weight.reset_index()['Max'].values.tolist()
cols

['carrier_delay',
 'late_aircraft_delay',
 'nas_delay',
 'security_delay',
 'weather_delay']

In [172]:
weight = weight.reset_index()[['weight']].to_dict()

In [173]:
class_weight = []
for val in weight['weight'].values():
    class_weight.append({0: 1, 1: val})

In [174]:
delay_data = delay_data.drop(columns = ['late_aircraft_delay','security_delay',
                                        'nas_delay','weather_delay','carrier_delay','arr_delay','fl_date'])

In [175]:
delay_data

Unnamed: 0.1,Unnamed: 0,Max
0,0,
1,2,
2,3,
3,5,
4,6,late_aircraft_delay
...,...,...
237184,299994,
237185,299995,
237186,299996,
237187,299998,


In [176]:
delay_data = pd.get_dummies(delay_data, prefix='', prefix_sep='')

In [177]:
df_all = pd.merge(en_data, delay_data, on = 'Unnamed: 0').drop(columns = ['Unnamed: 0'])

In [178]:
df_all.to_csv('data/merged_delay_data_data_randomForest.csv')

In [179]:
df_all.head()

Unnamed: 0,distance,air_time,actual_elapsed_time,taxi_in,taxi_out,arr_delay,dep_time_format,arrival_time_format,mkt_unique_carrier_AS,mkt_unique_carrier_B6,...,dest_XNA,dest_XWA,dest_YAK,dest_YKM,dest_YUM,carrier_delay,late_aircraft_delay,nas_delay,security_delay,weather_delay
0,632,82.0,101.0,8.0,11.0,-16.0,3,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,649,105.0,120.0,3.0,12.0,-17.0,3,3,0,0,...,0,0,0,0,0,0,0,0,0,0
2,314,58.0,74.0,5.0,11.0,17.0,2,2,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1242,178.0,196.0,7.0,11.0,2.0,3,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,683,117.0,159.0,5.0,37.0,3.0,3,3,0,0,...,0,0,0,0,0,0,1,0,0,0


In [180]:
X = df_all.drop(columns =['late_aircraft_delay','security_delay',
                          'nas_delay','weather_delay','carrier_delay',
                          'arr_delay'])
y = df_all[cols]

In [181]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=123,shuffle=True)

In [182]:
X_train.columns

Index(['distance', 'air_time', 'actual_elapsed_time', 'taxi_in', 'taxi_out',
       'dep_time_format', 'arrival_time_format', 'mkt_unique_carrier_AS',
       'mkt_unique_carrier_B6', 'mkt_unique_carrier_DL',
       ...
       'dest_VEL', 'dest_VLD', 'dest_VPS', 'dest_WRG', 'dest_WYS', 'dest_XNA',
       'dest_XWA', 'dest_YAK', 'dest_YKM', 'dest_YUM'],
      dtype='object', length=762)

In [183]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100,n_jobs=-1,class_weight='balanced')
clf.fit(X_train,y_train)
y_pred_rndm = clf.predict(X_test)

print('Accuracy: ',accuracy_score(y_test,y_pred_rndm)*100)
cnf_matrix = confusion_matrix(y_test.values.argmax(axis=1), y_pred_rndm.argmax(axis=1))
cnf_matrix

Accuracy:  78.07263954525926


array([[47303,   784,    67,     0,     1],
       [ 9663,   162,    17,     0,     0],
       [ 2459,    39,     2,     0,     1],
       [    2,     0,     0,     0,     0],
       [   17,     1,     0,     0,     0]], dtype=int64)

In [184]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score


print("ROC_AUC Score",roc_auc_score(y_test,y_pred_rndm))
print("F1_score",f1_score(y_test, y_pred_rndm, average='weighted',zero_division=1, labels=np.unique(y_pred_rndm)))

ROC_AUC Score 0.49995020932519746
F1_score 0.02927109537620295


In [185]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred_rndm,zero_division=True))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       219
           1       0.16      0.02      0.03      9842
           2       0.02      0.00      0.00      2501
           3       1.00      0.00      0.00         2
           4       0.00      0.00      0.00        18

   micro avg       0.15      0.01      0.02     12582
   macro avg       0.24      0.00      0.01     12582
weighted avg       0.13      0.01      0.02     12582
 samples avg       0.98      0.79      0.78     12582



In [186]:
# save model using joblib
import pickle
filename = 'model/RandomForest_first_sample_model_very_bad.sav'
pickle.dump(clf, open(filename, 'wb'))