# Multiclass Classification

The target variables are **CARRIER_DELAY, WEATHER_DELAY, NAS_DELAY, SECURITY_DELAY, LATE_AIRCRAFT_DELAY**. We need to do additional transformations because these variables are not binary but continuos. For each flight that was delayed, we need to have one of these variables as 1 and others 0.

It can happen that we have two types of delays with more than 0 minutes. In this case, take the bigger one as 1 and others as 0.

In [10]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import seaborn as sn
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import pickle

In [12]:
# Import encoded large dta
data = pd.read_csv("data/large/flights_7mil_encoded_delays_no_catagorical_sub.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,crs_elapsed_time,crs_dep_time,crs_arr_time,dep_delay,arr_delay,year,month,day,carrier_delay,early,late_aircraft_delay,nas_delay,security_delay,weather_delay
0,0,104.0,1837,2021,50.0,37.0,2019,8,27,0,0,1,0,0,0
1,1,142.0,709,931,-5.0,46.0,2019,8,27,0,0,0,1,0,0
2,2,152.0,1012,1244,53.0,54.0,2019,8,27,0,0,1,0,0,0
3,3,303.0,1538,2341,-1.0,-21.0,2019,8,27,0,1,0,0,0,0
4,4,90.0,1007,1137,69.0,56.0,2019,8,27,0,0,1,0,0,0


In [13]:
# data[data['security_delay'] == 1].count()
# #late_aircraft_delay 1575956
# #security_delay 2227

In [14]:
X = data.drop(columns =['late_aircraft_delay','security_delay',
                          'nas_delay','weather_delay','carrier_delay',
                          'arr_delay','dep_delay','Unnamed: 0'])

y = data[['late_aircraft_delay','security_delay',
            'nas_delay','weather_delay','carrier_delay']]

In [15]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=123,shuffle=True)

In [16]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100,n_jobs=-1,class_weight='balanced')
clf.fit(X_train,y_train)
y_pred_rndm = clf.predict(X_test)

print('Accuracy: ',accuracy_score(y_test,y_pred_rndm)*100)
cnf_matrix = confusion_matrix(y_test.values.argmax(axis=1), y_pred_rndm.argmax(axis=1))
cnf_matrix

Accuracy:  85.36656988285955


array([[1797172,      50,   17358,     618,   11431],
       [    593,       0,      24,       1,      21],
       [ 106488,       2,   11423,     221,    2339],
       [  11176,       0,     669,     103,     307],
       [  93227,       6,    3546,     119,    3368]], dtype=int64)

In [20]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred_rndm,zero_division=True))

              precision    recall  f1-score   support

           0       0.69      0.83      0.75    473427
           1       0.00      0.00      0.00       639
           2       0.35      0.09      0.15    120473
           3       0.10      0.01      0.02     12255
           4       0.19      0.03      0.06    100266

   micro avg       0.66      0.57      0.61    707060
   macro avg       0.27      0.19      0.19    707060
weighted avg       0.55      0.57      0.54    707060
 samples avg       0.90      0.85      0.85    707060



In [18]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score


print("ROC_AUC Score",roc_auc_score(y_test,y_pred_rndm))
print("F1_score",f1_score(y_test, y_pred_rndm, average='weighted',zero_division=1, labels=np.unique(y_pred_rndm)))

ROC_AUC Score 0.5833288289428195
F1_score 0.7513579794243637


In [19]:
# save model using joblib
import joblib
filename = 'model/RandomForest_7mil_no_categorical.sav'
joblib.dump(clf, filename) 
# pickle.dump(clf, open('', 'wb'))

['model/RandomForest_4mil_no_categorical.sav']