# Prediction of Flight delays

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
af=pd.read_csv(r"/DATA/Python/Flight.csv")

In [3]:
af.head(2)

Unnamed: 0,w,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,CRSElapsedTime,ArrDelay,Distance,DepDelay
0,10.0,14.0,3.0,741.0,730.0,912.0,849.0,1451.0,91.0,79.0,23.0,447.0,11.0
1,10.0,15.0,4.0,729.0,730.0,903.0,849.0,1451.0,94.0,79.0,14.0,447.0,-1.0


In [4]:
af.shape

(1048575, 13)

In [5]:
af.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 13 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   w                  1047901 non-null  float64
 1   DayofMonth         1047901 non-null  float64
 2   DayOfWeek          1047901 non-null  float64
 3   DepTime            1033396 non-null  float64
 4   CRSDepTime         1047901 non-null  float64
 5   ArrTime            1030616 non-null  float64
 6   CRSArrTime         1047901 non-null  float64
 7   FlightNum          1047901 non-null  float64
 8   ActualElapsedTime  1030616 non-null  float64
 9   CRSElapsedTime     1047901 non-null  float64
 10  ArrDelay           1030616 non-null  float64
 11  Distance           1047901 non-null  float64
 12  DepDelay           1033396 non-null  float64
dtypes: float64(13)
memory usage: 104.0 MB


In [6]:
# find the percentage of NA values
af.isna().sum()/af.shape[0] * 100

w                    0.064278
DayofMonth           0.064278
DayOfWeek            0.064278
DepTime              1.447584
CRSDepTime           0.064278
ArrTime              1.712705
CRSArrTime           0.064278
FlightNum            0.064278
ActualElapsedTime    1.712705
CRSElapsedTime       0.064278
ArrDelay             1.712705
Distance             0.064278
DepDelay             1.447584
dtype: float64

In [7]:
# since the percentage of na value is very low we decide to drop the na values
af.dropna(inplace=True)

In [8]:
# checking if na values are removed or not
af.isna().sum()

w                    0
DayofMonth           0
DayOfWeek            0
DepTime              0
CRSDepTime           0
ArrTime              0
CRSArrTime           0
FlightNum            0
ActualElapsedTime    0
CRSElapsedTime       0
ArrDelay             0
Distance             0
DepDelay             0
dtype: int64

In [9]:
af.head()

Unnamed: 0,w,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,CRSElapsedTime,ArrDelay,Distance,DepDelay
0,10.0,14.0,3.0,741.0,730.0,912.0,849.0,1451.0,91.0,79.0,23.0,447.0,11.0
1,10.0,15.0,4.0,729.0,730.0,903.0,849.0,1451.0,94.0,79.0,14.0,447.0,-1.0
2,10.0,17.0,6.0,741.0,730.0,918.0,849.0,1451.0,97.0,79.0,29.0,447.0,11.0
3,10.0,18.0,7.0,729.0,730.0,847.0,849.0,1451.0,78.0,79.0,-2.0,447.0,-1.0
4,10.0,19.0,1.0,749.0,730.0,922.0,849.0,1451.0,93.0,79.0,33.0,447.0,19.0


# Feature engineering

In [10]:
# we create a new column for delayed, the fligths with time more than 15 mins will be considered as delayed

af["Delayed"]=af.DepDelay>=15


In [11]:
#here we replace the value with  0 and 1

af["Delayed"]= af.Delayed.replace({False:0,True:1})

In [12]:
# we create a new column for arivall delay
af["Arrivaldelay"]= af.ArrDelay >=15
af["Arrivaldelay"]=af.Arrivaldelay.replace({False:0,True:1})

In [13]:
# deriving a new feature for airtravel time

af["Airtraveldelay"]=af.ActualElapsedTime-af.CRSElapsedTime

In [14]:
# we create create a new column for air travel delay
af["AirtravelDelayed"]=af.Airtraveldelay >=10
af["AirtravelDelayed"]=af.AirtravelDelayed.replace({False:0,True:1})

In [15]:
af.head()

Unnamed: 0,w,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,CRSElapsedTime,ArrDelay,Distance,DepDelay,Delayed,Arrivaldelay,Airtraveldelay,AirtravelDelayed
0,10.0,14.0,3.0,741.0,730.0,912.0,849.0,1451.0,91.0,79.0,23.0,447.0,11.0,0,1,12.0,1
1,10.0,15.0,4.0,729.0,730.0,903.0,849.0,1451.0,94.0,79.0,14.0,447.0,-1.0,0,0,15.0,1
2,10.0,17.0,6.0,741.0,730.0,918.0,849.0,1451.0,97.0,79.0,29.0,447.0,11.0,0,1,18.0,1
3,10.0,18.0,7.0,729.0,730.0,847.0,849.0,1451.0,78.0,79.0,-2.0,447.0,-1.0,0,0,-1.0,0
4,10.0,19.0,1.0,749.0,730.0,922.0,849.0,1451.0,93.0,79.0,33.0,447.0,19.0,1,1,14.0,1


In [16]:
af.drop(["w","FlightNum","DepDelay","DepTime","CRSDepTime","CRSArrTime","ArrTime","ActualElapsedTime","CRSElapsedTime",
         "ArrDelay","Airtraveldelay"],axis=1,inplace=True)

In [17]:
af.head()

Unnamed: 0,DayofMonth,DayOfWeek,Distance,Delayed,Arrivaldelay,AirtravelDelayed
0,14.0,3.0,447.0,0,1,1
1,15.0,4.0,447.0,0,0,1
2,17.0,6.0,447.0,0,1,1
3,18.0,7.0,447.0,0,0,0
4,19.0,1.0,447.0,1,1,1


In [18]:
af.corr()

Unnamed: 0,DayofMonth,DayOfWeek,Distance,Delayed,Arrivaldelay,AirtravelDelayed
DayofMonth,1.0,0.004463,0.003319,0.05739,0.046279,0.007785
DayOfWeek,0.004463,1.0,0.0099,-0.014166,-0.036334,-0.040626
Distance,0.003319,0.0099,1.0,0.041656,0.081404,0.103374
Delayed,0.05739,-0.014166,0.041656,1.0,0.620382,0.050282
Arrivaldelay,0.046279,-0.036334,0.081404,0.620382,1.0,0.487034
AirtravelDelayed,0.007785,-0.040626,0.103374,0.050282,0.487034,1.0


In [19]:
# for oversampling as the data had class imbalance problem

from sklearn.model_selection import train_test_split

In [197]:
af_train, af_test= train_test_split(af, test_size=.25)


In [199]:
af_train.Delayed.value_counts()

0    661243
1    111719
Name: Delayed, dtype: int64

In [200]:

df2= af_train[af_train.Delayed==1]

In [201]:
# solving oversampling

afnew_train= pd.concat([af_train,df2,df2])

In [202]:
afnew_train.Delayed.value_counts()

0    661243
1    335157
Name: Delayed, dtype: int64

In [203]:
afnew_train.head(2)

Unnamed: 0,DayofMonth,DayOfWeek,Distance,Delayed,Arrivaldelay,AirtravelDelayed
363284,31.0,6.0,473.0,0,0,0
933211,10.0,4.0,110.0,0,0,0


In [204]:
# sampling
afnew_x_train= afnew_train.iloc[:,[2,4,5]]
afnew_y_train=afnew_train.iloc[:,3]

In [205]:
afnew_x_test= af_test.iloc[:,[2,4,5]]
afnew_y_test=af_test.iloc[:,3]

In [206]:
afnew_x_train.head(2)

Unnamed: 0,Distance,Arrivaldelay,AirtravelDelayed
363284,473.0,0,0
933211,110.0,0,0


# logisticregression

In [207]:
from sklearn.linear_model import LogisticRegression
logreg=LogisticRegression()

In [208]:
logreg.fit(afnew_x_train,afnew_y_train)

LogisticRegression()

In [209]:
pred_log=logreg.predict(afnew_x_test)

In [210]:
from sklearn.metrics import confusion_matrix

In [211]:
tab_log= confusion_matrix(pred_log,afnew_y_test)
tab_log

array([[201618,   8337],
       [ 19166,  28533]], dtype=int64)

In [212]:
ac_log=tab_log.diagonal().sum()/tab_log.sum()
ac_log

0.8932560720966878

In [213]:
# feature importance table
fimp=pd.DataFrame({"Features":afnew_x_train.columns,"Coefficients":np.ravel(logreg.coef_)})

In [214]:
fimp.sort_values(by="Coefficients",ascending=False,inplace=True)
fimp

Unnamed: 0,Features,Coefficients
1,Arrivaldelay,5.175362
0,Distance,0.000306
2,AirtravelDelayed,-2.685923


In [215]:
# ROC AND AUC

In [216]:
from sklearn.metrics import roc_auc_score, roc_curve

In [217]:
roc_score_log=roc_auc_score(pred_log,afnew_y_test)
roc_score_log

0.7792400661525491

In [218]:
# F1 score
from sklearn.metrics import f1_score

In [219]:
f1_log=f1_score(afnew_y_test,pred_log)
f1_log

0.674786269200298

In [220]:
#cross validation

In [221]:
from sklearn.model_selection import cross_val_score

In [222]:
cross_val_logreg=cross_val_score(logreg,afnew_x_train,afnew_y_train,cv=5,scoring="accuracy")

# Decision Tree

In [288]:
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier()

In [289]:
dt.fit(afnew_x_train,afnew_y_train)

DecisionTreeClassifier()

In [291]:
pred_dt= dt.predict(afnew_x_test)

In [292]:
tab_dt= confusion_matrix(pred_dt,afnew_y_test)
tab_dt

array([[205564,   8138],
       [ 15220,  28732]], dtype=int64)

In [293]:
acc_dt= tab_dt.diagonal().sum()/tab_dt.sum()
acc_dt

0.9093435382334448

In [294]:
#ROC

In [295]:
roc_score_dt=roc_auc_score(pred_dt,afnew_y_test)
roc_score_dt

0.8078160377257408

In [296]:
#f1 score

In [297]:
f1_dt=f1_score(afnew_y_test,pred_dt)
f1_dt

0.7109945311920023

In [232]:
#cross validation

In [233]:
cross_val_dt=cross_val_score(dt,afnew_x_train,afnew_y_train,cv=5,scoring="accuracy")

#  Randomforest

In [234]:

from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=50,max_depth=3)

In [235]:
rf.fit(afnew_x_train,afnew_y_train)

RandomForestClassifier(max_depth=3, n_estimators=50)

In [236]:
pred_ran=rf.predict(afnew_x_test)

In [237]:
tab_ran=confusion_matrix(pred_ran,afnew_y_test)
tab_ran

array([[194321,   5249],
       [ 26463,  31621]], dtype=int64)

In [238]:
acc_ran=tab_ran.diagonal().sum()/tab_ran.sum()
acc_ran

0.8769202108253704

In [239]:
# ROC

In [240]:
roc_score_ran=roc_auc_score(pred_ran,afnew_y_test)
roc_score_ran

0.7590498318544157

In [241]:
#F1 score

In [242]:
f1_ran=f1_score(afnew_y_test,pred_ran)
f1_ran

0.6660277608104977

In [176]:
# cross validation

In [275]:
cross_val_ran=cross_val_score(rf,afnew_x_train,afnew_y_train,cv=7,scoring="accuracy")

# Naive bayes

In [243]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes=MultinomialNB()

In [244]:
naive_bayes.fit(afnew_x_train,afnew_y_train)

MultinomialNB()

In [245]:
pred_naiv=naive_bayes.predict(afnew_x_test)

In [246]:
tab_naive=confusion_matrix(pred_naiv,afnew_y_test)
tab_naive

array([[199220,  10063],
       [ 21564,  26807]], dtype=int64)

In [247]:
acc_naive=tab_naive.diagonal().sum()/tab_naive.sum()
acc_naive

0.8772501106134584

In [248]:
#ROC

In [249]:
roc_score_naive_bayes=roc_auc_score(pred_naiv,afnew_y_test)
roc_score_naive_bayes

0.7530562391533011

In [250]:
#f1 score

In [251]:
f1_naive_bayes=f1_score(afnew_y_test,pred_naiv)
f1_naive_bayes

0.6289696272920308

In [252]:
#cross validation

In [253]:
cross_val_naive_bayes=cross_val_score(naive_bayes,afnew_x_train,afnew_y_train,cv=5,scoring="accuracy")

# Using SVM

In [254]:
from sklearn.svm import LinearSVC
sv=LinearSVC()

In [255]:
sv.fit(afnew_x_train,afnew_y_train)



LinearSVC()

In [256]:
pred_sv= sv.predict(afnew_x_test)

In [257]:
tab_sv=confusion_matrix(pred_sv,afnew_y_test)
tab_sv

array([[203362,   8197],
       [ 17422,  28673]], dtype=int64)

In [258]:
acc_sv= tab_sv.diagonal().sum()/tab_sv.sum()
acc_sv

0.9005682038703067

In [259]:
# ROC AUC

In [260]:
roc_score_svm=roc_auc_score(pred_sv,afnew_y_test)
roc_score_svm

0.7916478717367575

In [261]:
#f1 score

In [262]:
f1_sv=f1_score(afnew_y_test,pred_sv)
f1_sv

0.6912071355390828

In [263]:
#cross validation

In [None]:
cross_val_sv=cross_val_score(sv,afnew_x_train,afnew_y_train,cv=5,scoring="accuracy")

# using voting classifier

In [265]:
from sklearn.ensemble import VotingClassifier

In [266]:
vc= VotingClassifier(estimators=(("logreg",logreg),("dt",dt),("rf",rf),("sv",sv),("naive_bay",naive_bayes)),voting="hard")

In [267]:
vc.fit(afnew_x_train,afnew_y_train)



VotingClassifier(estimators=(('logreg', LogisticRegression()),
                             ('dt', DecisionTreeClassifier()),
                             ('rf',
                              RandomForestClassifier(max_depth=3,
                                                     n_estimators=50)),
                             ('sv', LinearSVC()),
                             ('naive_bay', MultinomialNB())))

In [268]:
pred_vc= vc.predict(afnew_x_test)

In [269]:
tab_vc=confusion_matrix(pred_vc,afnew_y_test)
tab_vc

array([[196954,   5954],
       [ 23830,  30916]], dtype=int64)

In [270]:
acc_vc=tab_vc.diagonal().sum()/tab_vc.sum()
acc_vc

0.8844031142539995

In [271]:
# ROC AUC

In [272]:
roc_score_vc=roc_auc_score(pred_vc,afnew_y_test)
roc_score_vc

0.7676868546149204

In [273]:
# f1 score

In [274]:
f1_vc=f1_score(afnew_y_test,pred_vc)
f1_vc

0.6749039469088369

In [None]:
cross_val_vc=cross_val_score(sv,afnew_x_train,afnew_y_train,cv=7,scoring="accuracy")

# ACCURACY AND ROC TABLE

In [276]:

Algorithm=list(["Logistic","DecisionTree","Randomforest","Naivebayes","SVM","Votingclassifier"])
accuracy=list([ac_log,acc_dt,acc_ran,acc_naive,acc_sv,acc_sv])
ROC=list([roc_score_log,roc_score_dt,roc_score_ran,roc_score_naive_bayes,roc_score_svm,roc_score_vc])
F1=list([f1_log,f1_dt,f1_ran,f1_naive_bayes,f1_sv,f1_vc])

In [277]:
# creating a dataframe
accuracytable= pd.DataFrame({"Algorithms":Algorithm,"Accuracy":accuracy,"ROCscore":ROC,"F1_Score":F1})


In [278]:
accuracytable.sort_values(by="F1_Score",ascending=False,inplace=True)
accuracytable

Unnamed: 0,Algorithms,Accuracy,ROCscore,F1_Score
1,DecisionTree,0.909344,0.807816,0.710995
4,SVM,0.900568,0.791648,0.691207
5,Votingclassifier,0.900568,0.767687,0.674904
0,Logistic,0.893256,0.77924,0.674786
2,Randomforest,0.87692,0.75905,0.666028
3,Naivebayes,0.87725,0.753056,0.62897


# Cross validation score table

In [281]:
# name
algo=list(["Logistic","DecisionTree","Randomforest","Naivebayes","SVM"])

In [282]:
# cross validation table for all alogorithms to check for stable model
minscore=list([cross_val_logreg.min(),cross_val_dt.min(),cross_val_ran.min(),cross_val_naive_bayes.min(),cross_val_sv.min()])
maxscore=list([cross_val_logreg.max(),cross_val_dt.max(),cross_val_ran.max(),cross_val_naive_bayes.max(),cross_val_sv.max()])
meanscore=list([cross_val_logreg.mean(),cross_val_dt.mean(),cross_val_ran.mean(),cross_val_naive_bayes.mean(),cross_val_sv.mean()])

In [283]:
crossval_table=pd.DataFrame({"Algorithms":algo,"cv_min_score":minscore,"cv_max_score":maxscore,"cv_meanscore":meanscore})


In [284]:
crossval_table

Unnamed: 0,Algorithms,cv_min_score,cv_max_score,cv_meanscore
0,Logistic,0.865733,0.86758,0.86662
1,DecisionTree,0.880844,0.883148,0.882042
2,Randomforest,0.871978,0.875013,0.873166
3,Naivebayes,0.842353,0.845366,0.843931
4,SVM,0.695229,0.874107,0.817758


In [None]:
# Hence we can conclude that decision tree gives the best result