# Flight Delay Prediction Project

## Part 3: ML model to predict the airline delay

The objective of our model is to predict arrival delays. We use the following two sub-models to make the prediction:



1.   Delay Classification Model:


* Classify [0/1] whether a flight is delayed by 5 minutes or more
* Trained a Logistic Regression model
* Averaged predictions over n models (the code below uses n=10)
* Output probability of delay P(delay)


2.   Predicted Delay


*   Regression using Linear Regression
*   Trained only on flights classified as delayed (DELAY_YN = 1), using the log of the arrival delay as the target





In [None]:
import os
import time

import numpy as np
import pandas as pd

try:
    # joblib is a standalone package; newer scikit-learn releases no longer
    # re-export it under sklearn.externals.
    import joblib
except ImportError:
    # Fallback for old environments that only ship the copy bundled with scikit-learn.
    from sklearn.externals import joblib

# Switch the working directory to the project folder so that the relative paths
# used below (the input CSV and the *.pkl model files) resolve inside it.
# NOTE(review): the "/content/drive/MyDrive" prefix looks like a Colab Google
# Drive mount - confirm/adjust before running in another environment.
os.chdir("/content/drive/MyDrive/Projects/Flight_Delay_Predict_Project/CodeFiles") #Default Project directory




In [None]:
# Load the combined airline + weather dataset into `df` (the frame every later
# cell works on) and report the wall-clock time the read took.
csv_path = 'Airline+Weather_data.csv'

tic = time.time()
df = pd.read_csv(csv_path)
toc = time.time()

read_seconds = toc - tic
print("Finished reading CSV file in " + str(read_seconds) + " seconds")

Finished reading CSV file in 3.658984422683716 seconds


In [None]:
#Prepare the data: drop unused columns, clean rows, one-hot encode the
#categorical columns and derive the binary DELAY_YN label.
tic = time.time()

#Columns the author excludes as not correlated with arrival delay or not
#known before the flight
DROPPED_COLUMNS = ['YEAR','DAY_OF_MONTH','CRS_DEP_TIME','DEP_TIME','DEP_DELAY','CRS_ARR_TIME','ARR_TIME',
                   'ACTUAL_ELAPSED_TIME','AIR_TIME','DEP_AVG_HourlyVisibility','DEP_AVG_HourlyDryBulbTemperature',
                   'DEP_AVG_HourlyWindSpeed','DEP_AVG_HourlyPrecipitation','ARR_AVG_HourlyVisibility',
                   'ARR_AVG_HourlyDryBulbTemperature','ARR_AVG_HourlyWindSpeed','ARR_AVG_HourlyPrecipitation']
#Columns that are replaced by dummy (one-hot) columns, in this order
CATEGORICAL_COLUMNS = ['MONTH','DAY_OF_WEEK','OP_CARRIER','ORIGIN','DEST','DEP_HOUR','ARR_HOUR']
#A flight counts as delayed when ARR_DELAY is at least this many minutes
DELAY_THRESHOLD_MINUTES = 5

df.drop(DROPPED_COLUMNS, axis=1, inplace=True)

#Remove data redundancy: hour 24 and hour 0 are the same hour, keep only 0
df['ARR_HOUR'] = df['ARR_HOUR'].replace(24, 0)

#Drop rows with Null Values
df.dropna(inplace=True)

#Convert to Dummy Variables. A single call encodes every categorical column
#(prefix = column name, first level dropped), removes the source columns and
#appends the dummy columns in the order listed above.
df = pd.get_dummies(df, columns=CATEGORICAL_COLUMNS, drop_first=True)

#DELAY_YN -> Delay Yes or No -> 1 if ARR_DELAY >= 5 minutes, else 0
df['DELAY_YN'] = (df['ARR_DELAY'] >= DELAY_THRESHOLD_MINUTES).astype(int)

toc = time.time()
print("Finished preparing data in " + str(toc-tic) + " seconds")

Finished preparing data in 1.9548366069793701 seconds


In [None]:
#Create 'n' different Logistic Regression Models.
#Each model is fitted on its own random resample of `df` and saved to
#'<i>_logmodel.pkl' so that the later cells can average their predictions.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

n = 10 #Number of models to average over

#Resample sizes: positive and negative classes in the ratio 40:60
N_POSITIVE_SAMPLES = 400000
N_NEGATIVE_SAMPLES = 600000
#The recorded run stopped at the solver's default iteration limit on every
#model (see the "ITERATIONS REACHED LIMIT" warnings), so allow more iterations.
#This makes each fit slower; scaling the features is the other remedy the
#warning suggests.
MAX_ITER = 1000

#Row labels of each class; these do not change between iterations
delayed_index = df[df['DELAY_YN']==1].index
on_time_index = df[df['DELAY_YN']==0].index

for i in range(n):

    tic = time.time()
    #Create a randomly selected smaller dataset for training purpose
    #(drawn with replacement, unseeded, so every run produces different models)
    sampled_delayed = df.loc[np.random.choice(delayed_index, N_POSITIVE_SAMPLES, replace = True)]
    sampled_on_time = df.loc[np.random.choice(on_time_index, N_NEGATIVE_SAMPLES, replace = True)]
    #DataFrame.append no longer exists in current pandas; concat is the equivalent
    df_split = pd.concat([sampled_delayed, sampled_on_time], ignore_index=True)

    #NOTE(review): the rows were drawn with replacement *before* this split, so
    #copies of the same flight can end up in both the train and the test part;
    #the accuracy printed below is therefore likely optimistic.
    X_train, X_test, y_train, y_test = train_test_split(df_split.drop(['DELAY_YN','ARR_DELAY'],axis=1),
                                                        df_split['DELAY_YN'], test_size=0.10, random_state=101)

    logmodel = LogisticRegression(max_iter=MAX_ITER)
    logmodel.fit(X_train,y_train)

    predictions = logmodel.predict(X_test)

    #Accuracy = share of held-out rows whose predicted label equals the true label
    accuracy = float((predictions == y_test.to_numpy()).mean())
    print('Accuracy: '+str(accuracy))

    joblib.dump(logmodel, str(i)+'_logmodel.pkl')

    toc = time.time()
    print(str(i+1)+"th fold took " + str(toc-tic) + " seconds")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.62736
1th fold took 54.839728355407715 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.62828
2th fold took 53.38365173339844 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.62585
3th fold took 53.70569324493408 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.62589
4th fold took 51.71776032447815 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.62699
5th fold took 53.64481592178345 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.6261
6th fold took 54.96917104721069 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.62618
7th fold took 53.41668152809143 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.62753
8th fold took 53.66283988952637 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.62706
9th fold took 57.27164030075073 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.62469
10th fold took 51.73858046531677 seconds


In [None]:
#Test the Model performance (on Training data only).
#The evaluation rows are re-sampled from the same `df` the models were trained
#on, so the numbers printed here are not an out-of-sample estimate.
df2 = df.loc[np.random.choice(df.index, 1000000, replace = True)]
X_test = df2.drop(['ARR_DELAY','DELAY_YN'],axis=1)
#Keep an independent copy of the true labels: df2['DELAY_YN'] is overwritten
#below with the summed probabilities.
y_test = df2['DELAY_YN'].copy()

n = 10 #Number of models to average over

true_labels = y_test.to_numpy()
#Running sum of P(DELAY_YN = 1) over all models; the next cell divides it by n
summed_probability = np.zeros(len(df2.index))

for i in range(n):
    logmodel = joblib.load(str(i)+'_logmodel.pkl')
    predictions = logmodel.predict(X_test)

    #Column 1 of predict_proba is the probability of the positive class
    summed_probability = summed_probability + logmodel.predict_proba(X_test)[:,1]

    #Accuracy of this single model = share of rows predicted correctly
    accuracy = float((predictions == true_labels).mean())
    print('Accuracy: '+str(accuracy))

#From here on df2['DELAY_YN'] holds the summed probabilities, not the labels
df2['DELAY_YN'] = summed_probability


Accuracy: 0.773554
Accuracy: 0.776456
Accuracy: 0.771744
Accuracy: 0.774679
Accuracy: 0.77745
Accuracy: 0.777271
Accuracy: 0.776318
Accuracy: 0.776062
Accuracy: 0.775448
Accuracy: 0.778734


In [None]:
#Take Average of probabilities for positive class (DELAY_YN = 1).
#If the average probability is above VOTE_THRESHOLD, assign value=1.
#Note the cut-off is 0.46, not 0.5.
VOTE_THRESHOLD = 0.46

#df2['DELAY_YN'] holds the probabilities summed over the n models
mean_probability = df2['DELAY_YN']/n
df2['DELAY_YN_vote'] = (mean_probability > VOTE_THRESHOLD).astype(int) #Take Vote

#Compare as plain arrays: df2 was sampled with replacement, so its index
#contains duplicate labels and positional comparison is the safe choice
vote = df2['DELAY_YN_vote'].to_numpy()
truth = y_test.to_numpy()

#Confusion-matrix counts
TP = int(((vote == 1) & (truth == 1)).sum())
FP = int(((vote == 1) & (truth != 1)).sum())
TN = int(((vote == 0) & (truth == 0)).sum())
FN = int(((vote == 0) & (truth != 0)).sum())

accuracy = float(TP + TN)/float(TP + TN + FP + FN)
print('Final Accuracy: '+str(accuracy))
print('TP: '+str(TP))
print('FP: '+str(FP))
print('TN: '+str(TN))
print('FN: '+str(FN))
print('% of positive predictions:')
#Printed as a fraction between 0 and 1; TP + FP is the number of rows voted 1
print((TP + FP)/len(df2.index))

Final Accuracy: 0.734007
TP: 57561
FP: 132208
TN: 676446
FN: 133785
% of positive predictions:
0.189769


In [None]:
#Regression dataset: only the rows labelled as delayed (DELAY_YN == 1), plus a
#'log_delay' column holding the natural log of ARR_DELAY (the regression target)
is_delayed = df['DELAY_YN'] == 1
df_late = df.loc[is_delayed].assign(log_delay=lambda late: np.log(late['ARR_DELAY']))

late_row_count = len(df_late.index)
print('Total positive delay datapoints:' + str(late_row_count))

Total positive delay datapoints:104712


In [None]:
#Modeling ARR_DELAY: fit a linear regression on log(ARR_DELAY) for the delayed
#flights, report the errors in minutes on a 30% hold-out and save the model.
tic = time.time()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

#ARR_DELAY stays in X_train/X_test so the errors can be measured in minutes;
#it is dropped again before the frames are handed to the model.
X_train, X_test, y_train, y_test = train_test_split(df_late.drop(['DELAY_YN','log_delay'],axis=1),
                                                    df_late['log_delay'], test_size=0.30, random_state=101)
#Work on an explicit copy so that adding the 'predicted' column below cannot
#trigger pandas' SettingWithCopyWarning
X_test = X_test.copy()

print('Training...')
#The `normalize` argument was removed from LinearRegression in newer
#scikit-learn releases, and passing it raises a TypeError there.
#NOTE(review): for plain least squares the scaling is not expected to change
#the predictions materially - confirm against the recorded metrics.
lm = LinearRegression()
lm.fit(X_train.drop('ARR_DELAY',axis=1),y_train)

print('Predicting on test set...')
predictions = lm.predict(X_test.drop('ARR_DELAY',axis=1))

#The model predicts log-minutes; exponentiate to get back to minutes
X_test['predicted'] = np.exp(predictions)

mae = metrics.mean_absolute_error(X_test['ARR_DELAY'],X_test['predicted'])
mse = metrics.mean_squared_error(X_test['ARR_DELAY'],X_test['predicted'])
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

joblib.dump(lm, 'linearmodel.pkl')

toc = time.time()
print("Finished fitting Linear Regression in " + str(toc-tic) + " seconds")

Training...
Predicting on test set...
MAE: 23.57304396241335
MSE: 4970.094951207509
RMSE: 70.4989003545978
Finished fitting Linear Regression in 1.4992551803588867 seconds
