# Flight Delay Prediction Project

## Part 4: Discounting Weather from Historical Arrival Delays

Our model has been trained to predict ARR_DELAY given flight features (OP_CARRIER, ORIGIN, DESTINATION, etc.) and weather features (PRECIP, WIND_SPEED, VISIBILITY, etc.).

To discount the effect of weather on historical delays, we predict ARR_DELAY for each flight using the mean weather at its origin and destination airports.

In [None]:
import os
import time

import numpy as np
import pandas as pd

try:
    # sklearn.externals.joblib was removed in scikit-learn 0.23; the standalone
    # joblib package is installed as a scikit-learn dependency.
    import joblib
except ImportError:
    # Fallback for old environments that only ship the vendored copy.
    from sklearn.externals import joblib


# Work from the project's code directory so the relative data and model paths
# used by the following cells resolve.
PROJECT_DIR = "/content/drive/MyDrive/Projects/Flight_Delay_Predict_Project/CodeFiles"  # Default project directory
os.chdir(PROJECT_DIR)



In [None]:
# Load the combined airline + weather dataset and report how long the read took.
read_started = time.time()

# Prepare DF for prediction with mean climate
df = pd.read_csv('Airline+Weather_data.csv')

read_elapsed = time.time() - read_started
print("Finished reading CSV file in " + str(read_elapsed) + " seconds")

Finished reading CSV file in 3.961743116378784 seconds


In [None]:
# Prepare the data: drop unused columns, normalise ARR_HOUR, remove incomplete
# rows and one-hot encode the categorical features.
tic = time.time()

# Drop variables which have no correlation with arrival delays or which cannot
# be known until the flight has taken place.
columns_to_drop = [
    'YEAR', 'DAY_OF_MONTH', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY',
    'CRS_ARR_TIME', 'ARR_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME',
    'DEP_AVG_HourlyVisibility', 'DEP_AVG_HourlyDryBulbTemperature',
    'DEP_AVG_HourlyWindSpeed', 'DEP_AVG_HourlyPrecipitation',
    'ARR_AVG_HourlyVisibility', 'ARR_AVG_HourlyDryBulbTemperature',
    'ARR_AVG_HourlyWindSpeed', 'ARR_AVG_HourlyPrecipitation',
]
df = df.drop(columns_to_drop, axis=1)

# Remove data redundancy: hour 24 is folded into hour 0. A vectorised replace
# avoids calling a Python lambda once per row.
df['ARR_HOUR'] = df['ARR_HOUR'].replace(24, 0)

# Drop rows with null values
df = df.dropna()

# Convert the categorical features to dummy variables. All dummy frames are
# built from the untouched source columns and attached with a single concat,
# so the resulting column order is the same as attaching them one at a time
# while copying the full frame only once.
# NOTE(review): the dummy columns produced here must match the feature columns
# (and their order) the saved models were trained on - confirm against the
# training notebook.
categorical_columns = [
    'MONTH', 'DAY_OF_WEEK', 'OP_CARRIER', 'ORIGIN', 'DEST', 'DEP_HOUR', 'ARR_HOUR',
]
dummy_frames = [
    pd.get_dummies(df[column], drop_first=True, prefix=column)
    for column in categorical_columns
]
df = pd.concat([df] + dummy_frames, axis=1)

# Extra columns to be kept are: ARR_DELAY, DAY_OF_WEEK, OP_CARRIER, DEP_HOUR
# (the prediction cell copies them into its output); the remaining encoded
# source columns are no longer needed.
df = df.drop(['MONTH', 'ORIGIN', 'DEST', 'ARR_HOUR'], axis=1)

toc = time.time()
print("Finished preparing data in " + str(toc-tic) + " seconds")

Finished preparing data in 2.2565112113952637 seconds


In [None]:
# Predict arrival delays for every flight in `df`, batch by batch, and write
# the results to 'predicted_data.csv'.
#
# For each batch:
#   1. the linear model predicts the delay (its output is exponentiated -
#      presumably it was fitted on log(ARR_DELAY); confirm against training),
#   2. the logistic models' probabilities of a delay are averaged and
#      thresholded to give a 0/1 PREDICTED_DELAY_YN flag,
#   3. the predicted delay is zeroed wherever the flag is 0, and actual
#      delays below 5 minutes are zeroed as well.
tic = time.time()

n = df.shape[0]  # number of rows
batch_size = 1000000  # number of rows scored in each predict call
n_models = 10  # Number of logistic models to average over
delay_threshold = 0.46  # mean probability above which a flight counts as delayed
# Columns copied to the output instead of being fed to the models.
passthrough_columns = ['ARR_DELAY', 'DAY_OF_WEEK', 'OP_CARRIER', 'DEP_HOUR']

# Load every model once, up front, rather than re-reading the logistic models
# from disk for each batch.
# NOTE: joblib.load unpickles its input - only load model files you trust.
lm = joblib.load('linearmodel.pkl')
logmodels = [joblib.load(str(i)+'_logmodel.pkl') for i in range(n_models)]

batches = []
for start in range(0, n, batch_size):
    # Positional slice; the last batch may be shorter than batch_size.
    batch = df.iloc[start:start + batch_size]

    # Explicit copy so the new columns are not assigned onto a view of `df`.
    df_predict = batch[passthrough_columns].copy()
    X = batch.drop(passthrough_columns, axis=1)

    # Predict delay in minutes (for all flights). ravel() keeps the result
    # one-dimensional even if the model returns a column vector.
    predicted_delay = np.exp(np.ravel(lm.predict(X)))

    # Logistic regression decides whether the flight was more than 5 min late:
    # take the mean probability of DELAY_YN = 1 over all models.
    delay_probability = np.zeros(len(df_predict.index))
    for logmodel in logmodels:
        delay_probability = delay_probability + logmodel.predict_proba(X)[:, 1]
    delay_probability = delay_probability / n_models
    predicted_delay_yn = (delay_probability > delay_threshold).astype(np.int64)

    # If logistic regression predicts DELAY_YN = 0, then PREDICTED_ARR_DELAY = 0.
    # Column assignment order is kept so the CSV layout is unchanged.
    df_predict['PREDICTED_ARR_DELAY'] = predicted_delay * predicted_delay_yn
    df_predict['PREDICTED_DELAY_YN'] = predicted_delay_yn

    # Actual delays under 5 minutes count as "not delayed".
    df_predict['ARR_DELAY'] = df_predict['ARR_DELAY'].where(df_predict['ARR_DELAY'] >= 5, 0)

    batches.append(df_predict)

# One concat at the end instead of DataFrame.append per batch (append was
# removed in pandas 2.0 and re-copied the accumulated frame every time).
# With no rows at all an empty frame is written.
df_predicted = pd.concat(batches) if batches else pd.DataFrame()
df_predicted.to_csv('predicted_data.csv',index=False)

toc = time.time()
print("Finished prediction in " + str(toc-tic) + " seconds")

Finished prediction in 9.137588500976562 seconds
