In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# show every column when a DataFrame is displayed (no truncation)
pd.options.display.max_columns = None
from sklearn import preprocessing
import datetime as dt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import pickle

## Read in the test file for final predictions

In [2]:
# load the test flights and discard the index column written by a previous to_csv
df = pd.read_csv('flights_test.csv', sep=',').drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363
1,2020-01-01,WN,WN,WN,6276,WN,N467WN,6276,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1150,1320,N,90,1,363
2,2020-01-01,WN,WN,WN,4598,WN,N7885A,4598,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",2020,2130,N,70,1,333
3,2020-01-01,WN,WN,WN,4761,WN,N551WN,4761,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",1340,1455,N,75,1,333
4,2020-01-01,WN,WN,WN,5162,WN,N968WN,5162,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",915,1035,N,80,1,333


In [3]:
# drop columns that are highly correlated with others
redundant_columns = [
    'branded_code_share', 'mkt_carrier', 'op_unique_carrier', 'op_carrier_fl_num', 'flights',
    'origin_airport_id', 'origin_city_name', 'dest_airport_id', 'dest_city_name', 'tail_num', 'dup',
]
df = df.drop(redundant_columns, axis=1)

In [4]:
# DataFrame.dropna() returns a new frame, so the bare `df.dropna()` call that
# used to be here discarded its result and never removed any rows.
# Rows are deliberately NOT dropped: the predictions are later attached by
# position to a fresh read of flights_test.csv, so the row count must not change.
# Fail loudly instead if any of the remaining columns has missing values.
missing_per_column = df.isnull().sum()
assert missing_per_column.sum() == 0, (
    'unexpected missing values in test data:\n'
    + str(missing_per_column[missing_per_column > 0])
)
df.shape

(150623, 9)

In [5]:
# parse the flight date strings into datetime64 values
parsed_dates = pd.to_datetime(df['fl_date'])
df = df.assign(fl_date=parsed_dates)
df.dtypes

fl_date               datetime64[ns]
mkt_unique_carrier            object
mkt_carrier_fl_num             int64
origin                        object
dest                          object
crs_dep_time                   int64
crs_arr_time                   int64
crs_elapsed_time               int64
distance                       int64
dtype: object

In [6]:
# derive calendar features (year, month, day, weekday) from the flight date
flight_dates = df['fl_date'].dt
for date_part in ('year', 'month', 'day', 'dayofweek'):
    df[date_part] = getattr(flight_dates, date_part)

In [7]:
# the raw date is no longer needed once its parts have been extracted
df = df.drop('fl_date', axis=1)

In [8]:
# preview the first rows now that the date parts were added and fl_date was dropped
df.head()

Unnamed: 0,mkt_unique_carrier,mkt_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,year,month,day,dayofweek
0,WN,5888,ONT,SFO,1810,1945,95,363,2020,1,1,2
1,WN,6276,ONT,SFO,1150,1320,90,363,2020,1,1,2
2,WN,4598,ONT,SJC,2020,2130,70,333,2020,1,1,2
3,WN,4761,ONT,SJC,1340,1455,75,333,2020,1,1,2
4,WN,5162,ONT,SJC,915,1035,80,333,2020,1,1,2


In [9]:
# normalise the scheduled times to integers first, then to their string form
for time_col in ('crs_dep_time', 'crs_arr_time'):
    df[time_col] = df[time_col].astype(int)
for time_col in ('crs_dep_time', 'crs_arr_time'):
    df[time_col] = df[time_col].astype(str)

In [10]:
# zero-pad the scheduled times to HHMM and take the leading two characters as
# the hour; an hour of '24' is mapped to '00'
for time_col, hour_col in (('crs_dep_time', 'dep_h'), ('crs_arr_time', 'arr_h')):
    padded_time = df[time_col].str.zfill(4)
    df[time_col] = padded_time
    df[hour_col] = padded_time.str[:2].replace('24', '00')

In [11]:
# work on an independent copy so `df` keeps the cleaned base columns
df1 = df.copy(deep=True)

### Follow the same steps as in the Data Preparation file so that the data is in the same format the models were trained on

In [12]:
# Read in saved historical data
def _read_feature_table(name):
    """Read feature_data/<name>.csv and drop its leftover 'Unnamed: 0' column.

    Parameters
    ----------
    name : str
        File name without the '.csv' extension, e.g. 'df_carrier_delays'.

    Returns
    -------
    pandas.DataFrame
        The table as stored on disk, minus the 'Unnamed: 0' column.
    """
    table = pd.read_csv('feature_data/' + name + '.csv', sep=',')
    return table.drop(columns=['Unnamed: 0'])


# one lookup table per historical summary statistic
df_carrier_delays = _read_feature_table('df_carrier_delays')
df_origin_delays = _read_feature_table('df_origin_delays')
df_dest_delays = _read_feature_table('df_dest_delays')
df_month_delays = _read_feature_table('df_month_delays')
df_dep_h_delays = _read_feature_table('df_dep_h_delays')
df_arr_h_delays = _read_feature_table('df_arr_h_delays')
df_day_of_week_delays = _read_feature_table('df_day_of_week_delays')
df_route_delays = _read_feature_table('df_route_delays')
df_dep_h_taxi_out = _read_feature_table('df_dep_h_taxi_out')
df_arr_h_taxi_in = _read_feature_table('df_arr_h_taxi_in')
df_flights_dep_h = _read_feature_table('df_flights_dep_h')

In [13]:
# put the columns in the same order as the training data
training_column_order = [
    'year', 'month', 'day', 'dayofweek', 'mkt_unique_carrier', 'mkt_carrier_fl_num', 'origin',
    'dest', 'crs_dep_time', 'dep_h', 'crs_arr_time', 'arr_h', 'crs_elapsed_time', 'distance',
]
df1 = df1.loc[:, training_column_order].copy()

In [14]:
# independent copy that the merges below start from
df_test = df1.copy(deep=True)

### Merge dataframes based on historical summary statistics

In [15]:
# attach the per-carrier lookup; a left join keeps every test row
df_merged = pd.merge(
    df_test,
    df_carrier_delays,
    how='left',
    left_on='mkt_unique_carrier',
    right_on='carrier',
)
df_merged.head()

Unnamed: 0,year,month,day,dayofweek,mkt_unique_carrier,mkt_carrier_fl_num,origin,dest,crs_dep_time,dep_h,crs_arr_time,arr_h,crs_elapsed_time,distance,carrier,mean_delay_carrier
0,2020,1,1,2,WN,5888,ONT,SFO,1810,18,1945,19,95,363,WN,3.352232
1,2020,1,1,2,WN,6276,ONT,SFO,1150,11,1320,13,90,363,WN,3.352232
2,2020,1,1,2,WN,4598,ONT,SJC,2020,20,2130,21,70,333,WN,3.352232
3,2020,1,1,2,WN,4761,ONT,SJC,1340,13,1455,14,75,333,WN,3.352232
4,2020,1,1,2,WN,5162,ONT,SJC,915,9,1035,10,80,333,WN,3.352232


In [16]:
# 'carrier' was only the join key from the lookup table and duplicates mkt_unique_carrier
df_merged = df_merged.drop('carrier', axis=1)

In [17]:
# attach the per-origin lookup (left join on the shared 'origin' key)
df_merged = df_merged.merge(df_origin_delays, how='left', on='origin')
df_merged.head()

Unnamed: 0,year,month,day,dayofweek,mkt_unique_carrier,mkt_carrier_fl_num,origin,dest,crs_dep_time,dep_h,crs_arr_time,arr_h,crs_elapsed_time,distance,mean_delay_carrier,mean_delay_origin
0,2020,1,1,2,WN,5888,ONT,SFO,1810,18,1945,19,95,363,3.352232,1.241158
1,2020,1,1,2,WN,6276,ONT,SFO,1150,11,1320,13,90,363,3.352232,1.241158
2,2020,1,1,2,WN,4598,ONT,SJC,2020,20,2130,21,70,333,3.352232,1.241158
3,2020,1,1,2,WN,4761,ONT,SJC,1340,13,1455,14,75,333,3.352232,1.241158
4,2020,1,1,2,WN,5162,ONT,SJC,915,9,1035,10,80,333,3.352232,1.241158


In [18]:
# attach the per-destination lookup (left join on the shared 'dest' key)
df_merged = df_merged.merge(df_dest_delays, how='left', on='dest')
df_merged.head()

Unnamed: 0,year,month,day,dayofweek,mkt_unique_carrier,mkt_carrier_fl_num,origin,dest,crs_dep_time,dep_h,crs_arr_time,arr_h,crs_elapsed_time,distance,mean_delay_carrier,mean_delay_origin,mean_delay_dest
0,2020,1,1,2,WN,5888,ONT,SFO,1810,18,1945,19,95,363,3.352232,1.241158,2.888122
1,2020,1,1,2,WN,6276,ONT,SFO,1150,11,1320,13,90,363,3.352232,1.241158,2.888122
2,2020,1,1,2,WN,4598,ONT,SJC,2020,20,2130,21,70,333,3.352232,1.241158,1.383513
3,2020,1,1,2,WN,4761,ONT,SJC,1340,13,1455,14,75,333,3.352232,1.241158,1.383513
4,2020,1,1,2,WN,5162,ONT,SJC,915,9,1035,10,80,333,3.352232,1.241158,1.383513


In [19]:
# attach the per-month lookup (left join on the shared 'month' key)
df_merged = df_merged.merge(df_month_delays, how='left', on='month')
df_merged.head()

Unnamed: 0,year,month,day,dayofweek,mkt_unique_carrier,mkt_carrier_fl_num,origin,dest,crs_dep_time,dep_h,crs_arr_time,arr_h,crs_elapsed_time,distance,mean_delay_carrier,mean_delay_origin,mean_delay_dest,mean_delay_month
0,2020,1,1,2,WN,5888,ONT,SFO,1810,18,1945,19,95,363,3.352232,1.241158,2.888122,0.430771
1,2020,1,1,2,WN,6276,ONT,SFO,1150,11,1320,13,90,363,3.352232,1.241158,2.888122,0.430771
2,2020,1,1,2,WN,4598,ONT,SJC,2020,20,2130,21,70,333,3.352232,1.241158,1.383513,0.430771
3,2020,1,1,2,WN,4761,ONT,SJC,1340,13,1455,14,75,333,3.352232,1.241158,1.383513,0.430771
4,2020,1,1,2,WN,5162,ONT,SJC,915,9,1035,10,80,333,3.352232,1.241158,1.383513,0.430771


In [20]:
# the hour columns are strings at this point; cast them to int before the
# hour-keyed merges that follow
df_merged = df_merged.astype({'dep_h': int, 'arr_h': int})

In [21]:
# attach the per-departure-hour delay lookup (left join on 'dep_h')
df_merged = df_merged.merge(df_dep_h_delays, how='left', on='dep_h')
df_merged.head()

Unnamed: 0,year,month,day,dayofweek,mkt_unique_carrier,mkt_carrier_fl_num,origin,dest,crs_dep_time,dep_h,crs_arr_time,arr_h,crs_elapsed_time,distance,mean_delay_carrier,mean_delay_origin,mean_delay_dest,mean_delay_month,mean_delay_dep_h
0,2020,1,1,2,WN,5888,ONT,SFO,1810,18,1945,19,95,363,3.352232,1.241158,2.888122,0.430771,5.422463
1,2020,1,1,2,WN,6276,ONT,SFO,1150,11,1320,13,90,363,3.352232,1.241158,2.888122,0.430771,0.639966
2,2020,1,1,2,WN,4598,ONT,SJC,2020,20,2130,21,70,333,3.352232,1.241158,1.383513,0.430771,4.958738
3,2020,1,1,2,WN,4761,ONT,SJC,1340,13,1455,14,75,333,3.352232,1.241158,1.383513,0.430771,2.347908
4,2020,1,1,2,WN,5162,ONT,SJC,915,9,1035,10,80,333,3.352232,1.241158,1.383513,0.430771,-0.834127


In [22]:
# attach the per-arrival-hour delay lookup (left join on 'arr_h')
df_merged = df_merged.merge(df_arr_h_delays, how='left', on='arr_h')
df_merged.head()

Unnamed: 0,year,month,day,dayofweek,mkt_unique_carrier,mkt_carrier_fl_num,origin,dest,crs_dep_time,dep_h,crs_arr_time,arr_h,crs_elapsed_time,distance,mean_delay_carrier,mean_delay_origin,mean_delay_dest,mean_delay_month,mean_delay_dep_h,mean_delay_arr_h
0,2020,1,1,2,WN,5888,ONT,SFO,1810,18,1945,19,95,363,3.352232,1.241158,2.888122,0.430771,5.422463,4.050495
1,2020,1,1,2,WN,6276,ONT,SFO,1150,11,1320,13,90,363,3.352232,1.241158,2.888122,0.430771,0.639966,0.715789
2,2020,1,1,2,WN,4598,ONT,SJC,2020,20,2130,21,70,333,3.352232,1.241158,1.383513,0.430771,4.958738,4.449983
3,2020,1,1,2,WN,4761,ONT,SJC,1340,13,1455,14,75,333,3.352232,1.241158,1.383513,0.430771,2.347908,1.266858
4,2020,1,1,2,WN,5162,ONT,SJC,915,9,1035,10,80,333,3.352232,1.241158,1.383513,0.430771,-0.834127,-1.399276


In [23]:
# attach the per-weekday lookup (left join on 'dayofweek')
df_merged = df_merged.merge(df_day_of_week_delays, how='left', on='dayofweek')
df_merged.head()

Unnamed: 0,year,month,day,dayofweek,mkt_unique_carrier,mkt_carrier_fl_num,origin,dest,crs_dep_time,dep_h,crs_arr_time,arr_h,crs_elapsed_time,distance,mean_delay_carrier,mean_delay_origin,mean_delay_dest,mean_delay_month,mean_delay_dep_h,mean_delay_arr_h,mean_delay_day_of_week
0,2020,1,1,2,WN,5888,ONT,SFO,1810,18,1945,19,95,363,3.352232,1.241158,2.888122,0.430771,5.422463,4.050495,1.443391
1,2020,1,1,2,WN,6276,ONT,SFO,1150,11,1320,13,90,363,3.352232,1.241158,2.888122,0.430771,0.639966,0.715789,1.443391
2,2020,1,1,2,WN,4598,ONT,SJC,2020,20,2130,21,70,333,3.352232,1.241158,1.383513,0.430771,4.958738,4.449983,1.443391
3,2020,1,1,2,WN,4761,ONT,SJC,1340,13,1455,14,75,333,3.352232,1.241158,1.383513,0.430771,2.347908,1.266858,1.443391
4,2020,1,1,2,WN,5162,ONT,SJC,915,9,1035,10,80,333,3.352232,1.241158,1.383513,0.430771,-0.834127,-1.399276,1.443391


In [24]:
# build the route key by concatenating the origin and destination codes
df_merged = df_merged.assign(route=df_merged['origin'] + df_merged['dest'])

In [25]:
# attach the per-route lookup (left join on 'route'); routes missing from the
# lookup table come out as NaN
df_merged = df_merged.merge(df_route_delays, how='left', on='route')
df_merged.head()

Unnamed: 0,year,month,day,dayofweek,mkt_unique_carrier,mkt_carrier_fl_num,origin,dest,crs_dep_time,dep_h,crs_arr_time,arr_h,crs_elapsed_time,distance,mean_delay_carrier,mean_delay_origin,mean_delay_dest,mean_delay_month,mean_delay_dep_h,mean_delay_arr_h,mean_delay_day_of_week,route,mean_delay_route
0,2020,1,1,2,WN,5888,ONT,SFO,1810,18,1945,19,95,363,3.352232,1.241158,2.888122,0.430771,5.422463,4.050495,1.443391,ONTSFO,7.3125
1,2020,1,1,2,WN,6276,ONT,SFO,1150,11,1320,13,90,363,3.352232,1.241158,2.888122,0.430771,0.639966,0.715789,1.443391,ONTSFO,7.3125
2,2020,1,1,2,WN,4598,ONT,SJC,2020,20,2130,21,70,333,3.352232,1.241158,1.383513,0.430771,4.958738,4.449983,1.443391,ONTSJC,-1.315789
3,2020,1,1,2,WN,4761,ONT,SJC,1340,13,1455,14,75,333,3.352232,1.241158,1.383513,0.430771,2.347908,1.266858,1.443391,ONTSJC,-1.315789
4,2020,1,1,2,WN,5162,ONT,SJC,915,9,1035,10,80,333,3.352232,1.241158,1.383513,0.430771,-0.834127,-1.399276,1.443391,ONTSJC,-1.315789


In [26]:
# attach the taxi-out lookup keyed by departure hour (left join on 'dep_h')
df_merged = df_merged.merge(df_dep_h_taxi_out, how='left', on='dep_h')
df_merged.head()

Unnamed: 0,year,month,day,dayofweek,mkt_unique_carrier,mkt_carrier_fl_num,origin,dest,crs_dep_time,dep_h,crs_arr_time,arr_h,crs_elapsed_time,distance,mean_delay_carrier,mean_delay_origin,mean_delay_dest,mean_delay_month,mean_delay_dep_h,mean_delay_arr_h,mean_delay_day_of_week,route,mean_delay_route,mean_taxi_out_per_dep_h
0,2020,1,1,2,WN,5888,ONT,SFO,1810,18,1945,19,95,363,3.352232,1.241158,2.888122,0.430771,5.422463,4.050495,1.443391,ONTSFO,7.3125,18.517988
1,2020,1,1,2,WN,6276,ONT,SFO,1150,11,1320,13,90,363,3.352232,1.241158,2.888122,0.430771,0.639966,0.715789,1.443391,ONTSFO,7.3125,17.926986
2,2020,1,1,2,WN,4598,ONT,SJC,2020,20,2130,21,70,333,3.352232,1.241158,1.383513,0.430771,4.958738,4.449983,1.443391,ONTSJC,-1.315789,18.392321
3,2020,1,1,2,WN,4761,ONT,SJC,1340,13,1455,14,75,333,3.352232,1.241158,1.383513,0.430771,2.347908,1.266858,1.443391,ONTSJC,-1.315789,16.893937
4,2020,1,1,2,WN,5162,ONT,SJC,915,9,1035,10,80,333,3.352232,1.241158,1.383513,0.430771,-0.834127,-1.399276,1.443391,ONTSJC,-1.315789,19.492329


In [27]:
# attach the taxi-in lookup keyed by arrival hour (left join on 'arr_h')
df_merged = df_merged.merge(df_arr_h_taxi_in, how='left', on='arr_h')
df_merged.head()

Unnamed: 0,year,month,day,dayofweek,mkt_unique_carrier,mkt_carrier_fl_num,origin,dest,crs_dep_time,dep_h,crs_arr_time,arr_h,crs_elapsed_time,distance,mean_delay_carrier,mean_delay_origin,mean_delay_dest,mean_delay_month,mean_delay_dep_h,mean_delay_arr_h,mean_delay_day_of_week,route,mean_delay_route,mean_taxi_out_per_dep_h,mean_taxi_in_per_arr_h
0,2020,1,1,2,WN,5888,ONT,SFO,1810,18,1945,19,95,363,3.352232,1.241158,2.888122,0.430771,5.422463,4.050495,1.443391,ONTSFO,7.3125,18.517988,8.599564
1,2020,1,1,2,WN,6276,ONT,SFO,1150,11,1320,13,90,363,3.352232,1.241158,2.888122,0.430771,0.639966,0.715789,1.443391,ONTSFO,7.3125,17.926986,7.503948
2,2020,1,1,2,WN,4598,ONT,SJC,2020,20,2130,21,70,333,3.352232,1.241158,1.383513,0.430771,4.958738,4.449983,1.443391,ONTSJC,-1.315789,18.392321,7.977602
3,2020,1,1,2,WN,4761,ONT,SJC,1340,13,1455,14,75,333,3.352232,1.241158,1.383513,0.430771,2.347908,1.266858,1.443391,ONTSJC,-1.315789,16.893937,7.65395
4,2020,1,1,2,WN,5162,ONT,SJC,915,9,1035,10,80,333,3.352232,1.241158,1.383513,0.430771,-0.834127,-1.399276,1.443391,ONTSJC,-1.315789,19.492329,7.996723


In [28]:
# attach the flight-count lookup keyed by departure hour (left join on 'dep_h')
df_merged = df_merged.merge(df_flights_dep_h, how='left', on='dep_h')
df_merged.head()

Unnamed: 0,year,month,day,dayofweek,mkt_unique_carrier,mkt_carrier_fl_num,origin,dest,crs_dep_time,dep_h,crs_arr_time,arr_h,crs_elapsed_time,distance,mean_delay_carrier,mean_delay_origin,mean_delay_dest,mean_delay_month,mean_delay_dep_h,mean_delay_arr_h,mean_delay_day_of_week,route,mean_delay_route,mean_taxi_out_per_dep_h,mean_taxi_in_per_arr_h,num_flights_per_dep_h
0,2020,1,1,2,WN,5888,ONT,SFO,1810,18,1945,19,95,363,3.352232,1.241158,2.888122,0.430771,5.422463,4.050495,1.443391,ONTSFO,7.3125,18.517988,8.599564,5449
1,2020,1,1,2,WN,6276,ONT,SFO,1150,11,1320,13,90,363,3.352232,1.241158,2.888122,0.430771,0.639966,0.715789,1.443391,ONTSFO,7.3125,17.926986,7.503948,5905
2,2020,1,1,2,WN,4598,ONT,SJC,2020,20,2130,21,70,333,3.352232,1.241158,1.383513,0.430771,4.958738,4.449983,1.443391,ONTSJC,-1.315789,18.392321,7.977602,4532
3,2020,1,1,2,WN,4761,ONT,SJC,1340,13,1455,14,75,333,3.352232,1.241158,1.383513,0.430771,2.347908,1.266858,1.443391,ONTSJC,-1.315789,16.893937,7.65395,5329
4,2020,1,1,2,WN,5162,ONT,SJC,915,9,1035,10,80,333,3.352232,1.241158,1.383513,0.430771,-0.834127,-1.399276,1.443391,ONTSJC,-1.315789,19.492329,7.996723,5673


In [29]:
# keep df_merged intact; the remaining preparation happens on `data`
data = df_merged.copy(deep=True)

In [30]:
# the raw time strings and the route key were only needed to build/join features
helper_columns = ['crs_dep_time', 'crs_arr_time', 'route']
data = data.drop(helper_columns, axis=1)
data.head()

Unnamed: 0,year,month,day,dayofweek,mkt_unique_carrier,mkt_carrier_fl_num,origin,dest,dep_h,arr_h,crs_elapsed_time,distance,mean_delay_carrier,mean_delay_origin,mean_delay_dest,mean_delay_month,mean_delay_dep_h,mean_delay_arr_h,mean_delay_day_of_week,mean_delay_route,mean_taxi_out_per_dep_h,mean_taxi_in_per_arr_h,num_flights_per_dep_h
0,2020,1,1,2,WN,5888,ONT,SFO,18,19,95,363,3.352232,1.241158,2.888122,0.430771,5.422463,4.050495,1.443391,7.3125,18.517988,8.599564,5449
1,2020,1,1,2,WN,6276,ONT,SFO,11,13,90,363,3.352232,1.241158,2.888122,0.430771,0.639966,0.715789,1.443391,7.3125,17.926986,7.503948,5905
2,2020,1,1,2,WN,4598,ONT,SJC,20,21,70,333,3.352232,1.241158,1.383513,0.430771,4.958738,4.449983,1.443391,-1.315789,18.392321,7.977602,4532
3,2020,1,1,2,WN,4761,ONT,SJC,13,14,75,333,3.352232,1.241158,1.383513,0.430771,2.347908,1.266858,1.443391,-1.315789,16.893937,7.65395,5329
4,2020,1,1,2,WN,5162,ONT,SJC,9,10,80,333,3.352232,1.241158,1.383513,0.430771,-0.834127,-1.399276,1.443391,-1.315789,19.492329,7.996723,5673


In [31]:
# count missing values per column left behind by the left joins
data.isna().sum()

year                          0
month                         0
day                           0
dayofweek                     0
mkt_unique_carrier            0
mkt_carrier_fl_num            0
origin                        0
dest                          0
dep_h                         0
arr_h                         0
crs_elapsed_time              0
distance                      0
mean_delay_carrier            0
mean_delay_origin            16
mean_delay_dest              15
mean_delay_month              0
mean_delay_dep_h              0
mean_delay_arr_h              0
mean_delay_day_of_week        0
mean_delay_route           1659
mean_taxi_out_per_dep_h       0
mean_taxi_in_per_arr_h        0
num_flights_per_dep_h         0
dtype: int64

In [32]:
# origins/destinations without a lookup entry get a delay of 0
data = data.fillna({'mean_delay_origin': 0, 'mean_delay_dest': 0})
# routes without a lookup entry fall back to the (already filled) origin delay
route_delay = data['mean_delay_route']
data['mean_delay_route'] = route_delay.fillna(data['mean_delay_origin'])

In [33]:
# check that no missing values remain after filling
data.isna().sum()

year                       0
month                      0
day                        0
dayofweek                  0
mkt_unique_carrier         0
mkt_carrier_fl_num         0
origin                     0
dest                       0
dep_h                      0
arr_h                      0
crs_elapsed_time           0
distance                   0
mean_delay_carrier         0
mean_delay_origin          0
mean_delay_dest            0
mean_delay_month           0
mean_delay_dep_h           0
mean_delay_arr_h           0
mean_delay_day_of_week     0
mean_delay_route           0
mean_taxi_out_per_dep_h    0
mean_taxi_in_per_arr_h     0
num_flights_per_dep_h      0
dtype: int64

In [34]:
# check size of dataset as (rows, columns) after the merges and the fills
data.shape

(150623, 23)

In [35]:
# function to label encode desired columns
def label_encode(df, cols):
    """Replace each listed column of ``df`` with integer labels.

    A scikit-learn ``LabelEncoder`` is re-fit on every column in turn, using the
    values present in ``df`` itself, and the column is overwritten with the
    resulting codes. The frame is modified in place and also returned.

    NOTE(review): because the encoder is fit on the frame passed in, the codes
    depend on which values occur in that frame; confirm they line up with the
    encoding used when the model was trained.

    Parameters
    ----------
    df : DataFrame whose columns are overwritten.
    cols : iterable of column names to encode.

    Returns
    -------
    The same ``df`` object, with the listed columns encoded.
    """
    encoder = preprocessing.LabelEncoder()
    for column in cols:
        raw_values = df[column].values
        df[column] = encoder.fit_transform(raw_values)
    return df

In [36]:
# categorical columns that are turned into integer labels
cols_to_encode = ['mkt_unique_carrier', 'origin', 'dest']
data = label_encode(df=data, cols=cols_to_encode)
data.dtypes

year                         int64
month                        int64
day                          int64
dayofweek                    int64
mkt_unique_carrier           int64
mkt_carrier_fl_num           int64
origin                       int64
dest                         int64
dep_h                        int64
arr_h                        int64
crs_elapsed_time             int64
distance                     int64
mean_delay_carrier         float64
mean_delay_origin          float64
mean_delay_dest            float64
mean_delay_month           float64
mean_delay_dep_h           float64
mean_delay_arr_h           float64
mean_delay_day_of_week     float64
mean_delay_route           float64
mean_taxi_out_per_dep_h    float64
mean_taxi_in_per_arr_h     float64
num_flights_per_dep_h        int64
dtype: object

In [37]:
# standardize the whole feature matrix before prediction; the scaler is fit on
# this test matrix itself (no scaler saved from training is loaded here)
# NOTE(review): if the model was trained on data scaled with a training-set
# scaler, that scaler should be reused instead - confirm against the training notebook
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(data.values)
X.shape

(150623, 23)

### Make Predictions

In [38]:
# load in the saved model
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# model files that come from a trusted source.
# The `with` block closes the file handle once the model has been read.
with open('linreg.sav', 'rb') as model_file:
    reg = pickle.load(model_file)

In [39]:
# get final predictions: run the loaded model on the scaled feature matrix X
y_final_predictions = reg.predict(X)

In [40]:
# display the array returned by reg.predict
y_final_predictions

array([11.65176852,  7.74536376,  4.76937633, ...,  1.54949156,
        6.82896959, -1.80828247])

In [41]:
# re-read the untouched test file so the predictions can be attached to the
# original identifying columns
df_final = pd.read_csv('flights_test.csv', sep=',').drop(columns=['Unnamed: 0'])
df_final.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363
1,2020-01-01,WN,WN,WN,6276,WN,N467WN,6276,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1150,1320,N,90,1,363
2,2020-01-01,WN,WN,WN,4598,WN,N7885A,4598,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",2020,2130,N,70,1,333
3,2020-01-01,WN,WN,WN,4761,WN,N551WN,4761,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",1340,1455,N,75,1,333
4,2020-01-01,WN,WN,WN,5162,WN,N968WN,5162,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",915,1035,N,80,1,333


In [42]:
# keep only the columns that identify a flight in the submission
submission_id_columns = ['fl_date', 'mkt_unique_carrier', 'mkt_carrier_fl_num',
                         'origin', 'dest']
df_final = df_final.loc[:, submission_id_columns]

In [43]:
# attach the predictions positionally as a new column on a copy of the frame
df_final = df_final.assign(delay_predictions=y_final_predictions)

In [44]:
# preview the first rows of the submission frame, including delay_predictions
df_final.head()

Unnamed: 0,fl_date,mkt_unique_carrier,mkt_carrier_fl_num,origin,dest,delay_predictions
0,2020-01-01,WN,5888,ONT,SFO,11.651769
1,2020-01-01,WN,6276,ONT,SFO,7.745364
2,2020-01-01,WN,4598,ONT,SJC,4.769376
3,2020-01-01,WN,4761,ONT,SJC,1.56344
4,2020-01-01,WN,5162,ONT,SJC,-1.225607


In [45]:
# write the submission without the row index
df_final.to_csv(path_or_buf='final_submission.csv', index=False)