# Experiments

In [1]:
import pandas as pd
import numpy as np
from modules.functions import run_linear_regression
from modules.functions import run_polynomial_regression
from modules.functions import run_random_forest
from modules.functions import to_hours
from modules.functions import scale_data

In [2]:
# Lift the cap on displayed columns so wide frames are shown in full.
pd.options.display.max_columns = None

In [3]:
# Load the three CSV extracts with explicit dtypes.
# The data directory is named once so it only has to be changed in one place.
DATA_DIR = "../LHL_Midterm_Project_Predicting_Flight_Delays/data"

# Identifier-like columns are read as strings so that codes and flight numbers
# are not turned into numbers.
STRING_COLUMNS = [
    'branded_code_share',
    'mkt_carrier',
    'mkt_carrier_fl_num',
    'op_unique_carrier',
    'tail_num',
    'op_carrier_fl_num',
    'origin_airport_id',
    'origin',
    'origin_city_name',
    'dest_airport_id',
    'dest',
    'dest_city_name',
]
# Clock-time, delay, duration and distance columns are read as 64-bit integers.
INT_COLUMNS = [
    'crs_dep_time',
    'dep_time',
    'dep_delay',
    'taxi_out',
    'wheels_off',
    'wheels_on',
    'taxi_in',
    'crs_arr_time',
    'arr_time',
    'arr_delay',
    'crs_elapsed_time',
    'actual_elapsed_time',
    'air_time',
    'distance',
]
# Same column -> dtype mapping (and key order) as spelling out every entry by hand.
dtype_dict = {
    **dict.fromkeys(STRING_COLUMNS, 'string'),
    **dict.fromkeys(INT_COLUMNS, 'int64'),
}

# parse_dates=[0] parses the first CSV column as dates.
flights = pd.read_csv(f"{DATA_DIR}/ransmpl_clean.csv", parse_dates=[0], dtype=dtype_dict)
df_no_outliers = pd.read_csv(f"{DATA_DIR}/ransmpl_clean_no_outliers.csv", parse_dates=[0], dtype=dtype_dict)
flights_test = pd.read_csv(f"{DATA_DIR}/flights_test.csv", parse_dates=[0], dtype=dtype_dict)

In [4]:
# Preview the first five rows of the training sample.
flights.head(n=5)

Unnamed: 0,fl_date,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,crs_elapsed_time,actual_elapsed_time,air_time,distance
0,2018-03-07,AA,AA,465,AA,N200UU,465,14107,PHX,"Phoenix, AZ",14679,SAN,"San Diego, CA",835,833,-2,13,846,838,2,851,840,-11,76,67,52,304
1,2018-03-07,AA,AA,591,AA,N833AW,591,11057,CLT,"Charlotte, NC",11278,DCA,"Washington, DC",1431,1537,66,16,1553,1648,3,1559,1651,52,88,74,55,331
2,2018-03-07,AA,AA,600,AA,N151UW,600,11697,FLL,"Fort Lauderdale, FL",11057,CLT,"Charlotte, NC",603,557,-6,18,615,746,19,809,805,-4,126,128,91,632
3,2018-03-07,AA,AA,1805,AA,N924US,1805,11057,CLT,"Charlotte, NC",10721,BOS,"Boston, MA",1135,1129,-6,11,1140,1312,12,1352,1324,-28,137,115,92,728
4,2018-03-07,AA,AA,2615,AA,N945NN,2615,11057,CLT,"Charlotte, NC",15370,TUL,"Tulsa, OK",1820,1812,-8,11,1823,1936,6,2002,1942,-20,162,150,133,842


# Feature Engineering

In [7]:
# Start the feature table from an independent copy of the columns that identify a flight.
id_columns = ["fl_date", "mkt_carrier", "mkt_carrier_fl_num", "origin", "dest"]
feats = flights.loc[:, id_columns].copy()

In [8]:
# Constant 1 per row; summing it in a groupby later yields a row count.
feats["count"] = 1

feats["op_unique_carrier"] = flights["op_unique_carrier"].copy()

# Calendar parts of the flight date, stored as object columns.
fl_date_parts = feats["fl_date"].dt
for new_col, dt_attr in (
    ("year", "year"),
    ("month", "month"),
    ("day", "day"),
    ("day_of_week", "weekday"),
):
    feats[new_col] = getattr(fl_date_parts, dt_attr).astype("object")

for copied_col in ("air_time", "distance"):
    feats[copied_col] = flights[copied_col].copy()

# Convert the crs_* clock times with the project helper to_hours, round, and
# replace a resulting 24 with 0.
# NOTE(review): the SettingWithCopyWarning recorded under this cell presumably
# originates inside to_hours — confirm.
for time_col, hour_col in (
    ("crs_dep_time", "crs_dep_hour"),
    ("crs_arr_time", "crs_arr_hour"),
):
    feats[hour_col] = to_hours(flights[time_col]).round().replace(24, 0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


**Feature: average arrival delay per operating carrier**

In [9]:
# Mean arrival delay for each operating carrier, one row per carrier.
# NOTE(review): the mean uses every row of `flights`; if the run_* helpers split
# train/test internally, this feature has seen the test targets — confirm.
avg_delay_df = (
    flights.groupby("op_unique_carrier")[["arr_delay"]]
    .mean()
    .reset_index()
)
# carrier code -> mean delay rounded to two decimals
avg_delay_map = {
    carrier: round(mean_delay, 2)
    for carrier, mean_delay in zip(
        avg_delay_df["op_unique_carrier"], avg_delay_df["arr_delay"]
    )
}
# Look up each flight's carrier in the mapping.
feats["avg_delay_for_carrier"] = flights["op_unique_carrier"].map(avg_delay_map)

**Feature: number of flights departing in each hour of the day**

In [10]:
# Departure traffic: number of rows in feats for each crs_dep_hour value
# (every row has count == 1, so the sum is a row count).
n_flights_dep = (
    feats[["crs_dep_hour", "count"]]
    .groupby("crs_dep_hour")
    .sum()
    .reset_index()
)
# The frame has exactly two columns, so each row is an (hour, flight count) pair.
n_flights_dep_map = dict(n_flights_dep.itertuples(index=False, name=None))
feats["dep_traffic"] = feats["crs_dep_hour"].map(n_flights_dep_map)

**Feature: number of flights arriving in each hour of the day**

In [11]:
# Arrival traffic: number of rows in feats for each crs_arr_hour value
# (every row has count == 1, so the sum is a row count).
n_flights_arr = (
    feats[["crs_arr_hour", "count"]]
    .groupby("crs_arr_hour")
    .sum()
    .reset_index()
)
# The frame has exactly two columns, so each row is an (hour, flight count) pair.
n_flights_arr_map = dict(n_flights_arr.itertuples(index=False, name=None))
feats["arr_traffic"] = feats["crs_arr_hour"].map(n_flights_arr_map)

In [12]:
# Mean arrival delay per destination city, displayed for inspection.
avg_delay_by_city = flights.groupby("dest_city_name")[["arr_delay"]].mean()
avg_delay_by_city

Unnamed: 0_level_0,arr_delay
dest_city_name,Unnamed: 1_level_1
"Aberdeen, SD",19.111111
"Abilene, TX",6.156250
"Adak Island, AK",-6.200000
"Aguadilla, PR",10.313725
"Akron, OH",11.596059
...,...
"Worcester, MA",3.393939
"Wrangell, AK",-4.515152
"Yakima, WA",1.086957
"Yakutat, AK",-8.121212


In [13]:
# Check the engineered columns on the first five rows.
feats.head(n=5)

Unnamed: 0,fl_date,mkt_carrier,mkt_carrier_fl_num,origin,dest,count,op_unique_carrier,year,month,day,day_of_week,air_time,distance,crs_dep_hour,crs_arr_hour,avg_delay_for_carrier,dep_traffic,arr_traffic
0,2018-03-07,AA,465,PHX,SAN,1,AA,2018,3,7,2,52,304,9.0,9.0,6.07,22293,23025
1,2018-03-07,AA,591,CLT,DCA,1,AA,2018,3,7,2,55,331,15.0,16.0,6.07,23703,23854
2,2018-03-07,AA,600,FLL,CLT,1,AA,2018,3,7,2,91,632,6.0,8.0,6.07,26483,16692
3,2018-03-07,AA,1805,CLT,BOS,1,AA,2018,3,7,2,92,728,12.0,14.0,6.07,24118,23869
4,2018-03-07,AA,2615,CLT,TUL,1,AA,2018,3,7,2,133,842,18.0,20.0,6.07,24517,21411


In [19]:
# Two-row preview of the feature table.
feats.head(n=2)

Unnamed: 0,fl_date,mkt_carrier,mkt_carrier_fl_num,origin,dest,count,op_unique_carrier,year,month,day,day_of_week,air_time,distance,crs_dep_hour,crs_arr_hour,avg_delay_for_carrier,dep_traffic,arr_traffic
0,2018-03-07,AA,465,PHX,SAN,1,AA,2018,3,7,2,52,304,9.0,9.0,6.07,22293,23025
1,2018-03-07,AA,591,CLT,DCA,1,AA,2018,3,7,2,55,331,15.0,16.0,6.07,23703,23854


# Select Features

In [35]:
# Targets: arrival delay from the full sample and from the outlier-free sample.
y = flights["arr_delay"]
y2 = df_no_outliers["arr_delay"]

# Candidate feature sets; the model cells below train on X5.
X1 = flights[["air_time", "distance"]]
# FIXME: X2 was assigned from `df_num`, which no visible cell of this notebook
# defines. On a fresh kernel the line raised NameError and stopped X3-X5 (and every
# model cell) from being created. It stays disabled until `df_num` is defined above.
# X2 = df_num
X3 = df_no_outliers[["air_time", "distance"]]
X4 = feats[["crs_dep_hour", "crs_arr_hour", "avg_delay_for_carrier", "dep_traffic", "arr_traffic"]]
X5 = feats[["crs_dep_hour", "crs_arr_hour", "avg_delay_for_carrier", "dep_traffic", "arr_traffic", "distance"]] # Current Best

# Train Models

In [36]:
# Linear regression on the X5 feature set; the fitted model `lr` is reused for
# the flights_test predictions further down.
lr_result = run_linear_regression(X5, y)
lr, y_pred, r2_train, r2_test = lr_result
print(r2_train, r2_test, sep=" ")

0.01491187340708866 0.014349730993219234


In [48]:
# Polynomial regression on X5 with degree 2 only; the helper prints its own results.
degree = 2
pr, y_pred, r2_train, r2_test = run_polynomial_regression(X5, y, degree)

---Results---
degree = 2
Train score = 0.015503838831526728
Test score = 0.015516872092381395


In [50]:
# Random forest on X5 (200 trees, depth capped at 5).
# NOTE(review): no seed is passed here; whether run_random_forest fixes one
# internally cannot be seen from this notebook — confirm for reproducibility.
rf_result = run_random_forest(X5, y, n_estimators=200, max_depth=5)
rf, y_pred, r2_train, r2_test = rf_result
print(r2_train, r2_test, sep=" ")

train score = 0.023678730663871272
test score = 0.015101910560902043
0.023678730663871272 0.015101910560902043


# Flights_test Dataframe

In [39]:
# Preview the first five rows of the test sample.
flights_test.head(n=5)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance,count,year,month,day,day_of_week,crs_dep_hour,crs_arr_hour,avg_delay_for_carrier,dep_traffic,arr_traffic
0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363,1,2020,1,1,2,18.0,20.0,3.53,24517,21411
1,2020-01-01,WN,WN,WN,6276,WN,N467WN,6276,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1150,1320,N,90,1,363,1,2020,1,1,2,11.0,13.0,3.53,23686,22794
2,2020-01-01,WN,WN,WN,4598,WN,N7885A,4598,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",2020,2130,N,70,1,333,1,2020,1,1,2,20.0,21.0,3.53,20979,23548
3,2020-01-01,WN,WN,WN,4761,WN,N551WN,4761,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",1340,1455,N,75,1,333,1,2020,1,1,2,13.0,15.0,3.53,22409,22471
4,2020-01-01,WN,WN,WN,5162,WN,N968WN,5162,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",915,1035,N,80,1,333,1,2020,1,1,2,9.0,11.0,3.53,22293,24820


In [40]:
# Rebuild the same date and hour columns on the test sample as were built for feats.
flights_test["count"] = 1

# Calendar parts of the flight date, stored as object columns.
test_date_parts = flights_test["fl_date"].dt
for new_col, dt_attr in (
    ("year", "year"),
    ("month", "month"),
    ("day", "day"),
    ("day_of_week", "weekday"),
):
    flights_test[new_col] = getattr(test_date_parts, dt_attr).astype("object")

# Convert the crs_* clock times with the project helper to_hours, round, and
# replace a resulting 24 with 0.
for time_col, hour_col in (
    ("crs_dep_time", "crs_dep_hour"),
    ("crs_arr_time", "crs_arr_hour"),
):
    flights_test[hour_col] = to_hours(flights_test[time_col]).round().replace(24, 0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [41]:
# Mean arrival delay per operating carrier, computed from the training sample
# `flights` and applied to the test sample.
avg_delay_df = flights[["op_unique_carrier", "arr_delay"]].groupby("op_unique_carrier").mean().reset_index()
# carrier code -> mean delay rounded to two decimals
avg_delay_map = {
    carrier: round(mean_delay, 2)
    for carrier, mean_delay in zip(
        avg_delay_df["op_unique_carrier"], avg_delay_df["arr_delay"]
    )
}

# Series.map leaves NaN for any carrier missing from the mapping. A NaN feature would
# presumably make lr.predict fail (scikit-learn estimators reject NaN input), so
# carriers unseen in `flights` fall back to the overall mean arrival delay.
overall_avg_delay = round(flights["arr_delay"].mean(), 2)

# adding the feature to flights_test dataframe
flights_test["avg_delay_for_carrier"] = (
    flights_test["op_unique_carrier"].map(avg_delay_map).fillna(overall_avg_delay)
)

In [42]:
# Departure traffic per hour, counted on the training features `feats`
# (every row has count == 1, so the sum is a row count).
n_flights_dep = (
    feats[["crs_dep_hour", "count"]]
    .groupby("crs_dep_hour")
    .sum()
    .reset_index()
)
# The frame has exactly two columns, so each row is an (hour, flight count) pair.
n_flights_dep_map = dict(n_flights_dep.itertuples(index=False, name=None))

# Attach the training-sample counts to the test rows by departure hour.
flights_test["dep_traffic"] = flights_test["crs_dep_hour"].map(n_flights_dep_map)

In [43]:
# Arrival traffic per hour, counted on the training features `feats`
# (every row has count == 1, so the sum is a row count).
n_flights_arr = (
    feats[["crs_arr_hour", "count"]]
    .groupby("crs_arr_hour")
    .sum()
    .reset_index()
)
# The frame has exactly two columns, so each row is an (hour, flight count) pair.
n_flights_arr_map = dict(n_flights_arr.itertuples(index=False, name=None))

# Attach the training-sample counts to the test rows by arrival hour.
flights_test["arr_traffic"] = flights_test["crs_arr_hour"].map(n_flights_arr_map)

In [45]:
# Feature matrix for prediction: the same six columns, in the same order, as X5.
model_features = [
    "crs_dep_hour",
    "crs_arr_hour",
    "avg_delay_for_carrier",
    "dep_traffic",
    "arr_traffic",
    "distance",
]
X = flights_test.loc[:, model_features]

In [54]:
# Predict arr_delay for every row of flights_test with `lr`, the model returned by
# run_linear_regression(X5, y) earlier in the notebook. X holds the same six feature
# columns, in the same order, as X5.
lr_pred = lr.predict(X)

In [55]:
# Display the predicted delays as a sanity check.
lr_pred

array([ 8.68385563,  1.39683385, 10.6800499 , ...,  7.52594935,
        2.24007453, 11.57365562])

In [61]:
# Submission table: the flight-identifying columns plus the model's predicted delay.
# Selecting with a list yields a new frame and assign() returns another one, so
# flights_test itself is left unchanged.
submission = flights_test[
    ["fl_date", "mkt_carrier", "mkt_carrier_fl_num", "origin", "dest"]
].assign(predicted_delay=lr_pred)

In [62]:
# Preview the first five rows of the submission table.
submission.head(n=5)

Unnamed: 0,fl_date,mkt_carrier,mkt_carrier_fl_num,origin,dest,predicted_delay
0,2020-01-01,WN,5888,ONT,SFO,8.683856
1,2020-01-01,WN,6276,ONT,SFO,1.396834
2,2020-01-01,WN,4598,ONT,SJC,10.68005
3,2020-01-01,WN,4761,ONT,SJC,3.554747
4,2020-01-01,WN,5162,ONT,SJC,-0.603117


In [65]:
# Write the submission next to the notebook, without the row index.
submission.to_csv(path_or_buf="submission_example.csv", index=False)