In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%pwd

'/content'

In [3]:
%cd drive/MyDrive/'Lighthouse Labs'/'Data Science Boot Camp'/'Midterm Project'

/content/drive/MyDrive/Lighthouse Labs/Data Science Boot Camp/Midterm Project


In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from data_cleaning import clean_flights_df, clean_passengers_df, avg_passengers, clean_fuel_df, avg_carrier_arr_delay, avg_fuel_use, avg_taxi_time

### Import datasets

In [5]:
flights_df = clean_flights_df('data/flights_sample.csv')
flights_df.head()

Unnamed: 0,mkt_unique_carrier,op_unique_carrier,origin,dest,crs_dep_time,dep_time,dep_delay,taxi_out,taxi_in,crs_arr_time,arr_time,arr_delay,crs_elapsed_time,actual_elapsed_time,distance,year,month,day_of_month,day_of_week,dep_hour
0,UA,UA,SFO,LAX,630,627.0,-3.0,16.0,13.0,816,753.0,-23.0,106.0,86.0,337,2019,7,17,2,6
1,WN,WN,BWI,BOS,1340,1400.0,20.0,36.0,5.0,1510,1609.0,59.0,90.0,129.0,369,2018,6,9,5,13
2,B6,B6,BDL,MCO,600,551.0,-9.0,16.0,9.0,906,845.0,-21.0,186.0,174.0,1050,2019,11,6,2,6
3,WN,WN,LAS,OMA,950,958.0,8.0,16.0,2.0,1425,1417.0,-8.0,155.0,139.0,1099,2019,12,24,1,9
4,WN,WN,MCO,MDW,2045,2042.0,-3.0,11.0,5.0,2235,2231.0,-4.0,170.0,169.0,990,2018,1,14,6,20


In [6]:
flights_df = flights_df.sample(frac=.1, random_state=58)
flights_df.reset_index(inplace=True)
flights_df.shape

(468152, 21)

In [7]:
flights_df.head()

Unnamed: 0,index,mkt_unique_carrier,op_unique_carrier,origin,dest,crs_dep_time,dep_time,dep_delay,taxi_out,taxi_in,...,arr_time,arr_delay,crs_elapsed_time,actual_elapsed_time,distance,year,month,day_of_month,day_of_week,dep_hour
0,891643,WN,WN,SDF,MDW,2120,2126.0,6.0,6.0,3.0,...,2124.0,-6.0,70.0,58.0,271,2018,6,25,0,21
1,3506629,DL,OO,PDX,SEA,1314,1306.0,-8.0,38.0,18.0,...,1437.0,20.0,63.0,91.0,129,2018,7,30,0,13
2,4059215,UA,EV,EWR,PVD,2100,2054.0,-6.0,38.0,5.0,...,2211.0,1.0,70.0,77.0,160,2019,8,5,0,21
3,3284365,AA,AA,JFK,SAN,1930,1950.0,20.0,38.0,4.0,...,2312.0,18.0,384.0,382.0,2446,2018,11,19,0,19
4,1958211,AA,AA,BWI,CLT,700,653.0,-7.0,30.0,3.0,...,828.0,-19.0,107.0,95.0,361,2019,11,26,1,7


In [8]:
passengers_df = clean_passengers_df('data/passengers.csv')
fuel_df = clean_fuel_df('data/fuel_consumption.csv')

### Encode Categorical Variables used in analysis

In [9]:
cat_cols = ['origin', 'dest']

enc = OneHotEncoder(drop='first')
encoded = enc.fit_transform(flights_df[cat_cols])
encoded_df = pd.DataFrame.sparse.from_spmatrix(encoded)
print(encoded_df.shape)
encoded_df.head()

(468152, 747)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,737,738,739,740,741,742,743,744,745,746
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
flights_enc = pd.concat([flights_df, encoded_df], axis=1)
print(flights_enc.shape)

(468152, 768)


In [11]:
flights_enc.drop('index', axis=1, inplace=True)
flights_enc.head()

Unnamed: 0,mkt_unique_carrier,op_unique_carrier,origin,dest,crs_dep_time,dep_time,dep_delay,taxi_out,taxi_in,crs_arr_time,...,737,738,739,740,741,742,743,744,745,746
0,WN,WN,SDF,MDW,2120,2126.0,6.0,6.0,3.0,2130,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DL,OO,PDX,SEA,1314,1306.0,-8.0,38.0,18.0,1417,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,UA,EV,EWR,PVD,2100,2054.0,-6.0,38.0,5.0,2210,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AA,AA,JFK,SAN,1930,1950.0,20.0,38.0,4.0,2254,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,AA,AA,BWI,CLT,700,653.0,-7.0,30.0,3.0,847,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Train Test Split and Feature Engineering

In [12]:
X_train = flights_enc
y_train = flights_enc['arr_delay']

In [13]:
# add average passengers and avg fuel use columns

X_train = avg_passengers(X_train, passengers_df)
X_train = avg_fuel_use(X_train, fuel_df)

In [14]:
# create average taxi in and out columns.
taxi_dep_mean_dict = round(X_train.groupby(X_train['dep_time']//100)['taxi_out'].mean(), 2).to_dict()
taxi_arr_mean_dict = round(X_train.groupby(X_train['arr_time']//100)['taxi_in'].mean(), 2).to_dict()

X_train['arr_hour'] = X_train['arr_time'] // 100
X_train['taxi_dep_mean'] = X_train['dep_hour'].map(taxi_dep_mean_dict)
X_train['taxi_arr_mean'] = X_train['arr_hour'].map(taxi_arr_mean_dict)
X_train.drop('arr_hour', axis=1, inplace=True)

In [15]:
# create avg_carrier_delay column
avg_carrier_arr_delay_dict = round(X_train.groupby('op_unique_carrier')['arr_delay'].mean(), 2).to_dict()
X_train['avg_carrier_arr_delay'] = X_train['op_unique_carrier'].map(avg_carrier_arr_delay_dict)

In [16]:
X_train.columns.tolist()

['mkt_unique_carrier',
 'op_unique_carrier',
 'origin',
 'dest',
 'crs_dep_time',
 'dep_time',
 'dep_delay',
 'taxi_out',
 'taxi_in',
 'crs_arr_time',
 'arr_time',
 'arr_delay',
 'crs_elapsed_time',
 'actual_elapsed_time',
 'distance',
 'year',
 'month',
 'day_of_month',
 'day_of_week',
 'dep_hour',
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,

In [17]:
# drop columns from train data that won't be used in model
X_train.drop(columns=['mkt_unique_carrier','op_unique_carrier', 'dest', 'origin', 'dep_time', 'dep_delay', 'taxi_in', 'taxi_out', 'arr_time', 'arr_delay', 'actual_elapsed_time'], inplace=True)
X_train.shape

(468152, 762)

In [18]:
# check if engineered features created any null values
X_train.isna().sum().sum()

1

In [41]:
X_train[X_train.isna().any(axis=1)]

Unnamed: 0,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,year,month,day_of_month,day_of_week,dep_hour,0,...,743,744,745,746,monthly_avg_passengers,avg_monthly_fuel_gallons,avg_monthly_fuel_cost,taxi_dep_mean,taxi_arr_mean,avg_carrier_arr_delay


In [20]:
X_train.dropna(inplace=True)


In [21]:
y_train.drop(index=287991, inplace=True)

In [None]:
print(X_train.shape, y_train.shape)

(140446, 784) (140446,)
(327705, 784) (327705,)


In [21]:
# change all column names to strings
X_train.columns = X_train.columns.astype(str)

### Scale Numeric Features

In [22]:
X_train.reset_index(inplace=True, drop=True)
X_train.head()

Unnamed: 0,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,year,month,day_of_month,day_of_week,dep_hour,0,...,743,744,745,746,monthly_avg_passengers,avg_monthly_fuel_gallons,avg_monthly_fuel_cost,taxi_dep_mean,taxi_arr_mean,avg_carrier_arr_delay
0,2120,2130,70.0,271,2018,6,25,0,21,0.0,...,0.0,0.0,0.0,0.0,6287.0,178747802.0,342753536.0,17.7,7.94,3.52
1,1314,1417,63.0,129,2018,7,30,0,13,0.0,...,0.0,0.0,0.0,0.0,4280.0,338521162.0,631140692.0,16.63,7.37,7.26
2,2100,2210,70.0,160,2019,8,5,0,21,0.0,...,0.0,0.0,0.0,0.0,815.0,311317638.0,539868102.0,17.7,7.33,11.08
3,1930,2254,384.0,2446,2018,11,19,0,19,0.0,...,0.0,0.0,0.0,0.0,4526.0,272453340.0,467871368.0,17.55,6.86,6.01
4,700,847,107.0,361,2019,11,26,1,7,0.0,...,0.0,0.0,0.0,0.0,4486.0,272453340.0,467871368.0,17.73,8.49,6.01


In [23]:
num_cols = ['crs_dep_time', 'crs_arr_time', 'crs_elapsed_time', 'distance', 'year', 'month', 'day_of_month', 'day_of_week', 'dep_hour', 'monthly_avg_passengers','avg_monthly_fuel_gallons', 'avg_monthly_fuel_cost', 'taxi_dep_mean', 'taxi_arr_mean', 'avg_carrier_arr_delay']
X_train_num = X_train[num_cols]

In [24]:
scaler = MinMaxScaler()
X_train_num_scaled = pd.DataFrame(scaler.fit_transform(X_train_num))

In [25]:
# replace unscaled num columns with scaled ones
X_train_scaled = pd.concat([X_train.drop(columns=num_cols), X_train_num_scaled], axis=1)
X_train_scaled.shape

(468151, 762)

In [26]:
X_train_scaled.isna().sum().sum()

0

## Model Training

## ElasticNet Model

In [34]:
X_train_scaled.columns=X_train_scaled.columns.astype(str)

In [35]:
from sklearn.linear_model import ElasticNet

elnet = ElasticNet(alpha=.001, l1_ratio=0)

elnet.fit(X_train_scaled, y_train)

  model = cd_fast.enet_coordinate_descent(


In [38]:
import pickle
filename = 'Ben_ENmodelFinal.sav'
pickle.dump(elnet, open(filename, 'wb'))

## Import and prep test data

In [28]:
from data_cleaning import import_flights_test
flights_test = import_flights_test('data/flights_test.csv')
flights_test.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,2020-01-01 00:00:00,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363
1,2020-01-01 00:00:00,WN,WN,WN,6276,WN,N467WN,6276,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1150,1320,N,90,1,363
2,2020-01-01 00:00:00,WN,WN,WN,4598,WN,N7885A,4598,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",2020,2130,N,70,1,333
3,2020-01-01 00:00:00,WN,WN,WN,4761,WN,N551WN,4761,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",1340,1455,N,75,1,333
4,2020-01-01 00:00:00,WN,WN,WN,5162,WN,N968WN,5162,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",915,1035,N,80,1,333


In [29]:
flights_test['fl_date'] = pd.to_datetime(flights_test['fl_date'], errors='coerce')
flights_test = flights_test[flights_test['fl_date'] <= '2020-1-7']

flights_test['year'] = flights_test['fl_date'].dt.year
flights_test['month'] = flights_test['fl_date'].dt.month
flights_test['day_of_month'] = flights_test['fl_date'].dt.day
flights_test['day_of_week'] = flights_test['fl_date'].dt.dayofweek
flights_test['dep_hour'] = flights_test['crs_dep_time'] // 100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flights_test['year'] = flights_test['fl_date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flights_test['month'] = flights_test['fl_date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flights_test['day_of_month'] = flights_test['fl_date'].dt.day
A value is trying to be set on 

In [30]:
flights_test.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,crs_arr_time,dup,crs_elapsed_time,flights,distance,year,month,day_of_month,day_of_week,dep_hour
0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,...,1945,N,95,1,363,2020,1,1,2,18
1,2020-01-01,WN,WN,WN,6276,WN,N467WN,6276,13891,ONT,...,1320,N,90,1,363,2020,1,1,2,11
2,2020-01-01,WN,WN,WN,4598,WN,N7885A,4598,13891,ONT,...,2130,N,70,1,333,2020,1,1,2,20
3,2020-01-01,WN,WN,WN,4761,WN,N551WN,4761,13891,ONT,...,1455,N,75,1,333,2020,1,1,2,13
4,2020-01-01,WN,WN,WN,5162,WN,N968WN,5162,13891,ONT,...,1035,N,80,1,333,2020,1,1,2,9


In [31]:
flights_test.shape

(150623, 25)

In [32]:
flights_test = avg_passengers(flights_test, passengers_df)
flights_test = avg_fuel_use(flights_test, fuel_df)

In [33]:
flights_test['arr_hour'] = flights_test['crs_arr_time'] // 100
flights_test['taxi_dep_mean'] = flights_test['dep_hour'].map(taxi_dep_mean_dict)
flights_test['taxi_arr_mean'] = flights_test['arr_hour'].map(taxi_arr_mean_dict)
flights_test.drop('arr_hour', axis=1, inplace=True)

In [34]:
flights_test['avg_carrier_arr_delay'] = flights_test['op_unique_carrier'].map(avg_carrier_arr_delay_dict)

In [35]:
flights_test.drop(columns=['fl_date','mkt_unique_carrier','op_unique_carrier','mkt_carrier','dup','branded_code_share','mkt_carrier_fl_num','tail_num','op_carrier_fl_num', 'origin_airport_id','origin_city_name','dest_airport_id','dest_city_name', 'flights'], inplace=True)

In [36]:
test_encoded = enc.transform(flights_test[cat_cols])
test_encoded = pd.DataFrame.sparse.from_spmatrix(test_encoded)
test_encoded.shape

(150623, 747)

In [37]:
test_scaled = pd.DataFrame(scaler.transform(flights_test[num_cols]))
test_scaled.shape

(150623, 15)

In [38]:
test_prepped = pd.concat([test_encoded, test_scaled], axis=1)
test_prepped.shape

(150623, 762)

In [39]:
test_prepped.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5.1,6.1,7.1,8.1,9.1,10,11,12,13,14
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.333333,0.782609,0.075876,0.449356,0.388468,0.85446,0.98913,0.337667
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.333333,0.478261,0.075876,0.449356,0.388468,0.674491,0.36413,0.337667
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.333333,0.869565,0.115494,0.449356,0.388468,0.805947,0.701087,0.337667
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.333333,0.565217,0.115494,0.449356,0.388468,0.574335,0.391304,0.337667
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.333333,0.391304,0.115494,0.449356,0.388468,0.865415,0.652174,0.337667


In [40]:
test_prepped.isna().sum().sum()

1521

In [46]:
test_prepped[test_prepped.isna().any(axis=1)]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5.1,6.1,7.1,8.1,9.1,10,11,12,13,14
1076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.333333,0.217391,,0.781798,0.588194,0.471049,0.673913,0.448349
1171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.333333,0.478261,,0.781798,0.588194,0.674491,0.364130,0.448349
1172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.333333,0.608696,,0.781798,0.588194,0.610329,0.989130,0.448349
1861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.333333,0.260870,,0.781798,0.588194,0.610329,1.000000,0.463457
1866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.333333,0.913043,,0.781798,0.588194,0.741784,0.114130,0.463457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.166667,0.217391,,0.740549,0.574159,0.471049,0.673913,0.469079
150036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.166667,0.434783,,0.740549,0.574159,0.755869,0.375000,0.469079
150061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.166667,0.434783,,0.740549,0.574159,0.755869,0.347826,0.469079
150183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.166667,0.695652,,0.740549,0.574159,0.679186,0.701087,0.469079


In [44]:
null_index = test_prepped[test_prepped.isna().any(axis=1)].index.tolist()

In [47]:
test_prepped.dropna(inplace=True)

In [48]:
import pickle
filename = 'Ben_ENmodelFinal.sav'
elnet = pickle.load(open(filename, 'rb'))

In [49]:
y_pred = elnet.predict(test_prepped)



In [50]:
y_pred

array([9.52573345, 9.44227267, 7.83334038, ..., 2.57999616, 8.47246931,
       1.92023927])

In [52]:
predictions = import_flights_test('data/flights_test.csv')
predictions['fl_date'] = pd.to_datetime(predictions['fl_date'], errors='coerce')
predictions = predictions[predictions['fl_date'] <= '2020-1-7']

In [54]:
predictions.drop(index=null_index, inplace=True)

In [56]:
predictions = predictions[['fl_date', 'mkt_carrier', 'mkt_carrier_fl_num', 'origin', 'dest']]

In [58]:
predictions['predicted_delay'] = y_pred

In [59]:
predictions.head()

Unnamed: 0,fl_date,mkt_carrier,mkt_carrier_fl_num,origin,dest,predicted_delay
0,2020-01-01,WN,5888,ONT,SFO,9.525733
1,2020-01-01,WN,6276,ONT,SFO,9.442273
2,2020-01-01,WN,4598,ONT,SJC,7.83334
3,2020-01-01,WN,4761,ONT,SJC,3.671758
4,2020-01-01,WN,5162,ONT,SJC,-2.268834


In [60]:
predictions.to_csv('predictions.csv', index=False)