In [1]:
import pickle
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Load the extracted training data from disk.
train_csv_path = '../data/train_Jan1-7_scaled_2022-10-29_2127.csv'
df = pd.read_csv(train_csv_path)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,fl_date,mkt_carrier,mkt_carrier_fl_num,origin_airport_id,dest_airport_id,origin_region_Midwest,origin_region_Northeast,origin_region_South,origin_region_West,dest_region_Midwest,...,mean_seats_per_departure,mean_passengers_per_departure,mean_freight_per_departure,mean_mail_per_departure,mean_empty_seats_per_departure,mean_dep_delay_carrier_origin_month,mean_arr_delay_carrier_origin_month,mean_dep_delay_carrier_origin_date_t-1_year_month,mean_arr_delay_carrier_origin_date_t-1_year_month,arr_delay
0,2019-01-06,AA,324,13930,13198,1.0,0.0,0.0,0.0,1.0,...,-0.27999,-1.076595,-0.235296,-0.337765,1.258195,0.892927,1.178806,0.892927,1.178806,0.568429
1,2019-01-05,UA,467,11042,11292,1.0,0.0,0.0,0.0,0.0,...,0.639417,1.441133,-0.216891,2.048351,-1.36174,-0.42468,-0.617052,-0.42468,-0.617052,-0.521694
2,2019-01-05,DL,5790,14869,14831,0.0,0.0,0.0,1.0,0.0,...,-0.668184,-0.37124,-0.260367,-0.302468,-0.262767,-0.670868,-0.90944,-0.670868,-0.90944,0.487679
3,2019-01-04,AS,359,12478,14747,0.0,1.0,0.0,0.0,0.0,...,0.864161,0.800304,-0.263984,2.164388,-0.135828,0.379328,-0.58341,0.379328,-0.58341,-0.40057
4,2019-01-02,HA,518,12173,12758,0.0,0.0,0.0,1.0,0.0,...,-1.383278,-0.532762,0.053639,-0.341615,-0.894262,-1.027914,-0.031664,-1.027914,-0.031664,-0.23907


In [4]:
# Columns moved into the row index, which keeps them out of the column set used downstream.
columns_for_ID = [
    'fl_date',
    'mkt_carrier',
    'mkt_carrier_fl_num',
    'origin_airport_id',
    'dest_airport_id',
]

df = df.set_index(keys=columns_for_ID)

In [5]:
# Split the frame into the target column and everything else.
y = df['arr_delay']                   # labels
X = df.drop(columns=['arr_delay'])    # features

In [6]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Report the shape of each feature matrix (train first, then test).
for split_features in (X_train, X_test):
    print(split_features.shape)

(71806, 38)
(30775, 38)


# Random Forest

In [7]:
# Baseline model: a random forest with fixed hyperparameters, scored on both splits.
rf = RandomForestRegressor(
    n_estimators=300,
    max_features='sqrt',
    max_depth=5,
    random_state=18,
)
rf.fit(X_train, y_train)

# Metrics on the training split.
y_pred_train = rf.predict(X_train)
print('RMSE_train: ', np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('R2_train: ', r2_score(y_train, y_pred_train))
print('MAE_train: ', mean_absolute_error(y_train, y_pred_train))


# Metrics on the held-out split. The MAE line is labelled as a test metric here;
# it was previously printed under the training label although it scores y_test.
y_pred_test = rf.predict(X_test)
print('RMSE_test: ', np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('R2_test: ', r2_score(y_test, y_pred_test))
print('MAE_test: ', mean_absolute_error(y_test, y_pred_test))

RSME_train:  0.9528740994217348
R2_train:  0.08479341976917898
MAE_train:  0.6565219791707255
RSME_test:  0.9683424514739913
R2_test:  0.07918868199899187
MAE_train:  0.6632542943493162


#### Random Forest: Randomized Hyperparameter Search

In [8]:
# Candidate values for each hyperparameter explored by the randomized search.
param_dist = dict(
    n_estimators=[15, 100, 150, 300],
    max_features=['sqrt', 'log2'],
    max_depth=[5, 10, 12, 15],
    random_state=[18],
)

In [9]:
# Instantiate RandomizedSearchCV and fit it on the training split.
# No prediction is made in this cell; only the best cross-validated score is printed.
# random_state seeds the sampling of candidate configurations from param_dist, so the
# reported best score and best parameters are the same after a kernel restart.
random_rf = RandomizedSearchCV(
    RandomForestRegressor(),
    param_distributions=param_dist,
    random_state=42,
)
random_rf.fit(X_train, y_train)
print('Best Score: ')
print(random_rf.best_score_)


Best Score: 
0.16936028270492248


In [10]:
# Predict on the held-out split with the fitted search object.
y_pred = random_rf.predict(X_test)

# Tabulate the per-candidate cross-validation results under their own name, so the
# training frame `df` loaded at the top of the notebook is not overwritten.
cv_results_df = pd.DataFrame(random_rf.cv_results_)

In [11]:
# Show the hyperparameter combination selected by the search.
best_params = random_rf.best_params_
display(best_params)

# Optional: inspect the three top-ranked candidates.
# pd.DataFrame(random_rf.cv_results_).sort_values('rank_test_score', ascending = True).head(3)

{'random_state': 18,
 'n_estimators': 300,
 'max_features': 'sqrt',
 'max_depth': 15}

In [12]:
# Held-out metrics for the tuned model; y_pred holds random_rf's predictions on X_test.
# The first label is spelled RMSE (root mean squared error), matching what is computed.
print('RMSE_test: ', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2_test: ', r2_score(y_test, y_pred))
print('MAE_test:', mean_absolute_error(y_test, y_pred))

RSME_test:  0.916118622548084
R2_test:  0.17583127788343955
MAE_test: 0.6236741689196574


In [13]:
# Build a timestamped file name for the pickled model.
# Nothing is written to disk in this cell; the dump happens in the following cell.
# `datetime` is imported in the notebook's top import cell.
datetime_now = datetime.now().strftime('%Y-%m-%d_%H%M')
filename = f'Random_Forest_{datetime_now}.pickle'
print(filename)

Random_Forest_2022-10-29_2317.pickle


In [14]:
# Save the fitted search object to disk.
# The context manager closes the file handle even if pickling raises.
# NOTE: only unpickle this file from a trusted location; pickle.load can execute arbitrary code.
with open(filename, 'wb') as model_file:
    pickle.dump(random_rf, model_file)