#### Part of Silvia's Function

In [None]:
# Evaluation metrics for one fitted model, reported side by side for the
# test split and the training split. This is a fragment of a method: it reads
# `self.y_test`, `self.y_train`, `self.model_name`, and expects `y_pred` /
# `y_pred_train` to already hold the predictions for the two splits.

# Metrics for test data
# `squared=False` makes mean_squared_error return the root of the MSE (RMSE).
# NOTE(review): recent scikit-learn releases deprecate the `squared` keyword;
# confirm against the version this project pins.
rmse = mean_squared_error(self.y_test, y_pred, squared=False)
mean_abs_error = mean_absolute_error(self.y_test, y_pred)
r2 = r2_score(self.y_test, y_pred)

# Metrics for training data
# Must use the same `squared=False` as the test metric above; otherwise this
# value is an MSE and the "Difference" column compares RMSE against MSE.
rmse_train = mean_squared_error(self.y_train, y_pred_train, squared=False)
mean_abs_error_train = mean_absolute_error(self.y_train, y_pred_train)
r2_train = r2_score(self.y_train, y_pred_train)

# Tab-aligned report: one row per metric, columns = test, train, test - train.
print(f'\n{self.model_name} evaluation metrics: \n\tTest data\tTraining data\t\tDifference')
print(f'RMSE: \t\t{rmse:.2f}\t\t{rmse_train:.2f}\t\t{(rmse - rmse_train):.2f}')
print(f'MAE: \t\t{mean_abs_error:.2f}\t\t{mean_abs_error_train:.2f}\t\t{(mean_abs_error - mean_abs_error_train):.2f}')
# The training column shows r2_train (it previously repeated the test value).
print(f'R^2: \t\t{r2:.2f}\t\t{r2_train:.2f}\t\t{(r2 - r2_train):.2f}')

### Linear, Lasso, Ridge modelling

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import pickle

In [12]:
# Load the scaled training extract (a CSV under ../data) into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../data/train_Jan1-7_scaled_2022-10-29_2127.csv',
)

In [13]:
# Preview the first five rows of the loaded frame (rich display as cell output).
df.head(n=5)

Unnamed: 0,fl_date,mkt_carrier,mkt_carrier_fl_num,origin_airport_id,dest_airport_id,origin_region_Midwest,origin_region_Northeast,origin_region_South,origin_region_West,dest_region_Midwest,...,mean_seats_per_departure,mean_passengers_per_departure,mean_freight_per_departure,mean_mail_per_departure,mean_empty_seats_per_departure,mean_dep_delay_carrier_origin_month,mean_arr_delay_carrier_origin_month,mean_dep_delay_carrier_origin_date_t-1_year_month,mean_arr_delay_carrier_origin_date_t-1_year_month,arr_delay
0,2019-01-06,AA,324,13930,13198,1.0,0.0,0.0,0.0,1.0,...,-0.27999,-1.076595,-0.235296,-0.337765,1.258195,0.892927,1.178806,0.892927,1.178806,0.568429
1,2019-01-05,UA,467,11042,11292,1.0,0.0,0.0,0.0,0.0,...,0.639417,1.441133,-0.216891,2.048351,-1.36174,-0.42468,-0.617052,-0.42468,-0.617052,-0.521694
2,2019-01-05,DL,5790,14869,14831,0.0,0.0,0.0,1.0,0.0,...,-0.668184,-0.37124,-0.260367,-0.302468,-0.262767,-0.670868,-0.90944,-0.670868,-0.90944,0.487679
3,2019-01-04,AS,359,12478,14747,0.0,1.0,0.0,0.0,0.0,...,0.864161,0.800304,-0.263984,2.164388,-0.135828,0.379328,-0.58341,0.379328,-0.58341,-0.40057
4,2019-01-02,HA,518,12173,12758,0.0,0.0,0.0,1.0,0.0,...,-1.383278,-0.532762,0.053639,-0.341615,-0.894262,-1.027914,-0.031664,-1.027914,-0.031664,-0.23907


In [14]:
# Columns used as the row identifier. Moving them into the index takes them
# out of the frame's column set.
# NOTE: `df` is rebound here, so re-running this cell on the already-indexed
# frame raises KeyError (the columns no longer exist as columns).
columns_for_ID = [
    'fl_date',
    'mkt_carrier',
    'mkt_carrier_fl_num',
    'origin_airport_id',
    'dest_airport_id',
]

df = df.set_index(keys=columns_for_ID)

In [15]:
# Target (labels): the 'arr_delay' column.
y = df['arr_delay']
# Features: every remaining column of the frame.
X = df.drop(columns='arr_delay')

In [16]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# Show the (rows, columns) shape of each feature split, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(71806, 38)
(30775, 38)


#### 1. Linear Regression

In [17]:
# Linear Regression: fit on the training split, then print RMSE, R^2 and MAE
# for the training predictions followed by the test predictions.
# NOTE(review): the printed labels read "RSME"; the value computed is the RMSE
# (square root of the mean squared error). Labels are left unchanged here.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Predict both splits up front; the reports below only read these arrays.
y_pred_train = lin_reg.predict(X_train)
y_pred_test = lin_reg.predict(X_test)

# Training-split metrics
print('RSME_train: ', np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred_train)))
print('R2_train: ', r2_score(y_true=y_train, y_pred=y_pred_train))
print('Mean Absolute Error:', mean_absolute_error(y_true=y_train, y_pred=y_pred_train))

# Test-split metrics
print('RSME_test: ', np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_test)))
print('R2_test: ', r2_score(y_true=y_test, y_pred=y_pred_test))
print('Mean Absolute Error:', mean_absolute_error(y_true=y_test, y_pred=y_pred_test))

RSME_train:  0.9563760299027517
R2_train:  0.078054063144406
Mean Absolute Error: 0.6643863466286617
RSME_test:  0.970635111224479
R2_test:  0.07482327142520839
Mean Absolute Error: 0.6710741516720328


#### 2. Lasso Regression

In [18]:
# Lasso Regression via LassoCV with library-default settings: fit on the
# training split, then print RMSE, R^2 and MAE for the training predictions
# followed by the test predictions.
# NOTE(review): the printed labels read "RSME"; the value computed is the RMSE
# (square root of the mean squared error). Labels are left unchanged here.
lasso_reg = LassoCV()
lasso_reg.fit(X_train, y_train)

# Predict both splits up front; the reports below only read these arrays.
y_pred_train = lasso_reg.predict(X_train)
y_pred_test = lasso_reg.predict(X_test)

# Training-split metrics
print('RSME_train: ', np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred_train)))
print('R2_train: ', r2_score(y_true=y_train, y_pred=y_pred_train))
print('Mean Absolute Error:', mean_absolute_error(y_true=y_train, y_pred=y_pred_train))

# Test-split metrics
print('RSME_test: ', np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_test)))
print('R2_test: ', r2_score(y_true=y_test, y_pred=y_pred_test))
print('Mean Absolute Error:', mean_absolute_error(y_true=y_test, y_pred=y_pred_test))

RSME_train:  0.9548492292544603
R2_train:  0.08099538331729983
Mean Absolute Error: 0.6627454187142977
RSME_test:  0.9686732465210737
R2_test:  0.07855945862881852
Mean Absolute Error: 0.66884915986654


#### 3. Ridge Regression

In [19]:
# Ridge Regression via RidgeCV with library-default settings: fit on the
# training split, then print RMSE, R^2 and MAE for the training predictions
# followed by the test predictions.
# NOTE(review): the printed labels read "RSME"; the value computed is the RMSE
# (square root of the mean squared error). Labels are left unchanged here.
ridge_reg = RidgeCV()
ridge_reg.fit(X_train, y_train)

# Predict both splits up front; the reports below only read these arrays.
y_pred_train = ridge_reg.predict(X_train)
y_pred_test = ridge_reg.predict(X_test)

# Training-split metrics
print('RSME_train: ', np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred_train)))
print('R2_train: ', r2_score(y_true=y_train, y_pred=y_pred_train))
print('Mean Absolute Error:', mean_absolute_error(y_true=y_train, y_pred=y_pred_train))

# Test-split metrics
print('RSME_test: ', np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_test)))
print('R2_test: ', r2_score(y_true=y_test, y_pred=y_pred_test))
print('Mean Absolute Error:', mean_absolute_error(y_true=y_test, y_pred=y_pred_test))

RSME_train:  0.9548383944645747
R2_train:  0.08101623931241952
Mean Absolute Error: 0.6627874496330955
RSME_test:  0.96863342035984
R2_test:  0.07863522553021252
Mean Absolute Error: 0.6688753627945995
