In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
from xgboost import XGBRegressor

In [2]:
# Load the cleaned flight data and discard the "Unnamed: 0" column.
dataset = pd.read_csv("data/data_cleaned.csv").drop(columns=["Unnamed: 0"])

In [3]:
# Display the loaded DataFrame to inspect its columns and a sample of rows.
dataset

Unnamed: 0,DateOfDeparture,Departure,Arrival,WeeksToDeparture,log_PAX,std_wtd,DateDay,DateMonth,DateYear
0,19/06/2012,ORD,DFW,12.875000,12.331296,9.812647,19,6,2012
1,10/09/2012,LAS,DEN,14.285714,10.775182,9.466734,10,9,2012
2,05/10/2012,DEN,LAX,10.863636,11.083177,9.035883,5,10,2012
3,09/10/2011,ATL,ORD,11.480000,11.169268,7.990202,9,10,2011
4,21/02/2012,DEN,SFO,11.450000,11.269364,9.517159,21,2,2012
...,...,...,...,...,...,...,...,...,...
8897,02/10/2011,DTW,ATL,9.263158,10.427055,7.316967,2,10,2011
8898,25/09/2012,DFW,ORD,12.772727,12.201552,10.641034,25,9,2012
8899,19/01/2012,SFO,LAS,11.047619,10.508746,7.908705,19,1,2012
8900,03/02/2013,ORD,PHL,6.076923,10.174042,4.030334,3,2,2013


In [4]:
# Pairwise correlations between the numeric columns. numeric_only=True is
# passed explicitly because the frame still holds string columns
# (DateOfDeparture, Departure, Arrival); relying on the implicit default is
# deprecated and raises on newer pandas versions.
dataset.corr(numeric_only=True)

  dataset.corr()


Unnamed: 0,WeeksToDeparture,log_PAX,std_wtd,DateDay,DateMonth,DateYear
WeeksToDeparture,1.0,0.163453,0.874999,0.032103,0.161949,-0.065728
log_PAX,0.163453,1.0,0.109231,0.010484,0.00588,-0.083499
std_wtd,0.874999,0.109231,1.0,0.010229,0.082489,0.041543
DateDay,0.032103,0.010484,0.010229,1.0,0.017332,-0.043961
DateMonth,0.161949,0.00588,0.082489,0.017332,1.0,-0.653862
DateYear,-0.065728,-0.083499,0.041543,-0.043961,-0.653862,1.0


In [5]:
# Count the missing values in each column.
dataset.isna().sum()

DateOfDeparture     0
Departure           0
Arrival             0
WeeksToDeparture    0
log_PAX             0
std_wtd             0
DateDay             0
DateMonth           0
DateYear            0
dtype: int64

In [6]:
# Number of distinct values in the Departure column.
dataset[["Departure"]].nunique()

Departure    20
dtype: int64

In [7]:
# Number of distinct values in the Arrival column.
dataset[["Arrival"]].nunique()

Arrival    20
dtype: int64

In [8]:
# Replace each airport code with an ordinal number. Despite its name,
# binary_encoder is an OrdinalEncoder; it is refitted on every column in turn,
# so after this cell it stays fitted on the last column only ("Arrival").
binary_encoder = OrdinalEncoder()
for airport_column in ("Departure", "Arrival"):
    dataset[[airport_column]] = binary_encoder.fit_transform(dataset[[airport_column]])

In [9]:
# Standardise the two booking-lead-time columns in place.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed further down, so test rows contribute to the scaling statistics.
scaled_columns = ["WeeksToDeparture", "std_wtd"]
scaler = StandardScaler()
dataset[scaled_columns] = scaler.fit_transform(dataset[scaled_columns])

In [10]:
# Absolute correlation of every numeric column with the target. The frame
# still contains the string column DateOfDeparture, so numeric_only=True is
# passed explicitly instead of relying on the deprecated implicit default.
dataset.corr(numeric_only=True).abs()[["log_PAX"]]

  dataset.corr().abs()[["log_PAX"]]


Unnamed: 0,log_PAX
Departure,0.08469
Arrival,0.084624
WeeksToDeparture,0.163453
log_PAX,1.0
std_wtd,0.109231
DateDay,0.010484
DateMonth,0.00588
DateYear,0.083499


In [11]:
# Model inputs: every column except the raw date string and the target.
non_feature_columns = ["DateOfDeparture", "log_PAX"]
features = dataset.drop(columns=non_feature_columns).to_numpy()
# The double brackets keep the target two-dimensional (one column).
target = dataset[["log_PAX"]].to_numpy()

In [12]:
# Inspect the feature matrix built in the previous cell.
features

array([[ 1.50000000e+01,  4.00000000e+00,  5.12572586e-01, ...,
         1.90000000e+01,  6.00000000e+00,  2.01200000e+03],
       [ 9.00000000e+00,  3.00000000e+00,  1.01875226e+00, ...,
         1.00000000e+01,  9.00000000e+00,  2.01200000e+03],
       [ 3.00000000e+00,  1.00000000e+01, -2.09126629e-01, ...,
         5.00000000e+00,  1.00000000e+01,  2.01200000e+03],
       ...,
       [ 1.90000000e+01,  9.00000000e+00, -1.43111633e-01, ...,
         1.90000000e+01,  1.00000000e+00,  2.01200000e+03],
       [ 1.50000000e+01,  1.60000000e+01, -1.92665157e+00, ...,
         3.00000000e+00,  2.00000000e+00,  2.01300000e+03],
       [ 5.00000000e+00,  0.00000000e+00, -6.88971838e-01, ...,
         2.60000000e+01,  1.10000000e+01,  2.01100000e+03]])

In [13]:
# Inspect the target array (a single log_PAX column).
target

array([[12.33129622],
       [10.77518151],
       [11.08317675],
       ...,
       [10.508746  ],
       [10.17404162],
       [ 9.20267425]])

In [14]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)

In [15]:
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> pd.DataFrame:
    """Summarise regression errors between true and predicted values.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values.
        precision: Number of decimals the RMSLE is rounded to. MSE and MAE
            are reported unrounded.

    Returns:
        A single-column DataFrame indexed by metric name ("MSE", "MAE",
        "RMSLE").
    """
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    # Square root of the mean squared *logarithmic* error, i.e. RMSLE; the row
    # label says so explicitly so it is not mistaken for a plain RMSE.
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return pd.DataFrame(
        data=[mse, mae, round(rmsle, precision)],
        index=["MSE", "MAE", "RMSLE"],
    )

In [16]:
# Fit the four baseline regressors on the same training split.
linear = LinearRegression()
linear.fit(X_train, y_train)
ridge = Ridge()
ridge.fit(X_train, y_train)
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
# The forest is seeded so its bootstrap samples and feature draws, and
# therefore the reported scores, are reproducible across runs.
rf = RandomForestRegressor(n_estimators=5, random_state=42)
# ravel() flattens the single-column target into the 1-D shape the forest expects.
rf.fit(X_train, y_train.ravel())

In [17]:
# Ground truth goes first and predictions second, matching the
# (y_test, y_pred) signature of compute_rmsle.
ridge_evaluate = compute_rmsle(y_test, ridge.predict(X_test))
ridge_evaluate

Unnamed: 0,0
MSE,0.916676
MAE,0.750714
RMSE,0.08


In [18]:
# Ground truth goes first and predictions second, matching the
# (y_test, y_pred) signature of compute_rmsle.
linear_evaluate = compute_rmsle(y_test, linear.predict(X_test))
linear_evaluate

Unnamed: 0,0
MSE,0.916676
MAE,0.750718
RMSE,0.08


In [19]:
# Ground truth goes first and predictions second, matching the
# (y_test, y_pred) signature of compute_rmsle.
xgb_evaluate = compute_rmsle(y_test, xgb.predict(X_test))
xgb_evaluate

Unnamed: 0,0
MSE,0.477599
MAE,0.499675
RMSE,0.06


In [20]:
# Ground truth goes first and predictions second, matching the
# (y_test, y_pred) signature of compute_rmsle.
rf_evaluate = compute_rmsle(y_test, rf.predict(X_test))
rf_evaluate

Unnamed: 0,0
MSE,0.689543
MAE,0.611906
RMSE,0.07


IMPROVE MODEL WITH EXTERNAL DATA

In [77]:
# Load the flight data merged with external columns and drop the columns
# that are not used below.
unused_columns = ["Unnamed: 0", "DateOfDeparture", "Precipitationmm", "Events"]
data_merged = pd.read_csv("data/data_merged.csv").drop(columns=unused_columns)

In [79]:
from sklearn.impute import SimpleImputer

# Fill missing gust speeds with the column mean.
# NOTE(review): the mean is computed over every row, before the train/test
# split performed further down, so test rows contribute to the imputed value.
gust_column = ["Max Gust SpeedKm/h"]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
data_merged[gust_column] = imputer.fit_transform(data_merged[gust_column])

In [80]:
# Display the merged frame after imputation.
data_merged

Unnamed: 0,Departure,Arrival,WeeksToDeparture,log_PAX,std_wtd,DateDay,DateMonth,DateYear,Max TemperatureC,Mean TemperatureC,...,Mean Sea Level PressurehPa,Min Sea Level PressurehPa,Max VisibilityKm,Mean VisibilityKm,Min VisibilitykM,Max Wind SpeedKm/h,Mean Wind SpeedKm/h,Max Gust SpeedKm/h,CloudCover,WindDirDegrees
0,ORD,DFW,12.875000,12.331296,9.812647,19,6,2012,35,31,...,1012,1009,16,16,16,37,25,56.0,1,208
1,ORD,PHX,11.772727,10.502073,8.970490,19,6,2012,35,31,...,1012,1009,16,16,16,37,25,56.0,1,208
2,ORD,SFO,13.480000,12.160800,9.372477,19,6,2012,35,31,...,1012,1009,16,16,16,37,25,56.0,1,208
3,ORD,ATL,11.173913,11.788080,8.819669,19,6,2012,35,31,...,1012,1009,16,16,16,37,25,56.0,1,208
4,LAS,DEN,14.285714,10.775182,9.466734,10,9,2012,36,32,...,1009,1005,16,15,11,32,18,50.0,5,187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8897,DTW,ATL,9.263158,10.427055,7.316967,2,10,2011,17,11,...,1020,1017,16,16,16,35,19,50.0,4,327
8898,DFW,ORD,12.772727,12.201552,10.641034,25,9,2012,35,29,...,1013,1010,16,16,16,48,30,60.0,4,187
8899,SFO,LAS,11.047619,10.508746,7.908705,19,1,2012,11,7,...,1021,1017,16,15,8,24,10,32.0,7,125
8900,ORD,PHL,6.076923,10.174042,4.030334,3,2,2013,-6,-9,...,1016,1014,16,9,2,34,14,42.0,7,268


In [81]:
# Ordinal-encode the airport and calendar columns one at a time.
# binary_encoder is the OrdinalEncoder instance created earlier in the
# notebook; it is refitted on every column, so it ends up fitted on the last
# one only ("DateYear").
for encoded_column in ("Departure", "Arrival", "DateDay", "DateMonth", "DateYear"):
    data_merged[[encoded_column]] = binary_encoder.fit_transform(data_merged[[encoded_column]])

In [82]:
# Display the merged frame again to check the encoded columns.
data_merged

Unnamed: 0,Departure,Arrival,WeeksToDeparture,log_PAX,std_wtd,DateDay,DateMonth,DateYear,Max TemperatureC,Mean TemperatureC,...,Mean Sea Level PressurehPa,Min Sea Level PressurehPa,Max VisibilityKm,Mean VisibilityKm,Min VisibilitykM,Max Wind SpeedKm/h,Mean Wind SpeedKm/h,Max Gust SpeedKm/h,CloudCover,WindDirDegrees
0,15.0,4.0,12.875000,12.331296,9.812647,18.0,5.0,1.0,35,31,...,1012,1009,16,16,16,37,25,56.0,1,208
1,15.0,17.0,11.772727,10.502073,8.970490,18.0,5.0,1.0,35,31,...,1012,1009,16,16,16,37,25,56.0,1,208
2,15.0,19.0,13.480000,12.160800,9.372477,18.0,5.0,1.0,35,31,...,1012,1009,16,16,16,37,25,56.0,1,208
3,15.0,0.0,11.173913,11.788080,8.819669,18.0,5.0,1.0,35,31,...,1012,1009,16,16,16,37,25,56.0,1,208
4,9.0,3.0,14.285714,10.775182,9.466734,9.0,8.0,1.0,36,32,...,1009,1005,16,15,11,32,18,50.0,5,187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8897,5.0,0.0,9.263158,10.427055,7.316967,1.0,9.0,0.0,17,11,...,1020,1017,16,16,16,35,19,50.0,4,327
8898,4.0,15.0,12.772727,12.201552,10.641034,24.0,8.0,1.0,35,29,...,1013,1010,16,16,16,48,30,60.0,4,187
8899,19.0,9.0,11.047619,10.508746,7.908705,18.0,0.0,1.0,11,7,...,1021,1017,16,15,8,24,10,32.0,7,125
8900,15.0,16.0,6.076923,10.174042,4.030334,2.0,1.0,2.0,-6,-9,...,1016,1014,16,9,2,34,14,42.0,7,268


In [83]:
# Flag, per column, whether any missing value remains.
data_merged.isnull().any()

Departure                     False
Arrival                       False
WeeksToDeparture              False
log_PAX                       False
std_wtd                       False
DateDay                       False
DateMonth                     False
DateYear                      False
Max TemperatureC              False
Mean TemperatureC             False
Min TemperatureC              False
Dew PointC                    False
MeanDew PointC                False
Min DewpointC                 False
Max Humidity                  False
Mean Humidity                 False
Min Humidity                  False
Max Sea Level PressurehPa     False
Mean Sea Level PressurehPa    False
Min Sea Level PressurehPa     False
Max VisibilityKm              False
Mean VisibilityKm             False
Min VisibilitykM              False
Max Wind SpeedKm/h            False
Mean Wind SpeedKm/h           False
Max Gust SpeedKm/h            False
CloudCover                    False
WindDirDegrees              

In [84]:
# Model inputs: every column of the merged frame except the target.
core_features = data_merged.drop(columns=["log_PAX"]).to_numpy()
# The double brackets keep the target two-dimensional (one column).
new_target = data_merged[["log_PAX"]].to_numpy()

In [85]:
# Same hold-out fraction and seed as the split used for the first set of models.
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    core_features,
    new_target,
    test_size=0.25,
    random_state=42,
)

In [86]:
# Fresh estimators for the feature set that includes the external columns.
nlinear = LinearRegression()
nridge = Ridge()
nxgb = XGBRegressor()
# The forest is seeded so its scores are reproducible across runs.
nrf = RandomForestRegressor(random_state=42)

In [87]:
# Fit every estimator on the enriched training split.
nlinear.fit(X_train_new, y_train_new)
nridge.fit(X_train_new, y_train_new)
nxgb.fit(X_train_new, y_train_new)
# The random forest expects a 1-D target; ravel() flattens the single-column
# array so it does not have to convert the shape itself (and warn about it).
nrf.fit(X_train_new, y_train_new.ravel())

  nrf.fit(X_train_new, y_train_new)


In [88]:
# Ground truth goes first and predictions second, matching the
# (y_test, y_pred) signature of compute_rmsle.
compute_rmsle(y_test_new, nlinear.predict(X_test_new))

Unnamed: 0,0
MSE,0.894907
MAE,0.735938
RMSE,0.08


In [89]:
# Ground truth goes first and predictions second, matching the
# (y_test, y_pred) signature of compute_rmsle.
compute_rmsle(y_test_new, nridge.predict(X_test_new))

Unnamed: 0,0
MSE,0.894907
MAE,0.735935
RMSE,0.08


In [90]:
# Ground truth goes first and predictions second, matching the
# (y_test, y_pred) signature of compute_rmsle.
compute_rmsle(y_test_new, nxgb.predict(X_test_new))

Unnamed: 0,0
MSE,0.5039
MAE,0.521144
RMSE,0.06


In [91]:
# Ground truth goes first and predictions second, matching the
# (y_test, y_pred) signature of compute_rmsle.
compute_rmsle(y_test_new, nrf.predict(X_test_new))

Unnamed: 0,0
MSE,0.585781
MAE,0.562916
RMSE,0.07
