In [1]:
import csv
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

plt.style.use('ggplot')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
from scipy.stats import uniform, randint
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, TimeSeriesSplit

In [3]:
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

In [4]:
f = pd.read_csv('C:\\beijing_engineered.csv')
df = pd.DataFrame(f)

In [5]:
df['date'] = pd.to_datetime(df.date)

In [6]:
# Setting DateTime index
df['date'] = pd.to_datetime(df.date)
df.set_index('date', inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 43799 entries, 2010-01-02 00:00:00 to 2014-12-31 22:00:00
Data columns (total 49 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pm25         43799 non-null  int64  
 1   dewp         43799 non-null  int64  
 2   temp         43799 non-null  int64  
 3   cws          43799 non-null  float64
 4   target       43799 non-null  float64
 5   cwkend       43799 non-null  float64
 6   cbwd_1       43799 non-null  int64  
 7   cbwd_2       43799 non-null  int64  
 8   cbwd_3       43799 non-null  int64  
 9   mth_sin      43799 non-null  float64
 10  mth_cos      43799 non-null  float64
 11  hour_sin     43799 non-null  float64
 12  hour_cos     43799 non-null  float64
 13  hour_num     43799 non-null  float64
 14  mth_num      43799 non-null  float64
 15  cmonth_2.0   43799 non-null  int64  
 16  cmonth_3.0   43799 non-null  int64  
 17  cmonth_4.0   43799 non-null  int64  
 18  cmonth_5.0 

In [8]:
# Change the integer variables to int32 type to reduce memory usage
df[['pm25', 'dewp', 'temp', 'target', 'hour_num', 'mth_num']] = \
    df[['pm25', 'dewp', 'temp', 'target', 'hour_num', 'mth_num']].astype(np.int32)

In [9]:
# Change the dummy variables to uint8 type to reduce memory usage
df[['cmonth_2.0', 'cmonth_3.0', 'cmonth_4.0', 'cmonth_5.0', 'cmonth_6.0', 'cmonth_7.0', 'cmonth_8.0', 'cmonth_9.0',
    'cmonth_10.0', 'cmonth_11.0', 'cmonth_12.0', 'chour_1.0', 'chour_2.0', 'chour_3.0', 'chour_4.0', 'chour_5.0',
    'chour_6.0', 'chour_7.0', 'chour_8.0', 'chour_9.0', 'chour_10.0', 'chour_11.0', 'chour_12.0', 'chour_13.0', 
    'chour_14.0', 'chour_15.0', 'chour_16.0', 'chour_17.0', 'chour_18.0', 'chour_19.0', 'chour_20.0', 'chour_21.0',
    'chour_22.0', 'chour_23.0', 'cwkend', 'cbwd_1', 'cbwd_2', 'cbwd_3']] = \
    df[['cmonth_2.0', 'cmonth_3.0', 'cmonth_4.0', 'cmonth_5.0', 'cmonth_6.0', 'cmonth_7.0', 'cmonth_8.0', 
        'cmonth_9.0', 'cmonth_10.0', 'cmonth_11.0', 'cmonth_12.0', 'chour_1.0', 'chour_2.0', 'chour_3.0',
        'chour_4.0', 'chour_5.0', 'chour_6.0', 'chour_7.0', 'chour_8.0', 'chour_9.0', 'chour_10.0', 'chour_11.0',
        'chour_12.0', 'chour_13.0', 'chour_14.0', 'chour_15.0', 'chour_16.0', 'chour_17.0', 'chour_18.0',
        'chour_19.0', 'chour_20.0', 'chour_21.0', 'chour_22.0', 'chour_23.0', 'cwkend', 'cbwd_1', 'cbwd_2', 
        'cbwd_3']].astype(np.uint8)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 43799 entries, 2010-01-02 00:00:00 to 2014-12-31 22:00:00
Data columns (total 49 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pm25         43799 non-null  int32  
 1   dewp         43799 non-null  int32  
 2   temp         43799 non-null  int32  
 3   cws          43799 non-null  float64
 4   target       43799 non-null  int32  
 5   cwkend       43799 non-null  uint8  
 6   cbwd_1       43799 non-null  uint8  
 7   cbwd_2       43799 non-null  uint8  
 8   cbwd_3       43799 non-null  uint8  
 9   mth_sin      43799 non-null  float64
 10  mth_cos      43799 non-null  float64
 11  hour_sin     43799 non-null  float64
 12  hour_cos     43799 non-null  float64
 13  hour_num     43799 non-null  int32  
 14  mth_num      43799 non-null  int32  
 15  cmonth_2.0   43799 non-null  uint8  
 16  cmonth_3.0   43799 non-null  uint8  
 17  cmonth_4.0   43799 non-null  uint8  
 18  cmonth_5.0 

# Create Dummies & Numeric Datasets

Form **two** separate datasets:
- DataFrame with **dummy month and hour variables**
- DataFrame with **single numerical month and hour variables**

In [11]:
# The dummy seasonality dataset
df_dum = df.drop(['mth_sin', 'mth_cos', 'hour_sin', 'hour_cos', 'hour_num', 'mth_num'], axis=1)

In [12]:
# The numerical seasonality dataset
df_num = df[['pm25', 'dewp', 'temp', 'cws', 'target', 'cwkend', 'cbwd_1', 
             'cbwd_2', 'cbwd_3', 'hour_num', 'mth_num']]

# Preparing Data for Optimization

### Train-Test Split

In [13]:
# Set test set at 20% of data
sample = int(len(df)*0.2)

In [14]:
sample

8759

In [15]:
# Split dummy seasonality dataset
df_dumt = df_dum.iloc[-sample:]
df_dum = df_dum.iloc[:-sample]

In [16]:
df_dum.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 35040 entries, 2010-01-02 00:00:00 to 2013-12-31 23:00:00
Data columns (total 43 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pm25         35040 non-null  int32  
 1   dewp         35040 non-null  int32  
 2   temp         35040 non-null  int32  
 3   cws          35040 non-null  float64
 4   target       35040 non-null  int32  
 5   cwkend       35040 non-null  uint8  
 6   cbwd_1       35040 non-null  uint8  
 7   cbwd_2       35040 non-null  uint8  
 8   cbwd_3       35040 non-null  uint8  
 9   cmonth_2.0   35040 non-null  uint8  
 10  cmonth_3.0   35040 non-null  uint8  
 11  cmonth_4.0   35040 non-null  uint8  
 12  cmonth_5.0   35040 non-null  uint8  
 13  cmonth_6.0   35040 non-null  uint8  
 14  cmonth_7.0   35040 non-null  uint8  
 15  cmonth_8.0   35040 non-null  uint8  
 16  cmonth_9.0   35040 non-null  uint8  
 17  cmonth_10.0  35040 non-null  uint8  
 18  cmonth_11.0

In [17]:
df_dumt.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8759 entries, 2014-01-01 00:00:00 to 2014-12-31 22:00:00
Data columns (total 43 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pm25         8759 non-null   int32  
 1   dewp         8759 non-null   int32  
 2   temp         8759 non-null   int32  
 3   cws          8759 non-null   float64
 4   target       8759 non-null   int32  
 5   cwkend       8759 non-null   uint8  
 6   cbwd_1       8759 non-null   uint8  
 7   cbwd_2       8759 non-null   uint8  
 8   cbwd_3       8759 non-null   uint8  
 9   cmonth_2.0   8759 non-null   uint8  
 10  cmonth_3.0   8759 non-null   uint8  
 11  cmonth_4.0   8759 non-null   uint8  
 12  cmonth_5.0   8759 non-null   uint8  
 13  cmonth_6.0   8759 non-null   uint8  
 14  cmonth_7.0   8759 non-null   uint8  
 15  cmonth_8.0   8759 non-null   uint8  
 16  cmonth_9.0   8759 non-null   uint8  
 17  cmonth_10.0  8759 non-null   uint8  
 18  cmonth_11.0 

In [18]:
# Target variables
y_dumt = df_dumt.pop('target')
y_dum = df_dum.pop('target')

In [19]:
# Split numerical seasonality dataset
df_numt = df_num.iloc[-sample:]
df_num = df_num.iloc[:-sample]

In [20]:
# Target variables
y_numt = df_numt.pop('target')
y_num = df_num.pop('target')

### Scaling data

In [21]:
# Using MinMaxScaler due to the comparison with the dummy seasonality data
scaler = MinMaxScaler()

In [22]:
# Scaling the dummies dataset
X_dum = scaler.fit_transform(df_dum)
X_dumt = scaler.transform(df_dumt)

In [23]:
# Scaling the numerical dataset
X_num = scaler.fit_transform(df_num)
X_numt = scaler.transform(df_numt)

In [24]:
#Make time series validation scheme for Nested Cross-Validation
time_split = TimeSeriesSplit(n_splits = 3)

# Randomized Search CV

The first optimization is undertaken on the **dummy seasonality** dataset

In [25]:
# Lasso Regression
lasso = Lasso(random_state=11)

las_dist = {'fit_intercept': [1, 0],
            'alpha': uniform(0.001, 2)}


In [26]:
rs_las = RandomizedSearchCV(lasso, las_dist, cv=time_split, scoring='neg_mean_squared_error', 
                            n_jobs=-1, verbose=1)

In [27]:
rs_las.fit(X_dum, y_dum)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=Lasso(random_state=11), n_jobs=-1,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B3948A1F0>,
                                        'fit_intercept': [1, 0]},
                   scoring='neg_mean_squared_error', verbose=1)

In [28]:
# Ridge Regression
ridge = Ridge(random_state=11)

rdg_dist = {'fit_intercept': [1, 0],
            'solver': ['lsqr', 'sag', 'cholesky'],
            'alpha': uniform(0.001, 2)}


In [29]:
rs_rdg = RandomizedSearchCV(ridge, rdg_dist, cv=time_split, scoring='neg_mean_squared_error', 
                            n_jobs=-1, verbose=1)

In [30]:
rs_rdg.fit(X_dum, y_dum)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=Ridge(random_state=11), n_jobs=-1,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B394AD790>,
                                        'fit_intercept': [1, 0],
                                        'solver': ['lsqr', 'sag', 'cholesky']},
                   scoring='neg_mean_squared_error', verbose=1)

In [31]:
# Random Forest
rf = RandomForestRegressor(random_state=11)

rf_dist = {'n_estimators': randint(50, 500),
           'min_samples_split': randint(2, 9),
           'max_features': ['auto', 'log2', 'sqrt']}


In [32]:
rs_rf = RandomizedSearchCV(rf, rf_dist, cv=time_split, scoring = 'neg_mean_squared_error', 
                           n_jobs=-1, verbose=1)

In [33]:
rs_rf.fit(X_dum, y_dum)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=RandomForestRegressor(random_state=11), n_jobs=-1,
                   param_distributions={'max_features': ['auto', 'log2',
                                                         'sqrt'],
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B39491D60>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B39214790>},
                   scoring='neg_mean_squared_error', verbose=1)

In [34]:
# XG Boost Regressor
xgb = xgb.XGBRegressor(objective='reg:squarederror', random_state=11)

xgb_dist = {'n_estimators': randint(50, 500),
            'subsample': [0.5, 0.7, 1],
            'eta': uniform(0.05, 1),
            'gamma': randint(0, 300)}


In [35]:
rs_xgb = RandomizedSearchCV(xgb, xgb_dist, cv=time_split, scoring = 'neg_mean_squared_error', 
                            n_jobs=-1, verbose=1)

In [36]:
rs_xgb.fit(X_dum, y_dum)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_w...
                                          verbosity=None),
                   n_jobs=-1,
                   param_distributions={'eta': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B39423910>,
                                        'gamma': <scipy.stats._

In [37]:
# Support Vector Regressor
svm = SVR()

svm_dist = {'kernel': ['rbf', 'linear', 'poly'],
            'gamma': ['scale', 'auto', 0.2], 
            'epsilon': uniform(0.01, 3),
            'C': uniform(0.1, 5)}


In [38]:
rs_svm = RandomizedSearchCV(svm, svm_dist, cv=time_split, scoring = 'neg_mean_squared_error', 
                            n_jobs=-1, verbose=1)

In [39]:
rs_svm.fit(X_dum, y_dum)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=SVR(), n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B39438F40>,
                                        'epsilon': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B3942E760>,
                                        'gamma': ['scale', 'auto', 0.2],
                                        'kernel': ['rbf', 'linear', 'poly']},
                   scoring='neg_mean_squared_error', verbose=1)

In [40]:
# Multi-Layer Perceptron
# Typically 1-2 hidden layers are adequate, and the optimal size of the first hidden layer is usually... 
# between that of the input and the output layers, or 42 and 1 in this case (42 is the max with all the time dummies)

mlp = MLPRegressor(early_stopping=True, max_iter=10000, random_state=11)

mlp_dist = {'hidden_layer_sizes': [(18,), (24,), (32,), (18,10), (24,10), (24,18), (32,10), (32,18), (32,24)],
            'alpha': uniform(0.01, 5),
            'activation': ['relu', 'tanh'],
            'solver': ['lbfgs', 'adam']}


In [41]:
rs_mlp = RandomizedSearchCV(mlp, mlp_dist, cv=time_split, scoring = 'neg_mean_squared_error', 
                            n_jobs=-1, verbose=1)

In [42]:
rs_mlp.fit(X_dum, y_dum)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=MLPRegressor(early_stopping=True, max_iter=10000,
                                          random_state=11),
                   n_jobs=-1,
                   param_distributions={'activation': ['relu', 'tanh'],
                                        'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B39454850>,
                                        'hidden_layer_sizes': [(18,), (24,),
                                                               (32,), (18, 10),
                                                               (24, 10),
                                                               (24, 18),
                                                               (32, 10),
                                                               (32, 18),
                                                               (32, 24)],
                

### Best parameters found

In [43]:
rs_las.best_params_

{'alpha': 0.39588637600600995, 'fit_intercept': 0}

In [44]:
rs_rdg.best_params_

{'alpha': 0.7116179037220222, 'fit_intercept': 1, 'solver': 'cholesky'}

In [45]:
rs_rf.best_params_

{'max_features': 'auto', 'min_samples_split': 7, 'n_estimators': 458}

In [46]:
rs_xgb.best_params_

{'eta': 0.1683725558025208, 'gamma': 256, 'n_estimators': 467, 'subsample': 1}

In [47]:
rs_svm.best_params_

{'C': 4.975296493539874,
 'epsilon': 1.2482012816757908,
 'gamma': 'auto',
 'kernel': 'linear'}

In [48]:
rs_mlp.best_params_

{'activation': 'relu',
 'alpha': 3.6460757588153223,
 'hidden_layer_sizes': (24,),
 'solver': 'adam'}

# Cross-Validation & Scoring on Dummies Data

### Cross-validation on training data

In [49]:
las_scores_dum = cross_val_score(rs_las, X_dum, y_dum, cv = time_split, n_jobs=-1, 
                                 scoring = 'neg_mean_absolute_error')

In [50]:
rdg_scores_dum = cross_val_score(rs_rdg, X_dum, y_dum, cv = time_split, n_jobs=-1, 
                                 scoring = 'neg_mean_absolute_error')

In [51]:
rf_scores_dum = cross_val_score(rs_rf, X_dum, y_dum, cv = time_split, n_jobs=-1, 
                                scoring = 'neg_mean_absolute_error')

In [52]:
xgb_scores_dum = cross_val_score(rs_xgb, X_dum, y_dum, cv = time_split, 
                                 scoring = 'neg_mean_absolute_error')

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [53]:
svm_scores_dum = cross_val_score(rs_svm, X_dum, y_dum, cv = time_split, n_jobs=-1, 
                                 scoring = 'neg_mean_absolute_error')

In [54]:
mlp_scores_dum = cross_val_score(rs_mlp, X_dum, y_dum, cv = time_split, n_jobs=-1, 
                                 scoring = 'neg_mean_absolute_error')

In [55]:
cv_dict_dum = {
    'Lasso Regression': -np.round(las_scores_dum.mean(), 4),
    'Ridge Regression': -np.round(rdg_scores_dum.mean(), 4),
    'Random Forest': -np.round(rf_scores_dum.mean(), 4),
    'Xtreme Gradient Boost': -np.round(xgb_scores_dum.mean(), 4),
    'Support Vector Machine': -np.round(svm_scores_dum.mean(), 4),
    'Multi-Layer Perceptron': -np.round(mlp_scores_dum.mean(), 4),
}

In [56]:
df_cv_dum = pd.DataFrame({'Model': cv_dict_dum.keys(), 'Average MAE': cv_dict_dum.values()})

### Scoring test data

In [57]:
def reg_scoring(X, y, reg_dict):
    '''
    Objective: Cycles through a dictionary of trained models, using them to make predictions, scores those 
    predictions on MAE, MSE & RMSE, and generates DataFrames of the scores and model predictions respectively
    
    X: DataFrame containing the explanatory variables
    
    y: Target variable
    
    reg_dict: Dictionary of trained/fitted models
    '''
    
    test1_scores = []
    test2_scores = []
    
    df_pred = pd.DataFrame(columns=reg_dict.keys()) # Columns of DF will accord with reg_dict keys
    
    # Loop through Dictionary items
    for key, reg in reg_dict.items():
        
        pred_y = reg.predict(X)
        df_pred[key] = pd.Series(pred_y).transpose()
        
        # Computing test scores for each model
        test1_scores.append(round(mean_absolute_error(y, pred_y), 4))
        test2_scores.append(round(mean_squared_error(y, pred_y, squared=False), 4))
        
    # Generate results DataFrame
    results = pd.DataFrame({'Model': list(reg_dict.keys()), 
                            'Mean Absolute Error': test1_scores,
                            'Root Mean Squared Error': test2_scores
                            })
    
    # Add target variable to the DataFrame of predictions
    df_pred['Target'] = y.tolist()
    
    return results, df_pred


In [58]:
# Dictionary of TRAINED models
reg_dict = {
    'Lasso Regression': rs_las,
    'Ridge Regression': rs_rdg,
    'Random Forest': rs_rf,
    'Xtreme Gradient Boost': rs_xgb,
    'Support Vector Machine': rs_svm,
    'Multi-Layer Perceptron': rs_mlp
}

In [59]:
scores_dum, df_pred_dum = reg_scoring(X_dumt, y_dumt, reg_dict)

# Refit on Numerical Seasonality Data & Scoring

In [60]:
rs_las.fit(X_num, y_num)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=Lasso(random_state=11), n_jobs=-1,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B3948A1F0>,
                                        'fit_intercept': [1, 0]},
                   scoring='neg_mean_squared_error', verbose=1)

In [61]:
rs_rdg.fit(X_num, y_num)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=Ridge(random_state=11), n_jobs=-1,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B394AD790>,
                                        'fit_intercept': [1, 0],
                                        'solver': ['lsqr', 'sag', 'cholesky']},
                   scoring='neg_mean_squared_error', verbose=1)

In [62]:
rs_rf.fit(X_num, y_num)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=RandomForestRegressor(random_state=11), n_jobs=-1,
                   param_distributions={'max_features': ['auto', 'log2',
                                                         'sqrt'],
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B39491D60>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B39214790>},
                   scoring='neg_mean_squared_error', verbose=1)

In [63]:
rs_xgb.fit(X_num, y_num)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_w...
                                          verbosity=None),
                   n_jobs=-1,
                   param_distributions={'eta': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B39423910>,
                                        'gamma': <scipy.stats._

In [64]:
rs_svm.fit(X_num, y_num)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=SVR(), n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B39438F40>,
                                        'epsilon': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B3942E760>,
                                        'gamma': ['scale', 'auto', 0.2],
                                        'kernel': ['rbf', 'linear', 'poly']},
                   scoring='neg_mean_squared_error', verbose=1)

In [65]:
rs_mlp.fit(X_num, y_num)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=MLPRegressor(early_stopping=True, max_iter=10000,
                                          random_state=11),
                   n_jobs=-1,
                   param_distributions={'activation': ['relu', 'tanh'],
                                        'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020B39454850>,
                                        'hidden_layer_sizes': [(18,), (24,),
                                                               (32,), (18, 10),
                                                               (24, 10),
                                                               (24, 18),
                                                               (32, 10),
                                                               (32, 18),
                                                               (32, 24)],
                

### Best parameters found

In [66]:
rs_las.best_params_

{'alpha': 0.03372347124849073, 'fit_intercept': 1}

In [67]:
rs_rdg.best_params_

{'alpha': 0.07173889899357089, 'fit_intercept': 1, 'solver': 'cholesky'}

In [68]:
rs_rf.best_params_

{'max_features': 'auto', 'min_samples_split': 8, 'n_estimators': 283}

In [69]:
rs_xgb.best_params_

{'eta': 0.05529035373470943, 'gamma': 88, 'n_estimators': 121, 'subsample': 1}

In [70]:
rs_svm.best_params_

{'C': 2.4125397311304853,
 'epsilon': 2.759305988743781,
 'gamma': 'scale',
 'kernel': 'linear'}

In [71]:
rs_mlp.best_params_

{'activation': 'relu',
 'alpha': 1.2797064270567304,
 'hidden_layer_sizes': (24,),
 'solver': 'adam'}

# Cross-Validation & Scoring on Categorical Data

In [72]:
las_scores_num = cross_val_score(rs_las, X_num, y_num, cv = time_split, n_jobs=-1, 
                                 scoring = 'neg_mean_absolute_error')

In [73]:
rdg_scores_num = cross_val_score(rs_rdg, X_num, y_num, cv = time_split, n_jobs=-1, 
                                 scoring = 'neg_mean_absolute_error')

In [74]:
rf_scores_num = cross_val_score(rs_rf, X_num, y_num, cv = time_split, n_jobs=-1, 
                                scoring = 'neg_mean_absolute_error')

In [75]:
xgb_scores_num = cross_val_score(rs_xgb, X_num, y_num, cv = time_split, 
                                 scoring = 'neg_mean_absolute_error')

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [76]:
svm_scores_num = cross_val_score(rs_svm, X_num, y_num, cv = time_split, n_jobs=-1, 
                                 scoring = 'neg_mean_absolute_error')

In [77]:
mlp_scores_num = cross_val_score(rs_mlp, X_num, y_num, cv = time_split, n_jobs=-1, 
                                 scoring = 'neg_mean_absolute_error')

In [78]:
cv_dict_num = {
    'Lasso Regression': -np.round(las_scores_num.mean(), 4),
    'Ridge Regression': -np.round(rdg_scores_num.mean(), 4),
    'Random Forest': -np.round(rf_scores_num.mean(), 4),
    'Xtreme Gradient Boost': -np.round(xgb_scores_num.mean(), 4),
    'Support Vector Machine': -np.round(svm_scores_num.mean(), 4),
    'Multi-Layer Perceptron': -np.round(mlp_scores_num.mean(), 4),
}

In [79]:
df_cv_num = pd.DataFrame({'Model': cv_dict_num.keys(), 'Average MAE': cv_dict_num.values()})

### Scoring on test data

In [80]:
scores_num, df_pred_num = reg_scoring(X_numt, y_numt, reg_dict)

In [81]:
df_pred_dum['date'] = df_dumt.index
df_pred_num['date'] = df_numt.index

In [82]:
#df_pred_dum.to_csv(r'C:\\bj_pred_dum.csv', index=False)

In [83]:
#df_pred_num.to_csv(r'C:\\bj_pred_num.csv', index=False)

# Comparative Scores

### Cross-validation scores

In [84]:
df_cv_dum

Unnamed: 0,Model,Average MAE
0,Lasso Regression,12.933
1,Ridge Regression,13.0188
2,Random Forest,13.87
3,Xtreme Gradient Boost,13.8072
4,Support Vector Machine,14.2932
5,Multi-Layer Perceptron,13.2002


In [85]:
df_cv_num

Unnamed: 0,Model,Average MAE
0,Lasso Regression,13.071
1,Ridge Regression,12.9175
2,Random Forest,13.7069
3,Xtreme Gradient Boost,14.2799
4,Support Vector Machine,18.8394
5,Multi-Layer Perceptron,12.782


### Test data scores

In [86]:
scores_dum

Unnamed: 0,Model,Mean Absolute Error,Root Mean Squared Error
0,Lasso Regression,12.972,22.6608
1,Ridge Regression,12.1694,21.7987
2,Random Forest,12.6537,22.7154
3,Xtreme Gradient Boost,12.4125,22.5786
4,Support Vector Machine,11.7966,21.8209
5,Multi-Layer Perceptron,12.1988,21.9011


In [87]:
scores_num

Unnamed: 0,Model,Mean Absolute Error,Root Mean Squared Error
0,Lasso Regression,12.1246,21.867
1,Ridge Regression,12.1074,21.848
2,Random Forest,12.612,22.5262
3,Xtreme Gradient Boost,11.8641,21.7519
4,Support Vector Machine,12.1071,21.9543
5,Multi-Layer Perceptron,12.1093,21.9595
