In [1]:
import csv
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

plt.style.use('ggplot')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
from scipy.stats import uniform, randint
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, TimeSeriesSplit

In [3]:
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

In [4]:
f = pd.read_csv('C:\\ny_engineered.csv')
df = pd.DataFrame(f)

In [5]:
df['date'] = pd.to_datetime(df.date)

In [6]:
# Setting DateTime index
df['date'] = pd.to_datetime(df.date)
df.set_index('date', inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 39455 entries, 2015-07-02 00:00:00 to 2019-12-31 22:00:00
Data columns (total 45 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   load         39455 non-null  float64
 1   temp         39455 non-null  float64
 2   humid        39455 non-null  float64
 3   target       39455 non-null  float64
 4   cwkend       39455 non-null  float64
 5   mth_sin      39455 non-null  float64
 6   mth_cos      39455 non-null  float64
 7   hour_sin     39455 non-null  float64
 8   hour_cos     39455 non-null  float64
 9   hour_num     39455 non-null  float64
 10  mth_num      39455 non-null  float64
 11  cmonth_2.0   39455 non-null  int64  
 12  cmonth_3.0   39455 non-null  int64  
 13  cmonth_4.0   39455 non-null  int64  
 14  cmonth_5.0   39455 non-null  int64  
 15  cmonth_6.0   39455 non-null  int64  
 16  cmonth_7.0   39455 non-null  int64  
 17  cmonth_8.0   39455 non-null  int64  
 18  cmonth_9.0 

In [8]:
# Change the integer variables to int32 type to reduce memory usage
df[['load', 'humid', 'target', 'hour_num', 'mth_num']] = \
    df[['load', 'humid', 'target', 'hour_num', 'mth_num']].astype(np.int32)

In [9]:
# Change the dummy variables to uint8 type to reduce memory usage
df[['cmonth_2.0', 'cmonth_3.0', 'cmonth_4.0', 'cmonth_5.0', 'cmonth_6.0', 'cmonth_7.0', 'cmonth_8.0', 'cmonth_9.0',
    'cmonth_10.0', 'cmonth_11.0', 'cmonth_12.0', 'chour_1.0', 'chour_2.0', 'chour_3.0', 'chour_4.0', 'chour_5.0',
    'chour_6.0', 'chour_7.0', 'chour_8.0', 'chour_9.0', 'chour_10.0', 'chour_11.0', 'chour_12.0', 'chour_13.0', 
    'chour_14.0', 'chour_15.0', 'chour_16.0', 'chour_17.0', 'chour_18.0', 'chour_19.0', 'chour_20.0', 'chour_21.0',
    'chour_22.0', 'chour_23.0', 'cwkend']] = \
    df[['cmonth_2.0', 'cmonth_3.0', 'cmonth_4.0', 'cmonth_5.0', 'cmonth_6.0', 'cmonth_7.0', 'cmonth_8.0', 
        'cmonth_9.0', 'cmonth_10.0', 'cmonth_11.0', 'cmonth_12.0', 'chour_1.0', 'chour_2.0', 'chour_3.0',
        'chour_4.0', 'chour_5.0', 'chour_6.0', 'chour_7.0', 'chour_8.0', 'chour_9.0', 'chour_10.0', 'chour_11.0',
        'chour_12.0', 'chour_13.0', 'chour_14.0', 'chour_15.0', 'chour_16.0', 'chour_17.0', 'chour_18.0',
        'chour_19.0', 'chour_20.0', 'chour_21.0', 'chour_22.0', 'chour_23.0', 'cwkend']].astype(np.uint8)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 39455 entries, 2015-07-02 00:00:00 to 2019-12-31 22:00:00
Data columns (total 45 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   load         39455 non-null  int32  
 1   temp         39455 non-null  float64
 2   humid        39455 non-null  int32  
 3   target       39455 non-null  int32  
 4   cwkend       39455 non-null  uint8  
 5   mth_sin      39455 non-null  float64
 6   mth_cos      39455 non-null  float64
 7   hour_sin     39455 non-null  float64
 8   hour_cos     39455 non-null  float64
 9   hour_num     39455 non-null  int32  
 10  mth_num      39455 non-null  int32  
 11  cmonth_2.0   39455 non-null  uint8  
 12  cmonth_3.0   39455 non-null  uint8  
 13  cmonth_4.0   39455 non-null  uint8  
 14  cmonth_5.0   39455 non-null  uint8  
 15  cmonth_6.0   39455 non-null  uint8  
 16  cmonth_7.0   39455 non-null  uint8  
 17  cmonth_8.0   39455 non-null  uint8  
 18  cmonth_9.0 

# Create Cyclic Seasonality Dataset

In [11]:
df_cyc = df.drop(['cmonth_2.0', 'cmonth_3.0', 'cmonth_4.0', 'cmonth_5.0', 'cmonth_6.0', 'cmonth_7.0', 
                  'cmonth_8.0', 'cmonth_9.0', 'cmonth_10.0', 'cmonth_11.0', 'cmonth_12.0', 'chour_1.0', 
                  'chour_2.0', 'chour_3.0', 'chour_4.0', 'chour_5.0', 'chour_6.0', 'chour_7.0', 
                  'chour_8.0', 'chour_9.0', 'chour_10.0', 'chour_11.0', 'chour_12.0', 'chour_13.0', 
                  'chour_14.0', 'chour_15.0', 'chour_16.0', 'chour_17.0', 'chour_18.0', 'chour_19.0', 
                  'chour_20.0', 'chour_21.0', 'chour_22.0', 'chour_23.0', 'mth_num', 'hour_num'], axis=1)

# Preparing Data for Optimization

### Train-Test Split

In [12]:
# Set test set at 20% of data
sample = int(len(df)*0.2)

In [13]:
sample

7891

In [14]:
df_cyct = df_cyc.iloc[-sample:]
df_cyc = df_cyc.iloc[:-sample]

In [15]:
df_cyc.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 31564 entries, 2015-07-02 00:00:00 to 2019-02-06 03:00:00
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   load      31564 non-null  int32  
 1   temp      31564 non-null  float64
 2   humid     31564 non-null  int32  
 3   target    31564 non-null  int32  
 4   cwkend    31564 non-null  uint8  
 5   mth_sin   31564 non-null  float64
 6   mth_cos   31564 non-null  float64
 7   hour_sin  31564 non-null  float64
 8   hour_cos  31564 non-null  float64
dtypes: float64(5), int32(3), uint8(1)
memory usage: 1.8 MB


In [16]:
df_cyct.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7891 entries, 2019-02-06 04:00:00 to 2019-12-31 22:00:00
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   load      7891 non-null   int32  
 1   temp      7891 non-null   float64
 2   humid     7891 non-null   int32  
 3   target    7891 non-null   int32  
 4   cwkend    7891 non-null   uint8  
 5   mth_sin   7891 non-null   float64
 6   mth_cos   7891 non-null   float64
 7   hour_sin  7891 non-null   float64
 8   hour_cos  7891 non-null   float64
dtypes: float64(5), int32(3), uint8(1)
memory usage: 470.1 KB


In [17]:
# Target variables, which will be the SAME for all the various datasets
y_cyct = df_cyct.pop('target')
y_cyc = df_cyc.pop('target')

### Scaling data

In [18]:
# Using MinMaxScaler due to the comparison with the dummy seasonality treatment
scaler = MinMaxScaler()

In [19]:
# Scaling the cyclic dataset
X_cyc = scaler.fit_transform(df_cyc)
X_cyct = scaler.transform(df_cyct)

In [20]:
#Make an inner and outer validation scheme for Nested Cross-Validation
time_split = TimeSeriesSplit(n_splits = 3)

# Randomized Search CV

Optimization on the **cyclic seasonality** dataset

In [21]:
# Lasso Regression
lasso = Lasso(random_state=11)

las_dist = {'fit_intercept': [1, 0],
            'alpha': uniform(0.001, 2)}


In [22]:
rs_las = RandomizedSearchCV(lasso, las_dist, cv=time_split, scoring='neg_mean_squared_error', 
                            n_jobs=-1, verbose=1)

In [23]:
rs_las.fit(X_cyc, y_cyc)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=Lasso(random_state=11), n_jobs=-1,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021CA01E6D30>,
                                        'fit_intercept': [1, 0]},
                   scoring='neg_mean_squared_error', verbose=1)

In [24]:
# Ridge Regression
ridge = Ridge(random_state=11)

rdg_dist = {'fit_intercept': [1, 0],
            'solver': ['lsqr', 'sag', 'cholesky'],
            'alpha': uniform(0.001, 2)}


In [25]:
rs_rdg = RandomizedSearchCV(ridge, rdg_dist, cv=time_split, scoring='neg_mean_squared_error', 
                            n_jobs=-1, verbose=1)

In [26]:
rs_rdg.fit(X_cyc, y_cyc)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=Ridge(random_state=11), n_jobs=-1,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021CA0126850>,
                                        'fit_intercept': [1, 0],
                                        'solver': ['lsqr', 'sag', 'cholesky']},
                   scoring='neg_mean_squared_error', verbose=1)

In [27]:
# Random Forest
rf = RandomForestRegressor(random_state=11)

rf_dist = {'n_estimators': randint(50, 500),
           'min_samples_split': randint(2, 9),
           'max_features': ['auto', 'log2', 'sqrt']}


In [28]:
rs_rf = RandomizedSearchCV(rf, rf_dist, cv=time_split, scoring = 'neg_mean_squared_error', 
                           n_jobs=-1, verbose=1)

In [29]:
rs_rf.fit(X_cyc, y_cyc)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=RandomForestRegressor(random_state=11), n_jobs=-1,
                   param_distributions={'max_features': ['auto', 'log2',
                                                         'sqrt'],
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021C9FFE3EE0>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021C9FAC3430>},
                   scoring='neg_mean_squared_error', verbose=1)

In [30]:
# XG Boost Regressor
xgb = xgb.XGBRegressor(objective='reg:squarederror', random_state=11)

xgb_dist = {'n_estimators': randint(50, 500),
            'subsample': [0.5, 0.7, 1],
            'eta': uniform(0.05, 1.0),
            'gamma': randint(0, 300)}


In [31]:
rs_xgb = RandomizedSearchCV(xgb, xgb_dist, cv=time_split, scoring = 'neg_mean_squared_error', 
                            n_jobs=-1, verbose=1)

In [32]:
rs_xgb.fit(X_cyc, y_cyc)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_w...
                                          verbosity=None),
                   n_jobs=-1,
                   param_distributions={'eta': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021C9FB57760>,
                                        'gamma': <scipy.stats._

In [33]:
# Support Vector Regressor
svm = SVR()

svm_dist = {'kernel': ['rbf', 'linear', 'poly'],
            'gamma': ['scale', 'auto', 0.2], 
            'epsilon': uniform(0.01, 3),
            'C': uniform(0.1, 5)}


In [34]:
rs_svm = RandomizedSearchCV(svm, svm_dist, cv=time_split, scoring = 'neg_mean_squared_error', 
                            n_jobs=-1, verbose=1)

In [35]:
rs_svm.fit(X_cyc, y_cyc)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=SVR(), n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021C9FB687C0>,
                                        'epsilon': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021C9FB619A0>,
                                        'gamma': ['scale', 'auto', 0.2],
                                        'kernel': ['rbf', 'linear', 'poly']},
                   scoring='neg_mean_squared_error', verbose=1)

In [36]:
# Multi-Layer Perceptron
# Typically 1-2 hidden layers are adequate, and the optimal size of the first hidden layer is usually... 
# between that of the input and the output layers, or 38 and 1 in this case (38 is the max with all the time dummies)

mlp = MLPRegressor(early_stopping=True, max_iter=10000, random_state=11)

mlp_dist = {'hidden_layer_sizes': [(15,), (22,), (30,), (15,8), (22,8), (22,15), (30,8), (30,15)],
            'alpha': uniform(0.01, 5),
            'activation': ['relu', 'tanh'],
            'solver': ['lbfgs', 'adam']}


In [37]:
rs_mlp = RandomizedSearchCV(mlp, mlp_dist, cv=time_split, scoring = 'neg_mean_squared_error', 
                            n_jobs=-1, verbose=1)

In [38]:
rs_mlp.fit(X_cyc, y_cyc)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=MLPRegressor(early_stopping=True, max_iter=10000,
                                          random_state=11),
                   n_jobs=-1,
                   param_distributions={'activation': ['relu', 'tanh'],
                                        'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021C9FB868E0>,
                                        'hidden_layer_sizes': [(15,), (22,),
                                                               (30,), (15, 8),
                                                               (22, 8),
                                                               (22, 15),
                                                               (30, 8),
                                                               (30, 15)],
                                        'solver': ['lbfgs', 'adam']},
                   sco

### Best parameters found

In [39]:
rs_las.best_params_

{'alpha': 0.5227665708609351, 'fit_intercept': 1}

In [40]:
rs_rdg.best_params_

{'alpha': 0.47675736919835654, 'fit_intercept': 1, 'solver': 'lsqr'}

In [41]:
rs_rf.best_params_

{'max_features': 'log2', 'min_samples_split': 4, 'n_estimators': 414}

In [42]:
rs_xgb.best_params_

{'eta': 0.2109542143654975, 'gamma': 148, 'n_estimators': 492, 'subsample': 1}

In [43]:
rs_svm.best_params_

{'C': 4.974239991626877,
 'epsilon': 1.2819822983393951,
 'gamma': 'scale',
 'kernel': 'poly'}

In [44]:
rs_mlp.best_params_

{'activation': 'relu',
 'alpha': 2.2756697508499046,
 'hidden_layer_sizes': (30, 15),
 'solver': 'lbfgs'}

# Cross-Validation & Scoring on Cyclic Data

### Cross-validation on training data

In [45]:
las_scores_cyc = cross_val_score(rs_las, X_cyc, y_cyc, cv = time_split, n_jobs=-1, 
                                 scoring = 'neg_mean_absolute_error')

In [46]:
rdg_scores_cyc = cross_val_score(rs_rdg, X_cyc, y_cyc, cv = time_split, n_jobs=-1, 
                                 scoring = 'neg_mean_absolute_error')

In [47]:
rf_scores_cyc = cross_val_score(rs_rf, X_cyc, y_cyc, cv = time_split, n_jobs=-1, 
                                scoring = 'neg_mean_absolute_error')

In [48]:
xgb_scores_cyc = cross_val_score(rs_xgb, X_cyc, y_cyc, cv = time_split, 
                                 scoring = 'neg_mean_absolute_error')

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [49]:
svm_scores_cyc = cross_val_score(rs_svm, X_cyc, y_cyc, cv = time_split, n_jobs=-1, 
                                 scoring = 'neg_mean_absolute_error')

In [50]:
mlp_scores_cyc = cross_val_score(rs_mlp, X_cyc, y_cyc, cv = time_split, n_jobs=-1, 
                                 scoring = 'neg_mean_absolute_error')

In [51]:
cv_dict_cyc = {
    'Lasso Regression': -np.round(las_scores_cyc.mean(), 4),
    'Ridge Regression': -np.round(rdg_scores_cyc.mean(), 4),
    'Random Forest': -np.round(rf_scores_cyc.mean(), 4),
    'Xtreme Gradient Boost': -np.round(xgb_scores_cyc.mean(), 4),
    'Support Vector Machine': -np.round(svm_scores_cyc.mean(), 4),
    'Multi-Layer Perceptron': -np.round(mlp_scores_cyc.mean(), 4),
}

In [52]:
df_cv_cyc = pd.DataFrame({'Model': cv_dict_cyc.keys(), 'Average MAE': cv_dict_cyc.values()})

### Scoring test data

In [53]:
def reg_scoring(X, y, reg_dict):
    '''
    Objective: Cycles through a dictionary of trained models, using them to make predictions, scores those 
    predictions on MAE, MSE & RMSE, and generates DataFrames of the scores and model predictions respectively
    
    X: DataFrame containing the explanatory variables
    
    y: Target variable
    
    reg_dict: Dictionary of trained/fitted models
    '''
    
    test1_scores = []
    test2_scores = []
    
    df_pred = pd.DataFrame(columns=reg_dict.keys()) # Columns of DF will accord with reg_dict keys
    
    # Loop through Dictionary items
    for key, reg in reg_dict.items():
        
        pred_y = reg.predict(X)
        df_pred[key] = pd.Series(pred_y).transpose()
        
        # Computing test scores for each model
        test1_scores.append(round(mean_absolute_error(y, pred_y), 4))
        test2_scores.append(round(mean_squared_error(y, pred_y, squared=False), 4))
        
    # Generate results DataFrame
    results = pd.DataFrame({'Model': list(reg_dict.keys()), 
                            'Mean Absolute Error': test1_scores,
                            'Root Mean Squared Error': test2_scores
                            })
    
    # Add target variable to the DataFrame of predictions
    df_pred['Target'] = y.tolist()
    
    return results, df_pred


In [54]:
# Dictionary of TRAINED models
reg_dict = {
    'Lasso Regression': rs_las,
    'Ridge Regression': rs_rdg,
    'Random Forest': rs_rf,
    'Xtreme Gradient Boost': rs_xgb,
    'Support Vector Machine': rs_svm,
    'Multi-Layer Perceptron': rs_mlp
}

In [55]:
scores_cyc, df_pred_cyc = reg_scoring(X_cyct, y_cyct, reg_dict)

In [56]:
df_pred_cyc['date'] = df_cyct.index

In [57]:
#df_pred_cyc.to_csv(r'C:\\ny_pred_cyc.csv', index=False)

# Scores

### Cross-validation scores

In [58]:
df_cv_cyc

Unnamed: 0,Model,Average MAE
0,Lasso Regression,396.0423
1,Ridge Regression,396.4415
2,Random Forest,137.3749
3,Xtreme Gradient Boost,120.938
4,Support Vector Machine,1073.7258
5,Multi-Layer Perceptron,255.7837


### Test data scores

In [59]:
scores_cyc

Unnamed: 0,Model,Mean Absolute Error,Root Mean Squared Error
0,Lasso Regression,382.1898,480.1887
1,Ridge Regression,382.3853,480.4957
2,Random Forest,131.4494,175.9876
3,Xtreme Gradient Boost,116.8413,156.4439
4,Support Vector Machine,323.2023,423.3375
5,Multi-Layer Perceptron,382.1247,480.4495
