In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import pickle
import pickle5
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score   
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import folium
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.ensemble import GradientBoostingRegressor

## block warning
import warnings
warnings.filterwarnings('ignore')

In [2]:
with open("../data/ut_poiV6.pkl", "rb") as fh:
    df = pickle5.load(fh)
    
df.head()

Unnamed: 0,life_time,datetime_start,datetime_end,latitude,longitude,distance_center_km,Station_Name,weekday_b,weekday_b_name,hour_b,...,near_inactivity_6H,near_charge_events_6H,service,entertainment,food,childcare,medical,education,parking,waste-management
0,928.5,2018-01-31 19:51:00,2018-02-01 11:19:30,40.018482,-105.281066,0.152203,COMM VITALITY / 1104 SPRUCE1,2,Wednesday,19,...,0.0,0,0.057343,0.109796,0.073649,3.517121,0.466518,0.631771,0.021832,1.145959
1,363.5,2018-02-01 14:03:00,2018-02-01 20:06:30,40.018482,-105.281066,0.152203,COMM VITALITY / 1104 SPRUCE1,3,Thursday,14,...,0.0,0,0.057343,0.109796,0.073649,3.517121,0.466518,0.631771,0.021832,1.145959
2,6828.5,2018-02-01 21:15:00,2018-02-06 15:03:30,40.018482,-105.281066,0.152203,COMM VITALITY / 1104 SPRUCE1,3,Thursday,21,...,0.0,0,0.057343,0.109796,0.073649,3.517121,0.466518,0.631771,0.021832,1.145959
3,5871.5,2018-02-06 15:27:00,2018-02-10 17:18:30,40.018482,-105.281066,0.152203,COMM VITALITY / 1104 SPRUCE1,1,Tuesday,15,...,0.0,0,0.057343,0.109796,0.073649,3.517121,0.466518,0.631771,0.021832,1.145959
4,1454.5,2018-02-10 18:26:00,2018-02-11 18:40:30,40.018482,-105.281066,0.152203,COMM VITALITY / 1104 SPRUCE1,5,Saturday,18,...,0.0,0,0.057343,0.109796,0.073649,3.517121,0.466518,0.631771,0.021832,1.145959


In [3]:
df.shape

(12711, 71)

### Remove events
The features we have constructed means that we automatically will loose some data. To use the lagged feature we need to remove the 3 first observations. In the code below we check if the sum of the 3 first life times are above 3 hours. If they are then we do not need to remove additinal variables due to the 3 hours.

In [4]:
names = df['Station_Name'].unique()

In [5]:
for name in names:
    temp = df[df['Station_Name'] == name]
    rows = temp[temp['lag3'].isna()]
    if rows.life_time.sum() < 3*60:
        print("The station has the lagged")
        print(name)

# remove rows
df = df.dropna()

The station has the lagged
BOULDER / REC CENTER
The station has the lagged
BOULDER / ATRIUM ST1
The station has the lagged
BOULDER / ALPINE ST1
The station has the lagged
COMM VITALITY / 1400 WALNUT1
The station has the lagged
BOULDER / FACILITIES ST1
The station has the lagged
COMM VITALITY / 1500PEARL
The station has the lagged
BOULDER / JUNCTION ST1
The station has the lagged
COMM VITALITY / BOULDER JCTN
The station has the lagged
COMM VITALITY / 1100WALNUT1
The station has the lagged
BOULDER / BOULDER PARK
The station has the lagged
COMM VITALITY / 2200 BROADWAY1
The station has the lagged
BOULDER / EAST REC
The station has the lagged
BOULDERJUNCTION / JUNCTION ST1


In [6]:
df.shape

(12699, 71)

In [7]:
## Limit to only top 10 stations
temp = df.groupby(['Station_Name']).count().latitude # group the data for each station
names10 = temp.sort_values(ascending=False)[0:10]
names10 = names10.index.values

df = df[df['Station_Name'].isin(names10)]
df = df.reset_index(drop=True)

In [8]:
df.shape

(10762, 71)

## Remove outliers
Use boxplot outlier definition

In [9]:
# remove outliers
drop_index = [] # list of indexes to keep
names = df['Station_Name'].unique()

for name in names:
    temp = df[df['Station_Name'] == name]
    # Get interquantile ranges
    Q1, Q3 = temp.life_time.quantile([0.25, 0.75])
    IQR = Q3-Q1
    minimum = Q1 - 1.5*IQR
    maximum = Q3 + 1.5*IQR
    # Define observations which should be removed
    temp2 = df[(df['life_time'] < minimum) | (df['life_time'] > maximum) &
                  (df['Station_Name'] == name)]
    print("{n}: {s} ({p} %)".format(n=name, 
                                    s=temp2.shape[0], 
                                    p=round((temp2.shape[0]/temp.shape[0])*100,2)))
    # Add the indexes which should be dropped
    drop_index.extend(list(temp2.index))
    
print("\nThe total amount of lost events: {n} ({p} %)".format(n=len(drop_index), 
                                                              p=round(len(drop_index)/df.shape[0]*100,2)))

df = df.drop(drop_index)
df = df.reset_index(drop=True)

COMM VITALITY / 1104 SPRUCE1: 57 (4.4 %)
COMM VITALITY / 1000WALNUT: 48 (3.32 %)
BOULDER / REC CENTER: 40 (3.51 %)
BOULDER / BASELINE ST1: 42 (3.53 %)
BOULDER / ATRIUM ST1: 65 (6.58 %)
COMM VITALITY / 1400 WALNUT1: 38 (7.29 %)
COMM VITALITY / 1500PEARL: 41 (4.5 %)
COMM VITALITY / BOULDER JCTN: 40 (5.28 %)
COMM VITALITY / 1100WALNUT1: 39 (3.51 %)
BOULDER / N BOULDER REC 1: 29 (2.07 %)

The total amount of lost events: 439 (4.08 %)


In [10]:
df.shape

(10323, 71)

# Modeling

In [11]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true))*100

In [12]:
## To test model
def test_model(y_test, y_pred):
    MAE = metrics.mean_absolute_error(y_test, y_pred)
    #print('MAE (Mean Absolute Error):', MAE)
    MSE = metrics.mean_squared_error(y_test, y_pred)
    #print('MSE (Mean Squared Error):', MSE)
    RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    #print('RMSE (Root Mean Squared Error):', RMSE)
    MAPE = mean_absolute_percentage_error(y_test, y_pred)
    
    NRMSE = RMSE/np.mean(y_test)
    return MAE, RMSE, MSE, MAPE, NRMSE

In [15]:
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
positions = [0,1,2,3,4,5,6]

### Prepare data

Add aggregated features for modeling purposes. 

In [17]:
def hour_label(X):
    if (X >= 7) & (X <= 10):
        return 'Morning'
    elif (X >= 11) & (X <= 14):
        return 'Midday'
    elif (X >= 15) & (X <= 18):
        return 'Afternoon'
    elif (X >= 19) or (X < 1):
        return 'Evening'
    elif (X >= 1) & (X <= 6):
        return 'Night'

In [18]:
## Add time of day and day of week
df['tod'] = df['hour_b'].apply(hour_label)

In [19]:
df = df[df['tod'] != 'Night']

Limit to 4 stations

In [20]:
station_subset = ['BOULDER / N BOULDER REC 1', 'COMM VITALITY / 1000WALNUT', 
        'COMM VITALITY / 1104 SPRUCE1', 'BOULDER / BASELINE ST1']
df_stat = df[df['Station_Name'].isin(station_subset)]

Add dummies

In [21]:
# Categorical features
columns_categorical = ['weekday_b_name','tod','hour_b','Station_Name']
df_stat = pd.get_dummies(df_stat, columns=columns_categorical, drop_first=True)

The models are renamed to make coding easier:
- M1: Baseline
- M2: Activity
- M3: Temporal
- M4: Full

In [22]:
## M1
features1 = ['weekday_b_name_Monday', 'weekday_b_name_Saturday',
             'weekday_b_name_Sunday', 'weekday_b_name_Thursday',
             'weekday_b_name_Tuesday', 'weekday_b_name_Wednesday',
             'tod_Evening','tod_Midday', 'tod_Morning',
             'Station_Name_BOULDER / N BOULDER REC 1',
             'Station_Name_COMM VITALITY / 1000WALNUT',
             'Station_Name_COMM VITALITY / 1104 SPRUCE1'] # dow + tod

## M5
features2_cat = ['weekday_b_name_Monday', 'weekday_b_name_Saturday',
                 'weekday_b_name_Sunday', 'weekday_b_name_Thursday',
                 'weekday_b_name_Tuesday', 'weekday_b_name_Wednesday',
                 'tod_Evening','tod_Midday', 'tod_Morning',
                 'Station_Name_BOULDER / N BOULDER REC 1',
                 'Station_Name_COMM VITALITY / 1000WALNUT',
                 'Station_Name_COMM VITALITY / 1104 SPRUCE1']
features2_con = ['lag1', 'lag2', 'lag3','near_charge_time_4H', 'near_charge_energy_4H',
                 'charge_time_4H', 'charge_energy_4H','service'] # dow + agg. tod + lag + activ.

## M5
features3_cat = ['weekday_b_name_Monday', 'weekday_b_name_Saturday',
                 'weekday_b_name_Sunday', 'weekday_b_name_Thursday',
                 'weekday_b_name_Tuesday', 'weekday_b_name_Wednesday',
                 'tod_Evening','tod_Midday', 'tod_Morning',
                 'Station_Name_BOULDER / N BOULDER REC 1',
                 'Station_Name_COMM VITALITY / 1000WALNUT',
                 'Station_Name_COMM VITALITY / 1104 SPRUCE1']
features3_con = ['service', 'entertainment', 'food', 
                 'childcare', 'medical', 'education','parking', 'waste-management'] # dow + agg. tod + lag + activ.

## M5
features4_cat = ['weekday_b_name_Monday', 'weekday_b_name_Saturday',
                 'weekday_b_name_Sunday', 'weekday_b_name_Thursday',
                 'weekday_b_name_Tuesday', 'weekday_b_name_Wednesday',
                 'tod_Evening','tod_Midday', 'tod_Morning',
                 'Station_Name_BOULDER / N BOULDER REC 1',
                 'Station_Name_COMM VITALITY / 1000WALNUT',
                 'Station_Name_COMM VITALITY / 1104 SPRUCE1']
features4_con = ['lag1', 'lag2', 'lag3','near_charge_time_4H', 'near_charge_energy_4H',
                 'charge_time_4H', 'charge_energy_4H','service', 'entertainment', 'food', 
                 'childcare', 'medical', 'education','parking', 'waste-management'] # dow + agg. tod + lag + activ.


Test train split

In [23]:
split = 0.8

In [24]:
# choose data
df_stat = df_stat.sort_values(by=['datetime_start'])

In [25]:
train, test = np.split(df_stat, [int(split * len(df_stat))])
print("Training shape:",train.shape)
print("Testing shape:", test.shape)

Training shape: (4088, 97)
Testing shape: (1022, 97)


In [26]:
# Get X
#M1 - Baseline
X_train1, X_test1 = train[features1], test[features1]
# M2 - Activity
X_train2, X_test2 = train[features2_cat+features2_con], test[features2_cat+features2_con]
scaler = StandardScaler()
X_train2[features2_con] = scaler.fit_transform(X_train2[features2_con])
X_test2[features2_con] = scaler.transform(X_test2[features2_con])
# M3 - Spatial
X_train3, X_test3 = train[features3_cat+features3_con], test[features3_cat+features3_con]
scaler = StandardScaler()
X_train3[features3_con] = scaler.fit_transform(X_train3[features3_con])
X_test3[features3_con] = scaler.transform(X_test3[features3_con])
# M4 - Full
X_train4, X_test4 = train[features4_cat+features4_con], test[features4_cat+features4_con]
scaler = StandardScaler()
X_train4[features4_con] = scaler.fit_transform(X_train4[features4_con])
X_test4[features4_con] = scaler.transform(X_test4[features4_con])

## Get y - train
y_train, y_test = train['life_time'], test['life_time']

In [29]:
results = {}

In [30]:
N_bootstraps = 20

### LINEAR REGRESSION

In [31]:
grid_lr = {'alpha':[0.001,0.01,0.1,0.5,1,3,5,7,10,13,15,20], 'fit_intercept':['True','False']}
results['LR'] = {}

#### Linear regression - model 1

In [32]:
results['LR']['M1'] = {}

## Find hyperparam
reg = Ridge(random_state=42)
reg_gs = RandomizedSearchCV(reg, grid_lr, random_state=42, cv=5, 
                            n_iter=50, scoring='neg_mean_squared_error')
reg_gs.fit(X_train1,y_train)
param = reg_gs.best_params_
print("Best parameters:", param)

# Fit model
reg = Ridge(random_state=42, alpha=param['alpha'])
reg.fit(X_train1,y_train)

## Predict
y_test_pred = reg.predict(X_test1)
y_train_pred = reg.predict(X_train1)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(y_test,y_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(y_train,y_train_pred)

## Save results
results['LR']['M1'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                          'MAE_train': MAE_train,'MAE_test':MAE_test,
                          'rsq_train':reg.score(X_train1,y_train), 
                          'rsq_test':reg.score(X_test1,y_test),
                          'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                          'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train}

Best parameters: {'fit_intercept': 'True', 'alpha': 15}


#### LR - Model 2

In [33]:
results['LR']['M2'] = {}

## Find hyperparam
reg = Ridge(random_state=42)
reg_gs = RandomizedSearchCV(reg, grid_lr, random_state=42, cv=5, 
                            n_iter=50, scoring='neg_mean_squared_error')
reg_gs.fit(X_train2,y_train)
param = reg_gs.best_params_
print("Best parameters:", param)

# Fit model
reg = Ridge(random_state=42, alpha=param['alpha'])
reg.fit(X_train2,y_train)

## Predict
y_test_pred = reg.predict(X_test2)
y_train_pred = reg.predict(X_train2)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(y_test,y_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(y_train,y_train_pred)

## Save results
results['LR']['M2'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                          'MAE_train': MAE_train,'MAE_test':MAE_test,
                          'rsq_train':reg.score(X_train2,y_train), 
                          'rsq_test':reg.score(X_test2,y_test),
                          'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                          'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train}

Best parameters: {'fit_intercept': 'True', 'alpha': 20}


#### LR - Model 3

In [34]:
results['LR']['M3'] = {}

## Find hyperparam
reg = Ridge(random_state=42)
reg_gs = RandomizedSearchCV(reg, grid_lr, random_state=42, cv=5, 
                            n_iter=50, scoring='neg_mean_squared_error')
reg_gs.fit(X_train3,y_train)
param = reg_gs.best_params_
print("Best parameters:", param)

# Fit model
reg = Ridge(random_state=42, alpha=param['alpha'])
reg.fit(X_train3,y_train)

## Predict
y_test_pred = reg.predict(X_test3)
y_train_pred = reg.predict(X_train3)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(y_test,y_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(y_train,y_train_pred)

## Save results
results['LR']['M3'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                          'MAE_train': MAE_train,'MAE_test':MAE_test,
                          'rsq_train':reg.score(X_train3,y_train), 
                          'rsq_test':reg.score(X_test3,y_test),
                          'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                          'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train}

Best parameters: {'fit_intercept': 'True', 'alpha': 20}


#### LR - Model 4

In [35]:
results['LR']['M4'] = {}

## Find hyperparam
reg = Ridge(random_state=42)
reg_gs = RandomizedSearchCV(reg, grid_lr, random_state=42, cv=5, 
                            n_iter=50, scoring='neg_mean_squared_error')
reg_gs.fit(X_train4,y_train)
param = reg_gs.best_params_
print("Best parameters:", param)

# Fit model
reg = Ridge(random_state=42, alpha=param['alpha'])
reg.fit(X_train4,y_train)

## Predict
y_test_pred = reg.predict(X_test4)
y_train_pred = reg.predict(X_train4)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(y_test,y_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(y_train,y_train_pred)

## Save results
results['LR']['M4'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                          'MAE_train': MAE_train,'MAE_test':MAE_test,
                          'rsq_train':reg.score(X_train4,y_train), 
                          'rsq_test':reg.score(X_test4,y_test),
                          'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                          'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train}

Best parameters: {'fit_intercept': 'True', 'alpha': 20}


### RANDOM FOREST REGRESSOR

In [36]:
grid_RF = {'n_estimators':[100,300,500,1000], 'min_samples_split':[2,3,5,7],
           'min_samples_leaf':[3,5,10,15], 'max_depth':[2,3,5,7]}
results['RF'] = {}

#### Random forest regressor - model 1

In [37]:
results['RF']['M1'] = {}

## Find hyperparam
reg = RandomForestRegressor(random_state=42)
reg_gs = RandomizedSearchCV(reg, grid_RF, random_state=42, cv=5, 
                            n_iter=50, scoring='neg_mean_squared_error')
reg_gs.fit(X_train1,y_train)
param = reg_gs.best_params_
print("Best parameters:", param)
reg = RandomForestRegressor(random_state=42, 
                            min_samples_split=param['min_samples_split'],
                            min_samples_leaf=param['min_samples_leaf'],
                            max_depth=param['max_depth'],
                            n_estimators=param['n_estimators'])    
reg.fit(X_train1, y_train) 

## Predict
y_test_pred = reg.predict(X_test1)
y_train_pred = reg.predict(X_train1)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(y_test,y_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(y_train,y_train_pred)

## Save results
results['RF']['M1'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                          'MAE_train': MAE_train,'MAE_test':MAE_test,
                          'rsq_train':reg.score(X_train1,y_train), 
                          'rsq_test':reg.score(X_test1,y_test),
                          'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                          'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train}

Best parameters: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 15, 'max_depth': 7}


#### RF - M2

In [38]:
results['RF']['M2'] = {}

reg = RandomForestRegressor(random_state=42)
reg_gs = RandomizedSearchCV(reg, grid_RF, random_state=42, cv=5, 
                            n_iter=50, scoring='neg_mean_squared_error')
reg_gs.fit(X_train2,y_train)
param = reg_gs.best_params_
print("Best parameters:", param)
reg = RandomForestRegressor(random_state=42, 
                            min_samples_split=param['min_samples_split'],
                            min_samples_leaf=param['min_samples_leaf'],
                            max_depth=param['max_depth'],
                            n_estimators=param['n_estimators'])    
reg.fit(X_train2, y_train) 

## Predict
y_test_pred = reg.predict(X_test2)
y_train_pred = reg.predict(X_train2)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(y_test,y_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(y_train,y_train_pred)

## Save results
results['RF']['M2'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                          'MAE_train': MAE_train,'MAE_test':MAE_test,
                          'rsq_train':reg.score(X_train2,y_train), 
                          'rsq_test':reg.score(X_test2,y_test),
                          'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                          'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train}

Best parameters: {'n_estimators': 300, 'min_samples_split': 7, 'min_samples_leaf': 15, 'max_depth': 7}


#### RF - M3

In [39]:
results['RF']['M3'] = {}

reg = RandomForestRegressor(random_state=42)
reg_gs = RandomizedSearchCV(reg, grid_RF, random_state=42, cv=5, 
                            n_iter=50, scoring='neg_mean_squared_error')
reg_gs.fit(X_train3,y_train)
param = reg_gs.best_params_
print("Best parameters:", param)
reg = RandomForestRegressor(random_state=42, 
                            min_samples_split=param['min_samples_split'],
                            min_samples_leaf=param['min_samples_leaf'],
                            max_depth=param['max_depth'],
                            n_estimators=param['n_estimators'])    
reg.fit(X_train3, y_train) 

## Predict
y_test_pred = reg.predict(X_test3)
y_train_pred = reg.predict(X_train3)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(y_test,y_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(y_train,y_train_pred)

## Save results
results['RF']['M3'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                          'MAE_train': MAE_train,'MAE_test':MAE_test,
                          'rsq_train':reg.score(X_train3,y_train), 
                          'rsq_test':reg.score(X_test3,y_test),
                          'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                          'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train}

Best parameters: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 15, 'max_depth': 7}


#### RF - M4

In [40]:
results['RF']['M4'] = {}

reg = RandomForestRegressor(random_state=42)
reg_gs = RandomizedSearchCV(reg, grid_RF, random_state=42, cv=5, 
                            n_iter=50, scoring='neg_mean_squared_error')
reg_gs.fit(X_train4,y_train)
param = reg_gs.best_params_
print("Best parameters:", param)
reg = RandomForestRegressor(random_state=42, 
                            min_samples_split=param['min_samples_split'],
                            min_samples_leaf=param['min_samples_leaf'],
                            max_depth=param['max_depth'],
                            n_estimators=param['n_estimators'])    
reg.fit(X_train4, y_train) 

## Predict
y_test_pred = reg.predict(X_test4)
y_train_pred = reg.predict(X_train4)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(y_test,y_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(y_train,y_train_pred)

## Save results
results['RF']['M4'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                          'MAE_train': MAE_train,'MAE_test':MAE_test,
                          'rsq_train':reg.score(X_train4,y_train), 
                          'rsq_test':reg.score(X_test4,y_test),
                          'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                          'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train}

Best parameters: {'n_estimators': 500, 'min_samples_split': 3, 'min_samples_leaf': 10, 'max_depth': 7}


### MLP

In [42]:
grid_MLP = {'alpha':[0.0001,0.001,0.1,1,2,5,10], 'hidden_layer_sizes':
            [(32),(32,32),(32,32),(32,32,32,32),(64),(64,64),(64,64,64),(64,64,64,64),
             (128),(128,128),(128,128,128),(128,128,128,128),(256),(256,256),(256,256,256),
             (256,256,256,256)],
            'activation': ['tanh', 'relu', 'logistic']}
results['MLP'] = {}

#### MLP - Model 1

In [43]:
results['MLP']['M1'] = {}

## Find hyperparam
reg = MLPRegressor(random_state=42, max_iter=500, early_stopping=True)
reg_gs = RandomizedSearchCV(reg, grid_MLP, random_state=42, n_iter=50,
                            cv=5, scoring='neg_mean_squared_error')
reg_gs.fit(X_train1,y_train)
param = reg_gs.best_params_
print("Best parameters:", param)
reg = MLPRegressor(random_state=42, alpha=param['alpha'], 
                   hidden_layer_sizes=param['hidden_layer_sizes'],
                   activation = param['activation'],max_iter=500, early_stopping=True)
reg.fit(X_train1, y_train)  

## Predict
y_test_pred = reg.predict(X_test1)
y_train_pred = reg.predict(X_train1)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(y_test,y_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(y_train,y_train_pred)

## Save results
results['MLP']['M1'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                          'MAE_train': MAE_train,'MAE_test':MAE_test,
                          'rsq_train':reg.score(X_train1,y_train), 
                          'rsq_test':reg.score(X_test1,y_test),
                          'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                          'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train}

Best parameters: {'hidden_layer_sizes': (128, 128, 128, 128), 'alpha': 0.001, 'activation': 'relu'}


#### MLP - M2

In [44]:
results['MLP']['M2'] = {}

reg = MLPRegressor(random_state=42, max_iter=500, early_stopping=True)
reg_gs = RandomizedSearchCV(reg, grid_MLP, random_state=42, n_iter=50,
                            cv=5, scoring='neg_mean_squared_error')
reg_gs.fit(X_train2,y_train)
param = reg_gs.best_params_
print("Best parameters:", param)
reg = MLPRegressor(random_state=42, alpha=param['alpha'], 
                   hidden_layer_sizes=param['hidden_layer_sizes'],
                   activation = param['activation'],max_iter=500, early_stopping=True)
reg.fit(X_train2, y_train)

## Predict
y_test_pred = reg.predict(X_test2)
y_train_pred = reg.predict(X_train2)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(y_test,y_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(y_train,y_train_pred)

## Save results
results['MLP']['M2'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                          'MAE_train': MAE_train,'MAE_test':MAE_test,
                          'rsq_train':reg.score(X_train2,y_train), 
                          'rsq_test':reg.score(X_test2,y_test),
                          'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                          'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train}

Best parameters: {'hidden_layer_sizes': 256, 'alpha': 10, 'activation': 'tanh'}


#### MLP - M3

In [45]:
results['MLP']['M3'] = {}

reg = MLPRegressor(random_state=42, max_iter=500, early_stopping=True)
reg_gs = RandomizedSearchCV(reg, grid_MLP, random_state=42, n_iter=50,
                            cv=5, scoring='neg_mean_squared_error')
reg_gs.fit(X_train3,y_train)
param = reg_gs.best_params_
print("Best parameters:", param)
reg = MLPRegressor(random_state=42, alpha=param['alpha'], 
                   hidden_layer_sizes=param['hidden_layer_sizes'],
                   activation = param['activation'],max_iter=500, early_stopping=True)
reg.fit(X_train3, y_train)

## Predict
y_test_pred = reg.predict(X_test3)
y_train_pred = reg.predict(X_train3)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(y_test,y_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(y_train,y_train_pred)

## Save results
results['MLP']['M3'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                          'MAE_train': MAE_train,'MAE_test':MAE_test,
                          'rsq_train':reg.score(X_train3,y_train), 
                          'rsq_test':reg.score(X_test3,y_test),
                          'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                          'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train}

Best parameters: {'hidden_layer_sizes': (64, 64, 64, 64), 'alpha': 5, 'activation': 'relu'}


#### MLP - M4

In [46]:
results['MLP']['M4'] = {}

reg = MLPRegressor(random_state=42, max_iter=500, early_stopping=True)
reg_gs = RandomizedSearchCV(reg, grid_MLP, random_state=42, n_iter=50,
                            cv=5, scoring='neg_mean_squared_error')
reg_gs.fit(X_train4,y_train)
param = reg_gs.best_params_
print("Best parameters:", param)
reg = MLPRegressor(random_state=42, alpha=param['alpha'], 
                   hidden_layer_sizes=param['hidden_layer_sizes'],
                   activation = param['activation'],max_iter=500, early_stopping=True)
reg.fit(X_train4, y_train)

## Predict

y_test_pred = reg.predict(X_test4)
y_train_pred = reg.predict(X_train4)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(y_test,y_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(y_train,y_train_pred)

## Save results
results['MLP']['M4'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                          'MAE_train': MAE_train,'MAE_test':MAE_test,
                          'rsq_train':reg.score(X_train4,y_train), 
                          'rsq_test':reg.score(X_test4,y_test),
                          'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                          'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train}

Best parameters: {'hidden_layer_sizes': 256, 'alpha': 0.001, 'activation': 'relu'}


### GRADIANT BOOST

In [47]:
grid_GB = {'max_depth':[2,3,5,10],'n_estimators':[100,300,500,1000], 
           'learning_rate':[0.001,0.01,0.1,0.5,0.9], 'min_samples_split':[2,3,5,7],
           'min_samples_leaf':[3,5,10,15]}
results['GB'] = {}

#### Gradiant boosting - M1

In [48]:
results['GB']['M1'] = {}

reg = GradientBoostingRegressor(random_state=42)
reg_gs = RandomizedSearchCV(reg, grid_GB, random_state=42, n_iter = 50, 
                            cv=5, scoring='neg_mean_squared_error')
reg_gs.fit(X_train1,y_train)
param = reg_gs.best_params_
print("Best parameters:", param)
reg = GradientBoostingRegressor(random_state=42, 
                                max_depth=param['max_depth'],
                                min_samples_split=param['min_samples_split'],
                                min_samples_leaf=param['min_samples_leaf'],
                                n_estimators=param['n_estimators'], 
                                learning_rate=param['learning_rate'])
reg.fit(X_train1, y_train)

## Predict
y_test_pred = reg.predict(X_test1)
y_train_pred = reg.predict(X_train1)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(y_test,y_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(y_train,y_train_pred)

## Save results
results['GB']['M1'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                          'MAE_train': MAE_train,'MAE_test':MAE_test,
                          'rsq_train':reg.score(X_train1,y_train), 
                          'rsq_test':reg.score(X_test1,y_test),
                          'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                          'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train}

Best parameters: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_depth': 2, 'learning_rate': 0.5}


#### GB - M2

In [49]:
results['GB']['M2'] = {}

reg = GradientBoostingRegressor(random_state=42)
reg_gs = RandomizedSearchCV(reg, grid_GB, random_state=42, n_iter = 50, 
                            cv=5, scoring='neg_mean_squared_error')
reg_gs.fit(X_train2,y_train)
param = reg_gs.best_params_
print("Best parameters:", param)
reg = GradientBoostingRegressor(random_state=42, 
                                max_depth=param['max_depth'],
                                min_samples_split=param['min_samples_split'],
                                min_samples_leaf=param['min_samples_leaf'],
                                n_estimators=param['n_estimators'], 
                                learning_rate=param['learning_rate'])
reg.fit(X_train2, y_train)

## Predict
y_test_pred = reg.predict(X_test2)
y_train_pred = reg.predict(X_train2)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(y_test,y_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(y_train,y_train_pred)

## Save results
results['GB']['M2'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                          'MAE_train': MAE_train,'MAE_test':MAE_test,
                          'rsq_train':reg.score(X_train2,y_train), 
                          'rsq_test':reg.score(X_test2,y_test),
                          'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                          'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train}

Best parameters: {'n_estimators': 300, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_depth': 5, 'learning_rate': 0.01}


#### GB - M3

In [50]:
results['GB']['M3'] = {}

reg = GradientBoostingRegressor(random_state=42)
reg_gs = RandomizedSearchCV(reg, grid_GB, random_state=42, n_iter = 50, 
                            cv=5, scoring='neg_mean_squared_error')
reg_gs.fit(X_train3,y_train)
param = reg_gs.best_params_
print("Best parameters:", param)
reg = GradientBoostingRegressor(random_state=42, 
                                max_depth=param['max_depth'],
                                min_samples_split=param['min_samples_split'],
                                min_samples_leaf=param['min_samples_leaf'],
                                n_estimators=param['n_estimators'], 
                                learning_rate=param['learning_rate'])
reg.fit(X_train3, y_train)

## Predict
y_test_pred = reg.predict(X_test3)
y_train_pred = reg.predict(X_train3)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(y_test,y_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(y_train,y_train_pred)

## Save results
results['GB']['M3'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                          'MAE_train': MAE_train,'MAE_test':MAE_test,
                          'rsq_train':reg.score(X_train3,y_train), 
                          'rsq_test':reg.score(X_test3,y_test),
                          'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                          'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train}

Best parameters: {'n_estimators': 300, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_depth': 5, 'learning_rate': 0.01}


#### GB - M4

In [51]:
results['GB']['M4'] = {}

reg = GradientBoostingRegressor(random_state=42)
reg_gs = RandomizedSearchCV(reg, grid_GB, random_state=42, n_iter = 50, 
                            cv=5, scoring='neg_mean_squared_error') 
reg_gs.fit(X_train4,y_train)
param = reg_gs.best_params_
print("Best parameters:", param)
reg = GradientBoostingRegressor(random_state=42, 
                                max_depth=param['max_depth'],
                                min_samples_split=param['min_samples_split'],
                                min_samples_leaf=param['min_samples_leaf'],
                                n_estimators=param['n_estimators'], 
                                learning_rate=param['learning_rate'])
reg.fit(X_train4, y_train)

## Predict
y_test_pred = reg.predict(X_test4)
y_train_pred = reg.predict(X_train4)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(y_test,y_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(y_train,y_train_pred)

## Save results
results['GB']['M4'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                          'MAE_train': MAE_train,'MAE_test':MAE_test,
                          'rsq_train':reg.score(X_train4,y_train), 
                          'rsq_test':reg.score(X_test4,y_test),
                          'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                          'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train}

Best parameters: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_depth': 3, 'learning_rate': 0.01}


## Results

In [54]:
methods = ['LR', 'RF', 'MLP', 'GB']
models = ['M1','M2','M3','M4']

for me in methods:
    for mo in models:
        r = results[me][mo]
        RMSEtest = r['RMSE_test']
        R2test = r['rsq_test']
        MAEtest = r['MAE_test']
        MAPE = r['MAPE_test']
        NRMSE = r['NRMSE_test']
        
        print("{} - {}:\n R2: {} RMSE: {} MAE: {} MAPE: {} NRMSE: {}".format(me,mo,
                                                                              round(np.mean(R2test),2),
                                                                              round(np.mean(RMSEtest),0),
                                                                              round(np.mean(MAEtest),0),
                                                                              round(np.mean(MAPE),0),
                                                                              round(np.mean(NRMSE),2)))        
    print("\n")

LR - M1:
 R2: 0.21 RMSE: 424.0 MAE: 328.0 MAPE: 1872.0 NRMSE: 0.92
LR - M2:
 R2: 0.2 RMSE: 426.0 MAE: 328.0 MAPE: 1817.0 NRMSE: 0.92
LR - M3:
 R2: 0.22 RMSE: 423.0 MAE: 327.0 MAPE: 1862.0 NRMSE: 0.92
LR - M4:
 R2: 0.21 RMSE: 425.0 MAE: 327.0 MAPE: 1811.0 NRMSE: 0.92


RF - M1:
 R2: 0.23 RMSE: 419.0 MAE: 321.0 MAPE: 1804.0 NRMSE: 0.91
RF - M2:
 R2: 0.22 RMSE: 421.0 MAE: 323.0 MAPE: 1825.0 NRMSE: 0.91
RF - M3:
 R2: 0.23 RMSE: 419.0 MAE: 320.0 MAPE: 1794.0 NRMSE: 0.91
RF - M4:
 R2: 0.21 RMSE: 424.0 MAE: 323.0 MAPE: 1834.0 NRMSE: 0.92


MLP - M1:
 R2: 0.24 RMSE: 417.0 MAE: 317.0 MAPE: 1744.0 NRMSE: 0.9
MLP - M2:
 R2: 0.21 RMSE: 424.0 MAE: 324.0 MAPE: 1846.0 NRMSE: 0.92
MLP - M3:
 R2: 0.24 RMSE: 416.0 MAE: 320.0 MAPE: 1798.0 NRMSE: 0.9
MLP - M4:
 R2: 0.22 RMSE: 422.0 MAE: 324.0 MAPE: 1755.0 NRMSE: 0.91


GB - M1:
 R2: 0.23 RMSE: 418.0 MAE: 319.0 MAPE: 1815.0 NRMSE: 0.91
GB - M2:
 R2: 0.22 RMSE: 421.0 MAE: 328.0 MAPE: 1890.0 NRMSE: 0.91
GB - M3:
 R2: 0.24 RMSE: 418.0 MAE: 323.0 MAPE: 1822.0 