## 4. Model Validation

- 4.1. Ensemble Modelling: Stacking 
- 4.2. Model Evaluation using Cross Validation

In [None]:
import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
import re
from sklearn.linear_model import ElasticNet
import pickle
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Read the train and test model-input tables.
train_input, test_input = (
    pd.read_csv(csv_path)
    for csv_path in ('data/3_train_model_input.csv', 'data/3_test_model_input.csv')
)

# Features are the first six columns and the target is the last column.
# NOTE(review): selection is positional - confirm the CSV column order is stable.
X_train, y_train = train_input.iloc[:, :6], train_input.iloc[:, -1]
X_test, y_test = test_input.iloc[:, :6], test_input.iloc[:, -1]

### 4.1. Ensemble Modelling: Stacking 

In [3]:
def stack(cv=10):
    """Build the (unfitted) stacked ensemble regressor.

    The level-0 base learners are XGBoost, SVR, gradient boosting, elastic net
    and LightGBM. Their predictions are combined by a level-1
    ``LinearRegression`` meta learner.

    Parameters
    ----------
    cv : int or cross-validation generator, default 10
        Forwarded unchanged to ``StackingRegressor(cv=...)``. The default
        keeps the original 10-fold behaviour.

    Returns
    -------
    StackingRegressor
        The stacking model, not yet fitted.
    """
    # The base learners keep their library defaults on purpose: the tuned
    # hyper-parameters used for the standalone models take too long to run
    # inside the stack. CatBoostRegressor is currently not included.
    level0 = [
        ('XGB', XGBRegressor()),
        ('SVR', SVR()),
        ('GBR', GradientBoostingRegressor()),
        ('ElNET', ElasticNet()),
        ('LGBM', LGBMRegressor()),
    ]
    # Meta learner that combines the base-model predictions.
    level1 = LinearRegression()
    return StackingRegressor(estimators=level0, final_estimator=level1, cv=cv)

### 4.2. Model Evaluation using Cross Validation

In [4]:
def validate_model(X_train, y_train, X_test, y_test) -> pd.DataFrame:
    """Cross-validate, refit, persist and test every candidate model.

    For each candidate (the stacked ensemble plus the individually tuned
    regressors) this function:

    1. runs 5-fold cross-validation repeated 3 times on the training data,
       scoring negative MSE, negative MAE and R^2;
    2. refits the model on the full training set and pickles it to
       ``models/<name>.sav``;
    3. prints the model name followed by its RMSE on the test set.

    Parameters
    ----------
    X_train : training dataframe with features
    y_train : training target (site_eui)
    X_test : testing dataframe with features
    y_test : testing target (site_eui)

    Returns
    -------
    pd.DataFrame
        The ``cross_validate`` results of every model stacked row-wise (one
        row per cross-validation split), with an extra ``model`` column
        holding the model name.
    """
    models = [
        ('Stack', stack()),
        ('ElNet', ElasticNet(alpha=0.0001291549665014884, l1_ratio=0.2, max_iter=200)),
        ('XGB', XGBRegressor(n_estimators=50, max_depth=10, learning_rate=0.1)),
        ('LGBM', LGBMRegressor(num_leaves=500, feature_fraction=0.6, bagging_freq=50,
                               learning_rate=0.05)),
        ('GBR', GradientBoostingRegressor(n_estimators=300, max_depth=4)),
        ('SVR', SVR(C=1000)),
    ]
    scoring = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2']

    # The seed is fixed, so every model is scored on the same splits and a
    # single splitter can be shared instead of being rebuilt per iteration.
    kfold = model_selection.RepeatedKFold(n_splits=5, n_repeats=3, random_state=90210)

    # Make sure the output directory exists before the first pickle is written.
    os.makedirs('models', exist_ok=True)

    dfs = []
    for name, model in models:
        cv_results = model_selection.cross_validate(
            model, X_train, y_train, cv=kfold, scoring=scoring
        )

        # Refit on the full training set; this fitted model is what gets
        # saved and evaluated on the test set.
        model.fit(X_train, y_train)
        with open('models/{}.sav'.format(name), 'wb') as model_file:
            pickle.dump(model, model_file)

        y_pred = model.predict(X_test)
        print(name)
        # RMSE is computed as sqrt(MSE) rather than via the `squared=False`
        # flag, which newer scikit-learn releases no longer accept.
        print(float(np.sqrt(mean_squared_error(y_test, y_pred))))

        fold_scores = pd.DataFrame(cv_results)
        fold_scores['model'] = name
        dfs.append(fold_scores)

    # Concatenate once, after the loop, instead of on every iteration.
    return pd.concat(dfs, ignore_index=True)

In [5]:
# Cross-validate, fit, save and test every model; keep the per-split metrics.
final_result = validate_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Stack
18.764902454425087
ElNet
22.679059614368878
XGB
18.623098596947294
LGBM
18.500645345403143
GBR
19.160293693341192
SVR
19.704045500583575


In [6]:
# Save the cross-validation metrics without the row index.
final_result.to_csv(path_or_buf='data/4_validation_result.csv', index=False)

| **Model** | **RMSE (test set)** |
|---|---|
| Light Gradient Boosting Regressor | 18.501 |
| Gradient Boosting Regressor | 19.160 |
| Support Vector Regressor (Radial) | 19.704 |
| XGBoost Regressor | 18.623 |
| Elastic Net | 22.679 |
| Stacked Ensemble | 18.765 |