# About
Multivariate ML model training. Single step.

# Libraries

In [10]:
%run "/home/cesar/Python_NBs/HDL_Project/HDL_Project/global_fv.ipynb"

User information is ready!


In [11]:
import os

# Save trained models
import joblib

# Data
from sklearn.model_selection import train_test_split

# Nonlinear models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm
from sklearn.gaussian_process import GaussianProcessRegressor

# Ensemble models
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

# Clone of time class
s = t

# Random seed
np.random.seed(10)

os.getcwd()

'/home/cesar/Python_NBs/HDL_Project/HDL_Project/2_Models/Multivariate/ML'

# Global parameters

# User-Defined Functions

In [None]:
# create a dict of standard models to evaluate {name:object}
def define_models():
    models=dict()
    
    # nonlinear models
    models['k-Nearest Neighbors'] = KNeighborsRegressor(weights= 'distance'
                                                        , p=1
                                                        , n_neighbors= 9
                                                        , n_jobs= -1
                                                        , metric= 'manhattan'
                                                        , leaf_size= 29
                                                        , algorithm= 'kd_tree')
    
    
    models['Decision Tree Regressor'] = DecisionTreeRegressor(splitter= 'best'
                                                              , min_weight_fraction_leaf= 0.061224489795918366
                                                              , min_samples_split= 8
                                                              , min_samples_leaf= 6
                                                              , max_depth= 8
                                                              , criterion= 'friedman_mse')
    
    models['Support Vector Regression - Polynomial'] = svm.SVR(kernel='poly')
    
    models['Support Vector Regression - RBF'] = svm.SVR(kernel= 'rbf'
                                                        , gamma= 0.1
                                                        , C= 1000)
    
    models['Support Vector Regression - Linear'] = svm.SVR(kernel='linear'
                                                           , gamma= 0.1
                                                           , C= 100)
    
    # ensemble models
    models['Random Forest'] = RandomForestRegressor(n_estimators= 500
                                                    , min_samples_split= 6
                                                    , max_features= 9
                                                    , max_depth= 8.0)
    
    models['Extra-trees classifier'] = ExtraTreesRegressor(n_estimators= 100
                                                           , min_samples_split= 7
                                                           , max_features= 12
                                                           , max_depth= 29.0
                                                           , criterion= 'squared_error')
    
    models['XG Boost'] = XGBRegressor(subsample= 0.8
                                      , n_estimators= 1000
                                      , max_depth= 3
                                      , learning_rate= 0.01
                                      , colsample_bytree= 0.9
                                      , colsample_bylevel= 0.8)
    X_test
    print( 'Defined %d models:' % len(models))
    print()
    return models

In [3]:
# Evaluate a single model
def single_model_evaluation(X_train, y_train, X_test, y_test, name, model):
    # fit the model
    model.fit(X_train, y_train)
    
    # Save the trained model
    filename = 'trained_ml_models/{}.sav'.format(name)
    joblib.dump(model, filename)
    
    # make predictions
    y_prediction = model.predict(X_test)
    
    metrics = dict()
    # evaluate predictions
    # accuracy = accuracy_score(y_test, y_prediction) * 100
    metrics["RMSE"] = mean_squared_error(y_test, y_prediction, squared=False)
    metrics["MAE"] = mean_absolute_error(y_test, y_prediction)
    metrics["MAPE"] = mean_absolute_percentage_error(y_test, y_prediction)
    metrics["R^2"] = r2_score(y_test, y_prediction)
    metrics["Max Error"] = max_error(y_test, y_prediction)    
    
    return metrics

In [None]:
# Evaluate a dict of models {name:object}, returns {name:score}
def multiple_model_evaluation(X_train, y_train, X_test, y_test, models):
    metrics_df = pd.DataFrame()
    
    for name, model in models.items():
        # evaluate the model
        s.tic()
        tmp_df = pd.DataFrame(single_model_evaluation(X_train, y_train, X_test, y_test, name, model), index=[0])
        tmp_df.insert(0, "Model Name", name, True)
        tmp_df.insert(0, "Type", "ML", True)
        metrics_df = metrics_df.append(tmp_df)
        print("> {}.".format(name))
        s.toc(restart=True)
        
    return metrics_df.reset_index(drop = True)

# Data

## Sample preparation

In [12]:
sql_table = "MVI_sima_station_CE"
target = "pm25"

# Define columns of interest from sql table
#     Select all columns:
column = "*"
#     Select specific columns:
#column = "datetime, prs, rainf, rh, sr, tout, wdr, wsr, " + str(target)

# Filter data with WHERE command
sql_where = "where datetime >=\'2021-12-17\'"
# sql_where = "where datetime >=\'2020-04-20\' and datetime <=\'2021-04-17 23:00:00\'"
#sql_where = "where datetime >= \'2021-03-01\'"

# Initialize class to create multivariate samples:
multi_ts = multivariate_samples(sql_table, target, column, sql_where)

# Datasets can't be trained with sample batches by default. So parameter is 1.
X, y, df = multi_ts.samples_creation(1, target)

X_train, X_test, y_train, y_test = train_test_split(X[:,0,:], y, test_size = 0.30, shuffle= False)

In [13]:
df

Unnamed: 0_level_0,no,no2,nox,o3,pm10,prs,rainf,rh,so2,sr,tout,wdr,wsr,pm25
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2021-12-17 00:00:00,0.153785,0.375000,0.305035,0.172807,0.081967,0.405858,0.0,0.711111,0.030429,0.189237,0.578947,0.089136,0.277273,32.00
2021-12-17 01:00:00,0.077291,0.313073,0.217213,0.172807,0.113388,0.401674,0.0,0.766667,0.041494,0.141711,0.552632,0.988858,0.204545,26.00
2021-12-17 02:00:00,0.050199,0.267202,0.173302,0.172807,0.103825,0.393305,0.0,0.822222,0.038728,0.115780,0.526316,0.952646,0.195455,23.00
2021-12-17 03:00:00,0.038247,0.230505,0.146956,0.181078,0.118852,0.389121,0.0,0.866667,0.030429,0.104923,0.500000,0.949861,0.159091,21.00
2021-12-17 04:00:00,0.022311,0.204128,0.121194,0.189350,0.107923,0.376569,0.0,0.900000,0.026279,0.126368,0.500000,0.008357,0.200000,33.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-17 19:00:00,0.009246,0.059633,0.038056,0.487140,0.092896,0.271967,0.0,0.433333,0.076072,0.057243,0.894737,0.208914,0.645455,38.59
2022-04-17 20:00:00,0.009667,0.081422,0.049180,0.420965,0.092896,0.322176,0.0,0.455556,0.060858,0.006071,0.815789,0.136490,0.636364,33.83
2022-04-17 21:00:00,0.008765,0.169725,0.093677,0.379605,0.120219,0.372385,0.0,0.455556,0.060858,0.006071,0.763158,0.827298,0.163636,22.25
2022-04-17 22:00:00,0.008765,0.139908,0.077869,0.379605,0.075137,0.389121,0.0,0.500000,0.058091,0.006071,0.763158,0.264624,0.277273,25.19


In [27]:
#df.iloc[:2049, ]
df.iloc[2050:, ]

Unnamed: 0_level_0,no,no2,nox,o3,pm10,prs,rainf,rh,so2,sr,tout,wdr,wsr,pm25
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2022-03-12 10:00:00,0.141833,0.255734,0.235363,0.338245,0.077869,0.907950,0.0,0.355556,0.143845,0.227491,0.263158,0.189415,0.322727,17.40
2022-03-12 11:00:00,0.091633,0.183486,0.161593,0.404421,0.084699,0.899582,0.0,0.333333,0.092669,0.206825,0.289474,0.225627,0.345455,15.88
2022-03-12 12:00:00,0.051793,0.133028,0.106557,0.478868,0.080601,0.882845,0.0,0.300000,0.044260,0.768739,0.315789,0.125348,0.422727,15.59
2022-03-12 13:00:00,0.039044,0.133028,0.097190,0.528500,0.087432,0.845188,0.0,0.277778,0.045643,0.817944,0.342105,0.214485,0.486364,16.63
2022-03-12 14:00:00,0.035060,0.128440,0.091920,0.561587,0.084699,0.803347,0.0,0.266667,0.044260,0.824832,0.394737,0.155989,0.550000,19.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-17 19:00:00,0.009246,0.059633,0.038056,0.487140,0.092896,0.271967,0.0,0.433333,0.076072,0.057243,0.894737,0.208914,0.645455,38.59
2022-04-17 20:00:00,0.009667,0.081422,0.049180,0.420965,0.092896,0.322176,0.0,0.455556,0.060858,0.006071,0.815789,0.136490,0.636364,33.83
2022-04-17 21:00:00,0.008765,0.169725,0.093677,0.379605,0.120219,0.372385,0.0,0.455556,0.060858,0.006071,0.763158,0.827298,0.163636,22.25
2022-04-17 22:00:00,0.008765,0.139908,0.077869,0.379605,0.075137,0.389121,0.0,0.500000,0.058091,0.006071,0.763158,0.264624,0.277273,25.19


In [26]:
y_train

array([32.  , 26.  , 23.  , ..., 11.03, 10.26,  9.54])

# Models
We can define a list of machine learning models to evaluate on this problem. We will evaluate the models using default configurations. We are not looking for optimal configurations of these models at this point, just a general idea of how well sophisticated models with default configurations perform on this problem. We will evaluate a diverse set of nonlinear and ensemble machine learning algorithms:

**Nonlinear Algorithms**:
* k-Nearest Neighbors
* Classification and Regression Tree
* Support Vector Machine
* Naive Bayes

**Ensemble Algorithms**:
* Bagged Decision Trees
* Random Forest
* Extra Trees
* Gradient Boosting Machine

## Model tuning

In [None]:
from sklearn.metrics import SCORERS
sorted(SCORERS.keys())

In [None]:
# get model list
models = define_models()

# evaluate models
t.tic() #Start timer
results = multiple_model_evaluation(X_train, y_train, X_test, y_test, models)
t.toc() #Time elapsed since t.tic()

results

# Load and test model

In [5]:
# Evaluate a single model
def single_model_evaluation(X_test, y_test, name):
    # Load the trained model
    filename = 'trained_ml_models_mvi/{}.sav'.format(name)
    model = joblib.load(filename)

    # make predictions
    y_prediction = model.predict(X_test)
    
    metrics = dict()
    # evaluate predictions
    metrics["RMSE"] = mean_squared_error(y_test, y_prediction, squared=False)
    metrics["MAE"] = mean_absolute_error(y_test, y_prediction)
    metrics["MAPE (%)"] = mean_absolute_percentage_error(y_test, y_prediction) *100
    metrics["R^2 (%)"] = r2_score(y_test, y_prediction) * 100
    metrics["Max Error"] = max_error(y_test, y_prediction)    
    
    return metrics

In [6]:
# Evaluate a list of models 
def multiple_model_evaluation(X_test, y_test, models_list):
    metrics_df = pd.DataFrame()
    
    for name in models_list:
        # evaluate the model
        s.tic()
        tmp_df = pd.DataFrame(single_model_evaluation(X_test, y_test, name), index=[0])
        tmp_df.insert(0, "Model Name", name, True)
        tmp_df.insert(0, "Type", "ML", True)
        metrics_df = metrics_df.append(tmp_df)
        print("> {}.".format(name))
        s.toc(restart=True)
        
    return metrics_df.reset_index(drop = True)

In [9]:
# get model list
models_list = ["KNN", "DecisionTrees", "SVR_RBF", "SVR_Linear", "RandomForest", "ExtraTrees", "XGBoost"]

# evaluate models
t.tic() #Start timer
results = multiple_model_evaluation(X_test, y_test, models_list)
t.toc() #Time elapsed since t.tic()

results

> KNN.
Elapsed time is 0.035305 seconds.
> DecisionTrees.
Elapsed time is 0.005484 seconds.
> SVR_RBF.
Elapsed time is 0.093966 seconds.
> SVR_Linear.
Elapsed time is 0.050214 seconds.
> RandomForest.
Elapsed time is 0.080039 seconds.
> ExtraTrees.
Elapsed time is 0.055800 seconds.
> XGBoost.
Elapsed time is 0.227372 seconds.
Elapsed time is 0.000730 seconds.


Unnamed: 0,Type,Model Name,RMSE,MAE,MAPE (%),R^2 (%),Max Error
0,ML,KNN,5.451618,3.732253,18.635009,84.322467,33.047146
1,ML,DecisionTrees,10.080524,7.194727,35.240874,46.39651,56.103333
2,ML,SVR_RBF,7.676812,5.138398,23.869243,68.912295,57.040877
3,ML,SVR_Linear,9.219357,6.460102,32.64537,55.163869,61.563358
4,ML,RandomForest,6.558606,4.668735,26.168661,77.30919,44.634411
5,ML,ExtraTrees,5.233726,3.695964,22.050052,85.550632,24.040972
6,ML,XGBoost,6.67115,5.007703,29.157544,76.523772,29.987175


# Single Evaluation

In [None]:
# Load the trained model
filename = 'trained_ml_models_mvi/{}.sav'.format("KNN")
model = joblib.load(filename)

# make predictions
y_prediction = model.predict(X_test)

metrics = dict()
# evaluate predictions
metrics["RMSE"] = mean_squared_error(y_test, y_prediction, squared=False)
metrics["MAE"] = mean_absolute_error(y_test, y_prediction)
metrics["MAPE (%)"] = mean_absolute_percentage_error(y_test, y_prediction) *100
metrics["R^2 (%)"] = r2_score(y_test, y_prediction) * 100
metrics["Max Error"] = max_error(y_test, y_prediction)    

print(y_prediction)
metrics

In [None]:
y_test

# Sources:
* https://scikit-learn.org/stable/modules/model_evaluation.html
* https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/
    
* https://scikit-learn.org/stable/modules/svm.html
* https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html    
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html    