# About
Multivariate ML model training. Single step.

# Libraries

In [1]:
%run "/home/cesar/Python_NBs/HDL_Project/HDL_Project/global_fv.ipynb"

In [2]:
import os

# Save trained models
import joblib

# Data
from sklearn.model_selection import train_test_split

# Nonlinear models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm
from sklearn.gaussian_process import GaussianProcessRegressor

# Ensemble models
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

# Clone of time class
s = t

os.getcwd()

'/home/cesar/Python_NBs/HDL_Project/HDL_Project/2_Models/Multivariate/ML'

# User-Defined Functions

In [3]:
# create a dict of standard models to evaluate {name:object}
def define_models():
    models=dict()
    
    # nonlinear models
    models['k-Nearest Neighbors'] = KNeighborsRegressor(n_neighbors=7)
    models['Classification and Regression Tree'] = DecisionTreeRegressor()
    models['Support Vector Regression - Polynomial'] = svm.SVR(kernel='poly')
    models['Support Vector Regression - RBF'] = svm.SVR(kernel='rbf')
    models['Support Vector Regression - Linear'] = svm.SVR(kernel='linear')
    models['Gaussian Naive Bayes'] = GaussianProcessRegressor()

    # ensemble models
    models['Bagging classifier'] = BaggingRegressor(n_estimators=100)
    models['Random Forest'] = RandomForestRegressor(n_estimators=100)
    models['Extra-trees classifier'] = ExtraTreesRegressor(n_estimators=100)
    models['XG Boost'] = XGBRegressor()

    print( 'Defined %d models:' % len(models))
    print()
    return models

In [4]:
# Evaluate a single model
def single_model_evaluation(X_train, y_train, X_test, y_test, name, model):
    # fit the model
    model.fit(X_train, y_train)
    
    # Save the trained model
    filename = 'trained_ml_models/{}.sav'.format(name)
    joblib.dump(model, filename)
    
    # make predictions
    y_prediction = model.predict(X_test)
    
    metrics = dict()
    # evaluate predictions
    # accuracy = accuracy_score(y_test, y_prediction) * 100
    metrics["RMSE"] = mean_squared_error(y_test, y_prediction, squared=False)
    metrics["MAE"] = mean_absolute_error(y_test, y_prediction)
    metrics["MAPE"] = mean_absolute_percentage_error(y_test, y_prediction)
    metrics["R^2"] = r2_score(y_test, y_prediction)
    metrics["Max Error"] = max_error(y_test, y_prediction)    
    
    return metrics

In [5]:
# Evaluate a dict of models {name:object}, returns {name:score}
def multiple_model_evaluation(X_train, y_train, X_test, y_test, models):
    metrics_df = pd.DataFrame()
    
    for name, model in models.items():
        # evaluate the model
        s.tic()
        tmp_df = pd.DataFrame(single_model_evaluation(X_train, y_train, X_test, y_test, name, model), index=[0])
        tmp_df.insert(0, "Model Name", name, True)
        tmp_df.insert(0, "Type", "ML", True)
        metrics_df = metrics_df.append(tmp_df)
        print("> {}.".format(name))
        s.toc(restart=True)
        
    return metrics_df.reset_index(drop = True)

# Data

## Sample preparation

In [6]:
sql_table = "sima_station_CE"
target = "pm25"

# Define columns of interest from sql table
#     Select all columns:
column = "*"
#     Select specific columns:
#column = "datetime, prs, rainf, rh, sr, tout, wdr, wsr, " + str(target)

# Filter data with WHERE command
sql_where = "where datetime > \'2020-04-17\'"

# Initialize class to create multivariate samples:
uni_ts = multivariate_samples(sql_table, column, sql_where)

# Datasets can't be trained with sample batches by default. So parameter is 1.
X, y = uni_ts.samples_creation(1, target)

X_train, X_test, y_train, y_test = train_test_split(X[:,0,:], y, test_size = 0.30, shuffle= False)

# Models
We can define a list of machine learning models to evaluate on this problem. We will evaluate the models using default configurations. We are not looking for optimal configurations of these models at this point, just a general idea of how well sophisticated models with default configurations perform on this problem. We will evaluate a diverse set of nonlinear and ensemble machine learning algorithms:

**Nonlinear Algorithms**:
* k-Nearest Neighbors
* Classification and Regression Tree
* Support Vector Machine
* Naive Bayes

**Ensemble Algorithms**:
* Bagged Decision Trees
* Random Forest
* Extra Trees
* Gradient Boosting Machine

In [7]:
# get model list
models = define_models()

# evaluate models
t.tic() #Start timer
results = multiple_model_evaluation(X_train, y_train, X_test, y_test, models)
t.toc() #Time elapsed since t.tic()

results

Defined 10 models:

> k-Nearest Neighbors.
Elapsed time is 0.244915 seconds.
> Classification and Regression Tree.
Elapsed time is 0.108440 seconds.
> Support Vector Regression - Polynomial.
Elapsed time is 6.713116 seconds.
> Support Vector Regression - RBF.
Elapsed time is 9.162684 seconds.
> Support Vector Regression - Linear.
Elapsed time is 490.375557 seconds.
> Gaussian Naive Bayes.
Elapsed time is 77.791753 seconds.
> Bagging classifier.
Elapsed time is 7.007687 seconds.
> Random Forest.
Elapsed time is 6.861038 seconds.
> Extra-trees classifier.
Elapsed time is 3.188062 seconds.
> XG Boost.
Elapsed time is 0.683858 seconds.
Elapsed time is 0.000314 seconds.


Unnamed: 0,Type,Model Name,RMSE,MAE,MAPE,R^2,Max Error
0,ML,k-Nearest Neighbors,10.929006,7.698453,5611213000000000.0,0.520896,63.0
1,ML,Classification and Regression Tree,12.911781,8.458937,4750680000000000.0,0.331285,97.29
2,ML,Support Vector Regression - Polynomial,12.473047,8.751947,6059212000000000.0,0.375958,127.32749
3,ML,Support Vector Regression - RBF,12.256334,8.807903,6240459000000000.0,0.397455,87.699156
4,ML,Support Vector Regression - Linear,12.335551,8.485288,6093785000000000.0,0.38964,155.85859
5,ML,Gaussian Naive Bayes,26.38025,21.148282,71465380000000.0,-1.791428,125.0
6,ML,Bagging classifier,9.181792,6.150982,5182345000000000.0,0.661839,67.792
7,ML,Random Forest,9.17081,6.150157,5239113000000000.0,0.662647,65.6097
8,ML,Extra-trees classifier,8.941312,5.970123,5138056000000000.0,0.679321,63.57
9,ML,XG Boost,9.562426,6.512986,5140807000000000.0,0.633221,69.140689


# Load and test model

In [8]:
# load the model of interest from disk
filename = "trained_ml_models/Support Vector Regression - Polynomial.sav"

loaded_model = joblib.load(filename)

loaded_model.predict(X_test)

array([18.78787011, 11.35242691, 11.57405153, ..., 23.6963921 ,
       19.62281087, 21.48355564])

In [9]:
tmp_df = pd.DataFrame(single_model_evaluation(X_train, y_train, X_test, y_test, filename[18:], loaded_model), index=[0])
tmp_df.insert(0, "Model", filename[18:], True)
tmp_df

Unnamed: 0,Model,RMSE,MAE,MAPE,R^2,Max Error
0,Support Vector Regression - Polynomial.sav,12.473047,8.751947,6059212000000000.0,0.375958,127.32749


# Sources:
* https://scikit-learn.org/stable/modules/model_evaluation.html
* https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/
    
* https://scikit-learn.org/stable/modules/svm.html
* https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html    
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html    