# About
Multivariate ML model training. Single step.

# Libraries

In [1]:
%run "/home/cesar/Python_NBs/HDL_Project/HDL_Project/global_fv.ipynb"

In [2]:
import os

# Save trained models
import joblib

# Data
from sklearn.model_selection import train_test_split

# Nonlinear models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm
from sklearn.gaussian_process import GaussianProcessRegressor

# Ensemble models
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

# Clone of time class
s = t

os.getcwd()

'/home/cesar/Python_NBs/HDL_Project/HDL_Project/2_Models/Multivariate/ML'

# Global parameters

In [3]:
seed = 101

# User-Defined Functions

In [4]:
# create a dict of standard models to evaluate {name:object}
def define_models():
    models=dict()
    
    # nonlinear models
    models['k-Nearest Neighbors'] = KNeighborsRegressor(weights= 'distance'
                                                         , p= 2
                                                         , n_neighbors= 8
                                                         , n_jobs= -1
                                                         , metric= 'manhattan'
                                                         , leaf_size= 30
                                                         , algorithm= 'brute')
    
    models['Decision Tree Regressor'] = DecisionTreeRegressor(splitter= 'best'
                                                        , min_weight_fraction_leaf= 0.0
                                                        , min_samples_split= 6
                                                        , min_samples_leaf= 4
                                                        , max_depth= 7
                                                        , criterion= 'friedman_mse')
    
    models['Support Vector Regression - Polynomial'] = svm.SVR(kernel='poly')
    
    models['Support Vector Regression - RBF'] = svm.SVR(kernel= 'rbf'
                                                         , gamma= 0.1
                                                         , C= 1000)
    
    models['Support Vector Regression - Linear'] = svm.SVR(kernel='linear'
                                                           , gamma= 1
                                                           , C= 10)
    
    # ensemble models
    models['Random Forest'] = RandomForestRegressor(n_estimators= 500
                                                    , min_samples_split= 3
                                                    , max_features= 8
                                                    , max_depth= 18.0)
    
    models['Extra-trees classifier'] = ExtraTreesRegressor(n_estimators= 100
                                                           , min_samples_split= 5
                                                           , max_features= 11
                                                           , max_depth= 18.0
                                                           , criterion= 'squared_error')
    
    models['XG Boost'] = XGBRegressor(subsample= 0.5
                                      , n_estimators= 1000
                                      , max_depth= 20
                                      , learning_rate= 0.01
                                      , colsample_bytree= 0.9
                                      , colsample_bylevel= 0.5)

    print( 'Defined %d models:' % len(models))
    print()
    return models

In [5]:
# Evaluate a single model
def single_model_evaluation(X_train, y_train, X_test, y_test, name, model):
    # fit the model
    model.fit(X_train, y_train)
    
    # Save the trained model
    filename = 'trained_ml_models/{}.sav'.format(name)
    joblib.dump(model, filename)
    
    # make predictions
    y_prediction = model.predict(X_test)
    
    metrics = dict()
    # evaluate predictions
    # accuracy = accuracy_score(y_test, y_prediction) * 100
    metrics["RMSE"] = mean_squared_error(y_test, y_prediction, squared=False)
    metrics["MAE"] = mean_absolute_error(y_test, y_prediction)
    metrics["MAPE"] = mean_absolute_percentage_error(y_test, y_prediction)
    metrics["R^2"] = r2_score(y_test, y_prediction)
    metrics["Max Error"] = max_error(y_test, y_prediction)    
    
    return metrics

In [6]:
# Evaluate a dict of models {name:object}, returns {name:score}
def multiple_model_evaluation(X_train, y_train, X_test, y_test, models):
    metrics_df = pd.DataFrame()
    
    for name, model in models.items():
        # evaluate the model
        s.tic()
        tmp_df = pd.DataFrame(single_model_evaluation(X_train, y_train, X_test, y_test, name, model), index=[0])
        tmp_df.insert(0, "Model Name", name, True)
        tmp_df.insert(0, "Type", "ML", True)
        metrics_df = metrics_df.append(tmp_df)
        print("> {}.".format(name))
        s.toc(restart=True)
        
    return metrics_df.reset_index(drop = True)

# Data

## Sample preparation

In [7]:
sql_table = "sima_station_CE"
target = "pm25"

# Define columns of interest from sql table
#     Select all columns:
column = "*"
#     Select specific columns:
#column = "datetime, prs, rainf, rh, sr, tout, wdr, wsr, " + str(target)

# Filter data with WHERE command
sql_where = "where datetime >= \'2021-03-01\'"

# Initialize class to create multivariate samples:
multi_ts = multivariate_samples(sql_table, target, column, sql_where)

# Datasets can't be trained with sample batches by default. So parameter is 1.
X, y = multi_ts.samples_creation(1, target)

X_train, X_test, y_train, y_test = train_test_split(X[:,0,:], y, test_size = 0.30, shuffle= False)

# Models
We can define a list of machine learning models to evaluate on this problem. We will evaluate the models using default configurations. We are not looking for optimal configurations of these models at this point, just a general idea of how well sophisticated models with default configurations perform on this problem. We will evaluate a diverse set of nonlinear and ensemble machine learning algorithms:

**Nonlinear Algorithms**:
* k-Nearest Neighbors
* Classification and Regression Tree
* Support Vector Machine
* Naive Bayes

**Ensemble Algorithms**:
* Bagged Decision Trees
* Random Forest
* Extra Trees
* Gradient Boosting Machine

## Model tuning

In [8]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import SCORERS

In [9]:
sorted(SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_we

In [10]:
# get model list
models = define_models()

# evaluate models
t.tic() #Start timer
results = multiple_model_evaluation(X_train, y_train, X_test, y_test, models)
t.toc() #Time elapsed since t.tic()

results

Defined 8 models:

> k-Nearest Neighbors.
Elapsed time is 0.474310 seconds.
> Classification and Regression Tree.
Elapsed time is 0.027777 seconds.
> Support Vector Regression - Polynomial.
Elapsed time is 2.098741 seconds.
> Support Vector Regression - RBF.
Elapsed time is 3.401193 seconds.
> Support Vector Regression - Linear.
Elapsed time is 1.851090 seconds.
> Random Forest.
Elapsed time is 10.514489 seconds.
> Extra-trees classifier.
Elapsed time is 0.928498 seconds.
> XG Boost.
Elapsed time is 21.559286 seconds.
Elapsed time is 0.000613 seconds.


Unnamed: 0,Type,Model Name,RMSE,MAE,MAPE,R^2,Max Error
0,ML,k-Nearest Neighbors,14.275914,10.009431,3039451000000000.0,0.320154,92.535972
1,ML,Classification and Regression Tree,13.670001,9.815185,3425106000000000.0,0.376638,73.916667
2,ML,Support Vector Regression - Polynomial,12.367335,8.4326,2669503000000000.0,0.489783,87.073756
3,ML,Support Vector Regression - RBF,11.987484,8.147755,2544570000000000.0,0.520643,68.998162
4,ML,Support Vector Regression - Linear,13.845888,9.63764,3188088000000000.0,0.360494,127.526817
5,ML,Random Forest,11.984117,8.347523,3216397000000000.0,0.520912,70.443078
6,ML,Extra-trees classifier,11.811486,8.124539,3196433000000000.0,0.534615,77.8095
7,ML,XG Boost,12.020588,7.97876,2990141000000000.0,0.517992,75.989864


# Load and test model

In [11]:
# load the model of interest from disk
filename = "trained_ml_models/Support Vector Regression - Polynomial.sav"

loaded_model = joblib.load(filename)

loaded_model.predict(X_test)

array([23.7898682 , 24.46937119, 25.66182219, ..., 24.01327033,
       16.53652162, 18.01476121])

In [12]:
tmp_df = pd.DataFrame(single_model_evaluation(X_train, y_train, X_test, y_test, filename[18:], loaded_model), index=[0])
tmp_df.insert(0, "Model", filename[18:], True)
tmp_df

Unnamed: 0,Model,RMSE,MAE,MAPE,R^2,Max Error
0,Support Vector Regression - Polynomial.sav,12.367335,8.4326,2669503000000000.0,0.489783,87.073756


# Sources:
* https://scikit-learn.org/stable/modules/model_evaluation.html
* https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/
    
* https://scikit-learn.org/stable/modules/svm.html
* https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html    
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html    