# About
Hyperparameter optimization is required to get the most out of your machine learning models.

Hyperparameters are points of choice or configuration that allow a machine learning model to be customized for a specific task or dataset.

Parameters are different from hyperparameters. Parameters are learned automatically; hyperparameters are set manually to help guide the learning process.

Choosing a hyperparameter grid is probably the most difficult part of hyperparameter tuning: it is nearly impossible to say ahead of time which hyperparameter values will work well, and the optimal settings depend on the dataset. Moreover, hyperparameters interact with each other in complex ways, so tuning them one at a time does not work: changing another hyperparameter afterwards can invalidate the value we just tuned.

# Libraries

In [19]:
%run "../../main_global.ipynb"

Connection with MySQL database is ready!


In [35]:
from numpy.random import seed
from numpy import array

In [36]:
# Save trained models
import joblib

# Data
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import type_of_target

# Hypertuning tools
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV

# Metrics
from sklearn.metrics import SCORERS

# Nonlinear models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm
from sklearn.gaussian_process import GaussianProcessRegressor

# Ensemble models
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

# Random seed
seed(101)

'/home/cesar/Python_NBs/HDL_Project/HDL_Main/2_DataModeling/ML_Models'

# UDF

In [30]:
def hyper_tuning(name, model, space, X, y):
    """
    Run a randomized hyperparameter search for `model` and save the result.

    Input:
    * name: Identifier used to build the saved file name
      (`ml_trained_models/<name>.sav`).
    * model: Estimator handed to RandomizedSearchCV.
    * space: Parameter distributions the search samples from.
    * X, y: Data the search is fitted on.

    Output:
    * The fitted RandomizedSearchCV object (the same object that is dumped
      to disk with joblib).
    """
    import os

    # The searching algorithm includes a "cv" argument that allows:
    # a) An integer number of folds to be specified, e.g. 5
    # b) A configured cross-validation object.
    # Option b) is used here: 3 contiguous folds, rows are not shuffled.
    kfold = KFold(n_splits=3, shuffle=False)

    # The scoring metric must be maximizing, meaning better models result in larger scores.
    scoring_metric = 'neg_mean_squared_error'

    # Search for best hyperparameters.
    # Uses the `space` argument (the original referenced an undefined
    # `search_space` name).
    grid = RandomizedSearchCV(estimator=model,
                              param_distributions=space,
                              cv=kfold,
                              n_iter=100,
                              scoring=scoring_metric)

    # Fit on the data passed in by the caller (the original referenced the
    # undefined/global names `X_test` and `y_test`).
    result = grid.fit(X, y)

    # Save the trained model in the same directory that
    # `single_model_evaluation` loads from; create it if it is missing so the
    # (potentially long) search result is not lost on a failed dump.
    filename = 'ml_trained_models/{}.sav'.format(name)
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    joblib.dump(result, filename)

    return result

In [37]:
mkdir ml_trained_models

In [None]:
# Evaluate a single model
def single_model_evaluation(X_test, y_test, name):
    """
    Score one previously saved model on a hold-out set.

    Input:
    * X_test: Features to predict on.
    * y_test: True target values matching `X_test`.
    * name: Identifier of the saved model; the file
      `ml_trained_models/<name>.sav` is loaded with joblib.

    Output:
    * dict mapping metric label -> value, with the keys
      "RMSE", "MAE", "MAPE (%)", "R^2 (%)" and "Max Error".

    NOTE(review): the metric functions are not imported in this cell; they
    are presumably the `sklearn.metrics` functions made available elsewhere
    in the notebook -- confirm.
    """
    # Restore the fitted estimator from disk and predict with it
    fitted = joblib.load('ml_trained_models/{}.sav'.format(name))
    predicted = fitted.predict(X_test)

    # Percentage-style metrics are scaled by 100 to match their labels
    return {
        "RMSE": mean_squared_error(y_test, predicted, squared=False),
        "MAE": mean_absolute_error(y_test, predicted),
        "MAPE (%)": mean_absolute_percentage_error(y_test, predicted) * 100,
        "R^2 (%)": r2_score(y_test, predicted) * 100,
        "Max Error": max_error(y_test, predicted),
    }

In [36]:
class multivariate_samples(object):
    """
    Sequential processing of data to obtain time series samples.

    Activities:
    - initial_df: Run the SQL query and return a datetime-indexed dataframe
      with min-max scaled columns and the unscaled target column.
    - samples_creation: Creation of the (input, output) sample arrays.
    """

    def __init__(self, sqlq, target):
        """
        Input:
        * sqlq: SQL query string used to read the dataset.
        * target: Name of the column to predict; it is kept unscaled.
        """
        self.sqlq = sqlq
        self.table_name = tablename_from_sqlq(sqlq)
        self.cols = cols_from_sqlq(sqlq)
        self.where = where_from_sqlq(sqlq)
        self.target = target

    def initial_df(self):
        """
        Read the query result and return it indexed by `datetime`, sorted,
        with every column min-max scaled except the target.
        """
        # Read raw dataset components from SQL database
        df = qdata(self.sqlq)

        # Set `datetime` column as dataframe index, in chronological order
        df = df.set_index('datetime').sort_index()

        # Save temporary copy of the unmodified target information
        target_arr = df[self.target]

        # Data normalization (min-max over the whole frame).
        # Columns with max == min produce NaN (0/0); those, and any missing
        # values, are replaced with 0.
        # NOTE(review): min/max are computed on all rows returned by the
        # query, i.e. before any train/test split -- confirm this is intended.
        df = (df - df.min()) / (df.max() - df.min())
        df = df.fillna(0)

        # Put the raw (unscaled, unfilled) target values back
        df[self.target] = target_arr

        return df

    def samples_creation(self, n_steps):
        """
        Transformation of Dataframe object into numpy.ndarray objects (input, output)

        Input:
        * n_steps: Number of consecutive rows in each input window (>= 1 and
          not larger than the number of rows).

        Output:
        * X: array with one window of feature rows per sample.
        * y: array with the target value at the last row of each window.
        * df: the dataframe the samples were built from, target as last column.

        Raises:
        * ValueError: if `n_steps` is smaller than 1 or larger than the
          number of rows available.
        """
        if n_steps < 1:
            raise ValueError(
                "n_steps must be >= 1, got {}".format(n_steps))

        target_name = self.target

        # Rearranging dataset to place target as last column.
        # `.copy()` makes the column assignment act on an independent frame
        # instead of a slice of the original.
        df = self.initial_df()
        target_col = df[target_name]
        df = df.loc[:, df.columns != target_name].copy()
        df[target_name] = target_col

        arr = df.to_numpy()
        n_rows = len(arr)
        if n_steps > n_rows:
            raise ValueError(
                "n_steps ({}) is larger than the number of rows ({})".format(
                    n_steps, n_rows))

        # Creating samples: slide a window of `n_steps` rows over the data
        X, y = list(), list()
        for start_ix in range(n_rows - n_steps + 1):
            end_ix = start_ix + n_steps

            # Input: all feature columns of the window.
            # Output: target value at the window's last row.
            X.append(arr[start_ix:end_ix, :-1])
            y.append(arr[end_ix - 1, -1])

        return array(X), array(y), df

In [37]:
# Query every column of the `sima_station_CE` table from 2021-04-17 23:00:00 onwards
sqlq = "SELECT * FROM sima_station_CE where datetime >=\'2021-04-17 23:00:00\'"
# Column to predict
target = "pm25"

init_mv = multivariate_samples(sqlq, target)

# Build the samples with a window of one time step; the returned dataframe is discarded
X, y, _ = init_mv.samples_creation(1)

In [41]:
type(X)

numpy.ndarray

# Data

# Main