# Model Training

In [None]:
# to handle paths
import sys
import os
from pathlib import Path

# to handle datasets
import pandas as pd
import numpy as np

# for iterating
from itertools import product

# models
from sklearn.ensemble import RandomForestRegressor

# to tune models
from sklearn.model_selection import GridSearchCV

# to cross-validate models
from sklearn.model_selection import LeaveOneGroupOut

# to save the model
import joblib

# to evaluate models
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score

# limit how many dataframe columns are shown when a frame is displayed (10 at most)
pd.set_option('display.max_columns', 10)

In [None]:
# variables
# project root folder; its last path component is the project name
project_name = "RF+Clust"
project_folder = f"C:/Users/anani/Downloads/{project_name}"
# add the project folder to the module search path
# (presumably so the project-local `variables` and `utils` modules imported below resolve - confirm)
sys.path.append(project_folder)

from variables import suite_name, features, target, transformation, budgets, algorithms
from variables import n_folds, crossval_column
from variables import model_name
from utils import read_fold_data

In [None]:
# define directories
# preprocessed data lives in a model-specific sub-folder unless all features are used
data_folder = (
    f"{project_folder}/Data/{suite_name}/preprocessed/transformation={transformation}_features={features}/{model_name}"
    if features != "all"
    else f"{project_folder}/Data/{suite_name}/preprocessed/transformation={transformation}_features={features}"
)

In [None]:
# create directories
# root output folder for this suite / transformation / feature set / model
results_folder = f"{project_folder}/Results/{suite_name}/transformation={transformation}_features={features}/{model_name}"
# create it together with any missing parents; an already existing folder is fine
Path(results_folder).mkdir(parents=True, exist_ok=True)

In [None]:
def get_model_hyperparameters(model_name: str):
    """
    Build the hyperparameter grid to search for the given ML algorithm.

    For 'random_forest' a dict mapping each hyperparameter name to its list of
    candidate values is returned; for any other model name the result is None.
    """
    if model_name != 'random_forest':
        return None

    param_grid = {
        "n_estimators": [10, 20, 50, 70],
        "max_depth": [3, 5, 7, 10],
        "max_features": [1.0, 'sqrt', 'log2'],
        "min_samples_split": [2, 5, 7, 10],
        # single value: fixes the seed rather than tuning it
        "random_state": [1],
    }
    return param_grid
        
def get_model(model_name: str, model_kwargs: dict):
    """
    Create an unfitted ML model instance for the given model name.

    'random_forest' yields a RandomForestRegressor built from `model_kwargs`;
    any other model name yields None.
    """
    if model_name != 'random_forest':
        return None

    regressor = RandomForestRegressor(**model_kwargs)
    return regressor

In [None]:
def train_model(X_train: pd.DataFrame, y_train: pd.DataFrame, model_name: str, cv: object, cv_groups: list, scoring: str):
    """
    Tune a model with a grid search and return the winning configuration.

    The search runs over the grid from `get_model_hyperparameters`, using the
    supplied cross-validation splitter, group labels and scoring metric.

    Returns a tuple: the best estimator found by the search, and a one-row
    dataframe holding its hyperparameter values.
    """
    # untuned estimator used as the search template
    base_estimator = get_model(model_name=model_name, model_kwargs={})

    # candidate hyperparameter values
    param_grid = get_model_hyperparameters(model_name)
    print(f"parameter grid: {param_grid}")

    # configure the search, then fit it on the training data
    search = GridSearchCV(
        estimator=base_estimator,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        return_train_score=True,
        verbose=3,
    )
    search.fit(X=X_train, y=y_train, groups=cv_groups)

    best_params = pd.DataFrame(search.best_params_, index=[0])
    return search.best_estimator_, best_params

In [None]:
def model_performance_summary(true_train: list, predicted_train: list, true_test: list, predicted_test: list): 
    """
    Summarise model quality on the train and test sets.

    Computes mean absolute error, median absolute error and R^2 for both sets
    and returns them as a one-row dataframe with columns `mae_*`, `mdae_*`
    and `r2_*` (suffix `train` / `test`).
    """
    # training-set metrics
    mae_train = mean_absolute_error(true_train, predicted_train)
    mdae_train = median_absolute_error(true_train, predicted_train)
    r2_train = r2_score(true_train, predicted_train)

    # test-set metrics
    mae_test = mean_absolute_error(true_test, predicted_test)
    mdae_test = median_absolute_error(true_test, predicted_test)
    r2_test = r2_score(true_test, predicted_test)

    summary = {
        "mae_train": mae_train,
        "mae_test": mae_test,
        "mdae_train": mdae_train,
        "mdae_test": mdae_test,
        "r2_train": r2_train,
        "r2_test": r2_test,
    }
    return pd.DataFrame(summary, index=[0])

In [None]:
# Main experiment loop: for every (algorithm, budget) combination, tune one model per fold,
# evaluate it on that fold's train and test data, and write models, parameters,
# performance metrics and predictions under the results folder.
for algorithm_name, budget in product(algorithms, budgets):
    print(f"Algorithm name: {algorithm_name}, budget: {budget}")

    # define directories (input data and outputs specific to this algorithm/budget pair)
    data_folder_temp = f"{data_folder}/algorithm_name={algorithm_name}_budget={budget}"
    results_folder_temp = f"{results_folder}/algorithm_name={algorithm_name}_budget={budget}"
    
    # create directories for the fitted models and for the prediction files
    os.makedirs(f"{results_folder_temp}/models", exist_ok=True)
    os.makedirs(f"{results_folder_temp}/predictions", exist_ok=True)

    # create results placeholders; each is extended once per fold and saved after the fold loop
    performance = pd.DataFrame()
    parameters = pd.DataFrame()
    predictions = pd.DataFrame()
    
    # folds are numbered from 1 to n_folds inclusive
    for fold_number in np.arange(1, n_folds+1):
        print(f"Fold: {fold_number}")

        # load data
        # (read_fold_data is imported from utils; its four return values are unpacked here
        #  as train/test features and targets - its internals are not visible in this file)
        X_train, y_train, X_test, y_test = read_fold_data(directory=data_folder_temp, fold_number=fold_number) 

        print("Preview train: ")
        print(X_train.shape)
        print("Preview test: ")
        print(X_test.shape)
        print("preview y train: ")
        print(y_train.head(3))
        
        # train model 
        # hyperparameters are tuned with leave-one-group-out cross-validation; the groups are the
        # values of the `crossval_column` index level of X_train, and the selection metric is
        # negative mean absolute error
        model, parameters_temp = train_model(X_train=X_train, y_train=y_train[target], model_name=model_name
                      , cv=LeaveOneGroupOut(), cv_groups=X_train.index.get_level_values(crossval_column).values, scoring="neg_mean_absolute_error")
        # predict
        # the target column is renamed to "true" and the model output is stored next to it as "predicted"
        predictions_train = y_train.rename(columns={target: "true"}).copy()
        predictions_train['predicted'] = model.predict(X_train)

        predictions_test = y_test.rename(columns={target: "true"}).copy()
        predictions_test['predicted'] = model.predict(X_test)

        print("Preview predictions test: ")
        print(predictions_test.head())
        print(predictions_test.shape)   
        
        # calculate model performance
        performance_temp = model_performance_summary(true_train=predictions_train["true"].values
                                                , predicted_train=predictions_train["predicted"].values
                                                , true_test=predictions_test["true"].values
                                                , predicted_test=predictions_test["predicted"].values)
        # save the fitted model of this fold
        joblib.dump(model, f"{results_folder_temp}/models/model_fold={str(fold_number)}.joblib")

        # tag the per-fold rows with the fold number and append them to the accumulators
        parameters_temp["fold_number"] = fold_number
        parameters = pd.concat([parameters, parameters_temp], axis=0)

        performance_temp["fold_number"] = fold_number
        performance = pd.concat([performance, performance_temp], axis=0)

        # write this fold's train and test predictions; reset_index turns the index into regular columns
        predictions_train.reset_index().to_csv(f"{results_folder_temp}/predictions/predictions_set=train_fold={fold_number}.csv", index=False)
        predictions_test.reset_index().to_csv(f"{results_folder_temp}/predictions/predictions_set=test_fold={fold_number}.csv", index=False)

        # only the test-set predictions are collected across folds
        predictions = pd.concat([predictions, predictions_test], axis=0)
        
    print("Preview parameters: ")
    print(parameters.head())
    print(parameters.shape)

    print("Preview perfromance")
    print(performance.head())
    print(performance.shape)
    
    print("Preview predictions")
    print(predictions.head())
    print(predictions.shape)

    # save the results accumulated over all folds of this algorithm/budget pair
    performance.to_csv(f"{results_folder_temp}/models/performance.csv", index=False)
    parameters.to_csv(f"{results_folder_temp}/models/parameters.csv", index=False)
    predictions.reset_index().to_csv(f"{results_folder_temp}/predictions/predictions_set=test.csv", index=False)