# Hyperparameter Optimization

This notebook tries to improve the model results by tuning the hyperparameters with Bayesian Optimization.

In a first attempt, Bayesian Optimization is used to maximize the weighted F1 score; in a second attempt, it is used to maximize the weighted precision. Metrics and learning curves are used for comparison.

In [None]:
# import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve

from src.data_prep_for_model import clean_data, feature_engineer, prep_data_for_model, pipeline_classifier

# For Bayesian Optimization
import time
import optuna 
from optuna.samplers import TPESampler
from sklearn.model_selection import cross_val_score

# import plotly's I/O module and make 'iframe' the default renderer;
# intended so that optuna's plotly-based visualisations show up inside Jupyter notebooks
import plotly.io as pio
pio.renderers.default = 'iframe'

In [None]:
# read the Spotify dataset from the CSV file into a DataFrame
df = pd.read_csv('data/spotify_dataset.csv')
# preview the first rows (last expression of the cell, so the notebook renders it)
df.head()

### Data preparation

In [None]:
# split the raw data frame into feature/target sets for the train, test and
# validation data; prep_data_for_model returns them in exactly this order
(
    features_train,
    target_train,
    features_test,
    target_test,
    features_val,
    target_val,
) = prep_data_for_model(df)

# preview the training features
features_train.head()

In [None]:
# print the available feature columns (handy for copy-paste)
print(features_train.columns)

# categorical columns (to be one-hot encoded in the pipeline)
CAT_COLS = ['key', 'time_signature']

# every remaining feature column is treated as numerical
NUM_COLS = [
    column_name
    for column_name in features_train.columns
    if column_name not in CAT_COLS
]

# show both column lists, one per line
print(CAT_COLS, NUM_COLS, sep='\n')

### Optimization on f1_weighted

In [None]:
# Bayesian Optimization on f1_weighed
def objective(trial):
    """return maximized f1-score"""
   
    # search space
    n_estimators = trial.suggest_int('n_estimators', 50, 250)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    max_features = trial.suggest_categorical('max_features', choices = ['sqrt', 'log2', None])
    min_samples_split = trial.suggest_int(name="min_samples_split", low=2, high=10, step=2)
    min_samples_leaf = trial.suggest_int(name="min_samples_leaf", low=1, high=4, step=1)
    
    params = {'n_estimators': n_estimators,
             'max_features': max_features,
             'max_depth': max_depth,
             'min_samples_split': min_samples_split,
             'min_samples_leaf': min_samples_leaf}
    
    # random forest classifier object     
    pipeline = pipeline_classifier(cat_cols=CAT_COLS,
                                    num_cols=NUM_COLS,
                                    classifier=RandomForestClassifier,
                                    class_weight='balanced',
                                    random_state=42,
                                    **params)

    # initiating cv
    score =  cross_val_score(estimator=pipeline, 
                             X=features_train, 
                             y=target_train, 
                             scoring='f1_weighted',
                             cv=5,
                             n_jobs=-1).mean()
    
    return score

# number of optimization trials; used both for the search itself and for the
# results table so the two can never disagree
N_TRIALS = 50

# create a study (aim to maximize the score) and set a sampler seed for reproducibility
study = optuna.create_study(sampler=TPESampler(seed=42), direction='maximize')

# perform hyperparameter tuning while timing the process; perf_counter is a
# monotonic clock meant for measuring intervals, unlike the wall clock time.time()
time_start = time.perf_counter()
# start the optimization process with our objective function and N_TRIALS iterations
study.optimize(objective, n_trials=N_TRIALS)
time_bayesian = time.perf_counter() - time_start

# store the result in a data frame
# NOTE: study.best_trial.number is optuna's zero-based trial index
values_bayesian = [N_TRIALS, study.best_trial.number, study.best_trial.value, time_bayesian]
results_bayesian = pd.DataFrame(
    [values_bayesian],
    columns=[
        'Number of iterations',
        'Iteration Number of Optimal Hyperparamters',
        'Score',
        'Time Elapsed (s)',
    ],
)

In [None]:
# show the one-row summary table of the optimization run
display(results_bayesian)
# best hyperparameter combination found by the study
# (last expression of the cell, so the notebook renders it)
study.best_params

In [None]:
# Best hyperparameters from the f1_weighted optimization.
# Filled in manually from the last optimization run, so the (slow) search
# does not need to be repeated.
best_params = {
    'n_estimators': 193,
    'max_depth': 15,
    'max_features': None,
    'min_samples_split': 4,
    'min_samples_leaf': 2,
}

### Optimization on precision score (weighted)

In [None]:
# Bayesian Optimization on precision score weighted
def objective(trial):
    """Return the mean cross-validated weighted precision for one trial.

    Args:
        trial: optuna trial object used to sample the random forest
            hyperparameters from the search space defined below.

    Returns:
        Mean of the 5-fold cross-validation scores
        (scoring='precision_weighted') on the training data. The study
        maximizes this value.
    """
    # search space (suggest calls stay in this order so the seeded sampler
    # reproduces the same sequence of values)
    n_estimators = trial.suggest_int('n_estimators', 50, 250)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    max_features = trial.suggest_categorical('max_features', choices=['sqrt', 'log2', None])
    min_samples_split = trial.suggest_int(name='min_samples_split', low=2, high=10, step=2)
    min_samples_leaf = trial.suggest_int(name='min_samples_leaf', low=1, high=4, step=1)

    params = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
    }

    # pipeline around a random forest classifier, built by the project helper;
    # the sampled hyperparameters are forwarded as keyword arguments
    pipeline = pipeline_classifier(
        cat_cols=CAT_COLS,
        num_cols=NUM_COLS,
        classifier=RandomForestClassifier,
        class_weight='balanced',
        random_state=42,
        **params,
    )

    # 5-fold cross validation on the training data, using all available cores
    scores = cross_val_score(
        estimator=pipeline,
        X=features_train,
        y=target_train,
        scoring='precision_weighted',
        cv=5,
        n_jobs=-1,
    )

    return scores.mean()

# number of optimization trials; used both for the search itself and for the
# results table so the two can never disagree
N_TRIALS = 50

# create a study (aim to maximize the score) and set a sampler seed for reproducibility
study = optuna.create_study(sampler=TPESampler(seed=42), direction='maximize')

# perform hyperparameter tuning while timing the process; perf_counter is a
# monotonic clock meant for measuring intervals, unlike the wall clock time.time()
time_start = time.perf_counter()
# start the optimization process with our objective function and N_TRIALS iterations
study.optimize(objective, n_trials=N_TRIALS)
time_bayesian = time.perf_counter() - time_start

# store the result in a data frame
# NOTE: study.best_trial.number is optuna's zero-based trial index
values_bayesian = [N_TRIALS, study.best_trial.number, study.best_trial.value, time_bayesian]
results_bayesian = pd.DataFrame(
    [values_bayesian],
    columns=[
        'Number of iterations',
        'Iteration Number of Optimal Hyperparamters',
        'Score',
        'Time Elapsed (s)',
    ],
)

In [None]:
# show the one-row summary table of the optimization run
display(results_bayesian)
# best hyperparameter combination found by the study
# (last expression of the cell, so the notebook renders it)
study.best_params

In [None]:
# Best hyperparameters from the precision score (weighted) optimization.
# Filled in manually from the last optimization run, so the (slow) search
# does not need to be repeated.
best_params = {
    'n_estimators': 202,
    'max_depth': 15,
    'max_features': None,
    'min_samples_split': 2,
    'min_samples_leaf': 4,
}