# Binary SVC w/ Hyperparameter tuning (Grid Search, Random Search, Bayesian Opt, Bayes search, Optuna) & lifecycle management w/ MLflow

In [1]:
# Importa librerias
import pandas as pd
from sklearn.metrics import classification_report, roc_curve, auc

In [2]:
import sys
sys.path.append("../scripts_setup/")

from my_config_loader import load_configuration_model_gen, \
                             load_configuration_mlflow, \
                             load_configuration_model_svc, \
                             load_configuration_model_knn

from my_mlflow_utils import mlf_log_tags_params_gen, \
                            mlf_log_metrics_models, \
                            log_metrics_auc_intervals, \
                            mlf_log_tables

from my_data_processing_utils import save_datasets_to_csv,\
                      scale_dataset, \
                      split_data

In [3]:
import warnings

# Silence two distutils/setuptools warnings. The first filter matches on the
# message alone; the second is additionally restricted to UserWarning.
for _filter_options in (
    {"message": "Distutils was imported before Setuptools"},
    {"message": "Setuptools is replacing distutils.", "category": UserWarning},
):
    warnings.filterwarnings("ignore", **_filter_options)

## Data loading & preprocessing

In [4]:
# Read the raw dataset; column names are supplied explicitly via `names=`
cols = [
    "fLength",
    "fWidth",
    "fSize",
    "fConc",
    "fConc1",
    "fAsym",
    "fM3Long",
    "fM3Trans",
    "fAlpha",
    "fDist",
    "class",
]
df = pd.read_csv("../../data/raw/magic04.data", names=cols)
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [5]:
# Distinct labels present in the target column
df["class"].unique()

array(['g', 'h'], dtype=object)

In [6]:
# Recode the target column as int: "g" -> 1, anything else ("h") -> 0.
#
# The guard makes this cell idempotent: once the column is already integer,
# comparing it against "g" again would silently turn every label into 0,
# so a second execution of the cell must be a no-op.
if not pd.api.types.is_integer_dtype(df["class"]):
    df["class"] = (df["class"] == "g").astype(int)

### General parameters

In [7]:
# All four loaders are pointed at the same YAML file
general_params_yaml = "../scripts_setup/General_params.yaml"

# General run settings (run key, CV folds, seed, parallelism, search budgets, testing flag)
(mlf_key, cv, random_state, n_jobs, n_iter, n_trials, init_points, testing) = \
    load_configuration_model_gen(general_params_yaml)

# MLflow tracking settings
(mlf_tracking_server_uri, mlf_experiment_name, mlf_project_name, mlf_team) = \
    load_configuration_mlflow(general_params_yaml)

# SVC search bounds
(c_min, c_max, gamma_min, gamma_max) = \
    load_configuration_model_svc(general_params_yaml)

# KNN search bounds
(n_neighbors_min, n_neighbors_max, leaf_size_min, leaf_size_max) = \
    load_configuration_model_knn(general_params_yaml)

### Split dataset

In [8]:
# Partition the full frame using the configured seed.
# NOTE(review): unpacking order is train, test, valid - confirm it matches
# the return order of split_data.
df_train, df_test, df_valid = split_data(
    df,
    random_state=random_state,
)

### Scale & oversample

In [9]:
# Only the training partition is passed with oversample=True;
# validation and test are passed with oversample=False.
train, X_train, y_train = scale_dataset(df_train, random_state, oversample=True)
valid, X_valid, y_valid = scale_dataset(df_valid, random_state, oversample=False)
test, X_test, y_test = scale_dataset(df_test, random_state, oversample=False)

### Saves train / test / validation files & paths to log them in MLFlow

In [10]:
# Write the six arrays to CSV; the helper returns a dict keyed by "<name>_path"
dataset_paths = save_datasets_to_csv(X_train, y_train, X_test, y_test, X_valid, y_valid)

# Pull the individual file paths out of the dict (they are logged as MLflow tags later)
(
    X_train_path,
    y_train_path,
    X_test_path,
    y_test_path,
    X_valid_path,
    y_valid_path,
) = (
    dataset_paths[path_key]
    for path_key in (
        "X_train_path",
        "y_train_path",
        "X_test_path",
        "y_test_path",
        "X_valid_path",
        "y_valid_path",
    )
)

## MLFLow

### Initialize MLFlow experiment

In [11]:
import mlflow

# The tracking server is expected to be running already, e.g.:
#   mlflow server --host 127.0.0.1 --port 5000
mlflow.set_tracking_uri(mlf_tracking_server_uri)

# Experiment-level metadata; "mlflow.note.content" carries the description text
mlf_experiment_description = "This is a(n) " + mlf_experiment_name + " experiment initiated on " + mlf_key
mlf_experiment_tags = {
    "project_name": mlf_project_name,
    "team": mlf_team,
    "mlflow.note.content": mlf_experiment_description,
}

# Reuse the experiment when it already exists, otherwise create it.
# Looking it up first (instead of wrapping create_experiment in a blanket
# `except Exception`) lets real failures - e.g. an unreachable tracking
# server - surface with their own error instead of being swallowed and
# then masked by an AttributeError on a None lookup result.
_existing_experiment = mlflow.get_experiment_by_name(mlf_experiment_name)
if _existing_experiment is None:
    mlf_exp_id = mlflow.create_experiment(name=mlf_experiment_name, tags=mlf_experiment_tags)
else:
    mlf_exp_id = _existing_experiment.experiment_id

# Autologging is disabled; every run below logs its tags/params/metrics explicitly
mlflow.sklearn.autolog(disable=True)

## Model

### SVC base

In [12]:
%%time

# Baseline: SVC with library-default hyperparameters
from sklearn.svm import SVC

baseline_run_name = mlf_key + '_SVC_Baseline'

with mlflow.start_run(experiment_id=mlf_exp_id, run_name=baseline_run_name) as run:
    # Hand the train/test arrays to the table-logging helper
    mlf_log_tables(X_train, y_train, X_test, y_test)

    # Fit on the training split, then get hard labels and decision scores on the test split
    svc_model = SVC(random_state=random_state)
    svc_model.fit(X_train, y_train)
    y_pred = svc_model.predict(X_test)
    y_pred_base = svc_model.decision_function(X_test)

    # Full hyperparameter dict of the fitted estimator
    best_params = svc_model.get_params()

    # Tags: general run settings
    mlf_log_tags_params_gen(
        "tag",
        "cv", cv,
        "random_state", random_state,
        "n_jobs", n_jobs,
        "n_iter", n_iter,
        "n_trials", n_trials,
        "init_points", init_points,
        "c_min", c_min,
        "c_max", c_max,
        "gamma_min", gamma_min,
        "gamma_max", gamma_max,
    )
    # Tags: model identity and dataset file paths
    mlf_log_tags_params_gen(
        "tag",
        "model_name", "SVC_Baseline",
        "model_description", "Baseline SVC",
        "X_train_path", X_train_path,
        "y_train_path", y_train_path,
        "X_test_path", X_test_path,
        "y_test_path", y_test_path,
    )
    # Params: the hyperparameters of interest
    mlf_log_tags_params_gen(
        "param",
        "C", best_params['C'],
        "gamma", best_params['gamma'],
        "kernel", best_params['kernel'],
    )

    # ROC curve from the decision scores, and the area under it
    base_svc_fpr, base_svc_tpr, threshold = roc_curve(y_test, y_pred_base)
    auc_base_svc = auc(base_svc_fpr, base_svc_tpr)

    # Metrics and model
    baseline_report = classification_report(y_test, y_pred, output_dict=True)
    mlf_log_metrics_models(baseline_report, svc_model, "SVC_Baseline", auc_base_svc)
    log_metrics_auc_intervals(base_svc_fpr, base_svc_tpr)


CPU times: total: 14.5 s
Wall time: 19.9 s


### SVC Grid search

In [14]:
%%time

# SVC hyperparameter tuning via exhaustive grid search
from sklearn.model_selection import GridSearchCV

# Candidate grid; `testing` switches to a reduced grid
param_grid = {
    'kernel': ['linear'] if testing else ['linear', 'rbf', 'poly'],
    'C': [0.001, 1] if testing else [0.001, 1, 50, 100, 1000],
    'gamma': ['scale', 'auto'],
}

# Build the search around the baseline estimator and fit it on the training data
grid_search_svc = GridSearchCV(
    svc_model,
    param_grid,
    cv=cv,
    n_jobs=n_jobs,
    scoring='accuracy',
    verbose=2,
)
grid_search_svc.fit(X_train, y_train)

gridsearch_run_name = mlf_key + '_SVC_best_GridSearch'

with mlflow.start_run(experiment_id=mlf_exp_id, run_name=gridsearch_run_name) as run:
    # Winning hyperparameters and the corresponding refit estimator
    best_params = grid_search_svc.best_params_
    best_svc = grid_search_svc.best_estimator_

    # Hard labels and decision scores on the test split
    y_pred = best_svc.predict(X_test)
    y_pred_gridsearch = best_svc.decision_function(X_test)

    # Tags: general run settings
    mlf_log_tags_params_gen(
        "tag",
        "cv", cv,
        "random_state", random_state,
        "n_jobs", n_jobs,
        "n_iter", n_iter,
        "n_trials", n_trials,
        "init_points", init_points,
        "c_min", c_min,
        "c_max", c_max,
        "gamma_min", gamma_min,
        "gamma_max", gamma_max,
    )
    # Tags: model identity and dataset file paths
    mlf_log_tags_params_gen(
        "tag",
        "model_name", "SVC_best_gridsearch",
        "model_description", "Best Grid Search SVC",
        "X_train_path", X_train_path,
        "y_train_path", y_train_path,
        "X_test_path", X_test_path,
        "y_test_path", y_test_path,
    )
    # Params: the selected hyperparameters
    mlf_log_tags_params_gen(
        "param",
        "C", best_params['C'],
        "gamma", best_params['gamma'],
        "kernel", best_params['kernel'],
    )

    # ROC curve from the decision scores, and the area under it
    gridsearch_svc_fpr, gridsearch_svc_tpr, threshold = roc_curve(y_test, y_pred_gridsearch)
    auc_gridsearch_svc = auc(gridsearch_svc_fpr, gridsearch_svc_tpr)

    # Metrics and model
    gridsearch_report = classification_report(y_test, y_pred, output_dict=True)
    mlf_log_metrics_models(gridsearch_report, best_svc, "SVC_best_GridSearch", auc_gridsearch_svc)
    log_metrics_auc_intervals(gridsearch_svc_fpr, gridsearch_svc_tpr)


Fitting 3 folds for each of 4 candidates, totalling 12 fits
Error log_metrics_auc_intervals: name 'np' is not defined
CPU times: total: 11.7 s
Wall time: 50.6 s


### SVC Random search

In [None]:
%%time

# SVC hyperparameter tuning via randomized search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Parameter distributions for the random search.
# scipy's uniform(loc, scale) is supported on [loc, loc + scale], so the scale
# must be the WIDTH of the interval (c_max - c_min) to sample C from
# [c_min, c_max]; passing c_max as the scale would overshoot the upper bound.
param_dist = {'C': uniform(c_min, c_max - c_min),
              'gamma': ['scale', 'auto'],
              'kernel': ['rbf', 'linear', 'poly']}

# Create the RandomizedSearchCV object
random_search_svm = RandomizedSearchCV(svc_model, param_distributions=param_dist, cv=cv, n_jobs=n_jobs, n_iter=n_iter, random_state=random_state, scoring='accuracy', verbose=2)

# Perform the random search on the training data
random_search_svm.fit(X_train, y_train)

# Start MLflow run
with mlflow.start_run(experiment_id=mlf_exp_id, run_name= mlf_key + '_SVC_best_RandomSearch') as run:
    # Winning hyperparameters and the corresponding refit estimator
    best_params = random_search_svm.best_params_
    best_svc = random_search_svm.best_estimator_

    # Hard labels and decision scores on the test split
    y_pred = best_svc.predict(X_test)
    y_pred_randomsearch = best_svc.decision_function(X_test)

    # Log tags & params in MLflow
    mlf_log_tags_params_gen ("tag", "cv", cv, "random_state", random_state, "n_jobs", n_jobs, "n_iter", n_iter, "n_trials", n_trials, "init_points", init_points, "c_min", c_min, "c_max", c_max, "gamma_min", gamma_min, "gamma_max", gamma_max)
    mlf_log_tags_params_gen ("tag", "model_name", "SVC_best_randomsearch", "model_description", "Best Random Search SVC", "X_train_path", X_train_path, "y_train_path", y_train_path, "X_test_path", X_test_path, "y_test_path", y_test_path)
    mlf_log_tags_params_gen("param", "C", best_params['C'], "gamma", best_params['gamma'], "kernel", best_params['kernel'])

    # ROC curve from the decision scores, and the area under it
    randomsearch_svc_fpr, randomsearch_svc_tpr, threshold = roc_curve(y_test, y_pred_randomsearch)
    auc_randomsearch_svc = auc(randomsearch_svc_fpr, randomsearch_svc_tpr)

    # Log metrics & model in MLflow
    mlf_log_metrics_models(classification_report(y_test, y_pred, output_dict=True), best_svc, "SVC_best_RandomSearch", auc_randomsearch_svc)
    log_metrics_auc_intervals(randomsearch_svc_fpr, randomsearch_svc_tpr)


### SVC Bayesian optimization

In [None]:
%%time

# SVM Bayesian optimization - BayesianOptimization
# numpy and cross_val_score are imported here because no earlier cell brings
# them into scope; without them svm_cv fails with NameError.
import numpy as np
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score

# Kernel is held fixed; only C and gamma are optimized
kernel = 'rbf'


def svm_cv(C, gamma):
    """Objective for the optimizer: mean cross-validated accuracy on the training data.

    Args:
        C: Regularization parameter passed to SVC.
        gamma: Kernel coefficient passed to SVC.

    Returns:
        Mean of the `cv`-fold accuracy scores for an SVC built with the
        given C/gamma, the module-level `kernel`, and `random_state`.
    """
    svm = SVC(C=C, gamma=gamma, kernel=kernel, random_state=random_state)
    scores = cross_val_score(svm, X_train, y_train, cv=cv, scoring='accuracy')
    return np.mean(scores)


# Define the search space
pbounds = {'C': (c_min, c_max),
           'gamma': (gamma_min, gamma_max)
          }

# Create the Bayesian optimizer
bayes_optimizer_svm = BayesianOptimization(
    f=svm_cv,
    pbounds=pbounds,
    random_state=random_state
)

# Perform the search on the training data
bayes_optimizer_svm.maximize(init_points=init_points, n_iter=n_iter)

# Start MLflow run
with mlflow.start_run(experiment_id=mlf_exp_id, run_name= mlf_key + '_SVC_best_BayesianOptimization') as run:
    # Best C/gamma found by the optimizer
    best_params = bayes_optimizer_svm.max['params']

    # Refit with the SAME kernel and seed used inside the objective, so the
    # logged model matches the configuration that was actually optimized.
    best_svc = SVC(**best_params, kernel=kernel, random_state=random_state)
    best_svc.fit(X_train, y_train)

    # Hard labels and decision scores on the test split
    y_pred = best_svc.predict(X_test)
    y_pred_bayesianoptimization = best_svc.decision_function(X_test)

    # Log tags & params in MLflow
    mlf_log_tags_params_gen ("tag", "cv", cv, "random_state", random_state, "n_jobs", n_jobs, "n_iter", n_iter, "n_trials", n_trials, "init_points", init_points, "c_min", c_min, "c_max", c_max, "gamma_min", gamma_min, "gamma_max", gamma_max)
    mlf_log_tags_params_gen ("tag", "model_name", "SVC_best_BayesianOptimization", "model_description", "Best Bayesian Optimization SVC", "X_train_path", X_train_path, "y_train_path", y_train_path, "X_test_path", X_test_path, "y_test_path", y_test_path)
    mlf_log_tags_params_gen("param", "C", best_params['C'], "gamma", best_params['gamma'], "kernel", best_svc.get_params()['kernel'])

    # ROC curve from the decision scores, and the area under it
    bayesianoptimization_svc_fpr, bayesianoptimization_svc_tpr, threshold = roc_curve(y_test, y_pred_bayesianoptimization)
    auc_bayesianoptimization_svc = auc(bayesianoptimization_svc_fpr, bayesianoptimization_svc_tpr)

    # Log metrics & model in MLflow
    mlf_log_metrics_models(classification_report(y_test, y_pred, output_dict=True), best_svc, "SVC_best_BayesianOptimization", auc_bayesianoptimization_svc)
    log_metrics_auc_intervals(bayesianoptimization_svc_fpr, bayesianoptimization_svc_tpr)
    

### SVC Bayes search

In [None]:
%%time

# SVM Bayesian optimization - BayesSearchCV
from skopt import BayesSearchCV

# Define the search space.
# NOTE(review): 'log-uniform' priors need strictly positive bounds - confirm
# c_min and gamma_min in the YAML config are > 0.
param_space = {
    'C': (c_min, c_max, 'log-uniform'),
    'gamma': (gamma_min, gamma_max, 'log-uniform')
}

# Initialize BayesSearchCV object.
# random_state seeds the search itself so repeated runs explore the same
# candidates; scoring is spelled out to match the other search cells.
bayes_search_svm = BayesSearchCV(
    SVC(random_state=random_state),
    param_space,
    n_iter=n_iter,
    n_jobs=n_jobs,
    cv=cv,
    scoring='accuracy',
    random_state=random_state
)

# Perform the search on the training data
bayes_search_svm.fit(X_train, y_train)

# Start MLflow run
with mlflow.start_run(experiment_id=mlf_exp_id, run_name= mlf_key + '_SVC_best_BayesSearch') as run:
    # Winning hyperparameters and the corresponding refit estimator
    best_params = bayes_search_svm.best_params_
    best_svc = bayes_search_svm.best_estimator_

    # Hard labels and decision scores on the test split
    y_pred = best_svc.predict(X_test)
    y_pred_bayessearch = best_svc.decision_function(X_test)

    # Log tags & params in MLflow
    mlf_log_tags_params_gen ("tag", "cv", cv, "random_state", random_state, "n_jobs", n_jobs, "n_iter", n_iter, "n_trials", n_trials, "init_points", init_points, "c_min", c_min, "c_max", c_max, "gamma_min", gamma_min, "gamma_max", gamma_max)
    mlf_log_tags_params_gen ("tag", "model_name", "SVC_best_BayesSearch", "model_description", "Best Bayes Search SVC", "X_train_path", X_train_path, "y_train_path", y_train_path, "X_test_path", X_test_path, "y_test_path", y_test_path)
    mlf_log_tags_params_gen("param", "C", best_params['C'], "gamma", best_params['gamma'], "kernel", best_svc.get_params()['kernel'])

    # ROC curve from the decision scores, and the area under it
    bayessearch_svc_fpr, bayessearch_svc_tpr, threshold = roc_curve(y_test, y_pred_bayessearch)
    auc_bayessearch_svc = auc(bayessearch_svc_fpr, bayessearch_svc_tpr)

    # Log metrics & model in MLflow
    mlf_log_metrics_models(classification_report(y_test, y_pred, output_dict=True), best_svc, "SVC_best_BayesSearch", auc_bayessearch_svc)
    log_metrics_auc_intervals(bayessearch_svc_fpr, bayessearch_svc_tpr)
    

### SVC Optuna

In [None]:
%%time

import optuna
from optuna.samplers import TPESampler
# cross_val_score is imported here because no earlier cell brings it into
# scope; without it the objective fails with NameError.
from sklearn.model_selection import cross_val_score


def objective(trial):
    """Optuna objective: mean cross-validated accuracy of an SVC on the training data.

    Args:
        trial: Optuna trial used to sample C, gamma (both on a log scale)
            and the kernel.

    Returns:
        Mean of the `cv`-fold accuracy scores for the sampled configuration.
    """
    # NOTE(review): log=True requires strictly positive bounds - confirm
    # c_min and gamma_min in the YAML config are > 0.
    C = trial.suggest_float('C', c_min, c_max, log=True)
    gamma = trial.suggest_float('gamma', gamma_min, gamma_max, log=True)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf'])

    svc = SVC(C=C, gamma=gamma, kernel=kernel, random_state=random_state)

    # Accuracy is spelled out to match the scoring used by the other search cells
    score = cross_val_score(svc, X_train, y_train, n_jobs=n_jobs, cv=cv, scoring='accuracy').mean()
    return score


# Perform the search on the training data.
# The sampler is seeded so that repeated runs propose the same trials;
# an unseeded TPESampler yields different results on every execution.
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(sampler=TPESampler(seed=random_state), direction='maximize', pruner=pruner)
study.optimize(objective, n_trials=n_trials)

# Start MLflow run
with mlflow.start_run(experiment_id=mlf_exp_id, run_name= mlf_key + '_SVC_best_Optuna') as run:
    # Best hyperparameters found by the study
    best_params = study.best_params

    # Refit on the full training split with the same seed used inside the objective
    best_svc = SVC(C=best_params['C'], gamma=best_params['gamma'], kernel=best_params['kernel'], random_state=random_state)
    best_svc.fit(X_train, y_train)

    # Hard labels and decision scores on the test split
    y_pred = best_svc.predict(X_test)
    y_pred_optuna = best_svc.decision_function(X_test)

    # Log tags & params in MLflow
    mlf_log_tags_params_gen ("tag", "cv", cv, "random_state", random_state, "n_jobs", n_jobs, "n_iter", n_iter, "n_trials", n_trials, "init_points", init_points, "c_min", c_min, "c_max", c_max, "gamma_min", gamma_min, "gamma_max", gamma_max)
    mlf_log_tags_params_gen ("tag", "model_name", "SVC_best_Optuna", "model_description", "Best Optuna SVC", "X_train_path", X_train_path, "y_train_path", y_train_path, "X_test_path", X_test_path, "y_test_path", y_test_path)
    mlf_log_tags_params_gen("param", "C", best_params['C'], "gamma", best_params['gamma'], "kernel", best_params['kernel'])

    # ROC curve from the decision scores, and the area under it
    optuna_svc_fpr, optuna_svc_tpr, threshold = roc_curve(y_test, y_pred_optuna)
    auc_optuna_svc = auc(optuna_svc_fpr, optuna_svc_tpr)

    # Log metrics & model in MLflow
    mlf_log_metrics_models(classification_report(y_test, y_pred, output_dict=True), best_svc, "SVC_best_Optuna", auc_optuna_svc)
    log_metrics_auc_intervals(optuna_svc_fpr, optuna_svc_tpr)


In [None]:
import matplotlib.pyplot as plt

# One entry per model: (false-positive rates, true-positive rates, legend template, AUC).
# The order here is the order in which the curves are drawn.
roc_series = [
    (optuna_svc_fpr, optuna_svc_tpr, 'Optuna SVC (auc = %0.3f)', auc_optuna_svc),
    (bayessearch_svc_fpr, bayessearch_svc_tpr, 'Bayes Search SVC (auc = %0.3f)', auc_bayessearch_svc),
    (bayesianoptimization_svc_fpr, bayesianoptimization_svc_tpr, 'Bayesian optimization SVC (auc = %0.3f)', auc_bayesianoptimization_svc),
    (randomsearch_svc_fpr, randomsearch_svc_tpr, 'Random search SVC (auc = %0.3f)', auc_randomsearch_svc),
    (gridsearch_svc_fpr, gridsearch_svc_tpr, 'Grid search SVC (auc = %0.3f)', auc_gridsearch_svc),
    (base_svc_fpr, base_svc_tpr, 'Base (auc = %0.3f)', auc_base_svc),
]

plt.figure(figsize=(12, 12), dpi=150)
for curve_fpr, curve_tpr, legend_template, curve_auc in roc_series:
    plt.plot(curve_fpr, curve_tpr, label=legend_template % curve_auc)

plt.xlabel('False Positive Rate -->')
plt.ylabel('True Positive Rate -->')

plt.legend()

plt.show()

In [None]:
# MLflow guarda los resultados sin especificar bd? - si
# agregar como metrica resultados del classification report - si
# agregar descripcion - si
# hacer que cargue todas las corridas de optuna? - si
# hacer que la aleatoriedad al hacer validation y test sean fijas - si
# meter los demas algoritmos - base - optuna - gridsearch - random search - bayesian optimization - bayes search - si
# poner en funciones lo de MLFlow, optuna quizas - si
# guardar dataset (train/test/val) en mlflow - si
# guardar criterios de split dataset (train/test/val) en mlflow - si
# guardar ini/fin de algoritmos - si
# guardar parametros generales - si
# que pasa al correr varias veces esto con el experimento - si
#    hacer que cargue diferentes valores? - si
# validar si optuna da resultados diferentes, por que? - si
# graficar curva ROC; se puede con accuracy, precision, recall, f1, etc? - si
# tirar a git - si
# documentar - si
# YAML config file for general variables - si

# meter KNN, naive bayes, logistic regression
# usar Accuracy & AUC como objetivo en los algoritmos
# usar score & probability en los algoritmos
# data validation, 
#     son los datasets similares en estructura
#     son los datasets similares en valores
# hyperparameter validation
# error handling
#    MLFlow error
#    Train / test error
# train, test, validation best practices

# logged table artifacts en mlflow? - na