# Binary SVC w/ Hiperparameter tunning (Grid Search, Random Search, Bayesian Opt, Bayes search, Optuna) & lifecycle management w/ MLFlow

In [1]:
# Importa librerias
import pandas as pd
#import numpy as np
#from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, roc_curve, auc

In [2]:
import sys
sys.path.append("../scripts_setup/")

from my_config_loader import load_configuration_model_gen, \
                             load_configuration_mlflow, \
                             load_configuration_model_svc, \
                             load_configuration_model_knn

from my_mlflow_utils import mlf_log_tags_params_gen, \
                            mlf_log_metrics_models, \
                            log_metrics_auc_intervals

from my_data_processing_utils import save_datasets_to_csv,\
                      scale_dataset, \
                      split_data

In [3]:
import warnings

# Suppress the warning
warnings.filterwarnings("ignore", message="Distutils was imported before Setuptools")
warnings.filterwarnings("ignore", message="Setuptools is replacing distutils.", category=UserWarning)

## Data loading & preprocessing

In [4]:
# Lee dataset
cols = ["fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "class"]
df = pd.read_csv("../../data/raw/magic04.data", names=cols)
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [5]:
# Valores de la variable objetivo
df['class'].unique()

array(['g', 'h'], dtype=object)

In [6]:
# Reclasifica variable objetivo - Class as int (g=1, h=0)

df['class'] = (df['class'] == "g").astype(int)

### General parameters

In [7]:
# # Key to group & identify different runs
# import datetime

# mlf_key = datetime.datetime.now().strftime("%y%m%d%H%M")

# # Load general parameters
# import yaml

# try:
#     with open("../scripts_setup/General_params.yaml") as f:
#         config_file = yaml.safe_load(f)
# except FileNotFoundError:
#     raise FileNotFoundError("Configuration file not found")

# # General params
# try:
#     cv = config_file["General"]["cv"]
#     random_state = config_file["General"]["random_state"]
#     n_jobs = config_file["General"]["n_jobs"]
#     n_iter = config_file["General"]["n_iter"]            # BayesSearchCV, BayesianOptimization, RandomizedSearchCV
#     n_trials = config_file["General"]["n_trials"]           # Optuna trials 
#     init_points = config_file["General"]["init_points"]        # BayesianOptimization
#     testing = config_file["General"]["testing"]
# except KeyError as e:
#     raise KeyError(f"Missing key in General section: {e}")
    
# # MLFlow params
# try:
#     mlf_tracking_server_uri = config_file["MLFlow"]["tracking_server_uri"]
#     mlf_experiment_name = config_file["MLFlow"]["experiment_name"]
#     mlf_project_name = config_file["MLFlow"]["project_name"]
#     mlf_team = config_file["MLFlow"]["team"]
# except KeyError as e:
#     raise KeyError(f"Missing key in MLFlow section: {e}")
    
# # SVC range params
# try:
#     c_min = config_file["SVC"]["c_min"]
#     c_max = config_file["SVC"]["c_max"]
#     gamma_min = config_file["SVC"]["gamma_min"]
#     gamma_max = config_file["SVC"]["gamma_max"]
# except KeyError as e:
#     raise KeyError(f"Missing key in SVC section: {e}")

# # KNN range params
# try:
#     n_neighbors_min = config_file["KNN"]["n_neighbors_min"] 
#     n_neighbors_max = config_file["KNN"]["n_neighbors_max"] 
#     leaf_size_min = config_file["KNN"]["leaf_size_min"] 
#     leaf_size_max = config_file["KNN"]["leaf_size_max"] 
# except KeyError as e:
#     raise KeyError(f"Missing key in KNN section: {e}")

In [8]:
# Load configurations
(mlf_key, 
 cv, random_state, n_jobs, n_iter, n_trials, init_points, testing
) = load_configuration_model_gen("../scripts_setup/General_params.yaml")

(mlf_tracking_server_uri, mlf_experiment_name, mlf_project_name, mlf_team
) = load_configuration_mlflow("../scripts_setup/General_params.yaml")

(c_min, c_max, gamma_min, gamma_max
) = load_configuration_model_svc("../scripts_setup/General_params.yaml")

(n_neighbors_min, n_neighbors_max, leaf_size_min, leaf_size_max 
) = load_configuration_model_knn("../scripts_setup/General_params.yaml")

In [9]:
# from config_loader_gen import load_configuration_gen
# from config_loader_model import load_configuration_model

# # Load configurations
# (mlf_key, 
#  cv, random_state, n_jobs, n_iter, n_trials, init_points, testing, 
#  mlf_tracking_server_uri, mlf_experiment_name, mlf_project_name, mlf_team
# ) = load_configuration_gen("../scripts_setup/General_params.yaml")

# (c_min, c_max, gamma_min, gamma_max, 
#  n_neighbors_min, n_neighbors_max, leaf_size_min, leaf_size_max 
# ) = load_configuration_model("../scripts_setup/General_params.yaml")

### Split dataset

In [10]:
# # Shuffle with a fixed random state for reproducibility
# data_shuffled = df.sample(frac=1, random_state=random_state)  

# # Split data into training and temporary sets (80% training, 20% temp)
# df_train, df_temp = train_test_split(data_shuffled, test_size=0.4, random_state=random_state)

# # Split the temporary set into testing and validation sets (50% test, 50% validation)
# df_test, df_valid = train_test_split(df_temp, test_size=0.5, random_state=random_state)

# # Reset the indices of the resulting DataFrames
# df_train.reset_index(drop=True, inplace=True)
# df_test.reset_index(drop=True, inplace=True)
# df_valid.reset_index(drop=True, inplace=True)

df_train, df_test, df_valid = split_data(df, random_state = random_state)

### Function for scaling and oversampling the dataset

In [11]:
# def scale_dataset(df_param, random_state=None, oversample = False):
#   X = df_param[df_param.columns[:-1]].values
#   y = df_param[df_param.columns[-1]].values

#   scaler = StandardScaler()
#   X = scaler.fit_transform(X)

#   if oversample:
#     ros = RandomOverSampler(random_state=random_state)
#     X, y = ros.fit_resample(X, y)

#   data = np.hstack((X, np.reshape(y, (-1, 1))))

#   return data, X, y

### Scale & oversample

In [12]:
train, X_train, y_train = scale_dataset(df_train, random_state, oversample = True)
valid, X_valid, y_valid = scale_dataset(df_valid, random_state, oversample = False)
test, X_test, y_test = scale_dataset(df_test, random_state, oversample = False)

### Saves train / test / validation files & paths to log them in MLFlow

In [13]:
# import os
# # Save to CSV for logging
# X_train_path = os.path.abspath("../../data/processed/X_train.csv")
# y_train_path = os.path.abspath("../../data/processed/y_train.csv")
# X_test_path = os.path.abspath("../../data/processed/X_test.csv")
# y_test_path = os.path.abspath("../../data/processed/y_test.csv")
# X_valid_path = os.path.abspath("../../data/processed/X_valid.csv")
# y_valid_path = os.path.abspath("../../data/processed/y_valid.csv")

# pd.DataFrame(X_train).to_csv(X_train_path, index=False)
# pd.DataFrame(y_train).to_csv(y_train_path, index=False)
# pd.DataFrame(X_test).to_csv(X_test_path, index=False)
# pd.DataFrame(y_test).to_csv(y_test_path, index=False)
# pd.DataFrame(X_valid).to_csv(X_valid_path, index=False)
# pd.DataFrame(y_valid).to_csv(y_valid_path, index=False)

# Call the function to save the datasets to CSV files
dataset_paths = save_datasets_to_csv(X_train, y_train, X_test, y_test, X_valid, y_valid)

# Access the file paths from the dictionary
X_train_path = dataset_paths["X_train_path"]
y_train_path = dataset_paths["y_train_path"]
X_test_path = dataset_paths["X_test_path"]
y_test_path = dataset_paths["y_test_path"]
X_valid_path = dataset_paths["X_valid_path"]
y_valid_path = dataset_paths["y_valid_path"]

## MLFLow

### Initialize MLFlow experiment

In [14]:
import mlflow

# Code for init MLFLOW server: mlflow server --host 127.0.0.1 --port 5000
#mlf_tracking_server_uri = "http://localhost:5000"
mlflow.set_tracking_uri(mlf_tracking_server_uri)

#mlf_experiment_name = "Magic"
mlf_experiment_description = "This is a(n) " + mlf_experiment_name + " experiment initiated on " + mlf_key
mlf_experiment_tags = {
    "project_name": mlf_project_name,
    "team": mlf_team,
    "mlflow.note.content": mlf_experiment_description,
}

try:
    mlf_exp_id = mlflow.create_experiment(name=mlf_experiment_name, tags=mlf_experiment_tags)
except Exception as e:
    mlf_exp_id = mlflow.get_experiment_by_name(mlf_experiment_name).experiment_id

mlflow.sklearn.autolog(disable=True)    
#print("Experiment ID:", mlf_exp_id)

### MLFlow procedures

In [15]:
# def mlf_log_tags_params_gen(param_tag, *args):
#     try:
#         # Ensure the number of arguments is even (tag_name, value)
#         if len(args) % 2 != 0:
#             raise ValueError("Arguments must be provided in pairs (param_name, value)")
#         # Loop through pairs of arguments
#         for i in range(0, len(args), 2):
#             # Get tag name and value
#             name = args[i]
#             value = args[i + 1]
#             if param_tag == "tag":          # Log tag
#                 mlflow.set_tag(name, value)
#             elif param_tag == "param":      # Log parameter
#                 mlflow.log_param(name, value)
#     except Exception as e:
#         print(f"Error mlf_log_tags_params_generic: {e}")

# def mlf_log_metrics_models(class_report, model, tag, auc):
#     try:
#         mlflow.log_metric("accuracy", class_report["accuracy"])
#         mlflow.log_metric("AUC", auc)
        
#         for class_name, metrics in class_report.items():
#             if class_name not in ["macro avg", "weighted avg"]:
#                 if isinstance(metrics, dict):  
#                     for metric, value in metrics.items():
#                         if metric in ["precision", "recall", "f1-score", "support"]:
#                             mlflow.log_metric(f"{metric}_{class_name}", value)    
#         #mlflow.log_figure(fig8, "qq_plot.png")
#         mlflow.sklearn.log_model(model, tag) 
#     except Exception as e:
#         print(f"Error mlf_log_metrics_models: {e}")

# def log_metrics_auc_intervals(svc_fpr, svc_tpr):
#     try:
#         # Define the intervals (e.g., 10%, 20%, ..., 90%)
#         intervals = range(10, 91, 10)  # 10, 20, ..., 90        
#         # Initialize a dictionary to store the AUC for each interval
#         auc_intervals = {}        
#         # Compute the total number of data points
#         total_points = len(svc_fpr)
        
#         # Loop through each interval
#         for interval in intervals:
#             # Calculate the index up to which the current interval falls
#             index = int((interval / 100) * total_points)            
#             # Extract the FPR and TPR values up to the current index
#             fpr_interval = svc_fpr[:index + 1]
#             tpr_interval = svc_tpr[:index + 1]            
#             # Compute the AUC for the current interval
#             auc_interval = auc(fpr_interval, tpr_interval)            
#             # Store the AUC for the current interval
#             auc_intervals[interval] = auc_interval
            
#         for interval, auc_interval in auc_intervals.items():
#             #print(f"AUC for {interval}%: {auc_interval}")  
#             mlflow.log_metric(f"AUC for {interval} perc", auc_interval)
#     except Exception as e:
#         print(f"Error log_metrics_auc_intervals: {e}")

## Model

### KNN base

In [16]:
%%time

from sklearn.neighbors import KNeighborsClassifier

# Init MLFlow run
with mlflow.start_run(experiment_id=mlf_exp_id, run_name= mlf_key + '_KNN_Baseline'):  
    # Create KNN object, train & predict
    knn_model = KNeighborsClassifier(n_neighbors=5)
    knn_model.fit(X_train, y_train)
    y_pred = knn_model.predict(X_test)
    y_pred_base_knn = knn_model.predict_proba(X_test)[:, 1]

    # Get the hyperparameters of the trained model
    best_params = knn_model.get_params()

    # Log tags & params in MLFlow 
    mlf_log_tags_params_gen ("tag", "cv", cv, "random_state", random_state, "n_jobs", n_jobs, "n_iter", n_iter, "n_trials", n_trials, "init_points", init_points)
    mlf_log_tags_params_gen ("tag", "model_name", "KNN_Baseline", "model_description", "Baseline KNN", "X_train_path", X_train_path, "y_train_path", y_train_path, "X_test_path", X_test_path, "y_test_path", y_test_path)
    mlf_log_tags_params_gen("param", "n_neighbors", best_params['n_neighbors'], "weights", best_params['weights'], "algorithm", best_params['algorithm'], "leaf_size", best_params['leaf_size'], "p", best_params['p'])
        
    # Values for getting AUC
    base_knn_fpr, base_knn_tpr, threshold = roc_curve(y_test, y_pred_base_knn)
    auc_knn_base = auc(base_knn_fpr, base_knn_tpr)
    
    # Log metrics & model in MLFlow
    mlf_log_metrics_models(classification_report(y_test, y_pred, output_dict=True), knn_model, "KNN_Baseline", auc_knn_base)
    
    # End MLFlow run
    mlflow.end_run()

CPU times: total: 3.12 s
Wall time: 8.74 s


### KNN Grid search

In [17]:
%%time

# SVM Grid search
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for the grid search
if testing:    
    param_grid = {
        'n_neighbors': [3, 5],  # Number of neighbors to consider
        'weights': ['distance'],  # Weighting function for predictions
        'algorithm': ['auto'],  # Algorithm used to compute nearest neighbors
        'leaf_size' : [30, 35],
        'p' : [1]
    }
else:
    param_grid = {
        'n_neighbors': [3, 5, 10, 15, 20, 25, 30],  # Number of neighbors to consider
        'weights': ['uniform', 'distance'],  # Weighting function for predictions
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithm used to compute nearest neighbors
        'leaf_size' : [leaf_size_min, leaf_size_max],
        'p' : [1, 2]
    }

# Create the GridSearchCV object
grid_search_knn = GridSearchCV(knn_model, param_grid, cv=cv, scoring='accuracy', verbose=2)

# Perform the grid search on training data
grid_search_knn.fit(X_train, y_train)

# Start MLFlow run
with mlflow.start_run(experiment_id=mlf_exp_id, run_name= mlf_key + '_KNN_best_GridSearch'):  
    #mlflow.sklearn.autolog()
    
    # Get hyperparameters of the best trained model
    best_params = grid_search_knn.best_params_
    
    #print(best_params)
    
    # Get best trained model
    best_knn = grid_search_knn.best_estimator_
    
    # train & predict
    y_pred = best_knn.predict(X_test)
    y_pred_gridsearch_knn = best_knn.predict_proba(X_test)[:, 1]
    
    # Log tags & params in MLFlow  
    mlf_log_tags_params_gen ("tag", "cv", cv, "random_state", random_state, "n_jobs", n_jobs, "n_iter", n_iter, "n_trials", n_trials, "init_points", init_points)
    mlf_log_tags_params_gen ("tag", "model_name", "KNN_best_gridsearch", "model_description", "Best Grid Search KNN", "X_train_path", X_train_path, "y_train_path", y_train_path, "X_test_path", X_test_path, "y_test_path", y_test_path)
    mlf_log_tags_params_gen("param", "n_neighbors", best_params['n_neighbors'], "weights", best_params['weights'], "algorithm", best_params['algorithm'], "leaf_size", best_params['leaf_size'], "p", best_params['p'])    # Values for getting AUC
   
    gridsearch_knn_fpr, gridsearch_knn_tpr, threshold = roc_curve(y_test, y_pred_gridsearch_knn)
    auc_gridsearch_knn = auc(gridsearch_knn_fpr, gridsearch_knn_tpr)
    
    # Log metrics & model in MLFlow
    mlf_log_metrics_models(classification_report(y_test, y_pred, output_dict=True), best_knn, "KNN_best_GridSearch", auc_gridsearch_knn)
    log_metrics_auc_intervals(gridsearch_knn_fpr, gridsearch_knn_tpr)

    # End MLFlow run
    mlflow.end_run()

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END algorithm=auto, leaf_size=30, n_neighbors=3, p=1, weights=distance; total time=   0.3s
[CV] END algorithm=auto, leaf_size=30, n_neighbors=3, p=1, weights=distance; total time=   0.3s
[CV] END algorithm=auto, leaf_size=30, n_neighbors=3, p=1, weights=distance; total time=   0.3s
[CV] END algorithm=auto, leaf_size=30, n_neighbors=5, p=1, weights=distance; total time=   0.3s
[CV] END algorithm=auto, leaf_size=30, n_neighbors=5, p=1, weights=distance; total time=   0.3s
[CV] END algorithm=auto, leaf_size=30, n_neighbors=5, p=1, weights=distance; total time=   0.3s
[CV] END algorithm=auto, leaf_size=35, n_neighbors=3, p=1, weights=distance; total time=   0.3s
[CV] END algorithm=auto, leaf_size=35, n_neighbors=3, p=1, weights=distance; total time=   0.3s
[CV] END algorithm=auto, leaf_size=35, n_neighbors=3, p=1, weights=distance; total time=   0.3s
[CV] END algorithm=auto, leaf_size=35, n_neighbors=5, p=1, weights=distance;

### SVC Random search

In [18]:
%%time

# SVM Random search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define parameter distributions for the random search
if testing:    
    param_dist = {
        'n_neighbors': randint(3, 5),  # Sample integer values between 1 and 20 for the number of neighbors
        'weights': ['distance'],  # Choose between uniform and distance weighting
        'algorithm': ['auto'],  # Choose between different algorithms
        'leaf_size': randint(30, 35),  # Sample integer values between 10 and 100 for leaf size
        'p': [1]  # Choose between Manhattan distance (1) and Euclidean distance (2)
    }
else:
    param_dist = {
        'n_neighbors': randint(n_neighbors_min, n_neighbors_max),  # Sample integer values between 1 and 20 for the number of neighbors
        'weights': ['uniform', 'distance'],  # Choose between uniform and distance weighting
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Choose between different algorithms
        'leaf_size': randint(leaf_size_min, leaf_size_max),  # Sample integer values between 10 and 100 for leaf size
        'p': [1, 2]  # Choose between Manhattan distance (1) and Euclidean distance (2)
    }

# Create the RandomizedSearchCV object
random_search_knn = RandomizedSearchCV(knn_model, param_distributions=param_dist, cv=cv, n_iter=n_iter, random_state=random_state, verbose=2)

# Perform the random search on the training data
random_search_knn.fit(X_train, y_train)

# Start MLFlow run
with mlflow.start_run(experiment_id=mlf_exp_id, run_name= mlf_key + '_KNN_best_RandomSearch'):  
    #mlflow.sklearn.autolog()
    
    # Get hyperparameters of the best trained model
    best_params = random_search_knn.best_params_
    
    # Get best trained model
    best_knn = random_search_knn.best_estimator_

    # train & predict
    y_pred = best_knn.predict(X_test)
    y_pred_randomsearch_knn = best_knn.predict_proba(X_test)[:, 1]

    # Log tags & params in MLFlow
    mlf_log_tags_params_gen ("tag", "cv", cv, "random_state", random_state, "n_jobs", n_jobs, "n_iter", n_iter, "n_trials", n_trials, "init_points", init_points)
    mlf_log_tags_params_gen ("tag", "model_name", "KNN_best_randomsearch", "model_description", "Best Random Search KNN", "X_train_path", X_train_path, "y_train_path", y_train_path, "X_test_path", X_test_path, "y_test_path", y_test_path)
    mlf_log_tags_params_gen("param", "n_neighbors", best_params['n_neighbors'], "weights", best_params['weights'], "algorithm", best_params['algorithm'], "leaf_size", best_params['leaf_size'], "p", best_params['p'])    # Values for getting AUC
    
    # Values for getting AUC
    randomsearch_knn_fpr, randomsearch_knn_tpr, threshold = roc_curve(y_test, y_pred_randomsearch_knn)
    auc_randomsearch_knn = auc(randomsearch_knn_fpr, randomsearch_knn_tpr)

    # Log metrics & model in MLFlow
    mlf_log_metrics_models(classification_report(y_test, y_pred, output_dict=True), best_knn, "SVC_best_RandomSearch", auc_randomsearch_knn)
    log_metrics_auc_intervals(randomsearch_knn_fpr, randomsearch_knn_tpr)

    # End MLFlow run
    mlflow.end_run()

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] END algorithm=auto, leaf_size=34, n_neighbors=3, p=1, weights=distance; total time=   0.3s
[CV] END algorithm=auto, leaf_size=34, n_neighbors=3, p=1, weights=distance; total time=   0.4s
[CV] END algorithm=auto, leaf_size=34, n_neighbors=3, p=1, weights=distance; total time=   0.3s
[CV] END algorithm=auto, leaf_size=30, n_neighbors=3, p=1, weights=distance; total time=   0.4s
[CV] END algorithm=auto, leaf_size=30, n_neighbors=3, p=1, weights=distance; total time=   0.3s
[CV] END algorithm=auto, leaf_size=30, n_neighbors=3, p=1, weights=distance; total time=   0.3s
CPU times: total: 3.89 s
Wall time: 8.46 s


### SVC Bayesian optimization

In [19]:
# #InvalidParameterError: The 'n_neighbors' parameter of KNeighborsClassifier must be an int in the range [1, inf) or None. Got 4.0 instead.

# %%time

# # SVM Bayesian optimization - BayesianOptimization
# from bayes_opt import BayesianOptimization

# # KNN objective function with cross-validation
# def knn_cv(n_neighbors: int):
#     n_neighbors = int(n_neighbors)
#     print("Type of n_neighbors:", type(n_neighbors))
#     print("Value of n_neighbors:", n_neighbors)

#     # Ensure n_neighbors is within allowed range (optional but recommended)
#     if n_neighbors < 1 or n_neighbors >= 30:
#         raise ValueError("n_neighbors must be between 1 and 29 (inclusive)")
    
#     knn = KNeighborsClassifier(n_neighbors=n_neighbors)
#     scores = cross_val_score(knn, X_train, y_train, cv=cv, scoring='accuracy')
#     return np.mean(scores)

# # Define the search space
# pbounds_knn = {'n_neighbors': (3, 30)}
# #pbounds_knn = dict(n_neighbors=range(3, 30))

# # Create the Bayesian optimizer for KNN
# bayes_optimizer_knn = BayesianOptimization(
#     f=knn_cv,
#     pbounds=pbounds_knn,
#     random_state=random_state
# )

# # Perform the search on the training data
# bayes_optimizer_knn.maximize(init_points=init_points, n_iter=n_iter)

# # Start MLFlow run
# with mlflow.start_run(experiment_id=mlf_exp_id, run_name= mlf_key + '_KNN_best_BayesianOptimization'):  
#     #mlflow.sklearn.autolog()
    
#     # Get hyperparameters of the best trained model
#     best_params = bayes_optimizer_knn.max['params']

#     # Get best trained model    
#     best_knn = KNeighborsClassifier(**best_params)
#     # train & predict
#     best_knn.fit(X_train, y_train)
#     y_pred = best_knn.predict(X_test)
#     y_pred_bayesianoptimization_knn = best_knn.decision_function(X_test)

#     # Log tags & params in MLFlow
#     mlf_log_tags_params_gen ("tag", "cv", cv, "random_state", random_state, "n_jobs", n_jobs, "n_iter", n_iter, "n_trials", n_trials, "init_points", init_points)
#     mlf_log_tags_params_gen ("tag", "model_name", "KNN_best_BayesianOptimization", "model_description", "Best Bayesian optimization KNN", "X_train_path", X_train_path, "y_train_path", y_train_path, "X_test_path", X_test_path, "y_test_path", y_test_path)
#     mlf_log_tags_params_gen("param", "n_neighbors", best_params['n_neighbors'], "weights", best_params['weights'], "algorithm", best_params['algorithm'], "leaf_size", best_params['leaf_size'], "p", best_params['p'])    # Values for getting AUC
    
#     # Values for getting AUC
#     bayesianoptimization_knn_fpr, bayesianoptimization_knn_tpr, threshold = roc_curve(y_test, y_pred_bayesianoptimization_knn)
#     auc_bayesianoptimization_knn = auc(bayesianoptimization_knn_fpr, bayesianoptimization_knn_tpr)

#     # Log metrics & model in MLFlow
#     mlf_log_metrics_models(classification_report(y_test, y_pred, output_dict=True), best_knn, "KNN_best_BayesianOptimization", auc_bayesianoptimization_knn)
#     log_metrics_auc_intervals(bayesianoptimization_knn_fpr, bayesianoptimization_knn_tpr)

#     # End MLFlow run
#     mlflow.end_run()

### SVC Bayes search

In [20]:
%%time

# SVM Bayesian optimization - BayesSearchCV
from skopt import BayesSearchCV

# Define the search space
param_space = {
    'C': (c_min, c_max, 'log-uniform'),  
    'gamma': (gamma_min, gamma_max, 'log-uniform')  
}

# Initialize BayesSearchCV object
bayes_search_svm = BayesSearchCV(
    SVC(random_state=random_state), # pasar svc_model
    param_space,
    n_iter=n_iter,
    n_jobs=n_jobs,
    cv=cv
)

#print(bayes_search_svm.best_params_)

# Perform the search on the training data
bayes_search_svm.fit(X_train, y_train)

# Start MLFlow run
with mlflow.start_run(experiment_id=mlf_exp_id, run_name= mlf_key + '_SVC_best_BayesSearch'): 
    #mlflow.sklearn.autolog()
    
    # Get hyperparameters of the best trained model
    best_params = bayes_search_svm.best_params_
    
    # Get best trained model
    best_svc = bayes_search_svm.best_estimator_
    # train & predict
    
    y_pred = best_svc.predict(X_test)
    y_pred_bayessearch = best_svc.decision_function(X_test)
    
    # Log tags & params in MLFlow
    mlf_log_tags_gen (cv, random_state, n_jobs, n_iter, n_trials, init_points, c_min, c_max, gamma_min, gamma_max)
    mlf_log_tags_params("SVC_best_BayesSearch", "Best Bayes Search SVC", best_params['C'], best_params['gamma'], best_svc.get_params()['kernel'], X_train_path, y_train_path, X_test_path, y_test_path)
    
    # Values for getting AUC
    bayessearch_svc_fpr, bayessearch_svc_tpr, threshold = roc_curve(y_test, y_pred_bayessearch)
    auc_bayessearch_svc = auc(bayessearch_svc_fpr, bayessearch_svc_tpr)
    
    # Log metrics & model in MLFlow
    mlf_log_metrics_models(classification_report(y_test, y_pred, output_dict=True), best_svc, "SVC_best_BayesSearch", auc_bayessearch_svc)
    log_metrics_auc_intervals(bayessearch_svc_fpr, bayessearch_svc_tpr)
    
    # End MLFlow run
    mlflow.end_run()

NameError: name 'SVC' is not defined

### SVC Optuna

In [None]:
# %%time

# import optuna
# from optuna.samplers import TPESampler
# #import optuna.visualization as vis

# # SVM objective function
# def objective(trial):
#     C = trial.suggest_float('C', c_min, c_max, log=True)
#     gamma = trial.suggest_float('gamma', gamma_min, gamma_max, log=True)    
#     kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf'])

#     svc = SVC(C=C, gamma=gamma, kernel=kernel)
    
#     # Supongamos que X_train, y_train están definidos
#     score = cross_val_score(svc, X_train, y_train, n_jobs=n_jobs, cv=cv).mean()

#     # To save trials in current MLFlow run
#     # with mlflow.start_run(experiment_id=mlf_exp_id, run_name= mlf_key + '_SVC_trial_Optuna_' + str(trial.number)):  
#     #     mlflow.log_params(trial.params)
#     #     mlflow.log_metric('score', score)
#     #     # Set Optuna's trial id as the parent run ID for tracking
#     #     mlflow.set_tag("parent_id", str(trial.number))
#     #     mlflow.end_run()
#     return score

# # Perform the search on the training data
# pruner = optuna.pruners.MedianPruner()
# study = optuna.create_study(sampler=TPESampler(), direction='maximize', pruner=pruner)
# study.optimize(objective, n_trials=n_trials)

# # Start MLFlow run
# with mlflow.start_run(experiment_id=mlf_exp_id, run_name= mlf_key + '_SVC_best_Optuna'):  
#     #mlflow.sklearn.autolog()
    
#     # Get hyperparameters of the best trained model
#     best_params = study.best_params
    
#     # Get best trained model
#     best_svc = SVC(C=best_params['C'], gamma=best_params['gamma'], kernel=best_params['kernel'])
#     best_svc.fit(X_train, y_train)
    
#     # train & predict
#     y_pred = best_svc.predict(X_test)
#     y_pred_optuna = best_svc.decision_function(X_test)

#     # Log tags & params in MLFlow
#     mlf_log_tags_gen (cv, random_state, n_jobs, n_iter, n_trials, init_points, c_min, c_max, gamma_min, gamma_max)
#     mlf_log_tags_params("SVC_best_Optuna", "Best Optuna SVC", best_params['C'], best_params['gamma'], best_params['kernel'], X_train_path, y_train_path, X_test_path, y_test_path)

#     # Values for getting AUC
#     optuna_svc_fpr, optuna_svc_tpr, threshold = roc_curve(y_test, y_pred_optuna)
#     auc_optuna_svc = auc(optuna_svc_fpr, optuna_svc_tpr)

#     # Log metrics & model in MLFlow
#     mlf_log_metrics_models(classification_report(y_test, y_pred, output_dict=True), best_svc, "SVC_best_Optuna", auc_optuna_svc)
#     log_metrics_auc_intervals(optuna_svc_fpr, optuna_svc_tpr)

#     # End MLFlow run
#     mlflow.end_run()

In [None]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12, 12), dpi=150)
# plt.plot(optuna_svc_fpr, optuna_svc_tpr, label='Optuna SVC (auc = %0.3f)' % auc_optuna_svc)
# plt.plot(bayessearch_svc_fpr, bayessearch_svc_tpr, label='Bayes Search SVC (auc = %0.3f)' % auc_bayessearch_svc)
# plt.plot(bayesianoptimization_svc_fpr, bayesianoptimization_svc_tpr, label='Bayesian optimization SVC (auc = %0.3f)' % auc_bayesianoptimization_svc)
# plt.plot(randomsearch_svc_fpr, randomsearch_svc_tpr, label='Random search SVC (auc = %0.3f)' % auc_randomsearch_svc)
# plt.plot(gridsearch_svc_fpr, gridsearch_svc_tpr, label='Grid search SVC (auc = %0.3f)' % auc_gridsearch_svc)
# plt.plot(base_fpr, base_tpr, label='Base (auc = %0.3f)' % auc_base)

# plt.xlabel('False Positive Rate -->')
# plt.ylabel('True Positive Rate -->')

# plt.legend()

# plt.show()

DecisionTreeClassifier
RandomForestClassifier 

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.datasets import load_iris

# # Load the Iris dataset
# iris = load_iris()
# X = iris.data
# y = iris.target

# # Create a Random Forest Classifier model
# clf = RandomForestClassifier(n_estimators=100, random_state=42)

# # Train the model on the training data
# clf.fit(X_train, y_train)

# # Make predictions on the test data
# y_pred = clf.predict(X_test)