# Exercise 11: Test the Randomized Search CV function
 
## SIB - Intelligent Systems for Bioinformatics

Bárbara Freitas PG55693

In [6]:
import numpy as np
import time

from si.io.csv_file import read_csv 
from si.data.dataset import Dataset
from si.metrics.accuracy import accuracy 
from si.models.logistic_regression import LogisticRegression
from si.model_selection.randomized_search import randomized_search_cv 
from si.model_selection.cross_validate import k_fold_cross_validation

# --- Step 1: load the breast-bin dataset ---
# The path is relative to the notebook's working directory.
path_to_breast = '../datasets/breast_bin/breast-bin.csv'
breast_dataset = read_csv(
    path_to_breast,
    sep=',',
    label=True,  # presumably marks a column of the CSV as the label - confirm in read_csv
)

# Banner with a quick sanity check of the loaded data dimensions.
print("--- Randomized Search CV Test Protocol ---")
print(f"Shape: {breast_dataset.shape()}")
print("-" * 50)

# --- Step 2: build the LogisticRegression estimator ---

# Starting hyperparameters; the search below is expected to replace them
# with the sampled values for each candidate.
model = LogisticRegression(
    l2_penalty=1.0,
    alpha=0.001,
    max_iter=1000,
    scale=True,
)

# --- Step 3: candidate values for the randomized search ---
# Key order is kept as l2_penalty, alpha, max_iter.
hyperparameter_grid = dict(
    # 10 evenly spaced values from 1 to 10 (inclusive)
    l2_penalty=np.linspace(1, 10, 10),
    # 100 evenly spaced values going from 0.001 down to 0.0001
    alpha=np.linspace(0.001, 0.0001, 100),
    # 200 evenly spaced values from 1000 to 2000, truncated to integers
    max_iter=np.linspace(1000, 2000, 200).astype(int),
)

# Search budget: number of sampled combinations and number of CV folds.
N_ITER = 10
CV_FOLDS = 3

print(f"Model: {type(model).__name__}")
print(f"Hyperparameters to test: {N_ITER} random combinations over {CV_FOLDS} folds.")
print("-" * 50)

# --- Step 4: run the randomized search with cross-validation ---
# `results` is read below through its 'best_score' and
# 'best_hyperparameters' entries.
results = randomized_search_cv(
    # what to tune and on which data
    model=model,
    dataset=breast_dataset,
    # search space and scoring function
    hyperparameter_grid=hyperparameter_grid,
    scoring=accuracy,
    # search budget
    cv=CV_FOLDS,
    n_iter=N_ITER,
    # fixed seed; presumably makes the sampled combinations reproducible
    random_state=42,
)

# --- Step 5: report the outcome of the search ---

print("--- Final results ---")

# Blank separator line, then the best-scoring configuration.
print()
print("Best Result:")
print(f"Best Score (Mean CV Accuracy): {results['best_score']:.4f}")
print("Best Hyperparameters Found:")
# One indented "name: value" line per tuned hyperparameter.
for param_name, param_value in results['best_hyperparameters'].items():
    print(f"  {param_name}: {param_value}")

--- Randomized Search CV Test Protocol ---
Shape: (698, 9)
--------------------------------------------------
Model: LogisticRegression
Hyperparameters to test: 10 random combinations over 3 folds.
--------------------------------------------------
--- Final results ---

Best Result:
Best Score (Mean CV Accuracy): 0.9670
Best Hyperparameters Found:
  l2_penalty: 2.0
  alpha: 0.0009090909090909091
  max_iter: 1100
