# DSRP - Machine Learning III

In [69]:
import pandas as pd
import numpy as np

from loguru import logger

import dagshub

import mlflow

import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

from hyperopt import fmin, tpe, space_eval, Trials, STATUS_OK, hp
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import optuna

# clasificación
from sklearn.metrics import accuracy_score,classification_report, precision_score, auc, recall_score, f1_score, roc_curve, roc_auc_score
# regresión
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

dagshub.init(repo_owner='abdala9512', repo_name='dsrp-machine-learning-engineering-3', mlflow=True)

  from .autonotebook import tqdm as notebook_tqdm


# Métricas Clasificación

In [4]:
bank_marketing_raw_data = pd.read_csv("../data/bank-marketing.csv")

In [5]:
bank_marketing_raw_data

Unnamed: 0,age,age group,eligible,job,salary,marital,education,marital-education,targeted,default,...,contact,day,month,duration,campaign,pdays,previous,poutcome,y,response
0,58,5,Y,management,100000,married,tertiary,married-tertiary,yes,no,...,unknown,5,may,261,1,-1,0,unknown,no,0
1,44,4,Y,technician,60000,single,secondary,single-secondary,yes,no,...,unknown,5,may,151,1,-1,0,unknown,no,0
2,33,3,Y,entrepreneur,120000,married,secondary,married-secondary,yes,no,...,unknown,5,may,76,1,-1,0,unknown,no,0
3,47,4,Y,blue-collar,20000,married,unknown,married-unknown,no,no,...,unknown,5,may,92,1,-1,0,unknown,no,0
4,33,3,Y,unknown,0,single,unknown,single-unknown,no,no,...,unknown,5,may,198,1,-1,0,unknown,no,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,5,Y,technician,60000,married,tertiary,married-tertiary,yes,no,...,cellular,17,nov,977,3,-1,0,unknown,yes,1
45207,71,7,N,retired,55000,divorced,primary,divorced-primary,yes,no,...,cellular,17,nov,456,2,-1,0,unknown,yes,1
45208,72,7,N,retired,55000,married,secondary,married-secondary,yes,no,...,cellular,17,nov,1127,5,184,3,success,yes,1
45209,57,5,Y,blue-collar,20000,married,secondary,married-secondary,yes,no,...,telephone,17,nov,508,4,-1,0,unknown,no,0


In [6]:
bank_marketing_raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                45211 non-null  int64 
 1   age group          45211 non-null  int64 
 2   eligible           45211 non-null  object
 3   job                45211 non-null  object
 4   salary             45211 non-null  int64 
 5   marital            45211 non-null  object
 6   education          45211 non-null  object
 7   marital-education  45211 non-null  object
 8   targeted           45211 non-null  object
 9   default            45211 non-null  object
 10  balance            45211 non-null  int64 
 11  housing            45211 non-null  object
 12  loan               45211 non-null  object
 13  contact            45211 non-null  object
 14  day                45211 non-null  int64 
 15  month              45211 non-null  object
 16  duration           45211 non-null  int64

In [6]:
bank_marketing_raw_data["response"].value_counts()

response
0    39922
1     5289
Name: count, dtype: int64

In [30]:
CLASSIFICATION_FEATURES = ["age", "balance", "salary"]
CLASSIFICATION_RESPONSE = "response"


X = bank_marketing_raw_data[CLASSIFICATION_FEATURES]
y = bank_marketing_raw_data[CLASSIFICATION_RESPONSE]

X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X, y, random_state=1, test_size=0.25)

In [8]:
classifier = GradientBoostingClassifier()
classifier.fit(X_train, y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [9]:
y_test

3610     0
11677    0
33018    0
44323    1
8119     0
        ..
39673    0
29223    0
34939    0
28358    0
27682    0
Name: response, Length: 11303, dtype: int64

In [13]:
predictions_gbt = classifier.predict(X_test)
probabilities_gbt = classifier.predict(X_test)

In [18]:
def calculate_classification_metrics(y_real, predictions ) -> dict:
    """
    Calcula metricas de clasificacion
    """
    
    
    return {
        "accuracy": accuracy_score(y_real, predictions),
        "precision": precision_score(y_real, predictions),
        "recall": recall_score(y_real, predictions),
        "f1_score": f1_score(y_real, predictions),
        "auc_score": roc_auc_score( predictions, y_real)
        
    }

calculate_classification_metrics(y_real=y_test, predictions=predictions_gbt)

{'accuracy': 0.884278510130054,
 'precision': 0.4482758620689655,
 'recall': 0.00996168582375479,
 'f1_score': 0.019490254872563718,
 'auc_score': 0.6668379487744154}

## Métricas Regresión

In [19]:
lift_raw_data = pd.read_csv("../data/Lyftdataset.csv")

In [20]:
lift_raw_data

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-11,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16
1,2,01-01-11,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40
2,3,01-01-11,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32
3,4,01-01-11,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13
4,5,01-01-11,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,31-12-12,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119
17375,17376,31-12-12,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89
17376,17377,31-12-12,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90
17377,17378,31-12-12,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61


In [21]:
lift_raw_data["cnt"].mean()

np.float64(189.46308763450142)

In [22]:
lift_raw_data.describe()

Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,8690.0,2.50164,0.502561,6.537775,11.546752,0.02877,3.003683,0.682721,1.425283,0.496987,0.475775,0.627229,0.190098,35.676218,153.786869,189.463088
std,5017.0295,1.106918,0.500008,3.438776,6.914405,0.167165,2.005771,0.465431,0.639357,0.192556,0.17185,0.19293,0.12234,49.30503,151.357286,181.387599
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,0.0,0.0,1.0
25%,4345.5,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,4.0,34.0,40.0
50%,8690.0,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.194,17.0,115.0,142.0
75%,13034.5,3.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,48.0,220.0,281.0
max,17379.0,4.0,1.0,12.0,23.0,1.0,6.0,1.0,4.0,1.0,1.0,1.0,0.8507,367.0,886.0,977.0


In [23]:
lift_raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


In [24]:
REGRESSION_FEATURES = ["temp", "hum", "holiday"]
REGRESSION_RESPONSE = "cnt"

X = lift_raw_data[REGRESSION_FEATURES]
y = lift_raw_data[REGRESSION_RESPONSE]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.25)

In [25]:
regressor = GradientBoostingRegressor()
regressor.fit(X_train, y_train)

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [26]:
predictions_rgbt = regressor.predict(X_test)

In [27]:
def calculate_regression_metrics(y_real, predictions) -> dict:
    """calculo de metricas de regresion
    """
    return {
        "mse": mean_squared_error(y_real, predictions),
        "rmse": mean_squared_error(y_real, predictions)**0.5,
        "mae":mean_absolute_error(y_real, predictions),
        "mape": mean_absolute_percentage_error(y_real, predictions)
    }

calculate_regression_metrics(y_real=y_test, predictions=predictions_rgbt)

{'mse': 24251.08150394049,
 'rmse': 155.7275874851354,
 'mae': 115.79177047173098,
 'mape': 4.9329785109117195}

In [28]:
np.mean(predictions_rgbt)

np.float64(190.92582444022773)

In [29]:
y_test - predictions_rgbt

15652    288.825199
3085       5.371410
1684      30.965416
10555      5.709704
14622    -63.004079
            ...    
10817     56.421339
8228     -21.331784
2951     -76.596529
4659    -191.332051
11093    -63.099270
Name: cnt, Length: 4345, dtype: float64

# Optimización

In [53]:
class AIOptimizer:

    def __init__(self, opt_strategy: str, search_space, algorithm ) -> None:
        self.strategy = opt_strategy
        self.search_space = search_space
        self.algorithm = algorithm

    def optimize(self):

        if self.strategy == "grid_search":
            gs_classifier = GridSearchCV(
                estimator=self.algorithm, 
                param_grid=self.search_space, 
                cv=3,
                scoring="accuracy"
            )
            gs_classifier.fit(X_train_clf, y_train_clf)
            logger.info(f"Best Score {gs_classifier.best_score_}")
            logger.info(f"Best Params {gs_classifier.best_params_}")
            return (
                gs_classifier.best_estimator_, 
                gs_classifier.best_params_, 
                gs_classifier.best_score_
            )
        elif self.strategy == "random_search":
            rs_classifier = RandomizedSearchCV(
                estimator=self.algorithm, 
                param_distributions=self.search_space, 
                cv=3,
                scoring="accuracy",
                n_iter=5
            )
            rs_classifier.fit(X_train_clf, y_train_clf)
            logger.info(f"Best Score {rs_classifier.best_score_}")
            logger.info(f"Best Params {rs_classifier.best_params_}")
            return (
                rs_classifier.best_estimator_, 
                rs_classifier.best_params_, 
                rs_classifier.best_score_
            )

classifier = GradientBoostingClassifier()

## GridSearch

In [83]:

with mlflow.start_run(run_name="gridsearch") as run:
    
    # Espacio de busqueda
    gridsearch_params = {
        "loss": ("log_loss", "exponential"),
        "learning_rate": [0.1,  0.5],
        "n_estimators": [10, 100]
    }
    
    optimizer = AIOptimizer(
        opt_strategy="grid_search",
        search_space=gridsearch_params,
        algorithm=classifier
    )
    _, params, score = optimizer.optimize()

    mlflow.log_metric("accuracy", score)
    mlflow.log_params(params)

[32m2025-10-01 20:58:04.753[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize[0m:[36m18[0m - [1mBest Score 0.8825056031930402[0m
[32m2025-10-01 20:58:04.753[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize[0m:[36m19[0m - [1mBest Params {'learning_rate': 0.1, 'loss': 'exponential', 'n_estimators': 10}[0m


🏃 View run gridsearch at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-3.mlflow/#/experiments/0/runs/2250232b3cc14a7a9a7cd3b5f7287124
🧪 View experiment at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-3.mlflow/#/experiments/0


## Random Search

In [84]:

with mlflow.start_run(run_name="randomsearch") as run:

    # Espacio de busqueda
    randomsearch_params = {
        "loss": ("log_loss", "exponential"),
        "learning_rate": [0.1, 0.001, 0.5, 0.02],
        "n_estimators": [10, 50, 100, 500],
        "min_samples_split": [2, 5, 10, 15],
        "max_depth": [2, 5, 3, 8]
    }
    
    
    optimizer = AIOptimizer(
        opt_strategy="random_search",
        search_space=randomsearch_params,
        algorithm=classifier
    )
    optimizer.optimize()
    _, params, score = optimizer.optimize()

    mlflow.log_metric("accuracy", score)
    mlflow.log_params(params)

[32m2025-10-01 20:59:25.148[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize[0m:[36m34[0m - [1mBest Score 0.8827415339771928[0m
[32m2025-10-01 20:59:25.149[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize[0m:[36m35[0m - [1mBest Params {'n_estimators': 100, 'min_samples_split': 10, 'max_depth': 2, 'loss': 'exponential', 'learning_rate': 0.1}[0m
[32m2025-10-01 21:00:17.359[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize[0m:[36m34[0m - [1mBest Score 0.8826235372731054[0m
[32m2025-10-01 21:00:17.360[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize[0m:[36m35[0m - [1mBest Params {'n_estimators': 500, 'min_samples_split': 2, 'max_depth': 5, 'loss': 'log_loss', 'learning_rate': 0.02}[0m


🏃 View run randomsearch at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-3.mlflow/#/experiments/0/runs/fe8c200ee32a44d8bd672923efb67629
🧪 View experiment at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-3.mlflow/#/experiments/0


# TPE / Hyperopt

In [85]:
tpe_search_space = {
    "loss": hp.choice("loss", ["log_loss", "exponential"]) ,
    "learning_rate": hp.normal("learning_rate", 0.1,0.01 ),
    "n_estimators": hp.quniform("n_estimators", 10, 100, 10),
    "min_samples_split": hp.quniform("min_samples_split", 2, 10 ,1),
    "max_depth": hp.quniform("max_depth", 3, 20, 1)
}

def objective(params):
    # params -> accuracy -> {mas alto posible}

    adj_params = {
         "loss": params["loss"] ,
        "learning_rate": params["learning_rate"],
        "n_estimators": int(params["n_estimators"]),
        "min_samples_split": int(params["min_samples_split"]),
        "max_depth": int(params["max_depth"])
    }
    classifier = GradientBoostingClassifier(**adj_params)
    classifier.fit(X_train_clf, y_train_clf)
    predictions = classifier.predict(X_test_clf)
    _accuracy = accuracy_score(y_test_clf, predictions)
    
    return {
        "loss": 1 - _accuracy,
        "status": STATUS_OK
    }

with mlflow.start_run(run_name="tpe_hyperopt") as run:
 

    trials = Trials()
    best   = fmin(
        fn=objective,
        space=tpe_search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=trials,
    )

    best_params = space_eval(tpe_search_space, best)
    best_params = {
             "loss": best_params["loss"] ,
            "learning_rate": best_params["learning_rate"],
            "n_estimators": int(best_params["n_estimators"]),
            "min_samples_split": int(best_params["min_samples_split"]),
            "max_depth": int(best_params["max_depth"])
        }
    
    classifier = GradientBoostingClassifier(**best_params)
    classifier.fit(X_train_clf, y_train_clf)
    predictions = classifier.predict(X_test_clf)
    logger.info(f"Best Model accuracy {accuracy_score(y_test_clf, predictions)}")

    mlflow.log_metric("accuracy", accuracy_score(y_test_clf, predictions))
    mlflow.log_params(best_params)
    

100%|███████████████████████████████████████████████████| 10/10 [00:44<00:00,  4.49s/trial, best loss: 0.11483676899938067]


[32m2025-10-01 21:01:05.428[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m53[0m - [1mBest Model accuracy 0.8851632310006193[0m


🏃 View run tpe_hyperopt at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-3.mlflow/#/experiments/0/runs/4f72e7d7584042c3ba459738bdae7bb6
🧪 View experiment at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-3.mlflow/#/experiments/0


# Optuna

In [89]:
def objective_optuna(trial):
    # params -> accuracy -> {mas alto posible}

    params = {
        "loss": trial.suggest_categorical("loss", ["log_loss", "exponential"]),
        "max_depth": trial.suggest_int("max_depth", 5, 20)
    }
    classifier = GradientBoostingClassifier(**params)
    classifier.fit(X_train_clf, y_train_clf)
    predictions = classifier.predict(X_test_clf)
    _accuracy = accuracy_score(y_test_clf, predictions)
    
    return 1 - _accuracy


with mlflow.start_run(run_name="tpe_optuna") as run:

    study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler())
    study.optimize(objective_optuna, n_trials=10)

    mlflow.log_metric("accuracy", 1 - study.best_value)
    mlflow.log_params(study.best_params)

[I 2025-10-01 21:01:41,176] A new study created in memory with name: no-name-29bf0685-f52a-4336-8b86-c2b4fcb7a467
[I 2025-10-01 21:01:45,441] Trial 0 finished with value: 0.11704857117579404 and parameters: {'loss': 'log_loss', 'max_depth': 10}. Best is trial 0 with value: 0.11704857117579404.
[I 2025-10-01 21:01:49,102] Trial 1 finished with value: 0.11713704326285057 and parameters: {'loss': 'log_loss', 'max_depth': 9}. Best is trial 0 with value: 0.11704857117579404.
[I 2025-10-01 21:02:00,609] Trial 2 finished with value: 0.12111828718039463 and parameters: {'loss': 'exponential', 'max_depth': 17}. Best is trial 0 with value: 0.11704857117579404.
[I 2025-10-01 21:02:02,507] Trial 3 finished with value: 0.11510218526055027 and parameters: {'loss': 'exponential', 'max_depth': 5}. Best is trial 3 with value: 0.11510218526055027.
[I 2025-10-01 21:02:12,385] Trial 4 finished with value: 0.12430328231442977 and parameters: {'loss': 'log_loss', 'max_depth': 16}. Best is trial 3 with value

🏃 View run tpe_optuna at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-3.mlflow/#/experiments/0/runs/c01f05e09f6a4b39a277d6734b95c95f
🧪 View experiment at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-3.mlflow/#/experiments/0


## Algoritmos evolutivos

In [77]:
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Categorical, Integer, Continuous

In [None]:
with mlflow.start_run(run_name="ga_search") as run:


    ga_search_space = {
        "loss": Categorical(["log_loss", "exponential"]) ,
        "learning_rate": Continuous( 0.001,0.1, distribution="uniform" ),
        "n_estimators": Integer(10, 100),
        "min_samples_split": Integer(2, 10 ),
        "max_depth": Integer(3, 20)
    }
    
    evolution_classifier = GASearchCV(
        estimator=classifier,
        scoring="accuracy",
        param_grid=ga_search_space,
        population_size=3,
        generations=3,
        verbose=True,
    )
    evolution_classifier.fit(X_train_clf, y_train_clf)
    mlflow.log_metric("accuracy", evolution_classifier.best_score)
    mlflow.log_params(evolution_classifier.best_params)

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	3     	0.876538	0.00585297 	0.882063   	0.868438   
1  	6     	0.877512	0.00226611 	0.879114   	0.874307   


In [82]:
evolution_classifier.best_estimator_

0,1,2
,loss,'exponential'
,learning_rate,np.float64(0....6952107849916)
,n_estimators,69
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,9
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,11
,min_impurity_decrease,0.0
