# Modelos supervisados Avanzados

In [25]:
import pandas as pd
import numpy as np
import mlflow

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


from sklearn.metrics import accuracy_score, f1_score

In [6]:
import dagshub

dagshub.init(repo_owner='abdala9512',
             repo_name='dsrp-machine-learning-engineering-2',
             mlflow=True)

# Baseline

In [39]:
df = pd.read_csv("../data/raw/hotel_bookings.csv")
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [40]:
FEATURES = ["lead_time", "stays_in_week_nights", "children", "adr", "booking_changes" ]
X, y = df[FEATURES], df["is_canceled"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

In [16]:
mlflow.set_tracking_uri("https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow")

In [53]:
mlflow.create_experiment("DSRP - Booking Clase 14 Julio - Con más variables y Loggeando modelos")

'3'

In [54]:
mlflow.set_experiment("DSRP - Booking Clase 14 Julio - Con más variables y Loggeando modelos")

<Experiment: artifact_location='mlflow-artifacts:/883f731294df4543ba85bfff9c149c8e', creation_time=1752547016689, experiment_id='3', last_update_time=1752547016689, lifecycle_stage='active', name='DSRP - Booking Clase 14 Julio - Con más variables y Loggeando modelos', tags={}>

In [55]:
mlflow.autolog(log_models=True,)
with mlflow.start_run(run_name="Baseline - Dummy Classifier - Con métricas") as run:

    algorithm = DummyClassifier(strategy="most_frequent")
    algorithm.fit(X_train, y_train)

    predictions = algorithm.predict(X_test)

    _accuracy_score = accuracy_score(y_test, predictions)
    _f1_score = f1_score(y_test, predictions)
    
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }
        
    )

2025/07/14 21:37:00 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/07/14 21:37:00 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/07/14 21:37:00 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


🏃 View run Baseline - Dummy Classifier - Con métricas at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3/runs/110dcc0030154dc5800155c4bb612521
🧪 View experiment at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3


# Regresión Logistica

In [56]:
with mlflow.start_run(run_name="Regresión logistica") as run:

    algorithm = LogisticRegression()
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),
            ("reg_logistica", algorithm)
        ]
    )
    pipeline.fit(X_train, y_train)

    predictions = pipeline.predict(X_test)

    _accuracy_score = accuracy_score(y_test, predictions)
    _f1_score = f1_score(y_test, predictions)
    
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }
        
    )



🏃 View run Regresión logistica at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3/runs/a92ca045086043e9abb3f3d8dd4b697b
🧪 View experiment at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3


# Ensamble 1: Bagging

In [57]:
with mlflow.start_run(run_name="Bagging") as run:

    algorithm = BaggingClassifier()
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),
            ("bagging", algorithm)
        ]
    )
    pipeline.fit(X_train, y_train)

    predictions = pipeline.predict(X_test)

    _accuracy_score = accuracy_score(y_test, predictions)
    _f1_score = f1_score(y_test, predictions)
    
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }
        
    )



🏃 View run Bagging at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3/runs/a73c5abb0b354c54a43465827e8364e9
🧪 View experiment at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3


# Ensamble 2: Random Forest

In [58]:
with mlflow.start_run(run_name="Random Forest") as run:

    algorithm = RandomForestClassifier()
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),
            ("rf", algorithm)
        ]
    )
    pipeline.fit(X_train, y_train)

    predictions = pipeline.predict(X_test)

    _accuracy_score = accuracy_score(y_test, predictions)
    _f1_score = f1_score(y_test, predictions)
    
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }
        
    )



🏃 View run Random Forest at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3/runs/53bc093304884fa1a37948a1c57adfc5
🧪 View experiment at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3


# Ensamble 3: XGboost

In [59]:
with mlflow.start_run(run_name="XGboost 2") as run:

    algorithm = XGBClassifier(
        max_depth=5,
        n_estimators=100
    )
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),
            ("xgb", algorithm)
        ]
    )
    pipeline.fit(X_train, y_train)

    predictions = pipeline.predict(X_test)

    _accuracy_score = accuracy_score(y_test, predictions)
    _f1_score = f1_score(y_test, predictions)
    
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }
        
    )



🏃 View run XGboost 2 at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3/runs/5def7b8358424b3f80001fde1b8d3885
🧪 View experiment at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3


# Ensamble 4: LGBM

In [60]:
with mlflow.start_run(run_name="LGBM") as run:

    algorithm = LGBMClassifier()
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),
            ("lgbm", algorithm)
        ]
    )
    pipeline.fit(X_train, y_train)

    predictions = pipeline.predict(X_test)

    _accuracy_score = accuracy_score(y_test, predictions)
    _f1_score = f1_score(y_test, predictions)
    
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }
        
    )



[LightGBM] [Info] Number of positive: 33167, number of negative: 56375
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000912 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 555
[LightGBM] [Info] Number of data points in the train set: 89542, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.370407 -> initscore=-0.530470
[LightGBM] [Info] Start training from score -0.530470




🏃 View run LGBM at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3/runs/7942d55d7f6745958da5db08e7d1ed92
🧪 View experiment at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3


# Ensamble 5: Catboost

In [61]:
with mlflow.start_run(run_name="CatBoost") as run:

    algorithm = CatBoostClassifier()
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),
            ("catboost", algorithm)
        ]
    )
    pipeline.fit(X_train, y_train)

    predictions = pipeline.predict(X_test)

    _accuracy_score = accuracy_score(y_test, predictions)
    _f1_score = f1_score(y_test, predictions)
    
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }
        
    )



Learning rate set to 0.070218
0:	learn: 0.6738107	total: 8.76ms	remaining: 8.75s
1:	learn: 0.6577171	total: 14.5ms	remaining: 7.23s
2:	learn: 0.6435684	total: 21.2ms	remaining: 7.04s
3:	learn: 0.6321459	total: 27.4ms	remaining: 6.82s
4:	learn: 0.6222569	total: 33.5ms	remaining: 6.67s
5:	learn: 0.6135038	total: 39.1ms	remaining: 6.47s
6:	learn: 0.6061515	total: 44.5ms	remaining: 6.31s
7:	learn: 0.5998937	total: 50.8ms	remaining: 6.29s
8:	learn: 0.5941178	total: 56.6ms	remaining: 6.24s
9:	learn: 0.5893167	total: 62.5ms	remaining: 6.19s
10:	learn: 0.5856666	total: 67.9ms	remaining: 6.1s
11:	learn: 0.5821017	total: 73.5ms	remaining: 6.05s
12:	learn: 0.5786853	total: 79.1ms	remaining: 6.01s
13:	learn: 0.5756694	total: 84.8ms	remaining: 5.97s
14:	learn: 0.5733002	total: 90.8ms	remaining: 5.96s
15:	learn: 0.5706374	total: 96.5ms	remaining: 5.93s
16:	learn: 0.5683691	total: 102ms	remaining: 5.92s
17:	learn: 0.5667672	total: 108ms	remaining: 5.9s
18:	learn: 0.5653223	total: 113ms	remaining: 5.8



🏃 View run CatBoost at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3/runs/40c3205f208942b98462f82375bc6f81
🧪 View experiment at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3


# Ensamble 6: Voting

In [62]:
with mlflow.start_run(run_name="Ensamble de Votos") as run:
    
    algorithm1 = CatBoostClassifier()
    algorithm2 = RandomForestClassifier()
    algorithm3 = LGBMClassifier()

    voting_clf = VotingClassifier(
        estimators=[
            ("catboost", algorithm1),
            ("rf", algorithm2),
            ("lgbm", algorithm3),
        ],
        voting="hard"
    )
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),
            ("voting", voting_clf)
        ]
    )
    pipeline.fit(X_train, y_train)

    predictions = pipeline.predict(X_test)

    _accuracy_score = accuracy_score(y_test, predictions)
    _f1_score = f1_score(y_test, predictions)
    
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }
        
    )



Learning rate set to 0.070218
0:	learn: 0.6738107	total: 7.32ms	remaining: 7.31s
1:	learn: 0.6577171	total: 12.9ms	remaining: 6.42s
2:	learn: 0.6435684	total: 19.1ms	remaining: 6.35s
3:	learn: 0.6321459	total: 25.7ms	remaining: 6.39s
4:	learn: 0.6222569	total: 31ms	remaining: 6.17s
5:	learn: 0.6135038	total: 36.5ms	remaining: 6.05s
6:	learn: 0.6061515	total: 42.2ms	remaining: 5.99s
7:	learn: 0.5998937	total: 48ms	remaining: 5.96s
8:	learn: 0.5941178	total: 54.7ms	remaining: 6.02s
9:	learn: 0.5893167	total: 60.3ms	remaining: 5.97s
10:	learn: 0.5856666	total: 65.6ms	remaining: 5.9s
11:	learn: 0.5821017	total: 71.4ms	remaining: 5.88s
12:	learn: 0.5786853	total: 77.1ms	remaining: 5.85s
13:	learn: 0.5756694	total: 82.9ms	remaining: 5.84s
14:	learn: 0.5733002	total: 89.1ms	remaining: 5.85s
15:	learn: 0.5706374	total: 95ms	remaining: 5.84s
16:	learn: 0.5683691	total: 101ms	remaining: 5.83s
17:	learn: 0.5667672	total: 107ms	remaining: 5.82s
18:	learn: 0.5653223	total: 112ms	remaining: 5.78s
19



🏃 View run Ensamble de Votos at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3/runs/afee0675613c440ba0f87d32a21d3906
🧪 View experiment at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3


# Ensamble 7: Stacking

In [63]:
with mlflow.start_run(run_name="Ensamble de Pilas") as run:
    
    algorithm1 = CatBoostClassifier()
    algorithm2 = XGBClassifier()
    algorithm3 = LGBMClassifier()

    stacking_clf = StackingClassifier(
        estimators=[
            ("catboost", algorithm1),
            ("xgb", algorithm2),
            ("lgbm", algorithm3),
        ],
        final_estimator=RandomForestClassifier()
    )
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),
            ("staking", stacking_clf)
        ]
    )
    pipeline.fit(X_train, y_train)

    predictions = pipeline.predict(X_test)

    _accuracy_score = accuracy_score(y_test, predictions)
    _f1_score = f1_score(y_test, predictions)
    
    mlflow.log_metrics(
        {
            "accuracy": _accuracy_score,
            "f1": _f1_score
        }
        
    )



Learning rate set to 0.070218
0:	learn: 0.6738107	total: 7.02ms	remaining: 7.01s
1:	learn: 0.6577171	total: 12.9ms	remaining: 6.42s
2:	learn: 0.6435684	total: 18.9ms	remaining: 6.27s
3:	learn: 0.6321459	total: 24.8ms	remaining: 6.17s
4:	learn: 0.6222569	total: 30.5ms	remaining: 6.08s
5:	learn: 0.6135038	total: 36.3ms	remaining: 6.02s
6:	learn: 0.6061515	total: 42.1ms	remaining: 5.98s
7:	learn: 0.5998937	total: 48ms	remaining: 5.95s
8:	learn: 0.5941178	total: 54ms	remaining: 5.95s
9:	learn: 0.5893167	total: 60.3ms	remaining: 5.97s
10:	learn: 0.5856666	total: 65.4ms	remaining: 5.88s
11:	learn: 0.5821017	total: 70.8ms	remaining: 5.83s
12:	learn: 0.5786853	total: 76.7ms	remaining: 5.83s
13:	learn: 0.5756694	total: 82.4ms	remaining: 5.8s
14:	learn: 0.5733002	total: 88.4ms	remaining: 5.81s
15:	learn: 0.5706374	total: 94.3ms	remaining: 5.8s
16:	learn: 0.5683691	total: 99.9ms	remaining: 5.78s
17:	learn: 0.5667672	total: 106ms	remaining: 5.76s
18:	learn: 0.5653223	total: 111ms	remaining: 5.73s




[LightGBM] [Info] Number of positive: 26533, number of negative: 45100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000728 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 553
[LightGBM] [Info] Number of data points in the train set: 71633, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.370402 -> initscore=-0.530493
[LightGBM] [Info] Start training from score -0.530493




[LightGBM] [Info] Number of positive: 26534, number of negative: 45100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000914 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 554
[LightGBM] [Info] Number of data points in the train set: 71634, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.370411 -> initscore=-0.530455
[LightGBM] [Info] Start training from score -0.530455




[LightGBM] [Info] Number of positive: 26534, number of negative: 45100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000721 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 555
[LightGBM] [Info] Number of data points in the train set: 71634, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.370411 -> initscore=-0.530455
[LightGBM] [Info] Start training from score -0.530455




[LightGBM] [Info] Number of positive: 26534, number of negative: 45100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000681 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 554
[LightGBM] [Info] Number of data points in the train set: 71634, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.370411 -> initscore=-0.530455
[LightGBM] [Info] Start training from score -0.530455




🏃 View run Ensamble de Pilas at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3/runs/863e7f3ae6dc4beba4d5f86eb8fcb23f
🧪 View experiment at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-2.mlflow/#/experiments/3
