# **Importamos librerias**

In [1]:
from etl import GetData
from feature_engineer import FeatureEngineer

  from .autonotebook import tqdm as notebook_tqdm


# **ETL**

In [2]:
get_data = GetData()

In [5]:
get_data.download_data()

In [3]:
df = get_data.create_dataset()

In [4]:
df.head()

Unnamed: 0,type,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


# **Feature Engineering**

In [5]:
feature_engineer = FeatureEngineer(df)

In [6]:
df_engineered = feature_engineer.create_features()

In [7]:
df_engineered

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [8]:
numeric_features = ['volatile_acidity', 'residual_sugar', 'density', 'alcohol']
categorical_features = []
target_column = 'quality'
 
test_size = 0.25

# **Step 3: Modelo con MLFlow + Optuna**

mlflow server --backend-store-uri sqlite:///wine_quality.db

In [9]:
from sklearn.ensemble import RandomForestClassifier
from train_with_mlflow_optuna import TrainMlflowOptuna
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [10]:
# Define RandomForest parameter distributions
param_distributions = {
    'n_estimators': ('int', 50, 200),
    'max_depth': ('int', 5, 30),
    'min_samples_split': ('int', 2, 10),
    'min_samples_leaf': ('int', 1, 5),
    'max_features': ('categorical', ['sqrt', 'log2', None])
}

# Create trainer for RandomForest
trainer = TrainMlflowOptuna(
    df=df_engineered,
    numeric_features=numeric_features,
    categorical_features=categorical_features,
    target_column=target_column,
    model_class=RandomForestClassifier,
    test_size=0.3,
    n_trials=30,
    optimization_metric='f1',  # Optimize for F1 score
    param_distributions=param_distributions,
    model_params={'random_state': 42, 'n_jobs': -1},
    mlflow_setup = mlflow
)

# Run optimization
best_pipeline, run_id, study = trainer.train()

[I 2025-10-04 12:19:56,481] A new study created in memory with name: optuna_RandomForestClassifier
[I 2025-10-04 12:19:56,626] Trial 0 finished with value: 0.4975857844314939 and parameters: {'n_estimators': 159, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 0 with value: 0.4975857844314939.


Starting Optuna optimization with 30 trials...
Optimizing for: f1
Model type: RandomForestClassifier


2025/10/04 12:19:56 INFO mlflow.tracking.fluent: Experiment with name 'optuna_RandomForestClassifier' does not exist. Creating a new experiment.
[I 2025-10-04 12:19:57,190] Trial 1 finished with value: 0.5610976784458215 and parameters: {'n_estimators': 186, 'max_depth': 11, 'min_samples_split': 7, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.5610976784458215.


🏃 View run 0 at: http://127.0.0.1:5000/#/experiments/1/runs/a9695423409d4c289c031c872a32ea4d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
🏃 View run 1 at: http://127.0.0.1:5000/#/experiments/1/runs/78ed9029ddec4180915e811930e6e344
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


[I 2025-10-04 12:19:57,428] Trial 2 finished with value: 0.5717134931943791 and parameters: {'n_estimators': 178, 'max_depth': 16, 'min_samples_split': 7, 'min_samples_leaf': 5, 'max_features': None}. Best is trial 2 with value: 0.5717134931943791.
[I 2025-10-04 12:19:57,567] Trial 3 finished with value: 0.5747200307937766 and parameters: {'n_estimators': 102, 'max_depth': 16, 'min_samples_split': 9, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 3 with value: 0.5747200307937766.


🏃 View run 2 at: http://127.0.0.1:5000/#/experiments/1/runs/649daa0635684e8db40ca19e82554a68
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
🏃 View run 3 at: http://127.0.0.1:5000/#/experiments/1/runs/3a221562bd5245fa90293bfc76f81032
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


[I 2025-10-04 12:19:57,732] Trial 4 finished with value: 0.5862222099716785 and parameters: {'n_estimators': 133, 'max_depth': 28, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 4 with value: 0.5862222099716785.
[I 2025-10-04 12:19:57,877] Trial 5 finished with value: 0.5964787937317346 and parameters: {'n_estimators': 93, 'max_depth': 14, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': None}. Best is trial 5 with value: 0.5964787937317346.


🏃 View run 4 at: http://127.0.0.1:5000/#/experiments/1/runs/5fbd1d3b92784effbb4a5a2dcf7fdd5d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
🏃 View run 5 at: http://127.0.0.1:5000/#/experiments/1/runs/715be48e15934cd384b7b70aaa9b03c9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


[I 2025-10-04 12:19:58,069] Trial 6 finished with value: 0.5739368287081377 and parameters: {'n_estimators': 174, 'max_depth': 15, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 5 with value: 0.5964787937317346.
[I 2025-10-04 12:19:58,246] Trial 7 finished with value: 0.5989615993145014 and parameters: {'n_estimators': 136, 'max_depth': 27, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 7 with value: 0.5989615993145014.


🏃 View run 6 at: http://127.0.0.1:5000/#/experiments/1/runs/ff687238172d4a64b9076631a0abc0e0
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
🏃 View run 7 at: http://127.0.0.1:5000/#/experiments/1/runs/076757fdc7a94bb799d8eb3ea0f31306
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


[I 2025-10-04 12:19:58,390] Trial 8 finished with value: 0.5759347945335258 and parameters: {'n_estimators': 104, 'max_depth': 18, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 7 with value: 0.5989615993145014.
[I 2025-10-04 12:19:58,506] Trial 9 finished with value: 0.5817207349176814 and parameters: {'n_estimators': 52, 'max_depth': 17, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': None}. Best is trial 7 with value: 0.5989615993145014.


🏃 View run 8 at: http://127.0.0.1:5000/#/experiments/1/runs/7c76f845df1c40fd939b0a2636f9c050
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
🏃 View run 9 at: http://127.0.0.1:5000/#/experiments/1/runs/329579e39ee44ed4b1b7e7f962cd4209
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


[I 2025-10-04 12:19:58,682] Trial 10 finished with value: 0.6425546584086081 and parameters: {'n_estimators': 142, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.6425546584086081.
[I 2025-10-04 12:19:58,865] Trial 11 finished with value: 0.6406632282941132 and parameters: {'n_estimators': 141, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.6425546584086081.


🏃 View run 10 at: http://127.0.0.1:5000/#/experiments/1/runs/5112a0a600d8429e98463cf037d6e4e1
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
🏃 View run 11 at: http://127.0.0.1:5000/#/experiments/1/runs/0ee6e747be6340efbb3bc2f03b607dca
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


[I 2025-10-04 12:19:59,047] Trial 12 finished with value: 0.6401326221294122 and parameters: {'n_estimators': 150, 'max_depth': 23, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.6425546584086081.
[I 2025-10-04 12:19:59,199] Trial 13 finished with value: 0.6436037827853707 and parameters: {'n_estimators': 118, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 13 with value: 0.6436037827853707.


🏃 View run 12 at: http://127.0.0.1:5000/#/experiments/1/runs/5c2d572c6fdd4c48927025ec64f3a1b4
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
🏃 View run 13 at: http://127.0.0.1:5000/#/experiments/1/runs/8683fc47344148bb92067eb7a7a2ed95
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


[I 2025-10-04 12:19:59,354] Trial 14 finished with value: 0.6307088121222432 and parameters: {'n_estimators': 115, 'max_depth': 23, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 13 with value: 0.6436037827853707.
[I 2025-10-04 12:19:59,488] Trial 15 finished with value: 0.6325280811070879 and parameters: {'n_estimators': 79, 'max_depth': 24, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 13 with value: 0.6436037827853707.


🏃 View run 14 at: http://127.0.0.1:5000/#/experiments/1/runs/f07c9746e6e4425b8c87c354f1de5243
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
🏃 View run 15 at: http://127.0.0.1:5000/#/experiments/1/runs/27048864e9574c2eaa5f32c835a9b27b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


[I 2025-10-04 12:19:59,643] Trial 16 finished with value: 0.6025434086168732 and parameters: {'n_estimators': 120, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 13 with value: 0.6436037827853707.
[I 2025-10-04 12:19:59,765] Trial 17 finished with value: 0.6133424660484174 and parameters: {'n_estimators': 73, 'max_depth': 26, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 13 with value: 0.6436037827853707.


🏃 View run 16 at: http://127.0.0.1:5000/#/experiments/1/runs/31c1e71911724bb8ad0bf6efa98c73cd
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
🏃 View run 17 at: http://127.0.0.1:5000/#/experiments/1/runs/91ef4b481ae049f5841a3ee446510d23
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


[I 2025-10-04 12:19:59,949] Trial 18 finished with value: 0.6416426291687248 and parameters: {'n_estimators': 160, 'max_depth': 20, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 13 with value: 0.6436037827853707.
[I 2025-10-04 12:20:00,130] Trial 19 finished with value: 0.6002537723340149 and parameters: {'n_estimators': 121, 'max_depth': 21, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': None}. Best is trial 13 with value: 0.6436037827853707.


🏃 View run 18 at: http://127.0.0.1:5000/#/experiments/1/runs/c4e85c9fde564ae9950bbbca540f00ba
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
🏃 View run 19 at: http://127.0.0.1:5000/#/experiments/1/runs/39ee5dc3ff0647d5a7677602da12ccdc
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


[I 2025-10-04 12:20:00,363] Trial 20 finished with value: 0.6289327140142494 and parameters: {'n_estimators': 197, 'max_depth': 25, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 13 with value: 0.6436037827853707.
[I 2025-10-04 12:20:00,543] Trial 21 finished with value: 0.637708592162187 and parameters: {'n_estimators': 162, 'max_depth': 21, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 13 with value: 0.6436037827853707.


🏃 View run 20 at: http://127.0.0.1:5000/#/experiments/1/runs/3d17fdfdc35b4d579e07d1abd911176c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
🏃 View run 21 at: http://127.0.0.1:5000/#/experiments/1/runs/b2725001c544415a8a7fe01de7a013c4
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


[I 2025-10-04 12:20:00,731] Trial 22 finished with value: 0.6310823010386437 and parameters: {'n_estimators': 156, 'max_depth': 28, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 13 with value: 0.6436037827853707.
[I 2025-10-04 12:20:00,896] Trial 23 finished with value: 0.5623936215612295 and parameters: {'n_estimators': 144, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 13 with value: 0.6436037827853707.


🏃 View run 22 at: http://127.0.0.1:5000/#/experiments/1/runs/9dce31ed9b37437ca7fe1753ed43441a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
🏃 View run 23 at: http://127.0.0.1:5000/#/experiments/1/runs/cc00253b9a474ffe98379de6fcbdb58d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


[I 2025-10-04 12:20:01,086] Trial 24 finished with value: 0.6358535533012939 and parameters: {'n_estimators': 169, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 13 with value: 0.6436037827853707.
[I 2025-10-04 12:20:01,257] Trial 25 finished with value: 0.6273209368979905 and parameters: {'n_estimators': 130, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 13 with value: 0.6436037827853707.


🏃 View run 24 at: http://127.0.0.1:5000/#/experiments/1/runs/7c0e423093c34a9a8944e3b4f560b158
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
🏃 View run 25 at: http://127.0.0.1:5000/#/experiments/1/runs/3a26df64505045f88a3083af77888162
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


[I 2025-10-04 12:20:01,413] Trial 26 finished with value: 0.5896469112817493 and parameters: {'n_estimators': 105, 'max_depth': 13, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 13 with value: 0.6436037827853707.
[I 2025-10-04 12:20:01,593] Trial 27 finished with value: 0.6263294754314146 and parameters: {'n_estimators': 149, 'max_depth': 27, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 13 with value: 0.6436037827853707.


🏃 View run 26 at: http://127.0.0.1:5000/#/experiments/1/runs/a429e6ffe79c4f3996ea0dcbf4456512
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
🏃 View run 27 at: http://127.0.0.1:5000/#/experiments/1/runs/cb0ffbdb889a47af90428aaf05987f72
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


[I 2025-10-04 12:20:01,738] Trial 28 finished with value: 0.49461317791768844 and parameters: {'n_estimators': 113, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': None}. Best is trial 13 with value: 0.6436037827853707.
[I 2025-10-04 12:20:01,937] Trial 29 finished with value: 0.6441271833031672 and parameters: {'n_estimators': 161, 'max_depth': 19, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 29 with value: 0.6441271833031672.


🏃 View run 28 at: http://127.0.0.1:5000/#/experiments/1/runs/3abda288a12147a695ee34ca46daaa89
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
🏃 View run 29 at: http://127.0.0.1:5000/#/experiments/1/runs/755c1d93816244b3877f7c55bf77db14
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1

Optimization complete!
Best f1: 0.6441
Best parameters: {'n_estimators': 161, 'max_depth': 19, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'log2'}





Best Model MLflow Run ID: 9e6c63303eb64f079bf19aea61b9885e
Tracking URI: http://127.0.0.1:5000
Train Accuracy: 0.9869
Test Accuracy: 0.6531
🏃 View run best_model_RandomForestClassifier at: http://127.0.0.1:5000/#/experiments/1/runs/9e6c63303eb64f079bf19aea61b9885e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
