In [5]:
import os
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
import pickle
import numpy as np

##Carga de los Datos
Se hace la carga de los datos desde la ruta Data y se separa el dataframe solo con los datos de vino blanco

In [6]:
mapeo_columnas = {
    'fixed acidity': 'fixed_acidity', 
    'volatile acidity': 'volatile_acidity',
    'citric acid': 'citric_acid', 
    'residual sugar': 'residual_sugar',
    'free sulfur dioxide': 'free_sulfur_dioxide',
    'total sulfur dioxide': 'total_sulfur_dioxide'
}


In [7]:
df_wine_quality = pd.read_csv("../data/winequalityN.csv")
df_wine_quality = df_wine_quality.rename(columns=mapeo_columnas)
datos_white = df_wine_quality[df_wine_quality['type'] == 'white']
datos_white.head()

Unnamed: 0,type,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


##Separar los datos en Test y Train

In [11]:
np.random.seed(101)
sample = np.random.choice(datos_white.index, size=int(0.8 * len(datos_white)), replace=False)
datos_white_train = datos_white.loc[sample]
datos_white_test = datos_white.drop(sample)
display(datos_white_train.head())
display(datos_white_test.head())

Unnamed: 0,type,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
4576,white,5.9,0.3,0.29,1.1,0.036,23.0,56.0,0.9904,3.19,0.38,11.3,5
167,white,6.7,0.46,0.18,2.4,0.034,25.0,98.0,0.9896,3.08,0.44,12.6,7
1889,white,7.6,0.15,0.4,1.3,0.036,24.0,112.0,0.9932,3.14,0.76,10.0,5
958,white,6.6,0.2,0.32,1.1,0.039,25.0,78.0,0.9926,3.39,0.54,10.2,7
3010,white,6.7,0.25,0.31,1.35,0.061,30.5,218.0,0.99388,3.16,0.53,9.5,5


Unnamed: 0,type,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
11,white,8.6,0.23,0.4,4.2,0.035,17.0,109.0,0.9947,3.14,0.53,9.7,5
13,white,6.6,0.16,0.4,1.5,0.044,48.0,143.0,0.9912,3.54,0.52,12.4,7
14,white,8.3,0.42,0.62,19.25,0.04,41.0,172.0,1.0002,2.98,0.67,9.7,5
15,white,6.6,0.17,0.38,1.5,0.032,28.0,112.0,0.9914,3.25,0.55,11.4,7


# Prepocesar datos

Para nuestro caso de analisis vamos a aplicar una normalizacion

In [18]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [23]:
N_SAMPLES = 4898 
FEATURES = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 
            'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 
            'density', 'pH', 'sulphates', 'alcohol']
TARGET = 'quality'

datos_white = pd.DataFrame({
    col: np.random.rand(N_SAMPLES) * np.random.randint(1, 10) for col in FEATURES
})
datos_white[TARGET] = np.random.randint(4, 9, N_SAMPLES)

X = datos_white[FEATURES]
y = datos_white[TARGET]

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.25 * 0.80 = 0.20 (20% para Validation)


preprocessor_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor_pipeline.fit(X_train)

X_train_scaled = preprocessor_pipeline.transform(X_train)
X_val_scaled = preprocessor_pipeline.transform(X_val)
X_test_scaled = preprocessor_pipeline.transform(X_test)

# 4. Verificación y Persistencia (MLOps)
print(f"Media de 'alcohol' en X_train original: {X_train['alcohol'].mean():.4f}")
print(f"Media de 'alcohol' en X_train escalado: {X_train_scaled[:, X_train.columns.get_loc('alcohol')].mean():.4f}") # La media debe ser cercana a 0

# Guardar el Pipeline (¡Esto es clave para el MLOps!)
# Debes guardar el objeto preprocessor_pipeline para usarlo en producción
# cuando llegue un nuevo dato de vino.

OUTPUT_DIR = "./data/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

with open(os.path.join(OUTPUT_DIR, "preprocessor_pipeline.pkl"), "wb") as f:
    pickle.dump(preprocessor_pipeline, f)

print("-" * 50)
print(f"✅ Pipeline y datos escalados listos. El preprocesador se guardó en: {OUTPUT_DIR}")


Media de 'alcohol' en X_train original: 0.5015
Media de 'alcohol' en X_train escalado: -0.0000
--------------------------------------------------
✅ Pipeline y datos escalados listos. El preprocesador se guardó en: ./data/


Entrenar
Antes de ejecutar lo siguiente correr en la terminal en la misma carpeta donde esta el notebook:

Levantar en otra terminal mlflow ui

mlflow server --backend-store-uri sqlite:///backend.db

matar procesos que este arriba sudo lsof -i :5000

In [31]:
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [32]:
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://127.0.0.1:5000'


In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

experiment_name = "White_wine_experiment1_RFR"
mlflow.set_experiment(experiment_name)

with mlflow.start_run():
    rf = RandomForestRegressor(n_estimators=100, random_state=0, max_depth=10)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", 10)

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric("rmse", rmse)

2025/10/03 11:19:04 INFO mlflow.tracking.fluent: Experiment with name 'White_wine_experiment1_RFR' does not exist. Creating a new experiment.


🏃 View run respected-cat-917 at: http://127.0.0.1:5000/#/experiments/359158135218172771/runs/eb626177ae624039a3d8cf4f3c9d5631
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/359158135218172771


In [34]:
import xgboost as xgb

with mlflow.start_run():
    xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    xgb_reg.fit(X_train, y_train)
    y_pred = xgb_reg.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric("rmse", rmse)

🏃 View run lyrical-quail-738 at: http://127.0.0.1:5000/#/experiments/359158135218172771/runs/f724d6df095946448faf7c162f91f233
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/359158135218172771


In [35]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error


with mlflow.start_run(run_name="SVR_Baseline"): # Damos un nombre descriptivo a esta ejecución
    
    svr_reg = SVR(
        kernel='rbf', # Un kernel común para problemas de regresión no lineales
        C=1.0,        # Parámetro de penalización (rigidez de la restricción de error)
        gamma='scale' # Coeficiente de kernel (cuánto influye un único ejemplo de entrenamiento)
    )
    
    print("Iniciando entrenamiento SVR...")
    svr_reg.fit(X_train, y_train)
    print("Entrenamiento SVR finalizado.")
    
    y_pred = svr_reg.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    
    mlflow.log_param("kernel", 'rbf')
    mlflow.log_param("C", 1.0)
    mlflow.log_param("gamma", 'scale')
    
    mlflow.log_metric("rmse", rmse)
    
    mlflow.sklearn.log_model(svr_reg, "model_svr_baseline")
    
    print(f"SVR RMSE registrado en MLflow: {rmse:.4f}")

Iniciando entrenamiento SVR...
Entrenamiento SVR finalizado.




SVR RMSE registrado en MLflow: 1.4613
🏃 View run SVR_Baseline at: http://127.0.0.1:5000/#/experiments/359158135218172771/runs/4032e809c0484d18b3a7562e6b071ef7
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/359158135218172771
