# Actividad: MLOps

### Configuración y carga de datos

In [0]:
# Importaciones
import mlflow
import mlflow.sklearn
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Select the MLflow experiment that every run in this notebook is logged to.
# NOTE(review): no explicit create_experiment call is made; set_experiment is
# presumably enough on its own when the experiment is missing — confirm.
EXPERIMENT_PATH = "/Users/caog.freeman@gmail.com/wine_quality_mlops"
mlflow.set_experiment(EXPERIMENT_PATH)

# Load the wine dataset: features as a DataFrame with named columns, target as-is
wine = load_wine()
X = pd.DataFrame(data=wine.data, columns=wine.feature_names)
y = wine.target

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

### Entrenamiento sin logging de MLflow

In [0]:
# Baseline model trained with no experiment tracking at all
model_v1 = RandomForestClassifier(random_state=42, n_estimators=100)
model_v1.fit(X_train, y_train)

# Evaluate on the held-out split
predictions = model_v1.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy: {accuracy}")

# Question to reflect on: what happens if I want to reproduce this tomorrow?
# What information is lost by not saving any data about the experiment?

### Experimento con logging de MLflow

In [0]:
# Same baseline model, this time tracked with MLflow
with mlflow.start_run(run_name="rf_baseline") as run:

    # Hyperparameters: logged to the run, then used to build the model
    params = dict(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        random_state=42,
    )
    mlflow.log_params(params)

    # Fit the classifier and predict on the held-out split
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Evaluation metrics recorded on the run
    metrics = {}
    metrics["accuracy"] = accuracy_score(y_test, predictions)
    metrics["f1_macro"] = f1_score(y_test, predictions, average='macro')
    mlflow.log_metrics(metrics)

    # Log the model and register it under "wine_classifier".
    # IMPORTANT: logging a model signature is good practice, and the original
    # note states it is mandatory on Databricks. More information:
    # https://mlflow.org/docs/latest/ml/model/signatures/#how-to-log-models-with-signatures
    # NOTE(review): the input_example passed here is presumably what lets
    # MLflow infer that signature — confirm against the MLflow docs.
    mlflow.sklearn.log_model(
        model,
        "model",
        input_example=X_train.head(5),
        registered_model_name="wine_classifier",
    )

    # Confusion matrix rendered as a heatmap and attached to the run as an artifact
    cm = confusion_matrix(y_test, predictions)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    cm_path = "confusion_matrix.png"
    fig.savefig(cm_path)
    mlflow.log_artifact(cm_path)

    print(f"Run ID: {run.info.run_id}")
    print(f"Metrics: {metrics}")

### Experimentando con hiperparámetros

In [0]:
# Try several hyperparameter configurations, one MLflow run per configuration
hyperparameter_configs = [
    dict(n_estimators=50, max_depth=5),
    dict(n_estimators=100, max_depth=10),
    dict(n_estimators=200, max_depth=None),
]

# Best result seen so far across the sweep
best_run_id, best_accuracy = None, 0

for config_number, params in enumerate(hyperparameter_configs, start=1):
    with mlflow.start_run(run_name=f"rf_config_{config_number}") as sweep_run:

        # Record the configuration together with its 1-based position
        mlflow.log_params(params)
        mlflow.log_param("config_number", config_number)

        model = RandomForestClassifier(random_state=42, **params)
        model.fit(X_train, y_train)

        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)

        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(model, "model", input_example=X_train.head(5))

        # Keep the run with the strictly highest accuracy (first one wins ties)
        if accuracy > best_accuracy:
            best_accuracy, best_run_id = accuracy, sweep_run.info.run_id

        print(f"Config {config_number} - Accuracy: {accuracy:.4f}")

print(f"\nBest Run ID: {best_run_id} with Accuracy: {best_accuracy:.4f}")

### Next steps
1. Explorar la interfaz de MLflow
2. Explorar la interfaz del model registry
3. Desplegar nuestro modelo en un endpoint
4. Explorar la interfaz de jobs para desplegar nuestro modelo como un Job

### Usando el endpoint

In [0]:
import os
import requests
import numpy as np
import pandas as pd
import json

def create_tf_serving_json(data):
    """Wrap *data* in an ``{'inputs': ...}`` payload.

    A dict keeps its keys and has each value converted with ``.tolist()``;
    any other object is converted with a single ``.tolist()`` call.
    """
    if isinstance(data, dict):
        inputs = {key: value.tolist() for key, value in data.items()}
    else:
        inputs = data.tolist()
    return {'inputs': inputs}

# Remember that the endpoint URL and the token are secrets. NEVER write them
# as plain text; they are read from environment variables instead.

def score_model(dataset, *, timeout=300):
    """Send *dataset* to the model-serving endpoint and return its JSON reply.

    The endpoint URL is read from the ``DATABRICKS_ENDPOINT_URL`` environment
    variable and the bearer token from ``DATABRICKS_TOKEN``.

    Args:
        dataset: A pandas DataFrame, sent under the ``dataframe_split`` key,
            or any other object accepted by ``create_tf_serving_json``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: If either environment variable is missing or empty.
        Exception: If the endpoint answers with a status code other than 200.
    """
    url = os.environ.get('DATABRICKS_ENDPOINT_URL', '')
    token = os.environ.get('DATABRICKS_TOKEN', '')
    # Fail fast with an actionable message instead of an opaque HTTP-layer error.
    if not url or not token:
        raise ValueError(
            'Set the DATABRICKS_ENDPOINT_URL and DATABRICKS_TOKEN environment '
            'variables before calling score_model'
        )

    headers = {'Authorization': f'Bearer {token}', 'Content-Type': 'application/json'}
    if isinstance(dataset, pd.DataFrame):
        ds_dict = {'dataframe_split': dataset.to_dict(orient='split')}
    else:
        ds_dict = create_tf_serving_json(dataset)
    data_json = json.dumps(ds_dict, allow_nan=True)

    # An explicit timeout keeps an unresponsive endpoint from blocking forever.
    response = requests.post(url, headers=headers, data=data_json, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f'Request failed with status {response.status_code}, {response.text}')
    return response.json()

 


In [0]:
# Score the held-out test set through the serving endpoint and show the raw reply
results = score_model(dataset=X_test)
print(results)