## 1. Verificación del Entorno MLflow

MLflow viene pre-instalado en Azure Databricks. Verificamos la configuración inicial.

In [None]:
# Report the installed MLflow version
import mlflow
print(f"MLflow version: {mlflow.__version__}")

# Report the tracking URI in effect (expected to be the Databricks workspace)
print(f"Tracking URI: {mlflow.get_tracking_uri()}")

# Libraries used by the rest of the notebook
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split

# Silence every warning category for the remainder of the session
warnings.filterwarnings('ignore')

print("\n‚úì Librer√≠as importadas exitosamente")

## 2. Configuración del Experimento

Creamos un experimento para organizar nuestros runs de entrenamiento.

In [None]:
# Workspace path of the experiment that collects this lab's runs.
# Replace <tu-usuario> with your Databricks user name before running.
experiment_name = "/Users/<tu-usuario>/energy-prediction-experiment"

# Make it the active experiment for every run started afterwards
mlflow.set_experiment(experiment_name)

# Look the experiment up by name to report its id and artifact location
experiment = mlflow.get_experiment_by_name(experiment_name)
print(
    f"Experiment ID: {experiment.experiment_id}\n"
    f"Artifact Location: {experiment.artifact_location}\n"
    "\n‚úì Experimento configurado exitosamente"
)

## 3. Carga y Preparación de Datos

Cargamos los datos procesados del Lab 3 o datos originales si es necesario.

In [None]:
# Option 1: load the Delta table written by Lab 3
try:
    df_energy = spark.read.format("delta").load("/delta/energy_features")
    print("‚úì Datos cargados desde Delta Lake")
except Exception as delta_error:
    # Option 2: fall back to the original CSV file.
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt / SystemExit still stop the cell, and report the
    # error type so the reason for the fallback is visible.
    print(f"Delta Lake no disponible ({type(delta_error).__name__}); usando CSV")
    df_energy = spark.read.csv("/FileStore/tables/owid-energy-data.csv", header=True, inferSchema=True)
    print("‚úì Datos cargados desde CSV original")

# Convert to pandas for this example
df = df_energy.toPandas()

print(f"\nDataset shape: {df.shape}")
print(f"Columnas disponibles: {len(df.columns)}")
display(df.head())

In [None]:
# Build the modelling dataset.
# Goal: predict the energy-consumption level.
#
# One pipeline: pick the relevant columns, keep only complete rows, then
# derive the ratio and per-capita features from the raw columns.
df_model = (
    df[[
        'year', 'population', 'gdp',
        'primary_energy_consumption',
        'renewables_consumption',
        'fossil_fuel_consumption',
    ]]
    .dropna()
    .assign(
        # "+ 1" in the denominator avoids dividing by a zero total consumption
        renewable_ratio=lambda d: d['renewables_consumption'] / (d['primary_energy_consumption'] + 1),
        energy_per_capita=lambda d: d['primary_energy_consumption'] / d['population'] * 1000000,
        fossil_ratio=lambda d: d['fossil_fuel_consumption'] / (d['primary_energy_consumption'] + 1),
    )
)

print("‚úì Features creados")
print(f"Dataset limpio: {df_model.shape}")

In [None]:
# Build the target: bucket rows by per-capita energy consumption.
# include_lowest=True keeps rows whose value is exactly 0 in 'Low'; without
# it the first interval is open on the left and pd.cut returns NaN for them.
df_model['energy_class'] = pd.cut(
    df_model['energy_per_capita'],
    bins=[0, 30, 70, 150, float('inf')],
    labels=['Low', 'Medium', 'High', 'Very High'],
    include_lowest=True,
)

# Rows pd.cut could not bin (NaN or negative energy_per_capita) have no
# label, so they cannot be encoded or stratified on: drop them up front.
unlabeled = df_model['energy_class'].isna()
if unlabeled.any():
    print(f"Filas descartadas sin clase asignada: {int(unlabeled.sum())}")
    df_model = df_model[~unlabeled].copy()

# Features (X) and target (y).
# NOTE(review): energy_class is derived from energy_per_capita, which is
# computed from primary_energy_consumption and population; both are in
# feature_cols, so the label is largely recoverable from the inputs and the
# reported scores are probably optimistic -- confirm this is intended.
feature_cols = [
    'year', 'population', 'gdp',
    'primary_energy_consumption',
    'renewables_consumption',
    'fossil_fuel_consumption',
    'renewable_ratio',
    'fossil_ratio'
]

X = df_model[feature_cols]
y = df_model['energy_class']

# Encode the class labels as integers; `le` is reused later to map
# predictions back to label names.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(f"Features shape: {X.shape}")
print(f"\nTarget distribution:")
print(y.value_counts())
print(f"\nClases: {le.classes_}")

In [None]:
# Stratified hold-out split: 20% of the rows go to the test set and the
# class proportions of y_encoded are kept in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Report the split sizes, then the per-class row counts of each part
print(f"Training set: {X_train.shape}\nTest set: {X_test.shape}")
print(
    f"\nTrain class distribution: {np.bincount(y_train)}\n"
    f"Test class distribution: {np.bincount(y_test)}"
)

## 4. Primer Modelo con MLflow Tracking

Entrenaremos un modelo Random Forest con logging completo en MLflow.

In [None]:
# Baseline Random Forest, tracked as a single MLflow run
with mlflow.start_run(run_name="random_forest_baseline") as run:

    # 1. Hyperparameters: defined once, logged, then passed to the estimator
    n_estimators = 100
    max_depth = 10
    random_state = 42
    rf_params = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "random_state": random_state,
    }

    mlflow.log_param("model_type", "RandomForest")
    for param_key, param_val in rf_params.items():
        mlflow.log_param(param_key, param_val)
    mlflow.log_param("test_size", 0.2)

    # 2. Train
    model = RandomForestClassifier(**rf_params)
    model.fit(X_train, y_train)

    # 3. Predict on both splits
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # 4. Metrics (precision / recall / F1 are class-weighted averages)
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    precision = precision_score(y_test, y_pred_test, average='weighted')
    recall = recall_score(y_test, y_pred_test, average='weighted')
    f1 = f1_score(y_test, y_pred_test, average='weighted')

    # 5. Log every metric under the key used across the notebook
    for metric_key, metric_val in (
        ("train_accuracy", train_accuracy),
        ("test_accuracy", test_accuracy),
        ("precision", precision),
        ("recall", recall),
        ("f1_score", f1),
    ):
        mlflow.log_metric(metric_key, metric_val)

    # 6. Log the fitted model and register it in the Model Registry
    mlflow.sklearn.log_model(
        model,
        "random_forest_model",
        registered_model_name="energy_classifier_rf",
    )

    print(f"‚úì Run ID: {run.info.run_id}")
    print(f"‚úì Test Accuracy: {test_accuracy:.4f}")
    print(f"‚úì F1 Score: {f1:.4f}")
    print("\n‚ûú Ve al Experiments tab para ver los resultados completos")

## 5. Visualización de Resultados

Creamos artefactos visuales que se registrarán en MLflow.

In [None]:
# Train a fresh Random Forest and attach diagnostic plots to its run
with mlflow.start_run(run_name="rf_with_visualizations") as run:

    # Fit and predict
    model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Headline metrics
    for metric_key, metric_val in (
        ("test_accuracy", accuracy_score(y_test, y_pred)),
        ("f1_score", f1_score(y_test, y_pred, average='weighted')),
    ):
        mlflow.log_metric(metric_key, metric_val)

    # 1. Confusion matrix, with axes labelled by the decoded class names
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=le.classes_,
        yticklabels=le.classes_,
    )
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    # Save before show(): the image on disk is what gets logged as artifact
    plt.savefig("/tmp/confusion_matrix.png")
    mlflow.log_artifact("/tmp/confusion_matrix.png", "visualizations")
    plt.show()

    # 2. Feature importance, most important feature first in the table
    feature_importance = pd.DataFrame(
        {'feature': feature_cols, 'importance': model.feature_importances_}
    )
    feature_importance = feature_importance.sort_values('importance', ascending=False)

    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance['feature'], feature_importance['importance'])
    plt.xlabel('Importance')
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.savefig("/tmp/feature_importance.png")
    mlflow.log_artifact("/tmp/feature_importance.png", "visualizations")
    plt.show()

    # Log the model itself (not registered in the Model Registry)
    mlflow.sklearn.log_model(model, "model")

    print("‚úì Visualizaciones creadas y registradas")

## 6. Comparación de Múltiples Modelos

Entrenaremos varios algoritmos y los compararemos.

In [None]:
def train_and_log_model(model, model_name, params, X_train, X_test, y_train, y_test):
    """Fit ``model`` inside a new MLflow run and log its params, metrics and artifact.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        model_name: Run name, also logged as the ``model_type`` param and used
            to build the artifact path and the registered model name.
        params: Mapping of hyperparameter names to values; only logged here,
            not applied to ``model``.
        X_train, X_test: Training and test features.
        y_train, y_test: Training and test labels.

    Returns:
        Tuple ``(model, test_accuracy, f1)`` where ``f1`` is the weighted F1
        score on the test set.
    """
    with mlflow.start_run(run_name=model_name):

        # Parameters
        mlflow.log_param("model_type", model_name)
        for key, value in params.items():
            mlflow.log_param(key, value)

        # Training
        model.fit(X_train, y_train)

        # Predictions on both splits
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)

        # Metrics (precision / recall / F1 are class-weighted averages)
        train_accuracy = accuracy_score(y_train, train_pred)
        test_accuracy = accuracy_score(y_test, test_pred)
        f1 = f1_score(y_test, test_pred, average='weighted')
        scores = {
            "train_accuracy": train_accuracy,
            "test_accuracy": test_accuracy,
            "precision": precision_score(y_test, test_pred, average='weighted'),
            "recall": recall_score(y_test, test_pred, average='weighted'),
            "f1_score": f1,
            # Train/test accuracy gap, logged as a simple overfitting indicator
            "overfitting": train_accuracy - test_accuracy,
        }
        for score_name, score_value in scores.items():
            mlflow.log_metric(score_name, score_value)

        # Model artifact; the registry name is the lower-cased run name with
        # spaces turned into underscores
        registry_name = f"energy_classifier_{model_name.lower().replace(' ', '_')}"
        mlflow.sklearn.log_model(
            model,
            f"{model_name}_model",
            registered_model_name=registry_name,
        )

        print(f"‚úì {model_name} - Test Accuracy: {test_accuracy:.4f}, F1: {f1:.4f}")

        return model, test_accuracy, f1

print("‚úì Funci√≥n de entrenamiento definida")

In [None]:
# Candidate models: each entry pairs an estimator with the hyperparameters
# that are applied to it (and logged) before training.
models_config = {
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {"n_estimators": 100, "max_depth": 15, "min_samples_split": 5},
    },
    "Gradient Boosting": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 5},
    },
    "Logistic Regression": {
        "model": LogisticRegression(max_iter=1000, random_state=42),
        "params": {"max_iter": 1000, "C": 1.0, "solver": "lbfgs"},
    },
}

# Train every candidate, one MLflow run each, and collect the test scores
results = {}

for model_name, config in models_config.items():
    # set_params applies the hyperparameters and returns the same estimator
    estimator = config["model"].set_params(**config["params"])

    fitted_model, run_accuracy, run_f1 = train_and_log_model(
        estimator, model_name, config["params"],
        X_train, X_test, y_train, y_test
    )

    results[model_name] = {
        "accuracy": run_accuracy,
        "f1": run_f1,
        "model": fitted_model,
    }

print("\n" + "=" * 50 + "\nRESUMEN DE RESULTADOS\n" + "=" * 50)

# One row per model; only the score columns are displayed
results_df = pd.DataFrame(results).transpose()
display(results_df[['accuracy', 'f1']])

## 7. Búsqueda de Hiperparámetros

Realizaremos Grid Search con logging automático de todos los resultados.

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 3 x 3 x 3 = 27 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10]
}

# One run records the whole search: the grid, the winning parameters, the
# scores, the best model and the full cross-validation table.
with mlflow.start_run(run_name="rf_grid_search") as parent_run:

    mlflow.log_param("search_type", "GridSearch")
    # The other runs in this notebook log model_type, and the comparison cell
    # filters and groups on it; without it this run is left out.
    mlflow.log_param("model_type", "RandomForest_GridSearch")
    mlflow.log_param("param_grid", str(param_grid))

    # Grid search with 3-fold CV, scored by weighted F1, on all CPU cores
    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(
        rf, param_grid,
        cv=3,
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1
    )

    print("Ejecutando Grid Search...")
    grid_search.fit(X_train, y_train)

    # Winning hyperparameters and their mean cross-validation score
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("best_cv_score", grid_search.best_score_)

    # Evaluate the search's best estimator on the held-out test set
    y_pred = grid_search.predict(X_test)
    test_f1 = f1_score(y_test, y_pred, average='weighted')
    test_accuracy = accuracy_score(y_test, y_pred)

    mlflow.log_metric("test_f1", test_f1)
    # Also log the F1 under "f1_score", the key every other run uses and the
    # search/summary cells read; "test_f1" is kept for existing consumers.
    mlflow.log_metric("f1_score", test_f1)
    mlflow.log_metric("test_accuracy", test_accuracy)

    # Log and register the best model
    mlflow.sklearn.log_model(
        grid_search.best_estimator_,
        "best_rf_model",
        registered_model_name="energy_classifier_rf_optimized"
    )

    # Attach the per-candidate cross-validation results as a CSV artifact
    cv_results = pd.DataFrame(grid_search.cv_results_)
    cv_results.to_csv("/tmp/grid_search_results.csv", index=False)
    mlflow.log_artifact("/tmp/grid_search_results.csv")

    print(f"\n‚úì Grid Search completado")
    print(f"Mejores par√°metros: {grid_search.best_params_}")
    print(f"Mejor CV Score: {grid_search.best_score_:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")

## 8. MLflow Model Registry

Gestionaremos el ciclo de vida de los modelos usando Model Registry.

In [None]:
from mlflow.tracking import MlflowClient

# Client used for all Model Registry operations in the following cells
client = MlflowClient()

# Print every registered model followed by the versions returned by
# get_latest_versions, with the stage each one is in.
# NOTE(review): stage-based registry APIs may be deprecated or unsupported
# depending on the MLflow version / registry backend -- confirm for this
# workspace.
print("Modelos Registrados en Model Registry:")
print("=" * 50)
for registered in client.search_registered_models():
    print(f"  üì¶ {registered.name}")
    for model_version in client.get_latest_versions(registered.name):
        print(f"     ‚îî‚îÄ Version {model_version.version}: {model_version.current_stage}")

In [None]:
# Promote the newest version of the optimized model to Staging
model_name = "energy_classifier_rf_optimized"

try:
    latest_versions = client.get_latest_versions(model_name)

    if latest_versions:
        # get_latest_versions returns one entry per stage, so the first
        # element is not necessarily the newest version (for example when an
        # older version is already in Staging). Pick the highest version
        # number explicitly; version is a string, hence the int() key.
        latest_version = max(
            latest_versions, key=lambda mv: int(mv.version)
        ).version

        # Move it to "Staging", archiving whatever was there before
        client.transition_model_version_stage(
            name=model_name,
            version=latest_version,
            stage="Staging",
            archive_existing_versions=True
        )

        print(f"‚úì Model {model_name} v{latest_version} promovido a Staging")

        # Attach a description and tags to the promoted version
        client.update_model_version(
            name=model_name,
            version=latest_version,
            description="Random Forest optimizado con GridSearch para clasificaci√≥n de consumo energ√©tico"
        )

        client.set_model_version_tag(
            name=model_name,
            version=latest_version,
            key="validation_status",
            value="passed"
        )

        client.set_model_version_tag(
            name=model_name,
            version=latest_version,
            key="dataset",
            value="owid_energy"
        )

        print("‚úì Metadata actualizada")
    else:
        print(f"‚ö† No se encontraron versiones del modelo {model_name}")

except Exception as e:
    # Best-effort step of the lab: report the failure and let the notebook
    # continue instead of aborting.
    print(f"‚ö† Error al promover modelo: {e}")

In [None]:
# Load whichever version of the model is currently in Staging
model_uri = f"models:/{model_name}/Staging"

try:
    loaded_model = mlflow.sklearn.load_model(model_uri)

    # Predict on the first five test rows and decode the class indices
    sample_data = X_test.iloc[:5]
    predictions = loaded_model.predict(sample_data)
    predictions_labels = le.inverse_transform(predictions)

    print("‚úì Modelo cargado desde Registry")
    print("\nPredicciones con modelo desde Staging:")
    # Show each prediction next to the decoded true label
    for position, (pred_label, actual) in enumerate(zip(predictions_labels, y_test[:5]), start=1):
        actual_label = le.inverse_transform([actual])[0]
        print(f"  Sample {position}: Predicci√≥n={pred_label}, Real={actual_label}")

except Exception as e:
    # Best-effort: report the problem (e.g. no version in Staging) and go on
    print(f"‚ö† Error al cargar modelo: {e}")

## 9. Autologging con MLflow

MLflow puede registrar automáticamente parámetros, métricas y modelos.

In [None]:
# Enable autologging for scikit-learn
mlflow.sklearn.autolog()

# try/finally guarantees autologging is switched off again even if training
# fails, so it cannot leak into the cells that log manually.
try:
    with mlflow.start_run(run_name="autolog_random_forest"):
        model = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=42
        )

        # With autologging enabled, fit() is expected to record the model
        # parameters, training metrics, the serialized model and its
        # signature without any explicit log_* call.
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        print(f"‚úì Modelo entrenado con autologging")
        print(f"‚úì Accuracy: {accuracy:.4f}")
        print("\n‚ûú MLflow registr√≥ autom√°ticamente:")
        print("  ‚Ä¢ Par√°metros del modelo")
        print("  ‚Ä¢ M√©tricas de entrenamiento")
        print("  ‚Ä¢ Modelo serializado")
        print("  ‚Ä¢ Signature del modelo")
finally:
    # Disable autologging
    mlflow.sklearn.autolog(disable=True)
    print("\n‚úì Autologging desactivado")

## 10. Búsqueda y Comparación de Experimentos

Buscamos y comparamos los mejores runs del experimento.

In [None]:
# Query the experiment for runs above 0.5 test accuracy, best first
runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string="metrics.test_accuracy > 0.5",
    order_by=["metrics.test_accuracy DESC"],
    max_results=10,
)

print("Top 10 runs por accuracy:")
print("=" * 80)
if len(runs) == 0:
    print("No se encontraron runs que cumplan el criterio")
else:
    # Show only the identifying and scoring columns of the result table
    display(
        runs[[
            'run_id',
            'params.model_type',
            'metrics.test_accuracy',
            'metrics.f1_score',
            'start_time',
        ]].head(10)
    )

In [None]:
# Visual comparison: the 20 most recent runs that logged a model_type
runs_comparison = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string="params.model_type != ''",
    order_by=["start_time DESC"],
    max_results=20,
)

if len(runs_comparison) == 0:
    print("No hay suficientes runs para comparar")
else:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # One bar chart per metric: mean value per model type, highest first
    panels = (
        ('metrics.test_accuracy', 'steelblue',
         'Test Accuracy por Tipo de Modelo (Promedio)', 'Accuracy'),
        ('metrics.f1_score', 'coral',
         'F1 Score por Tipo de Modelo (Promedio)', 'F1 Score'),
    )
    for ax, (metric_column, bar_color, panel_title, y_label) in zip(axes, panels):
        averages = (
            runs_comparison
            .groupby('params.model_type')[metric_column]
            .mean()
            .sort_values(ascending=False)
        )
        averages.plot(kind='bar', ax=ax, color=bar_color)
        ax.set_title(panel_title)
        ax.set_xlabel('Tipo de Modelo')
        ax.set_ylabel(y_label)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    print("‚úì Visualizaci√≥n creada")

## 11. Función de Predicción como Servicio

Creamos una función reutilizable para hacer predicciones con modelos del Registry.

In [None]:
def predict_energy_class(model_name, model_stage, input_data):
    """Predict energy classes with a model loaded from the Model Registry.

    Relies on the module-level label encoder ``le`` to turn the predicted
    class indices back into label names.

    Args:
        model_name: Name of the registered model.
        model_stage: Registry stage to load from (Production, Staging, etc.).
        input_data: DataFrame holding the input features.

    Returns:
        Tuple ``(predictions, predictions_labels)``: the raw model output and
        the decoded labels. If anything fails, the error is printed and
        ``(None, None)`` is returned instead of raising.
    """
    try:
        registry_model = mlflow.sklearn.load_model(f"models:/{model_name}/{model_stage}")
        predictions = registry_model.predict(input_data)
        return predictions, le.inverse_transform(predictions)
    except Exception as e:
        print(f"Error al hacer predicci√≥n: {e}")
        return None, None

# Usage example: classify the first ten test rows with the Staging model
sample_input = X_test.iloc[:10]

predictions, labels = predict_energy_class(
    model_name="energy_classifier_rf_optimized",
    model_stage="Staging",
    input_data=sample_input,
)

# predict_energy_class returns (None, None) on failure, so only print on success
if predictions is not None:
    print("‚úì Predicciones desde funci√≥n de servicio:")
    print("=" * 50)
    for position, label in enumerate(labels, start=1):
        print(f"  Sample {position}: {label}")

## 12. Registro Completo con Mejores Prácticas

Implementamos un entrenamiento con todas las mejores prácticas de reproducibilidad.

In [None]:
import json
import time
import sys
import sklearn

def train_reproducible_model(model, model_name, X_train, X_test, y_train, y_test):
    """Train ``model`` in an MLflow run that records what is needed to reproduce it.

    Logs descriptive tags, dataset sizes, library versions, every estimator
    hyperparameter, the fit duration, train/test metrics, the model (with
    signature and input example) and a JSON summary artifact.

    Args:
        model: Estimator exposing ``get_params``, ``fit`` and ``predict``;
            it is fitted in place.
        model_name: Run name; the model is registered as
            ``f"{model_name}_reproducible"``.
        X_train, X_test: Training and test features.
        y_train, y_test: Training and test labels.

    Returns:
        Tuple ``(model, run_id)`` with the fitted estimator and the id of the
        MLflow run that recorded it.
    """
    with mlflow.start_run(run_name=model_name) as run:

        # 1. Descriptive tags. These are fixed values: model_family is
        #    "tree_based" whatever estimator is passed in.
        mlflow.set_tag("model_family", "tree_based")
        mlflow.set_tag("problem_type", "classification")
        mlflow.set_tag("dataset", "owid_energy")
        mlflow.set_tag("developer", "data_science_team")
        mlflow.set_tag("version", "1.0.0")

        # 2. Dataset metadata (random_state is logged as the constant 42,
        #    it is not read from the model or the split)
        mlflow.log_param("train_samples", len(X_train))
        mlflow.log_param("test_samples", len(X_test))
        mlflow.log_param("n_features", X_train.shape[1])
        mlflow.log_param("n_classes", len(np.unique(y_train)))
        mlflow.log_param("random_state", 42)

        # 3. Environment information
        mlflow.log_param("python_version", sys.version.split()[0])
        mlflow.log_param("sklearn_version", sklearn.__version__)
        mlflow.log_param("mlflow_version", mlflow.__version__)

        # 4. Estimator hyperparameters, prefixed to avoid clashing with the
        #    params logged above
        model_params = model.get_params()
        for param, value in model_params.items():
            mlflow.log_param(f"model_{param}", value)

        # 5. Timed training. perf_counter is monotonic, so the measured
        #    duration is not affected by system clock adjustments the way a
        #    time.time() difference would be.
        fit_started = time.perf_counter()
        model.fit(X_train, y_train)
        training_time = time.perf_counter() - fit_started

        mlflow.log_metric("training_time_seconds", training_time)

        # 6. Metrics on both splits (precision / recall / F1 are weighted)
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        metrics = {
            "train_accuracy": accuracy_score(y_train, y_pred_train),
            "test_accuracy": accuracy_score(y_test, y_pred_test),
            "precision": precision_score(y_test, y_pred_test, average='weighted'),
            "recall": recall_score(y_test, y_pred_test, average='weighted'),
            "f1_score": f1_score(y_test, y_pred_test, average='weighted')
        }

        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # 7. Model signature inferred from the training inputs and outputs
        from mlflow.models.signature import infer_signature
        signature = infer_signature(X_train, y_pred_train)

        # 8. Input example: the first five training rows
        input_example = X_train[:5]

        # 9. Log and register the model with its metadata
        mlflow.sklearn.log_model(
            model,
            "model",
            signature=signature,
            input_example=input_example,
            registered_model_name=f"{model_name}_reproducible"
        )

        # 10. Save the whole configuration as a JSON artifact; hyperparameter
        #     values are stringified so that any value type can be serialized
        config = {
            "model_config": {k: str(v) for k, v in model_params.items()},
            "training_config": {
                "train_size": len(X_train),
                "test_size": len(X_test),
                "random_state": 42
            },
            "performance": metrics,
            "training_time": training_time
        }

        with open("/tmp/model_config.json", "w") as f:
            json.dump(config, f, indent=2)
        mlflow.log_artifact("/tmp/model_config.json")

        print(f"‚úì Modelo {model_name} entrenado de forma reproducible")
        print(f"  Run ID: {run.info.run_id}")
        print(f"  Test Accuracy: {metrics['test_accuracy']:.4f}")
        print(f"  Training Time: {training_time:.2f}s")

        return model, run.info.run_id

# Run the reproducible training on a Random Forest.
# The function appends "_reproducible" to the name it is given, so the model
# is registered as "rf_reproducible_reproducible".
model_rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
trained_model, run_id = train_reproducible_model(
    model=model_rf,
    model_name="rf_reproducible",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

## 13. Resumen del Laboratorio

Revisamos lo que hemos aprendido y los resultados obtenidos.

In [None]:
# Lab summary
print("="*60)
print("RESUMEN DEL LABORATORIO")
print("="*60)

# Fetch every run of the experiment, highest test accuracy first
all_runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.test_accuracy DESC"]
)

if len(all_runs) > 0:
    print(f"\n‚úì Total de runs ejecutados: {len(all_runs)}")
    print(f"‚úì Mejor accuracy: {all_runs['metrics.test_accuracy'].max():.4f}")
    print(f"‚úì Mejor F1 score: {all_runs['metrics.f1_score'].max():.4f}")

    best_run = all_runs.iloc[0]

    # Not every run logs model_type or f1_score. Series.get only falls back
    # to its default when the column is absent, not when this run's value is
    # missing, so missing values are mapped to 'N/A' explicitly.
    best_model_type = best_run.get('params.model_type')
    if pd.isna(best_model_type):
        best_model_type = 'N/A'
    best_f1 = best_run.get('metrics.f1_score')
    best_f1_text = 'N/A' if pd.isna(best_f1) else f"{best_f1:.4f}"

    print(f"\nüèÜ Mejor modelo:")
    print(f"  ‚Ä¢ Run ID: {best_run['run_id']}")
    print(f"  ‚Ä¢ Modelo: {best_model_type}")
    print(f"  ‚Ä¢ Accuracy: {best_run['metrics.test_accuracy']:.4f}")
    print(f"  ‚Ä¢ F1 Score: {best_f1_text}")
else:
    print("No se encontraron runs en el experimento")

# List the models in the Registry
print(f"\nüì¶ Modelos en Registry:")
registered_models = client.search_registered_models()
for rm in registered_models:
    print(f"  ‚Ä¢ {rm.name}")

print("\n" + "="*60)
print("¬°LABORATORIO COMPLETADO CON √âXITO!")
print("="*60)

## Conclusión

¡Felicitaciones! Has completado el laboratorio de entrenamiento y registro de modelos con MLflow.

### Habilidades Adquiridas:

✅ Configuración de experimentos en MLflow  
✅ Entrenamiento de modelos con tracking completo  
✅ Uso de MLflow Model Registry  
✅ Comparación de múltiples modelos  
✅ Gestión de artefactos y dependencias  
✅ Implementación de reproducibilidad  
✅ Búsqueda de hiperparámetros con logging  
✅ Deployment de modelos desde Registry  

### Próximos Pasos:

- Deployment de modelos en producción
- Monitoreo de modelos en tiempo real
- Reentrenamiento autom√°tico
- MLOps con Azure DevOps
- Serving de modelos con REST APIs

**¡Excelente trabajo!**