# MLOps Pipeline automatizado

### Este notebook se ejecutará como un scheduled job

In [0]:
import mlflow
from mlflow.tracking import MlflowClient
from pyspark.sql import SparkSession
from datetime import datetime
import pandas as pd

In [0]:
## 1. Load Model from the Registry

def load_production_model(model_name: str, alias: str = "champion"):
    """Load a registered model by alias from the MLflow Model Registry.

    Args:
        model_name: Name of the registered model.
        alias: Registry alias that selects the model version. Defaults to
            "champion", the alias this pipeline treats as the production
            model.

    Returns:
        The object returned by ``mlflow.pyfunc.load_model`` for the URI
        ``models:/<model_name>@<alias>``.

    Raises:
        Exception: Any error raised while loading is printed and then
            re-raised unchanged, so the calling job still fails.
    """
    model_uri = f"models:/{model_name}@{alias}"
    try:
        return mlflow.pyfunc.load_model(model_uri)
    except Exception as e:
        # Report the failure in the job output, then let it propagate.
        print(f"❌ Error loading model: {e}")
        raise

# Load the registered "wine_classifier" model once; the batch-inference step
# further down calls `model.predict` on it.
model = load_production_model("wine_classifier")

In [0]:

# 2. Load New Data for Inference

# Simulate incoming data (in production it would come from Delta Lake, S3, etc.)
from sklearn.datasets import load_wine

wine = load_wine()

# Take the first 20 samples as the batch to score, labelled with the
# dataset's own feature names.
new_data = pd.DataFrame(wine["data"][:20], columns=wine["feature_names"])

print(f"📊 Loaded {new_data.shape[0]} records for inference")
new_data.head()

In [0]:
# 3. Run Batch Predictions

# Score the whole batch with the loaded model.
predictions = model.predict(new_data)

# Build the results frame: a copy of the input features plus the prediction
# and the time at which the inference was produced.
results_df = new_data.assign(
    prediction=predictions,
    inference_timestamp=datetime.now(),
)

print(f"✅ Generated {len(predictions)} predictions")
results_df.head()

In [0]:
# 4. Save Results (Delta Lake)

# Convert the pandas results to a Spark DataFrame so they can be written as a table.
spark = SparkSession.builder.getOrCreate()
results_spark_df = spark.createDataFrame(results_df)

# Destination table, addressed as <catalog>.<schema>.<table>.
catalog = "workspace"
schema = "default"
table = "wine_predictions"

# Fully qualified table name, defined before use so the write and the
# confirmation message below report the same destination.
output_path = f"{catalog}.{schema}.{table}"

# Append this batch to the Delta table (no row-level merge is performed).
# The "mergeSchema" option lets the write add new columns to the table schema.
results_spark_df.write \
    .format("delta") \
    .mode("append") \
    .option("mergeSchema", "true") \
    .saveAsTable(output_path)

print(f"💾 Results saved to {output_path}")

In [0]:
# 5. Log the Pipeline Run

# Track this pipeline execution in MLflow.
mlflow.set_experiment("/Users/your_username/wine_pipeline_production")

# Destination of the batch results, rebuilt from its components so this cell
# has the value it logs.
output_path = f"{catalog}.{schema}.{table}"

with mlflow.start_run(run_name=f"batch_inference_{datetime.now().strftime('%Y%m%d_%H%M')}"):

    # Pipeline metadata
    mlflow.log_param("model_name", "wine_classifier")
    mlflow.log_param("records_processed", len(new_data))
    mlflow.log_param("output_path", output_path)

    # Basic metrics
    mlflow.log_metric("num_predictions", len(predictions))
    # Placeholder value, not a measurement. TODO: record the real run time.
    mlflow.log_metric("execution_time_seconds", 5)

    # Prediction distribution: one metric per predicted class with its count.
    prediction_distribution = pd.Series(predictions).value_counts().to_dict()
    for class_label, count in prediction_distribution.items():
        mlflow.log_metric(f"class_{class_label}_count", count)

    print("✅ Pipeline execution logged to MLflow")

In [0]:
# 6. Validations and Alerts (Optional)

# Simplified quality checks on the scored batch: missing values and
# prediction balance.
def check_data_quality(df, threshold=0.95, max_missing_pct=0.1):
    """Run basic quality checks on a scored DataFrame.

    Args:
        df: pandas DataFrame that contains a ``prediction`` column.
        threshold: The batch counts as imbalanced when the most frequent
            predicted class has a share of at least this value.
        max_missing_pct: The batch fails when the column with the most
            missing values has a missing fraction of at least this value.

    Returns:
        bool: True only when both checks pass. An empty DataFrame fails.

    Raises:
        KeyError: If ``df`` has rows but no ``prediction`` column.
    """
    # Nothing to validate; this also avoids dividing by zero below.
    if len(df) == 0:
        print("⚠️ WARNING: No records to validate")
        return False

    # Missing fraction of the worst column.
    missing_pct = df.isnull().sum().max() / len(df)
    missing_ok = missing_pct < max_missing_pct

    # Share of each predicted class; the largest one decides balance.
    pred_distribution = df['prediction'].value_counts(normalize=True)
    is_balanced = pred_distribution.max() < threshold

    if not missing_ok:
        print(f"⚠️ WARNING: {missing_pct*100:.2f}% missing values detected")

    if not is_balanced:
        print("⚠️ WARNING: Imbalanced predictions detected")
        print(pred_distribution)

    # Announce success only when every check passed, so the message always
    # agrees with the returned value. bool() converts the numpy boolean.
    passed = bool(missing_ok and is_balanced)
    if passed:
        print("✅ Data quality checks passed")

    return passed

# Run the quality checks on the scored batch and keep the verdict.
quality_ok = check_data_quality(results_df)

# Closing banner: a 50-character rule above and below the completion message.
print("=" * 50, "🎉 PIPELINE EXECUTION COMPLETED", "=" * 50, sep="\n")