In [None]:
# EVALUACIÓN DETALLADA DEL MODELO DE SELECCIÓN DE CANDIDATOS - AZURE ML
# ========================================================================

# 1. CONFIGURACIÓN Y CARGA DE ARTEFACTOS DESDE AZURE ML
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import joblib
import warnings
from pathlib import Path

warnings.filterwarnings('ignore')

# Azure ML imports
from azureml.core import Workspace, Dataset, Experiment, Run, Model
from azureml.core.model import Model as AMLModel
import mlflow
import mlflow.sklearn

# Metrics y visualización
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, roc_curve,
    precision_recall_curve, average_precision_score, f1_score,
    precision_score, recall_score, accuracy_score
)
from sklearn.calibration import calibration_curve
from sklearn.inspection import permutation_importance

# Global look & feel shared by every figure produced in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

print("🔍 EVALUACIÓN DETALLADA DEL MODELO DE SELECCIÓN DE CANDIDATOS")
print("=" * 70)

# Attach to the Azure ML workspace described by the local configuration.
# Nothing below can run without it, so the failure is reported and re-raised.
try:
    ws = Workspace.from_config()
    print(f"✅ Conectado al workspace: {ws.name}")
except Exception as exc:
    print(f"❌ Error conectando al workspace: {exc}")
    raise

# Point MLflow at the tracking server exposed by the workspace.
tracking_uri = ws.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(tracking_uri)

print("🔄 Cargando artefactos del entrenamiento...")


In [None]:
# 2. LOAD THE REGISTERED MODEL AND THE EVALUATION DATA FROM AZURE ML
print("\n📥 CARGA DE MODELO Y DATOS REGISTRADOS")
print("="*50)

# Name under which the model is looked up in the workspace registry.
MODEL_NAME = "candidate-selection-model"

try:
    # No explicit version is requested; per the Azure ML SDK the lookup by
    # name alone resolves to the latest registered version.
    registered_model = AMLModel(ws, name=MODEL_NAME)
    print(f"✅ Modelo encontrado: {MODEL_NAME}")
    print(f"📋 Versión: {registered_model.version}")
    print(f"🏷️  Tags: {registered_model.tags}")

    # exist_ok=True makes the cell re-runnable: without it the download
    # fails as soon as ./model_artifacts already holds a previous copy.
    model_path = registered_model.download(target_dir="./model_artifacts", exist_ok=True)
    print(f"📁 Artefactos descargados en: {model_path}")
    artifacts_dir = Path(model_path)

    # SECURITY NOTE: joblib files are pickle-based and execute code on load;
    # only load artifacts that come from a trusted registry.
    model = joblib.load(artifacts_dir / "model.pkl")
    scaler = joblib.load(artifacts_dir / "scaler.pkl")

    # Metadata written alongside the model (feature list and training summary).
    with open(artifacts_dir / "feature_names.json", 'r', encoding='utf-8') as f:
        feature_metadata = json.load(f)

    with open(artifacts_dir / "notebook_info.json", 'r', encoding='utf-8') as f:
        training_info = json.load(f)

    print(f"✅ Modelo cargado: {type(model).__name__}")
    print(f"📊 Features: {feature_metadata['feature_count']}")
    print(f"🎯 Mejor métrica F1: {training_info['test_metrics']['f1_macro']:.3f}")

except Exception as e:
    print(f"❌ Error cargando modelo registrado: {e}")
    print("💡 Asegúrate de haber ejecutado el notebook de entrenamiento primero")
    raise

# Load the evaluation datasets shipped with the model artifacts.
try:
    test_data = pd.read_parquet(artifacts_dir / "test_data.parquet")
    val_data = pd.read_parquet(artifacts_dir / "val_data.parquet")

    print(f"\n📊 Datos de test cargados: {test_data.shape}")
    print(f"📊 Datos de validación cargados: {val_data.shape}")

    # Each parquet file carries the feature columns plus the stored ground
    # truth ('y_true'), predicted label ('y_pred') and score ('y_proba').
    feature_names = feature_metadata['feature_names']

    X_test = test_data[feature_names]
    y_test_true = test_data['y_true']
    y_test_pred = test_data['y_pred']
    y_test_proba = test_data['y_proba']

    X_val = val_data[feature_names]
    y_val_true = val_data['y_true']
    y_val_pred = val_data['y_pred']
    y_val_proba = val_data['y_proba']

    print("✅ Datos organizados:")
    print(f"  Test: {len(X_test)} muestras, {len(feature_names)} features")
    print(f"  Validación: {len(X_val)} muestras")

except Exception as e:
    print(f"❌ Error cargando datos: {e}")
    raise

# Summary of the training run that produced the model.
print("\n📋 INFORMACIÓN DEL ENTRENAMIENTO:")
print(f"  Experimento: {training_info['experiment_name']}")
print(f"  Run ID: {training_info['run_id']}")
print(f"  Mejor modelo: {training_info['best_model_name']}")
print(f"  Requiere escalado: {training_info['model_requires_scaling']}")
print(f"  Tamaño entrenamiento: {training_info['dataset_info']['train_size']:,}")
print(f"  Tamaño test: {training_info['dataset_info']['test_size']:,}")


In [None]:
# 3. DETAILED ANALYSIS OF PERFORMANCE METRICS
print("\n📊 ANÁLISIS DETALLADO DE MÉTRICAS", "=" * 50, sep="\n")

def calculate_comprehensive_metrics(y_true, y_pred, y_proba, dataset_name="Test"):
    """Compute a flat dictionary of binary-classification metrics.

    Args:
        y_true: Ground-truth labels (0 / 1).
        y_pred: Predicted labels (0 / 1).
        y_proba: Scores used for the ranking metrics (ROC AUC, average
            precision).
        dataset_name: Label of the evaluated split. Accepted for call-site
            readability; it does not take part in the computation.

    Returns:
        dict mapping metric name to value: macro/weighted aggregates,
        per-class precision/recall/F1, ``auc_roc``/``auc_pr`` (NaN when they
        cannot be computed), the four confusion-matrix counts as plain
        ``int`` and the derived rates (NaN when their denominator is zero).
    """

    def _ratio(numerator, denominator):
        # An undefined rate (empty denominator) is reported as NaN instead
        # of aborting the whole evaluation with ZeroDivisionError.
        return numerator / denominator if denominator else np.nan

    metrics = {}

    # Aggregate metrics
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['precision'] = precision_score(y_true, y_pred, average='macro', zero_division=0)
    metrics['recall'] = recall_score(y_true, y_pred, average='macro', zero_division=0)
    metrics['f1_macro'] = f1_score(y_true, y_pred, average='macro', zero_division=0)
    metrics['f1_weighted'] = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    # Per-class metrics; zero_division=0 keeps the value sklearn already
    # returns for an empty class while making that choice explicit.
    for label in (0, 1):
        metrics[f'precision_class_{label}'] = precision_score(
            y_true, y_pred, pos_label=label, zero_division=0
        )
        metrics[f'recall_class_{label}'] = recall_score(
            y_true, y_pred, pos_label=label, zero_division=0
        )
        metrics[f'f1_class_{label}'] = f1_score(
            y_true, y_pred, pos_label=label, zero_division=0
        )

    # Ranking metrics: sklearn raises ValueError when they are undefined
    # (e.g. only one class present in y_true); report NaN in that case.
    try:
        metrics['auc_roc'] = roc_auc_score(y_true, y_proba)
        metrics['auc_pr'] = average_precision_score(y_true, y_proba)
    except ValueError:
        metrics['auc_roc'] = np.nan
        metrics['auc_pr'] = np.nan

    # labels=[0, 1] guarantees a 2x2 matrix even if a class is missing from
    # both y_true and y_pred, so the unpacking below can never go out of range.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    metrics['true_negatives'] = int(tn)
    metrics['false_positives'] = int(fp)
    metrics['false_negatives'] = int(fn)
    metrics['true_positives'] = int(tp)

    # Rates derived from the confusion matrix
    metrics['specificity'] = _ratio(metrics['true_negatives'], metrics['true_negatives'] + metrics['false_positives'])
    metrics['sensitivity'] = _ratio(metrics['true_positives'], metrics['true_positives'] + metrics['false_negatives'])
    metrics['positive_predictive_value'] = _ratio(metrics['true_positives'], metrics['true_positives'] + metrics['false_positives'])
    metrics['negative_predictive_value'] = _ratio(metrics['true_negatives'], metrics['true_negatives'] + metrics['false_negatives'])

    return metrics

# Metrics for both splits, computed from the stored labels and scores.
test_metrics = calculate_comprehensive_metrics(y_test_true, y_test_pred, y_test_proba, "Test")
val_metrics = calculate_comprehensive_metrics(y_val_true, y_val_pred, y_val_proba, "Validación")

# Side-by-side table (one column per split), rounded for display.
metrics_comparison = pd.DataFrame({'Validación': val_metrics, 'Test': test_metrics})
metrics_comparison = metrics_comparison.round(3)

print("📈 MÉTRICAS COMPARATIVAS (Validación vs Test):")
print("=" * 60)

# Headline metrics shown in the table and used for the stability check.
main_metrics = [
    'accuracy', 'precision', 'recall', 'f1_macro', 'f1_weighted',
    'auc_roc', 'auc_pr', 'specificity', 'sensitivity'
]

print(metrics_comparison.loc[main_metrics].to_string())

print("\n🔢 MATRIZ DE CONFUSIÓN - CONJUNTO DE TEST:")
print("               Predicho")
print("Real     No Apto   Apto")
print(f"No Apto    {test_metrics['true_negatives']:3d}     {test_metrics['false_positives']:3d}")
print(f"Apto       {test_metrics['false_negatives']:3d}     {test_metrics['true_positives']:3d}")

print("\n📊 MÉTRICAS POR CLASE - TEST:")
for class_id, class_heading in ((0, "Clase 0 (No Apto):"), (1, "\nClase 1 (Apto):")):
    print(class_heading)
    print(f"  Precision: {test_metrics[f'precision_class_{class_id}']:.3f}")
    print(f"  Recall:    {test_metrics[f'recall_class_{class_id}']:.3f}")
    print(f"  F1-Score:  {test_metrics[f'f1_class_{class_id}']:.3f}")


def _stability_label(delta):
    """Map an absolute validation/test gap to its display label."""
    if delta < 0.02:
        return "✅ Estable"
    if delta < 0.05:
        return "⚠️ Moderado"
    return "❌ Inestable"


# Stability: absolute gap between validation and test for each headline metric.
print("\n🎯 ANÁLISIS DE ESTABILIDAD (Val vs Test):")
stability_analysis = {}
for metric in main_metrics:
    if metric not in val_metrics or metric not in test_metrics:
        continue
    diff = abs(val_metrics[metric] - test_metrics[metric])
    stability_analysis[metric] = diff
    print(f"  {metric:20s}: Δ={diff:.3f} {_stability_label(diff)}")

# Bundle kept in memory for later logging.
evaluation_results = {
    'validation_metrics': val_metrics,
    'test_metrics': test_metrics,
    'stability_analysis': stability_analysis,
    'model_info': {
        'name': training_info['best_model_name'],
        'version': registered_model.version,
        'features_count': len(feature_names)
    }
}


In [None]:
# 4. ADVANCED PERFORMANCE VISUALIZATIONS
# Builds a single 2x4 dashboard figure from the stored test/validation labels
# and scores, saves it as 'model_evaluation_dashboard.png' and shows it.
# The panels rely on pyplot's "current axes" state, so statement order matters.
print("\n📊 VISUALIZACIONES DE RENDIMIENTO")
print("="*50)

# Main figure that hosts the eight panels
fig = plt.figure(figsize=(20, 15))

# 1. ROC curve (test vs validation, plus the diagonal chance line)
plt.subplot(2, 4, 1)
fpr_test, tpr_test, _ = roc_curve(y_test_true, y_test_proba)
fpr_val, tpr_val, _ = roc_curve(y_val_true, y_val_proba)

plt.plot(fpr_test, tpr_test, label=f'Test (AUC = {test_metrics["auc_roc"]:.3f})', linewidth=2)
plt.plot(fpr_val, tpr_val, label=f'Val (AUC = {val_metrics["auc_roc"]:.3f})', linewidth=2, linestyle='--')
plt.plot([0, 1], [0, 1], 'k--', alpha=0.5)
plt.xlabel('Tasa de Falsos Positivos')
plt.ylabel('Tasa de Verdaderos Positivos')
plt.title('Curva ROC')
plt.legend()
plt.grid(True, alpha=0.3)

# 2. Precision-Recall curve (test vs validation)
plt.subplot(2, 4, 2)
precision_test, recall_test, _ = precision_recall_curve(y_test_true, y_test_proba)
precision_val, recall_val, _ = precision_recall_curve(y_val_true, y_val_proba)

plt.plot(recall_test, precision_test, label=f'Test (AP = {test_metrics["auc_pr"]:.3f})', linewidth=2)
plt.plot(recall_val, precision_val, label=f'Val (AP = {val_metrics["auc_pr"]:.3f})', linewidth=2, linestyle='--')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Curva Precision-Recall')
plt.legend()
plt.grid(True, alpha=0.3)

# 3. Distribution of predicted scores, split by true class (test set)
plt.subplot(2, 4, 3)
plt.hist(y_test_proba[y_test_true == 0], bins=30, alpha=0.5, label='No Apto', density=True)
plt.hist(y_test_proba[y_test_true == 1], bins=30, alpha=0.5, label='Apto', density=True)
plt.xlabel('Probabilidad Predicha')
plt.ylabel('Densidad')
plt.title('Distribución de Probabilidades')
plt.legend()
plt.grid(True, alpha=0.3)

# 4. Calibration curve (test set, 10 bins); on failure the panel shows the
#    truncated error message instead of aborting the whole dashboard
plt.subplot(2, 4, 4)
try:
    fraction_of_positives, mean_predicted_value = calibration_curve(y_test_true, y_test_proba, n_bins=10)
    plt.plot(mean_predicted_value, fraction_of_positives, "s-", label="Modelo", linewidth=2)
    plt.plot([0, 1], [0, 1], "k:", label="Perfectamente calibrado")
    plt.xlabel('Probabilidad Media Predicha')
    plt.ylabel('Fracción de Positivos')
    plt.title('Calibración del Modelo')
    plt.legend()
    plt.grid(True, alpha=0.3)
except Exception as e:
    plt.text(0.5, 0.5, f'Error en calibración: {str(e)[:50]}...', ha='center', va='center')

# 5. Confusion matrix - test
plt.subplot(2, 4, 5)
cm_test = confusion_matrix(y_test_true, y_test_pred)
sns.heatmap(cm_test, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['No Apto', 'Apto'], yticklabels=['No Apto', 'Apto'])
plt.title('Matriz Confusión - Test')
plt.ylabel('Real')
plt.xlabel('Predicho')

# 6. Confusion matrix - validation
plt.subplot(2, 4, 6)
cm_val = confusion_matrix(y_val_true, y_val_pred)
sns.heatmap(cm_val, annot=True, fmt='d', cmap='Oranges',
            xticklabels=['No Apto', 'Apto'], yticklabels=['No Apto', 'Apto'])
plt.title('Matriz Confusión - Validación')
plt.ylabel('Real')
plt.xlabel('Predicho')

# 7. Grouped bar chart comparing headline metrics across the two splits
plt.subplot(2, 4, 7)
metrics_for_plot = ['accuracy', 'precision', 'recall', 'f1_macro', 'auc_roc']
val_values = [val_metrics[m] for m in metrics_for_plot]
test_values = [test_metrics[m] for m in metrics_for_plot]

x = np.arange(len(metrics_for_plot))
width = 0.35

plt.bar(x - width/2, val_values, width, label='Validación', alpha=0.8)
plt.bar(x + width/2, test_values, width, label='Test', alpha=0.8)
plt.xlabel('Métricas')
plt.ylabel('Valor')
plt.title('Comparación de Métricas')
plt.xticks(x, metrics_for_plot, rotation=45)
plt.legend()
plt.grid(True, alpha=0.3)

# 8. Threshold sweep: binary F1 / precision / recall (default positive class)
#    on the test set for 100 evenly spaced thresholds in [0, 1].
#    `thresholds` and `f1_scores` are reused after the figure to pick the
#    F1-optimal threshold.
plt.subplot(2, 4, 8)
thresholds = np.linspace(0, 1, 100)
f1_scores = []
precisions = []
recalls = []

for threshold in thresholds:
    y_pred_thresh = (y_test_proba >= threshold).astype(int)
    # When every sample lands in the same predicted class, the three metrics
    # are recorded as 0 instead of being computed (guards divisions by zero).
    if len(np.unique(y_pred_thresh)) > 1:  # Avoid divisions by zero
        f1_scores.append(f1_score(y_test_true, y_pred_thresh))
        precisions.append(precision_score(y_test_true, y_pred_thresh))
        recalls.append(recall_score(y_test_true, y_pred_thresh))
    else:
        f1_scores.append(0)
        precisions.append(0)
        recalls.append(0)

plt.plot(thresholds, f1_scores, label='F1-Score', linewidth=2)
plt.plot(thresholds, precisions, label='Precision', linewidth=2)
plt.plot(thresholds, recalls, label='Recall', linewidth=2)
plt.axvline(x=0.5, color='red', linestyle='--', alpha=0.7, label='Umbral 0.5')
plt.xlabel('Umbral')
plt.ylabel('Métrica')
plt.title('Métricas vs Umbral')
plt.legend()
plt.grid(True, alpha=0.3)

# Persist the dashboard to disk before displaying it.
plt.tight_layout()
plt.savefig('model_evaluation_dashboard.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Dashboard de evaluación generado y guardado como 'model_evaluation_dashboard.png'")

# Pick the threshold that maximises the swept F1. The sweep computes the
# binary F1 of the positive class (1), so the baseline it is compared against
# must be the class-1 F1 as well -- not the macro average, which is a
# different metric and would make the comparison meaningless.
# NOTE(review): the optimum is selected on the test set itself, so the
# reported "optimal" F1 is optimistic; confirm on validation before adopting it.
optimal_idx = int(np.argmax(f1_scores))
# Plain Python floats keep the values JSON-serializable downstream.
optimal_threshold = float(thresholds[optimal_idx])
optimal_f1 = float(f1_scores[optimal_idx])

print("\n🎯 UMBRAL ÓPTIMO PARA F1-SCORE:")
print(f"  Umbral óptimo: {optimal_threshold:.3f}")
print(f"  F1-Score óptimo: {optimal_f1:.3f}")
print(f"  F1-Score actual (umbral 0.5): {test_metrics['f1_class_1']:.3f}")


In [None]:
# 5. FEATURE IMPORTANCE ANALYSIS
print("\n🔍 ANÁLISIS DE IMPORTANCIA DE CARACTERÍSTICAS", "=" * 60, sep="\n")

# Importance analysis adapted to the kind of model that was loaded
def analyze_feature_importance(model, X_test, y_test_true, feature_names):
    """Collect every feature-importance view the given model supports.

    Args:
        model: Fitted estimator to inspect.
        X_test: Feature matrix used for permutation importance.
        y_test_true: True labels matching ``X_test``.
        feature_names: Names aligned with the model's feature order.

    Returns:
        dict with up to three DataFrames, each sorted in descending order:
        ``'native'`` (when the model has ``feature_importances_``),
        ``'permutation'`` (when the computation succeeds) and
        ``'coefficients'`` (when the model has ``coef_``, ranked by
        absolute value).
    """
    results = {}

    # 1. Importances exposed by the estimator itself
    if hasattr(model, 'feature_importances_'):
        native_table = pd.DataFrame({
            'feature': feature_names,
            'importance': model.feature_importances_
        })
        results['native'] = native_table.sort_values('importance', ascending=False)
        print("🌳 Importancia nativa del modelo disponible")

    # 2. Permutation importance; best effort -- a failure is only reported
    try:
        print("🔄 Calculando permutation importance...")
        perm_result = permutation_importance(
            model, X_test, y_test_true,
            n_repeats=10, random_state=42,
            scoring='f1_macro'
        )
        perm_table = pd.DataFrame({
            'feature': feature_names,
            'importance_mean': perm_result.importances_mean,
            'importance_std': perm_result.importances_std
        })
        results['permutation'] = perm_table.sort_values('importance_mean', ascending=False)
        print("✅ Permutation importance calculado")
    except Exception as exc:
        print(f"⚠️ Error en permutation importance: {exc}")

    # 3. Coefficients; a 2-D coef_ contributes its first row only
    if hasattr(model, 'coef_'):
        weights = model.coef_[0] if model.coef_.ndim > 1 else model.coef_
        coef_table = pd.DataFrame({
            'feature': feature_names,
            'coefficient': weights,
            'abs_coefficient': np.abs(weights)
        })
        results['coefficients'] = coef_table.sort_values('abs_coefficient', ascending=False)
        print("📊 Coeficientes del modelo disponibles")

    return results

# Compute every importance view available for the loaded model.
feature_importance = analyze_feature_importance(model, X_test, y_test_true, feature_names)

# 2x2 grid; panels are filled left-to-right as importance views are available.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

plot_idx = 0

# Panel: native importances (top 15)
if 'native' in feature_importance:
    top_features = feature_importance['native'].head(15)
    ax = axes[plot_idx]
    rows = range(len(top_features))
    ax.barh(rows, top_features['importance'])
    ax.set_yticks(rows)
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel('Importancia')
    ax.set_title('Importancia Nativa del Modelo')
    ax.invert_yaxis()
    plot_idx += 1

# Panel: permutation importances (top 15)
if 'permutation' in feature_importance:
    top_perm = feature_importance['permutation'].head(15)
    ax = axes[plot_idx]
    rows = range(len(top_perm))
    ax.barh(rows, top_perm['importance_mean'])
    ax.set_yticks(rows)
    ax.set_yticklabels(top_perm['feature'])
    ax.set_xlabel('Importancia Media')
    ax.set_title('Permutation Importance')
    ax.invert_yaxis()
    plot_idx += 1

# Panel: signed coefficients (top 15 by magnitude), red = negative
if 'coefficients' in feature_importance:
    top_coef = feature_importance['coefficients'].head(15)
    colors = ['red' if weight < 0 else 'blue' for weight in top_coef['coefficient']]
    ax = axes[plot_idx]
    rows = range(len(top_coef))
    ax.barh(rows, top_coef['coefficient'], color=colors, alpha=0.7)
    ax.set_yticks(rows)
    ax.set_yticklabels(top_coef['feature'])
    ax.set_xlabel('Coeficiente')
    ax.set_title('Coeficientes del Modelo')
    ax.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    ax.invert_yaxis()
    plot_idx += 1

# Panel: heatmap comparing methods, only when more than one is available
if len(feature_importance) > 1:
    # Column holding the ranking value for each method
    ranking_column = {
        'native': 'importance',
        'permutation': 'importance_mean',
        'coefficients': 'abs_coefficient',
    }
    comparison_data = []

    for method, df in feature_importance.items():
        importance_col = ranking_column[method]

        # Top 10 of each method, scaled by that method's maximum; methods
        # whose maximum is not positive are left out.
        top_10 = df.head(10).copy()
        max_val = top_10[importance_col].max()
        if max_val > 0:
            scaled = top_10.assign(
                normalized=top_10[importance_col] / max_val,
                method=method,
            )
            comparison_data.append(scaled[['feature', 'normalized', 'method']])

    if comparison_data:
        comparison_df = pd.concat(comparison_data)

        # One row per feature, one column per method
        pivot_df = comparison_df.pivot(index='feature', columns='method', values='normalized')

        ax = axes[plot_idx]
        sns.heatmap(pivot_df.fillna(0), annot=True, fmt='.2f', cmap='viridis',
                    ax=ax, cbar_kws={'label': 'Importancia Normalizada'})
        ax.set_title('Comparación de Métodos de Importancia')
        ax.set_xlabel('Método')
        ax.set_ylabel('Feature')
        plot_idx += 1

# Hide the panels that were not used
for spare_ax in axes[plot_idx:]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.savefig('feature_importance_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Análisis de importancia guardado como 'feature_importance_analysis.png'")

# Text report: top 10 features for each method
report_columns = {
    'native': ['feature', 'importance'],
    'permutation': ['feature', 'importance_mean', 'importance_std'],
    'coefficients': ['feature', 'coefficient'],
}

print("\n📋 TOP 10 CARACTERÍSTICAS MÁS IMPORTANTES:")
for method, df in feature_importance.items():
    print(f"\n{method.upper()}:")
    columns = report_columns.get(method)
    if columns is not None:
        print(df.head(10)[columns].to_string(index=False))


In [None]:
# 7. LOG EVALUATION RESULTS TO AZURE ML (through MLflow)
# NOTE: this section reads `misclassification_analysis` and uploads
# 'misclassification_analysis.png'; both are produced by section 6
# ("ANÁLISIS DE CASOS MAL CLASIFICADOS"), which must be executed first.
print("\n🔄 REGISTRO DE RESULTADOS EN AZURE ML")
print("="*50)

# Experiment that groups the evaluation runs.
EVALUATION_EXPERIMENT = "candidate-evaluation"

# Fail early with an actionable message instead of a NameError in the middle
# of a half-logged run.
if 'misclassification_analysis' not in globals():
    raise RuntimeError(
        "Ejecuta primero la sección 6 (análisis de casos mal clasificados): "
        "'misclassification_analysis' no está definido"
    )

# Resolve the experiment by name and create it only when it does not exist;
# this avoids listing every experiment of the workspace to test membership.
_evaluation_experiment = mlflow.get_experiment_by_name(EVALUATION_EXPERIMENT)
if _evaluation_experiment is not None:
    evaluation_experiment_id = _evaluation_experiment.experiment_id
else:
    evaluation_experiment_id = mlflow.create_experiment(EVALUATION_EXPERIMENT)


def _json_default(value):
    """Serialize numpy scalars as native numbers; anything else as text."""
    if isinstance(value, np.generic):
        return value.item()
    return str(value)


with mlflow.start_run(experiment_id=evaluation_experiment_id):

    # Log evaluation metrics
    print("📊 Registrando métricas de evaluación...")

    # Test-set metrics, each logged as "test_<metric>"
    logged_test_metrics = (
        # headline metrics
        'accuracy', 'precision', 'recall', 'f1_macro', 'f1_weighted',
        'auc_roc', 'auc_pr', 'specificity', 'sensitivity',
        # per-class metrics
        'precision_class_0', 'recall_class_0', 'f1_class_0',
        'precision_class_1', 'recall_class_1', 'f1_class_1',
        # confusion-matrix counts
        'true_positives', 'true_negatives', 'false_positives', 'false_negatives',
    )
    for metric_name in logged_test_metrics:
        mlflow.log_metric(f"test_{metric_name}", test_metrics[metric_name])

    # Validation/test gaps
    for metric, diff in stability_analysis.items():
        mlflow.log_metric(f"stability_{metric}", diff)

    # Misclassification summary
    mlflow.log_metric("misclassification_rate", misclassification_analysis['misclassification_rate'])
    mlflow.log_metric("false_positives_count", misclassification_analysis['false_positives'])
    mlflow.log_metric("false_negatives_count", misclassification_analysis['false_negatives'])

    if 'confidence_stats' in misclassification_analysis:
        confidence_stats = misclassification_analysis['confidence_stats']
        mlflow.log_metric("misclassified_mean_confidence", confidence_stats['mean_confidence'])
        mlflow.log_metric("misclassified_std_confidence", confidence_stats['std_confidence'])
        mlflow.log_metric("high_confidence_errors", confidence_stats['high_confidence_errors'])

    # F1-optimal threshold found by the threshold sweep
    mlflow.log_metric("optimal_threshold", optimal_threshold)
    mlflow.log_metric("optimal_f1_score", optimal_f1)

    # Parameters describing the evaluated model
    mlflow.log_param("evaluated_model", training_info['best_model_name'])
    mlflow.log_param("model_version", registered_model.version)
    mlflow.log_param("original_experiment", training_info['experiment_name'])
    mlflow.log_param("original_run_id", training_info['run_id'])
    mlflow.log_param("test_size", len(X_test))
    mlflow.log_param("validation_size", len(X_val))
    mlflow.log_param("feature_count", len(feature_names))

    # Figures saved by the previous sections
    print("🖼️ Registrando visualizaciones...")
    for figure_file in (
        "model_evaluation_dashboard.png",
        "feature_importance_analysis.png",
        "misclassification_analysis.png",
    ):
        mlflow.log_artifact(figure_file)

    # Detailed results, stored as a JSON artifact
    detailed_results = {
        'model_info': {
            'name': training_info['best_model_name'],
            'version': registered_model.version,
            'original_experiment': training_info['experiment_name'],
            'original_run_id': training_info['run_id']
        },
        'test_metrics': test_metrics,
        'validation_metrics': val_metrics,
        'stability_analysis': stability_analysis,
        'misclassification_analysis': {
            k: v for k, v in misclassification_analysis.items()
            if k != 'feature_differences'  # DataFrame, not JSON-serializable
        },
        'optimal_threshold': optimal_threshold,
        'optimal_f1_score': optimal_f1,
        'evaluation_timestamp': pd.Timestamp.now().isoformat()
    }

    # default=str alone would turn numpy counts into strings ("12" instead
    # of 12); _json_default keeps them numeric and only stringifies the rest.
    with open('evaluation_results.json', 'w', encoding='utf-8') as f:
        json.dump(detailed_results, f, indent=2, default=_json_default)

    mlflow.log_artifact('evaluation_results.json')

    # Top 10 native importances: value as metric, feature name as parameter
    if 'native' in feature_importance:
        top_features = feature_importance['native'].head(10)
        for rank, (_, row) in enumerate(top_features.iterrows(), start=1):
            mlflow.log_metric(f"feature_importance_rank_{rank}", row['importance'])
            mlflow.log_param(f"top_feature_{rank}", row['feature'])

    current_run = mlflow.active_run()
    evaluation_run_id = current_run.info.run_id

    print("✅ Evaluación registrada en MLflow")
    print(f"📋 Run ID: {evaluation_run_id}")

print("\n📊 RESUMEN FINAL DE EVALUACIÓN")
print("="*60)
print(f"🎯 Modelo evaluado: {training_info['best_model_name']}")
print(f"📋 Versión: {registered_model.version}")
print(f"📊 Precisión en test: {test_metrics['accuracy']:.3f}")
print(f"📊 F1-Score macro: {test_metrics['f1_macro']:.3f}")
print(f"📊 AUC-ROC: {test_metrics['auc_roc']:.3f}")
print(f"🎯 Umbral óptimo: {optimal_threshold:.3f}")
print(f"❌ Tasa de error: {misclassification_analysis['misclassification_rate']:.3f}")

# Evaluación de calidad del modelo
def evaluate_model_quality(metrics):
    """Grade a model from its metrics on a 0-7 point scale.

    Accuracy, macro F1 and ROC AUC each contribute 2 points above their
    upper threshold, 1 point above their lower threshold, and otherwise add
    an improvement recommendation. One further point is granted when macro
    precision and recall differ by at most 0.1.

    Args:
        metrics: Mapping providing 'accuracy', 'f1_macro', 'auc_roc',
            'precision' and 'recall'.

    Returns:
        Tuple ``(quality_level, quality_score, recommendations)``: the label
        of the reached level, the points obtained and the list of
        recommendations for the criteria that were missed.
    """
    score = 0
    advice = []

    # (metric, threshold for 2 points, threshold for 1 point, advice if below both)
    tiered_criteria = (
        ('accuracy', 0.85, 0.75, "Mejorar precisión general del modelo"),
        ('f1_macro', 0.80, 0.70, "Mejorar F1-Score balanceando clases"),
        ('auc_roc', 0.85, 0.75, "Mejorar capacidad discriminativa del modelo"),
    )
    for key, full_marks, partial_marks, suggestion in tiered_criteria:
        value = metrics[key]
        if value >= full_marks:
            score += 2
        elif value >= partial_marks:
            score += 1
        else:
            advice.append(suggestion)

    # Balance between precision and recall
    gap = abs(metrics['precision'] - metrics['recall'])
    if gap <= 0.1:
        score += 1
    else:
        advice.append("Balancear mejor precisión y recall")

    # Highest level whose minimum score is reached; lowest level otherwise
    level = "❌ NECESITA MEJORAS"
    for minimum, label in ((6, "🌟 EXCELENTE"), (4, "✅ BUENO"), (2, "⚠️ ACEPTABLE")):
        if score >= minimum:
            level = label
            break

    return level, score, advice

# Grade the model on its test metrics and print the verdict.
quality_level, quality_score, recommendations = evaluate_model_quality(test_metrics)

print("\n🏆 EVALUACIÓN DE CALIDAD DEL MODELO:")
print(f"📊 Nivel de calidad: {quality_level}")
print(f"🔢 Puntuación: {quality_score}/7")

if not recommendations:
    print("\n🎉 ¡El modelo cumple con todos los criterios de calidad!")
else:
    print("\n💡 RECOMENDACIONES PARA MEJORA:")
    for position, recommendation in enumerate(recommendations, start=1):
        print(f"  {position}. {recommendation}")

# Closing list of the files generated by the notebook.
print("\n✅ Evaluación completa finalizada. Archivos generados:")
for generated_file in (
    "model_evaluation_dashboard.png",
    "feature_importance_analysis.png",
    "misclassification_analysis.png",
    "evaluation_results.json",
):
    print(f"  - {generated_file}")
print(f"\n📋 Run de evaluación: {evaluation_run_id}")


In [None]:
# 6. ANALYSIS OF MISCLASSIFIED CASES
print("\n❌ ANÁLISIS DE CASOS MAL CLASIFICADOS", "=" * 50, sep="\n")

def analyze_misclassified_cases(X_test, y_true, y_pred, y_proba, feature_names):
    """Summarize the misclassified samples and look for patterns in them.

    Args:
        X_test: Feature DataFrame of the evaluated samples.
        y_true: True labels (0 / 1), aligned with ``X_test``.
        y_pred: Predicted labels (0 / 1), aligned with ``X_test``.
        y_proba: Predicted scores, aligned with ``X_test``.
        feature_names: Feature names; only the first 20 are compared.

    Returns:
        dict with ``total_misclassified``, ``false_positives``,
        ``false_negatives`` (plain ``int``) and ``misclassification_rate``
        (plain ``float``, 0.0 for an empty test set). When there are errors
        it also holds ``feature_differences`` (DataFrame sorted by relative
        difference of the feature means) and ``confidence_stats`` (dict of
        plain numbers). All scalars are native Python types so the result
        can be JSON-serialized and logged without conversion.
    """

    # Split samples into wrong and right predictions
    misclassified_mask = y_true != y_pred
    correctly_classified_mask = y_true == y_pred

    X_misclassified = X_test[misclassified_mask]
    X_correct = X_test[correctly_classified_mask]

    y_true_misc = y_true[misclassified_mask]
    y_pred_misc = y_pred[misclassified_mask]
    y_proba_misc = y_proba[misclassified_mask]

    n_total = len(X_test)
    n_misclassified = len(X_misclassified)
    # An empty test set has no errors; avoid dividing by zero.
    misclassification_rate = n_misclassified / n_total if n_total else 0.0

    print(f"📊 Casos mal clasificados: {n_misclassified} ({misclassification_rate*100:.1f}%)")

    # Breakdown by error type
    false_positives = int(((y_true_misc == 0) & (y_pred_misc == 1)).sum())
    false_negatives = int(((y_true_misc == 1) & (y_pred_misc == 0)).sum())

    print(f"🔴 Falsos Positivos: {false_positives} (predicho apto, real no apto)")
    print(f"🔴 Falsos Negativos: {false_negatives} (predicho no apto, real apto)")

    analysis_results = {
        'total_misclassified': n_misclassified,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'misclassification_rate': misclassification_rate
    }

    # Compare feature means between right and wrong predictions
    if n_misclassified > 0:
        print("\n📈 CARACTERÍSTICAS DE CASOS MAL CLASIFICADOS:")

        feature_comparison = []

        for feature in feature_names[:20]:  # first 20 features, keeps the output short
            if feature in X_test.columns:
                correct_mean = X_correct[feature].mean()
                misc_mean = X_misclassified[feature].mean()

                # Relative difference of the means; falls back to the
                # absolute value when the reference mean is exactly zero.
                if correct_mean != 0:
                    relative_diff = abs(misc_mean - correct_mean) / abs(correct_mean)
                else:
                    relative_diff = abs(misc_mean)

                feature_comparison.append({
                    'feature': feature,
                    'correct_mean': correct_mean,
                    'misclassified_mean': misc_mean,
                    'relative_difference': relative_diff
                })

        # Largest relative difference first
        feature_comparison_df = pd.DataFrame(feature_comparison)
        feature_comparison_df = feature_comparison_df.sort_values('relative_difference', ascending=False)

        print("Top 10 features con mayor diferencia entre casos correctos e incorrectos:")
        print(feature_comparison_df.head(10)[['feature', 'correct_mean', 'misclassified_mean', 'relative_difference']].to_string(index=False))

        analysis_results['feature_differences'] = feature_comparison_df

    # Score statistics of the wrong predictions
    if len(y_proba_misc) > 0:
        print("\n🎯 ANÁLISIS DE CONFIANZA EN CASOS MAL CLASIFICADOS:")

        conf_stats = {
            'mean_confidence': float(y_proba_misc.mean()),
            'median_confidence': float(np.median(y_proba_misc)),
            'std_confidence': float(y_proba_misc.std()),
            # |p - 0.5| > 0.3 is the same as p > 0.8 or p < 0.2
            'high_confidence_errors': int((np.abs(y_proba_misc - 0.5) > 0.3).sum())
        }

        print(f"  Confianza promedio: {conf_stats['mean_confidence']:.3f}")
        print(f"  Confianza mediana: {conf_stats['median_confidence']:.3f}")
        print(f"  Desviación estándar: {conf_stats['std_confidence']:.3f}")
        print(f"  Errores con alta confianza (>0.8 o <0.2): {conf_stats['high_confidence_errors']}")

        analysis_results['confidence_stats'] = conf_stats

    return analysis_results

# Run the misclassification analysis on the test set.
misclassification_analysis = analyze_misclassified_cases(
    X_test, y_test_true, y_test_pred, y_test_proba, feature_names
)

# 2x2 figure describing where and how confidently the model is wrong.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Score distribution of correct vs incorrect predictions
axes[0,0].hist(y_test_proba[y_test_true == y_test_pred], bins=30, alpha=0.7, label='Correctos', density=True)
axes[0,0].hist(y_test_proba[y_test_true != y_test_pred], bins=30, alpha=0.7, label='Incorrectos', density=True)
axes[0,0].set_xlabel('Probabilidad Predicha')
axes[0,0].set_ylabel('Densidad')
axes[0,0].set_title('Distribución de Probabilidades')
axes[0,0].legend()
axes[0,0].grid(True, alpha=0.3)

# 2. Boxplot of confidence, measured as the distance of the score from 0.5
correct_conf = np.abs(y_test_proba[y_test_true == y_test_pred] - 0.5)
incorrect_conf = np.abs(y_test_proba[y_test_true != y_test_pred] - 0.5)

axes[0,1].boxplot([correct_conf, incorrect_conf], labels=['Correctos', 'Incorrectos'])
axes[0,1].set_ylabel('Confianza (|prob - 0.5|)')
axes[0,1].set_title('Confianza en Predicciones')
axes[0,1].grid(True, alpha=0.3)

# 3. Features whose mean differs most between right and wrong predictions
if 'feature_differences' in misclassification_analysis:
    top_diff_features = misclassification_analysis['feature_differences'].head(10)

    diff_positions = range(len(top_diff_features))
    axes[1,0].bar(diff_positions, top_diff_features['relative_difference'])
    axes[1,0].set_xticks(diff_positions)
    axes[1,0].set_xticklabels(top_diff_features['feature'], rotation=45, ha='right')
    axes[1,0].set_ylabel('Diferencia Relativa')
    axes[1,0].set_title('Features con Mayor Diferencia\nCorrect vs Misclassified')
    axes[1,0].grid(True, alpha=0.3)

# 4. Number of correct / incorrect cases above each confidence level.
# A dedicated name is used on purpose: rebinding the global `thresholds`
# here would clobber the 100-point threshold sweep of section 4, and
# re-running the optimal-threshold lookup afterwards would then index the
# wrong array.
confidence_thresholds = [0.1, 0.2, 0.3, 0.4]
correct_counts = []
incorrect_counts = []

for thresh in confidence_thresholds:
    high_conf_mask = np.abs(y_test_proba - 0.5) > thresh
    correct_counts.append(((y_test_true == y_test_pred) & high_conf_mask).sum())
    incorrect_counts.append(((y_test_true != y_test_pred) & high_conf_mask).sum())

bar_positions = np.arange(len(confidence_thresholds))
width = 0.35

axes[1,1].bar(bar_positions - width/2, correct_counts, width, label='Correctos', alpha=0.8)
axes[1,1].bar(bar_positions + width/2, incorrect_counts, width, label='Incorrectos', alpha=0.8)
axes[1,1].set_xlabel('Umbral de Confianza')
axes[1,1].set_ylabel('Número de Casos')
axes[1,1].set_title('Casos por Nivel de Confianza')
axes[1,1].set_xticks(bar_positions)
axes[1,1].set_xticklabels([f'>{t}' for t in confidence_thresholds])
axes[1,1].legend()
axes[1,1].grid(True, alpha=0.3)

# Persist the figure; it is uploaded later as an MLflow artifact.
plt.tight_layout()
plt.savefig('misclassification_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Análisis de casos mal clasificados guardado como 'misclassification_analysis.png'")
