In [None]:
# ENTRENAMIENTO DE MODELO PARA SELECCIÓN DE CANDIDATOS - AZURE ML STUDIO
# =====================================================================

# 1. CONFIGURACIÓN Y CONEXIÓN A AZURE ML
import pandas as pd
import numpy as np
import warnings
import os
import json
import joblib
from pathlib import Path

warnings.filterwarnings('ignore')

# Azure ML imports
from azureml.core import Workspace, Dataset, Experiment, Run, Model
from azureml.core.compute import ComputeTarget
from azureml.core.environment import Environment
import mlflow
import mlflow.sklearn

# Notebook banner.
print("🚀 ENTRENAMIENTO DE MODELO DE SELECCIÓN DE CANDIDATOS - AZURE ML")
print("=" * 70)

# Resolve the Azure ML workspace from the local configuration. On failure,
# print guidance for the user and re-raise so the notebook stops here.
try:
    ws = Workspace.from_config()
    print(f"✅ Conectado al workspace: {ws.name}")
    print(f"📍 Suscripción: {ws.subscription_id}")
    print(f"🏢 Grupo de recursos: {ws.resource_group}")
except Exception as exc:
    print(f"❌ Error conectando al workspace: {exc}")
    print("Asegúrate de tener un archivo config.json o usar az login")
    raise

# Experiment name shared by the Azure ML run and the MLflow tracking below.
experiment_name = "candidate-selection-training"
run = Run.get_context()

# A run context that exposes no `experiment` attribute is treated as "not
# inside a submitted run": in that case start an interactive run ourselves.
if not hasattr(run, 'experiment'):
    experiment = Experiment(workspace=ws, name=experiment_name)
    run = experiment.start_logging()
    print(f"🆕 Nuevo experimento creado: {experiment_name}")
else:
    print(f"✅ Usando run existente: {run.id}")

# Point MLflow at the workspace tracking server and select the experiment.
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment(experiment_name)


In [None]:
# 2. DATA LOADING FROM AN AZURE ML DATASET
#
# Loads the training data into `df`, trying three sources in order:
#   1. the registered Azure ML dataset DATASET_NAME,
#   2. a parquet file in the workspace's default datastore (which is then
#      registered under DATASET_NAME),
#   3. a randomly generated synthetic frame.
# NOTE(review): source 3 means the rest of the notebook (including model
# registration) can run on random data; the only signal is the console output.
print("\n📊 CARGA DE DATOS DESDE AZURE ML")
print("="*50)

# Dataset configuration
DATASET_NAME = "engineers-dataset-training"
DATASET_VERSION = "latest"

try:
    # First choice: load the registered dataset
    dataset = Dataset.get_by_name(workspace=ws, name=DATASET_NAME, version=DATASET_VERSION)
    df = dataset.to_pandas_dataframe()
    print(f"✅ Dataset cargado desde Azure ML: {DATASET_NAME}")
    print(f"📊 Forma del dataset: {df.shape}")
    
    # Record dataset information on the run.
    # These calls are inside the `try`, so a logging failure also triggers
    # the datastore fallback below.
    run.log("dataset_name", DATASET_NAME)
    run.log("dataset_version", dataset.version)
    run.log("dataset_size", len(df))
    
except Exception as e:
    print(f"⚠️  No se pudo cargar dataset registrado: {e}")
    print("🔄 Intentando cargar desde datastore...")
    
    try:
        # Fallback: load from the default datastore
        datastore = ws.get_default_datastore()
        dataset_path = "processed/engineers_dataset_for_training.parquet"
        
        # Build a tabular dataset from the parquet file
        dataset = Dataset.Tabular.from_parquet_files(
            path=[(datastore, dataset_path)],
            validate=True
        )
        
        df = dataset.to_pandas_dataframe()
        print(f"✅ Dataset cargado desde datastore: {dataset_path}")
        print(f"📊 Forma del dataset: {df.shape}")
        
        # Register the dataset for future use
        dataset = dataset.register(
            workspace=ws,
            name=DATASET_NAME,
            description="Dataset procesado para entrenamiento de selección de candidatos",
            tags={"tipo": "entrenamiento", "procesado": "True"}
        )
        print(f"📝 Dataset registrado como: {DATASET_NAME}")
        
    except Exception as e2:
        print(f"❌ Error cargando datos: {e2}")
        print("💡 Sugerencia: Asegúrate de que el archivo esté en el datastore")
        print("   o crea el dataset manualmente en Azure ML Studio")
        
        # Last resort: create synthetic data for testing.
        # The seed is fixed, so the synthetic frame is reproducible.
        print("🧪 Creando datos sintéticos para pruebas...")
        np.random.seed(42)
        n_samples = 1000
        
        df = pd.DataFrame({
            'years_total_experience': np.random.randint(0, 20, n_samples),
            'years_skill_main': np.random.randint(0, 10, n_samples),
            'education_level': np.random.choice(['Bachelor', 'Master', 'PhD'], n_samples),
            'num_certifications': np.random.randint(0, 5, n_samples),
            'english_level': np.random.randint(0, 6, n_samples),
            'num_languages': np.random.randint(1, 4, n_samples),
            'apto': np.random.choice([0, 1], n_samples, p=[0.4, 0.6])
        })
        print(f"📊 Datos sintéticos creados: {df.shape}")

# Check that the target column 'apto' is present; training cannot proceed without it
if 'apto' not in df.columns:
    print("❌ ERROR: Campo 'apto' no encontrado en el dataset")
    print("Columnas disponibles:", list(df.columns))
    raise ValueError("Campo 'apto' requerido para entrenamiento")

print(f"✅ Dataset preparado con {len(df)} registros")
print(f"📋 Columnas: {list(df.columns)}")

# Record dataset metadata on the run
run.log("total_records", len(df))
run.log("total_features", len(df.columns) - 1)  # Exclude the target column


In [None]:
# 3. ANÁLISIS EXPLORATORIO Y PREPARACIÓN DE DATOS
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

print("\n🔍 ANÁLISIS EXPLORATORIO DEL TARGET")
print("=" * 50)

# Display names for the known target codes; any other code gets a generic label.
target_labels = {1: 'Aptos', 0: 'No aptos', -1: 'Revisión manual'}
# Per-class frequencies of 'apto', ordered by class value.
target_counts = df['apto'].value_counts().sort_index()
total = len(df)

print("📊 Distribución del target:")
class_distribution = {}
for value, count in target_counts.items():
    label = target_labels.get(value, f'Valor {value}')
    percentage = count / total * 100
    print(f"  {label}: {count:,} ({percentage:.1f}%)")
    class_distribution[label] = dict(count=count, percentage=percentage)

    # Mirror the distribution on the Azure ML run.
    run.log(f"class_{value}_count", count)
    run.log(f"class_{value}_percentage", percentage)

# Class balance = smallest class size / largest class size (needs >= 2 classes).
if len(target_counts) > 1:
    balance_ratio = target_counts.min() / target_counts.max()
    run.log("class_balance_ratio", balance_ratio)
    print(f"\n⚖️  Balance de clases: {balance_ratio:.2f}")

    # Below 0.3 the dataset is reported as imbalanced.
    dataset_imbalanced = bool(balance_ratio < 0.3)
    print(
        "⚠️  DATASET DESBALANCEADO - Se aplicarán técnicas de balanceo"
        if dataset_imbalanced
        else "✅ Dataset relativamente balanceado"
    )
    run.log("dataset_imbalanced", dataset_imbalanced)

print("\n🔧 PREPARACIÓN DE CARACTERÍSTICAS")
print("=" * 50)

# Identificar tipos de columnas automáticamente
def identify_and_process_columns(df):
    """Split the columns of ``df`` into numeric, categorical and derived groups.

    The columns ``'apto'`` (target) and ``'ID'`` are skipped. Every other
    column is handled by the first rule that matches:

    1. Numeric dtype (any integer/float width, including pandas nullable
       numerics; booleans are excluded) -> its name goes to ``numeric_cols``.
    2. Named ``'skills'``, ``'languages'`` or ``'certifications'`` -> converted
       into a derived feature frame (via ``process_skills_for_aml`` /
       ``process_languages_for_aml``, defined elsewhere in this file, or an
       item count for certifications) appended to ``processed_features``.
    3. Anything else -> its name goes to ``categorical_cols``.

    Args:
        df: Input ``pandas.DataFrame``.

    Returns:
        A tuple ``(numeric_cols, categorical_cols, processed_features)`` where
        the first two are lists of column names and the third is a list of
        DataFrames indexed like ``df``.
    """

    def _count_items(value):
        # Cells read from parquet list columns arrive as numpy arrays rather
        # than Python lists, so every common container type is counted.
        if isinstance(value, np.ndarray):
            return int(value.size)
        if isinstance(value, (list, tuple, set)):
            return len(value)
        return 0

    numeric_cols = []
    categorical_cols = []
    processed_features = []

    for col in df.columns:
        if col in ('apto', 'ID'):  # Skip target and identifier
            continue

        dtype = df[col].dtype
        # dtype-family check instead of a fixed list of dtype names, so that
        # int8/int16/uint*/nullable numerics are not mistaken for categoricals.
        # Booleans stay out of the numeric group, as before.
        if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
            numeric_cols.append(col)
        elif col == 'skills':
            print(f"🔄 Procesando columna compleja: {col}")
            # Binary presence features for the most common skills
            skills_features = process_skills_for_aml(df[col])
            if skills_features is not None:
                processed_features.append(skills_features)
        elif col == 'languages':
            print(f"🔄 Procesando columna compleja: {col}")
            # Numeric summary of the languages mapping
            lang_features = process_languages_for_aml(df[col])
            if lang_features is not None:
                processed_features.append(lang_features)
        elif col == 'certifications':
            print(f"🔄 Procesando columna compleja: {col}")
            # Number of certifications per row
            cert_features = pd.DataFrame(
                {'num_certifications': df[col].apply(_count_items)},
                index=df.index,
            )
            processed_features.append(cert_features)
        else:
            # Everything else is treated as categorical
            categorical_cols.append(col)

    return numeric_cols, categorical_cols, processed_features

def process_skills_for_aml(skills_series):
    """Turn a skills column into binary bag-of-words features.

    Each cell is converted to one text document: containers (list, tuple, set,
    numpy array) are joined with spaces, missing values become an empty
    document, and anything else is stringified. The documents are vectorised
    with ``CountVectorizer(binary=True, max_features=20, min_df=2)``.

    NOTE(review): the vectoriser's default tokenisation is kept, so multi-word
    skills are split into separate tokens and single-character tokens are
    dropped; confirm this is acceptable for the skill vocabulary.

    Args:
        skills_series: ``pandas.Series`` of skills per row.

    Returns:
        A DataFrame of 0/1 columns named ``skill_<token>`` indexed like
        ``skills_series``, or ``None`` if vectorisation fails (for example when
        no token satisfies ``min_df``). Callers treat ``None`` as "no skill
        features".
    """

    def _to_document(value):
        if isinstance(value, (list, tuple, set, np.ndarray)):
            # str() per item so one non-string entry cannot make the join fail
            # and discard every skill feature.
            return ' '.join(str(item) for item in value if item is not None)
        if pd.isna(value):
            # Previously missing cells were stringified to "None"/"nan" and
            # could surface as bogus skill tokens.
            return ''
        return str(value)

    try:
        skills_text = skills_series.apply(_to_document)

        # Binary features for the most common skill tokens
        vectorizer = CountVectorizer(binary=True, max_features=20, min_df=2)
        skills_matrix = vectorizer.fit_transform(skills_text)

        feature_names = [f"skill_{name}" for name in vectorizer.get_feature_names_out()]
        return pd.DataFrame(skills_matrix.toarray(), columns=feature_names, index=skills_series.index)
    except Exception as e:
        # Best-effort by design: report and let the caller continue without
        # skill features.
        print(f"⚠️  Error procesando skills: {e}")
        return None

def process_languages_for_aml(languages_series):
    """Summarise a languages column as four numeric features.

    Each cell is expected to be a dict mapping language name to a CEFR level
    string (``A1`` .. ``C2``) or ``NATIVE``. Levels are matched
    case-insensitively and ignoring surrounding whitespace; unknown levels
    score 0. Language names (``'English'``, ``'Spanish'``) are matched exactly.
    Cells that are not dicts produce an all-zero row.

    Args:
        languages_series: ``pandas.Series`` of per-row language dicts.

    Returns:
        A DataFrame indexed like ``languages_series`` with the columns
        ``num_languages``, ``english_level``, ``has_spanish`` and
        ``max_language_level``, or ``None`` if processing fails.
    """
    feature_columns = ['num_languages', 'english_level', 'has_spanish', 'max_language_level']
    try:
        level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6, 'NATIVE': 6}

        def _score(level):
            # Normalise before the lookup so "b2" or " Native " are not silently
            # scored as 0; str() also keeps unhashable values from raising.
            return level_mapping.get(str(level).strip().upper(), 0)

        language_features = []
        for langs in languages_series:
            if isinstance(langs, dict):
                features = {
                    'num_languages': len(langs),
                    'english_level': _score(langs.get('English', '')),
                    'has_spanish': 1 if 'Spanish' in langs else 0,
                    'max_language_level': max((_score(level) for level in langs.values()), default=0),
                }
            else:
                features = dict.fromkeys(feature_columns, 0)
            language_features.append(features)

        # Explicit columns keep the schema stable even for an empty input.
        return pd.DataFrame(language_features, columns=feature_columns, index=languages_series.index)
    except Exception as e:
        # Best-effort by design: report and let the caller continue without
        # language features.
        print(f"⚠️  Error procesando idiomas: {e}")
        return None

# Classify the dataset columns and build the derived feature groups.
numeric_cols, categorical_cols, processed_features = identify_and_process_columns(df)

print(f"📊 Columnas numéricas ({len(numeric_cols)}): {numeric_cols}")
print(f"🏷️  Columnas categóricas ({len(categorical_cols)}): {categorical_cols}")
print(f"🔧 Features procesadas: {len(processed_features)} grupos")

# Log the size of each feature group on the run.
for _metric_name, _group in (
    ("numeric_features_count", numeric_cols),
    ("categorical_features_count", categorical_cols),
    ("processed_features_count", processed_features),
):
    run.log(_metric_name, len(_group))


In [None]:
# 4. CONSTRUCCIÓN DEL DATASET FINAL Y DIVISIÓN
from imblearn.over_sampling import SMOTE

print("\n🔗 CONSTRUCCIÓN DEL DATASET FINAL")
print("="*50)

# Feature groups that will be concatenated column-wise into X.
feature_dfs = []

# Numeric columns: missing values are imputed with 0.
if numeric_cols:
    numeric_df = df[numeric_cols].fillna(0)
    feature_dfs.append(numeric_df)
    print(f"✅ {len(numeric_cols)} columnas numéricas agregadas")

# Categorical columns: one-hot encode those with at most 10 distinct values.
if categorical_cols:
    valid_categorical = []
    for col in categorical_cols:
        if col not in df.columns:
            continue
        try:
            unique_vals = df[col].nunique()
        except TypeError:
            # Cells holding unhashable objects (lists, arrays, dicts) cannot be
            # one-hot encoded; skip the column instead of aborting the cell.
            print(f"⚠️  Saltando '{col}': valores no hashables")
            continue
        if unique_vals <= 10:  # At most 10 categories to limit sparsity
            valid_categorical.append(col)
        else:
            print(f"⚠️  Saltando '{col}': demasiadas categorías ({unique_vals})")

    if valid_categorical:
        # `columns=` forces every selected column to be encoded. Without it,
        # get_dummies only encodes object/category columns and raises when the
        # length of `prefix` does not match (e.g. for a bool column).
        categorical_df = pd.get_dummies(
            df[valid_categorical],
            columns=valid_categorical,
            prefix=valid_categorical,
            dummy_na=True,
        )
        feature_dfs.append(categorical_df)
        print(f"✅ {len(valid_categorical)} columnas categóricas procesadas -> {categorical_df.shape[1]} features")

# Derived feature groups (skills, languages, certifications).
feature_dfs.extend(processed_features)

# Combine everything
if feature_dfs:
    X = pd.concat(feature_dfs, axis=1)
    y = df['apto'].copy()

    # Derived features can reuse the name of an existing column (for example
    # 'num_certifications'). Keep the first occurrence so that every feature
    # name is unique; duplicated labels break label-based selection below and
    # cannot be written to parquet.
    duplicated_mask = X.columns.duplicated()
    if duplicated_mask.any():
        duplicated_names = list(dict.fromkeys(X.columns[duplicated_mask]))
        X = X.loc[:, ~duplicated_mask]
        print(f"⚠️  Columnas duplicadas eliminadas (se conserva la primera): {duplicated_names}")

    # Clean remaining missing values
    X = X.fillna(0)

    # Drop zero-variance features (selected by position, not by label).
    keep_mask = (X.var() != 0).to_numpy()
    if not keep_mask.all():
        removed_features = int((~keep_mask).sum())
        X = X.loc[:, keep_mask]
        print(f"🗑️  Eliminadas {removed_features} features con varianza cero")

    print(f"\n🎯 Dataset final: {X.shape}")

    # Log final dataset information
    run.log("final_features_count", X.shape[1])
    run.log("final_samples_count", X.shape[0])

    # Keep the feature names for later cells
    feature_names = list(X.columns)
    run.log_list("feature_names", feature_names)

else:
    raise ValueError("❌ No se pudieron crear features")

print("\n📊 DIVISIÓN DEL DATASET")
print("="*40)

# Rows labelled -1 ("manual review") are excluded from training.
revision_count = (y == -1).sum()
if revision_count > 0:
    print(f"🎯 Encontrados {revision_count} casos de revisión manual")
    print("📋 Excluyendo casos de revisión manual para simplificar el entrenamiento")

    mask_no_revision = y != -1
    X_clean = X[mask_no_revision].reset_index(drop=True)
    y_clean = y[mask_no_revision].reset_index(drop=True)

    run.log("revision_cases_excluded", revision_count)
    print(f"✅ Dataset limpio: {X_clean.shape[0]} muestras")
else:
    X_clean, y_clean = X.reset_index(drop=True), y.reset_index(drop=True)
    run.log("revision_cases_excluded", 0)

# Stratified split: 20% test first, then 25% of the remainder as validation
# (0.25 * 0.8 = 0.2 of the total), giving 60/20/20.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_clean, y_clean,
    test_size=0.2,
    random_state=42,
    stratify=y_clean
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.25,  # 0.25 * 0.8 = 0.2 of the total
    random_state=42,
    stratify=y_temp
)

print(f"📂 División del dataset:")
print(f"  Entrenamiento: {X_train.shape[0]} muestras ({X_train.shape[0]/len(X_clean)*100:.1f}%)")
print(f"  Validación: {X_val.shape[0]} muestras ({X_val.shape[0]/len(X_clean)*100:.1f}%)")
print(f"  Test: {X_test.shape[0]} muestras ({X_test.shape[0]/len(X_clean)*100:.1f}%)")

# Log the size of each split
run.log("train_size", len(X_train))
run.log("val_size", len(X_val))
run.log("test_size", len(X_test))

# Oversample the training split only (never validation/test) when the
# minority/majority ratio is below 0.7.
train_counts = Counter(y_train)
if len(train_counts) > 1:
    minority_class = min(train_counts.values())
    majority_class = max(train_counts.values())
    imbalance_ratio = minority_class / majority_class

    print(f"\n⚖️  Ratio de balance en entrenamiento: {imbalance_ratio:.2f}")
    run.log("train_balance_ratio", imbalance_ratio)

    if imbalance_ratio < 0.7:
        print("🔄 Aplicando SMOTE para balancear clases...")
        try:
            # k_neighbors must be smaller than the minority class size.
            smote = SMOTE(random_state=42, k_neighbors=min(5, minority_class-1))
            X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

            balanced_counts = Counter(y_train_balanced)
            print(f"✅ SMOTE aplicado:")
            for class_val, count in sorted(balanced_counts.items()):
                label = "Aptos" if class_val == 1 else "No aptos"
                print(f"    {label}: {count}")

            run.log("smote_applied", True)
            run.log("balanced_train_size", len(X_train_balanced))

        except Exception as e:
            # Best-effort: fall back to the unbalanced training split.
            print(f"⚠️  Error aplicando SMOTE: {e}")
            print("📊 Usando dataset original")
            X_train_balanced, y_train_balanced = X_train, y_train
            run.log("smote_applied", False)
    else:
        print("✅ Dataset relativamente balanceado - no se requiere balanceo")
        X_train_balanced, y_train_balanced = X_train, y_train
        run.log("smote_applied", False)
else:
    print("⚠️  Solo una clase presente en entrenamiento")
    X_train_balanced, y_train_balanced = X_train, y_train
    run.log("smote_applied", False)

print(f"\n✅ Datos preparados para entrenamiento")


In [None]:
# 5. ENTRENAMIENTO DE MODELOS CON MLFLOW TRACKING
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
import time

print("\n🤖 ENTRENAMIENTO DE MODELOS")
print("="*50)

# Candidate models. `requires_scaling` selects whether a model is fitted and
# evaluated on the standardised arrays or on the raw feature frames.
models_config = {
    'RandomForest': {
        'model': RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42,
            n_jobs=-1
        ),
        'requires_scaling': False
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=6,
            random_state=42
        ),
        'requires_scaling': False
    },
    'LogisticRegression': {
        'model': LogisticRegression(
            random_state=42,
            max_iter=1000,
            class_weight='balanced',
            solver='liblinear'
        ),
        'requires_scaling': True
    },
    'SVM': {
        'model': SVC(
            kernel='rbf',
            probability=True,
            random_state=42,
            class_weight='balanced',
            gamma='scale'
        ),
        'requires_scaling': True
    }
}

# Standardised copies of the splits; the scaler is fitted on the (possibly
# oversampled) training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Train every model, evaluate it on the validation split and track it in MLflow.
# A model that fails is recorded as None in `model_results` and skipped later.
trained_models = {}
model_results = {}

for model_name, config in models_config.items():
    print(f"\n🔄 Entrenando {model_name}...")

    # One MLflow run per model
    with mlflow.start_run(nested=True, run_name=f"model_{model_name}") as child_run:
        start_time = time.time()

        try:
            model = config['model']

            # Log hyperparameters
            mlflow.log_params(model.get_params())

            # Fit and predict on scaled or raw data as configured
            if config['requires_scaling']:
                model.fit(X_train_scaled, y_train_balanced)
                y_val_pred = model.predict(X_val_scaled)
                y_val_proba = model.predict_proba(X_val_scaled)[:, 1]
            else:
                model.fit(X_train_balanced, y_train_balanced)
                y_val_pred = model.predict(X_val)
                y_val_proba = model.predict_proba(X_val)[:, 1]

            # Validation metrics
            training_time = time.time() - start_time
            accuracy = (y_val_pred == y_val).mean()

            try:
                auc_score = roc_auc_score(y_val, y_val_proba)
            except ValueError:
                # AUC is undefined when the validation split has one class.
                auc_score = 0.0

            # Detailed report
            report = classification_report(y_val, y_val_pred, output_dict=True)

            # Log metrics in MLflow
            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("auc", auc_score)
            mlflow.log_metric("training_time", training_time)
            mlflow.log_metric("f1_macro", report['macro avg']['f1-score'])
            mlflow.log_metric("precision_macro", report['macro avg']['precision'])
            mlflow.log_metric("recall_macro", report['macro avg']['recall'])

            # Per-class metrics
            for class_label in ['0', '1']:
                if class_label in report:
                    mlflow.log_metric(f"precision_class_{class_label}", report[class_label]['precision'])
                    mlflow.log_metric(f"recall_class_{class_label}", report[class_label]['recall'])
                    mlflow.log_metric(f"f1_class_{class_label}", report[class_label]['f1-score'])

            # Log and register the fitted model in MLflow
            mlflow.sklearn.log_model(
                model,
                f"model_{model_name}",
                registered_model_name=f"candidate_selection_{model_name}"
            )

            # Keep the fitted model and its metrics for the comparison step
            trained_models[model_name] = {
                'model': model,
                'requires_scaling': config['requires_scaling']
            }

            model_results[model_name] = {
                'accuracy': accuracy,
                'auc': auc_score,
                'training_time': training_time,
                'f1_macro': report['macro avg']['f1-score'],
                'precision_macro': report['macro avg']['precision'],
                'recall_macro': report['macro avg']['recall'],
                'predictions': y_val_pred,
                'probabilities': y_val_proba
            }

            print(f"  ✅ Completado en {training_time:.2f}s")
            print(f"  📊 Accuracy: {accuracy:.3f}")
            print(f"  📈 AUC: {auc_score:.3f}")
            print(f"  🎯 F1-Score (macro): {report['macro avg']['f1-score']:.3f}")

            # Mirror the headline metrics on the main Azure ML run
            run.log(f"{model_name}_accuracy", accuracy)
            run.log(f"{model_name}_auc", auc_score)
            run.log(f"{model_name}_f1_macro", report['macro avg']['f1-score'])
            run.log(f"{model_name}_training_time", training_time)

        except Exception as e:
            # Best-effort per model: record the failure and continue.
            print(f"  ❌ Error entrenando {model_name}: {e}")
            # Truncated so a long traceback message cannot exceed the tracking
            # backend's parameter length limit.
            mlflow.log_param("error", str(e)[:250])
            model_results[model_name] = None

print(f"\n✅ Entrenamiento completado")
successful_models = [name for name, result in model_results.items() if result is not None]
print(f"📊 {len(successful_models)} modelos entrenados exitosamente: {successful_models}")

# Log the summary on the main run
run.log("total_models_trained", len(successful_models))
run.log_list("successful_models", successful_models)


In [None]:
# 6. SELECCIÓN DEL MEJOR MODELO Y EVALUACIÓN FINAL
import matplotlib.pyplot as plt

print("\n🏆 SELECCIÓN Y EVALUACIÓN DEL MEJOR MODELO")
print("="*60)

if len(successful_models) > 0:
    # Comparison table: one row per successful model, scalar metrics only.
    # The prediction/probability arrays stay in `model_results`; mixing them
    # in would make every column object-typed, which turns `round` into a
    # no-op and can make `idxmax` fail.
    display_cols = ['accuracy', 'auc', 'f1_macro', 'precision_macro', 'recall_macro', 'training_time']
    results_df = pd.DataFrame({
        name: {metric: result[metric] for metric in display_cols}
        for name, result in model_results.items()
        if result is not None
    }).T.astype(float)

    # Show the results table
    print("\n📈 TABLA DE RESULTADOS:")
    results_display = results_df[display_cols].round(3)
    results_display.columns = ['Accuracy', 'AUC', 'F1-Macro', 'Precision', 'Recall', 'Tiempo(s)']
    print(results_display.to_string())

    # Pick the model with the highest validation macro F1
    best_model_name = results_df['f1_macro'].idxmax()
    best_score = float(results_df.loc[best_model_name, 'f1_macro'])

    print(f"\n🏆 MEJOR MODELO: {best_model_name}")
    print(f"🎯 F1-Score (macro): {best_score:.3f}")
    print(f"📊 Accuracy: {results_df.loc[best_model_name, 'accuracy']:.3f}")
    print(f"📈 AUC: {results_df.loc[best_model_name, 'auc']:.3f}")

    # Log the winner on the main run
    run.log("best_model_name", best_model_name)
    run.log("best_model_f1_macro", best_score)
    run.log("best_model_accuracy", results_df.loc[best_model_name, 'accuracy'])
    run.log("best_model_auc", results_df.loc[best_model_name, 'auc'])

    # Evaluate the winner once on the held-out test split
    best_model_info = trained_models[best_model_name]
    best_model = best_model_info['model']

    # Use the scaled arrays when the model was trained on them
    if best_model_info['requires_scaling']:
        y_test_pred = best_model.predict(X_test_scaled)
        y_test_proba = best_model.predict_proba(X_test_scaled)[:, 1]
    else:
        y_test_pred = best_model.predict(X_test)
        y_test_proba = best_model.predict_proba(X_test)[:, 1]

    # Test metrics
    test_accuracy = (y_test_pred == y_test).mean()
    test_auc = roc_auc_score(y_test, y_test_proba)
    test_report = classification_report(y_test, y_test_pred, output_dict=True)

    print(f"\n🧪 EVALUACIÓN EN CONJUNTO DE TEST:")
    print(f"📊 Accuracy en test: {test_accuracy:.3f}")
    print(f"📈 AUC en test: {test_auc:.3f}")
    print(f"🎯 F1-Score macro en test: {test_report['macro avg']['f1-score']:.3f}")

    # Log test metrics
    run.log("test_accuracy", test_accuracy)
    run.log("test_auc", test_auc)
    run.log("test_f1_macro", test_report['macro avg']['f1-score'])
    run.log("test_precision_macro", test_report['macro avg']['precision'])
    run.log("test_recall_macro", test_report['macro avg']['recall'])

    # Confusion matrix. `labels` pins the layout to 2x2 (row/column 0 = class
    # 0, 1 = class 1) even if one class is absent from the test predictions.
    cm = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
    print(f"\n🔢 Matriz de Confusión en Test:")
    print(f"               Predicho")
    print(f"Real    No Apto  Apto")
    print(f"No Apto   {cm[0,0]:3d}    {cm[0,1]:3d}")
    print(f"Apto      {cm[1,0]:3d}    {cm[1,1]:3d}")

    # Log the confusion matrix cells
    run.log("test_tn", int(cm[0,0]))  # True Negatives
    run.log("test_fp", int(cm[0,1]))  # False Positives
    run.log("test_fn", int(cm[1,0]))  # False Negatives
    run.log("test_tp", int(cm[1,1]))  # True Positives

    # Plot the confusion matrix and store it as an artifact (best-effort)
    try:
        plt.figure(figsize=(8, 6))
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.title(f'Matriz de Confusión - {best_model_name}')
        plt.colorbar()

        # Axis labels
        tick_marks = np.arange(2)
        plt.xticks(tick_marks, ['No Apto', 'Apto'])
        plt.yticks(tick_marks, ['No Apto', 'Apto'])

        # Cell counts, white on dark cells and black on light ones
        thresh = cm.max() / 2.
        for i, j in np.ndindex(cm.shape):
            plt.text(j, i, format(cm[i, j], 'd'),
                    horizontalalignment="center",
                    color="white" if cm[i, j] > thresh else "black")

        plt.ylabel('Etiqueta Real')
        plt.xlabel('Etiqueta Predicha')
        plt.tight_layout()

        # The output directory may not exist yet at this point; without it
        # savefig fails and the figure is silently lost.
        os.makedirs("outputs", exist_ok=True)
        confusion_matrix_path = "outputs/confusion_matrix.png"
        plt.savefig(confusion_matrix_path, dpi=300, bbox_inches='tight')
        mlflow.log_artifact(confusion_matrix_path)
        plt.show()

        print(f"📊 Matriz de confusión guardada como artifact")

    except Exception as e:
        print(f"⚠️ Error creando visualización: {e}")

else:
    print("❌ ERROR: No hay modelos exitosos para evaluar")
    raise ValueError("No se entrenaron modelos exitosamente")


In [None]:
# 7. REGISTRO FINAL DEL MODELO EN AZURE ML Y PREPARACIÓN PARA RAI
import os

print("\n💾 REGISTRO FINAL DEL MODELO EN AZURE ML")
print("="*60)

# Create the outputs directory if it does not exist
os.makedirs("outputs", exist_ok=True)

# Paths of the artifacts saved next to the best model
model_path = "outputs/model.pkl"
scaler_path = "outputs/scaler.pkl"
feature_names_path = "outputs/feature_names.json"

# Save the model
joblib.dump(best_model, model_path)
print(f"✅ Modelo guardado: {model_path}")

# Save the scaler; a scaler file is always written so consumers can load it
# unconditionally.
if best_model_info['requires_scaling']:
    joblib.dump(scaler, scaler_path)
    print(f"✅ Scaler guardado: {scaler_path}")
    run.log("requires_scaling", True)
else:
    # Placeholder scaler fitted on a single all-zero row
    dummy_scaler = StandardScaler()
    dummy_scaler.fit(np.zeros((1, X_train.shape[1])))
    joblib.dump(dummy_scaler, scaler_path)
    run.log("requires_scaling", False)
    print(f"✅ Scaler dummy guardado: {scaler_path}")

# Save feature names for interpretability
feature_metadata = {
    "feature_names": feature_names,
    "feature_count": len(feature_names),
    "numeric_features": numeric_cols,
    "categorical_features": categorical_cols,
    "processed_features_count": len(processed_features)
}

with open(feature_names_path, 'w') as f:
    json.dump(feature_metadata, f, indent=2)
print(f"✅ Metadatos de features guardados: {feature_names_path}")

# Save the evaluation splits for the evaluation and RAI notebooks
test_data_path = "outputs/test_data.parquet"
val_data_path = "outputs/val_data.parquet"

# Attach targets and predictions to the features.
# The targets are assigned as plain arrays (positionally): X_test / X_val keep
# the shuffled index produced by train_test_split, so assigning a re-indexed
# Series would align on labels and attach wrong or missing targets.
X_test_with_target = X_test.copy()
X_test_with_target['y_true'] = y_test.to_numpy()
X_test_with_target['y_pred'] = y_test_pred
X_test_with_target['y_proba'] = y_test_proba

X_val_with_target = X_val.copy()
X_val_with_target['y_true'] = y_val.to_numpy()
X_val_with_target['y_pred'] = model_results[best_model_name]['predictions']
X_val_with_target['y_proba'] = model_results[best_model_name]['probabilities']

# Write the datasets
X_test_with_target.to_parquet(test_data_path, index=False)
X_val_with_target.to_parquet(val_data_path, index=False)

print(f"✅ Datos de test guardados: {test_data_path}")
print(f"✅ Datos de validación guardados: {val_data_path}")

# Log every artifact in MLflow
artifacts_to_log = [
    model_path,
    scaler_path,
    feature_names_path,
    test_data_path,
    val_data_path
]

for artifact_path in artifacts_to_log:
    if os.path.exists(artifact_path):
        mlflow.log_artifact(artifact_path)
        print(f"📤 Artifact registrado en MLflow: {artifact_path}")

# Register the best model in the Azure ML Model Registry (best-effort).
# `registered_model` stays None when registration fails so the final summary
# can report the real outcome.
registered_model = None
try:
    # Descriptive tags
    model_tags = {
        "model_type": best_model_name,
        "framework": "scikit-learn",
        "task": "binary_classification",
        "domain": "candidate_selection",
        "version": "1.0",
        "performance_metric": "f1_macro",
        "performance_value": str(round(best_score, 3)),
        "test_accuracy": str(round(test_accuracy, 3)),
        "test_auc": str(round(test_auc, 3)),
        "requires_scaling": str(best_model_info['requires_scaling'])
    }
    
    model_description = f"""
    Modelo de selección de candidatos entrenado con {best_model_name}.
    
    Métricas de rendimiento:
    - F1-Score (macro): {best_score:.3f}
    - Accuracy en test: {test_accuracy:.3f}
    - AUC en test: {test_auc:.3f}
    
    Características del modelo:
    - Número de features: {len(feature_names)}
    - Requiere escalado: {best_model_info['requires_scaling']}
    - Datos de entrenamiento: {len(X_train_balanced)} muestras
    - Datos de test: {len(X_test)} muestras
    """

    registered_model = Model.register(
        workspace=ws,
        model_path="outputs/",  # Directory with all the artifacts
        model_name="candidate-selection-model",
        description=model_description,
        tags=model_tags
    )

    print(f"✅ Modelo registrado en Azure ML: {registered_model.name}")
    print(f"📋 Versión del modelo: {registered_model.version}")
    print(f"🔗 ID del modelo: {registered_model.id}")

    # Log the registered model's identity on the run
    run.log("registered_model_name", registered_model.name)
    run.log("registered_model_version", registered_model.version)
    run.log("registered_model_id", registered_model.id)

except Exception as e:
    print(f"⚠️ Error registrando modelo en Azure ML: {e}")
    print("El modelo se guardó localmente y en MLflow")

# Hand-off information for the follow-up notebooks
notebook_info = {
    "experiment_name": experiment_name,
    "run_id": run.id,
    "best_model_name": best_model_name,
    "model_requires_scaling": best_model_info['requires_scaling'],
    "feature_names": feature_names,
    "test_metrics": {
        "accuracy": float(test_accuracy),
        "auc": float(test_auc),
        "f1_macro": float(test_report['macro avg']['f1-score'])
    },
    "dataset_info": {
        "train_size": len(X_train_balanced),
        "val_size": len(X_val),
        "test_size": len(X_test),
        "feature_count": len(feature_names)
    }
}

notebook_info_path = "outputs/notebook_info.json"
with open(notebook_info_path, 'w') as f:
    json.dump(notebook_info, f, indent=2)

mlflow.log_artifact(notebook_info_path)
print(f"✅ Información para notebooks posteriores guardada: {notebook_info_path}")

print(f"\n🎉 ENTRENAMIENTO COMPLETADO EXITOSAMENTE")
print("="*60)
print(f"🏆 Mejor modelo: {best_model_name}")
print(f"📊 F1-Score (macro): {best_score:.3f}")
print(f"🧪 Accuracy en test: {test_accuracy:.3f}")
print(f"📈 AUC en test: {test_auc:.3f}")
if registered_model is not None:
    print(f"💾 Modelo registrado en Azure ML")
else:
    # Do not claim a registration that did not happen.
    print(f"⚠️ Modelo NO registrado en Azure ML (guardado localmente y en MLflow)")
print(f"📋 Run ID: {run.id}")

print(f"\n📋 PRÓXIMOS PASOS:")
print("1. ✅ Ejecutar notebook 03_evaluation.ipynb para análisis detallado")
print("2. ✅ Ejecutar notebook 04_rai_dashboard.ipynb para análisis de responsabilidad")
print("3. 🚀 Desplegar modelo como endpoint en Azure ML")
print("4. 🔄 Configurar pipeline de inferencia")

# Mark the main run as completed
run.complete()
print(f"\n✅ Experimento completado: {experiment_name}")


In [None]:
# 1. CONFIGURACIÓN Y CARGA DE DATOS
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Path configuration (relative to the notebook's working directory).
DATA_PATH = Path("../data/processed/")
MODELS_PATH = Path("../models/")
# parents=True so a missing intermediate directory does not abort the setup.
MODELS_PATH.mkdir(parents=True, exist_ok=True)

print("🚀 ENTRENAMIENTO DE MODELO DE SELECCIÓN DE CANDIDATOS")
print("="*60)

# Load the processed dataset from the parquet file.
dataset_path = DATA_PATH / "engineers_dataset_for_training.parquet"
if not dataset_path.exists():
    print(f"❌ ERROR: No se encuentra {dataset_path}")
    print("Ejecuta primero el notebook 01_data_preparation.ipynb")
    # Fail fast: continuing would leave `df` undefined (or silently reuse a
    # `df` left over from an earlier cell), so every later cell would either
    # crash with an unrelated NameError or train on the wrong data.
    raise FileNotFoundError(f"No se encuentra {dataset_path}")

df = pd.read_parquet(dataset_path)
print(f"✅ Dataset cargado: {df.shape}")
print(f"📊 Columnas: {list(df.columns)}")


In [None]:
# 2. ANÁLISIS EXPLORATORIO DEL TARGET
# Exploratory summary of the target column 'apto': class distribution,
# class balance and per-class means of a few numeric columns.
if 'apto' in df.columns:
    print("\n📈 ANÁLISIS DEL CAMPO TARGET 'apto':")
    
    # Frequency of each target value, ordered by the value itself
    target_counts = df['apto'].value_counts().sort_index()
    target_labels = {1: 'Aptos', 0: 'No aptos', -1: 'Revisión manual'}
    
    print(f"Distribución del target:")
    total = len(df)
    for value, count in target_counts.items():
        label = target_labels.get(value, f'Valor {value}')
        percentage = count / total * 100
        print(f"  {label}: {count:,} ({percentage:.1f}%)")
    
    # Class balance: size of the rarest class relative to the most common one
    if len(target_counts) > 1:
        balance_ratio = target_counts.min() / target_counts.max()
        print(f"\n⚖️  Balance de clases: {balance_ratio:.2f}")
        if balance_ratio < 0.3:
            print("⚠️  DATASET DESBALANCEADO - Considerar técnicas de balanceo")
        else:
            print("✅ Dataset relativamente balanceado")
    
    # Per-class means of the numeric columns that are actually present
    print(f"\n📊 ESTADÍSTICAS POR GRUPO:")
    numeric_cols = ['years_total_experience', 'years_skill_main', 'num_promotions']
    available_numeric = [col for col in numeric_cols if col in df.columns]
    
    if available_numeric:
        for target_value, label in target_labels.items():
            if target_value in target_counts.index:
                subset = df[df['apto'] == target_value]
                print(f"\n{label} (n={len(subset)}):")
                for col in available_numeric:
                    # Skip columns that hold no data at all for this class
                    if not subset[col].isna().all():
                        mean_val = subset[col].mean()
                        # Only the 'years_*' columns are durations; a count
                        # such as num_promotions must not be labelled as years.
                        unit = " años" if col.startswith('years_') else ""
                        print(f"  {col}: {mean_val:.1f}{unit} promedio")
else:
    print("❌ ERROR: Campo 'apto' no encontrado en el dataset")
    print("Ejecuta primero el notebook de preparación de datos")


In [None]:
# 3. PREPARACIÓN DE CARACTERÍSTICAS (FEATURE ENGINEERING)
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import json

print("\n🔧 PREPARACIÓN DE CARACTERÍSTICAS")
print("="*40)

# Identificar tipos de columnas
def identify_column_types(df):
    """Classify the feature columns of *df* by the kind of values they hold.

    The target column ``'apto'`` and the identifier column ``'ID'`` are
    skipped. Any numeric dtype (int32, float32, unsigned, nullable, ...)
    counts as numeric, not only ``int64``/``float64``; boolean columns are
    still treated as categorical. A column containing at least one ``list``
    is a list column; otherwise a column containing at least one ``dict`` is
    a dict column; everything else is categorical.

    Args:
        df: DataFrame whose columns are to be classified.

    Returns:
        Tuple ``(numeric_cols, categorical_cols, list_cols, dict_cols)`` of
        column-name lists, each following the column order of *df*.
    """
    numeric_cols = []
    categorical_cols = []
    list_cols = []
    dict_cols = []

    for col in df.columns:
        if col in ('apto', 'ID'):  # target and identifier are not features
            continue

        dtype = df[col].dtype
        # Booleans report as numeric in pandas; keep them categorical.
        if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
            numeric_cols.append(col)
            continue

        # Single pass over the values; a list anywhere decides the column,
        # so the scan can stop as soon as one is seen.
        has_list = False
        has_dict = False
        for value in df[col]:
            if isinstance(value, list):
                has_list = True
                break
            if isinstance(value, dict):
                has_dict = True

        if has_list:
            list_cols.append(col)
        elif has_dict:
            dict_cols.append(col)
        else:
            categorical_cols.append(col)

    return numeric_cols, categorical_cols, list_cols, dict_cols

# Classify every feature column, then report each group with its size
numeric_cols, categorical_cols, list_cols, dict_cols = identify_column_types(df)

for _heading, _group in (
    ("📊 Columnas numéricas", numeric_cols),
    ("🏷️  Columnas categóricas", categorical_cols),
    ("📋 Columnas con listas", list_cols),
    ("📖 Columnas con diccionarios", dict_cols),
):
    print(f"{_heading} ({len(_group)}): {_group}")

# Función para procesar columnas complejas
def process_skills_column(skills_series):
    """Turn a series of skill lists into binary indicator columns.

    Each list is joined into one space-separated document (non-list entries
    become an empty document) and fed to a binary ``CountVectorizer`` built
    with ``max_features=50`` and ``min_df=2``.

    Args:
        skills_series: Series whose entries are lists of skill strings.

    Returns:
        Tuple ``(skills_df, vectorizer)``: a DataFrame of 0/1 columns named
        ``skill_<token>`` sharing the index of *skills_series*, and the
        fitted vectorizer.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    def _to_document(entry):
        # Anything that is not a list contributes no tokens
        if isinstance(entry, list):
            return ' '.join(entry)
        return ''

    documents = skills_series.map(_to_document)

    vectorizer = CountVectorizer(binary=True, max_features=50, min_df=2)
    dense_matrix = vectorizer.fit_transform(documents).toarray()

    column_names = [f"skill_{token}" for token in vectorizer.get_feature_names_out()]
    skills_df = pd.DataFrame(
        dense_matrix,
        columns=column_names,
        index=skills_series.index,
    )

    return skills_df, vectorizer

def process_languages_column(languages_series):
    """Convert per-row language dictionaries into numeric features.

    Each entry is expected to be a dict mapping a language name to a level
    string (``'A1'`` ... ``'C2'`` or ``'NATIVE'``). Level strings are matched
    ignoring case and surrounding whitespace; unknown or non-string levels
    score 0. Entries that are not dicts produce an all-zero row.

    Args:
        languages_series: Series of language dicts.

    Returns:
        DataFrame indexed like *languages_series* with the columns
        ``num_languages``, ``english_level``, ``has_spanish`` and
        ``max_language_level``. The columns are present even when the
        input series is empty.
    """
    # Level names mapped to an ordinal scale; 'NATIVE' scores the same as 'C2'
    level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6, 'NATIVE': 6}
    columns = ['num_languages', 'english_level', 'has_spanish', 'max_language_level']

    def level_score(level):
        # Normalise so that 'b2' or ' Native ' are not silently scored as 0
        if not isinstance(level, str):
            return 0
        return level_mapping.get(level.strip().upper(), 0)

    language_features = []
    for languages in languages_series:
        if isinstance(languages, dict):
            features = {
                'num_languages': len(languages),
                'english_level': level_score(languages.get('English', '')),
                'has_spanish': 1 if 'Spanish' in languages else 0,
                'max_language_level': max(map(level_score, languages.values()), default=0),
            }
        else:
            features = dict.fromkeys(columns, 0)
        language_features.append(features)

    # Passing columns explicitly keeps the schema stable for empty input
    return pd.DataFrame(language_features, index=languages_series.index, columns=columns)

# Build derived feature frames from the list/dict-valued columns
processed_features = []

# Skills -> binary indicator columns
if 'skills' in list_cols:
    print("\n🔨 Procesando columna 'skills'...")
    skills_df, skills_vectorizer = process_skills_column(df['skills'])
    processed_features += [skills_df]
    print(f"✅ {len(skills_df.columns)} features de skills creadas")

# Languages -> numeric summary features
if 'languages' in dict_cols:
    print("\n🌍 Procesando columna 'languages'...")
    languages_df = process_languages_column(df['languages'])
    processed_features += [languages_df]
    print(f"✅ {len(languages_df.columns)} features de idiomas creadas")

# Certifications -> number of entries per row (0 when the cell is not a list)
if 'certifications' in list_cols:
    print("\n🏆 Procesando columna 'certifications'...")
    _cert_counts = df['certifications'].map(
        lambda entry: len(entry) if isinstance(entry, list) else 0
    )
    cert_df = pd.DataFrame({'num_certifications': _cert_counts}, index=df.index)
    processed_features += [cert_df]
    print("✅ Feature de certificaciones creada")

print(f"\n✅ {len(processed_features)} grupos de features procesadas")


In [None]:
# 4. CREACIÓN DEL DATASET FINAL PARA ENTRENAMIENTO
# Assemble the final feature matrix X and target y from the numeric columns,
# the one-hot encoded categorical columns and the engineered feature frames.
print("\n🔗 COMBINANDO TODAS LAS CARACTERÍSTICAS")
print("="*40)

feature_dfs = []

# Numeric columns: missing values are replaced with 0
if numeric_cols:
    numeric_df = df[numeric_cols].fillna(0)
    feature_dfs.append(numeric_df)
    print(f"✅ {len(numeric_cols)} columnas numéricas agregadas")

# Categorical columns: one-hot encoding
if categorical_cols:
    # Keep only columns with at most 20 distinct values to limit sparsity
    valid_categorical = []
    for col in categorical_cols:
        if col in df.columns:
            unique_vals = df[col].nunique()
            if unique_vals <= 20:
                valid_categorical.append(col)
            else:
                print(f"⚠️  Saltando '{col}': demasiadas categorías ({unique_vals})")
    
    if valid_categorical:
        # columns= forces every selected column to be encoded. Without it
        # get_dummies only encodes object/string/category columns, so a bool
        # (or other non-object) column here would make the prefix list length
        # disagree with the encoded columns and raise a ValueError.
        categorical_df = pd.get_dummies(
            df[valid_categorical],
            columns=valid_categorical,
            prefix=valid_categorical,
            dummy_na=True,
        )
        feature_dfs.append(categorical_df)
        print(f"✅ {len(valid_categorical)} columnas categóricas procesadas -> {categorical_df.shape[1]} features")

# Engineered features built from the list/dict columns
feature_dfs.extend(processed_features)

if feature_dfs:
    X = pd.concat(feature_dfs, axis=1)
    print(f"\n🎯 Dataset final: {X.shape}")
    print(f"📊 Total de features: {X.shape[1]}")
    
    # Target
    y = df['apto'].copy()
    
    # Completeness check
    print(f"\n📋 VERIFICACIÓN DE DATOS:")
    print(f"  Features con valores nulos: {X.isnull().sum().sum()}")
    print(f"  Target con valores nulos: {y.isnull().sum()}")
    
    # Replace any remaining missing feature value with 0
    X = X.fillna(0)
    
    # Variance is computed once and reused for the report and the filter
    feature_variance = X.var()
    
    print(f"\n📈 ESTADÍSTICAS DE FEATURES:")
    print(f"  Rango de valores: [{X.min().min():.2f}, {X.max().max():.2f}]")
    print(f"  Features con varianza cero: {(feature_variance == 0).sum()}")
    
    # Drop constant features
    non_zero_var_cols = X.columns[feature_variance != 0]
    if len(non_zero_var_cols) < len(X.columns):
        removed_features = len(X.columns) - len(non_zero_var_cols)
        X = X[non_zero_var_cols]
        print(f"🗑️  Eliminadas {removed_features} features con varianza cero")
    
    print(f"\n✅ Dataset final preparado: {X.shape}")
    
else:
    print("❌ ERROR: No se pudieron crear features")
    X, y = None, None

In [None]:
# 5. DIVISIÓN TRAIN/VALIDATION/TEST Y BALANCEO
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

print("\n📊 DIVISIÓN Y BALANCEO DEL DATASET")
print("="*40)


def _class_label(class_value):
    """Display name used for a binary target value."""
    if class_value == 1:
        return "Aptos"
    return "No aptos"


if X is None or y is None:
    print("❌ ERROR: No hay datos para dividir")
else:
    # Rows whose target is -1 ("manual review") are left out of training
    print("🎯 Estrategia para valores de revisión manual (-1):")
    revision_count = (y == -1).sum()

    if revision_count <= 0:
        X_clean, y_clean = X, y
        print("  ✅ No hay casos de revisión manual")
    else:
        print(f"  Encontrados {revision_count} casos de revisión manual")
        print("  Opciones: 1) Excluir, 2) Tratar como clase separada, 3) Reasignar")

        mask_no_revision = y != -1
        X_clean, y_clean = X[mask_no_revision], y[mask_no_revision]

        print(f"  ✅ Excluyendo casos de revisión manual")
        print(f"  📊 Dataset limpio: {X_clean.shape[0]} muestras")

    # First split: hold out 20% as the test set
    X_temp, X_test, y_temp, y_test = train_test_split(
        X_clean, y_clean, test_size=0.2, random_state=42, stratify=y_clean
    )

    # Second split: 25% of the remainder becomes validation (0.25 * 0.8 = 0.2)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
    )

    print(f"\n📂 DIVISIÓN DEL DATASET:")
    _n_clean = len(X_clean)
    for _split_name, _split_X in (
        ("Entrenamiento", X_train),
        ("Validación", X_val),
        ("Test", X_test),
    ):
        _n_rows = _split_X.shape[0]
        print(f"  {_split_name}: {_n_rows} muestras ({_n_rows/_n_clean*100:.1f}%)")

    # Class distribution inside each split
    for _split_name, _split_y in (
        ("Entrenamiento", y_train),
        ("Validación", y_val),
        ("Test", y_test),
    ):
        _split_total = len(_split_y)
        print(f"\n{_split_name}:")
        for _value, _n in sorted(Counter(_split_y).items()):
            print(f"  {_class_label(_value)}: {_n} ({_n/_split_total*100:.1f}%)")

    # Decide whether the training split needs rebalancing
    train_counts = Counter(y_train)
    if len(train_counts) <= 1:
        print("  ⚠️  Solo una clase presente - revisar datos")
        X_train_balanced, y_train_balanced = X_train, y_train
    else:
        minority_class = min(train_counts.values())
        majority_class = max(train_counts.values())
        imbalance_ratio = minority_class / majority_class

        print(f"\n⚖️  ANÁLISIS DE BALANCE:")
        print(f"  Ratio de balance: {imbalance_ratio:.2f}")

        if not imbalance_ratio < 0.5:
            print("  ✅ Dataset relativamente balanceado - no se requiere balanceo")
            X_train_balanced, y_train_balanced = X_train, y_train
        else:
            print("  🔄 Aplicando técnicas de balanceo...")

            try:
                # Oversample the minority class with SMOTE
                smote = SMOTE(random_state=42)
                X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

                balanced_counts = Counter(y_train_balanced)
                print(f"  ✅ SMOTE aplicado:")
                for _value, _n in sorted(balanced_counts.items()):
                    print(f"    {_class_label(_value)}: {_n}")

                print(f"  📈 Dataset balanceado: {X_train_balanced.shape}")

            except Exception as e:
                # Best effort: fall back to the unbalanced training split
                print(f"  ⚠️  Error aplicando SMOTE: {e}")
                print("  📊 Usando dataset original")
                X_train_balanced, y_train_balanced = X_train, y_train

    print(f"\n✅ Datos preparados para entrenamiento")


In [None]:
# 6. ENTRENAMIENTO DE MÚLTIPLES MODELOS
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
import joblib
import time

# Train several classifiers on the (possibly rebalanced) training split and
# collect their validation metrics in `model_results`.
print("\n🤖 ENTRENAMIENTO DE MODELOS")
print("="*40)

# Candidate models
models = {
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        random_state=42
    ),
    'Logistic Regression': LogisticRegression(
        random_state=42,
        max_iter=1000,
        class_weight='balanced'
    ),
    'SVM': SVC(
        kernel='rbf',
        probability=True,
        random_state=42,
        class_weight='balanced'
    )
}

# The scaler is fitted on the training split only and then applied to the
# validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

trained_models = {}
model_results = {}

for model_name, model in models.items():
    print(f"\n🔄 Entrenando {model_name}...")
    start_time = time.time()
    
    try:
        # Logistic Regression and SVM are trained on the scaled features;
        # the other models use the raw features.
        if model_name in ['Logistic Regression', 'SVM']:
            fit_features, val_features = X_train_scaled, X_val_scaled
        else:
            fit_features, val_features = X_train_balanced, X_val
        
        model.fit(fit_features, y_train_balanced)
        y_val_pred = model.predict(val_features)
        y_val_proba = model.predict_proba(val_features)[:, 1]
        
        # Elapsed time covers fitting plus the validation predictions
        training_time = time.time() - start_time
        
        # ROC-AUC cannot be computed when the validation labels contain a
        # single class; only that failure is tolerated here.
        try:
            auc_score = roc_auc_score(y_val, y_val_proba)
        except ValueError as auc_error:
            print(f"  ⚠️  AUC no calculable: {auc_error}")
            auc_score = 0.0
        
        accuracy = (y_val_pred == y_val).mean()
        
        # Per-class precision / recall / F1; a class missing from the report
        # contributes zeros.
        report = classification_report(y_val, y_val_pred, output_dict=True)
        class_0 = report.get('0', {})
        class_1 = report.get('1', {})
        
        trained_models[model_name] = model
        model_results[model_name] = {
            'accuracy': accuracy,
            'auc': auc_score,
            'training_time': training_time,
            'precision_0': class_0.get('precision', 0),
            'recall_0': class_0.get('recall', 0),
            'f1_0': class_0.get('f1-score', 0),
            'precision_1': class_1.get('precision', 0),
            'recall_1': class_1.get('recall', 0),
            'f1_1': class_1.get('f1-score', 0),
            'macro_f1': report['macro avg']['f1-score'],
            'predictions': y_val_pred,
            'probabilities': y_val_proba
        }
        
        print(f"  ✅ Completado en {training_time:.2f}s")
        print(f"  📊 Accuracy: {accuracy:.3f}")
        print(f"  📈 AUC: {auc_score:.3f}")
        print(f"  🎯 F1-Score (macro): {report['macro avg']['f1-score']:.3f}")
        
    except Exception as e:
        # One failing model must not stop the others; it is recorded as None
        print(f"  ❌ Error entrenando {model_name}: {e}")
        model_results[model_name] = None

print(f"\n✅ Entrenamiento completado")
print(f"📊 {len([r for r in model_results.values() if r is not None])} modelos entrenados exitosamente")


In [None]:
# 8. INTERPRETABILIDAD Y ANÁLISIS DE CARACTERÍSTICAS
print("\n🔍 ANÁLISIS DE IMPORTANCIA DE CARACTERÍSTICAS")
print("="*50)

# Inspect the selected model: feature ranking, then misclassified validation rows
if len(results_df) > 0:
    best_model = trained_models[best_model_name]

    if hasattr(best_model, 'feature_importances_'):
        # Models exposing feature_importances_: rank features by that score
        _importance_table = pd.DataFrame({
            'feature': X.columns,
            'importance': best_model.feature_importances_
        })
        feature_importance = _importance_table.sort_values('importance', ascending=False)

        print(f"\n🌳 IMPORTANCIA DE CARACTERÍSTICAS ({best_model_name}):")
        print("Top 15 características más importantes:")
        print(feature_importance.head(15).to_string(index=False))

        # Horizontal bar chart of the 15 highest-ranked features
        try:
            plt.figure(figsize=(10, 8))
            top_features = feature_importance.head(15)
            _positions = range(len(top_features))
            plt.barh(_positions, top_features['importance'])
            plt.yticks(_positions, top_features['feature'])
            plt.xlabel('Importancia')
            plt.title(f'Top 15 Características Más Importantes\n({best_model_name})')
            plt.gca().invert_yaxis()
            plt.tight_layout()
            _plot_path = MODELS_PATH / "feature_importance.png"
            plt.savefig(_plot_path, dpi=150, bbox_inches='tight')
            plt.show()
            print(f"📊 Gráfico guardado en: {_plot_path}")
        except Exception as e:
            print(f"⚠️  Error creando visualización: {e}")

    elif hasattr(best_model, 'coef_'):
        # Models exposing coef_: rank features by absolute coefficient
        feature_coef = pd.DataFrame({
            'feature': X.columns,
            'coefficient': best_model.coef_[0]
        })
        feature_coef['abs_coefficient'] = np.abs(feature_coef['coefficient'])
        feature_coef = feature_coef.sort_values('abs_coefficient', ascending=False)

        print(f"\n📊 COEFICIENTES DEL MODELO ({best_model_name}):")
        print("Top 15 características con mayor impacto:")
        display_coef = feature_coef[['feature', 'coefficient']].head(15)
        print(display_coef.to_string(index=False))

    # Validation predictions, using scaled inputs for the models trained on them
    _val_inputs = X_val_scaled if best_model_name in ['Logistic Regression', 'SVM'] else X_val
    y_val_pred_best = best_model.predict(_val_inputs)
    y_val_proba_best = best_model.predict_proba(_val_inputs)[:, 1]

    # Rows where the prediction disagrees with the true label
    _wrong = y_val != y_val_pred_best
    misclassified = X_val[_wrong]
    misclassified_true = y_val[_wrong]
    misclassified_pred = y_val_pred_best[_wrong]
    misclassified_proba = y_val_proba_best[_wrong]

    print(f"\n❌ ANÁLISIS DE CASOS MAL CLASIFICADOS:")
    print(f"Total de casos mal clasificados: {len(misclassified)}")

    if len(misclassified) > 0:
        # False positives: true label 0, predicted 1
        false_positives = sum((misclassified_true == 0) & (misclassified_pred == 1))
        # False negatives: true label 1, predicted 0
        false_negatives = sum((misclassified_true == 1) & (misclassified_pred == 0))

        print(f"  Falsos positivos: {false_positives}")
        print(f"  Falsos negativos: {false_negatives}")

        # Up to five examples with their predicted probability
        print(f"\nEjemplos de casos mal clasificados (con probabilidades):")
        for _true_value, _pred_value, proba in zip(
            misclassified_true.iloc[:5], misclassified_pred[:5], misclassified_proba[:5]
        ):
            true_label = "Apto" if _true_value == 1 else "No apto"
            pred_label = "Apto" if _pred_value == 1 else "No apto"
            print(f"  Real: {true_label}, Predicho: {pred_label}, Probabilidad: {proba:.3f}")

# Final summary
print(f"\n🎉 RESUMEN DEL ENTRENAMIENTO")
print("="*40)
if not len(results_df) > 0:
    print("❌ No se pudieron entrenar modelos exitosamente")
else:
    print(f"✅ Mejor modelo: {best_model_name}")
    print(f"📊 Accuracy en test: {test_accuracy:.3f}")
    print(f"📈 AUC en test: {test_auc:.3f}")
    print(f"🎯 F1-Score macro: {best_score:.3f}")
    print(f"💾 Modelo guardado en: models/")
    print(f"🚀 Listo para despliegue en Azure ML!")

print(f"\n📋 PRÓXIMOS PASOS:")
for _step in (
    "1. Revisar el notebook 03_evaluation.ipynb para análisis detallado",
    "2. Usar el notebook 04_rai_dashboard.ipynb para análisis de responsabilidad",
    "3. Desplegar el modelo en Azure Machine Learning",
    "4. Configurar el pipeline de inferencia",
):
    print(_step)


In [None]:
# 7. COMPARACIÓN Y SELECCIÓN DEL MEJOR MODELO
import matplotlib.pyplot as plt
import seaborn as sns

# Compare the trained models on their validation metrics, evaluate the best
# one on the test split and persist it together with the scaler and metadata.
print("\n📊 COMPARACIÓN DE MODELOS")
print("="*40)

# One row per model; models that failed (stored as None) are dropped
results_df = pd.DataFrame(model_results).T
results_df = results_df.dropna()

if len(results_df) > 0:
    print("\n📈 TABLA DE RESULTADOS:")
    # Metric column -> display header. Renaming by name (rather than assigning
    # a fixed-length header list) stays valid if a metric column is missing.
    display_names = {
        'accuracy': 'Accuracy',
        'auc': 'AUC',
        'macro_f1': 'F1-Macro',
        'precision_1': 'Precision(Aptos)',
        'recall_1': 'Recall(Aptos)',
        'training_time': 'Tiempo(s)',
    }
    display_cols = list(display_names)
    available_cols = [col for col in display_cols if col in results_df.columns]
    
    if available_cols:
        # results_df is object-typed because some cells hold arrays; cast the
        # scalar metrics to float so that round() actually rounds them.
        results_display = (
            results_df[available_cols]
            .astype(float)
            .round(3)
            .rename(columns=display_names)
        )
        print(results_display.to_string())
    
    # Select the model with the highest macro F1. The cast is required:
    # idxmax() is not supported on an object-typed column.
    macro_f1_scores = results_df['macro_f1'].astype(float)
    best_model_name = macro_f1_scores.idxmax()
    best_score = float(macro_f1_scores.loc[best_model_name])
    
    print(f"\n🏆 MEJOR MODELO: {best_model_name}")
    print(f"🎯 F1-Score (macro): {best_score:.3f}")
    print(f"📊 Accuracy: {results_df.loc[best_model_name, 'accuracy']:.3f}")
    print(f"📈 AUC: {results_df.loc[best_model_name, 'auc']:.3f}")
    
    # Detailed evaluation of the best model on the held-out test split
    best_model = trained_models[best_model_name]
    
    # Models trained on scaled features must be evaluated on scaled features
    if best_model_name in ['Logistic Regression', 'SVM']:
        y_test_pred = best_model.predict(X_test_scaled)
        y_test_proba = best_model.predict_proba(X_test_scaled)[:, 1]
    else:
        y_test_pred = best_model.predict(X_test)
        y_test_proba = best_model.predict_proba(X_test)[:, 1]
    
    print(f"\n🧪 EVALUACIÓN EN CONJUNTO DE TEST:")
    test_accuracy = (y_test_pred == y_test).mean()
    test_auc = roc_auc_score(y_test, y_test_proba)
    test_report = classification_report(y_test, y_test_pred)
    
    print(f"📊 Accuracy en test: {test_accuracy:.3f}")
    print(f"📈 AUC en test: {test_auc:.3f}")
    print(f"\n📋 Reporte detallado:")
    print(test_report)
    
    # Confusion matrix (rows: true class, columns: predicted class)
    cm = confusion_matrix(y_test, y_test_pred)
    print(f"\n🔢 Matriz de Confusión:")
    print(f"               Predicho")
    print(f"Real    No Apto  Apto")
    print(f"No Apto   {cm[0,0]:3d}    {cm[0,1]:3d}")
    print(f"Apto      {cm[1,0]:3d}    {cm[1,1]:3d}")
    
    # Persist the best model and the fitted scaler
    model_filename = MODELS_PATH / f"best_model_{best_model_name.lower().replace(' ', '_')}.joblib"
    scaler_filename = MODELS_PATH / "scaler.joblib"
    
    joblib.dump(best_model, model_filename)
    joblib.dump(scaler, scaler_filename)
    
    # Model metadata. Values are converted to built-in Python types because
    # json cannot serialise NumPy integers such as the class labels.
    # NOTE: results_df is built from model_results, so the 'train_*' entries
    # hold whatever split those metrics were computed on; the key names are
    # kept unchanged for existing readers of this file.
    model_metadata = {
        'model_name': best_model_name,
        'model_type': str(type(best_model).__name__),
        'features': [str(col) for col in X.columns],
        'feature_count': len(X.columns),
        'train_accuracy': float(results_df.loc[best_model_name, 'accuracy']),
        'train_auc': float(results_df.loc[best_model_name, 'auc']),
        'train_f1_macro': best_score,
        'test_accuracy': float(test_accuracy),
        'test_auc': float(test_auc),
        'training_samples': len(X_train_balanced),
        'test_samples': len(X_test),
        'classes': np.unique(y_clean).tolist(),
        'created_at': pd.Timestamp.now().isoformat()
    }
    
    import json
    metadata_filename = MODELS_PATH / "model_metadata.json"
    with open(metadata_filename, 'w') as f:
        json.dump(model_metadata, f, indent=2)
    
    print(f"\n💾 MODELO GUARDADO:")
    print(f"📄 Modelo: {model_filename}")
    print(f"⚖️  Scaler: {scaler_filename}")
    print(f"📋 Metadatos: {metadata_filename}")
    
else:
    print("❌ ERROR: No hay modelos válidos para comparar")
