# 🎯 Model Training: Cluster-Stratified Random Forest

**Objetivo:** Entrenar, para enfermedad de Crohn y para colitis ulcerosa (CU), un modelo Random Forest global y un modelo específico por cluster de usuarios

**Input:**
- `../data/processed/crohn/ml_dataset_enhanced.csv`
- `../data/processed/cu/ml_dataset_enhanced.csv`
- `../data/processed/crohn/user_clusters.csv`
- `../data/processed/cu/user_clusters.csv`

**Output:** Modelos entrenados (pickle) y metadata (JSON):
- `../models/crohn/` (global + por cluster + `cluster_models_metadata.json`)
- `../models/cu/` (global + por cluster + `cluster_models_metadata.json`)

**Autor:** Asier Ortiz García  
**Fecha:** Noviembre 2025

## 📦 Imports y Configuración

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import pickle
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE

# Plot defaults applied to the figures drawn in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Make sure the per-disease model output folders exist before training.
Path('..', 'models', 'crohn').mkdir(parents=True, exist_ok=True)
Path('..', 'models', 'cu').mkdir(parents=True, exist_ok=True)

# Notebook banner: rule, title, rule (one item per line).
print("=" * 80, "MODEL TRAINING: Cluster-Stratified Random Forest", "=" * 80, sep="\n")

## 🔧 Funciones de Entrenamiento

In [None]:
def _fill_numeric_nans(features, fallback_medians=None):
    """Return a copy of *features* with NaNs in numeric columns set to the column median.

    Args:
        features: DataFrame of model features.
        fallback_medians: Optional Series indexed by column name, used for
            columns whose own median is NaN (i.e. the column is entirely
            missing in *features*).

    Returns:
        A new DataFrame; the input is never modified.
    """
    filled = features.copy()
    # 'number' excludes the boolean dummy columns produced by get_dummies.
    numeric_cols = filled.select_dtypes(include='number').columns
    if len(numeric_cols) == 0:
        return filled
    medians = filled[numeric_cols].median()
    if fallback_medians is not None:
        medians = medians.fillna(fallback_medians)
    # Assign the result back instead of `fillna(inplace=True)` on a column
    # selection: that chained form is deprecated and does nothing under
    # pandas copy-on-write.
    filled[numeric_cols] = filled[numeric_cols].fillna(medians)
    return filled


def _cluster_label(value):
    """Return an integral cluster id as a plain int so file names read `cluster_0`, not `cluster_0.0`."""
    if isinstance(value, (int, float, np.integer, np.floating)) and float(value).is_integer():
        return int(value)
    return value


def train_cluster_stratified_models(ibd_type='crohn'):
    """
    Train a global model plus one model per user cluster for one IBD type.

    Reads `ml_dataset_enhanced.csv` and `user_clusters.csv` from
    `../data/processed/{ibd_type}/`, trains Random Forest classifiers on
    `risk_level`, and writes the pickled models and a metadata JSON to
    `../models/{ibd_type}/`.

    Clusters with fewer than 50 rows reuse the global model. No separate file
    is written for them, and the metadata points at the global model file.

    Args:
        ibd_type: Sub-directory name of the dataset to train on
            (the notebook uses 'crohn' and 'cu').

    Returns:
        Tuple `(rf_global, cluster_models, metadata)`: the fitted global
        classifier, a dict mapping cluster id to its classifier, and the
        metadata dict that was written to disk.
    """
    print(f"\n{'='*80}")
    print(f"ENTRENANDO MODELOS: {ibd_type.upper()}")
    print(f"{'='*80}\n")

    # Load the dataset and attach each user's cluster.
    df = pd.read_csv(f'../data/processed/{ibd_type}/ml_dataset_enhanced.csv')
    clusters_df = pd.read_csv(f'../data/processed/{ibd_type}/user_clusters.csv')

    df = df.merge(clusters_df[['user_id', 'cluster']], on='user_id', how='left')

    print(f"✓ Dataset cargado: {len(df):,} registros")
    print(f"  Usuarios: {df['user_id'].nunique():,}")
    print(f"  Distribución de clusters: {df['cluster'].value_counts().to_dict()}")
    print(f"  Distribución de risk: {df['risk_level'].value_counts().to_dict()}")

    # The left merge leaves NaN for users missing from user_clusters.csv.
    # Those rows feed the global model only, so say so explicitly.
    n_unassigned = int(df['cluster'].isna().sum())
    if n_unassigned:
        print(f"  ⚠️  {n_unassigned:,} registros sin cluster (solo modelo global)")

    # Columns that are identifiers, targets or otherwise not model inputs.
    exclude_cols = ['user_id', 'checkin_date', 'risk_level', 'severity_score', 'cluster',
                    'sex', 'first_checkin', 'days_since_first_checkin', 'is_flare_day',
                    'cumulative_flare_days', 'is_bad_day', 'risk_numeric']

    feature_cols = [col for col in df.columns if col not in exclude_cols]
    y = df['risk_level'].copy()

    # Encode categoricals ONCE on the full dataset. Re-encoding per cluster
    # with drop_first=True can drop a different baseline category when a
    # cluster lacks one, which silently changes what the dummy columns mean.
    X_encoded = df[feature_cols].copy()
    if 'gender' in X_encoded.columns:
        X_encoded = pd.get_dummies(X_encoded, columns=['gender'], drop_first=True)

    # Global medians also serve as the fallback for columns that are entirely
    # missing inside a single cluster.
    global_medians = X_encoded.select_dtypes(include='number').median()
    X = _fill_numeric_nans(X_encoded)

    print(f"\nFeatures: {len(X.columns)}")

    # 1. Train the global model.
    print(f"\n1️⃣ Entrenando modelo GLOBAL...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Oversample the minority classes on the training split only.
    print("  Aplicando SMOTE...")
    smote = SMOTE(sampling_strategy='not majority', random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    print(f"  Antes SMOTE: {len(X_train):,} | Después SMOTE: {len(X_train_res):,}")

    rf_global = RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=10, random_state=42, n_jobs=-1)
    rf_global.fit(X_train_res, y_train_res)

    # Evaluate on the untouched (non-resampled) test split.
    y_pred = rf_global.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"\n✅ Modelo global entrenado - Accuracy: {acc:.3f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    global_model_file = 'rf_severity_classifier_global.pkl'
    model_path = f'../models/{ibd_type}/{global_model_file}'
    with open(model_path, 'wb') as f:
        pickle.dump(rf_global, f)
    print(f"💾 Guardado: {model_path}")

    # 2. Train one model per cluster.
    # Iterate over the ids actually present rather than range(n): ids are not
    # guaranteed to be the contiguous integers 0..n-1.
    cluster_ids = [_cluster_label(value) for value in sorted(df['cluster'].dropna().unique())]
    n_clusters = len(cluster_ids)
    cluster_models = {}
    cluster_model_files = {}

    for cluster_id in cluster_ids:
        print(f"\n2️⃣ Entrenando modelo CLUSTER {cluster_id}...")
        in_cluster = df['cluster'] == cluster_id
        n_rows = int(in_cluster.sum())

        if n_rows < 50:
            print(f"  ⚠️  Muy pocos datos ({n_rows} registros), usando modelo global")
            cluster_models[cluster_id] = rf_global
            # No dedicated file exists for this cluster; reference the global one.
            cluster_model_files[f'cluster_{cluster_id}'] = global_model_file
            continue

        # Same encoded columns as the global model; medians come from the
        # cluster itself, with the global median as a fallback.
        X_c = _fill_numeric_nans(X_encoded[in_cluster], fallback_medians=global_medians)
        y_c = y[in_cluster]

        # Stratify like the global split whenever every class has the two
        # members stratification requires.
        stratify_c = y_c if y_c.value_counts().min() >= 2 else None
        X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
            X_c, y_c, test_size=0.2, random_state=42, stratify=stratify_c)

        # SMOTE only if there are enough samples; it raises ValueError when a
        # class is too small for its neighbour search or only one class exists.
        if len(X_train_c) > 30:
            try:
                smote_c = SMOTE(sampling_strategy='not majority', random_state=42)
                X_train_c, y_train_c = smote_c.fit_resample(X_train_c, y_train_c)
            except ValueError as err:
                print(f"  ⚠️  SMOTE failed, using original data ({err})")

        rf_cluster = RandomForestClassifier(n_estimators=150, max_depth=12, random_state=42, n_jobs=-1)
        rf_cluster.fit(X_train_c, y_train_c)

        y_pred_c = rf_cluster.predict(X_test_c)
        acc_c = accuracy_score(y_test_c, y_pred_c)
        print(f"  ✅ Cluster {cluster_id} - Accuracy: {acc_c:.3f}")

        cluster_models[cluster_id] = rf_cluster

        cluster_file = f'rf_severity_classifier_cluster_{cluster_id}.pkl'
        model_path = f'../models/{ibd_type}/{cluster_file}'
        with open(model_path, 'wb') as f:
            pickle.dump(rf_cluster, f)
        print(f"  💾 Guardado: {model_path}")
        cluster_model_files[f'cluster_{cluster_id}'] = cluster_file

    # Metadata describing what was trained and where each model lives.
    metadata = {
        'ibd_type': ibd_type,
        'n_clusters': n_clusters,
        'n_samples': len(df),
        'n_features': len(X.columns),
        'features': list(X.columns),
        'global_accuracy': float(acc),
        'cluster_models': cluster_model_files
    }

    with open(f'../models/{ibd_type}/cluster_models_metadata.json', 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    print(f"\n✅ {ibd_type.upper()} completado!")
    return rf_global, cluster_models, metadata

# Visible confirmation that this cell ran and the training function is defined.
print("✓ Función de entrenamiento definida")

## 🔄 Entrenar Crohn

In [None]:
# Train the global and per-cluster models for the Crohn dataset.
rf_crohn_global, crohn_cluster_models, crohn_metadata = train_cluster_stratified_models(ibd_type='crohn')

## 🔄 Entrenar Colitis Ulcerosa

In [None]:
# Train the global and per-cluster models for the ulcerative colitis (CU) dataset.
rf_cu_global, cu_cluster_models, cu_metadata = train_cluster_stratified_models(ibd_type='cu')

## ✅ Resumen Final

In [None]:
# Final report: one summary section per disease, then the generated artefacts.
rule = "=" * 80

print(f"\n{rule}")
print("RESUMEN FINAL")
print(rule)

# Same three lines for each disease, read from the metadata returned by training.
for label, meta in (("CROHN", crohn_metadata), ("CU", cu_metadata)):
    print(f"\n📊 {label}:")
    print(f"  Modelos entrenados: 1 global + {meta['n_clusters']} cluster-specific")
    print(f"  Global accuracy: {meta['global_accuracy']:.3f}")
    print(f"  Features: {meta['n_features']}")

print("\n📂 Archivos generados:")
for folder, meta in (("crohn", crohn_metadata), ("cu", cu_metadata)):
    print(f"  - ../models/{folder}/ ({1 + meta['n_clusters']} modelos)")
print("  - Metadata JSON files")

print(f"\n{rule}")
print("✅ MODEL TRAINING COMPLETADO")
print(rule)
print("\nModelos listos para predicción via API!")