# 🩺 Análisis Exploratorio V2: Separación Crohn vs Colitis Ulcerosa

**Objetivo:** Separar usuarios por tipo de EII y crear clusters específicos para cada enfermedad

**Mejoras vs V1:**
- Separa Crohn de Colitis Ulcerosa desde el inicio
- Clustering independiente para cada tipo
- Genera datasets separados para entrenamiento

**Autor:** Asier Ortiz García  
**Fecha:** Noviembre 2025

## 📦 Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence every warning category for the rest of the session. This keeps the
# notebook output clean, but it also hides pandas / scikit-learn warnings
# (e.g. chained-assignment or convergence warnings) raised by later cells.
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = (14, 6)
%matplotlib inline

# Create directories
Path('../data/processed/crohn').mkdir(parents=True, exist_ok=True)
Path('../data/processed/cu').mkdir(parents=True, exist_ok=True)
Path('../docs/figures').mkdir(parents=True, exist_ok=True)

print("=" * 80)
print("ANÁLISIS EXPLORATORIO V2: Crohn vs Colitis Ulcerosa")
print("=" * 80)

## 1️⃣ Carga de Datos

In [None]:
# Read the full raw export into memory.
print("📊 Cargando dataset completo...\n")
raw_export_path = '../data/raw/export.csv'
df = pd.read_csv(raw_export_path, low_memory=False)
n_records = len(df)
print(f"✓ Cargado: {n_records:,} registros")

# Parse check-in dates; values that cannot be parsed become NaT instead of raising.
df['checkin_date'] = pd.to_datetime(df['checkin_date'], errors='coerce')

# Quick sanity summary: distinct users and the covered date range.
n_users = df['user_id'].nunique()
first_checkin = df['checkin_date'].min()
last_checkin = df['checkin_date'].max()
print(f"\nUsuarios totales: {n_users:,}")
print(f"Rango de fechas: {first_checkin} → {last_checkin}")

## 2️⃣ Identificar y Separar Crohn vs CU

In [None]:
# Only 'Condition' rows are inspected for a diagnosis name.
conditions_df = df[df['trackable_type'] == 'Condition']

# Case-insensitive substrings that identify each disease in `trackable_name`.
CROHN_KEYWORDS = ['crohn']
UC_KEYWORDS = ['ulcerative colitis', 'ulcerative', 'proctitis', 'pancolitis']

# Row-level masks: does this condition entry mention the disease?
crohn_mask = conditions_df['trackable_name'].str.contains('|'.join(CROHN_KEYWORDS), case=False, na=False)
uc_mask = conditions_df['trackable_name'].str.contains('|'.join(UC_KEYWORDS), case=False, na=False)

crohn_user_ids = conditions_df.loc[crohn_mask, 'user_id'].unique()
uc_user_ids = conditions_df.loc[uc_mask, 'user_id'].unique()

# Users who report both diseases are assigned to the one they report more often.
overlap = set(crohn_user_ids) & set(uc_user_ids)
print(f"\n⚠️  Usuarios con ambas condiciones: {len(overlap)}")

# Count matching rows per user in one pass, reusing the very same masks that
# defined group membership. Counting with the full keyword lists matters: a
# user matched as UC through 'proctitis' or 'pancolitis' must have those rows
# counted as UC reports when the overlap is resolved.
crohn_counts = crohn_mask.groupby(conditions_df['user_id']).sum()
uc_counts = uc_mask.groupby(conditions_df['user_id']).sum()

assigned_to_crohn = []
assigned_to_uc = []
for user_id in overlap:
    # `.get(..., 0)` covers ids that the groupby drops (e.g. a missing user_id).
    if crohn_counts.get(user_id, 0) > uc_counts.get(user_id, 0):
        assigned_to_crohn.append(user_id)
    else:
        # Ties are resolved in favour of UC.
        assigned_to_uc.append(user_id)

# Remove each resolved user from the group they were NOT assigned to.
if assigned_to_crohn:
    uc_user_ids = np.setdiff1d(uc_user_ids, assigned_to_crohn)
if assigned_to_uc:
    crohn_user_ids = np.setdiff1d(crohn_user_ids, assigned_to_uc)

print(f"\n✓ Usuarios con Crohn (exclusivo): {len(crohn_user_ids):,}")
print(f"✓ Usuarios con CU (exclusivo): {len(uc_user_ids):,}")
print(f"Total: {len(crohn_user_ids) + len(uc_user_ids):,}")

## 3️⃣ Filtrar Datasets por Tipo

In [None]:
# Keep every row (all trackable types) that belongs to each exclusive user group.
is_crohn_user = df['user_id'].isin(crohn_user_ids)
is_cu_user = df['user_id'].isin(uc_user_ids)
df_crohn = df.loc[is_crohn_user].copy()
df_cu = df.loc[is_cu_user].copy()

# Same summary for both subsets: row count, user count and trackable-type mix.
for label, subset in (('Crohn', df_crohn), ('CU', df_cu)):
    print(f"\n📊 Dataset {label}:")
    print(f"  Registros: {len(subset):,}")
    print(f"  Usuarios: {subset['user_id'].nunique():,}")
    print("  Tipos de trackables:")
    print(subset['trackable_type'].value_counts())

## 4️⃣ Clustering de Fenotipos - CROHN

In [None]:
def create_symptom_clusters(df_ibd, ibd_type='crohn', n_clusters=3, min_records=10):
    """
    Build per-user symptom profiles for one IBD type and cluster them with KMeans.

    Symptom rows are mapped onto a fixed set of categories by case-insensitive
    substring match, averaged per user and category (categories a user never
    reported are filled with 0), standardised and clustered. Progress and
    summary statistics are printed along the way.

    Args:
        df_ibd: DataFrame already filtered to a single IBD type. It must
            contain the columns 'user_id', 'trackable_type',
            'trackable_name' and 'trackable_value'.
        ibd_type: Label used in the printed banner and error message
            ('crohn' or 'cu').
        n_clusters: Number of KMeans clusters.
        min_records: Minimum number of rows (of any trackable type) a user
            must have in ``df_ibd`` to be clustered.

    Returns:
        A 4-tuple ``(clustering_features, kmeans, scaler, cluster_profiles)``:
            clustering_features: One row per retained user with the mean
                value of each symptom category plus a 'cluster' column.
            kmeans: Fitted KMeans model.
            scaler: Fitted StandardScaler (fitted on the symptom columns only).
            cluster_profiles: Per-cluster mean of the unscaled symptom columns.

    Raises:
        ValueError: If fewer than ``n_clusters`` users have enough data.
    """
    print(f"\n{'='*80}")
    print(f"CLUSTERING DE FENOTIPOS - {ibd_type.upper()}")
    print(f"{'='*80}\n")

    # Only symptom rows feed the phenotype profiles.
    symptoms_df = df_ibd[df_ibd['trackable_type'] == 'Symptom'].copy()

    # Category -> substrings that map a free-text symptom name onto it.
    SYMPTOM_MAPPING = {
        'abdominal_pain': ['abdominal pain', 'stomach pain', 'belly pain', 'cramping', 'abdominal cramps'],
        'diarrhea': ['diarrhea', 'loose stools', 'watery stools'],
        'fatigue': ['fatigue', 'tired', 'exhaustion', 'tiredness'],
        'nausea': ['nausea', 'nauseous', 'feeling sick'],
        'blood_in_stool': ['blood in stool', 'bloody stool', 'rectal bleeding'],
        'fever': ['fever', 'high temperature']
    }

    def normalize_symptom(symptom_name):
        """Return the first category whose variants occur in the name, else None."""
        if pd.isna(symptom_name):
            return None
        symptom_lower = str(symptom_name).lower()
        for category, variants in SYMPTOM_MAPPING.items():
            if any(variant in symptom_lower for variant in variants):
                return category
        return None

    symptoms_df['symptom_category'] = symptoms_df['trackable_name'].apply(normalize_symptom)
    # Copy after filtering so the column assignment below targets a real frame,
    # not a slice of the unfiltered one.
    symptoms_df = symptoms_df[symptoms_df['symptom_category'].notna()].copy()
    symptoms_df['value_numeric'] = pd.to_numeric(symptoms_df['trackable_value'], errors='coerce')

    print(f"Síntomas mapeados: {len(symptoms_df):,} registros")
    print(f"\nDistribución:")
    print(symptoms_df['symptom_category'].value_counts())

    # Mean reported value per (user, category) ...
    user_profiles = symptoms_df.groupby(['user_id', 'symptom_category']).agg({
        'value_numeric': 'mean'
    }).reset_index()

    # ... reshaped to one row per user; unreported categories become 0.
    user_profiles = user_profiles.pivot_table(
        index='user_id',
        columns='symptom_category',
        values='value_numeric',
        fill_value=0
    )

    # Keep users with enough data. Note this counts rows of any trackable
    # type in df_ibd, not distinct check-in dates.
    records_per_user = df_ibd.groupby('user_id').size()
    valid_users = records_per_user[records_per_user >= min_records].index
    # Explicit copy: a 'cluster' column is added below, and writing to a
    # .loc slice is a chained assignment.
    clustering_features = user_profiles.loc[user_profiles.index.isin(valid_users)].copy()

    print(f"\nUsuarios válidos: {len(clustering_features)}")
    print(f"Features: {clustering_features.columns.tolist()}")

    # KMeans cannot be fitted with fewer samples than clusters; fail early
    # with a message that names the dataset instead of a scikit-learn error.
    n_valid = len(clustering_features)
    if n_valid < n_clusters:
        raise ValueError(
            f"Usuarios insuficientes para {ibd_type}: {n_valid} usuarios válidos, "
            f"se necesitan al menos {n_clusters} para el clustering"
        )

    # Standardize and cluster
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(clustering_features)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=50)
    cluster_labels = kmeans.fit_predict(X_scaled)

    clustering_features['cluster'] = cluster_labels

    silhouette = silhouette_score(X_scaled, cluster_labels)
    print(f"\nSilhouette Score: {silhouette:.3f}")
    print(f"\nDistribución de clusters:")
    for cluster_id in range(n_clusters):
        count = (cluster_labels == cluster_id).sum()
        pct = count / len(cluster_labels) * 100
        print(f"  Cluster {cluster_id}: {count:4d} ({pct:5.1f}%)")

    # Per-cluster means of the unscaled symptom values.
    cluster_profiles = clustering_features.groupby('cluster').mean()
    print(f"\n📊 Perfiles de Clusters:\n")
    print(cluster_profiles.round(2))

    return clustering_features, kmeans, scaler, cluster_profiles

# Fit the phenotype clusters on the Crohn-only dataset (k=3).
crohn_clusters, crohn_kmeans, crohn_scaler, crohn_profiles = create_symptom_clusters(
    df_ibd=df_crohn, ibd_type='crohn', n_clusters=3
)

## 5️⃣ Clustering de Fenotipos - COLITIS ULCEROSA

In [None]:
# Fit the phenotype clusters on the UC-only dataset (k=3 for E1/E2/E3 mapping).
cu_clusters, cu_kmeans, cu_scaler, cu_profiles = create_symptom_clusters(
    df_ibd=df_cu, ibd_type='cu', n_clusters=3
)

## 6️⃣ Visualización Comparativa

In [None]:
# Side-by-side heatmaps of the per-cluster mean symptom values for each disease.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Symptom columns to plot for each disease (anything named 'cluster' is excluded).
symptom_cols = [col for col in crohn_profiles.columns if col != 'cluster']
symptom_cols_cu = [col for col in cu_profiles.columns if col != 'cluster']

# (axis, profiles, columns, colormap, title) for each panel.
panels = (
    (axes[0], crohn_profiles, symptom_cols, 'RdYlGn_r', 'Clusters Crohn (n=3)'),
    (axes[1], cu_profiles, symptom_cols_cu, 'Blues', 'Clusters CU (n=3)'),
)

for ax, profiles, columns, colormap, title in panels:
    # An empty column list leaves the panel blank rather than failing.
    if not columns:
        continue
    sns.heatmap(
        profiles[columns].T,  # symptoms as rows, clusters as columns
        annot=True,
        fmt='.2f',
        cmap=colormap,
        ax=ax,
        cbar_kws={'label': 'Severidad (0-4)'},
        linewidths=1
    )
    ax.set_title(title, fontweight='bold', fontsize=14)
    ax.set_xlabel('Cluster')
    ax.set_ylabel('Síntoma')

plt.tight_layout()
plt.savefig('../docs/figures/clusters_comparison_crohn_vs_cu.png', dpi=300, bbox_inches='tight')
plt.show()

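## 7️⃣ Guardar Resultados

> **Nota:** la celda de código que guarda los resultados se encuentra al final del notebook, después de la celda de la sección 8.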

## 8️⃣ Análisis de Features Adicionales (Medications, Surgery, Smoking)

**Conclusión:** NO incluir estas features (medicación, cirugía, tabaquismo) en el entrenamiento: los datos disponibles son insuficientes (~4-7 % de cobertura).

In [None]:
# The detailed analysis of these extra features is not reproduced in this notebook.
print("Ver análisis completo en commit message")

In [None]:
import pickle

# One entry per disease: (sub-directory, banner, assignments, profiles, model, scaler).
# The CU banner carries a leading newline to separate it from the Crohn listing.
result_sets = (
    ('crohn', "✅ Crohn results saved:", crohn_clusters, crohn_profiles, crohn_kmeans, crohn_scaler),
    ('cu', "\n✅ CU results saved:", cu_clusters, cu_profiles, cu_kmeans, cu_scaler),
)

for subdir, banner, clusters, profiles, model, scaler in result_sets:
    target_dir = f'../data/processed/{subdir}'

    # Per-user cluster assignment (user_id index + 'cluster') and per-cluster means.
    clusters[['cluster']].to_csv(f'{target_dir}/user_clusters.csv')
    profiles.to_csv(f'{target_dir}/cluster_profiles.csv')

    # Fitted estimators, pickled for reuse.
    for filename, estimator in (('kmeans.pkl', model), ('scaler.pkl', scaler)):
        with open(f'{target_dir}/{filename}', 'wb') as f:
            pickle.dump(estimator, f)

    print(banner)
    for filename in ('user_clusters.csv', 'cluster_profiles.csv', 'kmeans.pkl', 'scaler.pkl'):
        print(f"  - data/processed/{subdir}/{filename}")

# Full per-disease datasets (all trackable types), without the row index.
df_crohn.to_csv('../data/processed/crohn_filtered.csv', index=False)
df_cu.to_csv('../data/processed/cu_filtered.csv', index=False)

print("\n✅ Filtered datasets saved:")
for filename in ('crohn_filtered.csv', 'cu_filtered.csv'):
    print(f"  - data/processed/{filename}")

## ✅ Resumen

### Datos Separados:
- **Crohn**: Usuarios y clusters específicos
- **CU**: Usuarios y clusters específicos

### Archivos Generados:
```
data/processed/
├── crohn/
│   ├── user_clusters.csv
│   ├── cluster_profiles.csv
│   ├── kmeans.pkl
│   └── scaler.pkl
├── cu/
│   ├── user_clusters.csv
│   ├── cluster_profiles.csv
│   ├── kmeans.pkl
│   └── scaler.pkl
├── crohn_filtered.csv
└── cu_filtered.csv
```

### Próximos Pasos:
1. Notebook 02: Feature engineering (separado por tipo)
2. Notebooks 04/05: Training models (Crohn y CU)
3. API: Selector de modelo según ibd_type