# 🔬 Predicción de Anemia con Machine Learning

**Dataset:** TACNA_Final_Corregido.csv  
**Modelos:** Logistic Regression, Random Forest, Gradient Boosting  

---

## 📦 Instalación e Importación de Librerías

In [None]:
# Instalar dependencias (si es necesario)
# !pip install pandas numpy matplotlib seaborn scikit-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve,
    ConfusionMatrixDisplay
)
import warnings
# Silence library warnings so the cell output stays readable.
# NOTE(review): this hides every warning category, including deprecation and
# convergence warnings; consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

# Plot styling. Matplotlib 3.6 renamed its bundled seaborn styles to
# 'seaborn-v0_8-*'; older releases only know 'seaborn-whitegrid'. Use the
# first name this installation provides instead of raising OSError, and keep
# Matplotlib's default style when neither is available.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12

print('‚úÖ Librer√≠as importadas correctamente')

## 📂 Carga del Dataset

Sube tu archivo `TACNA_Final_Corregido.csv` o conéctalo desde Google Drive.

In [None]:
# Option 1: upload the file manually through Colab's file helper.
from google.colab import files
uploaded = files.upload()

# `uploaded` is used as a mapping keyed by file name. It may be empty
# (presumably when the upload dialog is cancelled), so fail with a clear
# message instead of an IndexError on the first-key lookup.
if not uploaded:
    raise FileNotFoundError(
        'Carga cancelada: no hay archivos. '
        'Vuelve a ejecutar la celda y selecciona TACNA_Final_Corregido.csv'
    )

# Name of the first uploaded file.
filename = next(iter(uploaded))
print(f'üìÅ Archivo cargado: {filename}')

In [None]:
# Opción 2: Cargar desde Google Drive (descomenta si prefieres esta opción)
# from google.colab import drive
# drive.mount('/content/drive')
# filename = '/content/drive/MyDrive/tu_carpeta/TACNA_Final_Corregido.csv'

In [None]:
# Read the CSV located at `filename` into the working DataFrame.
df = pd.read_csv(filename)

# Banner followed by the table size.
banner = '=' * 60
n_rows, n_cols = df.shape
print(banner)
print('üìä INFORMACI√ìN DEL DATASET')
print(banner)
print(f'\nüìå Dimensiones: {n_rows} filas x {n_cols} columnas')
print('\nüìã Primeras 5 filas:')

# Last expression of the cell, so the notebook renders the first five rows.
df.head()

---
## 1️⃣ Exploración de Datos (EDA)

### 1.1 Información General del Dataset

In [None]:
# Show the dtype pandas assigned to every column, under a one-line heading.
print('üìã Tipos de datos por columna:', df.dtypes, sep='\n')

In [None]:
# Descriptive statistics for the numeric columns of interest.
numeric_cols = [
    'EdadMeses', 'Peso', 'Talla', 'Hemoglobina', 'Hbc',
    'PTZ', 'ZTE', 'ZPE', 'AlturaREN',
]
# Keep only the columns this CSV actually contains, in the order listed above.
available_cols = [name for name in numeric_cols if name in df.columns]

print('üìà Estad√≠sticas de Variables Num√©ricas:')
stats_table = df[available_cols].describe()
# Last expression of the cell: the rounded table is rendered by the notebook.
stats_table.round(2)

### 1.2 Análisis de la Variable Objetivo (Dx_anemia)

In [None]:
# Frequency of each diagnosis; missing diagnoses are counted as their own
# bucket (dropna=False).
print('üìå Distribuci√≥n de Dx_anemia:')
target_counts = df['Dx_anemia'].value_counts(dropna=False)
print(target_counts)

# Bar chart of the counts.
fig, ax = plt.subplots(figsize=(8, 5))
colors = ['#2ecc71', '#e74c3c', '#f39c12', '#9b59b6', '#95a5a6']
target_counts.plot(kind='bar', color=colors[:len(target_counts)], ax=ax, edgecolor='black')
ax.set_title('Distribuci√≥n del Diagn√≥stico de Anemia', fontsize=14, fontweight='bold')
ax.set_xlabel('Diagn√≥stico')
ax.set_ylabel('Cantidad de Registros')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

# Write each count just above its bar. The gap is a fraction of the tallest
# bar rather than a fixed number of records, so the labels stay close to the
# bars whatever the size of the dataset.
label_gap = 0.01 * target_counts.max() if len(target_counts) else 0
for i, v in enumerate(target_counts.values):
    ax.text(i, v + label_gap, str(v), ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

### 1.3 Distribución de Hemoglobina

In [None]:
# Keep the rows that have both a diagnosis and a hemoglobin value.
df_valid = df[df['Dx_anemia'].notna() & df['Hemoglobina'].notna()].copy()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: overall histogram with a reference line at 11 g/dL.
axes[0].hist(df_valid['Hemoglobina'], bins=30, color='#3498db', edgecolor='black', alpha=0.7)
axes[0].set_title('Distribuci√≥n de Hemoglobina', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Hemoglobina (g/dL)')
axes[0].set_ylabel('Frecuencia')
axes[0].axvline(x=11, color='red', linestyle='--', label='Umbral anemia (11 g/dL)')
axes[0].legend()

# Right panel: box plot per diagnosis, restricted to the categories present.
# Colors are keyed by category so each diagnosis keeps its own color even when
# another category is missing from the data (a positional list would shift).
dx_colors = {
    'Normal': '#2ecc71',
    'Anemia Leve': '#f39c12',
    'Anemia Moderada': '#e74c3c',
}
order = ['Normal', 'Anemia Leve', 'Anemia Moderada']
available_order = [o for o in order if o in df_valid['Dx_anemia'].unique()]

# Skip the box plot when none of the expected categories is present.
if available_order:
    sns.boxplot(data=df_valid, x='Dx_anemia', y='Hemoglobina', order=available_order,
                palette=[dx_colors[o] for o in available_order], ax=axes[1])
axes[1].set_title('Hemoglobina por Diagn√≥stico', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Diagn√≥stico de Anemia')
axes[1].set_ylabel('Hemoglobina (g/dL)')

plt.tight_layout()
plt.show()

### 1.4 Valores Faltantes

In [None]:
# Percentage of missing values for each tracked numeric column.
print('üîç Valores Faltantes (% por columna):')
null_counts = df[available_cols].isnull().sum()
missing = null_counts / len(df) * 100
missing_filtered = missing[missing > 0].sort_values(ascending=False)

# Confirm there are no gaps, or list the affected columns (largest first).
if missing_filtered.empty:
    print('‚úÖ No hay valores faltantes en las columnas num√©ricas principales.')
else:
    print(missing_filtered.round(2))

### 1.5 Matriz de Correlación

In [None]:
# Pairwise correlations, computed on the rows where every tracked numeric
# column is present.
df_numeric = df[available_cols].dropna()
corr_matrix = df_numeric.corr()

# Boolean mask that is True on the diagonal and above it; those cells are
# passed to the heatmap as the ones to hide.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(10, 8))
heatmap_options = dict(
    annot=True, fmt='.2f', cmap='RdBu_r',
    center=0, square=True, linewidths=0.5,
)
sns.heatmap(corr_matrix, mask=mask, ax=ax, **heatmap_options)
ax.set_title('Matriz de Correlaci√≥n - Features Num√©ricas', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

---
## 2️⃣ Preprocesamiento de Datos

In [None]:
print('=' * 60)
print('2. PREPROCESAMIENTO DE DATOS')
print('=' * 60)

# Keep only the records that carry an anemia diagnosis.
df_clean = df[df['Dx_anemia'].notna()].copy()
print(f'\nüìå Registros con diagn√≥stico v√°lido: {len(df_clean)}')

# Binary target: 0 for the 'Normal' diagnosis, 1 for any other diagnosis.
# Labels are compared after trimming whitespace and ignoring case, so variants
# such as ' normal' or 'NORMAL' are not silently counted as anemia.
dx_label = df_clean['Dx_anemia'].astype(str).str.strip().str.casefold()
df_clean['anemia_binary'] = (dx_label != 'normal').astype(int)

print(f'\nüìä Distribuci√≥n del target binario:')
print(df_clean['anemia_binary'].value_counts())
print(f'   - Sin anemia (0): {(df_clean["anemia_binary"] == 0).sum()}')
print(f'   - Con anemia (1): {(df_clean["anemia_binary"] == 1).sum()}')

In [None]:
# Candidate predictors.
feature_cols = [
    'Sexo',           # Categorical
    'EdadMeses',      # Numeric
    'Peso',           # Numeric
    'Talla',          # Numeric
    'PTZ',            # Weight-for-height z-score
    'ZTE',            # Height-for-age z-score
    'ZPE',            # Weight-for-age z-score
    'AlturaREN',      # Altitude
    'Suplementacion', # Programme
    'SIS',            # Insurance
]

# Use only the candidate columns this dataset actually has.
available_features = [col for col in feature_cols if col in df_clean.columns]
print(f'\nüìã Features seleccionadas: {available_features}')

# Modelling table: selected features plus the binary target.
df_model = df_clean[available_features + ['anemia_binary']].copy()

# Drop rows missing a critical measurement. The list is limited to columns
# that are present because dropna(subset=...) raises KeyError for unknown
# labels, which would defeat the availability check above.
critical_cols = [col for col in ('EdadMeses', 'Peso', 'Talla') if col in df_model.columns]
initial_rows = len(df_model)
if critical_cols:
    df_model = df_model.dropna(subset=critical_cols)
print(f'\nüßπ Filas eliminadas por valores faltantes: {initial_rows - len(df_model)}')
print(f'üìå Dataset final para modelado: {len(df_model)} registros')

In [None]:
# Fill the gaps that remain in the non-critical columns.
# NOTE(review): the medians are computed on the whole table, i.e. before the
# train/test split, so test rows influence the imputed values. Consider
# imputing inside a pipeline fitted on the training data only.
for col in ['PTZ', 'ZTE', 'ZPE', 'AlturaREN']:
    if col in df_model.columns:
        df_model[col] = df_model[col].fillna(df_model[col].median())

# Flag columns: anything that cannot be parsed as a number becomes 0. Report
# how many non-missing values were replaced so the substitution is visible
# instead of silent.
for col in ['Suplementacion', 'SIS']:
    if col in df_model.columns:
        raw_values = df_model[col]
        numeric_values = pd.to_numeric(raw_values, errors='coerce')
        n_replaced = int((numeric_values.isna() & raw_values.notna()).sum())
        if n_replaced:
            print(f'   Aviso - {col}: {n_replaced} valores no convertibles reemplazados por 0')
        df_model[col] = numeric_values.fillna(0)

# Encode Sexo as 'M' -> 1, 'F' -> 0. Labels are trimmed and upper-cased first
# so 'm' or ' F ' are recognised; any other value still falls back to 0, and
# the number of such non-missing values is reported.
if 'Sexo' in df_model.columns:
    raw_sexo = df_model['Sexo']
    encoded_sexo = raw_sexo.astype(str).str.strip().str.upper().map({'M': 1, 'F': 0})
    n_unknown = int((encoded_sexo.isna() & raw_sexo.notna()).sum())
    if n_unknown:
        print(f'   Aviso - Sexo: {n_unknown} valores distintos de M/F codificados como 0')
    df_model['Sexo'] = encoded_sexo.fillna(0)

print('‚úÖ Preprocesamiento completado')
df_model.head()

### 2.1 División Train/Test

In [None]:
rule = '-' * 40
print(rule)
print('üìå Divisi√≥n Train/Test')
print(rule)

# Feature matrix, binary target and the column order used from here on.
X = df_model[available_features]
y = df_model['anemia_binary']
feature_names = X.columns.tolist()

# 80/20 split with a fixed seed, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f'   - Entrenamiento: {len(X_train)} registros')
print(f'   - Prueba: {len(X_test)} registros')
print(f'   - Proporci√≥n anemia en train: {y_train.mean():.2%}')
print(f'   - Proporci√≥n anemia en test: {y_test.mean():.2%}')

# Standardise every feature column: the scaler is fitted on the training rows
# only and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('\n‚úÖ Datos escalados correctamente')

---
## 3️⃣ Entrenamiento de Modelos

In [None]:
# Imported in this cell because it is only needed for cross-validation.
from sklearn.pipeline import make_pipeline

print('=' * 60)
print('3. ENTRENAMIENTO DE MODELOS')
print('=' * 60)

# Candidate classifiers, all with a fixed seed.
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

trained_models = {}

for name, model in models.items():
    print(f'\nüîÑ Entrenando {name}...')

    # 5-fold cross-validation on the unscaled training data, with the scaler
    # inside the pipeline. Each fold then fits its own scaler on that fold's
    # training part, so the held-out part never contributes to the scaling
    # statistics (scoring X_train_scaled would leak them across folds).
    cv_pipeline = make_pipeline(StandardScaler(), model)
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='f1')

    # Final fit on the full scaled training set. cross_val_score works on
    # clones of the estimator, so `model` itself is still unfitted here.
    model.fit(X_train_scaled, y_train)
    print(f'   ‚úÖ F1-Score CV (5-fold): {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})')

    trained_models[name] = model

print('\n‚úÖ Todos los modelos entrenados correctamente')

---
## 4️⃣ Evaluación de Modelos

In [None]:
banner = '=' * 60
print(banner)
print('4. EVALUACI√ìN DE MODELOS')
print(banner)

# One dict per model: its name, five test-set metrics and the raw predictions.
results = []
rule = '‚îÄ' * 50

for name, model in trained_models.items():
    print(f'\n{rule}')
    print(f'üìä {name}')
    print(rule)

    # Hard labels, plus the second column of predict_proba, which is used
    # below as the score for the AUC and stored for the ROC curves.
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    # Test-set metrics, keyed by the column names used in the results table.
    scores = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'AUC-ROC': roc_auc_score(y_test, y_prob),
    }

    print(f'   Accuracy:  {scores["Accuracy"]:.4f}')
    print(f'   Precision: {scores["Precision"]:.4f}')
    print(f'   Recall:    {scores["Recall"]:.4f}')
    print(f'   F1-Score:  {scores["F1-Score"]:.4f}')
    print(f'   AUC-ROC:   {scores["AUC-ROC"]:.4f}')

    results.append({'Modelo': name, **scores, 'y_prob': y_prob, 'y_pred': y_pred})

### 4.1 Matrices de Confusión

In [None]:
# One confusion matrix per evaluated model, side by side. The grid is sized
# from `results` so adding or removing a model does not break the cell;
# squeeze=False keeps `axes` two-dimensional even for a single model.
n_models = len(results)
fig, axes = plt.subplots(1, max(n_models, 1), figsize=(5 * max(n_models, 1), 4), squeeze=False)

for ax, result in zip(axes[0], results):
    # labels=[0, 1] fixes the row/column order so it always matches the
    # display labels, even if a class is missing from the test predictions.
    cm = confusion_matrix(y_test, result['y_pred'], labels=[0, 1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Sin Anemia', 'Con Anemia'])
    disp.plot(ax=ax, cmap='Blues', values_format='d')
    ax.set_title(f"{result['Modelo']}\nF1={result['F1-Score']:.3f}", fontweight='bold')

plt.tight_layout()
plt.show()

### 4.2 Comparación de Métricas

In [None]:
# Results table: one row per model.
df_results = pd.DataFrame(results)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bars_ax, roc_ax = axes

# Left panel: grouped bars, one group per model and one bar per metric.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC']
colors = ['#3498db', '#2ecc71', '#e74c3c', '#9b59b6', '#f39c12']
x = np.arange(len(df_results))
width = 0.15

for slot, (metric, bar_color) in enumerate(zip(metrics, colors)):
    bars_ax.bar(x + slot * width, df_results[metric], width, label=metric, color=bar_color)

bars_ax.set_title('Comparaci√≥n de M√©tricas por Modelo', fontsize=12, fontweight='bold')
bars_ax.set_xlabel('Modelo')
bars_ax.set_ylabel('Score')
# Tick under the third of the five bars, i.e. the centre of each group.
bars_ax.set_xticks(x + width * 2)
bars_ax.set_xticklabels(df_results['Modelo'], rotation=15, ha='right')
bars_ax.set_ylim([0, 1.1])
bars_ax.legend(loc='lower right')

# Right panel: one ROC curve per model plus the diagonal reference line.
for result in results:
    fpr, tpr, _ = roc_curve(y_test, result['y_prob'])
    curve_label = f"{result['Modelo']} (AUC={result['AUC-ROC']:.3f})"
    roc_ax.plot(fpr, tpr, label=curve_label, linewidth=2)

roc_ax.plot([0, 1], [0, 1], 'k--', label='Random', alpha=0.5)
roc_ax.set_title('Curvas ROC Comparativas', fontsize=12, fontweight='bold')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend(loc='lower right')

plt.tight_layout()
plt.show()

### 4.3 Importancia de Features (Random Forest)

In [None]:
# Feature importances reported by the fitted Random Forest.
rf_model = trained_models['Random Forest']
importances = rf_model.feature_importances_
# Feature positions sorted from most to least important.
indices = np.argsort(importances)[::-1]

ranked_labels = [feature_names[i] for i in indices]
ranked_scores = importances[indices]
bar_slots = range(len(feature_names))

fig, ax = plt.subplots(figsize=(10, 6))
colors = plt.cm.RdYlGn(np.linspace(0.2, 0.8, len(feature_names)))

ax.barh(bar_slots, ranked_scores, color=colors)
ax.set_yticks(bar_slots)
ax.set_yticklabels(ranked_labels)
ax.set_xlabel('Importancia')
ax.set_title('Importancia de Features - Random Forest', fontsize=12, fontweight='bold')
# Flip the axis so the first (most important) bar is drawn at the top.
ax.invert_yaxis()

plt.tight_layout()
plt.show()

---
## 5️⃣ Resumen y Conclusiones

In [None]:
banner = '=' * 60
print(banner)
print('5. RESUMEN Y CONCLUSIONES')
print(banner)

# Comparison table without the per-model prediction arrays.
summary_columns = ['Modelo', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC']
df_results_clean = df_results[summary_columns]
print('\nüìä TABLA COMPARATIVA DE MODELOS:')
print(df_results_clean.to_string(index=False))

# Row of the model with the highest F1-Score.
best_idx = df_results['F1-Score'].idxmax()
best_model = df_results.loc[best_idx]

print(f'\nüèÜ MEJOR MODELO: {best_model["Modelo"]}')
print('-' * 40)
print(f'   Accuracy:  {best_model["Accuracy"]:.4f}')
print(f'   Precision: {best_model["Precision"]:.4f}')
print(f'   Recall:    {best_model["Recall"]:.4f}')
print(f'   F1-Score:  {best_model["F1-Score"]:.4f}')
print(f'   AUC-ROC:   {best_model["AUC-ROC"]:.4f}')

In [None]:
# Write the comparison table to a CSV file (relative path, no index column).
csv_name = 'resultados_modelos.csv'
df_results_clean.to_csv(csv_name, index=False)
print('\n‚úÖ Resultados guardados en: resultados_modelos.csv')

# Hand the file to Colab's download helper.
from google.colab import files
files.download(csv_name)

---
## 🔮 Predicción con Nuevos Datos (Opcional)

In [None]:
def predict_anemia(sexo, edad_meses, peso, talla, ptz, zte, zpe, altura,
                   suplementacion, sis, model_name='Random Forest'):
    """
    Predict whether a patient has anemia and print the result.

    Relies on the notebook globals `scaler`, `trained_models` and
    `feature_names` created by the split and training cells.

    Parameters:
    - sexo: 'M' or 'F' (case and surrounding whitespace are ignored;
      any other value is encoded as 0, the same code as 'F')
    - edad_meses: age in months
    - peso: weight in kg
    - talla: height in cm
    - ptz, zte, zpe: z-scores
    - altura: altitude in metres
    - suplementacion: 0 or 1
    - sis: 0 or 1
    - model_name: key of the model in `trained_models` to use. Defaults to
      'Random Forest'; pass e.g. `best_model["Modelo"]` to use the model
      with the best F1-Score.

    Returns:
    - (prediction, probability): the predicted class for the patient and the
      row returned by `predict_proba` for that patient.

    Raises:
    - KeyError: if `model_name` is not a key of `trained_models`.
    """
    if model_name not in trained_models:
        raise KeyError(
            f'Modelo desconocido: {model_name!r}. Opciones: {sorted(trained_models)}'
        )
    chosen_model = trained_models[model_name]

    # Encode sex the same way as the training data: 'M' -> 1, otherwise 0.
    sexo_encoded = 1 if str(sexo).strip().upper() == 'M' else 0

    # Values keyed by the training column names.
    values = {
        'Sexo': sexo_encoded,
        'EdadMeses': edad_meses,
        'Peso': peso,
        'Talla': talla,
        'PTZ': ptz,
        'ZTE': zte,
        'ZPE': zpe,
        'AlturaREN': altura,
        'Suplementacion': suplementacion,
        'SIS': sis,
    }

    # Select exactly the columns, in exactly the order, that the scaler was
    # fitted on. A fixed 10-value array would not match if some feature
    # columns were absent from the dataset when the models were trained.
    features = pd.DataFrame([values])[feature_names]

    # Scale with the scaler fitted on the training data.
    features_scaled = scaler.transform(features)

    # Predict the class and the per-class probabilities for this one row.
    prediction = chosen_model.predict(features_scaled)[0]
    probability = chosen_model.predict_proba(features_scaled)[0]

    resultado = 'CON ANEMIA' if prediction == 1 else 'SIN ANEMIA'

    print(f'\nüîÆ RESULTADO DE PREDICCI√ìN')
    print('-' * 40)
    print(f'   Diagn√≥stico: {resultado}')
    print(f'   Probabilidad Sin Anemia: {probability[0]:.2%}')
    print(f'   Probabilidad Con Anemia: {probability[1]:.2%}')

    return prediction, probability

In [None]:
# Usage example with illustrative values; the keys are the keyword arguments
# of predict_anemia.
example_patient = {
    'sexo': 'F',
    'edad_meses': 36,
    'peso': 12.5,
    'talla': 92,
    'ptz': 0.5,
    'zte': -0.3,
    'zpe': 0.2,
    'altura': 3000,
    'suplementacion': 1,
    'sis': 1,
}
predict_anemia(**example_patient)

---
## ✅ Proceso Completado

### Archivos generados:
- `resultados_modelos.csv` - Métricas de rendimiento de cada modelo

### Próximos pasos sugeridos:
1. Ajustar hiperparámetros con GridSearchCV
2. Probar técnicas de balanceo (SMOTE) si el dataset está desbalanceado
3. Agregar más features si están disponibles
4. Implementar validación cruzada estratificada