<a href="https://colab.research.google.com/github/r-nassib/practica-acuity/blob/main/Practice_acuity_tid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TID 2025 Competition

Competición de Kaggle. Evaluación ML de asignatura TID

In [4]:
# ==========================================
# 0. MONTAJE DE DRIVE Y RUTAS
# ==========================================
from google.colab import drive
drive.mount('/content/drive')

DATA_PATH = "/content/drive/MyDrive/tid-data-competition"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# ==========================================
# 1. IMPORTAR LIBRERÍAS
# ==========================================
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, make_scorer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [6]:
# ==========================================
# 2. CARGAR DATOS
# ==========================================
train = pd.read_csv(f"{DATA_PATH}/train_kaggle.csv")
test = pd.read_csv(f"{DATA_PATH}/test_kaggle.csv")
sample_submission = pd.read_csv(f"{DATA_PATH}/submission.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)
print(train.head())

Train shape: (209050, 19)
Test shape: (209050, 18)
    stay_id               intime              outtime gender  \
0  32563223  2111-05-01 15:04:00  2111-05-01 20:14:00      M   
1  39010345  2184-05-22 11:01:00  2184-05-22 17:34:00      M   
2  34685893  2184-04-03 13:55:00  2184-04-05 11:31:00      M   
3  35539621  2149-03-05 07:23:00  2149-03-05 08:34:00      F   
4  38659214  2166-05-02 11:38:00  2166-05-02 14:17:00      F   

                             race arrival_transport disposition  temperature  \
0                           WHITE           WALK IN    ADMITTED         98.9   
1                           WHITE           WALK IN        HOME         96.0   
2  HISPANIC/LATINO - PUERTO RICAN           WALK IN        HOME         98.9   
3                           OTHER           WALK IN    TRANSFER         98.4   
4                           WHITE           WALK IN        HOME         97.0   

   heartrate  resprate  o2sat    sbp   dbp pain  acuity  \
0       58.0      16.0  

In [7]:
# ==========================================
# 3. DEFINIR COLUMNAS CLAVE (TARGET + ID)
# ==========================================
TARGET_COL = "acuity"    # Nivel de urgencia: Low / Medium / High
ID_COL = "stay_id"       # Identificador de la estancia

y = train[TARGET_COL]
X = train.drop(columns=[TARGET_COL])

# Si el ID está en las columnas predictoras, lo quitamos
if ID_COL in X.columns:
    X = X.drop(columns=[ID_COL])

X_test = test.copy()
if ID_COL in X_test.columns:
    X_test = X_test.drop(columns=[ID_COL])

print("Features:", X.columns.tolist())
print("Clases objetivo:", y.unique())

Features: ['intime', 'outtime', 'gender', 'race', 'arrival_transport', 'disposition', 'temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'pain', 'chiefcomplaint', 'medrecon_count', 'pyxis_count', 'vitalsign_count']
Clases objetivo: [3. 2. 4. 1. 5.]


In [8]:
# ==========================================
# 4. PREPROCESAMIENTO (NUMÉRICO + CATEGÓRICO)
# ==========================================
numeric_selector = make_column_selector(dtype_include=["int64", "float64"])
categorical_selector = make_column_selector(dtype_exclude=["int64", "float64"])

numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_selector),
        ("cat", categorical_pipeline, categorical_selector),
    ],
    remainder="drop"
)

In [9]:
# ==========================================
# 5. DEFINIR MODELOS
#    - Modelo 1: Árbol de decisión (muy interpretable)
#    - Modelo 2: Random Forest (mejor rendimiento, aún explicable)
# ==========================================

# Árbol de decisión sencillo para interpretabilidad
dt_clf = DecisionTreeClassifier(
    max_depth=4,            # Árbol poco profundo, fácil de explicar
    class_weight="balanced",
    random_state=42
)

dt_pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", dt_clf)
])

# Random Forest como modelo principal
rf_clf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=3,
    class_weight="balanced",  # ayuda con desbalanceo de clases
    n_jobs=-1,
    random_state=42
)

rf_pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", rf_clf)
])

In [10]:
# ==========================================
# 6. VALIDACIÓN CRUZADA (Stratified K-Fold)
#    Métricas: Accuracy y F1-macro
# ==========================================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = {
    "accuracy": make_scorer(accuracy_score),
    "f1_macro": make_scorer(f1_score, average="macro")
}

def evaluar_modelo(pipe, X, y, nombre):
    cv_results = cross_validate(
        pipe,
        X, y,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False
    )

    acc_mean = cv_results["test_accuracy"].mean()
    f1_mean = cv_results["test_f1_macro"].mean()

    print(f"\n===== {nombre} =====")
    print(f"Accuracy CV (media):  {acc_mean:.4f}")
    print(f"F1-macro CV (media):  {f1_mean:.4f}")
    print("========================")

    return acc_mean, f1_mean

dt_acc, dt_f1 = evaluar_modelo(dt_pipe, X, y, "Decision Tree (baseline interpretable)")
rf_acc, rf_f1 = evaluar_modelo(rf_pipe, X, y, "Random Forest (modelo principal)")

# Elegimos como modelo final el Random Forest (normalmente mejor F1-macro)
best_model = rf_pipe


===== Decision Tree (baseline interpretable) =====
Accuracy CV (media):  0.3806
F1-macro CV (media):  0.2983

===== Random Forest (modelo principal) =====
Accuracy CV (media):  0.3542
F1-macro CV (media):  0.2930


In [11]:
# ==========================================
# 7. ENTRENAR MODELO FINAL EN TODO EL TRAIN
#    Y GENERAR PREDICCIONES PARA TEST
# ==========================================
best_model.fit(X, y)
test_pred = best_model.predict(X_test)

# Comprobamos las clases predichas
print("\nClases predichas en test:", np.unique(test_pred))


Clases predichas en test: [1. 2. 3. 4. 5.]


In [12]:
# ==========================================
# 8. CREAR SUBMISSION PARA KAGGLE / TID COMPETITION
# ==========================================
submission = sample_submission.copy()

# Asumimos que sample_submission tiene columnas: ['stay_id', 'acuity']
# y que 'stay_id' coincide con test[ID_COL]
submission["acuity"] = test_pred

submission_path = f"{DATA_PATH}/submission_tid2025_rf.csv"
submission.to_csv(submission_path, index=False)

print("\n✔ Archivo de submission creado:")
print(submission_path)
print(submission.head())


✔ Archivo de submission creado:
/content/drive/MyDrive/tid-data-competition/submission_tid2025_rf.csv
    stay_id  acuity
0  30435959     2.0
1  37847321     1.0
2  31222694     2.0
3  32964590     1.0
4  38112419     2.0
