# Clasificaci√≥n ocupacional ENAHO: Modelos TF-IDF + Regresi√≥n log√≠stica y SVM - Modelo multimodal

In [2]:
from google.colab import drive
drive.flush_and_unmount()        # Desmonta cualquier conexi√≥n previa
!rm -rf /content/drive           # Borra restos del montaje anterior
drive.mount('/content/drive')    # Vuelve a montar desde cero

Mounted at /content/drive


In [3]:
# ============================================================
# 1. LIBRER√çAS
# ============================================================
import os, json
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, log_loss
)

from scipy.sparse import hstack, csr_matrix

In [4]:
# ============================================================
# 2. RUTAS Y CARPETAS DE SALIDA
# ============================================================
BASE = "/content/drive/MyDrive/classification_coding_open_ended_occupational_responses_ENAHO/TF-IDF"

PATH_IN = f"{BASE}/BASE_LEMATIZADA.parquet"
OUT = f"{BASE}/MODELOS_MULTIMODAL_TFIDF_ML"

os.makedirs(OUT, exist_ok=True)
os.makedirs(f"{OUT}/metricas", exist_ok=True)

In [5]:
# ============================================================
# 3. CONFIGURACI√ìN DEL PIPELINE
# ============================================================
TEXT_COL   = "texto_lematizado"  # texto lematizado
EDAD_COL   = "p208a"             # edad (num√©rica)
NIVEL_COL  = "p301a"             # nivel educativo (categ√≥rica)
DESEM_COL  = "p507"              # tipo de desempe√±o (categ√≥rica)
TARGET_COL = "p505r4"            # c√≥digo CNO-2015 (target)

RANDOM_STATE = 2025
TRAIN_SIZE   = 0.70              # 70% train, 15% val, 15% test
MIN_SAMPLES_PER_CLASS = 10       # m√≠nimo por clase

In [6]:
# ============================================================
# 4. CARGA Y FILTRO INICIAL
# ============================================================
print("\nüìå Cargando base...")

df = pd.read_parquet(PATH_IN)

# Eliminamos registros sin texto
df = df[df[TEXT_COL].notna()]

# Opcional: asegurarse de que las variables estructuradas no tengan NA
df = df.dropna(subset=[EDAD_COL, NIVEL_COL, DESEM_COL, TARGET_COL])

# Filtrar clases con pocas observaciones
counts = df[TARGET_COL].value_counts()
valid_classes = counts[counts >= MIN_SAMPLES_PER_CLASS].index
df = df[df[TARGET_COL].isin(valid_classes)]

print(f"‚úî Filas finales tras filtros: {len(df):,}")
print(f"‚úî Clases mantenidas: {len(valid_classes):,}")


üìå Cargando base...
‚úî Filas finales tras filtros: 315,546
‚úî Clases mantenidas: 357


In [7]:
# ============================================================
# 5. SPLIT ESTRATIFICADO 70 / 15 / 15
# ============================================================
print("\nüìå Realizando split estratificado 70/15/15...")

# Primer split: train vs (val+test)
train_df, temp_df = train_test_split(
    df,
    train_size=TRAIN_SIZE,
    stratify=df[TARGET_COL],
    random_state=RANDOM_STATE
)

# Segundo split: val vs test (50/50 del 30% restante)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df[TARGET_COL],
    random_state=RANDOM_STATE
)

print(f"‚úî Train: {len(train_df):,}")
print(f"‚úî Val:   {len(val_df):,}")
print(f"‚úî Test:  {len(test_df):,}")



üìå Realizando split estratificado 70/15/15...
‚úî Train: 220,882
‚úî Val:   47,332
‚úî Test:  47,332


In [8]:
# ============================================================
# 6. FEATURES TEXTUALES (TF-IDF SOBRE TEXTO)
# ============================================================
print("\nüìå Entrenando TF-IDF sobre texto lematizado...")

X_train_text = train_df[TEXT_COL]
X_val_text   = val_df[TEXT_COL]
X_test_text  = test_df[TEXT_COL]

tfidf = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1,3),
    min_df=5,
    max_df=0.85,
    sublinear_tf=True,
    token_pattern=r"(?u)\b\w+\b"
)

X_train_tfidf = tfidf.fit_transform(X_train_text)
X_val_tfidf   = tfidf.transform(X_val_text)
X_test_tfidf  = tfidf.transform(X_test_text)

print("‚úî TF-IDF listo.")
print("  Dimensiones Train:", X_train_tfidf.shape)
print("  Dimensiones Val:  ", X_val_tfidf.shape)
print("  Dimensiones Test: ", X_test_tfidf.shape)


üìå Entrenando TF-IDF sobre texto lematizado...
‚úî TF-IDF listo.
  Dimensiones Train: (220882, 15000)
  Dimensiones Val:   (47332, 15000)
  Dimensiones Test:  (47332, 15000)


In [10]:
# ============================================================
# 7. FEATURES ESTRUCTURADAS (EDAD + NIVEL + DESEMPE√ëO)
# ============================================================
print("\nüìå Preparando features estructuradas (edad, nivel, desempe√±o)...")

# Extraer columnas estructuradas
X_train_struct = train_df[[EDAD_COL, NIVEL_COL, DESEM_COL]].copy()
X_val_struct   = val_df[[EDAD_COL, NIVEL_COL, DESEM_COL]].copy()
X_test_struct  = test_df[[EDAD_COL, NIVEL_COL, DESEM_COL]].copy()

# Asegurar tipos adecuados
X_train_struct[EDAD_COL]  = X_train_struct[EDAD_COL].astype(float)
X_val_struct[EDAD_COL]    = X_val_struct[EDAD_COL].astype(float)
X_test_struct[EDAD_COL]   = X_test_struct[EDAD_COL].astype(float)

X_train_struct[NIVEL_COL] = X_train_struct[NIVEL_COL].astype(int)
X_val_struct[NIVEL_COL]   = X_val_struct[NIVEL_COL].astype(int)
X_test_struct[NIVEL_COL]  = X_test_struct[NIVEL_COL].astype(int)

X_train_struct[DESEM_COL] = X_train_struct[DESEM_COL].astype(int)
X_val_struct[DESEM_COL]   = X_val_struct[DESEM_COL].astype(int)
X_test_struct[DESEM_COL]  = X_test_struct[DESEM_COL].astype(int)

# Escalador para la edad
scaler_edad = StandardScaler()
edad_train = scaler_edad.fit_transform(X_train_struct[[EDAD_COL]])
edad_val   = scaler_edad.transform(X_val_struct[[EDAD_COL]])
edad_test  = scaler_edad.transform(X_test_struct[[EDAD_COL]])

# One-Hot Encoding para NIVEL y DESEMPE√ëO
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
except:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)

cats_train = ohe.fit_transform(X_train_struct[[NIVEL_COL, DESEM_COL]])
cats_val   = ohe.transform(X_val_struct[[NIVEL_COL, DESEM_COL]])
cats_test  = ohe.transform(X_test_struct[[NIVEL_COL, DESEM_COL]])

# Convertir edad a sparse y concatenar con categ√≥ricas
edad_train_sp = csr_matrix(edad_train)
edad_val_sp   = csr_matrix(edad_val)
edad_test_sp  = csr_matrix(edad_test)

X_train_struct_final = hstack([edad_train_sp, cats_train])
X_val_struct_final   = hstack([edad_val_sp, cats_val])
X_test_struct_final  = hstack([edad_test_sp, cats_test])

print("‚úî Features estructuradas listas.")
print("  Dimensiones estructurales Train:", X_train_struct_final.shape)


üìå Preparando features estructuradas (edad, nivel, desempe√±o)...
‚úî Features estructuradas listas.
  Dimensiones estructurales Train: (220882, 20)


In [11]:
# ============================================================
# 8. FUSI√ìN MULTIMODAL (TEXTO + ESTRUCTURADAS)
# ============================================================
print("\nüìå Combinando TF-IDF + variables estructuradas...")

X_train_final = hstack([X_train_tfidf, X_train_struct_final]).tocsr()
X_val_final   = hstack([X_val_tfidf,   X_val_struct_final]).tocsr()
X_test_final  = hstack([X_test_tfidf,  X_test_struct_final]).tocsr()

print("‚úî Dimensiones finales:")
print("  Train:", X_train_final.shape)
print("  Val:  ", X_val_final.shape)
print("  Test: ", X_test_final.shape)


üìå Combinando TF-IDF + variables estructuradas...
‚úî Dimensiones finales:
  Train: (220882, 15020)
  Val:   (47332, 15020)
  Test:  (47332, 15020)


In [12]:
# ============================================================
# 9. ENCODING DEL TARGET
# ============================================================
print("\nüìå Codificando etiquetas (LabelEncoder)...")

le = LabelEncoder()
y_train = train_df[TARGET_COL]
y_val   = val_df[TARGET_COL]
y_test  = test_df[TARGET_COL]

y_train_enc = le.fit_transform(y_train)
y_val_enc   = le.transform(y_val)
y_test_enc  = le.transform(y_test)

print(f"‚úî N¬∫ de clases: {len(le.classes_)}")


üìå Codificando etiquetas (LabelEncoder)...
‚úî N¬∫ de clases: 357


In [13]:
# ============================================================
# 10. FUNCI√ìN DE M√âTRICAS
# ============================================================
def metricas(y_true, y_pred, y_prob=None):
    """
    Devuelve un diccionario con m√©tricas est√°ndar de clasificaci√≥n multiclase.
    Si y_prob es None, no se calcula log_loss.
    """
    loss_value = log_loss(y_true, y_prob) if y_prob is not None else None

    return {
        "Loss": loss_value,
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1_Macro": f1_score(y_true, y_pred, average="macro"),
        "F1_Micro": f1_score(y_true, y_pred, average="micro"),
        "F1_Weighted": f1_score(y_true, y_pred, average="weighted"),
        "Precision_Macro": precision_score(y_true, y_pred, average="macro"),
        "Precision_Micro": precision_score(y_true, y_pred, average="micro"),
        "Precision_Weighted": precision_score(y_true, y_pred, average="weighted"),
        "Recall_Macro": recall_score(y_true, y_pred, average="macro"),
        "Recall_Micro": recall_score(y_true, y_pred, average="micro"),
        "Recall_Weighted": recall_score(y_true, y_pred, average="weighted"),
    }

In [14]:
# ============================================================
# 11. REGRESI√ìN LOG√çSTICA MULTIMODAL
# ============================================================
print("\nüìå Entrenando Logistic Regression (multimodal)...")

lr = LogisticRegression(
    max_iter=400,
    class_weight="balanced",
    n_jobs=-1
)

lr.fit(X_train_final, y_train_enc)

# Predicciones y probabilidades
lr_pred_tr = lr.predict(X_train_final)
lr_pred_va = lr.predict(X_val_final)
lr_pred_te = lr.predict(X_test_final)

lr_prob_tr = lr.predict_proba(X_train_final)
lr_prob_va = lr.predict_proba(X_val_final)
lr_prob_te = lr.predict_proba(X_test_final)

metricas_lr = {
    "Train": metricas(y_train_enc, lr_pred_tr, lr_prob_tr),
    "Val":   metricas(y_val_enc,   lr_pred_va, lr_prob_va),
    "Test":  metricas(y_test_enc,  lr_pred_te, lr_prob_te),
}

with open(f"{OUT}/metricas/lr_multimodal.json", "w") as f:
    json.dump(metricas_lr, f, indent=4)


üìå Entrenando Logistic Regression (multimodal)...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# ============================================================
# 12. SVM LINEAL MULTIMODAL
# ============================================================
print("\nüìå Entrenando Linear SVM (multimodal)...")

svm = LinearSVC(class_weight="balanced")

svm.fit(X_train_final, y_train_enc)

svm_pred_tr = svm.predict(X_train_final)
svm_pred_va = svm.predict(X_val_final)
svm_pred_te = svm.predict(X_test_final)

metricas_svm = {
    "Train": metricas(y_train_enc, svm_pred_tr),
    "Val":   metricas(y_val_enc,   svm_pred_va),
    "Test":  metricas(y_test_enc,  svm_pred_te),
}

with open(f"{OUT}/metricas/svm_multimodal.json", "w") as f:
    json.dump(metricas_svm, f, indent=4)


üìå Entrenando Linear SVM (multimodal)...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# ============================================================
# 13. TABLAS RESUMEN DE M√âTRICAS (4 DECIMALES)
# ============================================================
print("\nüìå TABLA DE M√âTRICAS ‚Äì LOGISTIC REGRESSION (MULTIMODAL)\n")
df_lr = pd.DataFrame(metricas_lr).T.applymap(
    lambda x: round(x, 4) if isinstance(x, (float, int)) else x
)
print(df_lr)

print("\nüìå TABLA DE M√âTRICAS ‚Äì LINEAR SVM (MULTIMODAL)\n")
df_svm = pd.DataFrame(metricas_svm).T.applymap(
    lambda x: round(x, 4) if isinstance(x, (float, int)) else x
)
print(df_svm)

  df_lr = pd.DataFrame(metricas_lr).T.applymap(



üìå TABLA DE M√âTRICAS ‚Äì LOGISTIC REGRESSION (MULTIMODAL)

         Loss  Accuracy  F1_Macro  F1_Micro  F1_Weighted  Precision_Macro  \
Train  0.5677    0.8959    0.7864    0.8959       0.9004           0.7055   
Val    0.6519    0.8711    0.5734    0.8711       0.8783           0.5336   
Test   0.6513    0.8702    0.5670    0.8702       0.8781           0.5352   

       Precision_Micro  Precision_Weighted  Recall_Macro  Recall_Micro  \
Train           0.8959              0.9271        0.9503        0.8959   
Val             0.8711              0.9051        0.6598        0.8711   
Test            0.8702              0.9050        0.6478        0.8702   

       Recall_Weighted  
Train           0.8959  
Val             0.8711  
Test            0.8702  

üìå TABLA DE M√âTRICAS ‚Äì LINEAR SVM (MULTIMODAL)

       Loss  Accuracy  F1_Macro  F1_Micro  F1_Weighted  Precision_Macro  \
Train   NaN    0.9589    0.9216    0.9589       0.9582           0.8830   
Val     NaN    0.9094    0.

  df_svm = pd.DataFrame(metricas_svm).T.applymap(


In [17]:
# ============================================================
# 14. GUARDADO DE MODELOS Y ARTEFACTOS
# ============================================================
print("\nüíæ Guardando modelos y artefactos...")

# Preprocesadores
joblib.dump(tfidf,        f"{OUT}/tfidf_multimodal.joblib")
joblib.dump(scaler_edad,  f"{OUT}/scaler_edad_multimodal.joblib")
joblib.dump(ohe,          f"{OUT}/ohe_multimodal.joblib")
joblib.dump(le,           f"{OUT}/label_encoder_multimodal.joblib")

# Modelos
joblib.dump(lr,  f"{OUT}/lr_multimodal_model.joblib")
joblib.dump(svm, f"{OUT}/svm_multimodal_model.joblib")

print(f"\nüéâ ENTRENAMIENTO MULTIMODAL COMPLETO")
print(f"üìÅ Artefactos guardados en:\n   {OUT}")


üíæ Guardando modelos y artefactos...

üéâ ENTRENAMIENTO MULTIMODAL COMPLETO
üìÅ Artefactos guardados en:
   /content/drive/MyDrive/classification_coding_open_ended_occupational_responses_ENAHO/TF-IDF/MODELOS_MULTIMODAL_TFIDF_ML


In [18]:
# ============================================================
# 15. EXPORTACI√ìN DEL TEST SET EN M√öLTIPLES FORMATOS
# ============================================================
print("\nüíæ Exportando conjunto de TEST (multimodal)...")

EXPORT_DIR = f"{OUT}/test_export"
os.makedirs(EXPORT_DIR, exist_ok=True)

# Reconstruir DataFrame TEST con columnas clave
test_df_export = test_df[[TEXT_COL, EDAD_COL, NIVEL_COL, DESEM_COL, TARGET_COL]].copy()
test_df_export.rename(columns={
    TEXT_COL:   "texto_lematizado",
    EDAD_COL:   "edad",
    NIVEL_COL:  "nivel",
    DESEM_COL:  "desempeno",
    TARGET_COL: "target_original"
}, inplace=True)

test_df_export["target_encoded"] = y_test_enc

# Versi√≥n esencial (solo texto)
test_df_essential = test_df_export[["texto_lematizado"]].copy()

# Metadata b√°sica
metadata = {
    "total_test_rows": int(len(test_df_export)),
    "unique_clases_test": int(test_df_export["target_original"].nunique()),
    "label_encoder_mapping": {int(i): str(label) for i, label in enumerate(le.classes_)},
    "tfidf_vocab_size": int(len(tfidf.vocabulary_)),
    "train_size": int(len(train_df)),
    "val_size": int(len(val_df)),
    "test_size": int(len(test_df)),
    "structured_features": ["edad (escalada)", "nivel (one-hot)", "desempeno (one-hot)"],
}

# Exportar
test_df_export.to_parquet(f"{EXPORT_DIR}/test_data_multimodal.parquet", index=False)
test_df_export.to_csv(f"{EXPORT_DIR}/test_data_multimodal.csv", index=False, encoding="utf-8")
test_df_essential.to_parquet(f"{EXPORT_DIR}/test_data_essential_multimodal.parquet", index=False)

with open(f"{EXPORT_DIR}/test_metadata_multimodal.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=4)

print("\nüéâ Archivos de TEST exportados en:")
print(f"   {EXPORT_DIR}")


üíæ Exportando conjunto de TEST (multimodal)...

üéâ Archivos de TEST exportados en:
   /content/drive/MyDrive/classification_coding_open_ended_occupational_responses_ENAHO/TF-IDF/MODELOS_MULTIMODAL_TFIDF_ML/test_export


In [19]:
# ============================================================
# 16. EXPORTACI√ìN DE PREDICCIONES (LR y SVM)
# ============================================================
print("\nüíæ Exportando predicciones (Logistic Regression y SVM)...")

PRED_DIR = f"{OUT}/predicciones"
os.makedirs(PRED_DIR, exist_ok=True)

# DataFrame base con verdad de terreno
df_pred = pd.DataFrame({
    "texto_lematizado": test_df[TEXT_COL].values,
    "edad":             test_df[EDAD_COL].values,
    "nivel":            test_df[NIVEL_COL].values,
    "desempeno":        test_df[DESEM_COL].values,
    "true_label_original": y_test.values,
    "true_label_encoded":  y_test_enc,
})

# A√±adir predicciones LR
df_pred["pred_lr_encoded"]  = lr_pred_te
df_pred["pred_lr_original"] = le.inverse_transform(lr_pred_te)

# A√±adir predicciones SVM
df_pred["pred_svm_encoded"]  = svm_pred_te
df_pred["pred_svm_original"] = le.inverse_transform(svm_pred_te)

# Probabilidades LR
prob_df = pd.DataFrame(
    lr_prob_te,
    columns=[f"prob_class_{label}" for label in le.classes_]
)

df_pred_lr_prob = pd.concat([df_pred, prob_df], axis=1)

# Exportar
df_pred_lr_prob.to_parquet(f"{PRED_DIR}/predicciones_lr_multimodal.parquet", index=False)
df_pred.to_parquet(f"{PRED_DIR}/predicciones_svm_multimodal.parquet", index=False)
df_pred_lr_prob.to_csv(f"{PRED_DIR}/predicciones_completas_multimodal.csv", index=False, encoding="utf-8")
df_pred_lr_prob.to_parquet(f"{PRED_DIR}/predicciones_completas_multimodal.parquet", index=False)

print("\nüéâ Predicciones multimodales exportadas en:")
print(f"   {PRED_DIR}")


üíæ Exportando predicciones (Logistic Regression y SVM)...

üéâ Predicciones multimodales exportadas en:
   /content/drive/MyDrive/classification_coding_open_ended_occupational_responses_ENAHO/TF-IDF/MODELOS_MULTIMODAL_TFIDF_ML/predicciones
