# Clasificaci√≥n ocupacional ENAHO: Modelos TF-IDF + Regresi√≥n log√≠stica y SVM - Modelo concatenado

In [1]:
from google.colab import drive
drive.flush_and_unmount()        # Desmonta cualquier conexi√≥n previa
!rm -rf /content/drive           # Borra restos del montaje anterior
drive.mount('/content/drive')    # Vuelve a montar desde cero

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [2]:
# ============================================================
# 1. LIBRER√çAS
# ============================================================
# Manejo general
import os, json
import pandas as pd
import numpy as np
import joblib

# Preprocesamiento y modelado
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Modelos
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# M√©tricas
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, log_loss
)

In [3]:
# ============================================================
# 2. RUTAS Y CARPETAS DE SALIDA
# ============================================================
BASE = "/content/drive/MyDrive/classification_coding_open_ended_occupational_responses_ENAHO/TF-IDF"

# Archivo de entrada (texto lematizado)
PATH_IN = f"{BASE}/BASE_LEMATIZADA.parquet"

# Carpeta donde se guardar√°n modelos y m√©tricas
OUT = f"{BASE}/MODELOS_CONCAT_FINALES"
os.makedirs(OUT, exist_ok=True)
os.makedirs(f"{OUT}/metricas", exist_ok=True)

In [4]:
# ============================================================
# 3. CONFIGURACI√ìN DEL PIPELINE
# ============================================================
TEXT_COL = "texto_lematizado"  # texto base
EDAD_COL = "p208a"              # edad
NIVEL_COL = "p301a"             # nivel educativo
DESEM_COL = "p507"              # tipo de desempe√±o
TARGET_COL = "p505r4"           # c√≥digo CNO-2015 (target)

RANDOM_STATE = 2025
TRAIN_SIZE = 0.70               # 70% train, resto se divide 15/15
MIN_SAMPLES_PER_CLASS = 10      # m√≠nimo por clase para estabilidad

INCLUDE_LABELS = True           # si queremos describir expl√≠citamente las variables
SEPARATOR = ", "                # ‚Üê concatenaci√≥n con coma

In [5]:
# ============================================================
# 4. DICCIONARIOS DESCRIPTIVOS
# ============================================================
# Se utilizan para traducir c√≥digos num√©ricos en descripciones legibles.
NIVEL_EDUCATIVO_MAP = {
    0: "sin nivel educativo",
    1: "educaci√≥n inicial",
    2: "primaria incompleta",
    3: "primaria completa",
    4: "secundaria incompleta",
    5: "secundaria completa",
    6: "superior no universitaria incompleta",
    7: "superior no universitaria completa",
    8: "superior universitaria incompleta",
    9: "superior universitaria completa",
    10: "posgrado",
    11: "maestr√≠a o doctorado"
}

DESEM_MAP = {
    0: "empleador o patrono",
    1: "trabajador independiente",
    2: "empleado",
    3: "trabajador familiar no remunerado"
}

In [6]:
# ============================================================
# 5. FUNCI√ìN PARA CONCATENAR VARIABLES TEXTUALES
# ============================================================
# Se crea una √∫nica cadena de texto con:
# ‚Äî descripci√≥n lematizada
# ‚Äî edad
# ‚Äî nivel educativo
# ‚Äî desempe√±o laboral
#
# Esto permite que TF-IDF procese TODA la informaci√≥n como texto.
def concatenate_features(texto, edad, nivel, desempeno, include_labels=True, sep=", "):
    texto = str(texto).strip()

    # ---- Manejo de NaN para evitar errores ----
    edad_desc = f"{int(edad)} a√±os" if not pd.isna(edad) else "edad desconocida"
    nivel_desc = NIVEL_EDUCATIVO_MAP.get(int(nivel), "educaci√≥n desconocida") if not pd.isna(nivel) else "educaci√≥n desconocida"
    desempeno_desc = DESEM_MAP.get(int(desempeno), "desempe√±o desconocido") if not pd.isna(desempeno) else "desempe√±o desconocido"

    # Construcci√≥n de los componentes
    components = [texto]

    if include_labels:
        # Versi√≥n con etiquetas expl√≠citas
        components.append(f"edad: {edad_desc}")
        components.append(f"educaci√≥n: {nivel_desc}")
        components.append(f"desempe√±o: {desempeno_desc}")
    else:
        # Versi√≥n sin etiquetas
        components.extend([edad_desc, nivel_desc, desempeno_desc])

    return sep.join(components)

In [7]:
# ============================================================
# 6. CARGA DE BASE
# ============================================================
df = pd.read_parquet(PATH_IN)

# Se elimina cualquier registro sin texto (no se puede vectorizar)
df = df[df[TEXT_COL].notna()]

# Se filtran las clases poco frecuentes (<10 obs), para evitar inestabilidad
counts = df[TARGET_COL].value_counts()
valid_classes = counts[counts >= MIN_SAMPLES_PER_CLASS].index
df = df[df[TARGET_COL].isin(valid_classes)]

print(f"‚úî Filas finales tras filtro m√≠nimo por clase: {len(df):,}")

‚úî Filas finales tras filtro m√≠nimo por clase: 315,625


In [8]:
# ============================================================
# 7. GENERACI√ìN DEL TEXTO CONCATENADO
# ============================================================
df["texto_concatenado"] = df.apply(
    lambda row: concatenate_features(
        texto=row[TEXT_COL],
        edad=row[EDAD_COL],
        nivel=row[NIVEL_COL],
        desempeno=row[DESEM_COL],
        include_labels=INCLUDE_LABELS,
        sep=SEPARATOR
    ),
    axis=1
)

In [9]:
# ============================================================
# 8. SPLIT EXACTO 70 / 15 / 15
# ============================================================
X = df["texto_concatenado"]
y = df[TARGET_COL]

# Primero: 70% train vs 30% temporal
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    train_size=TRAIN_SIZE,
    stratify=y,
    random_state=RANDOM_STATE
)

# Luego: del 30% ‚Üí 15% validaci√≥n y 15% prueba
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.50,         # 50% de X_temp ‚Üí 15% del total
    stratify=y_temp,
    random_state=RANDOM_STATE
)

print(f"Train: {len(X_train):,}")
print(f"Val:   {len(X_val):,}")
print(f"Test:  {len(X_test):,}")

Train: 220,937
Val:   47,344
Test:  47,344


In [10]:
# ============================================================
# 9. VECTORIZACI√ìN TF-IDF
# ============================================================
# TF-IDF captura importancia de t√©rminos y n-gramas, ideal para textos cortos.
tfidf = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1,3),
    min_df=5,
    max_df=0.85,
    sublinear_tf=True,
    token_pattern=r"(?u)\b\w+\b"
)

# Ajustamos con train (fit) y transformamos cada partici√≥n
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf   = tfidf.transform(X_val)
X_test_tfidf  = tfidf.transform(X_test)

# Guardamos vectorizador
joblib.dump(tfidf, f"{OUT}/tfidf_vectorizer.joblib")

['/content/drive/MyDrive/classification_coding_open_ended_occupational_responses_ENAHO/TF-IDF/MODELOS_CONCAT_FINALES/tfidf_vectorizer.joblib']

In [11]:
# ============================================================
# 10. ENCODE DEL TARGET
# ============================================================
# Necesario para modelos sklearn (categor√≠as ‚Üí n√∫meros)
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc   = le.transform(y_val)
y_test_enc  = le.transform(y_test)

joblib.dump(le, f"{OUT}/label_encoder.joblib")

['/content/drive/MyDrive/classification_coding_open_ended_occupational_responses_ENAHO/TF-IDF/MODELOS_CONCAT_FINALES/label_encoder.joblib']

In [12]:
# ============================================================
# 11. FUNCI√ìN PARA CALCULAR SUITE COMPLETA DE M√âTRICAS
# ============================================================
def metricas(y_true, y_pred, y_prob=None):
    loss_value = log_loss(y_true, y_prob) if y_prob is not None else None

    return {
        "Loss": loss_value,
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1_Macro": f1_score(y_true, y_pred, average="macro"),
        "F1_Micro": f1_score(y_true, y_pred, average="micro"),
        "F1_Weighted": f1_score(y_true, y_pred, average="weighted"),
        "Precision_Macro": precision_score(y_true, y_pred, average="macro"),
        "Precision_Micro": precision_score(y_true, y_pred, average="micro"),
        "Precision_Weighted": precision_score(y_true, y_pred, average="weighted"),
        "Recall_Macro": recall_score(y_true, y_pred, average="macro"),
        "Recall_Micro": recall_score(y_true, y_pred, average="micro"),
        "Recall_Weighted": recall_score(y_true, y_pred, average="weighted")
    }

In [13]:
# ============================================================
# 12. REGRESI√ìN LOG√çSTICA
# ============================================================
lr = LogisticRegression(
    max_iter=400,
    class_weight="balanced",
    n_jobs=-1
)

print("\nEntrenando Logistic Regression...")
lr.fit(X_train_tfidf, y_train_enc)

# Predicciones
pred_tr = lr.predict(X_train_tfidf)
pred_va = lr.predict(X_val_tfidf)
pred_te = lr.predict(X_test_tfidf)

# Probabilidades (solo para LR)
prob_tr = lr.predict_proba(X_train_tfidf)
prob_va = lr.predict_proba(X_val_tfidf)
prob_te = lr.predict_proba(X_test_tfidf)

# M√©tricas
metricas_lr = {
    "Train": metricas(y_train_enc, pred_tr, prob_tr),
    "Val":   metricas(y_val_enc,   pred_va, prob_va),
    "Test":  metricas(y_test_enc,  pred_te, prob_te),
}

# Guardar m√©tricas LR
with open(f"{OUT}/metricas/lr_concat.json", "w") as f:
    json.dump(metricas_lr, f, indent=4)

# Guardar modelo LR
joblib.dump(lr, f"{OUT}/lr_concat_model.joblib")


Entrenando Logistic Regression...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['/content/drive/MyDrive/classification_coding_open_ended_occupational_responses_ENAHO/TF-IDF/MODELOS_CONCAT_FINALES/lr_concat_model.joblib']

In [14]:
# ============================================================
# 13. SVM (LINEAR SVC)
# ============================================================
# LinearSVC = SVM lineal optimizado para texto (muy r√°pido y estable)
svm = LinearSVC(class_weight="balanced")

print("\nEntrenando Linear SVM...")
svm.fit(X_train_tfidf, y_train_enc)

# Predicciones
pred_tr = svm.predict(X_train_tfidf)
pred_va = svm.predict(X_val_tfidf)
pred_te = svm.predict(X_test_tfidf)

metricas_svm = {
    "Train": metricas(y_train_enc, pred_tr),
    "Val":   metricas(y_val_enc,   pred_va),
    "Test":  metricas(y_test_enc,  pred_te),
}

# Guardar m√©tricas SVM
with open(f"{OUT}/metricas/svm_concat.json", "w") as f:
    json.dump(metricas_svm, f, indent=4)

# Guardar modelo
joblib.dump(svm, f"{OUT}/svm_concat_model.joblib")


Entrenando Linear SVM...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['/content/drive/MyDrive/classification_coding_open_ended_occupational_responses_ENAHO/TF-IDF/MODELOS_CONCAT_FINALES/svm_concat_model.joblib']

In [15]:
# ============================================================
# 14. TABLAS DE M√âTRICAS (4 decimales)
# ============================================================
print("\nüìå TABLA DE M√âTRICAS LR\n")
df_lr = pd.DataFrame(metricas_lr).T.applymap(
    lambda x: round(x,4) if isinstance(x,(float,int)) else x
)
print(df_lr)

print("\nüìå TABLA DE M√âTRICAS SVM\n")
df_svm = pd.DataFrame(metricas_svm).T.applymap(
    lambda x: round(x,4) if isinstance(x,(float,int)) else x
)
print(df_svm)

print("\nüéâ MODELOS CONCATENADOS LISTOS Y GUARDADOS.")


üìå TABLA DE M√âTRICAS LR

         Loss  Accuracy  F1_Macro  F1_Micro  F1_Weighted  Precision_Macro  \
Train  0.6795    0.8917    0.7854    0.8917       0.8958           0.7032   
Val    0.7736    0.8629    0.5500    0.8629       0.8710           0.5133   
Test   0.7709    0.8634    0.5437    0.8634       0.8712           0.5077   

       Precision_Micro  Precision_Weighted  Recall_Macro  Recall_Micro  \
Train           0.8917              0.9218        0.9488        0.8917   
Val             0.8629              0.8981        0.6374        0.8629   
Test            0.8634              0.8993        0.6335        0.8634   

       Recall_Weighted  
Train           0.8917  
Val             0.8629  
Test            0.8634  

üìå TABLA DE M√âTRICAS SVM

       Loss  Accuracy  F1_Macro  F1_Micro  F1_Weighted  Precision_Macro  \
Train   NaN    0.9580    0.9254    0.9580       0.9570           0.8855   
Val     NaN    0.9055    0.5684    0.9055       0.9052           0.5503   
Test    Na

  df_lr = pd.DataFrame(metricas_lr).T.applymap(
  df_svm = pd.DataFrame(metricas_svm).T.applymap(


In [16]:
# ============================================================
# 15. EXPORTACI√ìN DEL TEST SET EN M√öLTIPLES FORMATOS
# ============================================================

print("\nüíæ Exportando conjunto de TEST en m√∫ltiples formatos...")

EXPORT_DIR = f"{OUT}/test_export"
os.makedirs(EXPORT_DIR, exist_ok=True)

# ----- Reconstruir DataFrame TEST -----
test_df_export = pd.DataFrame({
    "texto_concatenado": X_test.values,
    "texto_original_lematizado": df.loc[X_test.index, TEXT_COL].values,
    "edad": df.loc[X_test.index, EDAD_COL].values,
    "nivel_educativo": df.loc[X_test.index, NIVEL_COL].values,
    "desempeno": df.loc[X_test.index, DESEM_COL].values,
    "target_original": y_test.values,
    "target_encoded": y_test_enc
})

# Versi√≥n esencial (solo texto)
test_df_essential = test_df_export[["texto_concatenado"]].copy()

# METADATA
metadata = {
    "total_test_rows": len(test_df_export),
    "unique_classes_test": int(test_df_export["target_original"].nunique()),
    "label_encoder_mapping": {
        int(i): int(lbl) for i, lbl in enumerate(le.classes_)
    },
    "tfidf_vocab_size": int(len(tfidf.vocabulary_)),
    "train_size": len(X_train),
    "val_size": len(X_val),
    "test_size": len(X_test),
}

# EXPORTAR ARCHIVOS
test_df_export.to_parquet(f"{EXPORT_DIR}/test_data.parquet", index=False)
test_df_export.to_csv(f"{EXPORT_DIR}/test_data.csv", index=False, encoding="utf-8")
test_df_essential.to_parquet(f"{EXPORT_DIR}/test_data_essential.parquet", index=False)

with open(f"{EXPORT_DIR}/test_metadata.json", "w") as f:
    json.dump(metadata, f, indent=4)

print("\nüéâ Archivos del TEST exportados en:")
print(f"   {EXPORT_DIR}")


üíæ Exportando conjunto de TEST en m√∫ltiples formatos...

üéâ Archivos del TEST exportados en:
   /content/drive/MyDrive/classification_coding_open_ended_occupational_responses_ENAHO/TF-IDF/MODELOS_CONCAT_FINALES/test_export


In [17]:
# ============================================================
# 16. EXPORTACI√ìN DE PREDICCIONES (LR y SVM)
# ============================================================

print("\nüíæ Exportando predicciones de Logistic Regression y SVM...")

PRED_DIR = f"{OUT}/predicciones"
os.makedirs(PRED_DIR, exist_ok=True)

# Obtener predicciones (ya calculadas antes)
y_pred_lr = pred_te
y_prob_lr = prob_te  # prob LR

y_pred_svm = pred_te  # SVM no tiene probas

# ----- Construcci√≥n del DataFrame -----
df_pred = pd.DataFrame({
    "texto_concatenado": X_test.values,
    "texto_original_lematizado": df.loc[X_test.index, TEXT_COL].values,
    "true_label_original": y_test.values,
    "true_label_encoded": y_test_enc,

    "pred_lr_encoded": y_pred_lr,
    "pred_lr_original": le.inverse_transform(y_pred_lr),

    "pred_svm_encoded": y_pred_svm,
    "pred_svm_original": le.inverse_transform(y_pred_svm),
})

# Probabilidades LR
prob_df = pd.DataFrame(
    y_prob_lr,
    columns=[f"prob_class_{c}" for c in le.classes_]
)

df_pred_lr_prob = pd.concat([df_pred, prob_df], axis=1)

# ----- EXPORTAR -----
df_pred_lr_prob.to_parquet(f"{PRED_DIR}/predicciones_lr.parquet", index=False)
df_pred.to_parquet(f"{PRED_DIR}/predicciones_svm.parquet", index=False)

df_pred_lr_prob.to_csv(f"{PRED_DIR}/predicciones_completas.csv", index=False, encoding="utf-8")
df_pred_lr_prob.to_parquet(f"{PRED_DIR}/predicciones_completas.parquet", index=False)

print("\nüéâ Predicciones exportadas en:")
print(f"   {PRED_DIR}")


üíæ Exportando predicciones de Logistic Regression y SVM...

üéâ Predicciones exportadas en:
   /content/drive/MyDrive/classification_coding_open_ended_occupational_responses_ENAHO/TF-IDF/MODELOS_CONCAT_FINALES/predicciones
