# Clasificaci√≥n ocupacional ENAHO: Modelos TF-IDF + Regresi√≥n log√≠stica y SVM - Modelo Inicial

Este script entrena dos modelos basados en Machine Learning:
     1) Regresi√≥n Log√≠stica multinomial
     2) SVM lineal (LinearSVC). Ambos modelos utilizan TF-IDF como representaci√≥n vectorial del texto y se eval√∫an sobre un split estratificado 70-15-15.
El objetivo es construir l√≠neas base s√≥lidas para comparar posteriormente con modelos de deep learning (Transformers).

In [3]:
from google.colab import drive
drive.flush_and_unmount()        # Desmonta cualquier conexi√≥n previa
!rm -rf /content/drive           # Borra restos del montaje anterior
drive.mount('/content/drive')    # Vuelve a montar desde cero

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [4]:
# ============================================================
# IMPORTACI√ìN DE LIBRER√çAS
# ============================================================
import os, json
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, log_loss
)

In [5]:
# ============================================================
# DEFINICI√ìN DE RUTAS
# ============================================================
BASE = "/content/drive/MyDrive/classification_coding_open_ended_occupational_responses_ENAHO/TF-IDF"

PATH_IN = f"{BASE}/BASE_LEMATIZADA.parquet"
OUT = f"{BASE}/MODELOS_FINALES_TFIDF_ML"

# Crear carpetas para guardar modelos y m√©tricas
os.makedirs(OUT, exist_ok=True)
os.makedirs(f"{OUT}/metricas", exist_ok=True)

In [6]:
# ============================================================
# CONFIGURACI√ìN
# ============================================================
TEXT_COL = "texto_lematizado"   # Columna con texto procesado
TARGET_COL = "p505r4"           # C√≥digo CIUO 4-d√≠gitos

RANDOM_STATE = 2025
TRAIN_SIZE = 0.70
MIN_SAMPLES_PER_CLASS = 10

In [7]:
# ============================================================
# 1. CARGA Y LIMPIEZA DE DATOS
# ============================================================
print("\nüìå Cargando base...")

df = pd.read_parquet(PATH_IN)

# Eliminar textos faltantes
df = df[df[TEXT_COL].notna()]

# Mantener solo clases con m√≠nimo volumen (‚â•10 observaciones)
counts = df[TARGET_COL].value_counts()
valid_classes = counts[counts >= MIN_SAMPLES_PER_CLASS].index
df = df[df[TARGET_COL].isin(valid_classes)]

print(f"‚úî Filas finales: {len(df):,}")


üìå Cargando base...
‚úî Filas finales: 315,625


In [8]:
# ============================================================
# 2. SPLIT EXACTO 70% ‚Äì 15% ‚Äì 15%
# ============================================================
print("\nüìå Realizando split estratificado 70/15/15...")

X = df[TEXT_COL]
y = df[TARGET_COL]

# 70% entrenamiento
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    train_size=0.70,
    stratify=y,
    random_state=RANDOM_STATE
)

# Del 30% restante ‚Üí dividir en 50/50 = 15% y 15%
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=RANDOM_STATE
)

print(f"‚úî Train: {len(X_train):,}")
print(f"‚úî Val:   {len(X_val):,}")
print(f"‚úî Test:  {len(X_test):,}")


üìå Realizando split estratificado 70/15/15...
‚úî Train: 220,937
‚úî Val:   47,344
‚úî Test:  47,344


In [9]:
# ============================================================
# 3. TF-IDF VECTOR REPRESENTATION
# ============================================================
print("\nüìå Entrenando vectorizador TF-IDF...")

# TF-IDF con n-gramas (1,3) y m√°ximo 15 000 tokens
tfidf = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1,3),
    min_df=5,
    max_df=0.85,
    sublinear_tf=True,
    token_pattern=r"(?u)\b\w+\b"
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf   = tfidf.transform(X_val)
X_test_tfidf  = tfidf.transform(X_test)


üìå Entrenando vectorizador TF-IDF...


In [10]:
# ============================================================
# 4. ENCODING DE ETIQUETAS
# ============================================================
le = LabelEncoder()

y_train_enc = le.fit_transform(y_train)
y_val_enc   = le.transform(y_val)
y_test_enc  = le.transform(y_test)

In [11]:
# ============================================================
# 5. FUNCI√ìN DE M√âTRICAS
# ============================================================
def metricas(y_true, y_pred, y_prob=None):
    """
    Devuelve un diccionario con todas las m√©tricas usadas en clasificaci√≥n multiclase.
    Si el modelo no tiene predict_proba (como SVM), no se calcula la p√©rdida.
    """
    loss_value = log_loss(y_true, y_prob) if y_prob is not None else None

    return {
        "Loss": loss_value,
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1_Macro": f1_score(y_true, y_pred, average="macro"),
        "F1_Micro": f1_score(y_true, y_pred, average="micro"),
        "F1_Weighted": f1_score(y_true, y_pred, average="weighted"),
        "Precision_Macro": precision_score(y_true, y_pred, average="macro"),
        "Precision_Micro": precision_score(y_true, y_pred, average="micro"),
        "Precision_Weighted": precision_score(y_true, y_pred, average="weighted"),
        "Recall_Macro": recall_score(y_true, y_pred, average="macro"),
        "Recall_Micro": recall_score(y_true, y_pred, average="micro"),
        "Recall_Weighted": recall_score(y_true, y_pred, average="weighted")
    }

In [12]:
# ============================================================
# 6. REGRESI√ìN LOG√çSTICA
# ============================================================
print("\nüìå Entrenando Logistic Regression...")

lr = LogisticRegression(
    max_iter=400,
    class_weight="balanced",
    n_jobs=-1
)
lr.fit(X_train_tfidf, y_train_enc)

pred_tr = lr.predict(X_train_tfidf)
pred_va = lr.predict(X_val_tfidf)
pred_te = lr.predict(X_test_tfidf)

prob_tr = lr.predict_proba(X_train_tfidf)
prob_va = lr.predict_proba(X_val_tfidf)
prob_te = lr.predict_proba(X_test_tfidf)

metricas_lr = {
    "Train": metricas(y_train_enc, pred_tr, prob_tr),
    "Val":   metricas(y_val_enc, pred_va, prob_va),
    "Test":  metricas(y_test_enc, pred_te, prob_te),
}

with open(f"{OUT}/metricas/lr.json","w") as f:
    json.dump(metricas_lr, f, indent=4)


üìå Entrenando Logistic Regression...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
# ============================================================
# TABLA DE M√âTRICAS ‚Äî REGRESI√ìN LOG√çSTICA
# ============================================================

print("\nüìå TABLA ‚Äî M√©tricas Logistic Regression\n")

import pandas as pd

df_lr_table = pd.DataFrame(metricas_lr).T  # Transponer: filas = Train/Val/Test
df_lr_table = df_lr_table.applymap(
    lambda x: round(x, 4) if isinstance(x, (float, int)) else x
)

print(df_lr_table)


üìå TABLA ‚Äî M√©tricas Logistic Regression

         Loss  Accuracy  F1_Macro  F1_Micro  F1_Weighted  Precision_Macro  \
Train  0.7184    0.8735    0.7616    0.8735       0.8796           0.6789   
Val    0.8103    0.8469    0.5555    0.8469       0.8568           0.5199   
Test   0.8094    0.8454    0.5325    0.8454       0.8559           0.4983   

       Precision_Micro  Precision_Weighted  Recall_Macro  Recall_Micro  \
Train           0.8735              0.9118        0.9403        0.8735   
Val             0.8469              0.8896        0.6472        0.8469   
Test            0.8454              0.8900        0.6311        0.8454   

       Recall_Weighted  
Train           0.8735  
Val             0.8469  
Test            0.8454  


  df_lr_table = df_lr_table.applymap(


In [14]:
# ============================================================
# 7. SVM LINEAL
# ============================================================
print("\nüìå Entrenando Linear SVM...")

svm = LinearSVC(class_weight="balanced")
svm.fit(X_train_tfidf, y_train_enc)

pred_tr = svm.predict(X_train_tfidf)
pred_va = svm.predict(X_val_tfidf)
pred_te = svm.predict(X_test_tfidf)

metricas_svm = {
    "Train": metricas(y_train_enc, pred_tr),
    "Val":   metricas(y_val_enc, pred_va),
    "Test":  metricas(y_test_enc, pred_te),
}

with open(f"{OUT}/metricas/svm.json","w") as f:
    json.dump(metricas_svm, f, indent=4)


üìå Entrenando Linear SVM...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# ============================================================
# TABLA DE M√âTRICAS ‚Äî SVM (LinearSVC)
# ============================================================

print("\nüìå TABLA ‚Äî M√©tricas Linear SVM\n")

df_svm_table = pd.DataFrame(metricas_svm).T
df_svm_table = df_svm_table.applymap(
    lambda x: round(x, 4) if isinstance(x, (float, int)) else x
)

print(df_svm_table)


üìå TABLA ‚Äî M√©tricas Linear SVM

       Loss  Accuracy  F1_Macro  F1_Micro  F1_Weighted  Precision_Macro  \
Train   NaN    0.9525    0.9025    0.9525       0.9515           0.8563   
Val     NaN    0.8998    0.5700    0.8998       0.8993           0.5503   
Test    NaN    0.9011    0.5653    0.9011       0.9008           0.5450   

       Precision_Micro  Precision_Weighted  Recall_Macro  Recall_Micro  \
Train           0.9525              0.9562        0.9653        0.9525   
Val             0.8998              0.9064        0.6133        0.8998   
Test            0.9011              0.9087        0.6119        0.9011   

       Recall_Weighted  
Train           0.9525  
Val             0.8998  
Test            0.9011  


  df_svm_table = df_svm_table.applymap(


In [16]:
# ============================================================
# 8. GUARDAR MODELOS Y ARTEFACTOS
# ============================================================
print("\nüíæ Guardando modelos y artefactos...")

joblib.dump(tfidf, f"{OUT}/tfidf.joblib")
joblib.dump(le,    f"{OUT}/label_encoder.joblib")
joblib.dump(lr,    f"{OUT}/lr_model.joblib")
joblib.dump(svm,   f"{OUT}/svm_model.joblib")

print(f"\nüéâ ENTRENAMIENTO COMPLETO\nüìÅ Modelos guardados en:\n   {OUT}")


üíæ Guardando modelos y artefactos...

üéâ ENTRENAMIENTO COMPLETO
üìÅ Modelos guardados en:
   /content/drive/MyDrive/classification_coding_open_ended_occupational_responses_ENAHO/TF-IDF/MODELOS_FINALES_TFIDF_ML


In [18]:
# ============================================================
# 9. EXPORTACI√ìN DEL TEST SET EN M√öLTIPLES FORMATOS
# ============================================================

print("\nüíæ Exportando conjunto de TEST en m√∫ltiples formatos...")

# ----- Asegurar carpeta -----
EXPORT_DIR = f"{OUT}/test_export"
os.makedirs(EXPORT_DIR, exist_ok=True)

# ----- Reconstruir DataFrame TEST -----
test_df_export = pd.DataFrame({
    "texto_lematizado": X_test.values,
    "target_original": y_test.values,
    "target_encoded": y_test_enc
})

# Opcional: info m√≠nima (solo para inferencia)
test_df_essential = test_df_export[["texto_lematizado"]].copy()

# ----- METADATA -----
metadata = {
    "total_test_rows": len(test_df_export),
    "unique_clases_test": int(test_df_export["target_original"].nunique()),
    "label_encoder_mapping": {int(k): int(v) for k, v in enumerate(le.classes_)},
    "tfidf_vocab_size": int(len(tfidf.vocabulary_)),
    "train_size": len(X_train),
    "val_size": len(X_val),
    "test_size": len(X_test),
}

# ----- EXPORTAR -----
test_df_export.to_parquet(f"{EXPORT_DIR}/test_data.parquet", index=False)
test_df_export.to_csv(f"{EXPORT_DIR}/test_data.csv", index=False, encoding="utf-8")
test_df_essential.to_parquet(f"{EXPORT_DIR}/test_data_essential.parquet", index=False)

with open(f"{EXPORT_DIR}/test_metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=4)

print("\nüéâ Archivos exportados en:")
print(f"   {EXPORT_DIR}")


üíæ Exportando conjunto de TEST en m√∫ltiples formatos...

üéâ Archivos exportados en:
   /content/drive/MyDrive/classification_coding_open_ended_occupational_responses_ENAHO/TF-IDF/MODELOS_FINALES_TFIDF_ML/test_export


In [19]:
# ============================================================
# 10. EXPORTACI√ìN DE PREDICCIONES (LR y SVM)
# ============================================================

print("\nüíæ Exportando predicciones de Logistic Regression y SVM...")

# ----- Asegurar carpeta -----
PRED_DIR = f"{OUT}/predicciones"
os.makedirs(PRED_DIR, exist_ok=True)

# ----- Obtener predicciones (ya calculadas antes) -----
# Logistic Regression
y_pred_lr = pred_te
y_prob_lr = prob_te  # probabilidades

# SVM
y_pred_svm = pred_te  # ya calculado arriba


# ============================================================
# üß± Construir DataFrames con predicciones
# ============================================================

# ----- BASE GENERAL -----
df_pred = pd.DataFrame({
    "texto_lematizado": X_test.values,
    "true_label_original": y_test.values,
    "true_label_encoded": y_test_enc,

    "pred_lr_encoded": y_pred_lr,
    "pred_lr_original": le.inverse_transform(y_pred_lr),

    "pred_svm_encoded": y_pred_svm,
    "pred_svm_original": le.inverse_transform(y_pred_svm),
})

# ----- probabilidades LR -----
prob_df = pd.DataFrame(
    y_prob_lr,
    columns=[f"prob_class_{c}" for c in le.classes_]
)
df_pred_lr_prob = pd.concat([df_pred, prob_df], axis=1)


# ============================================================
# üì§ EXPORTAR ARCHIVOS
# ============================================================

# 1) Predicciones LR completas (con probabilidades)
df_pred_lr_prob.to_parquet(f"{PRED_DIR}/predicciones_lr.parquet", index=False)

# 2) Predicciones SVM (sin probabilidades)
df_pred.to_parquet(f"{PRED_DIR}/predicciones_svm.parquet", index=False)

# 3) Exportaci√≥n combinada en CSV
df_pred_lr_prob.to_csv(f"{PRED_DIR}/predicciones_completas.csv", index=False, encoding="utf-8")

# 4) Exportaci√≥n combinada en Parquet
df_pred_lr_prob.to_parquet(f"{PRED_DIR}/predicciones_completas.parquet", index=False)


print("\nüéâ Predicciones exportadas en:")
print(f"   {PRED_DIR}")


üíæ Exportando predicciones de Logistic Regression y SVM...

üéâ Predicciones exportadas en:
   /content/drive/MyDrive/classification_coding_open_ended_occupational_responses_ENAHO/TF-IDF/MODELOS_FINALES_TFIDF_ML/predicciones
