<a href="https://colab.research.google.com/github/catalinakarinip/mercadolabroal/blob/main/mercadolaboral.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Script 1: limpiar bases originales del INE

In [None]:
# ================= 0. MOUNT DRIVE ======================================
# Mount Google Drive at /content/drive (Colab only); the data paths defined
# further down in this cell all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# ================= 1. LIBRERÍAS ========================================
from pathlib import Path
import pandas as pd, re

# ================= 2. PATHS ============================================
# Project root on the mounted Drive; every other location hangs off it.
ROOT = Path("/content/drive/MyDrive/Data/Mercado_Laboral/Biobio")
ORIG_DIR = ROOT.joinpath("Datos_ENE_originales")
CLEAN_DIR = ROOT.joinpath("Datos_ENE_limpios")
# Create the output folder (and any missing parents) before anything is written.
CLEAN_DIR.mkdir(parents=True, exist_ok=True)

# Workbook holding the variable -> abbreviation mapping, and the sheet
# (selected by position) that is read from it.
MAP_FILE = ROOT.joinpath("resultados/variables_por_base.xlsx")
MAP_SHEET = 0

# ================= 3. ABBREVIATION DICTIONARY ==========================
# Read the first three columns of the mapping sheet, lower-case the column
# labels and discard rows with any missing cell.
df_map = pd.read_excel(MAP_FILE, sheet_name=MAP_SHEET, usecols="A:C")
df_map = df_map.rename(columns=str.lower).dropna()

# Normalise "base" to a bare lower-case name (no ".xlsx", no padding) and
# trim the other two text columns.
base_normalizada = df_map["base"].str.strip()
base_normalizada = base_normalizada.str.replace(".xlsx", "", regex=False)
df_map["base"] = base_normalizada.str.lower()
df_map["var_original"] = df_map["var_original"].str.strip()
df_map["abreviatura"] = df_map["abreviatura"].str.strip()

# Lookup table: (base, original variable name) -> abbreviation.
# On duplicate keys the last row wins.
ABBR = dict(
    zip(
        zip(df_map["base"], df_map["var_original"]),
        df_map["abreviatura"],
    )
)

# ================= 4. SHEETS TO KEEP ===================================
# Identifier columns that every cleaned sheet keeps, in output order.
ID_VARS = ["Año", "Trimestre"]

# Names of the worksheets that are processed; any other sheet is skipped.
HOJAS_OBJETIVO = {
    "AI", "AN", "AP", "AR", "AS", "AT",
    "BI", "CO",
    "LI", "LL", "LR",
    "MA", "ML",
    "NB", "RM", "TA", "VA",
}

# ================= 5. CONVERSIÓN NÚMEROS LATINOS =======================
import re
# Optional sign, 1-3 leading digits, zero or more dot-separated groups of
# three digits, and an optional decimal part introduced by a comma.
_PATRON_MILES = re.compile(r"^[+-]?\d{1,3}(\.\d{3})*(,\d+)?$")


def texto_a_numero(valor):
    """Convert a number written with Latin separators to ``float``.

    Text that looks like dot-grouped thousands with an optional decimal
    comma (``"1.234,5"``, ``"-12.000"``) has its dots removed and its comma
    turned into a decimal point. Any other text only has commas replaced by
    dots before being handed to ``float``.

    Surrounding whitespace is ignored, so ``" 1.234 "`` and ``"1.234"`` are
    interpreted the same way.

    NOTE(review): a value such as ``"8.123"`` is always read as the integer
    8123, never as 8.123 -- confirm that the source sheets never contain
    dot-decimal numbers with exactly three decimals.

    Args:
        valor: Cell value; anything ``pd.isna`` considers missing yields None.

    Returns:
        The parsed ``float``, or ``None`` when the value is missing or cannot
        be parsed as a number.
    """
    if pd.isna(valor):
        return None

    # Strip first: stray padding must not change which branch is taken.
    texto = str(valor).strip()
    if _PATRON_MILES.match(texto):
        texto = texto.replace(".", "").replace(",", ".")
    else:
        texto = texto.replace(",", ".")

    try:
        return float(texto)
    except ValueError:
        return None

# ================= 6. LIMPIADOR DE HOJAS ===============================
def limpiar_hoja(df: pd.DataFrame, base_alias: str) -> pd.DataFrame:
    """Reduce one raw sheet to its identifier columns plus mapped value columns.

    Every column whose label contains ``"Unnamed"`` is inspected together
    with the column immediately to its left. If ``ABBR`` has an entry for
    ``(base_alias, <left label>)``, the unnamed column is renamed to
    ``"<abbreviation>_<base_alias>"`` and kept. Otherwise it is dropped; this
    covers an unnamed first column and one whose left neighbour is an
    identifier column. Named columns that are not in ``ID_VARS`` are not part
    of the result.
    NOTE(review): presumably the sheets put each variable's header over one
    column and its values in the following unnamed column -- confirm.

    Rows in which both identifier columns are missing are discarded, and the
    kept value columns are converted with ``texto_a_numero``.

    Args:
        df: Raw sheet as read from Excel. It is not modified.
        base_alias: Lower-case base name used as the ``ABBR`` key and as the
            suffix of the output column names.

    Returns:
        A new DataFrame with the ``ID_VARS`` columns followed by the renamed
        value columns.

    Raises:
        KeyError: If one of the ``ID_VARS`` columns is absent from the sheet.
    """
    # Copy before touching the header so the caller's DataFrame stays intact;
    # str() guards against numeric header cells, which have no .strip().
    df = df.copy()
    df.columns = [str(nombre).strip() for nombre in df.columns]
    columnas = list(df.columns)

    renombres, columnas_a_borrar = {}, []
    # Pair each column with its left neighbour ("" for the first column).
    for anterior, col in zip(["", *columnas], columnas):
        if col in ID_VARS or "Unnamed" not in col:
            continue
        abbr = None
        if anterior and anterior not in ID_VARS:
            abbr = ABBR.get((base_alias, anterior))
        if abbr:
            renombres[col] = f"{abbr}_{base_alias}"
        else:
            columnas_a_borrar.append(col)

    df = df.drop(columns=columnas_a_borrar, errors="ignore").rename(columns=renombres)
    # Keep a row as long as at least one identifier is present.
    df = df[df[ID_VARS[0]].notna() | df[ID_VARS[1]].notna()].reset_index(drop=True)

    for col in renombres.values():
        df[col] = df[col].apply(texto_a_numero)

    return df[[*ID_VARS, *renombres.values()]]

# ================= 7. PROCESAR ARCHIVOS =================================
# Clean every source workbook into "<base>_limpia.xlsx" inside CLEAN_DIR.
for src in ORIG_DIR.glob("*.xlsx"):
    # The base alias is the file stem up to the first underscore, lower-cased.
    base_alias = src.stem.split("_")[0].lower()
    dest = CLEAN_DIR / f"{base_alias}_limpia.xlsx"

    # Skip sources whose cleaned copy is at least as recent as the source.
    if dest.exists() and dest.stat().st_mtime >= src.stat().st_mtime:
        print(f"↻  Ya limpio: {dest.name}")
        continue

    print(f"→ Procesando {src.name} (base={base_alias})")

    # Clean all target sheets first, so the output workbook is only created
    # when there is something to write. Saving an openpyxl workbook with no
    # sheets raises, which would abort the whole run.
    hojas_limpias = {}
    # The context manager closes the workbook handle; passing `xls` to
    # read_excel reuses the opened workbook instead of re-opening `src`.
    with pd.ExcelFile(src) as xls:
        for hoja in xls.sheet_names:
            if hoja not in HOJAS_OBJETIVO:
                continue
            try:
                raw = pd.read_excel(xls, sheet_name=hoja, header=5, dtype=str)
                limpio = limpiar_hoja(raw, base_alias)
            except Exception as e:
                # Best effort: report the sheet and carry on with the rest.
                print(f"⚠️ Error en hoja '{hoja}': {e}")
                continue
            if not limpio.empty:
                hojas_limpias[hoja] = limpio

    if not hojas_limpias:
        print(f"⚠️ Ninguna hoja válida guardada en {src.name}")
        continue

    with pd.ExcelWriter(dest, engine="openpyxl") as xlw:
        for hoja, limpio in hojas_limpias.items():
            try:
                limpio.to_excel(xlw, sheet_name=hoja, index=False)
            except Exception as e:
                print(f"⚠️ Error en hoja '{hoja}': {e}")

print("\n✅ Limpieza completada. Archivos en:", CLEAN_DIR)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
→ Procesando rama.xlsx (base=rama)
→ Procesando informalidadtasas.xlsx (base=informalidadtasas)
→ Procesando informalidadrama.xlsx (base=informalidadrama)
→ Procesando informalidadgrupo.xlsx (base=informalidadgrupo)
→ Procesando informalidadcategoria.xlsx (base=informalidadcategoria)
→ Procesando indicadoresprincipales.xlsx (base=indicadoresprincipales)
→ Procesando grupo.xlsx (base=grupo)
→ Procesando complementarios.xlsx (base=complementarios)
