# Consolidación de la data

## Configuración inicial

In [None]:
!pip install pyreadstat pyarrow fastparquet

Collecting pyreadstat
  Downloading pyreadstat-1.3.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.2 kB)
Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading pyreadstat-1.3.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (666 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m666.4/666.4 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fastparquet-2024.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyreadstat, fastparquet
Successfully installed fastparquet-2024.11.0 pyreadstat-1.3.1


In [None]:
import pandas as pd
import pyreadstat
import duckdb
import glob
import os
from pathlib import Path

In [None]:
# Mount Google Drive at /content/drive; the data paths defined below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder holding the raw .dta files read throughout this notebook.
# NOTE(review): interim_path (defined further down) is rooted under 'MyDrive/PEU - CD/...',
# while this path is rooted directly under 'MyDrive/...' — confirm both point to the same project.
raw_path = Path('/content/drive/MyDrive/classification_coding_open_ended_occupational_responses_ENAHO/data/raw')  # presumably the local alternative: '../data/raw'

In [None]:
# Load the whole education extract (file name suggests module 300 / p301a) for a quick look.
# convert_categoricals=False keeps the stored numeric codes instead of their value labels.
preview_300 = pd.read_stata(raw_path / 'enaho01a-2020-2024-300-educacion_p301a.dta', convert_categoricals=False)

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  preview_300 = pd.read_stata(raw_path / 'enaho01a-2020-2024-300-educacion_p301a.dta', convert_categoricals=False)


In [None]:
# (rows, columns) of the education extract.
preview_300.shape

(550874, 8)

In [None]:
# Show every column name of the education extract on one line.
print(list(preview_300.columns))

['id_persona', 'anio', 'conglome', 'vivienda', 'hogar', 'codperso', 'p301a', 'p301a1']


In [None]:
# Load the whole module-500 extract (file name suggests it carries the open-text fields) for a quick look,
# again keeping the stored numeric codes rather than their value labels.
preview_500 = pd.read_stata(raw_path / 'enaho01a_2020_2024_500_txt.dta', convert_categoricals=False)

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  preview_500 = pd.read_stata(raw_path / 'enaho01a_2020_2024_500_txt.dta', convert_categoricals=False)


In [None]:
# Show every column name of the module-500 extract on one line.
print(list(preview_500.columns))

['aÑo', 'mes', 'conglome', 'vivienda', 'hogar', 'codperso', 'ubigeo', 'dominio', 'estrato', 'codinfor', 'p500a', 'p500b', 'p500b1', 'p500c', 'p500d', 'p500d1', 'p500n', 'p500i', 'p501', 'p502', 'p503', 'p504', 'p5041', 'p5042', 'p5043', 'p5044', 'p5045', 'p5046', 'p5047', 'p5048', 'p5049', 'p50410', 'p50411', 'p504a', 'p504b', 'p505', 'p505r4', 'txt505', 'p505b', 'txt505b', 'p506', 'p506r4', 'txt506', 'p507', 'p508', 'p509', 'p510', 'p510a1', 'p510b', 'p5111', 'p5112', 'p5113', 'p5114', 'p5115', 'p5116', 'p5117', 'p5118', 'p5119', 'p51110', 'p51111', 'p51112', 'p511a', 'p512a', 'p512b', 'p513', 'p513a', 'p513b', 'p513c', 'p513d', 'p513e', 'p513f', 'p513g', 'p513t', 'p513a1', 'p513a2', 'p514', 'p5151', 'p5152', 'p5153', 'p5154', 'p5155', 'p5156', 'p5157', 'p5158', 'p5159', 'p51510', 'p51511', 'p516', 'p516r4', 'txt516', 'p517', 'p517a', 'p517b1', 'p517c', 'p517d1', 'p517d2', 'p518', 'p519', 'p520', 'p520a', 'p521', 'p521a', 'p521b', 'p521b1', 'p521c', 'p521d', 'p522a', 'p522b', 'p522c',

## Extraer etiquetas (lbl)

In [None]:
# Read only the metadata (metadataonly=True, no data rows) of each .dta file
# and collect the value labels of the variables of interest.


def _repair_mojibake(text):
    """Undo UTF-8 text that was decoded as latin-1.

    Example: 'bÃ¡sica especial' -> 'básica especial'.
    Non-string values are returned untouched, and so are strings that are not
    mojibake (plain ASCII, or genuine latin-1 text whose bytes are not valid
    UTF-8), because the round trip fails for them and the original is kept.
    """
    if not isinstance(text, str):
        return text
    try:
        return text.encode('latin1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return text


def _value_labels(meta, variables):
    """Return {variable: {code: label}} taken from `meta`, with labels repaired.

    Variables that have no value labels in `meta` map to an empty dict.
    """
    return {
        var: {
            code: _repair_mojibake(label)
            for code, label in meta.variable_value_labels.get(var, {}).items()
        }
        for var in variables
    }


# encoding='latin1' never fails to decode, but it garbles labels stored as UTF-8;
# _value_labels() repairs those labels after reading.
_, meta1 = pyreadstat.read_dta(raw_path / 'enaho01a_2020_2024_500_txt.dta',
                               metadataonly=True, encoding='latin1')
vars_1 = ['p207', 'p301a', 'p506r4', 'p507', 'p510', 'p505r4']
labels_1 = _value_labels(meta1, vars_1)

_, meta2 = pyreadstat.read_dta(raw_path / 'enaho01a-2020-2024-300-educacion_p301a.dta',
                               metadataonly=True, encoding='latin1')
vars_2 = ['p301a1']
labels_2 = _value_labels(meta2, vars_2)

In [None]:
# Single lookup {variable: {code: label}} combining the labels of both files.
labels_dict = dict(labels_1)
labels_dict.update(labels_2)
labels_dict.keys()

dict_keys(['p207', 'p301a', 'p506r4', 'p507', 'p510', 'p505r4', 'p301a1'])

In [None]:
# Dump every code/label pair, one variable per section, to eyeball the mappings.
separator = "-" * 30
for variable in labels_dict:
    print(f"Variable: {variable}")
    for value_code, value_label in labels_dict[variable].items():
        print(f"  {value_code}: {value_label}")
    print(separator)


Variable: p207
  1: hombre
  2: mujer
------------------------------
Variable: p301a
  1: sin nivel
  2: inicial
  3: primaria incompleta
  4: primaria completa
  5: secundaria incompleta
  6: secundaria completa
  7: superior no universitaria incompleta
  8: superior no universitaria completa
  9: superior universitaria incompleta
  10: superior universitaria completa
  11: maestria/doctorado
  12: bÃ¡sica especial
------------------------------
Variable: p506r4
  111: cultivo de cereales (excepto arroz), legumbres y semillas  oleaginosas
  112: cultivo de arroz
  113: cultivo de vegetales y melones, raÃ­ces y tubÃ©rculos
  114: cultivo de  caÃ±a de  azÃºcar
  115: cultivo de tabaco
  116: cultivo de fibras
  119: cultivo de otros productos  no perennes
  121: cultivo de uvas
  122: cultivo de frutas tropicales y subtropicales
  123: cultivo de frutas cÃ­tricas
  124: cultivo de frutas con hueso y con pepa
  125: cultivo de otras  frutas y nueces que crecen en Ã¡rboles y arbustos
  12

## Seleccionar columnas de interés

In [None]:
# Columns of interest from the module-500 file, assembled group by group.

person_key = ['conglome', 'vivienda', 'hogar', 'codperso']  # person key
open_text = [
    'txt505',   # p505 before encoding
    'txt505b',  # p505 tasks
    'txt506',
]
p504_items = [f'p504{item}' for item in range(1, 12)]  # p5041 ... p50411

cols1 = (
    ['aÑo']  # year
    + person_key
    + open_text
    + ['p208a']  # age
    + ['p501', 'p502', 'p503', 'p504']
    + p504_items
    + vars_1
)

df1 = pd.read_stata(
    raw_path / 'enaho01a_2020_2024_500_txt.dta',
    columns=cols1,
    convert_categoricals=False,
)

In [None]:
# Columns of interest from the education file: year, person key and the labelled variable(s).

person_key_cols = ['conglome', 'vivienda', 'hogar', 'codperso']  # person key
cols2 = ['anio', *person_key_cols, *vars_2]  # 'anio' = year

df2 = pd.read_stata(
    raw_path / 'enaho01a-2020-2024-300-educacion_p301a.dta',
    columns=cols2,
    convert_categoricals=False,
)

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  df2 = pd.read_stata(raw_path / 'enaho01a-2020-2024-300-educacion_p301a.dta', columns=cols2, convert_categoricals=False)


## Agregando las etiquetas (lbl) como nuevas variables

In [None]:
# Give the year column the same name ('anio') that df2 uses, so both frames share their key names.
df1 = df1.rename(columns={'aÑo': 'anio'})
df1.head(3)

Unnamed: 0,anio,conglome,vivienda,hogar,codperso,txt505,txt505b,p208a,p501,p502,...,p5048,p5049,p50410,p50411,p207,p301a,p506r4,p507,p510,p505r4
0,2020,5002,11,11,1,TEJEDORA DE CHOMPA DE LANA EN MAQUINA DE TEJER,TEJER CHOMPA DE LANA EN MAQUINA DE TEJER,66,1.0,,...,,,,,2,4.0,1430.0,2.0,,7351.0
1,2020,5002,11,11,2,,,43,2.0,2.0,...,2.0,2.0,2.0,2.0,1,3.0,,,,
2,2020,5002,11,11,3,TECNICO ADMINISTRATIVO,DIGITAR DOCUMENTO MANEJAR SISTEMA,47,1.0,,...,,,,,2,8.0,8610.0,3.0,2.0,4419.0


In [None]:
# Peek at the first rows of the education frame.
df2.head(3)

In [None]:
# For each labelled variable, add a '<var>_label' column to the frame that holds it.
# df1 takes precedence; df2 is used only when the variable is absent from df1.
for var, mapping in labels_dict.items():
    target = df1 if var in df1.columns else (df2 if var in df2.columns else None)
    if target is None:
        continue
    target[f'{var}_label'] = target[var].map(mapping)

## Convertir archivo .dta a .parquet

In [None]:
# Folder where the intermediate parquet files of this notebook are written and re-read.
# NOTE(review): this path includes 'PEU - CD', unlike raw_path defined earlier — confirm both are intended.
interim_path = Path('/content/drive/MyDrive/PEU - CD/classification_coding_open_ended_occupational_responses_ENAHO/data/interim') # presumably the local alternative: '../data/interim'

In [None]:
# Persist both labelled frames as snappy-compressed parquet files.
for frame, filename in ((df1, 'enaho.parquet'), (df2, 'enaho_300.parquet')):
    frame.to_parquet(interim_path / filename, compression='snappy')

In [None]:
# Read the parquet files back to check them; df1/df2 now hold the reloaded copies.

df1, df2 = (
    pd.read_parquet(interim_path / filename)
    for filename in ('enaho.parquet', 'enaho_300.parquet')
)

In [None]:
# (rows, columns) of each reloaded frame, df1 first.
for reloaded in (df1, df2):
    print(reloaded.shape)

In [None]:
# Column names of both reloaded frames.
print("Columnas df1:", df1.columns.tolist())
print("Columnas df2:", df2.columns.tolist())

In [None]:
# Plain-text view of the first two rows of df1.
print(df1.head(2))

In [None]:
# Plain-text view of the first two rows of df2.
print(df2.head(2))

## Merge de los parquets

In [None]:
# Left join: keep every df1 row and attach p301a1 (code and label) from df2
# wherever year + person key match.
merge_keys = ['anio', 'conglome', 'vivienda', 'hogar', 'codperso']
education_cols = merge_keys + ['p301a1', 'p301a1_label']

df_merged = pd.merge(
    left=df1,
    right=df2[education_cols],
    how='left',
    on=merge_keys,
)

In [None]:
# (rows, columns) of the merged frame.
print(df_merged.shape)

In [None]:
# Column names of the merged frame.
print(df_merged.columns.tolist())

In [None]:
# Distinct p506r4 labels present after the merge (includes NaN where the code was missing or unmapped).
print(df_merged['p506r4_label'].unique().tolist())

In [None]:
# Data type of every column in the merged frame.
print(df_merged.dtypes)

## Guardar df_merged como un archivo parquet

In [None]:
# Persist the merged frame next to the other intermediate parquet files.
df_merged.to_parquet(interim_path / 'enaho_merged.parquet', compression='snappy')

In [None]:
# Read the merged parquet back to check what was written.
preview_merged = pd.read_parquet(interim_path / 'enaho_merged.parquet')

In [None]:
# (rows, columns) of the reloaded merged file.
preview_merged.shape

In [None]:
# Column names of the reloaded merged file.
preview_merged.columns.tolist()

In [None]:
# preview_merged.head(50)

Unnamed: 0,anio,conglome,vivienda,hogar,codperso,txt505,txt505b,p208a,p501,p502,...,p510,p505r4,p207_label,p301a_label,p506r4_label,p507_label,p510_label,p505r4_label,p301a1,p301a1_label
0,2020,5002,11,11,1,TEJEDORA DE CHOMPA DE LANA EN MAQUINA DE TEJER,TEJER CHOMPA DE LANA EN MAQUINA DE TEJER,66,1.0,,...,,7351.0,mujer,primaria completa,fabricaciÃ³n de prendas de tejidos de punto y ...,trabajador independiente,,tejedores a mano y en telares,,
1,2020,5002,11,11,2,,,43,2.0,2.0,...,,,hombre,primaria incompleta,,,,,,
2,2020,5002,11,11,3,TECNICO ADMINISTRATIVO,DIGITAR DOCUMENTO MANEJAR SISTEMA,47,1.0,,...,2.0,4419.0,mujer,superior no universitaria completa,actividades de hospitales,empleado,administraciÃ³n pÃºblica,otro personal de apoyo administrativo,342025.0,contabilidad
3,2020,5002,23,11,1,CARPINTERO EN MADERA,CONFECCIONAR LAQUEAR PUERTA VENTANA CAMA DE MA...,62,1.0,,...,,7322.0,hombre,secundaria completa,fabricaciÃ³n de muebles,empleador o patrono,,ebanistas y afines,,
4,2020,5002,49,11,1,,,74,2.0,2.0,...,,,hombre,secundaria completa,,,,,,
5,2020,5002,49,11,2,,,73,2.0,2.0,...,,,mujer,primaria completa,,,,,,
6,2020,5002,49,11,3,PINTOR DE PARED,PINTAR PARED CON RODILLO,51,1.0,,...,,7128.0,hombre,secundaria completa,terminaciÃ³n y acabado de edificios,trabajador independiente,,pintores de brocha gorda,,
7,2020,5002,49,11,4,SUPERVISORA DE ENCUESTA,SUPERVISAR EJECUCION DE ENCUESTA REVISAR INFOR...,48,1.0,,...,2.0,4225.0,mujer,superior universitaria completa,actividades de la administraciÃ³n pÃºblica en ...,empleado,administraciÃ³n pÃºblica,entrevistadores de encuestas y de investigacio...,522016.0,industrias alimentarias
8,2020,5002,73,11,1,,,60,2.0,2.0,...,,,hombre,secundaria completa,,,,,,
9,2020,5002,73,11,2,,,58,2.0,2.0,...,,,mujer,primaria completa,,,,,,


# ---
---

In [None]:
# preview_hugo = pd.read_stata('/content/drive/MyDrive/PEU - CD/classification_coding_open_ended_occupational_responses_ENAHO/data/raw/BASE_ENAHO_2020_2024_PLN_FINAL.dta', convert_categoricals=False)

In [None]:
# `preview_hugo` is only created by the load cell above, which is commented out.
# This inspection is disabled with it so a fresh-kernel "Run All" does not stop on a NameError.
# preview_hugo.shape

In [None]:
# `preview_hugo` is only created by a load cell that is currently commented out.
# This inspection is disabled with it so a fresh-kernel "Run All" does not stop on a NameError.
# preview_hugo.head(2)

---

In [None]:
# import pandas as pd
# import pyreadstat
# import chardet
# from pathlib import Path

# # Ruta del archivo original
# file_path = Path('/content/drive/MyDrive/PEU - CD/classification_coding_open_ended_occupational_responses_ENAHO/data/raw/enaho01a_2020_2024_500_txt.dta')



In [None]:
# # --- 1. Detectar encoding automáticamente ---
# with open(file_path, 'rb') as f:
#     raw_data = f.read(100000)  # muestra inicial (100KB)
#     detected = chardet.detect(raw_data)
# encoding_detected = detected['encoding']
# print(f"Encoding detectado: {encoding_detected}")



In [None]:
# # --- 2. Intentar lectura robusta ---
# try:
#     df, meta = pyreadstat.read_dta(file_path, encoding=encoding_detected)
# except UnicodeDecodeError:
#     print("Error al leer con el encoding detectado. Intentando con 'latin1'...")
#     df, meta = pyreadstat.read_dta(file_path, encoding='latin1')
# except Exception as e:
#     print("Error general:", e)
#     raise

In [None]:
# # --- 3. Corrección de caracteres mal decodificados (si hubiera) ---
# def fix_encoding(col):
#     def try_fix(x):
#         if isinstance(x, str):
#             try:
#                 return x.encode('latin1').decode('utf-8')
#             except UnicodeEncodeError:
#                 return x
#             except UnicodeDecodeError:
#                 return x
#         return x
#     return col.apply(try_fix)

# for c in df.select_dtypes('object').columns:
#     df[c] = fix_encoding(df[c])

# # --- 4. Guardar en parquet con compresión snappy ---
# out_path = file_path.with_suffix('.parquet')
# df.to_parquet(out_path, compression='snappy')
# print(f"Archivo guardado correctamente en: {out_path}")