# ENIGH Database

@roman

26 June, 2024

Code to create an ETL of the Enigh database, which is a database of the National Institute of Statistics and Geography (INEGI) of Mexico. The database contains information on the income and expenses of Mexican households.

In [1]:
import pandas as pd
import geo
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm



In [2]:
# Settings
# show 100 columns in pandas
pd.set_option('display.max_columns', 500)

---
# Concentrado Hogar

The data diccionary is found [here](https://www.inegi.org.mx/rnm/index.php/catalog/685/data-dictionary/F28?file_name=concentradohogar)

## S1: Extract & Transform

In [3]:
# function to extract, transform and load each enigh file
def weighted_mean(df, cols, weight_col):
    return (df[cols].mul(df[weight_col], axis=0).sum() / df[weight_col].sum())


def weighted_mode(df, columns_to_mode, weighting_column):
    # Initialize a DataFrame to store the mode values
    mode_values = pd.DataFrame(columns=['Column', 'Weighted_Mode'])

    # Calculate weighted mode for each column in columns_to_mode
    for col in columns_to_mode:
        # Group by the column values and calculate weighted counts
        weighted_counts = df.groupby(col)[weighting_column].sum()

        # Find the index of maximum weighted count
        mode_index = weighted_counts.idxmax()

        # Get the mode value
        mode_value = df.loc[df[col] == mode_index, col].iloc[0]

        # Append mode value to mode_values DataFrame
        mode_values.loc[len(mode_values)] = [col, mode_value]

    # return a Series
    return mode_values.set_index('Column')['Weighted_Mode']


def get_statistics(df, cols_mode, cols_mean, weight_col):
    # s1: get weighted mode
    mode_values = weighted_mode(df, cols_mode, weight_col)

    # s2: get weighted mean
    mean_values = weighted_mean(df, cols_mean, weight_col)

    # s3: get total viviendas
    total_hogares = df[weight_col].sum()

    # return all the values in a pd series
    melt_values = pd.concat([mode_values, mean_values])
    melt_values['total_hogares'] = total_hogares

    return melt_values


def get_enigh_concentrado_hogar(file, cols_mode, cols_mean):
    # s1:read file
    df = pd.read_csv(file, na_values=[' '])
    
    # s2: subset and wrangle
    df = (
        df
        .query("foliohog == 1")
        .reset_index(drop=True)
        .assign(
            ubica_geo=lambda x: x["ubica_geo"].astype(str).str.zfill(5)
        )
    )

    # s3: get weighted mean
    table = (
        df
        .groupby("ubica_geo", as_index=False)
        .apply(
            lambda x: get_statistics(x, cols_mode, cols_mean, "factor"),
            include_groups=False
        )
    )
    
    return table


In [4]:
# get params
years_enigh = [2018, 2020, 2022]

cols_mode = [
    'est_dis', 'clase_hog', 'sexo_jefe',
]
cols_mean = [
    'tam_loc', 'est_socio',  'edad_jefe',
    'educa_jefe', 'tot_integ', 'hombres', 'mujeres',
    'mayores', 'menores', 'p12_64', 'p65mas', 'ocupados',
    'percep_ing', 'perc_ocupa', 'ing_cor', 'ingtrab',
    'trabajo', 'sueldos', 'horas_extr', 'comisiones',
    'aguinaldo', 'indemtrab', 'otra_rem', 'remu_espec',
    'negocio', 'noagrop', 'industria', 'comercio', 'servicios',
    'agrope', 'agricolas', 'pecuarios', 'reproducc', 'pesca',
    'otros_trab', 'rentas', 'utilidad', 'arrenda', 'transfer',
    'jubilacion', 'becas', 'donativos', 'remesas', 'bene_gob',
    'transf_hog', 'trans_inst', 'estim_alqu', 'otros_ing', 'gasto_mon',
    'alimentos', 'ali_dentro', 'cereales', 'carnes', 'pescado',
    'leche', 'huevo', 'aceites', 'tuberculo', 'verduras', 'frutas',
    'azucar', 'cafe', 'especias', 'otros_alim', 'bebidas', 'ali_fuera',
    'tabaco', 'vesti_calz', 'vestido', 'calzado', 'vivienda',
    'alquiler', 'pred_cons', 'agua', 'energia', 'limpieza', 'cuidados',
    'utensilios', 'enseres', 'salud', 'atenc_ambu', 'hospital',
    'medicinas', 'transporte', 'publico', 'foraneo', 'adqui_vehi',
    'mantenim', 'refaccion', 'combus', 'comunica', 'educa_espa',
    'educacion', 'esparci', 'paq_turist', 'personales', 'cuida_pers',
    'acces_pers', 'otros_gas', 'transf_gas', 'percep_tot', 'retiro_inv',
    'prestamos', 'otras_perc', 'ero_nm_viv', 'ero_nm_hog', 'erogac_tot',
    'cuota_viv', 'mater_serv', 'material', 'servicio', 'deposito',
    'prest_terc', 'pago_tarje', 'deudas', 'balance', 'otras_erog',
]

FILE_TEMPLATE = 'enigh{enigh_year}_ns_concentradohogar_csv.zip'
ENIGH_ROOT = "../../data/catalogues/adamuz_data/enigh_{enigh_year}/"

In [5]:
# get all tables
list_tables = []
for year in tqdm(years_enigh):
    # get file
    dir_file = ENIGH_ROOT.format(enigh_year=year)
    file = dir_file + FILE_TEMPLATE.format(enigh_year=year)

    # unzip
    os.system(f"unzip -o {file} -d {dir_file}")

    # get table
    table = get_enigh_concentrado_hogar(
        dir_file + 'concentradohogar.csv',
        cols_mode,
        cols_mean
        )

    # add year
    table["year"] = year

    # append
    list_tables.append(table)


  0%|          | 0/3 [00:00<?, ?it/s]

Archive:  ../../data/catalogues/adamuz_data/enigh_2018/enigh2018_ns_concentradohogar_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/concentradohogar.csv  
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/nota_bases_datos_enigh2018_ns.txt  


 33%|███▎      | 1/3 [00:09<00:18,  9.14s/it]

Archive:  ../../data/catalogues/adamuz_data/enigh_2020/enigh2020_ns_concentradohogar_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2020/concentradohogar.csv  


 67%|██████▋   | 2/3 [00:18<00:09,  9.24s/it]

Archive:  ../../data/catalogues/adamuz_data/enigh_2022/enigh2022_ns_concentradohogar_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2022/concentradohogar.csv  


100%|██████████| 3/3 [00:27<00:00,  9.12s/it]


In [6]:
# concat
df_concentradohogar_all = (
    pd.concat(list_tables)
    .sort_values(["ubica_geo", "year"], ignore_index=True)
    )
df_concentradohogar_all.shape

(3218, 123)

## S2: EDA

In [7]:
# see sample
df_concentradohogar_all.head()

Unnamed: 0,ubica_geo,est_dis,clase_hog,sexo_jefe,tam_loc,est_socio,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,total_hogares,year
0,1001,2.0,2.0,1.0,1.313242,2.830593,50.089514,6.479373,3.721097,1.780415,1.940683,2.999458,0.72164,2.691494,0.307964,1.803984,2.391824,1.774901,63956.301534,43070.626604,37697.35196,31202.010734,495.39457,1317.133005,2206.442086,252.169473,852.605979,1371.596112,3683.76857,3046.691033,415.123858,1268.234312,1363.332863,637.077537,0.783675,633.1576,3.136262,0.0,1689.506073,4540.471227,3606.090472,934.380755,9947.056421,5419.655226,159.891602,989.171367,295.72428,299.683788,1599.486996,1183.443163,6351.823899,46.323383,39119.615453,11954.944213,8940.815992,1409.190847,1791.346954,141.805814,1001.070483,256.830314,64.707445,116.593703,888.009195,411.459735,58.34193,74.699891,70.812195,1791.369493,864.577994,2928.15608,85.972141,1740.256643,1088.568451,651.688192,3114.482487,1146.30137,175.330648,644.507169,1148.3433,2429.241635,1727.998294,274.058488,427.184852,1639.883555,1227.447356,197.869469,214.566731,8381.883266,1171.234546,335.669626,1267.892256,3749.066387,355.85374,3393.212646,1858.020452,5514.06328,3462.857938,1320.678289,730.527053,2968.421188,2140.535891,117.545545,710.339753,1376.439186,4509.766233,906.226031,512.698217,1570.168785,10.959857,1509.713343,8311.22751,1231.299092,479.004866,209.606931,269.397936,2451.678069,257.427359,2074.223392,787.065174,316.65008,713.879477,234164.0,2018
1,1001,2.0,2.0,1.0,1.191967,2.848673,50.307018,6.518359,3.511532,1.67006,1.841472,2.877479,0.634053,2.578268,0.299212,1.696833,2.29508,1.642091,60912.502414,39994.510208,36177.707945,30166.188042,278.128582,943.278159,2202.367386,323.549054,1224.366678,1039.830043,2677.040577,2616.953301,378.597045,837.298788,1401.057468,60.087276,2.420698,57.607004,0.059573,0.0,1139.761686,3339.710125,2725.802014,613.90811,10562.410583,6146.652668,104.825597,1051.749094,361.69834,898.020269,1472.286234,527.178381,6937.108693,78.762806,37361.636895,12577.730151,10739.532593,1660.287784,1981.896889,180.714402,1105.435781,366.78371,92.778479,130.16186,1079.879732,523.272579,81.819202,84.717259,85.751375,2324.198133,1041.835407,1720.38709,117.810468,1351.179617,878.251851,472.927765,3523.484832,1395.231161,184.045724,789.58759,1154.620356,2287.033608,1552.910254,271.907183,462.216171,1778.678539,1277.235951,174.258678,327.18391,8257.743571,929.733346,119.353027,1820.589293,3162.720259,342.765384,2819.954876,2225.347646,3700.139892,2536.653595,776.041132,387.445165,2749.710175,1969.424468,112.600106,667.685601,1135.93651,4979.077693,1640.427443,570.14062,1558.225213,42.339486,1167.944931,10520.616231,1834.717706,532.231593,265.539319,266.692273,4694.411677,129.864933,1556.153613,757.984491,463.801093,551.451125,267473.0,2020
2,1001,2.0,2.0,1.0,1.260126,2.843422,50.853007,6.667584,3.485117,1.630896,1.854221,2.844748,0.640369,2.536844,0.307904,1.688633,2.319125,1.663681,85273.9755,54618.627742,46768.736235,38819.779869,589.695894,1155.565731,2649.834336,264.17195,1727.459018,1562.229437,6391.428561,5888.420235,766.776615,2344.601644,2777.041975,503.008326,0.0,503.008326,0.0,0.0,1458.462947,6910.592874,5149.578103,1761.014771,14780.159263,7191.002235,108.277001,1569.589724,673.929321,1579.764943,2139.159845,1518.436194,8849.450081,115.14554,47870.29216,15429.203656,12588.366776,2001.663323,2403.058055,175.872044,1287.168127,468.580378,134.253503,186.934718,1206.943568,589.087442,95.506481,104.592349,95.286204,2631.073397,1208.347185,2707.329485,133.507395,1836.530237,1121.475143,715.055094,4335.68938,1732.631168,335.986748,842.009523,1425.061941,3344.99517,2252.075444,580.827866,512.091861,1720.222464,1223.844712,246.189409,250.188344,9865.144573,1220.680673,311.682692,1854.674742,4071.129026,399.856847,3671.272179,2406.977439,6212.240008,3870.336809,1398.845743,943.057456,3802.91017,2583.958871,155.036877,1063.914422,1323.356501,6767.566527,2160.745485,464.943617,2153.185749,327.203453,1661.488223,13991.693495,1978.273587,288.019917,145.017387,143.00253,5997.86469,156.027955,2643.188077,744.784574,457.460912,1726.073781,267713.0,2022
3,1002,10.0,2.0,1.0,3.502535,2.0,49.600994,4.982965,3.604238,1.983979,1.62026,2.760799,0.843439,2.500406,0.260393,1.721456,2.213243,1.621882,56859.42945,24675.146233,22639.608039,18628.511903,886.846242,0.0,1274.586249,223.188603,460.445814,1166.029227,1191.927595,762.882436,295.869884,236.925137,230.087414,429.045159,164.702685,264.342474,0.0,0.0,843.610599,23675.976623,23631.338923,44.6377,5451.38016,443.291597,172.418478,802.779294,1569.348306,624.306757,1420.621389,418.614339,3056.926434,0.0,21993.396234,8219.964496,6756.400139,1514.529793,1135.884369,40.070674,525.476606,283.662692,113.508012,157.457868,937.194774,351.052311,61.911357,16.976191,52.826062,900.659416,665.190014,1397.342622,66.221735,1495.82783,1033.302822,462.525008,1339.606533,104.80488,16.754183,205.183786,1012.863684,1680.446001,1177.354195,292.371691,210.720115,552.17986,365.701076,100.468288,86.010496,3832.240098,551.075067,305.880559,525.023184,1672.733856,124.451836,1548.28202,777.527433,2771.949981,2014.941112,563.664797,193.344071,1748.0378,1547.277353,71.133487,129.626959,353.143635,1629.274389,646.414495,139.68577,38.114105,0.0,805.060019,3019.599582,0.0,474.202007,281.918093,192.283914,832.381317,10.572054,63.432649,153.873705,1376.595836,108.542015,9862.0,2018
4,1002,11.0,2.0,1.0,4.0,2.0,51.105312,5.020217,4.385703,2.075317,2.310386,3.304836,1.080867,2.881739,0.423097,1.839984,2.478858,1.762685,42676.932291,28949.53069,23455.044724,20353.21711,289.514035,100.071924,1067.784121,44.078525,344.034133,1256.344874,4318.092303,2342.715008,772.794728,1331.904181,238.016099,1975.377295,725.629626,1249.747669,0.0,0.0,1176.393663,2627.893293,2461.790284,166.103009,7652.633981,1284.361472,76.407115,1160.62607,1561.976716,1818.070979,1634.229198,116.96243,3409.293809,37.580518,24535.163513,10057.882685,8938.334839,1917.737784,1633.17432,78.674667,729.702141,359.271783,149.51401,163.616924,1091.038517,265.520017,143.136826,57.307307,61.904314,1377.758239,909.97799,1115.317676,4.23017,1494.923783,1034.875998,460.047785,1404.263898,60.937721,45.485267,296.539376,1001.301534,1987.917123,1279.63468,241.193533,467.08891,1378.318923,862.202386,135.480009,380.636527,4183.672015,463.693001,13.089827,556.921541,1915.928905,110.701546,1805.227359,1234.038742,1400.986002,1074.667875,231.7884,94.529726,2049.032189,1764.058932,34.384234,250.589024,578.166895,3222.72366,741.477315,229.954656,823.146531,0.0,1428.145157,6509.449112,0.0,411.181687,284.149183,127.032504,3711.725764,105.886901,96.435736,477.849499,1453.620311,252.749214,7568.0,2020


In [8]:
# shape
df_concentradohogar_all.shape

(3218, 123)

In [9]:
# see empty values
df_concentradohogar_all.isnull().sum()[df_concentradohogar_all.isnull().sum() > 0]

Series([], dtype: int64)

In [10]:
# count # of ubica_geo
df_concentradohogar_all["ubica_geo"].value_counts().value_counts()

count
3    664
1    502
2    362
Name: count, dtype: int64

In [11]:
# describe
df_concentradohogar_all.describe()

Unnamed: 0,est_dis,clase_hog,sexo_jefe,tam_loc,est_socio,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,total_hogares,year
count,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0
mean,306.894655,2.024549,1.018334,3.126627,1.738642,51.777733,5.09802,3.626452,1.756526,1.869925,2.874127,0.752324,2.516746,0.357382,1.737205,2.361901,1.659605,42832.739771,27342.263322,21022.137858,18360.965488,142.262048,387.370905,918.352346,100.772285,375.463892,736.950893,5144.973552,3571.022166,965.558829,1307.249235,1298.214102,1573.951386,839.207739,648.441564,35.62828,50.673804,1175.151912,2560.332132,2155.66974,404.662392,8356.073,2834.563251,69.764822,1111.201224,842.251982,1682.114427,1294.96088,521.216414,4540.223875,33.847443,26978.456638,10722.503389,9178.216533,1815.776905,1985.935418,191.479814,810.754164,399.759513,179.801073,158.835174,1166.499288,364.515518,132.1625,100.298404,85.964536,1004.160293,782.273934,1500.58238,43.704476,1089.014242,663.350024,425.664218,2143.950353,492.10169,105.836001,233.313465,1312.699197,1745.702951,1326.950768,154.707186,264.044997,1090.296698,805.042192,113.984523,171.269983,5031.318885,1135.877534,138.677962,654.763285,1912.605288,190.388716,1722.216572,1189.394816,2289.39638,1641.266439,485.30151,162.828431,2108.788971,1643.044461,70.585569,395.158941,757.484768,3404.853373,887.634839,423.18689,563.459168,41.937477,1488.634998,5091.98567,281.405813,382.341654,246.206399,136.135255,2710.776539,81.868545,453.112205,378.826458,479.237772,324.416685,32945.253263,2020.084525
std,145.922871,0.170082,0.134178,0.94244,0.599341,4.652554,1.114301,0.55671,0.310097,0.320843,0.361939,0.301188,0.414995,0.163814,0.338901,0.363624,0.302988,22079.261206,13624.868297,12552.947856,10571.895894,273.596572,730.809214,1129.441988,492.834326,561.607693,1319.945559,5602.526469,3210.407989,1566.704211,2023.140395,1581.801258,4704.270603,3594.124805,2272.837941,411.593423,383.87096,1238.658581,10020.006994,9002.40879,1575.824722,4809.179743,3930.437381,157.433559,902.178519,1471.656998,1033.746098,874.195265,1051.108887,2641.8804,150.033695,11049.292672,3424.128936,2527.859401,534.205421,847.653631,212.697699,352.802879,158.582726,108.802514,86.709701,403.089733,227.235148,96.496406,76.533667,52.637713,751.658398,458.862095,1432.257307,72.741868,613.473681,415.077051,245.074229,1598.960558,935.987191,290.270275,225.581083,603.345665,856.83661,670.747769,151.344192,246.698419,1082.226831,906.164427,373.188263,179.876313,2801.542113,866.237,170.5982,1382.168033,1347.640313,208.86354,1233.632288,701.579133,1836.578691,1455.647583,406.859784,360.552993,954.593303,597.307267,77.137441,497.376622,941.693618,3724.412643,2114.884268,876.415986,1636.452589,249.778868,1945.518702,8723.316637,679.453627,686.206314,469.394427,329.565881,7722.201818,289.404396,1291.502099,567.986961,1018.181229,1069.963643,60317.776856,1.624437
min,2.0,1.0,1.0,1.0,1.0,35.578947,1.647059,1.823529,0.823529,0.5,1.666667,0.0,0.769231,0.0,0.461538,1.23518,0.461538,7025.1776,2048.4985,220.3885,220.3885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,295.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1011.8672,0.0,4695.6392,2536.177368,2350.776316,217.0236,0.0,0.0,0.0,0.0,0.0,0.0,23.444706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.03125,0.0,0.0,0.0,2.902609,262.213182,262.213182,0.0,0.0,0.0,0.0,0.0,0.0,60.941579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,216.839545,216.839545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,596.0,2018.0
25%,199.0,2.0,1.0,2.516317,1.0,48.70524,4.317911,3.274568,1.55544,1.662957,2.642101,0.553403,2.263158,0.242709,1.517938,2.127926,1.459978,29889.115003,18309.341903,12378.825616,11072.997178,0.0,0.0,254.390377,0.0,18.631791,117.596704,2607.497615,1640.168641,135.239157,299.38593,278.24515,58.22265,0.0,0.0,0.0,0.0,489.940704,33.089443,0.0,0.0,5258.913105,354.136115,0.0,524.537711,0.0,913.734682,702.612945,106.035562,3047.424902,0.0,19930.566513,8467.494607,7415.360074,1467.966364,1400.911154,53.783281,566.324939,292.161705,105.219149,99.397628,875.613971,202.036786,71.306652,49.629522,50.750658,463.588681,435.558929,605.231759,0.0,673.740754,388.667558,254.205029,1228.604431,0.0,11.246043,60.800604,963.778083,1216.694076,948.304753,56.428613,97.353886,474.079375,314.354991,0.0,61.907056,3174.615883,482.369617,30.184493,0.0,931.085995,53.65583,827.607753,686.007408,1096.973883,691.06425,241.458912,0.0,1469.541017,1234.771067,28.305885,84.401607,222.672356,1658.39046,176.992429,19.426037,3.269956,0.0,771.909047,1914.533984,0.0,46.078569,26.493325,0.0,556.193607,0.0,0.0,0.0,59.941534,0.0,7035.75,2018.0
50%,299.0,2.0,1.0,3.298769,2.0,51.568747,5.03311,3.567275,1.727273,1.841641,2.850307,0.713246,2.512498,0.341979,1.70365,2.333333,1.640999,39869.456376,25822.063305,19296.596791,17100.243792,38.408246,88.657252,679.474123,0.0,197.241986,427.0986,4079.346108,2878.009426,577.544267,813.918873,921.155182,583.50728,172.150524,116.41754,0.0,0.0,953.984703,723.200522,413.322825,95.01625,7512.118099,1731.967372,16.429932,929.998177,250.751114,1542.821228,1164.594194,275.595066,3929.271023,0.0,25549.982164,10372.444365,9040.695648,1781.743083,1919.664698,125.585412,797.670121,377.515579,155.110305,147.409917,1129.385507,327.783708,107.632078,83.322976,77.78425,823.610429,701.263688,1180.660302,12.928432,991.937261,591.09785,383.660923,1826.589343,177.345808,43.5625,177.5,1254.396328,1605.797474,1208.304826,117.449471,209.771111,834.467063,577.86723,15.14107,129.732027,4533.068571,965.510262,86.330372,170.914101,1649.844215,141.823022,1486.880278,1093.465833,1871.105318,1290.694348,407.026774,44.933181,1954.620217,1589.027351,55.219715,254.782758,523.730182,2738.933253,507.548775,195.652,104.097826,0.0,1245.280821,3695.403654,0.0,174.538422,110.093782,37.408616,1623.270476,14.26625,101.574398,196.410853,215.5177,85.736678,13650.0,2020.0
75%,433.0,2.0,1.0,4.0,2.0,54.665891,5.887943,3.930711,1.928207,2.048896,3.090699,0.9,2.764706,0.45,1.937673,2.565217,1.85,51979.013228,34557.327656,27780.953793,24155.541584,173.152685,518.174376,1317.720547,37.420984,517.649674,918.850418,6197.604471,4602.488804,1254.863713,1571.958105,1717.450784,1807.15946,779.243506,579.18145,0.0,0.0,1525.109045,2392.000718,1861.381553,417.75019,10432.399989,4123.632094,79.134144,1467.921669,1054.334789,2267.545107,1702.184372,611.378844,5228.351652,10.460019,32365.405007,12636.303887,10773.701152,2122.136031,2466.701461,257.142232,1033.432579,486.214317,228.173664,201.349359,1410.714451,490.002623,162.626603,131.388522,110.655061,1386.760253,1045.150099,1977.874937,59.590384,1387.516859,843.723982,545.374897,2684.149462,636.671948,110.329848,344.777561,1577.770019,2095.839645,1541.562586,204.361144,363.639966,1342.229632,982.265733,92.627692,225.810786,6356.25861,1603.51335,186.759146,809.768713,2626.332944,258.371423,2364.476373,1565.653336,2996.003392,2224.12916,619.744899,184.787935,2570.163068,1981.441259,92.875114,532.505918,1026.92875,4167.929956,1030.580097,532.864064,489.13,3.161105,1887.595572,6413.763542,255.850618,437.189863,269.091982,146.049614,3292.116281,82.922562,489.557663,524.609937,519.523964,344.537772,28826.75,2022.0
max,560.0,3.0,2.0,4.0,4.0,72.368421,9.344718,6.777778,3.5625,3.421053,5.05,2.578947,4.55,1.6,3.8125,4.5625,3.0,572613.797646,191517.9725,141050.832806,140536.798588,5366.189524,9456.521667,33697.789019,10961.427185,12808.664,49188.752981,174002.7075,43576.716522,33887.6745,43411.645652,22387.291304,174002.7075,151022.995,72633.545833,20192.037143,12165.959545,32688.403687,374277.694588,301291.03504,72986.659548,80796.708462,80440.880769,3124.470497,15119.818552,16958.1505,10200.305556,11380.839483,23116.2405,40926.941033,3409.628571,198564.803559,53320.775816,23501.893873,5018.0915,8206.536555,1818.936842,2869.491789,1433.564167,925.709545,1158.525699,3975.332,1647.132222,1029.046957,610.709,605.48134,5422.739256,3433.241818,29682.632442,676.392174,8465.745762,5639.592,3146.540559,31567.734613,15105.459945,11225.096908,2583.217391,5974.432267,13679.22329,11005.484394,2678.53188,4009.788333,16597.893553,12904.743478,11957.522178,2827.603889,53276.999366,6400.39625,2040.118824,36905.93249,12453.499177,3799.247273,11114.038562,7120.0,32959.123901,23815.482807,8998.59896,5576.086,14088.718844,7771.963133,1888.085714,8161.349412,15232.414611,93552.134937,82188.836819,23521.026087,27214.896111,7072.6105,89973.999679,383153.34156,9495.0,11683.630909,10083.959091,9356.410909,369157.307305,10824.630678,38326.31105,12466.388882,20326.75,40983.606111,586990.0,2022.0


## S3: Explicit Nulls

In [12]:
# generate explicit nulls for ubica_geo and year
index_cols = ["ubica_geo", "year"]
new_index = pd.MultiIndex.from_product(
    [df_concentradohogar_all["ubica_geo"].unique(), df_concentradohogar_all["year"].unique()],
    names=index_cols
)

# reindex
df_concentradohogar_all = (
    df_concentradohogar_all
    .set_index(index_cols)
    .reindex(new_index)
    .reset_index()
    )

In [13]:
# count # of ubica_geo
df_concentradohogar_all["ubica_geo"].value_counts().value_counts()

count
3    1528
Name: count, dtype: int64

In [14]:
# see ubica geo 01004
df_concentradohogar_all.query("ubica_geo == '01004'")

Unnamed: 0,ubica_geo,year,est_dis,clase_hog,sexo_jefe,tam_loc,est_socio,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,total_hogares
9,1004,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10,1004,2020,11.0,2.0,1.0,4.0,2.0,53.095238,4.952381,4.666667,2.095238,2.571429,3.52381,1.142857,2.952381,0.571429,1.666667,3.095238,1.619048,50108.64,29205.945714,20554.909048,16689.658571,16.304286,0.0,1425.112381,0.0,863.847619,1559.98619,6453.678571,3989.44,0.0,58.229524,3931.210476,2464.238571,52.406667,2411.831905,0.0,0.0,2197.358095,0.0,0.0,0.0,17151.685238,6183.724286,58.229524,579.967619,3232.916667,2820.648571,2437.044762,1839.15381,3442.391429,308.617619,29510.594286,13492.470952,11801.460952,2150.163333,1509.784286,0.0,1468.144762,726.722857,230.506667,200.199524,1711.491905,512.132381,245.506667,91.223333,83.874762,1893.665238,978.045238,1678.765238,12.244762,815.899524,440.392381,375.507143,2110.618571,69.124286,229.52381,320.0,1491.970476,2548.0,2275.535238,253.831429,18.633333,405.97,376.81,10.993333,18.166667,5659.382381,745.705238,10.946667,1271.738571,2076.111429,71.505238,2004.60619,1554.880476,1202.63,493.482857,476.228095,232.919048,2283.779524,1708.030476,31.21,544.539048,991.843333,2058.328571,638.197619,698.756667,15.139048,0.0,706.235238,6272.042857,0.0,1411.489524,1276.396667,135.092857,4227.48,451.396667,0.0,0.0,0.0,181.676667,1155.0
11,1004,2022,11.0,2.0,1.0,4.0,2.0,55.555556,4.166667,4.5,2.111111,2.388889,3.333333,1.166667,2.833333,0.5,1.777778,2.666667,1.722222,42181.936667,32637.283333,26569.804444,23682.403333,288.609444,0.0,1494.448333,0.0,179.971111,924.372222,4776.420556,1205.441667,0.0,41.331111,1164.110556,3570.978889,972.763889,2598.215,0.0,0.0,1291.058333,0.0,0.0,0.0,6420.463333,0.0,132.596667,1272.037222,379.229444,2604.846111,1673.525,358.228889,3124.19,0.0,26342.328889,10408.444444,9682.019444,2974.238333,949.277778,117.856667,541.420556,427.138333,94.998889,210.711667,1291.761667,419.638889,47.142222,7.142778,14.285,1331.424444,1254.982222,610.711667,115.713333,847.595556,425.861111,421.734444,2041.126667,166.666667,29.375,423.333333,1421.751667,1961.920556,1568.127222,173.911667,219.881667,1284.121667,1086.679444,0.0,197.442222,4915.933889,1071.417778,70.765,0.0,2943.806667,208.862778,2734.943889,829.944444,1176.49,657.202778,329.07,190.217222,3333.295,2838.871111,2.825556,491.598333,373.401111,637.091111,0.0,0.0,0.0,0.0,637.091111,1127.683333,0.0,2.732222,2.732222,0.0,1124.951111,0.0,0.0,0.0,0.0,0.0,1494.0


In [15]:
# fill na with strategy ffill and then bfill by ubica_geo
df_concentradohogar_all = (
    df_concentradohogar_all
    .sort_values(["ubica_geo", "year"])
    .groupby("ubica_geo")
    .apply(lambda x: x.ffill().bfill())
    .reset_index(drop=True)
    )

  .apply(lambda x: x.ffill().bfill())


In [16]:
# see ubica geo 01004
df_concentradohogar_all.query("ubica_geo == '01004'")

Unnamed: 0,ubica_geo,year,est_dis,clase_hog,sexo_jefe,tam_loc,est_socio,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,total_hogares
9,1004,2018,11.0,2.0,1.0,4.0,2.0,53.095238,4.952381,4.666667,2.095238,2.571429,3.52381,1.142857,2.952381,0.571429,1.666667,3.095238,1.619048,50108.64,29205.945714,20554.909048,16689.658571,16.304286,0.0,1425.112381,0.0,863.847619,1559.98619,6453.678571,3989.44,0.0,58.229524,3931.210476,2464.238571,52.406667,2411.831905,0.0,0.0,2197.358095,0.0,0.0,0.0,17151.685238,6183.724286,58.229524,579.967619,3232.916667,2820.648571,2437.044762,1839.15381,3442.391429,308.617619,29510.594286,13492.470952,11801.460952,2150.163333,1509.784286,0.0,1468.144762,726.722857,230.506667,200.199524,1711.491905,512.132381,245.506667,91.223333,83.874762,1893.665238,978.045238,1678.765238,12.244762,815.899524,440.392381,375.507143,2110.618571,69.124286,229.52381,320.0,1491.970476,2548.0,2275.535238,253.831429,18.633333,405.97,376.81,10.993333,18.166667,5659.382381,745.705238,10.946667,1271.738571,2076.111429,71.505238,2004.60619,1554.880476,1202.63,493.482857,476.228095,232.919048,2283.779524,1708.030476,31.21,544.539048,991.843333,2058.328571,638.197619,698.756667,15.139048,0.0,706.235238,6272.042857,0.0,1411.489524,1276.396667,135.092857,4227.48,451.396667,0.0,0.0,0.0,181.676667,1155.0
10,1004,2020,11.0,2.0,1.0,4.0,2.0,53.095238,4.952381,4.666667,2.095238,2.571429,3.52381,1.142857,2.952381,0.571429,1.666667,3.095238,1.619048,50108.64,29205.945714,20554.909048,16689.658571,16.304286,0.0,1425.112381,0.0,863.847619,1559.98619,6453.678571,3989.44,0.0,58.229524,3931.210476,2464.238571,52.406667,2411.831905,0.0,0.0,2197.358095,0.0,0.0,0.0,17151.685238,6183.724286,58.229524,579.967619,3232.916667,2820.648571,2437.044762,1839.15381,3442.391429,308.617619,29510.594286,13492.470952,11801.460952,2150.163333,1509.784286,0.0,1468.144762,726.722857,230.506667,200.199524,1711.491905,512.132381,245.506667,91.223333,83.874762,1893.665238,978.045238,1678.765238,12.244762,815.899524,440.392381,375.507143,2110.618571,69.124286,229.52381,320.0,1491.970476,2548.0,2275.535238,253.831429,18.633333,405.97,376.81,10.993333,18.166667,5659.382381,745.705238,10.946667,1271.738571,2076.111429,71.505238,2004.60619,1554.880476,1202.63,493.482857,476.228095,232.919048,2283.779524,1708.030476,31.21,544.539048,991.843333,2058.328571,638.197619,698.756667,15.139048,0.0,706.235238,6272.042857,0.0,1411.489524,1276.396667,135.092857,4227.48,451.396667,0.0,0.0,0.0,181.676667,1155.0
11,1004,2022,11.0,2.0,1.0,4.0,2.0,55.555556,4.166667,4.5,2.111111,2.388889,3.333333,1.166667,2.833333,0.5,1.777778,2.666667,1.722222,42181.936667,32637.283333,26569.804444,23682.403333,288.609444,0.0,1494.448333,0.0,179.971111,924.372222,4776.420556,1205.441667,0.0,41.331111,1164.110556,3570.978889,972.763889,2598.215,0.0,0.0,1291.058333,0.0,0.0,0.0,6420.463333,0.0,132.596667,1272.037222,379.229444,2604.846111,1673.525,358.228889,3124.19,0.0,26342.328889,10408.444444,9682.019444,2974.238333,949.277778,117.856667,541.420556,427.138333,94.998889,210.711667,1291.761667,419.638889,47.142222,7.142778,14.285,1331.424444,1254.982222,610.711667,115.713333,847.595556,425.861111,421.734444,2041.126667,166.666667,29.375,423.333333,1421.751667,1961.920556,1568.127222,173.911667,219.881667,1284.121667,1086.679444,0.0,197.442222,4915.933889,1071.417778,70.765,0.0,2943.806667,208.862778,2734.943889,829.944444,1176.49,657.202778,329.07,190.217222,3333.295,2838.871111,2.825556,491.598333,373.401111,637.091111,0.0,0.0,0.0,0.0,637.091111,1127.683333,0.0,2.732222,2.732222,0.0,1124.951111,0.0,0.0,0.0,0.0,0.0,1494.0


## S4: Save

In [17]:
# see shape
df_concentradohogar_all.shape

(4584, 123)

In [18]:
# save
dir_save = "../../data/interim/"
df_concentradohogar_all.to_csv(dir_save + "concentrado_hogar_enigh.csv")

---
# Vivienda

INEGI's data diccionary is found [here](https://www.inegi.org.mx/rnm/index.php/catalog/685/data-dictionary/F13?file_name=viviendas)

Notes:
- A Hogar can have multiple viviendas

## S1: Extract & Transform

In [19]:
# function to extract, transform and load each enigh file
def weighted_mode(df, columns_to_mode, weighting_column):
    # Initialize a DataFrame to store the mode values
    mode_values = pd.DataFrame(columns=['Column', 'Weighted_Mode'])

    # Calculate weighted mode for each column in columns_to_mode
    for col in columns_to_mode:
        # Group by the column values and calculate weighted counts
        weighted_counts = df.groupby(col)[weighting_column].sum()

        # Find the index of maximum weighted count
        mode_index = weighted_counts.idxmax()

        # Get the mode value
        mode_value = df.loc[df[col] == mode_index, col].iloc[0]

        # Append mode value to mode_values DataFrame
        mode_values.loc[len(mode_values)] = [col, mode_value]

    # return a Series
    return mode_values.set_index('Column')['Weighted_Mode']


def weighted_mean(df, cols, weight_col):
    return (df[cols].mul(df[weight_col], axis=0).sum() / df[weight_col].sum())


def get_statistics(df, cols_mode, cols_mean, weight_col):
    # s1: get weighted mode
    mode_values = weighted_mode(df, cols_mode, weight_col)

    # s2: get weighted mean
    mean_values = weighted_mean(df, cols_mean, weight_col)

    # s3: get total viviendas
    total_viviendas = df[weight_col].sum()

    # return all the values in a pd series
    melt_values = pd.concat([mode_values, mean_values])
    melt_values['total_viviendas'] = total_viviendas

    return melt_values


def get_enigh_vivienda(file, cols_mode, cols_mean):
    # s1:read file
    df = pd.read_csv(file, na_values=[' '])

    # s2: wrangle
    yes_no_columns = [
        'cocina',
        'cocina_dor',
        'excusado',
        'uso_compar',
        'biodigest',
        'combustible',
        'estufa_chi',
        'lavadero',
        'fregadero',
        'regadera',
        'tinaco_azo',
        'cisterna',
        'pileta',
        'calent_sol',
        'calent_gas',
        'medidor_luz',
        'bomba_agua',
        'tanque_gas',
        'aire_acond',
        'calefacc',
        'pago_mesp',
        'viv_usada'
    ]
    # map 1 for 1 and 2 for 0 (yes and no)
    df[yes_no_columns] = df[yes_no_columns].replace({2: 0})
    # ubica_geo
    df["ubica_geo"] = df["ubica_geo"].astype(str).str.zfill(5)

    # s3: get statistics
    table = (
        df
        .groupby("ubica_geo", as_index=False)
        .apply(
            lambda x: get_statistics(x, cols_mode, cols_mean, "factor"),
            include_groups=False
        )
    )

    return table


In [20]:
# get params
years_enigh = [2018, 2020, 2022]
cols_mode = [
    'mat_pared', 'mat_techos', 'mat_pisos', 'disp_agua',
    'combustible', 'eli_basura', 'tenencia', 'tipo_adqui',
    'tipo_adqui', 'tipo_finan', 'escrituras', 'disp_elect',
    'tipo_viv'
]
cols_mean = [
    'regadera', 'pago_mesp', 'tinaco_azo',
    'lavadero', 'procaptar', 'tot_resid', 'antiguedad',
    'bano_comp', 'calent_sol', 'cocina',
    'aire_acond', 'tot_hom', 'cocina_dor', 'renta',
    'fregadero', 'focos_inca', 'sanit_agua',
    'uso_compar', 'medidor_luz', 'est_socio',
    'tot_muj', 'dotac_agua', 'bano_regad', 'estufa_chi',
    'tam_loc', 'tanque_gas', 'focos_ahor',
    'cisterna', 'cuart_dorm', 'drenaje', 'excusado', 'pileta',
    'num_cuarto', 'calent_gas', 'calefacc',
    'bano_excus', 'pago_viv',
    'bomba_agua', 'viv_usada', 'biodigest', 'tot_hog',
    'estim_pago'
]

FILE_TEMPLATE = 'enigh{enigh_year}_ns_viviendas_csv.zip'
ENIGH_ROOT = "../../data/catalogues/adamuz_data/enigh_{enigh_year}/"

In [21]:
# get all tables
list_tables = []
for year in tqdm(years_enigh):
    # get file
    dir_file = ENIGH_ROOT.format(enigh_year=year)
    file = dir_file + FILE_TEMPLATE.format(enigh_year=year)

    # unzip
    os.system(f"unzip -o {file} -d {dir_file}")

    # get table
    table = get_enigh_vivienda(dir_file + 'viviendas.csv', cols_mode, cols_mean)

    # add year
    table["year"] = year

    # append
    list_tables.append(table)

  0%|          | 0/3 [00:00<?, ?it/s]

Archive:  ../../data/catalogues/adamuz_data/enigh_2018/enigh2018_ns_viviendas_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/viviendas.csv  
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/nota_bases_datos_enigh2018_ns.txt  


  df = pd.read_csv(file, na_values=[' '])
 33%|███▎      | 1/3 [00:39<01:18, 39.23s/it]

Archive:  ../../data/catalogues/adamuz_data/enigh_2020/enigh2020_ns_viviendas_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2020/viviendas.csv  


  df = pd.read_csv(file, na_values=[' '])
 67%|██████▋   | 2/3 [01:08<00:33, 33.21s/it]

Archive:  ../../data/catalogues/adamuz_data/enigh_2022/enigh2022_ns_viviendas_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2022/viviendas.csv  


  df = pd.read_csv(file, na_values=[' '])
100%|██████████| 3/3 [01:37<00:00, 32.37s/it]


In [22]:
# concat
df_viviendas_all = (
    pd.concat(list_tables)
    .sort_values(["ubica_geo", "year"], ignore_index=True)
    )
df_viviendas_all

Unnamed: 0,ubica_geo,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_adqui.1,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas,year
0,01001,8.0,10.0,3.0,1.0,3.0,3.0,4.0,1.0,1.0,1,1.0,1,1,0.945602,0.362238,0.782853,0.895975,0.0,3.781875,19.145343,1.265395,0.263397,0.979702,0.015126,1.806226,0.023304,385.384175,0.882548,1.506320,1.069981,0.029129,0.972374,2.830593,1.975650,1.091257,0.010228,0.001665,1.313242,0.146555,7.696495,0.493936,2.460400,1.044960,0.996631,0.133018,4.529099,0.515284,0.010715,0.266219,442.032669,0.416913,0.166781,0.014665,1.019717,2165.050435,234164,2018
1,01001,8.0,10.0,3,1.0,3.0,3.0,4.0,1.0,1.0,1,1.0,1,1,0.962908,0.385572,0.805143,0.903710,0.0,3.555936,18.578944,1.301716,0.287558,0.988414,0.017228,1.693341,0.020858,484.117574,0.898580,1.067147,1.066227,0.022514,0.982148,2.848673,1.862595,1.177386,0.007960,0.003885,1.191967,0.152516,7.850508,0.475506,2.355871,1.009949,0.996949,0.098937,4.408034,0.492554,0.012072,0.198125,670.729401,0.391329,0.161022,0.010222,1.016633,2361.336658,267473,2020
2,01001,8.0,10.0,3,1.0,3,3.0,4.0,1.0,1.0,1,1.0,1,1,0.970420,0.386612,0.784273,0.905320,0.0,3.514820,20.415628,1.364741,0.350969,0.986751,0.027798,1.645770,0.018158,594.349359,0.916743,0.690979,1.054902,0.015830,0.972788,2.843422,1.869050,1.180679,0.005229,0.004247,1.260126,0.167579,8.915832,0.438922,2.393433,1.033536,0.998091,0.120005,4.433587,0.439448,0.017268,0.239865,711.650391,0.403757,0.179857,0.029685,1.010339,3013.785845,267713,2022
3,01002,8.0,10.0,2.0,1.0,3.0,3.0,4.0,3.0,3.0,5,1.0,1,1,0.858142,0.033360,0.825390,0.900527,0.0,3.604238,19.959440,0.925979,0.559116,0.966640,0.000000,1.983979,0.110221,35.956195,0.604847,2.070980,1.075745,0.064997,0.978503,2.000000,1.620260,1.121476,0.032752,0.000000,3.502535,0.193673,3.612148,0.108497,2.040256,1.031637,0.989252,0.097141,3.856520,0.383695,0.000000,0.225918,0.000000,0.066112,0.044109,0.010748,1.000000,1048.428311,9862,2018
4,01002,8.0,10.0,2,1.0,3.0,3.0,4.0,3.0,3.0,5,1.0,1,1,0.835095,0.026823,0.828621,0.833510,0.0,4.468816,22.767310,0.884514,0.607426,0.990618,0.008060,2.112183,0.018763,20.989693,0.569371,2.511364,1.166226,0.051401,0.951506,2.000000,2.356633,2.066332,0.021934,0.021010,4.000000,0.085359,4.049154,0.215777,2.379889,1.227669,0.979651,0.229783,4.209567,0.219212,0.008060,0.231501,0.000000,0.203356,0.029070,0.000000,1.027484,1159.791226,7568,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3213,32056,8.0,10.0,3.0,1.0,3.0,1.0,4.0,1.0,1.0,5,1.0,1,1,0.988622,0.307692,0.976436,0.669133,0.0,3.634024,20.698178,1.414077,0.417638,0.993963,0.000000,1.681708,0.000000,435.273334,0.899652,1.926805,1.035972,0.006065,0.993434,3.121463,1.952316,2.024788,0.011963,0.000000,1.000000,0.240840,7.018640,0.221616,2.378717,1.000000,1.000000,0.054389,4.806148,0.810238,0.000000,0.247016,363.107525,0.197357,0.084880,0.005648,1.010433,2008.997079,35945,2018
3214,32056,8.0,10.0,3,1.0,3.0,1.0,4.0,2.0,2.0,5,1.0,1,1,0.914939,0.214437,0.927101,0.696612,0.0,3.529814,21.250086,1.260574,0.583608,0.980378,0.014230,1.642572,0.019228,358.733813,0.821385,1.129967,1.119824,0.019942,0.968511,2.869344,1.887242,1.902826,0.018415,0.005072,1.304299,0.267098,7.814270,0.217613,2.318750,1.071003,0.994928,0.135900,4.486508,0.682087,0.014058,0.343468,295.051455,0.215742,0.058841,0.004481,1.021641,2243.818012,40618,2020
3215,32056,8.0,10.0,3.0,1.0,3.0,1.0,4.0,3.0,3.0,5,1.0,1,1,0.967891,0.236054,0.983913,0.725864,0.0,3.566169,19.698376,1.344131,0.664632,0.972469,0.018398,1.656065,0.007913,389.496458,0.895782,1.107357,1.050638,0.033569,0.981232,2.884752,1.910104,2.172752,0.012425,0.004752,1.270866,0.261253,8.239760,0.203749,2.521766,1.008153,0.990692,0.064763,4.555946,0.677144,0.018332,0.282747,249.263433,0.196403,0.070627,0.017068,1.032698,2658.267030,45875,2022
3216,32057,8.0,10.0,2,1.0,3.0,1.0,4.0,3.0,3.0,5,1.0,1,1,0.719911,0.000000,0.875070,0.812605,0.0,4.152593,23.318238,0.889905,0.402454,0.960067,0.000000,1.934969,0.065031,0.000000,0.387619,2.494590,1.280089,0.022532,0.957501,2.000000,2.217624,3.779810,0.000000,0.000000,3.000000,0.087563,3.455772,0.382487,2.254992,1.079866,0.957501,0.260123,4.019967,0.410151,0.000000,0.300056,0.000000,0.424986,0.042499,0.000000,1.062465,1562.699387,8965,2020


## S2: EDA

In [23]:
# see sample
df_viviendas_all.head()

Unnamed: 0,ubica_geo,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_adqui.1,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas,year
0,1001,8.0,10.0,3.0,1.0,3.0,3.0,4.0,1.0,1.0,1,1.0,1,1,0.945602,0.362238,0.782853,0.895975,0.0,3.781875,19.145343,1.265395,0.263397,0.979702,0.015126,1.806226,0.023304,385.384175,0.882548,1.50632,1.069981,0.029129,0.972374,2.830593,1.97565,1.091257,0.010228,0.001665,1.313242,0.146555,7.696495,0.493936,2.4604,1.04496,0.996631,0.133018,4.529099,0.515284,0.010715,0.266219,442.032669,0.416913,0.166781,0.014665,1.019717,2165.050435,234164,2018
1,1001,8.0,10.0,3.0,1.0,3.0,3.0,4.0,1.0,1.0,1,1.0,1,1,0.962908,0.385572,0.805143,0.90371,0.0,3.555936,18.578944,1.301716,0.287558,0.988414,0.017228,1.693341,0.020858,484.117574,0.89858,1.067147,1.066227,0.022514,0.982148,2.848673,1.862595,1.177386,0.00796,0.003885,1.191967,0.152516,7.850508,0.475506,2.355871,1.009949,0.996949,0.098937,4.408034,0.492554,0.012072,0.198125,670.729401,0.391329,0.161022,0.010222,1.016633,2361.336658,267473,2020
2,1001,8.0,10.0,3.0,1.0,3.0,3.0,4.0,1.0,1.0,1,1.0,1,1,0.97042,0.386612,0.784273,0.90532,0.0,3.51482,20.415628,1.364741,0.350969,0.986751,0.027798,1.64577,0.018158,594.349359,0.916743,0.690979,1.054902,0.01583,0.972788,2.843422,1.86905,1.180679,0.005229,0.004247,1.260126,0.167579,8.915832,0.438922,2.393433,1.033536,0.998091,0.120005,4.433587,0.439448,0.017268,0.239865,711.650391,0.403757,0.179857,0.029685,1.010339,3013.785845,267713,2022
3,1002,8.0,10.0,2.0,1.0,3.0,3.0,4.0,3.0,3.0,5,1.0,1,1,0.858142,0.03336,0.82539,0.900527,0.0,3.604238,19.95944,0.925979,0.559116,0.96664,0.0,1.983979,0.110221,35.956195,0.604847,2.07098,1.075745,0.064997,0.978503,2.0,1.62026,1.121476,0.032752,0.0,3.502535,0.193673,3.612148,0.108497,2.040256,1.031637,0.989252,0.097141,3.85652,0.383695,0.0,0.225918,0.0,0.066112,0.044109,0.010748,1.0,1048.428311,9862,2018
4,1002,8.0,10.0,2.0,1.0,3.0,3.0,4.0,3.0,3.0,5,1.0,1,1,0.835095,0.026823,0.828621,0.83351,0.0,4.468816,22.76731,0.884514,0.607426,0.990618,0.00806,2.112183,0.018763,20.989693,0.569371,2.511364,1.166226,0.051401,0.951506,2.0,2.356633,2.066332,0.021934,0.02101,4.0,0.085359,4.049154,0.215777,2.379889,1.227669,0.979651,0.229783,4.209567,0.219212,0.00806,0.231501,0.0,0.203356,0.02907,0.0,1.027484,1159.791226,7568,2020


In [24]:
# shape
df_viviendas_all.shape

(3218, 58)

In [25]:
# see empty values
df_viviendas_all.isnull().sum()[df_viviendas_all.isnull().sum() > 0]

Series([], dtype: int64)

In [26]:
# count # of ubica_geo
df_viviendas_all["ubica_geo"].value_counts().value_counts()

count
3    664
1    502
2    362
Name: count, dtype: int64

In [27]:
# describe
df_viviendas_all.describe()

Unnamed: 0,mat_pared,mat_techos,disp_agua,eli_basura,tenencia,tipo_adqui,tipo_adqui.1,escrituras,disp_elect,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas,year
count,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0
mean,7.872281,8.279366,1.60317,1.633002,3.943443,2.169049,2.169049,1.353014,1.006837,0.632663,0.116859,0.539838,0.833896,0.072518,3.680122,19.966795,0.717635,0.131332,0.907696,0.107184,1.78254,0.060955,168.136975,0.468555,1.124025,1.461249,0.073366,0.919543,1.738642,1.897582,1.527458,0.043776,0.085863,3.126627,0.078632,5.008371,0.152714,2.017363,1.704186,0.967612,0.446672,3.619212,0.27053,0.020671,0.479583,105.376335,0.223503,0.057935,0.0239,1.016293,1546.849737,32945.253263,2020.084525
std,0.428437,2.956521,1.409176,1.206857,0.373886,0.65259,0.65259,0.760988,0.15355,0.290137,0.137805,0.281555,0.204577,0.250589,0.578878,5.926552,0.388612,0.191552,0.110453,0.204979,0.319315,0.058519,321.772576,0.283027,0.73889,0.411315,0.071763,0.112973,0.599341,0.332447,0.764325,0.066133,0.157747,0.94244,0.103955,1.836917,0.174639,0.275016,0.771586,0.066174,0.261569,0.583695,0.249719,0.065202,0.275174,243.101552,0.20783,0.067028,0.047719,0.030076,899.843595,60317.776856,1.624437
min,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.823529,3.733333,0.0,0.0,0.1875,0.0,0.823529,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,1.0,0.5,0.0,0.0,0.0,1.0,0.0,0.032258,0.0,1.055556,1.0,0.142857,0.0,1.684211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,346.0,596.0,2018.0
25%,8.0,8.0,1.0,1.0,4.0,2.0,2.0,1.0,1.0,0.421113,0.0,0.3,0.773755,0.0,3.315789,15.98703,0.425251,0.0,0.875,0.0,1.578024,0.020557,0.0,0.227002,0.565231,1.15,0.023877,0.898095,1.0,1.686838,1.000501,0.0,0.0,2.516317,0.0,3.782784,0.029041,1.833333,1.086957,0.956431,0.225509,3.244407,0.045455,0.0,0.252286,0.0,0.052632,0.0,0.0,1.0,1039.152261,7035.75,2018.0
50%,8.0,10.0,1.0,1.0,4.0,2.0,2.0,1.0,1.0,0.697116,0.063959,0.580633,0.918696,0.0,3.619048,19.370338,0.765071,0.029813,0.947368,0.0,1.750978,0.048585,60.0,0.462269,0.992645,1.352941,0.052632,0.954164,2.0,1.869565,1.262893,0.021841,0.023276,3.298769,0.045455,4.947368,0.092385,2.017716,1.423005,0.997983,0.437706,3.631579,0.210526,0.0,0.445595,0.0,0.166667,0.043478,0.0,1.0,1340.976532,13650.0,2020.0
75%,8.0,10.0,2.0,1.0,4.0,3.0,3.0,1.0,1.0,0.878542,0.190476,0.786549,0.970587,0.0,3.991653,23.232315,1.018641,0.203478,0.984959,0.095625,1.954545,0.086957,221.486058,0.7,1.565217,1.666667,0.10095,0.992186,2.0,2.076226,1.904762,0.055767,0.094195,4.0,0.111713,6.126788,0.217391,2.19597,2.110828,1.0,0.652174,4.0,0.450649,0.003781,0.69367,99.310637,0.339218,0.090909,0.027487,1.023641,1782.648405,28826.75,2022.0
max,8.0,10.0,7.0,7.0,5.0,4.0,4.0,3.0,5.0,1.0,1.0,1.0,1.0,1.0,7.0,50.809524,2.375818,0.95,1.0,1.0,3.6875,0.535294,5180.798358,1.0,4.608696,3.0,0.6,1.0,4.0,3.5,4.809524,1.102098,1.0,4.0,0.8,21.994528,1.0,3.111687,5.0,1.0,1.0,6.217034,1.0,0.785714,1.4,3415.0,1.0,0.478261,0.545455,1.5,13853.746046,586990.0,2022.0


## S3: Explicit Nulls

In [28]:
# generate explicit nulls for ubica_geo and year
index_cols = ["ubica_geo", "year"]
new_index = pd.MultiIndex.from_product(
    [df_viviendas_all["ubica_geo"].unique(), df_viviendas_all["year"].unique()],
    names=index_cols
)

# reindex
df_viviendas_all = (
    df_viviendas_all
    .set_index(index_cols)
    .reindex(new_index)
    .reset_index()
    )

In [29]:
# count # of ubica_geo
df_viviendas_all["ubica_geo"].value_counts().value_counts()

count
3    1528
Name: count, dtype: int64

In [45]:
# remove duplicated columns
df_viviendas_all = df_viviendas_all.loc[:, ~df_viviendas_all.columns.duplicated()]


In [46]:
# see ubica geo 01004
df_viviendas_all.query("ubica_geo == '01004'")

Unnamed: 0,ubica_geo,year,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas
9,1004,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10,1004,2020,8.0,10.0,2.0,1.0,3.0,3.0,4.0,3.0,5.0,1.0,1.0,1.0,0.904762,0.047619,0.809524,0.952381,0.0,4.666667,28.666667,1.0,0.333333,0.904762,0.0,2.095238,0.0,23.809524,0.666667,1.571429,1.190476,0.047619,0.952381,2.0,2.571429,1.142857,0.047619,0.0,4.0,0.047619,5.047619,0.333333,2.666667,1.0,1.0,0.238095,4.428571,0.285714,0.0,0.142857,0.0,0.619048,0.0,0.0,1.0,1185.714286,1155.0
11,1004,2022,8.0,10.0,2.0,1.0,3.0,3.0,4.0,3.0,5.0,1.0,1.0,1.0,0.833333,0.055556,0.666667,0.666667,0.0,4.5,15.277778,0.833333,0.222222,0.944444,0.0,2.111111,0.055556,55.555556,0.611111,0.555556,1.277778,0.111111,0.888889,2.0,2.388889,1.0,0.0,0.0,4.0,0.0,4.333333,0.111111,2.111111,1.277778,1.0,0.222222,3.611111,0.166667,0.055556,0.333333,0.0,0.111111,0.0,0.0,1.0,1072.222222,1494.0


In [47]:
# fill na with strategy ffill and then bfill by ubica_geo
df_viviendas_all = (
    df_viviendas_all
    .sort_values(["ubica_geo", "year"])
    .groupby("ubica_geo")
    .apply(lambda x: x.ffill().bfill())
    .reset_index(drop=True)
    )

  .apply(lambda x: x.ffill().bfill())
  .apply(lambda x: x.ffill().bfill())


In [48]:
# see ubica geo 01004
df_viviendas_all.query("ubica_geo == '01004'")

Unnamed: 0,ubica_geo,year,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas
9,1004,2018,8.0,10.0,2,1.0,3.0,3.0,4.0,3.0,5,1.0,1.0,1,0.904762,0.047619,0.809524,0.952381,0.0,4.666667,28.666667,1.0,0.333333,0.904762,0.0,2.095238,0.0,23.809524,0.666667,1.571429,1.190476,0.047619,0.952381,2.0,2.571429,1.142857,0.047619,0.0,4.0,0.047619,5.047619,0.333333,2.666667,1.0,1.0,0.238095,4.428571,0.285714,0.0,0.142857,0.0,0.619048,0.0,0.0,1.0,1185.714286,1155.0
10,1004,2020,8.0,10.0,2,1.0,3.0,3.0,4.0,3.0,5,1.0,1.0,1,0.904762,0.047619,0.809524,0.952381,0.0,4.666667,28.666667,1.0,0.333333,0.904762,0.0,2.095238,0.0,23.809524,0.666667,1.571429,1.190476,0.047619,0.952381,2.0,2.571429,1.142857,0.047619,0.0,4.0,0.047619,5.047619,0.333333,2.666667,1.0,1.0,0.238095,4.428571,0.285714,0.0,0.142857,0.0,0.619048,0.0,0.0,1.0,1185.714286,1155.0
11,1004,2022,8.0,10.0,2,1.0,3.0,3.0,4.0,3.0,5,1.0,1.0,1,0.833333,0.055556,0.666667,0.666667,0.0,4.5,15.277778,0.833333,0.222222,0.944444,0.0,2.111111,0.055556,55.555556,0.611111,0.555556,1.277778,0.111111,0.888889,2.0,2.388889,1.0,0.0,0.0,4.0,0.0,4.333333,0.111111,2.111111,1.277778,1.0,0.222222,3.611111,0.166667,0.055556,0.333333,0.0,0.111111,0.0,0.0,1.0,1072.222222,1494.0


## S4: Save

In [49]:
# see shape
df_viviendas_all.shape

(4584, 57)

In [50]:
# save
dir_save = "../../data/interim/"
df_viviendas_all.to_csv(dir_save + "viviendas_enigh.csv")

---
# Join with Properties

## S1: Load & Transform

In [55]:
# read parquet
df_properties = (
    pd.read_parquet("../../data/interim/cleaned_data_s4.parquet")
    .assign(
        state_id=lambda x: x['state_id'].astype(str).str.zfill(2),
        municipality_id=lambda x: x['municipality_id'].astype(str).str.zfill(3)
    )
)   


# get original columns
original_columns = df_properties.columns

# see num of rows
df_properties.shape

(1909959, 51)

In [56]:
# see columns cve ent & cve mun
df_properties[["state_id", "municipality_id"]]

Unnamed: 0,state_id,municipality_id
0,31,050
1,31,041
2,31,050
3,31,041
4,31,050
...,...,...
1909954,01,001
1909955,01,001
1909956,14,120
1909957,13,013


In [57]:
# create 'ubica_geo' column
df_properties["ubica_geo"] = (
    df_properties["state_id"].astype(str).str.zfill(2)
    + df_properties["municipality_id"].astype(str).str.zfill(3)
    )

# see some values
df_properties["ubica_geo"]

0          31050
1          31041
2          31050
3          31041
4          31050
           ...  
1909954    01001
1909955    01001
1909956    14120
1909957    13013
1909958    15081
Name: ubica_geo, Length: 1909959, dtype: object

In [58]:
# get year of fecha avaluo and floor every 2 years
df_properties["year_enigh"] = (
    df_properties["valuation_date"].dt.year
    // 2 * 2
    )

# describe
df_properties["year_enigh"].describe()

count    1.909959e+06
mean     2.020327e+03
std      1.499379e+00
min      2.018000e+03
25%      2.020000e+03
50%      2.020000e+03
75%      2.022000e+03
max      2.022000e+03
Name: year_enigh, dtype: float64

## S2: Join with Concentrado Hogar

In [59]:
# join with df_concentradohogar_all
df_properties = (
    df_properties
    .merge(
        df_concentradohogar_all,
        left_on=["ubica_geo", "year_enigh"],
        right_on=["ubica_geo", "year"],
        how="left"
        )
    )

# see columns
df_properties.shape

(1909959, 175)

In [60]:
# see size
df_properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1909959 entries, 0 to 1909958
Columns: 175 entries, half_bathrooms to total_hogares
dtypes: datetime64[ns](1), float32(3), float64(138), int16(1), int64(1), object(3), string(12), uint16(1), uint32(1), uint8(14)
memory usage: 2.3+ GB


In [61]:
# see if there are columns with "_y"
df_properties.filter(like="_x").columns

Index([], dtype='object')

In [62]:
# see how many nan values 
df_properties.loc[:, df_concentradohogar_all.columns].isnull().sum()

ubica_geo           0
year             5669
est_dis          5669
clase_hog        5669
sexo_jefe        5669
                 ... 
pago_tarje       5669
deudas           5669
balance          5669
otras_erog       5669
total_hogares    5669
Length: 123, dtype: int64

In [63]:
# see how many nan values 
df_properties.loc[:, df_concentradohogar_all.columns].isnull().mean()

ubica_geo        0.000000
year             0.002968
est_dis          0.002968
clase_hog        0.002968
sexo_jefe        0.002968
                   ...   
pago_tarje       0.002968
deudas           0.002968
balance          0.002968
otras_erog       0.002968
total_hogares    0.002968
Length: 123, dtype: float64

## S3: Join with Vivienda

In [64]:
# join with df_viviendas_all
df_properties = (
    df_properties
    .drop(columns=["year", "tam_loc", "est_socio"])
    .merge(
        df_viviendas_all,
        left_on=["ubica_geo", "year_enigh"],
        right_on=["ubica_geo", "year"],
        how="left"
        )
    )

# see columns
df_properties.shape

(1909959, 228)

In [65]:
# see size
df_properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1909959 entries, 0 to 1909958
Columns: 228 entries, half_bathrooms to total_viviendas
dtypes: datetime64[ns](1), float32(3), float64(187), int16(1), int64(1), object(7), string(12), uint16(1), uint32(1), uint8(14)
memory usage: 3.0+ GB


In [66]:
# see if there are columns with "_y"
df_properties.filter(like="_x").columns

Index([], dtype='object')

In [67]:
# see how many nan values 
df_properties.loc[:, df_viviendas_all.columns].isnull().sum()

ubica_geo             0
year               5669
mat_pared          5669
mat_techos         5669
mat_pisos          5669
disp_agua          5669
combustible        5669
eli_basura         5669
tenencia           5669
tipo_adqui         5669
tipo_finan         5669
escrituras         5669
disp_elect         5669
tipo_viv           5669
regadera           5669
pago_mesp          5669
tinaco_azo         5669
lavadero           5669
procaptar          5669
tot_resid          5669
antiguedad         5669
bano_comp          5669
calent_sol         5669
cocina             5669
aire_acond         5669
tot_hom            5669
cocina_dor         5669
renta              5669
fregadero          5669
focos_inca         5669
sanit_agua         5669
uso_compar         5669
medidor_luz        5669
est_socio          5669
tot_muj            5669
dotac_agua         5669
bano_regad         5669
estufa_chi         5669
tam_loc            5669
tanque_gas         5669
focos_ahor         5669
cisterna        

In [68]:
# see how many nan values 
df_properties.loc[:, df_viviendas_all.columns].isnull().mean()

ubica_geo          0.000000
year               0.002968
mat_pared          0.002968
mat_techos         0.002968
mat_pisos          0.002968
disp_agua          0.002968
combustible        0.002968
eli_basura         0.002968
tenencia           0.002968
tipo_adqui         0.002968
tipo_finan         0.002968
escrituras         0.002968
disp_elect         0.002968
tipo_viv           0.002968
regadera           0.002968
pago_mesp          0.002968
tinaco_azo         0.002968
lavadero           0.002968
procaptar          0.002968
tot_resid          0.002968
antiguedad         0.002968
bano_comp          0.002968
calent_sol         0.002968
cocina             0.002968
aire_acond         0.002968
tot_hom            0.002968
cocina_dor         0.002968
renta              0.002968
fregadero          0.002968
focos_inca         0.002968
sanit_agua         0.002968
uso_compar         0.002968
medidor_luz        0.002968
est_socio          0.002968
tot_muj            0.002968
dotac_agua         0

## S4: See which ubica_geo have nan values

In [69]:
# see how many nan values in ubica_geo
nan_ubica_geo = (
    df_properties
    .loc[:, df_viviendas_all.columns]
    .groupby("ubica_geo")
    .apply(lambda x: x['tipo_viv'].isnull().sum())
    .to_frame()
    .rename(columns={0: "nan_values"})
    .query("nan_values > 0")
    .reset_index()
)
nan_ubica_geo

  .apply(lambda x: x['tipo_viv'].isnull().sum())


Unnamed: 0,ubica_geo,nan_values
0,7012,1998
1,14051,920
2,15012,8
3,15022,5
4,15055,111
5,15069,26
6,15072,1
7,15073,1155
8,15075,2
9,15093,1


ideas to input missing values:
1. remove rows with missing values
2. fill with mean/median
3. fill with the most common value in the state
4. knn imputer using centroids of the municipalities

## S5: NaN Imputation

### Concentrado Hogar

In [70]:
# to new_index add nan_ubica_geo ubica_geo's
index_cols = ["ubica_geo", "year"]
new_index = pd.MultiIndex.from_product(
    [df_properties["ubica_geo"].unique(), df_properties["year_enigh"].unique()],
    names=index_cols
)

# reindex
df_concentradohogar_all = (
    df_concentradohogar_all
    .set_index(index_cols)
    .reindex(new_index)
    .reset_index()
    )

In [71]:
# see 31100
df_concentradohogar_all.query("ubica_geo == '31100'")

Unnamed: 0,ubica_geo,year,est_dis,clase_hog,sexo_jefe,tam_loc,est_socio,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,total_hogares
984,31100,2022,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
985,31100,2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
986,31100,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [72]:
# get cve_ent from ubica_geo
df_concentradohogar_all["cve_ent"] = df_concentradohogar_all["ubica_geo"].str[:2]

In [73]:
# get table of median values by cve_ent
table_concentradohogar_medians = (
    df_concentradohogar_all
    .drop(columns=["ubica_geo"])
    .groupby(["cve_ent", "year"], as_index=False)
    .mean()
)
table_concentradohogar_medians

Unnamed: 0,cve_ent,year,est_dis,clase_hog,sexo_jefe,tam_loc,est_socio,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,total_hogares
0,01,2018,7.25,2.0,1.0,2.522576,2.414177,47.687678,5.924445,4.231678,2.103663,2.128015,3.225904,1.005775,2.987313,0.238591,1.933839,2.601711,1.908534,57546.501917,42175.722370,37856.587978,31547.447171,605.131393,576.517348,2326.051831,154.959261,1130.499934,1515.981040,2875.245978,2661.243760,369.481068,1433.138650,858.624042,214.002217,2.293808,195.643490,16.064920,0.000000,1443.888415,3355.685716,2615.570958,740.114757,7169.642607,3199.390802,137.395118,633.645283,666.325667,375.672929,1481.425311,675.787496,4828.325341,17.125883,37814.258156,11934.957508,9227.468909,1724.210104,1681.992910,112.469329,985.448146,300.019253,91.806188,145.132712,957.057835,382.807851,69.522183,45.774184,79.061287,1731.165731,921.001197,2632.970715,74.517884,2029.283931,1185.964255,843.319677,2658.848390,802.044744,136.601105,528.640498,1191.562043,2424.583557,1621.714474,247.770211,555.098872,1026.498332,717.805619,110.611874,198.080839,8745.990408,1064.118884,236.550508,2691.796336,3283.818842,252.486822,3031.332020,1469.705838,4993.581514,3085.307053,1084.595370,823.679092,3053.109005,2271.484178,138.913064,642.711763,947.405509,7252.788357,1998.429067,1019.924174,2820.591134,4.061614,1409.782368,7940.738606,1039.656032,360.747981,208.338887,152.409094,2208.098865,161.992612,1103.215315,853.770753,360.464153,1852.792894,71810.500000
1,01,2020,7.75,2.0,1.0,2.449221,2.364563,48.758832,6.015882,3.879883,1.896065,1.983817,3.080848,0.799035,2.854600,0.226248,1.851366,2.460383,1.796398,56350.007168,40087.355633,35701.544646,29465.393712,500.282880,642.235859,2011.113880,755.954370,955.121441,1371.442504,3086.483947,2470.259256,419.270481,905.429470,1145.559306,616.224691,0.863741,615.346057,0.014893,0.000000,1299.327040,2514.508859,2142.143591,372.365267,8730.815872,3230.844841,89.629829,826.649587,1357.597954,747.266949,1682.485452,796.341261,4957.400292,59.926512,33853.781161,12182.777551,10547.309440,1828.628470,1913.509970,130.174343,990.255509,361.068749,109.922133,158.698904,1225.866169,539.889474,85.681711,61.430062,87.498870,1982.642267,1072.042808,1532.868293,102.599817,1335.359985,829.227718,506.132267,2816.261812,902.198784,174.745960,599.311830,1140.005238,2242.630952,1552.295019,223.740292,466.595641,1517.744163,1067.810596,160.856458,289.077109,7092.826849,662.482944,82.318935,1663.511317,2707.404832,323.181176,2384.223657,1977.108820,2972.029507,1940.673436,694.151772,337.204299,2690.676422,2122.897479,93.349568,474.429376,1003.473920,5504.859199,2294.829563,544.879857,1132.624126,18.439067,1514.086586,9609.479342,1057.457097,769.350187,315.337099,454.013088,4986.049829,134.394666,887.047999,777.237696,481.488880,516.452989,84977.750000
2,01,2022,7.75,2.0,1.0,2.679804,2.337955,48.127423,6.184812,3.791812,1.831449,1.960363,2.913784,0.878028,2.702086,0.211698,1.691886,2.405484,1.679086,68725.334446,46849.303706,40985.575132,34453.628658,580.584113,816.047966,2276.634546,417.118793,1279.020104,1162.540952,3975.746145,3632.347135,565.778462,1350.852093,1715.716580,343.399010,25.287888,318.111122,0.000000,0.000000,1887.982428,3386.878683,2437.174984,949.703699,12029.236933,5138.552735,130.863012,1185.856819,1601.677162,1237.150736,2070.478286,664.658183,6407.965165,51.949959,42044.603211,14924.628549,12354.624456,2233.080573,2448.901373,138.825764,1143.778286,499.317957,155.330865,174.474266,1296.217760,583.523372,76.130000,78.343356,103.832847,2222.391743,1200.476294,2452.366990,117.637104,1836.656722,1140.302844,696.353877,3235.885812,1015.270928,198.319026,658.472384,1363.823473,2767.466536,2022.513906,348.525832,396.426798,1509.479589,961.013290,319.865013,228.601286,8494.694779,1003.902141,278.770272,1784.268857,3469.699456,335.949475,3133.749982,1958.054053,4537.560108,2880.985794,1018.085554,638.488760,3470.730730,2667.303520,142.676884,660.750325,1267.500387,4306.089021,1230.746070,575.060819,1115.323830,102.153362,1282.804940,10143.878477,1634.254858,696.671250,459.210024,237.461226,4512.251987,132.056530,1527.517592,496.400729,476.120958,668.604572,85916.500000
3,02,2018,17.50,2.0,1.0,1.786824,2.450202,48.275740,6.029622,3.357513,1.675624,1.681889,2.683144,0.674369,2.423732,0.259412,1.524581,2.034405,1.512097,56915.238306,40919.095225,37804.598595,34016.058648,322.359775,1193.046342,1156.710523,60.947327,343.486988,711.988991,2220.240730,2196.625641,836.240270,514.032360,846.353011,23.615089,16.021613,3.193825,0.000000,4.399650,894.255900,1808.597256,1022.502522,786.094734,7854.593944,4939.687213,34.689122,1389.941265,218.811523,202.472223,835.822501,233.170098,6286.549538,46.402343,35603.841750,11196.029500,7462.355140,1247.997974,1535.836232,184.211693,834.909768,287.775401,104.802455,110.718043,685.124646,392.189067,51.171443,58.091097,111.140383,876.472038,981.914901,3674.280820,59.393540,1332.281462,895.828206,436.453255,4403.212035,1452.330263,120.837559,565.867133,2264.177080,1985.978527,1417.999618,137.659829,430.319080,606.164094,461.028165,50.490846,94.645083,8775.436810,1150.179869,286.686791,683.244173,4824.136791,275.695714,4548.441077,1831.189187,3401.200265,1940.186122,1223.059953,237.954190,2988.034747,2346.281936,68.685699,573.067112,915.504310,1638.962794,306.998297,222.102263,272.325872,16.482340,821.054022,5835.221543,1005.534322,270.599035,195.549560,75.049475,3465.772226,49.952377,450.041092,76.439212,84.064264,432.819015,266591.000000
4,02,2020,19.00,2.0,1.0,1.784814,2.485101,50.364883,6.013806,3.292556,1.607149,1.685407,2.766915,0.525641,2.450593,0.316322,1.503161,2.044177,1.473485,63294.979636,42338.628255,38639.018149,34256.229522,392.927556,762.441270,1590.122527,268.804832,647.472736,721.019706,2684.228334,2631.456297,723.995932,678.958014,1228.502351,52.772037,10.515240,33.519705,0.000000,8.737092,1015.381773,3096.739783,2321.807545,774.932237,9279.724872,5272.059012,85.857624,1300.099336,389.430085,647.436339,1163.135963,421.706514,8546.212531,33.674195,36065.912649,11967.181916,9938.924112,1463.704726,2048.908996,298.431649,969.877252,372.672169,143.174907,138.866564,862.781140,484.165491,83.676698,89.423820,142.574890,1575.471469,1265.194340,1957.703177,70.554628,968.607592,675.388276,293.219316,5021.729671,1501.968346,273.738694,765.665469,2480.357162,2420.109653,1925.147237,167.587360,327.375056,1029.625673,705.151834,137.728193,186.745646,7939.797097,753.598608,94.450401,943.264989,3995.990392,253.380619,3742.609773,2152.492707,2615.620763,2119.038481,444.433401,52.148880,3139.629518,2457.785624,54.596843,627.247050,963.610767,2714.744714,629.291253,142.523321,597.003405,14.598804,1331.327931,6377.658354,1008.415648,326.996259,229.963159,97.033099,3587.609357,164.992464,476.431958,144.373221,219.433915,449.405532,272823.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,31,2020,537.50,2.0,1.0,2.238077,2.122419,46.601857,5.963540,3.622582,1.704812,1.917770,2.841130,0.781453,2.586005,0.255125,1.875707,2.444548,1.814711,42069.208478,28460.115229,23616.203553,19266.157268,296.372213,1052.102110,1088.724343,214.578797,520.460090,1177.808734,3195.503220,3126.251192,503.937563,920.649766,1701.663863,69.252028,9.188451,29.590379,30.473198,0.000000,1648.408456,1134.087348,729.399389,404.687959,7847.199959,3165.222888,69.226118,987.865271,26.373124,974.895555,1179.529821,1444.087183,4606.232050,21.573892,27754.171585,12451.847931,11516.464166,1942.506619,2212.913746,73.085594,666.794729,372.006395,69.091752,123.058915,735.928397,330.471274,58.315733,110.083316,140.449118,3291.242626,1390.515952,913.490514,21.893250,412.421748,281.028347,131.393401,2177.161862,676.777184,127.794693,105.214681,1267.375304,1817.001560,1390.706105,83.757210,342.538245,1035.513213,667.938143,123.133925,244.441146,5377.127980,877.434475,35.249716,734.446778,2114.267115,259.943068,1854.324047,1615.729896,1816.644109,1196.157669,530.055144,90.431296,1770.918773,1417.940812,74.355416,278.622544,895.534408,4437.186994,1387.489851,755.125869,728.104925,17.474220,1548.992128,8474.901176,814.935448,468.138775,180.548120,287.590655,3192.599005,95.647992,1326.756290,1132.699423,878.485483,565.638760,88370.250000
92,31,2022,540.50,2.0,1.0,2.332202,2.227814,49.155729,6.014349,3.493476,1.811269,1.682208,2.913202,0.580274,2.564022,0.349180,1.800049,2.380312,1.804270,59075.955870,40613.729112,35447.093311,31088.677550,357.013911,629.566962,1316.879625,23.336354,606.377524,1425.241385,4132.809461,4120.850474,1402.913543,884.575358,1833.361574,11.958987,4.249396,7.192230,0.517362,0.000000,1033.826340,2592.643452,2221.736755,370.906697,9736.341010,4500.770509,27.600515,745.151871,10.087871,2192.871029,1552.243347,707.615868,6123.252896,9.989399,35313.348367,15342.910208,13000.887154,1964.389167,3081.443454,79.004296,565.544173,407.696848,87.575220,106.687780,876.246467,248.973448,55.271047,80.176801,117.720919,3844.698950,1485.458584,2321.469356,20.553698,1022.713401,656.030186,366.683215,2243.191737,467.234851,145.645871,133.467253,1496.843762,2150.352544,1743.520718,75.267016,331.564810,857.774466,621.686930,58.654037,177.433499,6931.617596,1732.270407,79.085958,950.325587,2202.876829,163.053446,2039.823383,1967.058815,3117.815734,2204.153066,673.922169,239.740498,2469.403330,2003.284174,104.140307,361.978849,1177.569350,3821.327708,1059.213002,439.699174,751.045138,22.158885,1549.211508,8707.202587,1449.497474,774.966824,303.583104,471.383720,4020.108512,145.758131,1124.799895,704.306087,235.871560,251.894104,87237.250000
93,32,2018,540.00,2.0,1.0,1.884902,2.583217,48.600404,6.040914,3.663835,1.724708,1.939127,2.842427,0.821408,2.565572,0.276855,1.632066,2.433253,1.591926,47992.014945,31382.643719,27347.370582,23558.132799,254.681307,273.540987,1611.872110,117.600372,956.418704,575.124303,2718.615032,2221.501980,390.741852,1039.449767,791.310361,497.113052,354.287276,141.917397,0.908380,0.000000,1316.658105,3281.898369,2890.556708,391.341661,8664.729210,3583.764738,110.954586,1299.422565,710.127477,551.512814,1642.677198,766.269832,4634.699082,28.044566,32892.631484,10908.359444,8894.203462,1810.606797,1892.659918,126.192481,993.246883,361.800422,141.802913,197.857518,1019.069035,491.443092,78.449823,75.102941,89.041521,863.366448,753.563669,1947.527649,66.628333,1732.199449,1050.415824,681.783625,2717.073657,721.146906,150.090638,371.456468,1474.379645,2042.016786,1484.015779,223.809527,334.191480,833.096336,673.637752,42.605810,116.852774,6804.287323,1034.635850,284.656507,942.346810,3094.667802,291.739822,2802.927980,1447.980354,3973.974746,2483.255564,983.541981,507.177201,2989.844665,2152.447671,140.636885,696.760109,891.779078,2906.935446,685.333969,359.162926,588.255767,2.663306,1271.519477,4323.512827,828.117836,501.953861,329.493259,172.460601,1238.011099,71.568808,315.677856,410.387723,366.188613,591.607030,56854.666667
94,32,2020,550.00,2.0,1.0,1.617510,2.691681,50.363635,6.600189,3.511094,1.687316,1.823779,2.800229,0.710865,2.533462,0.266768,1.656581,2.249278,1.619966,61037.260339,39247.637871,35586.594822,30852.827005,182.393053,481.027322,2186.383287,144.810784,1010.278604,728.874767,2684.544097,2364.743674,518.768102,586.530439,1259.445134,319.800422,278.271729,41.528694,0.000000,0.000000,976.498953,6330.567767,5328.770904,1001.796863,9457.620552,4981.466789,199.330496,760.034191,451.296212,993.893483,1746.367534,325.231847,5987.676241,13.757907,35397.209060,12494.387042,10571.612150,1863.741933,2086.935743,262.534786,1156.323964,428.628390,132.955988,176.382250,1246.345087,631.126732,91.159254,86.121147,96.761726,1491.681869,820.913280,1849.360702,73.414190,1384.038456,888.067404,495.971052,3104.210521,899.037769,171.479591,517.728906,1515.964255,2546.249067,1865.074695,321.511380,359.662993,1602.588440,1227.666287,118.636867,256.285286,6881.533826,438.137949,171.744775,1121.341741,3123.409415,359.503734,2763.905680,2026.899945,2837.967510,1945.889930,682.624608,209.452973,3041.797543,2147.525449,76.534222,817.737872,1504.436655,4661.065680,1610.584331,535.407767,900.263894,40.807742,1574.001946,14119.477912,887.336058,744.324102,501.184398,243.139704,9253.845942,47.775094,1149.447652,523.191157,303.029064,1210.528844,52094.000000


In [74]:
# append to df_concentradohogar_all for nan values
mask = (
    df_concentradohogar_all['clase_hog'].isnull()
)
columns_to_impute = df_concentradohogar_all.drop(columns=["ubica_geo", "year", "cve_ent"]).columns.copy()

# append to df_concentradohogar_all for nan values
df_concentradohogar_all.loc[mask, columns_to_impute] = (
    df_concentradohogar_all
    .loc[mask, :]
    .merge(
        table_concentradohogar_medians,
        on=["cve_ent", "year"],
        how="left"
        )
    .filter(like="_y")
    .values
)

# sort by ubica_geo and year
df_concentradohogar_all = (
    df_concentradohogar_all
    .sort_values(["ubica_geo", "year"], ignore_index=True)
    )

# see how many nan values
df_concentradohogar_all.isnull().sum()[df_concentradohogar_all.isnull().sum() > 0]

Series([], dtype: int64)

In [75]:
# see 31100
df_concentradohogar_all.query("ubica_geo == '31100'")

Unnamed: 0,ubica_geo,year,est_dis,clase_hog,sexo_jefe,tam_loc,est_socio,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,total_hogares,cve_ent
1023,31100,2018,523.5,2.0,1.0,2.456703,2.136037,47.884917,5.393414,3.787951,1.928931,1.859019,3.04342,0.74453,2.769623,0.273798,2.135098,2.615438,2.119616,48829.923423,35268.822907,30158.612997,25160.049908,261.022478,1355.279482,1250.52136,95.52116,446.907764,1589.310846,3581.597498,3530.59039,885.469929,657.283153,1987.837307,51.007108,8.034997,3.786773,39.185338,0.0,1528.612412,3080.148049,2635.184312,444.963737,6337.237566,3092.755226,64.606151,969.174025,2.304668,405.076266,1361.925334,441.395896,4122.47007,21.244831,30109.082739,11960.566853,10201.588787,1727.076917,2238.6911,75.307536,552.903825,313.925006,76.572678,99.372963,757.031369,224.305888,62.460342,76.621581,132.80094,2424.23873,1440.279913,1721.087436,37.89063,988.84136,621.11168,367.729681,1814.37696,429.100939,54.296103,90.660146,1240.319772,1626.419767,1239.560925,81.348146,305.510696,881.197583,599.048439,173.89052,108.258624,6685.024772,2292.568324,104.673306,886.218412,1880.483958,163.92072,1716.563238,1521.080772,3374.304039,2388.266291,839.749936,146.287811,1805.657499,1510.164102,92.39333,203.100067,972.693908,3438.84677,945.507827,504.581552,643.013262,23.859404,1321.884725,7952.846139,956.530281,197.31296,130.386248,66.926712,3730.836279,378.061268,924.414087,623.302979,187.29021,955.098076,80021.5,31
1024,31100,2020,537.5,2.0,1.0,2.238077,2.122419,46.601857,5.96354,3.622582,1.704812,1.91777,2.84113,0.781453,2.586005,0.255125,1.875707,2.444548,1.814711,42069.208478,28460.115229,23616.203553,19266.157268,296.372213,1052.10211,1088.724343,214.578797,520.46009,1177.808734,3195.50322,3126.251192,503.937563,920.649766,1701.663863,69.252028,9.188451,29.590379,30.473198,0.0,1648.408456,1134.087348,729.399389,404.687959,7847.199959,3165.222888,69.226118,987.865271,26.373124,974.895555,1179.529821,1444.087183,4606.23205,21.573892,27754.171585,12451.847931,11516.464166,1942.506619,2212.913746,73.085594,666.794729,372.006395,69.091752,123.058915,735.928397,330.471274,58.315733,110.083316,140.449118,3291.242626,1390.515952,913.490514,21.89325,412.421748,281.028347,131.393401,2177.161862,676.777184,127.794693,105.214681,1267.375304,1817.00156,1390.706105,83.75721,342.538245,1035.513213,667.938143,123.133925,244.441146,5377.12798,877.434475,35.249716,734.446778,2114.267115,259.943068,1854.324047,1615.729896,1816.644109,1196.157669,530.055144,90.431296,1770.918773,1417.940812,74.355416,278.622544,895.534408,4437.186994,1387.489851,755.125869,728.104925,17.47422,1548.992128,8474.901176,814.935448,468.138775,180.54812,287.590655,3192.599005,95.647992,1326.75629,1132.699423,878.485483,565.63876,88370.25,31
1025,31100,2022,540.5,2.0,1.0,2.332202,2.227814,49.155729,6.014349,3.493476,1.811269,1.682208,2.913202,0.580274,2.564022,0.34918,1.800049,2.380312,1.80427,59075.95587,40613.729112,35447.093311,31088.67755,357.013911,629.566962,1316.879625,23.336354,606.377524,1425.241385,4132.809461,4120.850474,1402.913543,884.575358,1833.361574,11.958987,4.249396,7.19223,0.517362,0.0,1033.82634,2592.643452,2221.736755,370.906697,9736.34101,4500.770509,27.600515,745.151871,10.087871,2192.871029,1552.243347,707.615868,6123.252896,9.989399,35313.348367,15342.910208,13000.887154,1964.389167,3081.443454,79.004296,565.544173,407.696848,87.57522,106.68778,876.246467,248.973448,55.271047,80.176801,117.720919,3844.69895,1485.458584,2321.469356,20.553698,1022.713401,656.030186,366.683215,2243.191737,467.234851,145.645871,133.467253,1496.843762,2150.352544,1743.520718,75.267016,331.56481,857.774466,621.68693,58.654037,177.433499,6931.617596,1732.270407,79.085958,950.325587,2202.876829,163.053446,2039.823383,1967.058815,3117.815734,2204.153066,673.922169,239.740498,2469.40333,2003.284174,104.140307,361.978849,1177.56935,3821.327708,1059.213002,439.699174,751.045138,22.158885,1549.211508,8707.202587,1449.497474,774.966824,303.583104,471.38372,4020.108512,145.758131,1124.799895,704.306087,235.87156,251.894104,87237.25,31


In [76]:
# see 31 in table
table_concentradohogar_medians.query("cve_ent == '31'")

Unnamed: 0,cve_ent,year,est_dis,clase_hog,sexo_jefe,tam_loc,est_socio,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,total_hogares
90,31,2018,523.5,2.0,1.0,2.456703,2.136037,47.884917,5.393414,3.787951,1.928931,1.859019,3.04342,0.74453,2.769623,0.273798,2.135098,2.615438,2.119616,48829.923423,35268.822907,30158.612997,25160.049908,261.022478,1355.279482,1250.52136,95.52116,446.907764,1589.310846,3581.597498,3530.59039,885.469929,657.283153,1987.837307,51.007108,8.034997,3.786773,39.185338,0.0,1528.612412,3080.148049,2635.184312,444.963737,6337.237566,3092.755226,64.606151,969.174025,2.304668,405.076266,1361.925334,441.395896,4122.47007,21.244831,30109.082739,11960.566853,10201.588787,1727.076917,2238.6911,75.307536,552.903825,313.925006,76.572678,99.372963,757.031369,224.305888,62.460342,76.621581,132.80094,2424.23873,1440.279913,1721.087436,37.89063,988.84136,621.11168,367.729681,1814.37696,429.100939,54.296103,90.660146,1240.319772,1626.419767,1239.560925,81.348146,305.510696,881.197583,599.048439,173.89052,108.258624,6685.024772,2292.568324,104.673306,886.218412,1880.483958,163.92072,1716.563238,1521.080772,3374.304039,2388.266291,839.749936,146.287811,1805.657499,1510.164102,92.39333,203.100067,972.693908,3438.84677,945.507827,504.581552,643.013262,23.859404,1321.884725,7952.846139,956.530281,197.31296,130.386248,66.926712,3730.836279,378.061268,924.414087,623.302979,187.29021,955.098076,80021.5
91,31,2020,537.5,2.0,1.0,2.238077,2.122419,46.601857,5.96354,3.622582,1.704812,1.91777,2.84113,0.781453,2.586005,0.255125,1.875707,2.444548,1.814711,42069.208478,28460.115229,23616.203553,19266.157268,296.372213,1052.10211,1088.724343,214.578797,520.46009,1177.808734,3195.50322,3126.251192,503.937563,920.649766,1701.663863,69.252028,9.188451,29.590379,30.473198,0.0,1648.408456,1134.087348,729.399389,404.687959,7847.199959,3165.222888,69.226118,987.865271,26.373124,974.895555,1179.529821,1444.087183,4606.23205,21.573892,27754.171585,12451.847931,11516.464166,1942.506619,2212.913746,73.085594,666.794729,372.006395,69.091752,123.058915,735.928397,330.471274,58.315733,110.083316,140.449118,3291.242626,1390.515952,913.490514,21.89325,412.421748,281.028347,131.393401,2177.161862,676.777184,127.794693,105.214681,1267.375304,1817.00156,1390.706105,83.75721,342.538245,1035.513213,667.938143,123.133925,244.441146,5377.12798,877.434475,35.249716,734.446778,2114.267115,259.943068,1854.324047,1615.729896,1816.644109,1196.157669,530.055144,90.431296,1770.918773,1417.940812,74.355416,278.622544,895.534408,4437.186994,1387.489851,755.125869,728.104925,17.47422,1548.992128,8474.901176,814.935448,468.138775,180.54812,287.590655,3192.599005,95.647992,1326.75629,1132.699423,878.485483,565.63876,88370.25
92,31,2022,540.5,2.0,1.0,2.332202,2.227814,49.155729,6.014349,3.493476,1.811269,1.682208,2.913202,0.580274,2.564022,0.34918,1.800049,2.380312,1.80427,59075.95587,40613.729112,35447.093311,31088.67755,357.013911,629.566962,1316.879625,23.336354,606.377524,1425.241385,4132.809461,4120.850474,1402.913543,884.575358,1833.361574,11.958987,4.249396,7.19223,0.517362,0.0,1033.82634,2592.643452,2221.736755,370.906697,9736.34101,4500.770509,27.600515,745.151871,10.087871,2192.871029,1552.243347,707.615868,6123.252896,9.989399,35313.348367,15342.910208,13000.887154,1964.389167,3081.443454,79.004296,565.544173,407.696848,87.57522,106.68778,876.246467,248.973448,55.271047,80.176801,117.720919,3844.69895,1485.458584,2321.469356,20.553698,1022.713401,656.030186,366.683215,2243.191737,467.234851,145.645871,133.467253,1496.843762,2150.352544,1743.520718,75.267016,331.56481,857.774466,621.68693,58.654037,177.433499,6931.617596,1732.270407,79.085958,950.325587,2202.876829,163.053446,2039.823383,1967.058815,3117.815734,2204.153066,673.922169,239.740498,2469.40333,2003.284174,104.140307,361.978849,1177.56935,3821.327708,1059.213002,439.699174,751.045138,22.158885,1549.211508,8707.202587,1449.497474,774.966824,303.583104,471.38372,4020.108512,145.758131,1124.799895,704.306087,235.87156,251.894104,87237.25


### Vivendas

In [77]:
# to new_index add nan_ubica_geo ubica_geo's
index_cols = ["ubica_geo", "year"]
new_index = pd.MultiIndex.from_product(
    [df_properties["ubica_geo"].unique(), df_properties["year_enigh"].unique()],
    names=index_cols
)

# reindex
df_viviendas_all = (
    df_viviendas_all
    .set_index(index_cols)
    .reindex(new_index)
    .reset_index()
    )

In [78]:
# subset columns that are not duplicated
df_viviendas_all = (
    df_viviendas_all
    .loc[:, ~df_viviendas_all.columns.duplicated()]
)

In [79]:
# see 31100
df_viviendas_all.query("ubica_geo == '31100'")

Unnamed: 0,ubica_geo,year,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas
984,31100,2022,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
985,31100,2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
986,31100,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [80]:
# get cve_ent from ubica_geo
df_viviendas_all["cve_ent"] = df_viviendas_all["ubica_geo"].str[:2]

In [119]:
mask.sum()

2

In [125]:
# curate data error of '&' in tipo_finan
mask = df_viviendas_all['tipo_finan'].str.contains('&').astype(float).fillna(0).astype(bool)
# set to real nan
df_viviendas_all.loc[mask, 'tipo_finan'] = np.nan

In [126]:
# convert to float
cols_to_change_type = ["combustible", "tipo_finan", "tipo_viv", "mat_pisos"]
df_viviendas_all[cols_to_change_type] = df_viviendas_all[cols_to_change_type].astype(float)

In [127]:
# get table of median values by cve_ent
table_vivienda_medians = (
    df_viviendas_all
    .drop(columns=["ubica_geo"])
    .groupby(["cve_ent", "year"], as_index=False)
    .mean()
)
table_vivienda_medians

Unnamed: 0,cve_ent,year,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas
0,01,2018,8.0,10.0,3.000000,1.0,3.0,3.0,4.00,2.000000,4.000000,1.0,1.0,1.0,0.920996,0.315568,0.805590,0.912256,0.0,4.305369,19.029272,1.130087,0.425857,0.965987,0.004484,2.139502,0.037438,275.551698,0.810831,1.663549,1.078883,0.039975,0.967105,2.414177,2.165868,1.155225,0.021514,0.004740,2.522576,0.098115,6.194716,0.348511,2.431610,1.066894,0.995430,0.088918,4.184356,0.425365,0.003381,0.254550,365.019505,0.297801,0.092614,0.014900,1.024408,1645.410893,71810.500000
1,01,2020,8.0,10.0,3.000000,1.0,3.0,3.0,4.00,1.500000,4.000000,1.0,1.0,1.0,0.948013,0.301067,0.829743,0.904102,0.0,3.918506,17.347477,1.154849,0.446635,0.987648,0.007158,1.912456,0.042310,305.124306,0.802218,1.105541,1.084915,0.029291,0.974117,2.364563,2.006050,1.136489,0.014086,0.004851,2.449221,0.118582,6.506671,0.353960,2.296961,1.052942,0.992563,0.118789,4.287628,0.406414,0.005875,0.191794,389.388663,0.320350,0.115512,0.008485,1.015404,1691.097447,84977.750000
2,01,2022,8.0,10.0,3.000000,1.0,3.0,3.0,3.75,1.500000,3.000000,1.0,1.0,1.0,0.953315,0.363934,0.833432,0.888370,0.0,3.862661,17.033339,1.187011,0.527365,0.964534,0.017653,1.864706,0.024552,350.099027,0.856483,0.747706,1.093178,0.036064,0.962341,2.337955,1.997955,1.259933,0.010746,0.004450,2.679804,0.120712,7.343714,0.335790,2.291607,1.070470,0.990484,0.112913,4.165880,0.404203,0.010323,0.164826,589.775208,0.363320,0.116361,0.022477,1.018898,2183.065867,85916.500000
3,02,2018,8.0,8.5,2.250000,1.0,3.0,1.0,4.00,1.500000,5.000000,1.0,1.0,1.0,0.852444,0.313906,0.101366,0.603249,0.0,3.370514,14.778483,0.976271,0.013631,0.974302,0.271139,1.682316,0.068284,495.536871,0.822640,1.190902,1.249026,0.059280,0.934396,2.450202,1.688198,0.994116,0.034741,0.002962,1.786824,0.050373,5.619687,0.042568,2.115245,1.394415,0.995097,0.127318,3.651680,0.542618,0.030859,0.331060,355.509805,0.063110,0.130529,0.006312,1.003140,2137.880791,266591.000000
4,02,2020,8.0,8.5,2.500000,1.0,3.0,1.0,4.00,2.000000,5.000000,1.0,1.0,1.0,0.913127,0.279764,0.108240,0.577239,0.0,3.309996,18.097982,1.099012,0.025186,0.979650,0.272621,1.613549,0.072284,519.522103,0.860794,0.819700,1.161200,0.088035,0.964287,2.485101,1.696447,1.115911,0.031589,0.001947,1.784814,0.062003,6.346981,0.043219,2.161368,1.252475,0.994902,0.170105,3.842881,0.588295,0.044019,0.279955,386.555622,0.059437,0.128789,0.021931,1.005444,2910.766735,272823.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,31,2020,8.0,10.0,3.000000,1.0,3.0,1.0,4.00,1.250000,3.000000,1.0,1.0,1.0,0.882248,0.319899,0.834266,0.606312,0.0,3.697294,15.621745,0.997795,0.003348,0.936638,0.252943,1.737912,0.095735,236.960118,0.792539,0.351484,1.211693,0.042477,0.948645,2.122419,1.959382,0.987243,0.005081,0.011412,2.238077,0.023373,7.078678,0.090151,1.859020,2.074360,0.979938,0.072890,3.527309,0.103532,0.002434,0.144663,368.438554,0.255339,0.097741,0.074711,1.025997,1566.595569,88370.250000
92,31,2022,8.0,10.0,2.750000,1.0,3.0,1.0,4.00,1.250000,2.000000,1.0,1.0,1.0,0.888618,0.283053,0.861815,0.624339,0.0,3.546514,20.736073,1.021109,0.004833,0.939103,0.340484,1.842908,0.105069,160.820769,0.776416,0.157252,1.120146,0.021508,0.954367,2.227814,1.703606,1.004846,0.004446,0.001194,2.332202,0.039448,7.641070,0.096652,1.845370,1.972431,0.979488,0.101510,3.484885,0.098103,0.004757,0.134482,529.623155,0.129257,0.165997,0.069498,1.017018,2088.318963,87237.250000
93,32,2018,8.0,10.0,2.666667,1.0,3.0,1.0,4.00,1.666667,5.000000,1.0,1.0,1.0,0.894064,0.236414,0.895117,0.764019,0.0,3.691466,19.721876,1.101486,0.367296,0.979743,0.003330,1.740038,0.037580,248.232123,0.736292,2.183872,1.118875,0.042784,0.953893,2.583217,1.951428,2.056887,0.025031,0.011190,1.884902,0.175979,5.519620,0.235562,2.248404,1.194866,0.978831,0.118172,4.253503,0.629447,0.004590,0.271035,288.063652,0.247167,0.075241,0.012431,1.009344,1579.312406,56854.666667
94,32,2020,8.0,10.0,3.000000,1.0,3.0,1.0,4.00,2.000000,3.666667,1.0,1.0,1.0,0.904266,0.246265,0.916026,0.764015,0.0,3.544823,19.989872,1.180518,0.497435,0.986793,0.010614,1.704727,0.028393,298.266756,0.803988,1.535255,1.119642,0.031891,0.974158,2.691681,1.840096,2.069922,0.012740,0.010342,1.617510,0.221294,6.975131,0.284139,2.303781,1.113889,0.984747,0.150201,4.305402,0.650624,0.011160,0.309841,310.957732,0.314609,0.075841,0.013191,1.012844,2041.666763,52094.000000


In [129]:
# append to df_viviendas_all for nan values
mask = (
    df_viviendas_all['mat_pared'].isnull()
)
columns_to_impute = df_viviendas_all.drop(columns=["ubica_geo", "year", "cve_ent"]).columns.copy()

# append to df_viviendas_all for nan values
df_viviendas_all.loc[mask, columns_to_impute] = (
    df_viviendas_all
    .loc[mask, :]
    .merge(
        table_vivienda_medians,
        on=["cve_ent", "year"],
        how="left"
        )
    .filter(like="_y")
    .values
)

# sort by ubica_geo and year
df_viviendas_all = (
    df_viviendas_all
    .sort_values(["ubica_geo", "year"], ignore_index=True)
    )

# see how many nan values
df_viviendas_all.isnull().sum()[df_viviendas_all.isnull().sum() > 0]

tipo_finan    2
dtype: int64

In [130]:
# see 31100
df_viviendas_all.query("ubica_geo == '31100'")

Unnamed: 0,ubica_geo,year,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas,cve_ent
1023,31100,2018,8.0,10.0,2.75,1.0,2.5,1.0,3.75,1.25,3.0,1.0,1.0,1.0,0.911595,0.267145,0.868114,0.485174,0.0,3.94994,19.816853,1.027697,0.002856,0.83335,0.230294,2.014664,0.086792,145.350302,0.733703,0.66029,1.103274,0.018584,0.981393,2.136037,1.935276,0.982264,0.00245,0.023236,2.456703,0.040776,6.723034,0.094318,1.946893,2.028594,0.968858,0.04332,3.485306,0.143839,0.000466,0.108094,345.033435,0.165134,0.138853,0.100664,1.048106,1405.56214,80021.5,31
1024,31100,2020,8.0,10.0,3.0,1.0,3.0,1.0,4.0,1.25,3.0,1.0,1.0,1.0,0.882248,0.319899,0.834266,0.606312,0.0,3.697294,15.621745,0.997795,0.003348,0.936638,0.252943,1.737912,0.095735,236.960118,0.792539,0.351484,1.211693,0.042477,0.948645,2.122419,1.959382,0.987243,0.005081,0.011412,2.238077,0.023373,7.078678,0.090151,1.85902,2.07436,0.979938,0.07289,3.527309,0.103532,0.002434,0.144663,368.438554,0.255339,0.097741,0.074711,1.025997,1566.595569,88370.25,31
1025,31100,2022,8.0,10.0,2.75,1.0,3.0,1.0,4.0,1.25,2.0,1.0,1.0,1.0,0.888618,0.283053,0.861815,0.624339,0.0,3.546514,20.736073,1.021109,0.004833,0.939103,0.340484,1.842908,0.105069,160.820769,0.776416,0.157252,1.120146,0.021508,0.954367,2.227814,1.703606,1.004846,0.004446,0.001194,2.332202,0.039448,7.64107,0.096652,1.84537,1.972431,0.979488,0.10151,3.484885,0.098103,0.004757,0.134482,529.623155,0.129257,0.165997,0.069498,1.017018,2088.318963,87237.25,31


In [131]:
# see 31 in table
table_vivienda_medians.query("cve_ent == '31'")

Unnamed: 0,cve_ent,year,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas
90,31,2018,8.0,10.0,2.75,1.0,2.5,1.0,3.75,1.25,3.0,1.0,1.0,1.0,0.911595,0.267145,0.868114,0.485174,0.0,3.94994,19.816853,1.027697,0.002856,0.83335,0.230294,2.014664,0.086792,145.350302,0.733703,0.66029,1.103274,0.018584,0.981393,2.136037,1.935276,0.982264,0.00245,0.023236,2.456703,0.040776,6.723034,0.094318,1.946893,2.028594,0.968858,0.04332,3.485306,0.143839,0.000466,0.108094,345.033435,0.165134,0.138853,0.100664,1.048106,1405.56214,80021.5
91,31,2020,8.0,10.0,3.0,1.0,3.0,1.0,4.0,1.25,3.0,1.0,1.0,1.0,0.882248,0.319899,0.834266,0.606312,0.0,3.697294,15.621745,0.997795,0.003348,0.936638,0.252943,1.737912,0.095735,236.960118,0.792539,0.351484,1.211693,0.042477,0.948645,2.122419,1.959382,0.987243,0.005081,0.011412,2.238077,0.023373,7.078678,0.090151,1.85902,2.07436,0.979938,0.07289,3.527309,0.103532,0.002434,0.144663,368.438554,0.255339,0.097741,0.074711,1.025997,1566.595569,88370.25
92,31,2022,8.0,10.0,2.75,1.0,3.0,1.0,4.0,1.25,2.0,1.0,1.0,1.0,0.888618,0.283053,0.861815,0.624339,0.0,3.546514,20.736073,1.021109,0.004833,0.939103,0.340484,1.842908,0.105069,160.820769,0.776416,0.157252,1.120146,0.021508,0.954367,2.227814,1.703606,1.004846,0.004446,0.001194,2.332202,0.039448,7.64107,0.096652,1.84537,1.972431,0.979488,0.10151,3.484885,0.098103,0.004757,0.134482,529.623155,0.129257,0.165997,0.069498,1.017018,2088.318963,87237.25


## S6: Re-Join with df_properties

### Load & Transform

In [132]:
# read parquet
df_properties = pd.read_parquet("../../data/interim/cleaned_data_s4.parquet")

# get original columns
original_columns = df_properties.columns

# see num of rows
df_properties.shape

(1909959, 51)

In [133]:
# see columns cve ent & cve mun
df_properties[["state_id", "municipality_id"]]

Unnamed: 0,state_id,municipality_id
0,31,050
1,31,041
2,31,050
3,31,041
4,31,050
...,...,...
1909954,01,001
1909955,01,001
1909956,14,120
1909957,13,013


In [134]:
# create 'ubica_geo' column
df_properties["ubica_geo"] = (
    df_properties["state_id"].astype(str).str.zfill(2)
    + df_properties["municipality_id"].astype(str).str.zfill(3)
    )

# see some values
df_properties["ubica_geo"]

0          31050
1          31041
2          31050
3          31041
4          31050
           ...  
1909954    01001
1909955    01001
1909956    14120
1909957    13013
1909958    15081
Name: ubica_geo, Length: 1909959, dtype: object

In [135]:
# get year of fecha avaluo and floor every 2 years
df_properties["year_enigh"] = (
    df_properties["valuation_date"].dt.year
    // 2 * 2
    )

# describe
df_properties["year_enigh"].describe()

count    1.909959e+06
mean     2.020327e+03
std      1.499379e+00
min      2.018000e+03
25%      2.020000e+03
50%      2.020000e+03
75%      2.022000e+03
max      2.022000e+03
Name: year_enigh, dtype: float64

### Join with Concentrado Hogar

In [136]:
# join with df_concentradohogar_all
df_properties = (
    df_properties
    .merge(
        df_concentradohogar_all,
        left_on=["ubica_geo", "year_enigh"],
        right_on=["ubica_geo", "year"],
        how="left"
        )
    )

# see columns
df_properties.shape

(1909959, 176)

In [137]:
# see size
df_properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1909959 entries, 0 to 1909958
Columns: 176 entries, half_bathrooms to cve_ent
dtypes: datetime64[ns](1), float32(3), float64(137), int16(1), int64(2), object(4), string(12), uint16(1), uint32(1), uint8(14)
memory usage: 2.3+ GB


In [138]:
# see if there are columns with "_y"
df_properties.filter(like="_x").columns

Index([], dtype='object')

In [139]:
# see how many nan values 
df_properties.isnull().sum()[df_properties.isnull().sum() > 0]

neighborhood                           118
constructor                           2518
potable_water_service_id            633139
lighting_service_id                 633119
sidewalk_id                         634669
project_quality_id                  647455
elevator_service_id                  91867
curb_id                             633119
water_collection_service_id         633119
electrical_supply_service_id        633119
telephone_service_supply_id         633119
public_transportation_service_id    633119
rentable_units                      265108
rentable_units_subject_property     647450
current_use                         641538
access_routes                       633173
dtype: int64

### Join with Vivienda

In [140]:
# join with df_viviendas_all
df_properties = (
    df_properties
    .drop(columns=["year", "tam_loc", "est_socio"])
    .merge(
        df_viviendas_all,
        left_on=["ubica_geo", "year_enigh"],
        right_on=["ubica_geo", "year"],
        how="left"
        )
    )

# see columns
df_properties.shape

(1909959, 230)

In [141]:
# see size
df_properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1909959 entries, 0 to 1909958
Columns: 230 entries, half_bathrooms to cve_ent_y
dtypes: datetime64[ns](1), float32(3), float64(190), int16(1), int64(2), object(5), string(12), uint16(1), uint32(1), uint8(14)
memory usage: 3.0+ GB


In [142]:
# see if there are columns with "_y"
df_properties.filter(like="_x").columns

Index(['cve_ent_x'], dtype='object')

In [143]:
# drop cve_ent_x and cve_ent_y
df_properties = (
    df_properties
    .drop(columns=["cve_ent_x", "cve_ent_y"])
    )

In [144]:
# see how many nan values 
df_properties.isnull().sum()[df_properties.isnull().sum() > 0]

neighborhood                           118
constructor                           2518
potable_water_service_id            633139
lighting_service_id                 633119
sidewalk_id                         634669
project_quality_id                  647455
elevator_service_id                  91867
curb_id                             633119
water_collection_service_id         633119
electrical_supply_service_id        633119
telephone_service_supply_id         633119
public_transportation_service_id    633119
rentable_units                      265108
rentable_units_subject_property     647450
current_use                         641538
access_routes                       633173
tipo_finan                               9
dtype: int64

---
# Save

In [145]:
# save
dir_save = "../../data/interim/"
df_viviendas_all.to_csv(dir_save + "viviendas_enigh.csv")
df_concentradohogar_all.to_csv(dir_save + "concentrado_hogar_enigh.csv")

In [146]:
# save parquet
df_properties.to_parquet("../../data/interim/cleaned_data_s5_adamuz.parquet")

---
# Sandbox

In [44]:
df_viviendas_all.columns.duplicated()

array([False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [41]:
df_viviendas_all.dtypes

ubica_geo           object
year                 int64
mat_pared          float64
mat_techos         float64
mat_pisos           object
disp_agua          float64
combustible         object
eli_basura         float64
tenencia           float64
tipo_adqui         float64
tipo_adqui         float64
tipo_finan          object
escrituras         float64
disp_elect         float64
tipo_viv            object
regadera           float64
pago_mesp          float64
tinaco_azo         float64
lavadero           float64
procaptar          float64
tot_resid          float64
antiguedad         float64
bano_comp          float64
calent_sol         float64
cocina             float64
aire_acond         float64
tot_hom            float64
cocina_dor         float64
renta              float64
fregadero          float64
focos_inca         float64
sanit_agua         float64
uso_compar         float64
medidor_luz        float64
est_socio          float64
tot_muj            float64
dotac_agua         float64
b

In [40]:
df_viviendas_all['ubica_geo'].dtype

dtype('O')

In [109]:
# df_viviendas_all.query("tipo_finan.str.contains('&')")
df_viviendas_all[df_viviendas_all['tipo_finan'].str.contains('&').astype(float).fillna(0).astype(bool)]

Unnamed: 0,ubica_geo,year,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas,cve_ent
923,23003,2018,8.0,10.0,2.0,7.0,3.0,1.0,3.0,2.0,&,1.0,1.0,1,0.2,0.6,0.2,0.4,0.0,4.0,3.8,0.4,0.0,1.0,0.6,2.6,0.4,200.0,0.4,0.0,1.8,0.2,0.8,1.0,1.4,0.0,0.0,0.0,3.0,0.0,5.4,0.0,1.2,2.0,1.0,0.6,1.8,0.0,0.0,0.8,1360.0,1.0,0.0,0.0,1.0,1040.0,2575.0,23
992,20083,2018,8.0,10.0,3.0,6.0,3.0,1.0,4.0,1.0,&,1.0,1.0,1,1.0,0.2,0.8,0.8,0.0,4.8,21.2,1.2,0.2,1.0,0.0,2.6,0.0,200.0,1.0,0.0,1.8,0.0,1.0,2.0,2.2,1.4,0.0,0.0,3.0,0.2,10.8,0.8,2.6,2.6,1.0,0.0,4.2,0.8,0.0,0.2,660.0,0.8,0.2,0.0,1.0,5000.0,3905.0,20


In [108]:
df_viviendas_all['tipo_finan'].str.contains('&').astype(float).fillna(0).astype(bool)

0       False
1       False
2       False
3       False
4       False
        ...  
1033    False
1034    False
1035    False
1036    False
1037    False
Name: tipo_finan, Length: 1038, dtype: bool

In [115]:
df_viviendas_all['combustible'].astype(float)

0       3.0
1       3.0
2       3.0
3       3.0
4       3.0
       ... 
1033    3.0
1034    3.0
1035    NaN
1036    NaN
1037    NaN
Name: combustible, Length: 1038, dtype: float64

In [84]:
df_viviendas_all.query("ubica_geo.eq('01004')")

Unnamed: 0,ubica_geo,year,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas,cve_ent
