In [1]:
import pandas as pd

In [2]:
# Open the published Google Sheets workbook (xlsx export) once. Despite the
# "df_" prefix this is an ExcelFile handle, not a DataFrame; individual sheets
# are read from it later with pd.read_excel(df_labels, <sheet>).
labels_workbook_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vR08Js9Sh4nNTMe5uBcsDUFedG5MOjIf90p6EHAr1_sWY5kpnI3xUvyPHzQpTEUrXz1pskaoc0uyea6/pub?output=xlsx"
df_labels = pd.ExcelFile(labels_workbook_url)

In [3]:
# Load the TR_PERSONA32 table straight from cloud storage. Every column is read
# as text (dtype=str) so code columns are not parsed as numbers; the columns
# that are needed numerically are cast explicitly in later cells.
persona_csv_url = "https://storage.googleapis.com/datamexico-data/inegi_intercensal_census/TR_PERSONA32.CSV"
read_options = dict(dtype=str, index_col=None, header=0, encoding="latin-1")
df = pd.read_csv(persona_csv_url, **read_options)

In [4]:
# Normalise the column labels to lower case so later cells can refer to them
# with lower-case names ("ent", "mun", "factor", ...).
df.columns = [label.lower() for label in df.columns]

In [5]:
 # Build loc_id by concatenating the ent, mun and loc50k code strings
# (still text at this point; the cast to int happens in a separate step).
df["loc_id"] = df["ent"].str.cat(df[["mun", "loc50k"]])

In [6]:
 # Cast the text columns that are used numerically to integers
for numeric_column in ("loc_id", "factor"):
    df[numeric_column] = df[numeric_column].astype(int)

In [7]:
# Inspect column dtypes and non-null counts of the loaded table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462568 entries, 0 to 462567
Data columns (total 87 columns):
id_viv                462568 non-null object
id_persona            462568 non-null object
ent                   462568 non-null object
nom_ent               462568 non-null object
mun                   462568 non-null object
nom_mun               462568 non-null object
loc50k                462568 non-null object
nom_loc               462568 non-null object
cobertura             462568 non-null object
estrato               462568 non-null object
upm                   462568 non-null object
factor                462568 non-null int32
numper                462568 non-null object
sexo                  462568 non-null object
edad                  462568 non-null object
parent                462568 non-null object
parent_otro_c         462568 non-null object
ident_madre           462568 non-null object
ident_padre           462568 non-null object
sersalud              462568 non-nu

In [8]:
  # Categorical columns to recode, plus the names they carry after translation
params = ["sexo", "parent", "sersalud", "dhsersal1", "nacionalidad"]
# Only these two columns get an English label; every other column keeps its
# name, so the translated list is derived instead of being maintained by hand.
spanish_to_english = {"sexo": "sex", "nacionalidad": "nationality"}
params_translated = [spanish_to_english.get(name, name) for name in params]

In [9]:
# Recode each categorical column using the worksheet that shares its name:
# the raw text codes are cast to int and then mapped prev_id -> id.
# Series.replace leaves any code that is absent from the sheet unchanged.
for sheet in params:
    df_l = pd.read_excel(df_labels, sheet)
    recode = dict(zip(df_l["prev_id"], df_l["id"]))
    df[sheet] = df[sheet].astype(int).replace(recode)

In [10]:
# Rename the Spanish column labels to the English names used from here on
# (these must agree with params_translated); "factor" becomes "population".
# Only column labels are changed: the row index is left as it is, and a new
# frame is bound to df instead of mutating in place.
df = df.rename(columns={"factor": "population", "nacionalidad": "nationality", "sexo": "sex"})

In [11]:
# Preview the recoded columns under their translated names
df[params_translated].head()

Unnamed: 0,sex,parent,sersalud,dhsersal1,nationality
0,2,2,5,8,1
1,1,1,1,8,1
2,1,3,6,8,1
3,1,3,6,8,1
4,2,3,6,8,1


In [12]:
# Condense the table to one row per combination of the recoded categories and
# loc_id, summing "population" (the former "factor" column).
# "population" is selected explicitly: the other remaining columns were loaded
# as text, and recent pandas versions no longer drop non-numeric columns from
# GroupBy.sum() automatically, so a bare .sum() would try to sum them as well.
df = df.groupby(params_translated + ["loc_id"])["population"].sum().reset_index()

In [13]:
# Preview the first rows of the aggregated table
df.head()


Unnamed: 0,sex,parent,sersalud,dhsersal1,nationality,loc_id,population
0,1,0,0,0,0,320070000,4
1,1,0,0,0,0,320080000,1
2,1,0,0,0,0,320100000,6
3,1,0,0,0,0,320110000,1
4,1,0,0,0,0,320120000,2


In [14]:
# List the distinct nationality codes present in the aggregated table
df["nationality"].unique()

array([0, 1, 2], dtype=int64)

In [15]:
# Check row count, dtypes and non-null counts of the aggregated table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17013 entries, 0 to 17012
Data columns (total 7 columns):
sex            17013 non-null int64
parent         17013 non-null int64
sersalud       17013 non-null int64
dhsersal1      17013 non-null int64
nationality    17013 non-null int64
loc_id         17013 non-null int64
population     17013 non-null int32
dtypes: int32(1), int64(6)
memory usage: 864.0 KB


In [16]:
# Boolean mask that is True wherever the "parent" column holds a missing value
pivote = df.loc[:, "parent"].isnull()

In [17]:
# Distinct values of the mask; a result containing only False means that no
# "parent" value is missing
pivote.unique()

array([False])