# Socio-Economic Data
The purpose of this document is to create one dataset containing the following socioeconomic factors:
- malnutrition
- poverty rate
- unemployment rate
- GDP per capita
- population

In [1]:
import pandas as pd

## Malnutrition per municipality

In [3]:
# Load the per-municipality malnutrition table and preview its first rows.
malnutrition = pd.read_csv(
    "../../transformed_data/socio_economic/desnutrição/malnutrition_per_municipality_ML.csv"
)
malnutrition.head(n=5)

Unnamed: 0,Ano,Código_IBGE,Magreza_total_%
0,2008,5300108,5.31
1,2008,5200050,8.14
2,2008,5200100,4.02
3,2008,5200134,0.97
4,2008,5200159,1.23


In [4]:
# Inspect column dtypes and non-null counts of the malnutrition table.
malnutrition.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89120 entries, 0 to 89119
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Ano              89120 non-null  int64  
 1   Código_IBGE      89120 non-null  int64  
 2   Magreza_total_%  89120 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.0 MB


## GDP per capita per UF

In [5]:
# Load the per-state (UF) GDP per capita table and preview its first rows.
# NOTE(review): unlike the other inputs, this path has no `.csv` extension —
# confirm it matches the file name on disk.
gdp = pd.read_csv(
    "../../transformed_data/socio_economic/pib/gdp_per_capita_per_uf_ML"
)
gdp.head(n=5)

Unnamed: 0,Código_UF,Ano,PIB
0,12,2000,3864.01
1,12,2001,4309.39
2,12,2002,4887.09
3,12,2003,5502.38
4,12,2004,6415.31


In [6]:
# Inspect column dtypes and non-null counts of the GDP table.
gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 567 entries, 0 to 566
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Código_UF  567 non-null    int64  
 1   Ano        567 non-null    int64  
 2   PIB        567 non-null    float64
dtypes: float64(1), int64(2)
memory usage: 13.4 KB


## Population per municipality

In [7]:
# Load the per-municipality population table and preview its first rows.
population = pd.read_csv(
    "../../transformed_data/socio_economic/population/population_per_municipality_ML.csv"
)
population.head(n=5)

Unnamed: 0,Código_IBGE,Ano,Populacao
0,3166600,2021,771
1,3166600,2020,776
2,3166600,2019,781
3,3166600,2018,786
4,3507209,2011,806


In [8]:
# Inspect column dtypes and non-null counts of the population table.
population.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83530 entries, 0 to 83529
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Código_IBGE  83530 non-null  int64
 1   Ano          83530 non-null  int64
 2   Populacao    83530 non-null  int64
dtypes: int64(3)
memory usage: 1.9 MB


## Poverty per UF

In [9]:
# Load the per-state (UF) poverty-rate table and preview its first rows.
poverty = pd.read_csv(
    "../../transformed_data/socio_economic/poverty/poverty_per_uf_ML.csv"
)
poverty.head(n=5)

Unnamed: 0,Código_UF,Ano,Poverty_%
0,11,2012,31.61
1,11,2013,29.5
2,11,2014,26.03
3,11,2015,28.92
4,11,2016,31.23


In [10]:
# Inspect column dtypes and non-null counts of the poverty table.
poverty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Código_UF  270 non-null    int64  
 1   Ano        270 non-null    int64  
 2   Poverty_%  270 non-null    float64
dtypes: float64(1), int64(2)
memory usage: 6.5 KB


## Unemployment per UF

In [11]:
# Load the per-state (UF) unemployment-rate table and preview its first rows.
# NOTE(review): the file name is spelled "unemployement" — kept as-is because
# it must match the file on disk.
unemployment = pd.read_csv(
    "../../transformed_data/socio_economic/unemployment/unemployement_per_UF_ML.csv"
)
unemployment.head(n=5)

Unnamed: 0,Código_UF,Ano,Unemployed_%
0,11,2018,9.05
1,11,2020,10.425
2,11,2017,8.225
3,11,2012,6.35
4,11,2016,7.925


In [12]:
# Inspect column dtypes and non-null counts of the unemployment table.
unemployment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Código_UF     297 non-null    int64  
 1   Ano           297 non-null    int64  
 2   Unemployed_%  297 non-null    float64
dtypes: float64(1), int64(2)
memory usage: 7.1 KB


## Merge dataframes

### Malnutrition + Population

In [14]:
# Compare (rows, columns) of the two municipality-level tables before merging.
malnutrition.shape, population.shape

((89120, 3), (83530, 3))

In [19]:
# Number of distinct municipality codes present in each table.
len(malnutrition["Código_IBGE"].unique()), len(population["Código_IBGE"].unique())

(5289, 5570)

In [25]:
# Combine population and malnutrition on municipality code and year; the inner
# join keeps only (Código_IBGE, Ano) pairs present in both tables.
# NOTE(review): the recorded outputs report 5289 municipalities in
# `malnutrition` vs 5570 in `population`, yet the downstream shape still shows
# all 83530 population rows — confirm the outputs come from a fresh
# Restart & Run All and that the join keys are unique in both tables.
malnutrition_and_population = pd.merge(population, malnutrition, on=['Código_IBGE', 'Ano'], how='inner')

In [26]:
# Display the merged population + malnutrition table.
malnutrition_and_population

Unnamed: 0,Código_IBGE,Ano,Populacao,Magreza_total_%
0,3166600,2021,771,2.70
1,3166600,2020,776,3.50
2,3166600,2019,781,1.59
3,3166600,2018,786,1.67
4,3507209,2011,806,0.00
...,...,...,...,...
83525,3550308,2017,12106920,2.42
83526,3550308,2018,12176866,2.50
83527,3550308,2019,12252023,2.85
83528,3550308,2020,12325232,2.53


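Add the state code column (`Código_UF`) by joining the municipality lookup table, so that the state-level indicators can be merged in afterwards.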

In [27]:
# Load the municipality lookup table and preview its first rows.
# NOTE(review): `br_minicipalicites` is a misspelling of "municipalities";
# renaming it requires updating every cell that references the variable.
br_minicipalicites = pd.read_csv(
    "../../raw_data/municipality_lookup.csv"
)
br_minicipalicites.head(n=5)

Unnamed: 0,Código_UF,Estado,UF,Código_IBGE,Município,Região
0,11,Rondônia,RO,1100015,Alta Floresta D'Oeste,Norte
1,11,Rondônia,RO,1100023,Ariquemes,Norte
2,11,Rondônia,RO,1100031,Cabixi,Norte
3,11,Rondônia,RO,1100049,Cacoal,Norte
4,11,Rondônia,RO,1100056,Cerejeiras,Norte


In [28]:
# Attach the lookup columns (including the state code `Código_UF`) to every
# municipality-year row. The left join keeps rows whose municipality is missing
# from the lookup; their lookup columns are then NaN.
# `validate="many_to_one"` makes pandas raise a MergeError if the lookup holds
# duplicate `Código_IBGE` keys, which would otherwise silently duplicate rows.
malnutrition_and_population_UF = pd.merge(
    malnutrition_and_population,
    br_minicipalicites,
    on=['Código_IBGE'],
    how='left',
    validate="many_to_one",
)

In [30]:
# Remove the descriptive lookup columns; only `Código_UF` is needed as the key
# for the state-level merges that follow.
# `errors="ignore"` keeps this self-reassigning cell re-runnable: without it a
# second execution raises KeyError because the columns are already gone.
malnutrition_and_population_UF = malnutrition_and_population_UF.drop(
    columns=["Estado", "UF", "Município", "Região"],
    errors="ignore",
)

In [31]:
# Preview the table after adding the state code column.
malnutrition_and_population_UF.head()

Unnamed: 0,Código_IBGE,Ano,Populacao,Magreza_total_%,Código_UF
0,3166600,2021,771,2.7,31
1,3166600,2020,776,3.5,31
2,3166600,2019,781,1.59,31
3,3166600,2018,786,1.67,31
4,3507209,2011,806,0.0,35


### Malnutrition + Population + GDP

In [32]:
# Compare (rows, columns) of the municipality table and the GDP table before merging.
malnutrition_and_population_UF.shape, gdp.shape

((83530, 5), (567, 3))

In [37]:
# Number of distinct state codes on each side of the upcoming GDP merge.
len(malnutrition_and_population_UF["Código_UF"].unique()), len(gdp["Código_UF"].unique())

(27, 27)

In [55]:
# Add state-level GDP per capita (`PIB`) to each municipality-year row, matching
# on state code and year. The inner join drops municipality-years that have no
# GDP record for their state and year.
# `validate="many_to_one"` makes pandas raise a MergeError if `gdp` contains
# more than one row per (Código_UF, Ano), which would silently duplicate rows.
malnutrition_population_gdp_UF = pd.merge(
    malnutrition_and_population_UF,
    gdp,
    on=['Código_UF', 'Ano'],
    how='inner',
    validate="many_to_one",
)

In [56]:
# Display the table after the GDP merge.
malnutrition_population_gdp_UF

Unnamed: 0,Código_IBGE,Ano,Populacao,Magreza_total_%,Código_UF,PIB
0,3166600,2020,776,3.50,31,32066.73000
1,3115607,2020,1157,7.14,31,32066.73000
2,3127909,2020,1387,4.55,31,32066.73000
3,3165800,2020,1461,2.70,31,32066.73000
4,3164803,2020,1492,0.00,31,32066.73000
...,...,...,...,...,...,...
72385,5300108,2018,2974703,3.61,53,85661.39366
72386,5300108,2016,2977216,4.45,53,79114.19420
72387,5300108,2019,3015268,4.26,53,90742.75039
72388,5300108,2017,3039444,4.25,53,80515.46577


### Malnutrition + Population + GDP + Poverty

In [52]:
# Number of distinct state codes remaining after the GDP merge.
len(malnutrition_population_gdp_UF["Código_UF"].unique())

27

In [57]:
# Add the state-level poverty rate (`Poverty_%`) to each municipality-year row,
# matching on state code and year. The inner join drops municipality-years that
# have no poverty record for their state and year.
# `validate="many_to_one"` makes pandas raise a MergeError if `poverty` contains
# more than one row per (Código_UF, Ano), which would silently duplicate rows.
malnutrition_population_gdp_poverty_UF = pd.merge(
    malnutrition_population_gdp_UF,
    poverty,
    on=['Código_UF', 'Ano'],
    how='inner',
    validate="many_to_one",
)

In [58]:
# Display the table after the poverty merge.
malnutrition_population_gdp_poverty_UF

Unnamed: 0,Código_IBGE,Ano,Populacao,Magreza_total_%,Código_UF,PIB,Poverty_%
0,3166600,2020,776,3.50,31,32066.73000,17.06
1,3115607,2020,1157,7.14,31,32066.73000,17.06
2,3127909,2020,1387,4.55,31,32066.73000,17.06
3,3165800,2020,1461,2.70,31,32066.73000,17.06
4,3164803,2020,1492,0.00,31,32066.73000,17.06
...,...,...,...,...,...,...,...
50125,5300108,2018,2974703,3.61,53,85661.39366,13.70
50126,5300108,2016,2977216,4.45,53,79114.19420,15.16
50127,5300108,2019,3015268,4.26,53,90742.75039,11.82
50128,5300108,2017,3039444,4.25,53,80515.46577,15.22


### Malnutrition + Population + GDP + Poverty + Unemployment

In [59]:
# Add the state-level unemployment rate (`Unemployed_%`) to each
# municipality-year row, matching on state code and year. The inner join drops
# municipality-years that have no unemployment record for their state and year.
# `validate="many_to_one"` makes pandas raise a MergeError if `unemployment`
# contains more than one row per (Código_UF, Ano), which would silently
# duplicate rows.
socio_economic_data = pd.merge(
    malnutrition_population_gdp_poverty_UF,
    unemployment,
    on=['Código_UF', 'Ano'],
    how='inner',
    validate="many_to_one",
)

In [61]:
# The state code was only needed as a join key for the state-level tables, so
# remove it from the final dataset.
# `errors="ignore"` keeps this self-reassigning cell re-runnable: without it a
# second execution raises KeyError because the column is already gone.
socio_economic_data = socio_economic_data.drop(columns="Código_UF", errors="ignore")

In [63]:
# Remove any row that still contains a missing value.
# NOTE(review): the recorded output under this cell is a DataFrame, but an
# assignment displays nothing — the cell was presumably edited after it last
# ran; confirm with a fresh Restart & Run All.
socio_economic_data = socio_economic_data.dropna()

Unnamed: 0,Código_IBGE,Ano,Populacao,Magreza_total_%,PIB,Poverty_%,Unemployed_%
0,3166600,2020,776,3.50,32066.73000,17.06,12.475
1,3115607,2020,1157,7.14,32066.73000,17.06,12.475
2,3127909,2020,1387,4.55,32066.73000,17.06,12.475
3,3165800,2020,1461,2.70,32066.73000,17.06,12.475
4,3164803,2020,1492,0.00,32066.73000,17.06,12.475
...,...,...,...,...,...,...,...
50125,5300108,2018,2974703,3.61,85661.39366,13.70,12.725
50126,5300108,2016,2977216,4.45,79114.19420,15.16,12.000
50127,5300108,2019,3015268,4.26,90742.75039,11.82,13.375
50128,5300108,2017,3039444,4.25,80515.46577,15.22,13.175


In [64]:
# Persist the combined dataset as CSV, leaving the DataFrame index out of the file.
file_path = "../../transformed_data/socio_economic/socioeconomic_ML.csv"
socio_economic_data.to_csv(
    path_or_buf=file_path,
    index=False,
)

In [66]:
# Read the written file back and inspect its columns, dtypes and non-null counts.
pd.read_csv(file_path).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50130 entries, 0 to 50129
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Código_IBGE      50130 non-null  int64  
 1   Ano              50130 non-null  int64  
 2   Populacao        50130 non-null  int64  
 3   Magreza_total_%  50130 non-null  float64
 4   PIB              50130 non-null  float64
 5   Poverty_%        50130 non-null  float64
 6   Unemployed_%     50130 non-null  float64
dtypes: float64(4), int64(3)
memory usage: 2.7 MB
