# Data preparation for Machine Learning

This notebook combines all the prepared datasets into a single table, which will serve as the input for the ML/DL models.

In [1]:
import pandas as pd

## Education datasets

Link to [Education Notebook](../notebooks/education/School_all_data(Anand).ipynb)

In [8]:
# Read the four education tables in one pass; the order of the paths
# matches the order of the names on the left-hand side.
education_expanded, education_pivoted, education_rural, education_urban = map(
    pd.read_csv,
    [
        "../transformed_data/education/school_expanded_data_ML.csv",
        "../transformed_data/education/school_pivoted_data_ML.csv",
        "../transformed_data/education/school_rural_data_ML.csv",
        "../transformed_data/education/school_urbana_data_ML.csv",
    ],
)

In [9]:
# (rows, columns) of each education table, in load order.
tuple(
    frame.shape
    for frame in (education_expanded, education_pivoted, education_rural, education_urban)
)

((102889, 10), (19542, 16), (19851, 9), (83038, 9))

In [3]:
# Sanity-check the (rows, columns) of the expanded education table.
# The name `education` is never defined in this notebook, so the original
# expression failed on a fresh kernel; `education_expanded` is the frame
# whose shape matches the recorded output.
education_expanded.shape

(102889, 10)

## Socio-economic dataset

Link to [Socio-Economic Notebook](../notebooks/socio_economic/Socio-Economic%20Data%20(Anand).ipynb)

In [10]:
# Socio-economic indicators table, loaded from the transformed-data folder.
socio_economic_data = pd.read_csv(
    filepath_or_buffer="../transformed_data/socio_economic/socioeconomic_ML.csv"
)

In [23]:
# Display the loaded socio-economic table (pandas truncates the middle rows).
socio_economic_data

Unnamed: 0,Código_IBGE,Ano,Populacao,Magreza_total_%,PIB,Poverty_%,Unemployed_%
0,3166600,2020,776,3.50,32066.73000,17.06,12.475
1,3115607,2020,1157,7.14,32066.73000,17.06,12.475
2,3127909,2020,1387,4.55,32066.73000,17.06,12.475
3,3165800,2020,1461,2.70,32066.73000,17.06,12.475
4,3164803,2020,1492,0.00,32066.73000,17.06,12.475
...,...,...,...,...,...,...,...
50125,5300108,2018,2974703,3.61,85661.39366,13.70,12.725
50126,5300108,2016,2977216,4.45,79114.19420,15.16,12.000
50127,5300108,2019,3015268,4.26,90742.75039,11.82,13.375
50128,5300108,2017,3039444,4.25,80515.46577,15.22,13.175


## Funding

Link to [Funding Notebook](../notebooks/funding/Funding(Anand).ipynb)

In [12]:
# Funding table, loaded from the transformed-data folder.
funding = pd.read_csv(
    filepath_or_buffer="../transformed_data/funding/funding_per_municipality_ML.csv"
)

In [13]:
# Check the size of the funding table as (rows, columns).
funding.shape

(83490, 3)

## Internet Access
Link to [Internet access Notebook](../notebooks/Internet%20Access%20(Anand).ipynb)

In [120]:
# Internet-access percentages, loaded and displayed.
internet_access = pd.read_csv(
    filepath_or_buffer="../transformed_data/acess_to_internet.csv"
)
internet_access

Unnamed: 0,Ano,Região,Acesso a internet %
0,2008,Sudeste,25
1,2008,Nordeste,7
2,2008,Sul,20
3,2008,Norte,7
4,2008,Centro Oeste,21
...,...,...,...
70,2022,Sudeste,82
71,2022,Nordeste,78
72,2022,Sul,81
73,2022,Norte,76


## Brazil municipalities

In [47]:
# Municipality lookup table; list its distinct region names alphabetically.
municipalities = pd.read_csv(
    filepath_or_buffer="../raw_data/municipality_lookup.csv"
)
sorted(municipalities["Região"].unique())

['Centro Oeste', 'Nordeste', 'Norte', 'Sudeste', 'Sul']

## Merge datasets

### education + socio-economic

In [26]:
# Inner join: keep only the (Código_IBGE, Ano) pairs present in both tables.
# NOTE(review): the rendered output shows Matrículas values such as 5.069
# next to 276 Docentes and 161 Turmas - possibly a thousands separator that
# was parsed as a decimal point upstream; verify in the education notebook.
education_exp_socio_econ_df = education_expanded.merge(
    socio_economic_data,
    how='inner',
    on=['Código_IBGE', 'Ano'],
)
education_exp_socio_econ_df

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,Populacao,Magreza_total_%,PIB,Poverty_%,Unemployed_%
0,2012,1100023,Rural,92.6,4.0,3.4,498,42,1,15,92747,2.03,18938.68679,31.61,6.350
1,2012,1100023,Urbana,69.9,16.6,13.5,3453,172,6,136,92747,2.03,18938.68679,31.61,6.350
2,2012,1100031,Rural,92.1,2.6,5.3,38,6,1,3,6132,8.80,18938.68679,31.61,6.350
3,2012,1100031,Urbana,91.3,5.2,3.5,230,22,1,11,6132,8.80,18938.68679,31.61,6.350
4,2012,1100064,Rural,90.6,9.2,0.2,415,42,1,15,18093,6.92,18938.68679,31.61,6.350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62191,2020,5221858,Urbana,93.0,5.9,1.1,5.069,276,12,161.0,172135,5.88,31506.97000,18.05,12.425
62192,2020,5221908,Urbana,100.0,0.0,0.0,125.0,12,1,6.0,3838,8.70,31506.97000,18.05,12.425
62193,2020,5222005,Urbana,97.2,2.6,0.2,496.0,23,1,19.0,13977,5.31,31506.97000,18.05,12.425
62194,2020,5222054,Urbana,99.2,0.8,0.0,362.0,18,1,15.0,8873,3.79,31506.97000,18.05,12.425


In [27]:
# Inner join of the pivoted education table with the socio-economic table
# on municipality code and year.
education_pv_socio_econ_df = education_pivoted.merge(
    socio_economic_data,
    how='inner',
    on=['Código_IBGE', 'Ano'],
)
education_pv_socio_econ_df

Unnamed: 0,Ano,Código_IBGE,Aprovação_Rural,Aprovação_Urbana,Reprovação_Rural,Reprovação_Urbana,Abandono_Rural,Abandono_Urbana,Matrículas_Rural,Matrículas_Urbana,...,Docentes_Urbana,Estabelecimentos_Rural,Estabelecimentos_Urbana,Turmas_Rural,Turmas_Urbana,Populacao,Magreza_total_%,PIB,Poverty_%,Unemployed_%
0,2012,1100023,92.6,69.9,4.0,16.6,3.4,13.5,498,3453,...,172,1,6,15.0,136,92747,2.03,18938.68679,31.61,6.350
1,2012,1100031,92.1,91.3,2.6,5.2,5.3,3.5,38,230,...,22,1,1,3.0,11,6132,8.80,18938.68679,31.61,6.350
2,2012,1100064,90.6,86.9,9.2,5.6,0.2,7.5,415,733,...,54,1,3,15.0,32,18093,6.92,18938.68679,31.61,6.350
3,2012,1100072,91.4,83.3,5.3,4.4,3.3,12.3,155,237,...,19,2,1,6.0,13,8530,4.70,18938.68679,31.61,6.350
4,2012,1100080,60.3,78.0,18.1,4.9,21.6,17.1,192,380,...,28,1,1,8.0,14,14355,4.23,18938.68679,31.61,6.350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12146,2020,5220702,87.5,99.0,12.5,0.0,0.0,1.0,8.0,103.0,...,12,1,1,1.0,6.0,3001,6.39,31506.97000,18.05,12.425
12147,2020,5221080,100.0,100.0,0.0,0.0,0.0,0.0,3.0,145.0,...,16,1,1,2.0,7.0,3498,3.70,31506.97000,18.05,12.425
12148,2020,5221809,52.1,98.2,22.3,1.8,25.6,0.0,565.0,63.0,...,11,1,1,21.0,4.0,3066,8.82,31506.97000,18.05,12.425
12149,2020,5222302,99.3,96.1,0.0,3.1,0.7,0.8,142.0,132.0,...,14,1,1,5.0,6.0,5882,4.55,31506.97000,18.05,12.425


In [28]:
# Inner join of the rural education table with the socio-economic table
# on municipality code and year.
education_rural_socio_econ_df = education_rural.merge(
    socio_economic_data,
    how='inner',
    on=['Código_IBGE', 'Ano'],
)
education_rural_socio_econ_df

Unnamed: 0,Ano,Código_IBGE,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,Populacao,Magreza_total_%,PIB,Poverty_%,Unemployed_%
0,2012,1100023,92.6,4.0,3.4,498,42,1,15.0,92747,2.03,18938.68679,31.61,6.350
1,2012,1100031,92.1,2.6,5.3,38,6,1,3.0,6132,8.80,18938.68679,31.61,6.350
2,2012,1100064,90.6,9.2,0.2,415,42,1,15.0,18093,6.92,18938.68679,31.61,6.350
3,2012,1100072,91.4,5.3,3.3,155,12,2,6.0,8530,4.70,18938.68679,31.61,6.350
4,2012,1100080,60.3,18.1,21.6,192,10,1,8.0,14355,4.23,18938.68679,31.61,6.350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12338,2020,5220702,87.5,12.5,0.0,8.0,1,1,1.0,3001,6.39,31506.97000,18.05,12.425
12339,2020,5221080,100.0,0.0,0.0,3.0,3,1,2.0,3498,3.70,31506.97000,18.05,12.425
12340,2020,5221809,52.1,22.3,25.6,565.0,80,1,21.0,3066,8.82,31506.97000,18.05,12.425
12341,2020,5222302,99.3,0.0,0.7,142.0,9,1,5.0,5882,4.55,31506.97000,18.05,12.425


In [29]:
# Inner join of the urban education table with the socio-economic table
# on municipality code and year.
education_urban_socio_econ_df = education_urban.merge(
    socio_economic_data,
    how='inner',
    on=['Código_IBGE', 'Ano'],
)
education_urban_socio_econ_df

Unnamed: 0,Ano,Código_IBGE,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,Populacao,Magreza_total_%,PIB,Poverty_%,Unemployed_%
0,2012,1100015,78.4,8.0,13.6,1099,104,2,55,24069,5.06,18938.68679,31.61,6.350
1,2012,1100023,69.9,16.6,13.5,3453,172,6,136,92747,2.03,18938.68679,31.61,6.350
2,2012,1100031,91.3,5.2,3.5,230,22,1,11,6132,8.80,18938.68679,31.61,6.350
3,2012,1100049,79.5,9.6,10.9,3413,235,10,140,79330,4.13,18938.68679,31.61,6.350
4,2012,1100056,76.9,12.7,10.4,729,42,2,30,16852,6.22,18938.68679,31.61,6.350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49848,2020,5222005,97.2,2.6,0.2,496.0,23,1,19.0,13977,5.31,31506.97000,18.05,12.425
49849,2020,5222054,99.2,0.8,0.0,362.0,18,1,15.0,8873,3.79,31506.97000,18.05,12.425
49850,2020,5222203,100.0,0.0,0.0,229.0,11,1,9.0,6312,2.63,31506.97000,18.05,12.425
49851,2020,5222302,96.1,3.1,0.8,132.0,14,1,6.0,5882,4.55,31506.97000,18.05,12.425


### Education + socio-economic + funding

In [30]:
# Add the funding columns; the inner join drops municipality-year rows
# that have no funding record.
education_exp_socio_econ_fn_df = education_exp_socio_econ_df.merge(
    funding,
    how='inner',
    on=['Código_IBGE', 'Ano'],
)
education_exp_socio_econ_fn_df

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,Populacao,Magreza_total_%,PIB,Poverty_%,Unemployed_%,Valor_Consolidado
0,2012,1100023,Rural,92.6,4.0,3.4,498,42,1,15,92747,2.03,18938.68679,31.61,6.350,3.551779e+07
1,2012,1100023,Urbana,69.9,16.6,13.5,3453,172,6,136,92747,2.03,18938.68679,31.61,6.350,3.551779e+07
2,2012,1100031,Rural,92.1,2.6,5.3,38,6,1,3,6132,8.80,18938.68679,31.61,6.350,1.521377e+06
3,2012,1100031,Urbana,91.3,5.2,3.5,230,22,1,11,6132,8.80,18938.68679,31.61,6.350,1.521377e+06
4,2012,1100064,Rural,90.6,9.2,0.2,415,42,1,15,18093,6.92,18938.68679,31.61,6.350,2.814809e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62169,2020,5221858,Urbana,93.0,5.9,1.1,5.069,276,12,161.0,172135,5.88,31506.97000,18.05,12.425,1.073406e+08
62170,2020,5221908,Urbana,100.0,0.0,0.0,125.0,12,1,6.0,3838,8.70,31506.97000,18.05,12.425,1.476294e+06
62171,2020,5222005,Urbana,97.2,2.6,0.2,496.0,23,1,19.0,13977,5.31,31506.97000,18.05,12.425,7.186561e+06
62172,2020,5222054,Urbana,99.2,0.8,0.0,362.0,18,1,15.0,8873,3.79,31506.97000,18.05,12.425,6.174362e+06


In [31]:
# Add the funding columns to the pivoted table (inner join on
# municipality code and year).
education_pv_socio_econ_fn_df = education_pv_socio_econ_df.merge(
    funding,
    how='inner',
    on=['Código_IBGE', 'Ano'],
)
education_pv_socio_econ_fn_df

Unnamed: 0,Ano,Código_IBGE,Aprovação_Rural,Aprovação_Urbana,Reprovação_Rural,Reprovação_Urbana,Abandono_Rural,Abandono_Urbana,Matrículas_Rural,Matrículas_Urbana,...,Estabelecimentos_Rural,Estabelecimentos_Urbana,Turmas_Rural,Turmas_Urbana,Populacao,Magreza_total_%,PIB,Poverty_%,Unemployed_%,Valor_Consolidado
0,2012,1100023,92.6,69.9,4.0,16.6,3.4,13.5,498,3453,...,1,6,15.0,136,92747,2.03,18938.68679,31.61,6.350,35517787.24
1,2012,1100031,92.1,91.3,2.6,5.2,5.3,3.5,38,230,...,1,1,3.0,11,6132,8.80,18938.68679,31.61,6.350,1521376.74
2,2012,1100064,90.6,86.9,9.2,5.6,0.2,7.5,415,733,...,1,3,15.0,32,18093,6.92,18938.68679,31.61,6.350,2814808.69
3,2012,1100072,91.4,83.3,5.3,4.4,3.3,12.3,155,237,...,2,1,6.0,13,8530,4.70,18938.68679,31.61,6.350,2284673.55
4,2012,1100080,60.3,78.0,18.1,4.9,21.6,17.1,192,380,...,1,1,8.0,14,14355,4.23,18938.68679,31.61,6.350,5258803.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12142,2020,5220702,87.5,99.0,12.5,0.0,0.0,1.0,8.0,103.0,...,1,1,1.0,6.0,3001,6.39,31506.97000,18.05,12.425,1237056.38
12143,2020,5221080,100.0,100.0,0.0,0.0,0.0,0.0,3.0,145.0,...,1,1,2.0,7.0,3498,3.70,31506.97000,18.05,12.425,1610351.89
12144,2020,5221809,52.1,98.2,22.3,1.8,25.6,0.0,565.0,63.0,...,1,1,21.0,4.0,3066,8.82,31506.97000,18.05,12.425,1452932.37
12145,2020,5222302,99.3,96.1,0.0,3.1,0.7,0.8,142.0,132.0,...,1,1,5.0,6.0,5882,4.55,31506.97000,18.05,12.425,3128786.75


In [32]:
# Add the funding columns to the rural table (inner join on
# municipality code and year).
education_rural_socio_econ_fn_df = education_rural_socio_econ_df.merge(
    funding,
    how='inner',
    on=['Código_IBGE', 'Ano'],
)
education_rural_socio_econ_fn_df

Unnamed: 0,Ano,Código_IBGE,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,Populacao,Magreza_total_%,PIB,Poverty_%,Unemployed_%,Valor_Consolidado
0,2012,1100023,92.6,4.0,3.4,498,42,1,15.0,92747,2.03,18938.68679,31.61,6.350,35517787.24
1,2012,1100031,92.1,2.6,5.3,38,6,1,3.0,6132,8.80,18938.68679,31.61,6.350,1521376.74
2,2012,1100064,90.6,9.2,0.2,415,42,1,15.0,18093,6.92,18938.68679,31.61,6.350,2814808.69
3,2012,1100072,91.4,5.3,3.3,155,12,2,6.0,8530,4.70,18938.68679,31.61,6.350,2284673.55
4,2012,1100080,60.3,18.1,21.6,192,10,1,8.0,14355,4.23,18938.68679,31.61,6.350,5258803.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12333,2020,5220702,87.5,12.5,0.0,8.0,1,1,1.0,3001,6.39,31506.97000,18.05,12.425,1237056.38
12334,2020,5221080,100.0,0.0,0.0,3.0,3,1,2.0,3498,3.70,31506.97000,18.05,12.425,1610351.89
12335,2020,5221809,52.1,22.3,25.6,565.0,80,1,21.0,3066,8.82,31506.97000,18.05,12.425,1452932.37
12336,2020,5222302,99.3,0.0,0.7,142.0,9,1,5.0,5882,4.55,31506.97000,18.05,12.425,3128786.75


In [33]:
# Add the funding columns to the urban table (inner join on
# municipality code and year).
education_urban_socio_econ_fn_df = education_urban_socio_econ_df.merge(
    funding,
    how='inner',
    on=['Código_IBGE', 'Ano'],
)
education_urban_socio_econ_fn_df

Unnamed: 0,Ano,Código_IBGE,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,Populacao,Magreza_total_%,PIB,Poverty_%,Unemployed_%,Valor_Consolidado
0,2012,1100015,78.4,8.0,13.6,1099,104,2,55,24069,5.06,18938.68679,31.61,6.350,6773134.92
1,2012,1100023,69.9,16.6,13.5,3453,172,6,136,92747,2.03,18938.68679,31.61,6.350,35517787.24
2,2012,1100031,91.3,5.2,3.5,230,22,1,11,6132,8.80,18938.68679,31.61,6.350,1521376.74
3,2012,1100049,79.5,9.6,10.9,3413,235,10,140,79330,4.13,18938.68679,31.61,6.350,18166669.91
4,2012,1100056,76.9,12.7,10.4,729,42,2,30,16852,6.22,18938.68679,31.61,6.350,3954495.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49831,2020,5222005,97.2,2.6,0.2,496.0,23,1,19.0,13977,5.31,31506.97000,18.05,12.425,7186560.98
49832,2020,5222054,99.2,0.8,0.0,362.0,18,1,15.0,8873,3.79,31506.97000,18.05,12.425,6174361.84
49833,2020,5222203,100.0,0.0,0.0,229.0,11,1,9.0,6312,2.63,31506.97000,18.05,12.425,4461612.07
49834,2020,5222302,96.1,3.1,0.8,132.0,14,1,6.0,5882,4.55,31506.97000,18.05,12.425,3128786.75


### Education + socio-economic + funding + municipalities

In [34]:
# Attach the municipality lookup columns; the left join keeps every row
# of the expanded table even when the lookup has no match.
education_exp_socio_econ_fn_br_df = education_exp_socio_econ_fn_df.merge(
    municipalities,
    how='left',
    on=['Código_IBGE'],
)
education_exp_socio_econ_fn_br_df

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,...,Magreza_total_%,PIB,Poverty_%,Unemployed_%,Valor_Consolidado,Código_UF,Estado,UF,Município,Região
0,2012,1100023,Rural,92.6,4.0,3.4,498,42,1,15,...,2.03,18938.68679,31.61,6.350,3.551779e+07,11,Rondônia,RO,Ariquemes,Norte
1,2012,1100023,Urbana,69.9,16.6,13.5,3453,172,6,136,...,2.03,18938.68679,31.61,6.350,3.551779e+07,11,Rondônia,RO,Ariquemes,Norte
2,2012,1100031,Rural,92.1,2.6,5.3,38,6,1,3,...,8.80,18938.68679,31.61,6.350,1.521377e+06,11,Rondônia,RO,Cabixi,Norte
3,2012,1100031,Urbana,91.3,5.2,3.5,230,22,1,11,...,8.80,18938.68679,31.61,6.350,1.521377e+06,11,Rondônia,RO,Cabixi,Norte
4,2012,1100064,Rural,90.6,9.2,0.2,415,42,1,15,...,6.92,18938.68679,31.61,6.350,2.814809e+06,11,Rondônia,RO,Colorado do Oeste,Norte
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62169,2020,5221858,Urbana,93.0,5.9,1.1,5.069,276,12,161.0,...,5.88,31506.97000,18.05,12.425,1.073406e+08,52,Goiás,GO,Valparaíso de Goiás,Centro Oeste
62170,2020,5221908,Urbana,100.0,0.0,0.0,125.0,12,1,6.0,...,8.70,31506.97000,18.05,12.425,1.476294e+06,52,Goiás,GO,Varjão,Centro Oeste
62171,2020,5222005,Urbana,97.2,2.6,0.2,496.0,23,1,19.0,...,5.31,31506.97000,18.05,12.425,7.186561e+06,52,Goiás,GO,Vianópolis,Centro Oeste
62172,2020,5222054,Urbana,99.2,0.8,0.0,362.0,18,1,15.0,...,3.79,31506.97000,18.05,12.425,6.174362e+06,52,Goiás,GO,Vicentinópolis,Centro Oeste


In [36]:
# Attach the municipality lookup columns to the pivoted table (left join
# on the municipality code).
education_pv_socio_econ_fn_br_df = education_pv_socio_econ_fn_df.merge(
    municipalities,
    how='left',
    on=['Código_IBGE'],
)
education_pv_socio_econ_fn_br_df

Unnamed: 0,Ano,Código_IBGE,Aprovação_Rural,Aprovação_Urbana,Reprovação_Rural,Reprovação_Urbana,Abandono_Rural,Abandono_Urbana,Matrículas_Rural,Matrículas_Urbana,...,Magreza_total_%,PIB,Poverty_%,Unemployed_%,Valor_Consolidado,Código_UF,Estado,UF,Município,Região
0,2012,1100023,92.6,69.9,4.0,16.6,3.4,13.5,498,3453,...,2.03,18938.68679,31.61,6.350,35517787.24,11,Rondônia,RO,Ariquemes,Norte
1,2012,1100031,92.1,91.3,2.6,5.2,5.3,3.5,38,230,...,8.80,18938.68679,31.61,6.350,1521376.74,11,Rondônia,RO,Cabixi,Norte
2,2012,1100064,90.6,86.9,9.2,5.6,0.2,7.5,415,733,...,6.92,18938.68679,31.61,6.350,2814808.69,11,Rondônia,RO,Colorado do Oeste,Norte
3,2012,1100072,91.4,83.3,5.3,4.4,3.3,12.3,155,237,...,4.70,18938.68679,31.61,6.350,2284673.55,11,Rondônia,RO,Corumbiara,Norte
4,2012,1100080,60.3,78.0,18.1,4.9,21.6,17.1,192,380,...,4.23,18938.68679,31.61,6.350,5258803.02,11,Rondônia,RO,Costa Marques,Norte
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12142,2020,5220702,87.5,99.0,12.5,0.0,0.0,1.0,8.0,103.0,...,6.39,31506.97000,18.05,12.425,1237056.38,52,Goiás,GO,Sítio d'Abadia,Centro Oeste
12143,2020,5221080,100.0,100.0,0.0,0.0,0.0,0.0,3.0,145.0,...,3.70,31506.97000,18.05,12.425,1610351.89,52,Goiás,GO,Teresina de Goiás,Centro Oeste
12144,2020,5221809,52.1,98.2,22.3,1.8,25.6,0.0,565.0,63.0,...,8.82,31506.97000,18.05,12.425,1452932.37,52,Goiás,GO,Urutaí,Centro Oeste
12145,2020,5222302,99.3,96.1,0.0,3.1,0.7,0.8,142.0,132.0,...,4.55,31506.97000,18.05,12.425,3128786.75,52,Goiás,GO,Vila Propício,Centro Oeste


In [37]:
# Attach the municipality lookup columns to the rural table (left join
# on the municipality code).
education_rural_socio_econ_fn_br_df = education_rural_socio_econ_fn_df.merge(
    municipalities,
    how='left',
    on=['Código_IBGE'],
)
education_rural_socio_econ_fn_br_df

Unnamed: 0,Ano,Código_IBGE,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,Populacao,Magreza_total_%,PIB,Poverty_%,Unemployed_%,Valor_Consolidado,Código_UF,Estado,UF,Município,Região
0,2012,1100023,92.6,4.0,3.4,498,42,1,15.0,92747,2.03,18938.68679,31.61,6.350,35517787.24,11,Rondônia,RO,Ariquemes,Norte
1,2012,1100031,92.1,2.6,5.3,38,6,1,3.0,6132,8.80,18938.68679,31.61,6.350,1521376.74,11,Rondônia,RO,Cabixi,Norte
2,2012,1100064,90.6,9.2,0.2,415,42,1,15.0,18093,6.92,18938.68679,31.61,6.350,2814808.69,11,Rondônia,RO,Colorado do Oeste,Norte
3,2012,1100072,91.4,5.3,3.3,155,12,2,6.0,8530,4.70,18938.68679,31.61,6.350,2284673.55,11,Rondônia,RO,Corumbiara,Norte
4,2012,1100080,60.3,18.1,21.6,192,10,1,8.0,14355,4.23,18938.68679,31.61,6.350,5258803.02,11,Rondônia,RO,Costa Marques,Norte
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12333,2020,5220702,87.5,12.5,0.0,8.0,1,1,1.0,3001,6.39,31506.97000,18.05,12.425,1237056.38,52,Goiás,GO,Sítio d'Abadia,Centro Oeste
12334,2020,5221080,100.0,0.0,0.0,3.0,3,1,2.0,3498,3.70,31506.97000,18.05,12.425,1610351.89,52,Goiás,GO,Teresina de Goiás,Centro Oeste
12335,2020,5221809,52.1,22.3,25.6,565.0,80,1,21.0,3066,8.82,31506.97000,18.05,12.425,1452932.37,52,Goiás,GO,Urutaí,Centro Oeste
12336,2020,5222302,99.3,0.0,0.7,142.0,9,1,5.0,5882,4.55,31506.97000,18.05,12.425,3128786.75,52,Goiás,GO,Vila Propício,Centro Oeste


In [38]:
# Attach the municipality lookup columns to the urban table (left join
# on the municipality code).
education_urban_socio_econ_fn_br_df = education_urban_socio_econ_fn_df.merge(
    municipalities,
    how='left',
    on=['Código_IBGE'],
)
education_urban_socio_econ_fn_br_df

Unnamed: 0,Ano,Código_IBGE,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,Populacao,Magreza_total_%,PIB,Poverty_%,Unemployed_%,Valor_Consolidado,Código_UF,Estado,UF,Município,Região
0,2012,1100015,78.4,8.0,13.6,1099,104,2,55,24069,5.06,18938.68679,31.61,6.350,6773134.92,11,Rondônia,RO,Alta Floresta D'Oeste,Norte
1,2012,1100023,69.9,16.6,13.5,3453,172,6,136,92747,2.03,18938.68679,31.61,6.350,35517787.24,11,Rondônia,RO,Ariquemes,Norte
2,2012,1100031,91.3,5.2,3.5,230,22,1,11,6132,8.80,18938.68679,31.61,6.350,1521376.74,11,Rondônia,RO,Cabixi,Norte
3,2012,1100049,79.5,9.6,10.9,3413,235,10,140,79330,4.13,18938.68679,31.61,6.350,18166669.91,11,Rondônia,RO,Cacoal,Norte
4,2012,1100056,76.9,12.7,10.4,729,42,2,30,16852,6.22,18938.68679,31.61,6.350,3954495.66,11,Rondônia,RO,Cerejeiras,Norte
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49831,2020,5222005,97.2,2.6,0.2,496.0,23,1,19.0,13977,5.31,31506.97000,18.05,12.425,7186560.98,52,Goiás,GO,Vianópolis,Centro Oeste
49832,2020,5222054,99.2,0.8,0.0,362.0,18,1,15.0,8873,3.79,31506.97000,18.05,12.425,6174361.84,52,Goiás,GO,Vicentinópolis,Centro Oeste
49833,2020,5222203,100.0,0.0,0.0,229.0,11,1,9.0,6312,2.63,31506.97000,18.05,12.425,4461612.07,52,Goiás,GO,Vila Boa,Centro Oeste
49834,2020,5222302,96.1,3.1,0.8,132.0,14,1,6.0,5882,4.55,31506.97000,18.05,12.425,3128786.75,52,Goiás,GO,Vila Propício,Centro Oeste


### Education + socio-economic + funding + municipalities + internet

In [102]:
# Attach the internet-access percentage by region and year; the left join
# keeps rows for which no percentage is available.
all_expanded = education_exp_socio_econ_fn_br_df.merge(
    internet_access,
    how='left',
    on=['Região', 'Ano'],
)
all_expanded

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,...,PIB,Poverty_%,Unemployed_%,Valor_Consolidado,Código_UF,Estado,UF,Município,Região,Acesso a internet %
0,2012,1100023,Rural,92.6,4.0,3.4,498,42,1,15,...,18938.68679,31.61,6.350,3.551779e+07,11,Rondônia,RO,Ariquemes,Norte,21
1,2012,1100023,Urbana,69.9,16.6,13.5,3453,172,6,136,...,18938.68679,31.61,6.350,3.551779e+07,11,Rondônia,RO,Ariquemes,Norte,21
2,2012,1100031,Rural,92.1,2.6,5.3,38,6,1,3,...,18938.68679,31.61,6.350,1.521377e+06,11,Rondônia,RO,Cabixi,Norte,21
3,2012,1100031,Urbana,91.3,5.2,3.5,230,22,1,11,...,18938.68679,31.61,6.350,1.521377e+06,11,Rondônia,RO,Cabixi,Norte,21
4,2012,1100064,Rural,90.6,9.2,0.2,415,42,1,15,...,18938.68679,31.61,6.350,2.814809e+06,11,Rondônia,RO,Colorado do Oeste,Norte,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62169,2020,5221858,Urbana,93.0,5.9,1.1,5.069,276,12,161.0,...,31506.97000,18.05,12.425,1.073406e+08,52,Goiás,GO,Valparaíso de Goiás,Centro Oeste,81
62170,2020,5221908,Urbana,100.0,0.0,0.0,125.0,12,1,6.0,...,31506.97000,18.05,12.425,1.476294e+06,52,Goiás,GO,Varjão,Centro Oeste,81
62171,2020,5222005,Urbana,97.2,2.6,0.2,496.0,23,1,19.0,...,31506.97000,18.05,12.425,7.186561e+06,52,Goiás,GO,Vianópolis,Centro Oeste,81
62172,2020,5222054,Urbana,99.2,0.8,0.0,362.0,18,1,15.0,...,31506.97000,18.05,12.425,6.174362e+06,52,Goiás,GO,Vicentinópolis,Centro Oeste,81


In [103]:
# Attach the internet-access percentage to the pivoted table (left join
# on region and year).
all_pivoted = education_pv_socio_econ_fn_br_df.merge(
    internet_access,
    how='left',
    on=['Região', 'Ano'],
)
all_pivoted

Unnamed: 0,Ano,Código_IBGE,Aprovação_Rural,Aprovação_Urbana,Reprovação_Rural,Reprovação_Urbana,Abandono_Rural,Abandono_Urbana,Matrículas_Rural,Matrículas_Urbana,...,PIB,Poverty_%,Unemployed_%,Valor_Consolidado,Código_UF,Estado,UF,Município,Região,Acesso a internet %
0,2012,1100023,92.6,69.9,4.0,16.6,3.4,13.5,498,3453,...,18938.68679,31.61,6.350,35517787.24,11,Rondônia,RO,Ariquemes,Norte,21
1,2012,1100031,92.1,91.3,2.6,5.2,5.3,3.5,38,230,...,18938.68679,31.61,6.350,1521376.74,11,Rondônia,RO,Cabixi,Norte,21
2,2012,1100064,90.6,86.9,9.2,5.6,0.2,7.5,415,733,...,18938.68679,31.61,6.350,2814808.69,11,Rondônia,RO,Colorado do Oeste,Norte,21
3,2012,1100072,91.4,83.3,5.3,4.4,3.3,12.3,155,237,...,18938.68679,31.61,6.350,2284673.55,11,Rondônia,RO,Corumbiara,Norte,21
4,2012,1100080,60.3,78.0,18.1,4.9,21.6,17.1,192,380,...,18938.68679,31.61,6.350,5258803.02,11,Rondônia,RO,Costa Marques,Norte,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12142,2020,5220702,87.5,99.0,12.5,0.0,0.0,1.0,8.0,103.0,...,31506.97000,18.05,12.425,1237056.38,52,Goiás,GO,Sítio d'Abadia,Centro Oeste,81
12143,2020,5221080,100.0,100.0,0.0,0.0,0.0,0.0,3.0,145.0,...,31506.97000,18.05,12.425,1610351.89,52,Goiás,GO,Teresina de Goiás,Centro Oeste,81
12144,2020,5221809,52.1,98.2,22.3,1.8,25.6,0.0,565.0,63.0,...,31506.97000,18.05,12.425,1452932.37,52,Goiás,GO,Urutaí,Centro Oeste,81
12145,2020,5222302,99.3,96.1,0.0,3.1,0.7,0.8,142.0,132.0,...,31506.97000,18.05,12.425,3128786.75,52,Goiás,GO,Vila Propício,Centro Oeste,81


In [104]:
# Attach the internet-access percentage to the rural table (left join
# on region and year).
all_rural = education_rural_socio_econ_fn_br_df.merge(
    internet_access,
    how='left',
    on=['Região', 'Ano'],
)
all_rural

Unnamed: 0,Ano,Código_IBGE,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,Populacao,...,PIB,Poverty_%,Unemployed_%,Valor_Consolidado,Código_UF,Estado,UF,Município,Região,Acesso a internet %
0,2012,1100023,92.6,4.0,3.4,498,42,1,15.0,92747,...,18938.68679,31.61,6.350,35517787.24,11,Rondônia,RO,Ariquemes,Norte,21
1,2012,1100031,92.1,2.6,5.3,38,6,1,3.0,6132,...,18938.68679,31.61,6.350,1521376.74,11,Rondônia,RO,Cabixi,Norte,21
2,2012,1100064,90.6,9.2,0.2,415,42,1,15.0,18093,...,18938.68679,31.61,6.350,2814808.69,11,Rondônia,RO,Colorado do Oeste,Norte,21
3,2012,1100072,91.4,5.3,3.3,155,12,2,6.0,8530,...,18938.68679,31.61,6.350,2284673.55,11,Rondônia,RO,Corumbiara,Norte,21
4,2012,1100080,60.3,18.1,21.6,192,10,1,8.0,14355,...,18938.68679,31.61,6.350,5258803.02,11,Rondônia,RO,Costa Marques,Norte,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12333,2020,5220702,87.5,12.5,0.0,8.0,1,1,1.0,3001,...,31506.97000,18.05,12.425,1237056.38,52,Goiás,GO,Sítio d'Abadia,Centro Oeste,81
12334,2020,5221080,100.0,0.0,0.0,3.0,3,1,2.0,3498,...,31506.97000,18.05,12.425,1610351.89,52,Goiás,GO,Teresina de Goiás,Centro Oeste,81
12335,2020,5221809,52.1,22.3,25.6,565.0,80,1,21.0,3066,...,31506.97000,18.05,12.425,1452932.37,52,Goiás,GO,Urutaí,Centro Oeste,81
12336,2020,5222302,99.3,0.0,0.7,142.0,9,1,5.0,5882,...,31506.97000,18.05,12.425,3128786.75,52,Goiás,GO,Vila Propício,Centro Oeste,81


In [105]:
# Attach the internet-access percentage to the urban table (left join
# on region and year).
all_urban = education_urban_socio_econ_fn_br_df.merge(
    internet_access,
    how='left',
    on=['Região', 'Ano'],
)
all_urban

Unnamed: 0,Ano,Código_IBGE,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,Populacao,...,PIB,Poverty_%,Unemployed_%,Valor_Consolidado,Código_UF,Estado,UF,Município,Região,Acesso a internet %
0,2012,1100015,78.4,8.0,13.6,1099,104,2,55,24069,...,18938.68679,31.61,6.350,6773134.92,11,Rondônia,RO,Alta Floresta D'Oeste,Norte,21
1,2012,1100023,69.9,16.6,13.5,3453,172,6,136,92747,...,18938.68679,31.61,6.350,35517787.24,11,Rondônia,RO,Ariquemes,Norte,21
2,2012,1100031,91.3,5.2,3.5,230,22,1,11,6132,...,18938.68679,31.61,6.350,1521376.74,11,Rondônia,RO,Cabixi,Norte,21
3,2012,1100049,79.5,9.6,10.9,3413,235,10,140,79330,...,18938.68679,31.61,6.350,18166669.91,11,Rondônia,RO,Cacoal,Norte,21
4,2012,1100056,76.9,12.7,10.4,729,42,2,30,16852,...,18938.68679,31.61,6.350,3954495.66,11,Rondônia,RO,Cerejeiras,Norte,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49831,2020,5222005,97.2,2.6,0.2,496.0,23,1,19.0,13977,...,31506.97000,18.05,12.425,7186560.98,52,Goiás,GO,Vianópolis,Centro Oeste,81
49832,2020,5222054,99.2,0.8,0.0,362.0,18,1,15.0,8873,...,31506.97000,18.05,12.425,6174361.84,52,Goiás,GO,Vicentinópolis,Centro Oeste,81
49833,2020,5222203,100.0,0.0,0.0,229.0,11,1,9.0,6312,...,31506.97000,18.05,12.425,4461612.07,52,Goiás,GO,Vila Boa,Centro Oeste,81
49834,2020,5222302,96.1,3.1,0.8,132.0,14,1,6.0,5882,...,31506.97000,18.05,12.425,3128786.75,52,Goiás,GO,Vila Propício,Centro Oeste,81


In [106]:
# List the columns of the merged urban dataset.
all_urban.columns

Index(['Ano', 'Código_IBGE', 'Aprovação', 'Reprovação', 'Abandono',
       'Matrículas', 'Docentes', 'Estabelecimentos', 'Turmas', 'Populacao',
       'Magreza_total_%', 'PIB', 'Poverty_%', 'Unemployed_%',
       'Valor_Consolidado', 'Código_UF', 'Estado', 'UF', 'Município', 'Região',
       'Acesso a internet %'],
      dtype='object')

### Adjust population and Funding value for rural versus urban

#### Adjust for all_expanded

In [107]:
def adjust_population(row, rural_share=0.15, urban_share=0.85):
    """
    Scale a row's population by the share assigned to its location type.

    Rows whose 'Localização' is 'Rural' are multiplied by ``rural_share``,
    rows whose 'Localização' is 'Urbana' by ``urban_share``; any other
    value leaves the population untouched.

    Args:
    - row (Series): A row from a pandas DataFrame (or any mapping) with the
      keys 'Localização' and 'Populacao'.
    - rural_share (float): Fraction applied to 'Rural' rows. Defaults to 0.15.
    - urban_share (float): Fraction applied to 'Urbana' rows. Defaults to 0.85.

    Returns:
    - The scaled population for 'Rural'/'Urbana' rows, otherwise the
      original 'Populacao' value unchanged.
    """
    # NOTE(review): the default 15% / 85% split is applied uniformly to every
    # municipality; confirm the source of these shares.
    location = row['Localização']
    population = row['Populacao']
    if location == 'Rural':
        return population * rural_share
    if location == 'Urbana':
        return population * urban_share
    return population


def adjust_funding(row, rural_share=0.15, urban_share=0.85):
    """
    Scale a row's funding value by the share assigned to its location type.

    Rows whose 'Localização' is 'Rural' are multiplied by ``rural_share``,
    rows whose 'Localização' is 'Urbana' by ``urban_share``; any other
    value leaves the funding untouched.

    Args:
    - row (Series): A row from a pandas DataFrame (or any mapping) with the
      keys 'Localização' and 'Valor_Consolidado'.
    - rural_share (float): Fraction applied to 'Rural' rows. Defaults to 0.15.
    - urban_share (float): Fraction applied to 'Urbana' rows. Defaults to 0.85.

    Returns:
    - The scaled funding for 'Rural'/'Urbana' rows, otherwise the original
      'Valor_Consolidado' value unchanged.
    """
    # NOTE(review): the default 15% / 85% split is applied uniformly to every
    # municipality; confirm the source of these shares.
    location = row['Localização']
    funding_value = row['Valor_Consolidado']
    if location == 'Rural':
        return funding_value * rural_share
    if location == 'Urbana':
        return funding_value * urban_share
    return funding_value

In [108]:
# Compute the location-adjusted population row by row, append it as a new
# column, and remove the unadjusted source column in the same step.
all_expanded = all_expanded.assign(
    adjusted_population=all_expanded.apply(adjust_population, axis=1)
).drop(columns="Populacao")

In [109]:
# Compute the location-adjusted funding row by row, append it as a new
# column, and remove the unadjusted source column in the same step.
all_expanded = all_expanded.assign(
    adjusted_funding=all_expanded.apply(adjust_funding, axis=1)
).drop(columns="Valor_Consolidado")

#### Adjust for all_rural

In [110]:
# Every row of this table is rural, so the rural factor (0.15) is applied
# to the whole funding and population columns at once.
all_rural = all_rural.assign(
    adjusted_funding=all_rural["Valor_Consolidado"].mul(0.15),
    adjusted_population=all_rural["Populacao"].mul(0.15),
)

In [111]:
# Remove the unadjusted source columns now that the adjusted ones exist.
all_rural = all_rural.drop(["Valor_Consolidado", "Populacao"], axis="columns")

#### Adjust for all_urban

In [112]:
# Every row of this table is urban, so the urban factor (0.85) is applied
# to the whole funding and population columns at once.
all_urban = all_urban.assign(
    adjusted_funding=all_urban["Valor_Consolidado"].mul(0.85),
    adjusted_population=all_urban["Populacao"].mul(0.85),
)

In [113]:
# Remove the unadjusted source columns now that the adjusted ones exist.
all_urban = all_urban.drop(["Valor_Consolidado", "Populacao"], axis="columns")

### Write to file

In [115]:
# All expanded
# Save the expanded dataset as CSV, leaving out the DataFrame index.
file_path = "../transformed_data/all_expanded_ML.csv"
all_expanded.to_csv(path_or_buf=file_path, index=False)

In [116]:
# All pivoted
# Save the pivoted dataset as CSV, leaving out the DataFrame index.
file_path = "../transformed_data/all_pivoted_ML.csv"
all_pivoted.to_csv(path_or_buf=file_path, index=False)

In [117]:
# All rural
# Save the rural dataset as CSV, leaving out the DataFrame index.
file_path = "../transformed_data/all_rural_ML.csv"
all_rural.to_csv(path_or_buf=file_path, index=False)

In [118]:
# All urban
# Save the urban dataset as CSV, leaving out the DataFrame index.
file_path = "../transformed_data/all_urban_ML.csv"
all_urban.to_csv(path_or_buf=file_path, index=False)