## School performance dataset

In [1]:
import pandas as pd

In [3]:
performance_df = pd.read_csv("../transformed_data/school_performance_per_municipality.csv")
performance_df

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono
0,2008,1100015,Rural,--,--,--
1,2008,1100023,Rural,--,--,--
2,2008,1100049,Rural,--,--,--
3,2008,1100056,Rural,--,--,--
4,2008,1100064,Rural,77.7,22,0.3
...,...,...,...,...,...,...
150498,2022,5222005,Urbana,99.2,0.8,0.0
150499,2022,5222054,Urbana,99.3,0.7,0.0
150500,2022,5222203,Urbana,100.0,0.0,0.0
150501,2022,5222302,Urbana,90.5,5.2,4.3


In [4]:
performance_df[performance_df.isna().any(axis=1)]

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono


In [5]:
performance_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150503 entries, 0 to 150502
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Ano          150503 non-null  int64 
 1   Código_IBGE  150503 non-null  int64 
 2   Localização  150503 non-null  object
 3   Aprovação    150503 non-null  object
 4   Reprovação   150503 non-null  object
 5   Abandono     150503 non-null  object
dtypes: int64(2), object(4)
memory usage: 6.9+ MB


## School infrastructure dataset

In [6]:
infrastructure_df = pd.read_csv("../transformed_data/school_infrastructure_per_municipality.csv")
infrastructure_df

Unnamed: 0,Código_IBGE,Localização,Matrículas,Ano,Docentes,Estabelecimentos,Turmas
0,5200050,Rural,0,2009,0,0,0
1,5200050,Urbana,302,2009,19,1,12
2,3100104,Rural,0,2009,0,0,0
3,3100104,Urbana,211,2009,16,1,7
4,5200100,Rural,0,2009,0,0,0
...,...,...,...,...,...,...,...
178175,2900504,Urbana,475,2010,23,1,18
178176,1505106,Rural,0,2010,0,0,0
178177,1505106,Urbana,2405,2010,64,1,85
178178,3533809,Rural,0,2010,0,0,0


In [7]:
infrastructure_df[infrastructure_df.isna().any(axis=1)]

Unnamed: 0,Código_IBGE,Localização,Matrículas,Ano,Docentes,Estabelecimentos,Turmas
123954,5300108,Rural,1758,2012,,10,60
123955,5300108,Urbana,81536,2012,,79,2247


In [8]:
infrastructure_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178180 entries, 0 to 178179
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Código_IBGE       178180 non-null  int64 
 1   Localização       178180 non-null  object
 2   Matrículas        178180 non-null  object
 3   Ano               178180 non-null  int64 
 4   Docentes          178178 non-null  object
 5   Estabelecimentos  178180 non-null  int64 
 6   Turmas            178180 non-null  object
dtypes: int64(3), object(4)
memory usage: 9.5+ MB


## Combine school performance and infra datasets

### Check is primary keys are same of subsets of one another

In [11]:
def check_equal_lists(list1, list2):
    return sorted(list1) == sorted(list2)

#### IBGE

In [12]:
performance_ibge_code = list(performance_df['Código_IBGE'].unique())

In [13]:
infra_ibge_code = list(infrastructure_df['Código_IBGE'].unique())

In [14]:
check_equal_lists(performance_ibge_code, infra_ibge_code)

True

In [15]:
len(performance_ibge_code), len(infra_ibge_code)

(5570, 5570)

#### Ano

In [16]:
performance_years = list(performance_df['Ano'].unique())
performance_years.sort()
performance_years

[2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020,
 2021,
 2022]

In [17]:
infra_years = list(infrastructure_df['Ano'].unique())
infra_years.sort()
infra_years

[2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020,
 2021,
 2022]

In [18]:
check_equal_lists(performance_years, infra_years)

False

In [19]:
len(performance_years), len(infra_years)

(15, 16)

#### Remove infra data for year 2007

In [20]:
infrastructure_df = infrastructure_df[infrastructure_df.Ano != 2007]

In [21]:
infra_years = list(infrastructure_df['Ano'].unique())

In [22]:
len(performance_years), len(infra_years)

(15, 15)

In [23]:
performance_df.columns

Index(['Ano', 'Código_IBGE', 'Localização', 'Aprovação', 'Reprovação',
       'Abandono'],
      dtype='object')

In [24]:
school_merged_df = pd.merge(performance_df, infrastructure_df, on=['Ano', 'Código_IBGE', 'Localização'], how='outer')

In [25]:
len(school_merged_df.Código_IBGE.unique())

5570

In [29]:
school_merged_df = school_merged_df.dropna()

In [31]:
school_merged_df = school_merged_df[school_merged_df.ne("--").all(axis=1)]
school_merged_df

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas
4,2008,1100064,Rural,77.7,22,0.3,328,30,1,11
9,2008,1100114,Rural,72.2,12.8,15,506,32,3,16
10,2008,1100122,Rural,93.3,2.1,4.6,245,19,2,10
12,2008,1100148,Rural,85.8,0.9,13.3,237,13,8,9
15,2008,1100205,Rural,72.1,7.2,20.7,598,45,4,24
...,...,...,...,...,...,...,...,...,...,...
150498,2022,5222005,Urbana,99.2,0.8,0.0,399,27,2,16
150499,2022,5222054,Urbana,99.3,0.7,0.0,340,22,1,12
150500,2022,5222203,Urbana,100.0,0.0,0.0,222,8,1,8
150501,2022,5222302,Urbana,90.5,5.2,4.3,117,14,1,6


In [32]:
# Write the combined DataFrame to a new CSV file
file_path = "../transformed_data/school_data.csv"
school_merged_df.to_csv(file_path, index=False)

## Funding Dataset

In [33]:
funding_df = pd.read_csv("../transformed_data/funding_per_municipality.csv")

In [45]:
funding_df.Ano.unique()

array([2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018,
       2019, 2020, 2021, 2022])

In [35]:
funding_list = list(funding_df['Código_IBGE'].unique())

In [36]:
check_equal_lists(performance_ibge_code, funding_list)

False

In [37]:
len(performance_ibge_code), len(funding_list)

(5570, 5569)

In [38]:
def is_subset(list1, list2):
    return set(list1).issubset(set(list2))


In [39]:
is_subset(funding_list, performance_ibge_code)

True

## Population dataset

In [46]:
population_df = pd.read_csv("../transformed_data/population_per_municipality.csv")
population_df.Ano.unique()

array([2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018,
       2019, 2020, 2021, 2022])

In [42]:
population_list = list(population_df['Código_IBGE'].unique())

In [43]:
check_equal_lists(performance_ibge_code, population_list)

True

In [44]:
len(performance_ibge_code), len(population_list)

(5570, 5570)

In [47]:
pop_fund_merged_df = pd.merge(population_df, funding_df, on=['Ano', 'Código_IBGE'], how='outer')

In [48]:
pop_fund_merged_df

Unnamed: 0,Código_IBGE,Ano,População,Valor_Consolidado
0,1200013,2008,11987,5089382.67
1,1200013,2009,12241.0,5259220.39
2,1200013,2010,12510.0,5682387.10
3,1200013,2011,12779.0,6687641.83
4,1200013,2012,13011,6535497.11
...,...,...,...,...
83545,1722107,2018,11561,6322567.52
83546,1722107,2019,11540,7014407.97
83547,1722107,2020,11520,7515015.35
83548,1722107,2021,11500,9694092.49


In [49]:
school_funding_population = pd.merge(school_merged_df, pop_fund_merged_df, on=['Ano', 'Código_IBGE'], how='outer')
school_funding_population

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,População,Valor_Consolidado
0,2008,1100064,Rural,77.7,22,0.3,328,30,1.0,11,18216,2128372.02
1,2008,1100064,Urbana,85,11.4,3.6,781,40,2.0,24,18216,2128372.02
2,2008,1100114,Rural,72.2,12.8,15,506,32,3.0,16,53955,10777177.00
3,2008,1100114,Urbana,71.3,15.4,13.3,1884,107,4.0,68,53955,10777177.00
4,2008,1100122,Rural,93.3,2.1,4.6,245,19,2.0,10,110707,9386520.09
...,...,...,...,...,...,...,...,...,...,...,...,...
103087,2018,1713700,,,,,,,,,2263,1950745.47
103088,2019,1713700,,,,,,,,,2279,2125006.57
103089,2020,1713700,,,,,,,,,2295,2135635.20
103090,2021,1713700,,,,,,,,,2311,2615901.67


In [50]:
school_funding_population.dropna()

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,População,Valor_Consolidado
0,2008,1100064,Rural,77.7,22,0.3,328,30,1.0,11,18216,2.128372e+06
1,2008,1100064,Urbana,85,11.4,3.6,781,40,2.0,24,18216,2.128372e+06
2,2008,1100114,Rural,72.2,12.8,15,506,32,3.0,16,53955,1.077718e+07
3,2008,1100114,Urbana,71.3,15.4,13.3,1884,107,4.0,68,53955,1.077718e+07
4,2008,1100122,Rural,93.3,2.1,4.6,245,19,2.0,10,110707,9.386520e+06
...,...,...,...,...,...,...,...,...,...,...,...,...
102884,2022,5221858,Urbana,97.0,2.2,0.8,4995,313,12.0,160,198861,1.565021e+08
102885,2022,5221908,Urbana,100.0,0.0,0.0,122,11,1.0,6,3716,1.881745e+06
102886,2022,5222005,Urbana,99.2,0.8,0.0,399,27,2.0,16,14956,9.332058e+06
102887,2022,5222054,Urbana,99.3,0.7,0.0,340,22,1.0,12,8768,7.446251e+06


In [71]:
malnutrition = pd.read_csv("../transformed_data/malnutrition_data.csv")
malnutrition = malnutrition.drop(columns=["Municipio", "UF", "Região"])

In [72]:
school_funding_population_mal = pd.merge(school_funding_population, malnutrition, on=['Ano', 'Código_IBGE'], how='outer')
school_funding_population_mal

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,População,Valor_Consolidado,Código UF,Magreza_total_%
0,2008,1100064,Rural,77.7,22,0.3,328,30,1.0,11,18216,2128372.02,11.0,4.12
1,2008,1100064,Urbana,85,11.4,3.6,781,40,2.0,24,18216,2128372.02,11.0,4.12
2,2008,1100114,Rural,72.2,12.8,15,506,32,3.0,16,53955,10777177.00,11.0,5.68
3,2008,1100114,Urbana,71.3,15.4,13.3,1884,107,4.0,68,53955,10777177.00,11.0,5.68
4,2008,1100122,Rural,93.3,2.1,4.6,245,19,2.0,10,110707,9386520.09,11.0,9.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113907,2023,4219408,,,,,,,,,,,42.0,1.56
113908,2023,4219507,,,,,,,,,,,42.0,1.90
113909,2023,4219606,,,,,,,,,,,42.0,0.78
113910,2023,4219705,,,,,,,,,,,42.0,2.66


In [73]:
school_funding_population_mal = school_funding_population_mal.dropna()
school_funding_population_mal

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,População,Valor_Consolidado,Código UF,Magreza_total_%
0,2008,1100064,Rural,77.7,22,0.3,328,30,1.0,11,18216,2.128372e+06,11.0,4.12
1,2008,1100064,Urbana,85,11.4,3.6,781,40,2.0,24,18216,2.128372e+06,11.0,4.12
2,2008,1100114,Rural,72.2,12.8,15,506,32,3.0,16,53955,1.077718e+07,11.0,5.68
3,2008,1100114,Urbana,71.3,15.4,13.3,1884,107,4.0,68,53955,1.077718e+07,11.0,5.68
4,2008,1100122,Rural,93.3,2.1,4.6,245,19,2.0,10,110707,9.386520e+06,11.0,9.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108133,2022,5221858,Urbana,97.0,2.2,0.8,4995,313,12.0,160,198861,1.565021e+08,52.0,5.87
108134,2022,5221908,Urbana,100.0,0.0,0.0,122,11,1.0,6,3716,1.881745e+06,52.0,5.26
108135,2022,5222005,Urbana,99.2,0.8,0.0,399,27,2.0,16,14956,9.332058e+06,52.0,4.15
108136,2022,5222054,Urbana,99.3,0.7,0.0,340,22,1.0,12,8768,7.446251e+06,52.0,3.36


In [74]:
school_funding_population_mal = school_funding_population_mal.rename(columns={
    'Código UF': 'Código_UF'
})


In [76]:
school_funding_population_mal

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,População,Valor_Consolidado,Código_UF,Magreza_total_%
0,2008,1100064,Rural,77.7,22,0.3,328,30,1.0,11,18216,2.128372e+06,11.0,4.12
1,2008,1100064,Urbana,85,11.4,3.6,781,40,2.0,24,18216,2.128372e+06,11.0,4.12
2,2008,1100114,Rural,72.2,12.8,15,506,32,3.0,16,53955,1.077718e+07,11.0,5.68
3,2008,1100114,Urbana,71.3,15.4,13.3,1884,107,4.0,68,53955,1.077718e+07,11.0,5.68
4,2008,1100122,Rural,93.3,2.1,4.6,245,19,2.0,10,110707,9.386520e+06,11.0,9.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108133,2022,5221858,Urbana,97.0,2.2,0.8,4995,313,12.0,160,198861,1.565021e+08,52.0,5.87
108134,2022,5221908,Urbana,100.0,0.0,0.0,122,11,1.0,6,3716,1.881745e+06,52.0,5.26
108135,2022,5222005,Urbana,99.2,0.8,0.0,399,27,2.0,16,14956,9.332058e+06,52.0,4.15
108136,2022,5222054,Urbana,99.3,0.7,0.0,340,22,1.0,12,8768,7.446251e+06,52.0,3.36


### SocioEconomic data

In [63]:
unemployment = pd.read_csv("../transformed_data/unemployement_data.csv")
unemployment = unemployment.drop(columns=["UF", "Estado", "Região"])
unemployment                                      

Unnamed: 0,Código_UF,Ano,Unemployed
0,12,2012,84
1,12,2013,8975
2,12,2014,77
3,12,2015,8475
4,12,2016,10875
...,...,...,...
292,17,2018,10625
293,17,2019,10825
294,17,2020,11625
295,17,2021,13325


In [65]:
poverty = pd.read_csv("../transformed_data/poverty_data.csv")
poverty = poverty.drop(columns=["Estado", "UF"])
poverty

Unnamed: 0,Código_UF,Ano,Poverty_Rate
0,12,2012,47.72
1,12,2013,45.34
2,12,2014,44.62
3,12,2015,44.95
4,12,2016,49.05
...,...,...,...
265,17,2017,33.47
266,17,2018,32.93
267,17,2019,34.17
268,17,2020,30.19


In [67]:
gdp = pd.read_csv("../transformed_data/gdp_per_capita_data.csv")
gdp = gdp.drop(columns=["Estado", "UF", "Região"])
gdp

Unnamed: 0,Codigo UF,Ano,PIB
0,12,2000,3864.01000
1,12,2001,4309.39000
2,12,2002,4887.09000
3,12,2003,5502.38000
4,12,2004,6415.31000
...,...,...,...
562,17,2016,20604.58933
563,17,2017,22002.49258
564,17,2018,22933.07486
565,17,2019,25021.80174


In [69]:
gdp = gdp.rename(columns={
    'Codigo UF': 'Código_UF'
})


In [68]:
unemployement_poverty = pd.merge(unemployment, poverty, on=['Ano', 'Código_UF'], how='outer')
unemployement_poverty

Unnamed: 0,Código_UF,Ano,Unemployed,Poverty_Rate
0,12,2012,84,47.72
1,12,2013,8975,45.34
2,12,2014,77,44.62
3,12,2015,8475,44.95
4,12,2016,10875,49.05
...,...,...,...,...
292,17,2018,10625,32.93
293,17,2019,10825,34.17
294,17,2020,11625,30.19
295,17,2021,13325,33.11


In [70]:
unemployement_poverty_gdp = pd.merge(unemployement_poverty, gdp, on=['Ano', 'Código_UF'], how='outer')
unemployement_poverty_gdp

Unnamed: 0,Código_UF,Ano,Unemployed,Poverty_Rate,PIB
0,12,2012,84,47.72,13360.71660
1,12,2013,8975,45.34,14777.17569
2,12,2014,77,44.62,17034.14833
3,12,2015,8475,44.95,16954.05276
4,12,2016,10875,49.05,16841.50719
...,...,...,...,...,...
616,17,2007,,,8164.07000
617,17,2008,,,10222.71000
618,17,2009,,,11277.60000
619,17,2010,,,11857.87742


In [81]:
sorted(unemployement_poverty_gdp.Código_UF.unique())

[11,
 12,
 13,
 14,
 15,
 16,
 17,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 31,
 32,
 33,
 35,
 41,
 42,
 43,
 50,
 51,
 52,
 53]

In [82]:
merge_df = pd.merge(school_funding_population_mal, unemployement_poverty_gdp, on=['Ano', 'Código_UF'])
merge_df.dropna()

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas,População,Valor_Consolidado,Código_UF,Magreza_total_%,Unemployed,Poverty_Rate,PIB
26848,2012,1100023,Rural,92.6,4,3.4,498,42,1.0,15,92747,35517787.24,11.0,2.03,635,31.61,18938.68679
26849,2012,1100023,Urbana,69.9,16.6,13.5,3453,172,6.0,136,92747,35517787.24,11.0,2.03,635,31.61,18938.68679
26850,2012,1100031,Rural,92.1,2.6,5.3,38,6,1.0,3,6132,1521376.74,11.0,8.80,635,31.61,18938.68679
26851,2012,1100031,Urbana,91.3,5.2,3.5,230,22,1.0,11,6132,1521376.74,11.0,8.80,635,31.61,18938.68679
26852,2012,1100064,Rural,90.6,9.2,0.2,415,42,1.0,15,18093,2814808.69,11.0,6.92,635,31.61,18938.68679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89017,2020,2709152,Urbana,98.8,0.0,1.2,1.718,84,2.0,46.0,44372,58370038.35,27.0,3.81,18575,43.89,18857.69000
89018,2020,2709202,Urbana,100.0,0.0,0.0,708.0,16,1.0,19.0,27826,21770452.56,27.0,4.83,18575,43.89,18857.69000
89019,2020,2709301,Urbana,99.6,0.0,0.4,2.258,100,3.0,54.0,65790,48915489.57,27.0,2.65,18575,43.89,18857.69000
89020,2020,5300108,Rural,84.2,15.8,0.0,2.78,222,14.0,97.0,3055149,0.00,53.0,4.16,1475,12.38,87016.16000


In [78]:
merge_df["Código_UF"].unique()

array([11., 21., 12., 41., 43., 51., 13., 35., 14., 25., 31., 23., 15.,
       27., 17., 16., 29., 22., 24., 26., 33., 42., 32., 52., 28., 50.,
       53.])