## School performance dataset

In [1]:
import pandas as pd

In [2]:
performance_df = pd.read_csv("../transformed_data/school_performance_per_municipality.csv")

In [3]:
performance_df

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono
0,2008,1100015,Rural,--,--,--
1,2008,1100023,Rural,--,--,--
2,2008,1100049,Rural,--,--,--
3,2008,1100056,Rural,--,--,--
4,2008,1100064,Rural,77.7,22,0.3
...,...,...,...,...,...,...
150498,2022,5222005,Urbana,99.2,0.8,0.0
150499,2022,5222054,Urbana,99.3,0.7,0.0
150500,2022,5222203,Urbana,100.0,0.0,0.0
150501,2022,5222302,Urbana,90.5,5.2,4.3


In [None]:
performance_df.info()

## School infrastructure dataset

In [None]:
infrastructure_df = pd.read_csv("../transformed_data/school_infrastructure_per_municipality.csv")

In [None]:
infrastructure_df

In [None]:
infrastructure_df.info()

In [None]:
performance_name_list = list(performance_df['Nome do Município'].unique())
infra_name_list = list(infrastructure_df['Município'].unique())
check_equal_lists(performance_name_list, infra_name_list)

## Combine school performance and infra datasets

### Check is primary keys are same of subsets of one another

In [8]:
def check_equal_lists(list1, list2):
    return sorted(list1) == sorted(list2)

#### IBGE

In [10]:
performance_ibge_code = list(performance_df['Código_IBGE'].unique())

In [None]:
infra_ibge_code = list(infrastructure_df['Código_IBGE'].unique())

In [None]:
check_equal_lists(performance_ibge_code, infra_ibge_code)

In [None]:
len(performance_ibge_code), len(infra_ibge_code)

#### Ano

In [None]:
performance_years = list(performance_df['Ano'].unique())
performance_years.sort()
performance_years

In [None]:
infra_years = list(infrastructure_df['Ano'].unique())
infra_years.sort()
infra_years

In [None]:
check_equal_lists(performance_years, infra_years)

In [None]:
len(performance_years), len(infra_years)

#### Remove infra data for year 2007

In [None]:
infrastructure_df = infrastructure_df[infrastructure_df.Ano != 2007]

In [None]:
infra_years = list(infrastructure_df['Ano'].unique())

In [None]:
len(performance_years), len(infra_years)

In [None]:
performance_df.columns

In [None]:
school_merged_df = pd.merge(performance_df, infrastructure_df, on=['Ano', 'Código_IBGE', 'Localização'], how='outer')

In [None]:
len(school_merged_df.Código_IBGE.unique())

In [None]:
# Write the combined DataFrame to a new CSV file
file_path = "../transformed_data/school_infrastructure_and_performance.csv"
school_merged_df.to_csv(file_path, index=False)

## Funding Dataset

In [24]:
funding_df = pd.read_csv("../transformed_data/funding_per_municipality.csv")

In [25]:
funding_df

Unnamed: 0,Ano,Código_IBGE,Valor_Consolidado
0,2008,5200050,1755502.19
1,2009,5200050,1880883.04
2,2010,5200050,2324131.10
3,2011,5200050,2850424.86
4,2012,5200050,3758558.91
...,...,...,...
83485,2018,3533809,1285260.97
83486,2019,3533809,1565105.07
83487,2020,3533809,1425284.00
83488,2021,3533809,1691621.32


In [6]:
funding_list = list(funding_df['Código_IBGE'].unique())

In [11]:
check_equal_lists(performance_ibge_code, funding_list)

False

In [13]:
len(performance_ibge_code), len(funding_list)

(5570, 5569)

In [14]:
def is_subset(list1, list2):
    return set(list1).issubset(set(list2))


In [15]:
is_subset(funding_list, performance_ibge_code)

True

## Population dataset

In [17]:
population_df = pd.read_csv("../transformed_data/population_per_municipality.csv")
population_df

Unnamed: 0,Código_IBGE,Ano,População
0,1200013,2008,11987
1,1200013,2009,12241.0
2,1200013,2010,12510.0
3,1200013,2011,12779.0
4,1200013,2012,13011
...,...,...,...
83545,1722107,2018,11561
83546,1722107,2019,11540
83547,1722107,2020,11520
83548,1722107,2021,11500


In [18]:
population_list = list(population_df['Código_IBGE'].unique())

In [20]:
check_equal_lists(performance_ibge_code, population_list)

True

In [21]:
len(performance_ibge_code), len(population_list)

(5570, 5570)

In [26]:
pop_fund_merged_df = pd.merge(population_df, funding_df, on=['Ano', 'Código_IBGE'], how='outer')

In [28]:
pop_fund_merged_df

Unnamed: 0,Código_IBGE,Ano,População,Valor_Consolidado
0,1200013,2008,11987,5089382.67
1,1200013,2009,12241.0,5259220.39
2,1200013,2010,12510.0,5682387.10
3,1200013,2011,12779.0,6687641.83
4,1200013,2012,13011,6535497.11
...,...,...,...,...
83545,1722107,2018,11561,6322567.52
83546,1722107,2019,11540,7014407.97
83547,1722107,2020,11520,7515015.35
83548,1722107,2021,11500,9694092.49
