## School performance dataset

In [21]:
import pandas as pd

In [22]:
# Read the per-municipality school performance CSV into a DataFrame.
# NOTE(review): the preview of this table shows '--' placeholders in the rate columns,
# which is why they load as object dtype — consider na_values=['--'] if numeric
# columns are needed; confirm against downstream consumers first.
performance_df = pd.read_csv(
    "../transformed_data/school_performance_per_municipality.csv",
)

In [3]:
# Preview the performance table (the saved output elides the middle rows).
performance_df

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono
0,2008,1100015,Rural,--,--,--
1,2008,1100023,Rural,--,--,--
2,2008,1100049,Rural,--,--,--
3,2008,1100056,Rural,--,--,--
4,2008,1100064,Rural,77.7,22,0.3
...,...,...,...,...,...,...
150498,2022,5222005,Urbana,99.2,0.8,0.0
150499,2022,5222054,Urbana,99.3,0.7,0.0
150500,2022,5222203,Urbana,100.0,0.0,0.0
150501,2022,5222302,Urbana,90.5,5.2,4.3


In [16]:
# Summarise column dtypes and non-null counts of the performance table.
performance_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150503 entries, 0 to 150502
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Ano          150503 non-null  int64 
 1   Código_IBGE  150503 non-null  int64 
 2   Localização  150503 non-null  object
 3   Aprovação    150503 non-null  object
 4   Reprovação   150503 non-null  object
 5   Abandono     150503 non-null  object
dtypes: int64(2), object(4)
memory usage: 6.9+ MB


## School infrastructure dataset

In [23]:
# Read the per-municipality school infrastructure CSV into a DataFrame.
infrastructure_df = pd.read_csv(
    "../transformed_data/school_infrastructure_per_municipality.csv",
)

In [24]:
# Preview the infrastructure table (the saved output elides the middle rows).
infrastructure_df

Unnamed: 0,Código_IBGE,Localização,Matrículas,Ano,Docentes,Estabelecimentos,Turmas
0,5200050,Rural,0,2009,0,0,0
1,5200050,Urbana,302,2009,19,1,12
2,3100104,Rural,0,2009,0,0,0
3,3100104,Urbana,211,2009,16,1,7
4,5200100,Rural,0,2009,0,0,0
...,...,...,...,...,...,...,...
178175,2900504,Urbana,475,2010,23,1,18
178176,1505106,Rural,0,2010,0,0,0
178177,1505106,Urbana,2405,2010,64,1,85
178178,3533809,Rural,0,2010,0,0,0


In [15]:
# Summarise column dtypes and non-null counts of the infrastructure table.
# NOTE(review): the saved output of this cell lists a 'Rede' column, while the table
# preview and the later merge use 'Localização' — the output looks stale; re-run to refresh.
infrastructure_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178180 entries, 0 to 178179
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Código_IBGE       178180 non-null  int64 
 1   Rede              178180 non-null  object
 2   Matrículas        178180 non-null  object
 3   Ano               178180 non-null  int64 
 4   Docentes          178178 non-null  object
 5   Estabelecimentos  178180 non-null  int64 
 6   Turmas            178180 non-null  object
dtypes: int64(3), object(4)
memory usage: 9.5+ MB


In [None]:
# Compare the sets of municipality names found in the two tables.
# NOTE(review): check_equal_lists is only defined in a later cell, so this cell cannot
# succeed on a fresh top-to-bottom run — confirm the intended cell order.
# NOTE(review): the column listings shown in this notebook for performance_df and
# infrastructure_df do not include 'Nome do Município' / 'Município'; these lookups
# likely raise KeyError with the current CSVs — confirm, or drop this check.
performance_name_list = list(performance_df['Nome do Município'].unique())
infra_name_list = list(infrastructure_df['Município'].unique())
check_equal_lists(performance_name_list, infra_name_list)

## Combine school performance and infra datasets

### Check whether primary keys are the same or subsets of one another

In [6]:
def check_equal_lists(list1, list2):
    """Return True when both inputs contain the same items, regardless of order.

    Each input is copied into a new list and sorted, so the arguments are not
    modified. Duplicates count: ``[1, 2]`` and ``[1, 2, 2]`` are not equal.
    Items must be mutually orderable.
    """
    first_ordered = list(list1)
    first_ordered.sort()
    second_ordered = list(list2)
    second_ordered.sort()
    return first_ordered == second_ordered

#### IBGE

In [7]:
# Distinct values of the Código_IBGE column in the performance table.
performance_ibge_code = [*performance_df['Código_IBGE'].unique()]

In [8]:
# Distinct values of the Código_IBGE column in the infrastructure table.
infra_ibge_code = [*infrastructure_df['Código_IBGE'].unique()]

In [9]:
# Do both tables cover exactly the same set of IBGE codes?
check_equal_lists(list1=performance_ibge_code, list2=infra_ibge_code)

True

In [10]:
# Number of distinct IBGE codes in each table: (performance, infrastructure).
tuple(map(len, (performance_ibge_code, infra_ibge_code)))

(5570, 5570)

#### Ano

In [11]:
# Years covered by the performance table, in ascending order.
performance_years = sorted(performance_df['Ano'].unique())
performance_years

[2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020,
 2021,
 2022]

In [12]:
# Years covered by the infrastructure table, in ascending order.
infra_years = sorted(infrastructure_df['Ano'].unique())
infra_years

[2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020,
 2021,
 2022]

In [13]:
# Do both tables cover exactly the same years?
check_equal_lists(list1=performance_years, list2=infra_years)

False

In [14]:
# Number of distinct years in each table: (performance, infrastructure).
tuple(map(len, (performance_years, infra_years)))

(15, 16)

#### Remove infra data for year 2007

In [25]:
# Drop the 2007 rows: the performance table has no data for that year.
# Note: this rebinds infrastructure_df, so the unfiltered frame is no longer
# available afterwards without re-reading the CSV.
infrastructure_df = infrastructure_df.loc[infrastructure_df['Ano'] != 2007]

In [26]:
# Recompute the infrastructure years after filtering (unsorted; only the count is used next).
infra_years = [*infrastructure_df['Ano'].unique()]

In [27]:
# Year counts should now agree: (performance, infrastructure).
tuple(map(len, (performance_years, infra_years)))

(15, 15)

In [29]:
# List the performance table's columns to pick the merge keys.
performance_df.columns

Index(['Ano', 'Código_IBGE', 'Localização', 'Aprovação', 'Reprovação',
       'Abandono'],
      dtype='object')

In [30]:
# Outer-join the two tables on year, IBGE code and location so that rows present
# in only one of them are kept.
school_merged_df = performance_df.merge(
    infrastructure_df,
    how='outer',
    on=['Ano', 'Código_IBGE', 'Localização'],
)

In [34]:
# Count the distinct IBGE codes in the merged table (missing values, if any, count as one).
school_merged_df['Código_IBGE'].nunique(dropna=False)

5570

In [35]:
# Persist the merged performance + infrastructure table as CSV.
# index=False keeps the DataFrame's row index out of the file.
file_path = "../transformed_data/school_infrastructure_and_performance.csv"
school_merged_df.to_csv(path_or_buf=file_path, index=False)

## Funding Dataset

In [None]:
# Read the per-municipality funding CSV into a DataFrame.
funding_df = pd.read_csv(
    "../transformed_data/funding_per_municipality.csv",
)

In [None]:
# Preview the funding table.
funding_df

In [None]:
# Distinct values of the 'Código IBGE' column in the funding table.
funding_list = [*funding_df['Código IBGE'].unique()]

In [None]:
# Do the performance and funding tables cover exactly the same IBGE codes?
# Uses performance_ibge_code (built in the IBGE key-check section); the name
# `performance_list` used here before is never assigned in this notebook and
# raises NameError on a fresh run.
check_equal_lists(performance_ibge_code, funding_list)

In [None]:
# Number of distinct IBGE codes in each table: (performance, funding).
# Uses performance_ibge_code (built in the IBGE key-check section); the name
# `performance_list` used here before is never assigned in this notebook and
# raises NameError on a fresh run.
len(performance_ibge_code), len(funding_list)

In [None]:
def is_subset(list1, list2):
    """Tell whether every distinct item of ``list1`` also occurs in ``list2``.

    Both inputs are converted to sets, so duplicates and ordering are ignored
    and the items must be hashable.
    """
    candidate_items = set(list1)
    reference_items = set(list2)
    return candidate_items <= reference_items


In [None]:
# Are all funding IBGE codes also present in the performance table?
# Uses performance_ibge_code (built in the IBGE key-check section); the name
# `performance_list` used here before is never assigned in this notebook and
# raises NameError on a fresh run.
is_subset(funding_list, performance_ibge_code)

## Population dataset

In [None]:
# Read the per-municipality population CSV into a DataFrame and preview it.
population_df = pd.read_csv(
    "../transformed_data/population_per_municipality.csv",
)
population_df

In [None]:
# Distinct values of the 'Código do Município' column in the population table.
population_list = [*population_df['Código do Município'].unique()]

In [None]:
# Do the performance and population tables cover exactly the same municipality codes?
# Uses performance_ibge_code (built in the IBGE key-check section); the name
# `performance_list` used here before is never assigned in this notebook and
# raises NameError on a fresh run.
check_equal_lists(performance_ibge_code, population_list)