# School (all data)

This notebook intends to combine school performance data and infrastructure data. The resulting dataset would contain the following rows. 

- Ano 
- Código_IBGE
- Localização
- Aprovação
- Reprovação
- Abandono
- Matrículas
- Docentes
- Estabelecimentos
- Turmas

In [1]:
import pandas as pd

## School performance dataset 

In [13]:
# Load performance dataset
performance_df = pd.read_csv("../../transformed_data/school_data/school_performance_per_municipality.csv")

In [14]:
# check if there is any instance of NaN in any of the rows
performance_df[performance_df.isna().any(axis=1)]

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono


In [15]:
# check if there is any instance of -- in any of the rows
performance_df[performance_df.eq("--").any(axis=1)] 

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono
0,2008,1100015,Rural,--,--,--
1,2008,1100023,Rural,--,--,--
2,2008,1100049,Rural,--,--,--
3,2008,1100056,Rural,--,--,--
5,2008,1100072,Rural,--,--,--
...,...,...,...,...,...,...
149750,2022,4310538,Urbana,--,--,--
149779,2022,4311734,Urbana,--,--,--
149906,2022,4316972,Urbana,--,--,--
149936,2022,4318614,Urbana,--,--,--


In [16]:
# remove instances of -- 
performance_df = performance_df[performance_df.ne("--").all(axis=1)]
performance_df

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono
4,2008,1100064,Rural,77.7,22,0.3
9,2008,1100114,Rural,72.2,12.8,15
10,2008,1100122,Rural,93.3,2.1,4.6
12,2008,1100148,Rural,85.8,0.9,13.3
15,2008,1100205,Rural,72.1,7.2,20.7
...,...,...,...,...,...,...
150498,2022,5222005,Urbana,99.2,0.8,0.0
150499,2022,5222054,Urbana,99.3,0.7,0.0
150500,2022,5222203,Urbana,100.0,0.0,0.0
150501,2022,5222302,Urbana,90.5,5.2,4.3


## School infrastructure data

In [18]:
# Load infrastructure dataset
infrastructure_df = pd.read_csv("../../transformed_data/school_data/school_infrastructure_per_municipality.csv")

In [19]:
# check if there is any instance of NaN in any of the rows
infrastructure_df[infrastructure_df.isna().any(axis=1)]

Unnamed: 0,Código_IBGE,Localização,Matrículas,Ano,Docentes,Estabelecimentos,Turmas
123954,5300108,Rural,1758,2012,,10,60
123955,5300108,Urbana,81536,2012,,79,2247


In [23]:
# remove NaN values
infrastructure_df = infrastructure_df.dropna()

In [24]:
infrastructure_df[infrastructure_df.isna().any(axis=1)]

Unnamed: 0,Código_IBGE,Localização,Matrículas,Ano,Docentes,Estabelecimentos,Turmas


In [25]:
# check if there is any instance of -- in any of the rows
infrastructure_df[infrastructure_df.eq("--").any(axis=1)] 

Unnamed: 0,Código_IBGE,Localização,Matrículas,Ano,Docentes,Estabelecimentos,Turmas


## Validate IBGE Code and Year as primary keys

In [35]:
def check_equal_lists(list1, list2):
    return sorted(list1) == sorted(list2)

def is_subset(list1, list2):
    set1 = set(list1)
    set2 = set(list2)
    return set2.issubset(set1)

### IBGE 

In [27]:
performance_ibge_code = list(performance_df['Código_IBGE'].unique())

In [28]:
infra_ibge_code = list(infrastructure_df['Código_IBGE'].unique())

In [40]:
check_equal_lists(infra_ibge_code, performance_ibge_code), is_subset(infra_ibge_code, performance_ibge_code)

(False, True)

In [39]:
len(performance_ibge_code), len(infra_ibge_code)

(5564, 5570)

### Year

In [41]:
performance_years = list(performance_df['Ano'].unique())
performance_years.sort()
performance_years

[2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020,
 2021,
 2022]

In [42]:
infra_years = list(infrastructure_df['Ano'].unique())
infra_years.sort()
infra_years

[2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020,
 2021,
 2022]

In [43]:
check_equal_lists(infra_years, performance_years), is_subset(infra_years, performance_years)

(False, True)

In [44]:
len(infra_years),len(performance_years)

(16, 15)

In [45]:
# remove data from infra for year = 2007
infrastructure_df = infrastructure_df[infrastructure_df.Ano != 2007]

In [46]:
# check for Year again
infra_years = list(infrastructure_df['Ano'].unique())
len(performance_years), len(infra_years)

(15, 15)

## Merge the 2 datasets

In [50]:
school_merged_df = pd.merge(performance_df, infrastructure_df, on=['Ano', 'Código_IBGE', 'Localização'])

In [51]:
len(school_merged_df.Código_IBGE.unique())

5564

In [52]:
school_merged_df

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas
0,2008,1100064,Rural,77.7,22,0.3,328,30,1,11
1,2008,1100114,Rural,72.2,12.8,15,506,32,3,16
2,2008,1100122,Rural,93.3,2.1,4.6,245,19,2,10
3,2008,1100148,Rural,85.8,0.9,13.3,237,13,8,9
4,2008,1100205,Rural,72.1,7.2,20.7,598,45,4,24
...,...,...,...,...,...,...,...,...,...,...
102884,2022,5222005,Urbana,99.2,0.8,0.0,399,27,2,16
102885,2022,5222054,Urbana,99.3,0.7,0.0,340,22,1,12
102886,2022,5222203,Urbana,100.0,0.0,0.0,222,8,1,8
102887,2022,5222302,Urbana,90.5,5.2,4.3,117,14,1,6


In [53]:
# check if there is any instance of NaN in any of the rows
school_merged_df[school_merged_df.isna().any(axis=1)]

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas


In [54]:
# check if there is any instance of -- in any of the rows
school_merged_df[school_merged_df.eq("--").any(axis=1)] 

Unnamed: 0,Ano,Código_IBGE,Localização,Aprovação,Reprovação,Abandono,Matrículas,Docentes,Estabelecimentos,Turmas


In [55]:
# Write the merged DataFrame to a new CSV file
file_path = "../../transformed_data/school_data/school_all_data.csv"
school_merged_df.to_csv(file_path, index=False)

In [63]:
pd.read_csv("../../transformed_data/school_data/school_all_data.csv").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102889 entries, 0 to 102888
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Ano               102889 non-null  int64  
 1   Código_IBGE       102889 non-null  int64  
 2   Localização       102889 non-null  object 
 3   Aprovação         102889 non-null  float64
 4   Reprovação        102889 non-null  float64
 5   Abandono          102889 non-null  float64
 6   Matrículas        102889 non-null  int64  
 7   Docentes          102875 non-null  float64
 8   Estabelecimentos  102889 non-null  int64  
 9   Turmas            102889 non-null  int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 7.8+ MB
