# Preprocessing school census data

Here we pre-process the INEP school census data (2019) stored in `data/censo_escolar_2019.csv`.

## Imports

In [41]:
import json
import missingno
import matplotlib
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# notebook only
%matplotlib inline

## Preliminary Data Processing

In [42]:
# Load the raw census file: it is semicolon-separated, and CO_ORGAO_REGIONAL
# is read as text instead of letting pandas infer a type for it.
df = pd.read_csv(
    'data/censo_escolar_2019.csv',
    sep=';',
    dtype={'CO_ORGAO_REGIONAL': str},
)

In [43]:
# Display the raw frame (pandas abbreviates the rendering to the first/last rows and columns).
df

Unnamed: 0,NU_ANO_CENSO,NO_REGIAO,CO_REGIAO,NO_UF,SG_UF,CO_UF,NO_MUNICIPIO,CO_MUNICIPIO,NO_MESORREGIAO,CO_MESORREGIAO,...,QT_TUR_FUND_AF,QT_TUR_MED,QT_TUR_PROF,QT_TUR_PROF_TEC,QT_TUR_EJA,QT_TUR_EJA_FUND,QT_TUR_EJA_MED,QT_TUR_ESP,QT_TUR_ESP_CC,QT_TUR_ESP_CE
0,2019,Norte,1,Rondonia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,1102,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2019,Norte,1,Rondonia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,1102,...,0.0,0.0,0.0,0.0,13.0,7.0,6.0,2.0,2.0,0.0
2,2019,Norte,1,Rondonia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,1102,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019,Norte,1,Rondonia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,1102,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0
4,2019,Norte,1,Rondonia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,1102,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228516,2019,Centro-Oeste,5,Distrito Federal,DF,53,Braslia,5300108,Distrito Federal,5301,...,,,,,,,,,,
228517,2019,Centro-Oeste,5,Distrito Federal,DF,53,Braslia,5300108,Distrito Federal,5301,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0
228518,2019,Centro-Oeste,5,Distrito Federal,DF,53,Braslia,5300108,Distrito Federal,5301,...,3.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
228519,2019,Centro-Oeste,5,Distrito Federal,DF,53,Braslia,5300108,Distrito Federal,5301,...,,,,,,,,,,


### Drop unwanted data

Here we remove the rows and columns we don't need: entries for private and non-active schools, and every column that is not listed in the `selected-features.json` file.

In [44]:
# Read the list of column names to keep; the JSON file stores it under the "features" key.
with open('./data/selected-features.json', 'r') as feature_file:
    features = json.load(feature_file)['features']
print(features)

['SG_UF', 'CO_MUNICIPIO', 'TP_DEPENDENCIA', 'TP_LOCALIZACAO', 'TP_SITUACAO_FUNCIONAMENTO', 'IN_AGUA_INEXISTENTE', 'IN_ENERGIA_INEXISTENTE', 'IN_ESGOTO_INEXISTENTE', 'IN_TRATAMENTO_LIXO_INEXISTENTE', 'IN_ALMOXARIFADO', 'IN_AREA_VERDE', 'IN_AUDITORIO', 'IN_BANHEIRO', 'IN_BIBLIOTECA', 'IN_BIBLIOTECA_SALA_LEITURA', 'IN_COZINHA', 'IN_LABORATORIO_CIENCIAS', 'IN_LABORATORIO_INFORMATICA', 'IN_PATIO_COBERTO', 'IN_PATIO_DESCOBERTO', 'IN_PARQUE_INFANTIL', 'IN_QUADRA_ESPORTES', 'IN_REFEITORIO', 'IN_SALA_DIRETORIA', 'IN_SALA_LEITURA', 'IN_SALA_PROFESSOR', 'IN_SECRETARIA', 'QT_SALAS_UTILIZADAS', 'QT_DESKTOP_ALUNO', 'IN_INTERNET', 'IN_ALIMENTACAO', 'IN_EXAME_SELECAO', 'IN_ORGAO_NENHUM']


In [45]:
# Keep only the selected features: collect every other column name and
# remove them all in a single drop (column order of the kept ones is unchanged).
unselected = [col for col in df.columns if col not in features]
df = df.drop(columns=unselected)

In [46]:
# Preview the first rows now that only the selected feature columns remain.
df.head()

Unnamed: 0,SG_UF,CO_MUNICIPIO,TP_DEPENDENCIA,TP_LOCALIZACAO,TP_SITUACAO_FUNCIONAMENTO,IN_AGUA_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_TRATAMENTO_LIXO_INEXISTENTE,IN_ALMOXARIFADO,...,IN_SALA_DIRETORIA,IN_SALA_LEITURA,IN_SALA_PROFESSOR,IN_SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,IN_INTERNET,IN_ALIMENTACAO,IN_EXAME_SELECAO,IN_ORGAO_NENHUM
0,RO,1100015,2,2,1,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,RO,1100015,2,1,1,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,14.0,8.0,1.0,1.0,0.0,0.0
2,RO,1100015,3,2,1,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0
3,RO,1100015,3,1,1,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,5.0,0.0,1.0,1.0,0.0,0.0
4,RO,1100015,3,2,2,,,,,,...,,,,,,,,,,


In [47]:
# Remove private schools: rows whose TP_DEPENDENCIA code equals 4.
is_private = df['TP_DEPENDENCIA'] == 4
df = df.drop(index=df.loc[is_private].index)
df

Unnamed: 0,SG_UF,CO_MUNICIPIO,TP_DEPENDENCIA,TP_LOCALIZACAO,TP_SITUACAO_FUNCIONAMENTO,IN_AGUA_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_TRATAMENTO_LIXO_INEXISTENTE,IN_ALMOXARIFADO,...,IN_SALA_DIRETORIA,IN_SALA_LEITURA,IN_SALA_PROFESSOR,IN_SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,IN_INTERNET,IN_ALIMENTACAO,IN_EXAME_SELECAO,IN_ORGAO_NENHUM
0,RO,1100015,2,2,1,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,RO,1100015,2,1,1,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,14.0,8.0,1.0,1.0,0.0,0.0
2,RO,1100015,3,2,1,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0
3,RO,1100015,3,1,1,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,5.0,0.0,1.0,1.0,0.0,0.0
4,RO,1100015,3,2,2,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228497,DF,5300108,2,1,1,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,11.0,2.0,1.0,1.0,0.0,0.0
228498,DF,5300108,2,1,1,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,16.0,15.0,1.0,1.0,0.0,0.0
228499,DF,5300108,2,1,1,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,19.0,19.0,1.0,1.0,0.0,0.0
228500,DF,5300108,2,1,1,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,10.0,0.0,1.0,1.0,0.0,0.0


In [48]:
# Remove non-active schools: only rows with TP_SITUACAO_FUNCIONAMENTO == 1 are kept.
is_inactive = df['TP_SITUACAO_FUNCIONAMENTO'] != 1
df = df.drop(index=df.loc[is_inactive].index)
df

Unnamed: 0,SG_UF,CO_MUNICIPIO,TP_DEPENDENCIA,TP_LOCALIZACAO,TP_SITUACAO_FUNCIONAMENTO,IN_AGUA_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_TRATAMENTO_LIXO_INEXISTENTE,IN_ALMOXARIFADO,...,IN_SALA_DIRETORIA,IN_SALA_LEITURA,IN_SALA_PROFESSOR,IN_SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,IN_INTERNET,IN_ALIMENTACAO,IN_EXAME_SELECAO,IN_ORGAO_NENHUM
0,RO,1100015,2,2,1,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,RO,1100015,2,1,1,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,14.0,8.0,1.0,1.0,0.0,0.0
2,RO,1100015,3,2,1,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0
3,RO,1100015,3,1,1,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,5.0,0.0,1.0,1.0,0.0,0.0
7,RO,1100015,3,2,1,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,7.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228497,DF,5300108,2,1,1,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,11.0,2.0,1.0,1.0,0.0,0.0
228498,DF,5300108,2,1,1,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,16.0,15.0,1.0,1.0,0.0,0.0
228499,DF,5300108,2,1,1,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,19.0,19.0,1.0,1.0,0.0,0.0
228500,DF,5300108,2,1,1,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,10.0,0.0,1.0,1.0,0.0,0.0


In [49]:
# The two columns below were only needed to filter rows; after filtering
# they carry no further information, so they are removed.
filter_columns = ['TP_DEPENDENCIA', 'TP_SITUACAO_FUNCIONAMENTO']
df = df.drop(columns=filter_columns)
df

Unnamed: 0,SG_UF,CO_MUNICIPIO,TP_LOCALIZACAO,IN_AGUA_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_TRATAMENTO_LIXO_INEXISTENTE,IN_ALMOXARIFADO,IN_AREA_VERDE,IN_AUDITORIO,...,IN_SALA_DIRETORIA,IN_SALA_LEITURA,IN_SALA_PROFESSOR,IN_SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,IN_INTERNET,IN_ALIMENTACAO,IN_EXAME_SELECAO,IN_ORGAO_NENHUM
0,RO,1100015,2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,RO,1100015,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,14.0,8.0,1.0,1.0,0.0,0.0
2,RO,1100015,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0
3,RO,1100015,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,5.0,0.0,1.0,1.0,0.0,0.0
7,RO,1100015,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,7.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228497,DF,5300108,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,11.0,2.0,1.0,1.0,0.0,0.0
228498,DF,5300108,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,16.0,15.0,1.0,1.0,0.0,0.0
228499,DF,5300108,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,19.0,19.0,1.0,1.0,0.0,0.0
228500,DF,5300108,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,10.0,0.0,1.0,1.0,0.0,0.0


### Check Missing Values

In [50]:
# Compute the null mask once and reuse it for both the test and the preview.
null_mask = df.isnull()
if not null_mask.values.any():
    print('no missing values!')
else:
    # Show a few offending rows plus a missingno matrix of the whole frame.
    print("\nPreview of data with null values:\nxxxxxxxxxxxxxxxx")
    print(df[null_mask.any(axis=1)].head(3))
    missingno.matrix(df)
    plt.show()

no missing values!


### Column Merging and Transforming

In [51]:
# Collapse the four outdoor-area indicators into a single PATIO column.
patio_sources = ['IN_AREA_VERDE', 'IN_PATIO_COBERTO',
                 'IN_PATIO_DESCOBERTO', 'IN_PARQUE_INFANTIL']
patio = df[patio_sources[0]]
for source in patio_sources[1:]:
    # element-wise Python `or`: keeps the left value when it is truthy, else the right one
    patio = patio.combine(df[source], lambda a, b: a or b)
df['PATIO'] = patio
df = df.drop(columns=patio_sources)


In [52]:
# Merge the two library indicators into a single BIBLIOTECA column
# (element-wise Python `or`), then discard the source columns.
df = (
    df.assign(BIBLIOTECA=df['IN_BIBLIOTECA'].combine(
        df['IN_BIBLIOTECA_SALA_LEITURA'], lambda a, b: a or b))
    .drop(columns=['IN_BIBLIOTECA', 'IN_BIBLIOTECA_SALA_LEITURA'])
)

In [53]:
# Preview the frame with the merged PATIO and BIBLIOTECA columns appended.
df.head()

Unnamed: 0,SG_UF,CO_MUNICIPIO,TP_LOCALIZACAO,IN_AGUA_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_TRATAMENTO_LIXO_INEXISTENTE,IN_ALMOXARIFADO,IN_AUDITORIO,IN_BANHEIRO,...,IN_SALA_PROFESSOR,IN_SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,IN_INTERNET,IN_ALIMENTACAO,IN_EXAME_SELECAO,IN_ORGAO_NENHUM,PATIO,BIBLIOTECA
0,RO,1100015,2,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,RO,1100015,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,14.0,8.0,1.0,1.0,0.0,0.0,1.0,1.0
2,RO,1100015,2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,RO,1100015,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,5.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
7,RO,1100015,2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,7.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


In [54]:
# Turn the TP_LOCALIZACAO code into a 0/1 RURAL flag: code 2 -> 1, code 1 -> 0.
# Any other code has no entry in the mapping and therefore becomes NaN.
rural_by_code = {1: 0, 2: 1}
df['RURAL'] = df['TP_LOCALIZACAO'].map(rural_by_code)

In [55]:
# RURAL now carries this information, so the original code column is dropped.
df = df.drop(columns=['TP_LOCALIZACAO'])

In [56]:
# Display the frame with RURAL in place of TP_LOCALIZACAO.
df

Unnamed: 0,SG_UF,CO_MUNICIPIO,IN_AGUA_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_TRATAMENTO_LIXO_INEXISTENTE,IN_ALMOXARIFADO,IN_AUDITORIO,IN_BANHEIRO,IN_COZINHA,...,IN_SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,IN_INTERNET,IN_ALIMENTACAO,IN_EXAME_SELECAO,IN_ORGAO_NENHUM,PATIO,BIBLIOTECA,RURAL
0,RO,1100015,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
1,RO,1100015,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,14.0,8.0,1.0,1.0,0.0,0.0,1.0,1.0,0
2,RO,1100015,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
3,RO,1100015,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,5.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0
7,RO,1100015,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,7.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228497,DF,5300108,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,11.0,2.0,1.0,1.0,0.0,0.0,1.0,1.0,0
228498,DF,5300108,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,16.0,15.0,1.0,1.0,0.0,0.0,1.0,1.0,0
228499,DF,5300108,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,19.0,19.0,1.0,1.0,0.0,0.0,1.0,1.0,0
228500,DF,5300108,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,10.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0


In [57]:
def invert_booleans(df, columns):
    """Flip 0/1 indicator columns so they express existence instead of inexistence.

    Each listed column is replaced by its logical negation stored as
    integers: falsy values (0) become 1 and every truthy value becomes 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the indicator columns. It is modified in place.
    columns : iterable of str
        Names of the columns to invert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    for name in columns:
        negated = np.logical_not(df[name])
        df[name] = negated.astype(int)
    return df


In [58]:
# Snapshot taken before the inversion is applied, for comparison with the next cell's output.
df.head()

Unnamed: 0,SG_UF,CO_MUNICIPIO,IN_AGUA_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_TRATAMENTO_LIXO_INEXISTENTE,IN_ALMOXARIFADO,IN_AUDITORIO,IN_BANHEIRO,IN_COZINHA,...,IN_SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,IN_INTERNET,IN_ALIMENTACAO,IN_EXAME_SELECAO,IN_ORGAO_NENHUM,PATIO,BIBLIOTECA,RURAL
0,RO,1100015,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
1,RO,1100015,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,14.0,8.0,1.0,1.0,0.0,0.0,1.0,1.0,0
2,RO,1100015,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
3,RO,1100015,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,5.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0
7,RO,1100015,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,7.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1


In [59]:
# Columns that encode the ABSENCE of something; after inversion 1 means "present".
inexistence_columns = [
    'IN_AGUA_INEXISTENTE',
    'IN_ENERGIA_INEXISTENTE',
    'IN_ESGOTO_INEXISTENTE',
    'IN_TRATAMENTO_LIXO_INEXISTENTE',
    'IN_ORGAO_NENHUM',
]
df = invert_booleans(df, inexistence_columns)
df.head()

Unnamed: 0,SG_UF,CO_MUNICIPIO,IN_AGUA_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_TRATAMENTO_LIXO_INEXISTENTE,IN_ALMOXARIFADO,IN_AUDITORIO,IN_BANHEIRO,IN_COZINHA,...,IN_SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,IN_INTERNET,IN_ALIMENTACAO,IN_EXAME_SELECAO,IN_ORGAO_NENHUM,PATIO,BIBLIOTECA,RURAL
0,RO,1100015,1,1,0,0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1,0.0,0.0,1
1,RO,1100015,1,1,1,0,0.0,0.0,1.0,1.0,...,1.0,14.0,8.0,1.0,1.0,0.0,1,1.0,1.0,0
2,RO,1100015,1,1,1,0,0.0,0.0,1.0,1.0,...,0.0,2.0,0.0,0.0,1.0,0.0,1,1.0,0.0,1
3,RO,1100015,1,1,1,0,0.0,0.0,0.0,1.0,...,0.0,5.0,0.0,1.0,1.0,0.0,1,1.0,1.0,0
7,RO,1100015,1,1,1,0,0.0,0.0,1.0,1.0,...,0.0,7.0,0.0,1.0,1.0,0.0,1,1.0,0.0,1


In [60]:
# List the current column names before renaming them.
df.columns

Index(['SG_UF', 'CO_MUNICIPIO', 'IN_AGUA_INEXISTENTE',
       'IN_ENERGIA_INEXISTENTE', 'IN_ESGOTO_INEXISTENTE',
       'IN_TRATAMENTO_LIXO_INEXISTENTE', 'IN_ALMOXARIFADO', 'IN_AUDITORIO',
       'IN_BANHEIRO', 'IN_COZINHA', 'IN_LABORATORIO_CIENCIAS',
       'IN_LABORATORIO_INFORMATICA', 'IN_QUADRA_ESPORTES', 'IN_REFEITORIO',
       'IN_SALA_DIRETORIA', 'IN_SALA_LEITURA', 'IN_SALA_PROFESSOR',
       'IN_SECRETARIA', 'QT_SALAS_UTILIZADAS', 'QT_DESKTOP_ALUNO',
       'IN_INTERNET', 'IN_ALIMENTACAO', 'IN_EXAME_SELECAO', 'IN_ORGAO_NENHUM',
       'PATIO', 'BIBLIOTECA', 'RURAL'],
      dtype='object')

In [61]:
# Columns whose values were inverted above: their new names state what exists.
inverted_names = {
    'IN_AGUA_INEXISTENTE': 'AGUA',
    'IN_ENERGIA_INEXISTENTE': 'ENERGIA',
    'IN_ESGOTO_INEXISTENTE': 'REDE_ESGOTO',
    'IN_TRATAMENTO_LIXO_INEXISTENTE': 'TRATA_LIXO',
    'IN_ORGAO_NENHUM': 'ORGAOS',
}
# Remaining indicator columns: same name without the 'IN_' prefix.
indicator_names = {
    'IN_ALMOXARIFADO': 'ALMOXARIFADO',
    'IN_AUDITORIO': 'AUDITORIO',
    'IN_BANHEIRO': 'BANHEIRO',
    'IN_COZINHA': 'COZINHA',
    'IN_LABORATORIO_CIENCIAS': 'LABORATORIO_CIENCIAS',
    'IN_LABORATORIO_INFORMATICA': 'LABORATORIO_INFORMATICA',
    'IN_QUADRA_ESPORTES': 'QUADRA_ESPORTES',
    'IN_REFEITORIO': 'REFEITORIO',
    'IN_SALA_DIRETORIA': 'SALA_DIRETORIA',
    'IN_SALA_LEITURA': 'SALA_LEITURA',
    'IN_SALA_PROFESSOR': 'SALA_PROFESSOR',
    'IN_SECRETARIA': 'SECRETARIA',
    'IN_INTERNET': 'INTERNET',
    'IN_ALIMENTACAO': 'ALIMENTACAO',
    'IN_EXAME_SELECAO': 'EXAME_SELECAO',
}
df = df.rename(columns={**inverted_names, **indicator_names})


In [62]:
# Preview the frame under its new column names.
df.head()

Unnamed: 0,SG_UF,CO_MUNICIPIO,AGUA,ENERGIA,REDE_ESGOTO,TRATA_LIXO,ALMOXARIFADO,AUDITORIO,BANHEIRO,COZINHA,...,SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,INTERNET,ALIMENTACAO,EXAME_SELECAO,ORGAOS,PATIO,BIBLIOTECA,RURAL
0,RO,1100015,1,1,0,0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1,0.0,0.0,1
1,RO,1100015,1,1,1,0,0.0,0.0,1.0,1.0,...,1.0,14.0,8.0,1.0,1.0,0.0,1,1.0,1.0,0
2,RO,1100015,1,1,1,0,0.0,0.0,1.0,1.0,...,0.0,2.0,0.0,0.0,1.0,0.0,1,1.0,0.0,1
3,RO,1100015,1,1,1,0,0.0,0.0,0.0,1.0,...,0.0,5.0,0.0,1.0,1.0,0.0,1,1.0,1.0,0
7,RO,1100015,1,1,1,0,0.0,0.0,1.0,1.0,...,0.0,7.0,0.0,1.0,1.0,0.0,1,1.0,0.0,1


In [63]:
def floats_to_ints(df):
    """Cast every floating-point column of ``df`` to integers.

    The indicator and count columns are read as floats but only hold whole
    numbers, so integers are the more natural representation.

    Any float dtype is detected (float32 as well as float64), not just the
    default ``float64``. Non-float columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    ``astype(int)`` discards any fractional part and fails on NaN/inf, so
    missing values must be handled before calling this function.
    """
    for column in df.columns:
        # is_float_dtype covers every float width; comparing the dtype with
        # the string 'float' only ever matched float64.
        if pd.api.types.is_float_dtype(df[column].dtype):
            df[column] = df[column].astype(int)
    return df

In [64]:
# Convert the float columns to integers and preview the result.
df = df.pipe(floats_to_ints)
df.head()

Unnamed: 0,SG_UF,CO_MUNICIPIO,AGUA,ENERGIA,REDE_ESGOTO,TRATA_LIXO,ALMOXARIFADO,AUDITORIO,BANHEIRO,COZINHA,...,SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,INTERNET,ALIMENTACAO,EXAME_SELECAO,ORGAOS,PATIO,BIBLIOTECA,RURAL
0,RO,1100015,1,1,0,0,0,0,1,1,...,0,1,0,0,1,0,1,0,0,1
1,RO,1100015,1,1,1,0,0,0,1,1,...,1,14,8,1,1,0,1,1,1,0
2,RO,1100015,1,1,1,0,0,0,1,1,...,0,2,0,0,1,0,1,1,0,1
3,RO,1100015,1,1,1,0,0,0,0,1,...,0,5,0,1,1,0,1,1,1,0
7,RO,1100015,1,1,1,0,0,0,1,1,...,0,7,0,1,1,0,1,1,0,1


In [65]:
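# Optional export of the school-level table (before aggregating by municipality).
# Disabled by default; uncomment to write the file.
# df.to_csv('./data/censo_inep_processado.csv', index=False)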

### Grouping by county

In [66]:
# Average every remaining column per municipality code. SG_UF is text and is
# dropped first; CO_MUNICIPIO becomes the index of the result.
df = (
    df.drop(columns='SG_UF')
    .groupby(['CO_MUNICIPIO'])
    .mean()
)

In [67]:
# Preview the per-municipality means; CO_MUNICIPIO is now the index.
df.head()

Unnamed: 0_level_0,AGUA,ENERGIA,REDE_ESGOTO,TRATA_LIXO,ALMOXARIFADO,AUDITORIO,BANHEIRO,COZINHA,LABORATORIO_CIENCIAS,LABORATORIO_INFORMATICA,...,SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,INTERNET,ALIMENTACAO,EXAME_SELECAO,ORGAOS,PATIO,BIBLIOTECA,RURAL
CO_MUNICIPIO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1100015,1.0,0.909091,0.969697,0.060606,0.212121,0.030303,0.909091,0.939394,0.090909,0.151515,...,0.272727,6.727273,2.181818,0.575758,1.0,0.0,0.909091,0.666667,0.272727,0.545455
1100023,1.0,1.0,1.0,0.473684,0.631579,0.447368,1.0,1.0,0.210526,0.842105,...,0.894737,13.447368,14.894737,0.973684,1.0,0.052632,1.0,1.0,0.736842,0.236842
1100031,1.0,1.0,1.0,0.0,0.5,0.0,0.875,1.0,0.0,0.25,...,0.5,8.625,5.25,1.0,1.0,0.0,0.875,1.0,0.5,0.25
1100049,1.0,1.0,0.979167,0.3125,0.625,0.166667,0.979167,0.979167,0.145833,0.354167,...,0.708333,9.729167,6.708333,0.770833,1.0,0.208333,1.0,0.958333,0.4375,0.375
1100056,1.0,1.0,1.0,0.916667,0.5,0.166667,1.0,0.916667,0.083333,0.25,...,0.75,8.416667,7.75,1.0,1.0,0.0,0.75,1.0,0.416667,0.0


In [68]:
# Summary statistics of the per-municipality means.
# NOTE(review): in the recorded output EXAME_SELECAO reaches a max of 8.375 even
# though it is the mean of what is presumably a 0/1 indicator, and the QT_* columns
# show extreme maxima (e.g. QT_DESKTOP_ALUNO 12704.29). This suggests values other
# than 0/1 (possibly sentinel codes) in the source data — confirm and clean upstream.
df.describe()

Unnamed: 0,AGUA,ENERGIA,REDE_ESGOTO,TRATA_LIXO,ALMOXARIFADO,AUDITORIO,BANHEIRO,COZINHA,LABORATORIO_CIENCIAS,LABORATORIO_INFORMATICA,...,SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,INTERNET,ALIMENTACAO,EXAME_SELECAO,ORGAOS,PATIO,BIBLIOTECA,RURAL
count,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,...,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0
mean,0.983156,0.988385,0.956146,0.297186,0.430715,0.0814,0.980488,0.969746,0.092191,0.378456,...,0.664284,7.462413,39.062599,0.783163,0.993658,0.120989,0.789018,0.806931,0.512164,0.36022
std,0.074342,0.0649,0.120357,0.329372,0.266847,0.116041,0.068727,0.077796,0.117377,0.229101,...,0.258562,8.323779,471.731844,0.263322,0.026301,0.468927,0.258702,0.218849,0.260824,0.279927
min,0.0,0.092784,0.054545,0.0,0.0,0.0,0.117647,0.120482,0.0,0.0,...,0.0,1.68254,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,0.991736,0.0,0.214286,0.0,1.0,0.979167,0.0,0.190651,...,0.473684,5.333333,1.545455,0.625,1.0,0.0,0.666667,0.692308,0.307692,0.083333
50%,1.0,1.0,1.0,0.157895,0.413793,0.038462,1.0,1.0,0.055556,0.357143,...,0.692308,7.0,4.25,0.9,1.0,0.0,0.9,0.875,0.5,0.347826
75%,1.0,1.0,1.0,0.5,0.636364,0.125,1.0,1.0,0.142857,0.532979,...,0.886792,8.857143,7.666667,1.0,1.0,0.043478,1.0,1.0,0.698113,0.6
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,530.117647,12704.285714,1.0,1.0,8.375,1.0,1.0,1.0,1.0


In [72]:
# Write the aggregated table; the index (CO_MUNICIPIO) is kept as the first CSV column.
df.to_csv('./data/censo_inep_municipio.csv', index=True)