# Preprocessing school census data

Here we pre-process the INEP school census data stored in ```data/censo_escolar_2019.csv```.

## Imports

In [1]:
import json
import missingno
import matplotlib
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# notebook only
%matplotlib inline

## Preliminary Data Processing

In [2]:
# Load the raw census export, which is ';'-delimited.
# CO_ORGAO_REGIONAL is forced to text (presumably to avoid mixed-type
# inference on that column -- TODO confirm).
df = pd.read_csv(
    'data/censo_escolar_2019.csv',
    sep=';',
    dtype={'CO_ORGAO_REGIONAL': str},
)

In [3]:
# Preview the first rows of the raw census table.
df.head()

Unnamed: 0,NU_ANO_CENSO,NO_REGIAO,CO_REGIAO,NO_UF,SG_UF,CO_UF,NO_MUNICIPIO,CO_MUNICIPIO,NO_MESORREGIAO,CO_MESORREGIAO,...,QT_TUR_FUND_AF,QT_TUR_MED,QT_TUR_PROF,QT_TUR_PROF_TEC,QT_TUR_EJA,QT_TUR_EJA_FUND,QT_TUR_EJA_MED,QT_TUR_ESP,QT_TUR_ESP_CC,QT_TUR_ESP_CE
0,2019,Norte,1,Rondonia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,1102,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2019,Norte,1,Rondonia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,1102,...,0.0,0.0,0.0,0.0,13.0,7.0,6.0,2.0,2.0,0.0
2,2019,Norte,1,Rondonia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,1102,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019,Norte,1,Rondonia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,1102,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0
4,2019,Norte,1,Rondonia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,1102,...,,,,,,,,,,


### Drop unwanted data

Here we remove the rows and columns we don't need: entries for private and non-active schools, and every column that is not listed in the ```selected-features.json``` file.

In [4]:
# Read the list of census columns to keep from the "features" key of the
# selection file. The encoding is pinned to UTF-8 (the standard encoding for
# JSON) so the read does not depend on the platform's locale default.
with open('./data/selected-features.json', 'r', encoding='utf-8') as feature_file:
    features = json.load(feature_file)['features']
print(features)

['SG_UF', 'TP_DEPENDENCIA', 'TP_LOCALIZACAO', 'TP_SITUACAO_FUNCIONAMENTO', 'IN_AGUA_INEXISTENTE', 'IN_ENERGIA_INEXISTENTE', 'IN_ESGOTO_INEXISTENTE', 'IN_TRATAMENTO_LIXO_INEXISTENTE', 'IN_ALMOXARIFADO', 'IN_AREA_VERDE', 'IN_AUDITORIO', 'IN_BANHEIRO', 'IN_BIBLIOTECA', 'IN_BIBLIOTECA_SALA_LEITURA', 'IN_COZINHA', 'IN_LABORATORIO_CIENCIAS', 'IN_LABORATORIO_INFORMATICA', 'IN_PATIO_COBERTO', 'IN_PATIO_DESCOBERTO', 'IN_PARQUE_INFANTIL', 'IN_QUADRA_ESPORTES', 'IN_REFEITORIO', 'IN_SALA_DIRETORIA', 'IN_SALA_LEITURA', 'IN_SALA_PROFESSOR', 'IN_SECRETARIA', 'QT_SALAS_UTILIZADAS', 'QT_DESKTOP_ALUNO', 'IN_INTERNET', 'IN_ALIMENTACAO', 'IN_EXAME_SELECAO', 'IN_ORGAO_NENHUM']


In [5]:
# Drop every column that is not in the features list.
# One DataFrame.drop call replaces the per-column `del` loop: the surviving
# columns and their order are the same, but the frame is rebuilt once instead
# of once per removed column.
unwanted_columns = [col for col in df.columns if col not in features]
df = df.drop(columns=unwanted_columns)

In [6]:
# Preview the table after restricting it to the selected feature columns.
df.head()

Unnamed: 0,SG_UF,TP_DEPENDENCIA,TP_LOCALIZACAO,TP_SITUACAO_FUNCIONAMENTO,IN_AGUA_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_TRATAMENTO_LIXO_INEXISTENTE,IN_ALMOXARIFADO,IN_AREA_VERDE,...,IN_SALA_DIRETORIA,IN_SALA_LEITURA,IN_SALA_PROFESSOR,IN_SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,IN_INTERNET,IN_ALIMENTACAO,IN_EXAME_SELECAO,IN_ORGAO_NENHUM
0,RO,2,2,1,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,RO,2,1,1,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,14.0,8.0,1.0,1.0,0.0,0.0
2,RO,3,2,1,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0
3,RO,3,1,1,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,5.0,0.0,1.0,1.0,0.0,0.0
4,RO,3,2,2,,,,,,,...,,,,,,,,,,


In [7]:
# Remove private schools: rows whose TP_DEPENDENCIA code is 4.
private_rows = df.loc[df['TP_DEPENDENCIA'] == 4].index
df = df.drop(index=private_rows)
df

Unnamed: 0,SG_UF,TP_DEPENDENCIA,TP_LOCALIZACAO,TP_SITUACAO_FUNCIONAMENTO,IN_AGUA_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_TRATAMENTO_LIXO_INEXISTENTE,IN_ALMOXARIFADO,IN_AREA_VERDE,...,IN_SALA_DIRETORIA,IN_SALA_LEITURA,IN_SALA_PROFESSOR,IN_SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,IN_INTERNET,IN_ALIMENTACAO,IN_EXAME_SELECAO,IN_ORGAO_NENHUM
0,RO,2,2,1,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,RO,2,1,1,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,14.0,8.0,1.0,1.0,0.0,0.0
2,RO,3,2,1,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0
3,RO,3,1,1,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,5.0,0.0,1.0,1.0,0.0,0.0
4,RO,3,2,2,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228497,DF,2,1,1,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,11.0,2.0,1.0,1.0,0.0,0.0
228498,DF,2,1,1,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,16.0,15.0,1.0,1.0,0.0,0.0
228499,DF,2,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,19.0,19.0,1.0,1.0,0.0,0.0
228500,DF,2,1,1,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,10.0,0.0,1.0,1.0,0.0,0.0


In [8]:
# Remove non-active schools: keep only rows whose
# TP_SITUACAO_FUNCIONAMENTO code is 1.
inactive_rows = df.loc[df['TP_SITUACAO_FUNCIONAMENTO'] != 1].index
df = df.drop(index=inactive_rows)
df

Unnamed: 0,SG_UF,TP_DEPENDENCIA,TP_LOCALIZACAO,TP_SITUACAO_FUNCIONAMENTO,IN_AGUA_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_TRATAMENTO_LIXO_INEXISTENTE,IN_ALMOXARIFADO,IN_AREA_VERDE,...,IN_SALA_DIRETORIA,IN_SALA_LEITURA,IN_SALA_PROFESSOR,IN_SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,IN_INTERNET,IN_ALIMENTACAO,IN_EXAME_SELECAO,IN_ORGAO_NENHUM
0,RO,2,2,1,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,RO,2,1,1,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,14.0,8.0,1.0,1.0,0.0,0.0
2,RO,3,2,1,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0
3,RO,3,1,1,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,5.0,0.0,1.0,1.0,0.0,0.0
7,RO,3,2,1,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,7.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228497,DF,2,1,1,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,11.0,2.0,1.0,1.0,0.0,0.0
228498,DF,2,1,1,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,16.0,15.0,1.0,1.0,0.0,0.0
228499,DF,2,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,19.0,19.0,1.0,1.0,0.0,0.0
228500,DF,2,1,1,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,10.0,0.0,1.0,1.0,0.0,0.0


In [9]:
# These two columns were only needed for the row filters above.
filter_columns = ['TP_DEPENDENCIA', 'TP_SITUACAO_FUNCIONAMENTO']
df = df.drop(columns=filter_columns)
df

Unnamed: 0,SG_UF,TP_LOCALIZACAO,IN_AGUA_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_TRATAMENTO_LIXO_INEXISTENTE,IN_ALMOXARIFADO,IN_AREA_VERDE,IN_AUDITORIO,IN_BANHEIRO,...,IN_SALA_DIRETORIA,IN_SALA_LEITURA,IN_SALA_PROFESSOR,IN_SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,IN_INTERNET,IN_ALIMENTACAO,IN_EXAME_SELECAO,IN_ORGAO_NENHUM
0,RO,2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,RO,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,14.0,8.0,1.0,1.0,0.0,0.0
2,RO,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0
3,RO,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,5.0,0.0,1.0,1.0,0.0,0.0
7,RO,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,7.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228497,DF,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,11.0,2.0,1.0,1.0,0.0,0.0
228498,DF,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,16.0,15.0,1.0,1.0,0.0,0.0
228499,DF,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,19.0,19.0,1.0,1.0,0.0,0.0
228500,DF,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,1.0,10.0,0.0,1.0,1.0,0.0,0.0


### Check Missing Values.

In [10]:
# Flag the rows holding at least one null. If there are none, say so;
# otherwise print a short preview of the affected rows and plot the
# missingness matrix of the whole table.
rows_with_nulls = df.isnull().any(axis=1)
if not rows_with_nulls.any():
    print('no missing values!')
else:
    print("\nPreview of data with null values:\nxxxxxxxxxxxxxxxx")
    print(df[rows_with_nulls].head(3))
    missingno.matrix(df)
    plt.show()

no missing values!


### Column Merging and Transforming

In [11]:
# PATIO is 1.0 when the school reports at least one of the four outdoor-area
# indicators below, and 0.0 otherwise.
# A row-wise `any` is the vectorised form of chaining `a or b` over the
# columns; it replaces three element-wise Python passes over the table.
# It yields the same values as the chained `or` for 0/1 indicators without
# missing values, which the missing-value check above reports for this data.
outdoor_columns = ['IN_AREA_VERDE', 'IN_PATIO_COBERTO',
                   'IN_PATIO_DESCOBERTO', 'IN_PARQUE_INFANTIL']
df['PATIO'] = df[outdoor_columns].any(axis=1).astype(float)
# The source indicators are now summarised by PATIO.
df = df.drop(columns=outdoor_columns)


In [12]:
# BIBLIOTECA is 1.0 when either library indicator is set, and 0.0 otherwise.
# A row-wise `any` is the vectorised form of the element-wise `a or b`; it
# yields the same values for 0/1 indicators without missing values.
library_columns = ['IN_BIBLIOTECA', 'IN_BIBLIOTECA_SALA_LEITURA']
df['BIBLIOTECA'] = df[library_columns].any(axis=1).astype(float)
# The source indicators are now summarised by BIBLIOTECA.
df = df.drop(columns=library_columns)

In [13]:
# Preview the table with the merged PATIO and BIBLIOTECA columns.
df.head()

Unnamed: 0,SG_UF,TP_LOCALIZACAO,IN_AGUA_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_TRATAMENTO_LIXO_INEXISTENTE,IN_ALMOXARIFADO,IN_AUDITORIO,IN_BANHEIRO,IN_COZINHA,...,IN_SALA_PROFESSOR,IN_SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,IN_INTERNET,IN_ALIMENTACAO,IN_EXAME_SELECAO,IN_ORGAO_NENHUM,PATIO,BIBLIOTECA
0,RO,2,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,RO,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,14.0,8.0,1.0,1.0,0.0,0.0,1.0,1.0
2,RO,2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,RO,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,5.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
7,RO,2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,7.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


In [14]:
# Turn the 'TP_LOCALIZACAO' code into a 'RURAL' flag: code 2 -> 1, code 1 -> 0.
# A code with no entry in the mapping becomes NaN.
location_to_rural = {2: 1, 1: 0}
df['RURAL'] = df['TP_LOCALIZACAO'].map(location_to_rural)

In [15]:
# TP_LOCALIZACAO is now represented by the RURAL flag.
df = df.drop(columns=['TP_LOCALIZACAO'])

In [16]:
# Display the table after replacing TP_LOCALIZACAO with RURAL.
df

Unnamed: 0,SG_UF,IN_AGUA_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_TRATAMENTO_LIXO_INEXISTENTE,IN_ALMOXARIFADO,IN_AUDITORIO,IN_BANHEIRO,IN_COZINHA,IN_LABORATORIO_CIENCIAS,...,IN_SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,IN_INTERNET,IN_ALIMENTACAO,IN_EXAME_SELECAO,IN_ORGAO_NENHUM,PATIO,BIBLIOTECA,RURAL
0,RO,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
1,RO,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,14.0,8.0,1.0,1.0,0.0,0.0,1.0,1.0,0
2,RO,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
3,RO,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,5.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0
7,RO,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,7.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228497,DF,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,11.0,2.0,1.0,1.0,0.0,0.0,1.0,1.0,0
228498,DF,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,16.0,15.0,1.0,1.0,0.0,0.0,1.0,1.0,0
228499,DF,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,19.0,19.0,1.0,1.0,0.0,0.0,1.0,1.0,0
228500,DF,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,10.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0


In [17]:
# Some indicator columns encode the *absence* of a resource; they are inverted
# so that 1 means the resource exists.
def invert_booleans(df, columns):
    """Return a copy of ``df`` with the given indicator columns inverted.

    Each listed column is replaced by its logical negation, stored as
    integers: falsy values (0, 0.0, False) become 1 and every truthy value
    becomes 0. The input frame is left untouched, so calling this function
    has no effect on a table that was displayed or reused elsewhere.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns to invert.
    columns : str or iterable of str
        Name(s) of the columns to invert. A single name is accepted as a
        convenience and treated as a one-element list.

    Returns
    -------
    pandas.DataFrame
        A new frame; only the listed columns differ from ``df``.

    Raises
    ------
    KeyError
        If a listed column is not present in ``df``.
    """
    if isinstance(columns, str):
        # Iterating a bare string would walk its characters, not column names.
        columns = [columns]
    inverted = df.copy()
    for column in columns:
        # xor with 1 (True) flips the truth value of every element.
        inverted[column] = np.logical_xor(inverted[column], 1).astype(int)
    return inverted


In [18]:
# Snapshot before the inversion, for comparison with the next cell's output.
df.head()

Unnamed: 0,SG_UF,IN_AGUA_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_TRATAMENTO_LIXO_INEXISTENTE,IN_ALMOXARIFADO,IN_AUDITORIO,IN_BANHEIRO,IN_COZINHA,IN_LABORATORIO_CIENCIAS,...,IN_SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,IN_INTERNET,IN_ALIMENTACAO,IN_EXAME_SELECAO,IN_ORGAO_NENHUM,PATIO,BIBLIOTECA,RURAL
0,RO,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
1,RO,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,14.0,8.0,1.0,1.0,0.0,0.0,1.0,1.0,0
2,RO,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
3,RO,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,5.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0
7,RO,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,7.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1


In [19]:
# Columns that flag the absence of a resource; after inversion 1 means present.
absence_flags = [
    'IN_AGUA_INEXISTENTE',
    'IN_ENERGIA_INEXISTENTE',
    'IN_ESGOTO_INEXISTENTE',
    'IN_TRATAMENTO_LIXO_INEXISTENTE',
    'IN_ORGAO_NENHUM',
]
df = invert_booleans(df, absence_flags)
df.head()

Unnamed: 0,SG_UF,IN_AGUA_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_TRATAMENTO_LIXO_INEXISTENTE,IN_ALMOXARIFADO,IN_AUDITORIO,IN_BANHEIRO,IN_COZINHA,IN_LABORATORIO_CIENCIAS,...,IN_SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,IN_INTERNET,IN_ALIMENTACAO,IN_EXAME_SELECAO,IN_ORGAO_NENHUM,PATIO,BIBLIOTECA,RURAL
0,RO,1,1,0,0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1,0.0,0.0,1
1,RO,1,1,1,0,0.0,0.0,1.0,1.0,1.0,...,1.0,14.0,8.0,1.0,1.0,0.0,1,1.0,1.0,0
2,RO,1,1,1,0,0.0,0.0,1.0,1.0,0.0,...,0.0,2.0,0.0,0.0,1.0,0.0,1,1.0,0.0,1
3,RO,1,1,1,0,0.0,0.0,0.0,1.0,0.0,...,0.0,5.0,0.0,1.0,1.0,0.0,1,1.0,1.0,0
7,RO,1,1,1,0,0.0,0.0,1.0,1.0,0.0,...,0.0,7.0,0.0,1.0,1.0,0.0,1,1.0,0.0,1


In [20]:
# List the current column names before renaming them.
df.columns

Index(['SG_UF', 'IN_AGUA_INEXISTENTE', 'IN_ENERGIA_INEXISTENTE',
       'IN_ESGOTO_INEXISTENTE', 'IN_TRATAMENTO_LIXO_INEXISTENTE',
       'IN_ALMOXARIFADO', 'IN_AUDITORIO', 'IN_BANHEIRO', 'IN_COZINHA',
       'IN_LABORATORIO_CIENCIAS', 'IN_LABORATORIO_INFORMATICA',
       'IN_QUADRA_ESPORTES', 'IN_REFEITORIO', 'IN_SALA_DIRETORIA',
       'IN_SALA_LEITURA', 'IN_SALA_PROFESSOR', 'IN_SECRETARIA',
       'QT_SALAS_UTILIZADAS', 'QT_DESKTOP_ALUNO', 'IN_INTERNET',
       'IN_ALIMENTACAO', 'IN_EXAME_SELECAO', 'IN_ORGAO_NENHUM', 'PATIO',
       'BIBLIOTECA', 'RURAL'],
      dtype='object')

In [21]:
# Columns whose values were inverted earlier: the new name states what exists.
inverted_names = {
    'IN_AGUA_INEXISTENTE': 'AGUA',
    'IN_ENERGIA_INEXISTENTE': 'ENERGIA',
    'IN_ESGOTO_INEXISTENTE': 'REDE_ESGOTO',
    'IN_TRATAMENTO_LIXO_INEXISTENTE': 'TRATA_LIXO',
    'IN_ORGAO_NENHUM': 'ORGAOS',
}
# Plain indicators: the new name is the old one without the IN_ prefix.
plain_names = {
    'IN_ALMOXARIFADO': 'ALMOXARIFADO',
    'IN_AUDITORIO': 'AUDITORIO',
    'IN_BANHEIRO': 'BANHEIRO',
    'IN_COZINHA': 'COZINHA',
    'IN_LABORATORIO_CIENCIAS': 'LABORATORIO_CIENCIAS',
    'IN_LABORATORIO_INFORMATICA': 'LABORATORIO_INFORMATICA',
    'IN_QUADRA_ESPORTES': 'QUADRA_ESPORTES',
    'IN_REFEITORIO': 'REFEITORIO',
    'IN_SALA_DIRETORIA': 'SALA_DIRETORIA',
    'IN_SALA_LEITURA': 'SALA_LEITURA',
    'IN_SALA_PROFESSOR': 'SALA_PROFESSOR',
    'IN_SECRETARIA': 'SECRETARIA',
    'IN_INTERNET': 'INTERNET',
    'IN_ALIMENTACAO': 'ALIMENTACAO',
    'IN_EXAME_SELECAO': 'EXAME_SELECAO',
}
df = df.rename(columns={**inverted_names, **plain_names})


In [22]:
# Preview the table with the renamed columns.
df.head()

Unnamed: 0,SG_UF,AGUA,ENERGIA,REDE_ESGOTO,TRATA_LIXO,ALMOXARIFADO,AUDITORIO,BANHEIRO,COZINHA,LABORATORIO_CIENCIAS,...,SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,INTERNET,ALIMENTACAO,EXAME_SELECAO,ORGAOS,PATIO,BIBLIOTECA,RURAL
0,RO,1,1,0,0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1,0.0,0.0,1
1,RO,1,1,1,0,0.0,0.0,1.0,1.0,1.0,...,1.0,14.0,8.0,1.0,1.0,0.0,1,1.0,1.0,0
2,RO,1,1,1,0,0.0,0.0,1.0,1.0,0.0,...,0.0,2.0,0.0,0.0,1.0,0.0,1,1.0,0.0,1
3,RO,1,1,1,0,0.0,0.0,0.0,1.0,0.0,...,0.0,5.0,0.0,1.0,1.0,0.0,1,1.0,1.0,0
7,RO,1,1,1,0,0.0,0.0,1.0,1.0,0.0,...,0.0,7.0,0.0,1.0,1.0,0.0,1,1.0,0.0,1


In [23]:
# The float columns hold whole numbers (0/1 flags and counts), so they are
# stored as integers.
def floats_to_ints(df):
    """Return a copy of ``df`` with every float64 column cast to ``int``.

    All float64 columns are converted, not only 0/1 flags. Columns of any
    other dtype are copied unchanged. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to convert.

    Returns
    -------
    pandas.DataFrame
        A new frame whose former float64 columns have an integer dtype.

    Notes
    -----
    The cast truncates fractional parts toward zero. pandas raises an error
    when a column being cast contains NaN or infinite values, so missing
    values must be handled before calling this function.
    """
    converted = df.copy()
    for column in converted.columns:
        # 'float' compares equal to float64 only, matching the columns that
        # pandas produces when reading numeric CSV data.
        if converted[column].dtype == 'float':
            converted[column] = converted[column].astype(int)
    return converted

In [24]:
# Cast the remaining float columns to integers and preview the result.
df = df.pipe(floats_to_ints)
df.head()

Unnamed: 0,SG_UF,AGUA,ENERGIA,REDE_ESGOTO,TRATA_LIXO,ALMOXARIFADO,AUDITORIO,BANHEIRO,COZINHA,LABORATORIO_CIENCIAS,...,SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,INTERNET,ALIMENTACAO,EXAME_SELECAO,ORGAOS,PATIO,BIBLIOTECA,RURAL
0,RO,1,1,0,0,0,0,1,1,0,...,0,1,0,0,1,0,1,0,0,1
1,RO,1,1,1,0,0,0,1,1,1,...,1,14,8,1,1,0,1,1,1,0
2,RO,1,1,1,0,0,0,1,1,0,...,0,2,0,0,1,0,1,1,0,1
3,RO,1,1,1,0,0,0,0,1,0,...,0,5,0,1,1,0,1,1,1,0
7,RO,1,1,1,0,0,0,1,1,0,...,0,7,0,1,1,0,1,1,0,1


In [25]:
# Persist the processed table; the row index is not written to the file.
output_path = './data/censo_inep_processado.csv'
df.to_csv(output_path, index=False)