# Initial exploratory data analysis

## Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply matplotlib's bundled 'fivethirtyeight' style sheet to figures drawn in this session.
plt.style.use('fivethirtyeight')
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# which also hides deprecation and dtype warnings from pandas/NumPy — confirm that
# a blanket filter is really intended.
warnings.filterwarnings('ignore')

# IPython magic, valid only inside a notebook/IPython session: render matplotlib
# figures inline, directly below the cell that produces them.
%matplotlib inline

In [3]:
# Read the CSV into the working frame `df` (path is relative to the notebook's
# working directory), then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='./data/censo_inep_processado.csv')
df.head(n=5)

Unnamed: 0,SG_UF,AGUA,ENERGIA,REDE_ESGOTO,TRATA_LIXO,ALMOXARIFADO,AUDITORIO,BANHEIRO,COZINHA,LABORATORIO_CIENCIAS,...,SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,INTERNET,ALIMENTACAO,EXAME_SELECAO,ORGAOS,PATIO,BIBLIOTECA,RURAL
0,RO,1,1,0,0,0,0,1,1,0,...,0,1,0,0,1,0,1,0,0,1
1,RO,1,1,1,0,0,0,1,1,1,...,1,14,8,1,1,0,1,1,1,0
2,RO,1,1,1,0,0,0,1,1,0,...,0,2,0,0,1,0,1,1,0,1
3,RO,1,1,1,0,0,0,0,1,0,...,0,5,0,1,1,0,1,1,1,0
4,RO,1,1,1,0,0,0,1,1,0,...,0,7,0,1,1,0,1,1,0,1


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(140242, 26)

In [5]:
# Column labels of the loaded frame, to see which variables are available.
df.columns

Index(['SG_UF', 'AGUA', 'ENERGIA', 'REDE_ESGOTO', 'TRATA_LIXO', 'ALMOXARIFADO',
       'AUDITORIO', 'BANHEIRO', 'COZINHA', 'LABORATORIO_CIENCIAS',
       'LABORATORIO_INFORMATICA', 'QUADRA_ESPORTES', 'REFEITORIO',
       'SALA_DIRETORIA', 'SALA_LEITURA', 'SALA_PROFESSOR', 'SECRETARIA',
       'QT_SALAS_UTILIZADAS', 'QT_DESKTOP_ALUNO', 'INTERNET', 'ALIMENTACAO',
       'EXAME_SELECAO', 'ORGAOS', 'PATIO', 'BIBLIOTECA', 'RURAL'],
      dtype='object')

In [6]:
# Distinct-value count per column (column-wise is the default axis); a quick way to
# tell two-valued flag columns apart from count-like ones.
df.nunique()

SG_UF                       27
AGUA                         2
ENERGIA                      2
REDE_ESGOTO                  2
TRATA_LIXO                   2
ALMOXARIFADO                 2
AUDITORIO                    2
BANHEIRO                     2
COZINHA                      2
LABORATORIO_CIENCIAS         2
LABORATORIO_INFORMATICA      2
QUADRA_ESPORTES              2
REFEITORIO                   2
SALA_DIRETORIA               2
SALA_LEITURA                 2
SALA_PROFESSOR               2
SECRETARIA                   2
QT_SALAS_UTILIZADAS        126
QT_DESKTOP_ALUNO           283
INTERNET                     2
ALIMENTACAO                  2
EXAME_SELECAO                3
ORGAOS                       2
PATIO                        2
BIBLIOTECA                   2
RURAL                        2
dtype: int64

In [7]:
# Summary statistics of the numeric columns, with every cell rendered as a
# fixed-point string so large values are not shown in scientific notation.
(
    df.describe()
      .apply(lambda column: column.map('{:f}'.format))
)

Unnamed: 0,AGUA,ENERGIA,REDE_ESGOTO,TRATA_LIXO,ALMOXARIFADO,AUDITORIO,BANHEIRO,COZINHA,LABORATORIO_CIENCIAS,LABORATORIO_INFORMATICA,...,SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,INTERNET,ALIMENTACAO,EXAME_SELECAO,ORGAOS,PATIO,BIBLIOTECA,RURAL
count,140242.0,140242.0,140242.0,140242.0,140242.0,140242.0,140242.0,140242.0,140242.0,140242.0,...,140242.0,140242.0,140242.0,140242.0,140242.0,140242.0,140242.0,140242.0,140242.0,140242.0
mean,0.973125,0.970159,0.930684,0.241383,0.420195,0.089431,0.961417,0.951912,0.089203,0.349239,...,0.657064,7.747173,38.120392,0.704297,0.990666,0.157136,0.763994,0.756157,0.467428,0.390254
std,0.161719,0.17015,0.253991,0.427923,0.493592,0.285366,0.1926,0.213954,0.285037,0.476731,...,0.474692,25.387868,1711.270523,0.45636,0.09616,1.092776,0.424628,0.429401,0.49894,0.487809
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,3.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
50%,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,6.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
75%,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,10.0,6.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,8888.0,88888.0,1.0,1.0,9.0,1.0,1.0,1.0,1.0


```QT_SALAS_UTILIZADAS``` has a max value of 8888 and ```QT_DESKTOP_ALUNO``` has a max value of 88888. These appear to be the largest values each field accepts, which most likely means the entries are corrupted rather than real counts. Let's explore them.

In [8]:
# Rows that report more than 10 classrooms in use.
df.query('QT_SALAS_UTILIZADAS > 10')

Unnamed: 0,SG_UF,AGUA,ENERGIA,REDE_ESGOTO,TRATA_LIXO,ALMOXARIFADO,AUDITORIO,BANHEIRO,COZINHA,LABORATORIO_CIENCIAS,...,SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,INTERNET,ALIMENTACAO,EXAME_SELECAO,ORGAOS,PATIO,BIBLIOTECA,RURAL
1,RO,1,1,1,0,0,0,1,1,1,...,1,14,8,1,1,0,1,1,1,0
5,RO,1,1,1,1,1,0,1,1,0,...,1,13,13,1,1,0,1,1,1,0
6,RO,1,1,1,0,1,0,1,1,0,...,0,11,0,1,1,0,1,1,0,0
8,RO,1,1,1,0,0,0,1,1,1,...,1,15,12,1,1,0,1,1,1,0
13,RO,1,1,1,0,1,0,1,1,0,...,0,12,0,1,1,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140236,DF,1,1,1,0,0,1,1,1,0,...,1,25,10,1,1,0,1,1,1,0
140237,DF,1,1,1,0,0,0,1,1,0,...,1,11,2,1,1,0,1,1,1,0
140238,DF,1,1,1,1,0,0,1,1,0,...,1,16,15,1,1,0,1,1,1,0
140239,DF,1,1,1,1,0,0,1,1,0,...,1,19,19,1,1,0,1,1,1,0


In [26]:
# Rows whose classroom count equals 8888 are treated as corrupted and removed.
# The mask is built once and reused: a bare `df[...]` expression on a non-final
# line of a cell is evaluated but never displayed, so the original inspection
# line had no visible effect.
is_sentinel_rooms = df['QT_SALAS_UTILIZADAS'] == 8888

# Boolean-mask filtering keeps the same rows as dropping by index label on a
# unique index, without depending on label uniqueness.
df = df[~is_sentinel_rooms]

# Displayed as the cell output: how many rows this run removed
# (0 when the cell is re-run on an already-filtered frame).
int(is_sentinel_rooms.sum())

In [33]:
# Frequency of each classroom count above 100, selected in a single .loc step
# (row mask + column label) instead of chained indexing.
df.loc[df['QT_SALAS_UTILIZADAS'] > 100, 'QT_SALAS_UTILIZADAS'].value_counts()

QT_SALAS_UTILIZADAS
101     3
102     2
121     2
103     1
120     1
273     1
200     1
193     1
113     1
151     1
105     1
114     1
143     1
1016    1
444     1
1210    1
224     1
177     1
116     1
500     1
440     1
600     1
131     1
112     1
404     1
1005    1
111     1
335     1
747     1
808     1
133     1
Name: count, dtype: int64

In [32]:
# Frequency of each student-desktop count above 100, selected in a single .loc step
# (row mask + column label) instead of chained indexing.
df.loc[df['QT_DESKTOP_ALUNO'] > 100, 'QT_DESKTOP_ALUNO'].value_counts()

Series([], Name: count, dtype: int64)

Unfortunately, it is highly unlikely for a public school to have more than 100 desktops for students or more than a hundred active classrooms. These values look like outliers, most likely counting errors.

In [34]:
# Largest desktop / classroom count still considered plausible for one school;
# rows exceeding it in either column are discarded as counting errors.
MAX_PLAUSIBLE_COUNT = 100

too_many_desktops = df['QT_DESKTOP_ALUNO'] > MAX_PLAUSIBLE_COUNT
too_many_rooms = df['QT_SALAS_UTILIZADAS'] > MAX_PLAUSIBLE_COUNT

# One boolean mask replaces two sequential drop-by-index-label passes: it keeps the
# same rows on a unique index but does not rely on index labels being unique.
# The test is written as "not (value > limit)" rather than "value <= limit" so
# that rows with a missing count are retained, exactly as the drops retained them.
df = df[~(too_many_desktops | too_many_rooms)]


In [35]:
# Summary statistics of the filtered frame, with every cell rendered as a
# fixed-point string so the result can be compared against the earlier summary.
(
    df.describe()
      .apply(lambda column: column.map('{:f}'.format))
)

Unnamed: 0,AGUA,ENERGIA,REDE_ESGOTO,TRATA_LIXO,ALMOXARIFADO,AUDITORIO,BANHEIRO,COZINHA,LABORATORIO_CIENCIAS,LABORATORIO_INFORMATICA,...,SECRETARIA,QT_SALAS_UTILIZADAS,QT_DESKTOP_ALUNO,INTERNET,ALIMENTACAO,EXAME_SELECAO,ORGAOS,PATIO,BIBLIOTECA,RURAL
count,139590.0,139590.0,139590.0,139590.0,139590.0,139590.0,139590.0,139590.0,139590.0,139590.0,...,139590.0,139590.0,139590.0,139590.0,139590.0,139590.0,139590.0,139590.0,139590.0,139590.0
mean,0.973014,0.970019,0.930375,0.240483,0.418225,0.086596,0.961272,0.952446,0.086389,0.346543,...,0.655785,7.546995,4.498983,0.703109,0.991683,0.15408,0.763264,0.755513,0.465277,0.391554
std,0.162044,0.170535,0.254515,0.427378,0.493269,0.281244,0.192946,0.21282,0.280938,0.47587,...,0.475113,5.934857,8.437395,0.45689,0.090819,1.093374,0.425081,0.429784,0.498795,0.4881
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,3.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
50%,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,6.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
75%,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,10.0,6.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,100.0,80.0,1.0,1.0,9.0,1.0,1.0,1.0,1.0
