# Preprocessing School grades dataset

This notebook preprocesses the school grades (IDEB) datasets stored in `./data/ideb-anos-iniciais-2019.csv` and `./data/ideb-anos-finais-2019.csv`. The cells below load the `anos-finais` file.

In [34]:
import json
import missingno
import matplotlib
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# notebook only
%matplotlib inline


In [35]:
# Load the 2019 IDEB "anos finais" CSV into the working frame.
df = pd.read_csv(filepath_or_buffer='./data/ideb-anos-finais-2019.csv')

In [36]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,SG_UF,CO_MUNICIPIO,TAXA_APROVACAO,INDICADOR_RENDIMENTO (P),NOTA_MATEMATICA,NOTA_PORTUGUES,NOTA_MEDIA (N),IDEB (N x P)
0,RO,1100015,95.2,0.954601,273.59,268.37,5.699333,5.4
1,RO,1100015,88.9,0.888055,255.87,252.15,5.133667,4.6
2,RO,1100015,93.2,0.933872,270.02,265.1,5.585333,5.2
3,RO,1100023,92.0,0.921108,267.39,266.19,5.559667,5.1
4,RO,1100023,89.2,0.887481,247.56,251.95,4.991833,4.4


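The 'REDE' column carries the same information as 'TP_DEPENDENCIA' in the INEP dataset, so it is not needed here (no private schools are recorded in this data). The file loaded above does not contain a 'REDE' column (see the `df.info()` output below), so the drop in the next cell is left commented out.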

In [37]:
# Disabled on purpose: according to the recorded df.info() output, the loaded
# frame has no 'REDE' column, so re-enabling these lines would raise a KeyError.
# df['REDE'].unique()
# df = df.drop(columns='REDE')

In [38]:
# Inspect column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12024 entries, 0 to 12023
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   SG_UF                     12024 non-null  object 
 1   CO_MUNICIPIO              12024 non-null  int64  
 2   TAXA_APROVACAO            12024 non-null  float64
 3   INDICADOR_RENDIMENTO (P)  12024 non-null  float64
 4   NOTA_MATEMATICA           12024 non-null  float64
 5   NOTA_PORTUGUES            12024 non-null  float64
 6   NOTA_MEDIA (N)            12024 non-null  float64
 7   IDEB (N x P)              12024 non-null  float64
dtypes: float64(6), int64(1), object(1)
memory usage: 751.6+ KB


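Missing values are encoded as the strings 'ND' and '-'. We drop the rows that contain them so that the remaining values can be converted from object to float. (In the file loaded here these columns are already float64, so no rows are actually removed: the row count stays at 12024.)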

In [39]:
# Drop every row that carries a missing-value marker ('-' or 'ND').
# The columns are processed one after another, each pass working on the
# frame left over by the previous one.
for column, marker in (
    ('INDICADOR_RENDIMENTO (P)', '-'),
    ('NOTA_MATEMATICA', 'ND'),
    ('NOTA_PORTUGUES', 'ND'),
):
    flagged_labels = df.loc[df[column] == marker].index
    df = df.drop(index=flagged_labels)

In [40]:
def obj_to_float(df, columns):
    """Return a copy of ``df`` with the given columns cast to ``float``.

    The input frame is left untouched, so re-running the calling cell (or
    reusing the original frame elsewhere) always starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to convert.
    columns : iterable of str
        Names of the columns to cast; all other columns are carried over
        unchanged.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``df``, where every
        column listed in ``columns`` has dtype ``float64``.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``df``.
    ValueError
        If a value in one of the columns cannot be parsed as a float.
    """
    # astype with a column -> dtype mapping builds a new frame instead of
    # assigning into the caller's object.
    return df.astype({column: float for column in columns})

In [41]:
# Columns that must hold float values before any aggregation.
floats = [
    'TAXA_APROVACAO',
    'INDICADOR_RENDIMENTO (P)',
    'NOTA_MATEMATICA',
    'NOTA_PORTUGUES',
    'NOTA_MEDIA (N)',
    'IDEB (N x P)',
]
df = obj_to_float(df, columns=floats)

In [42]:
# Summary statistics of the numeric columns after the float conversion.
df.describe()

Unnamed: 0,CO_MUNICIPIO,TAXA_APROVACAO,INDICADOR_RENDIMENTO (P),NOTA_MATEMATICA,NOTA_PORTUGUES,NOTA_MEDIA (N),IDEB (N x P)
count,12024.0,12024.0,12024.0,12024.0,12024.0,12024.0,12024.0
mean,3244826.0,88.558508,0.885895,257.717685,253.71304,5.190512,4.613407
std,981284.2,8.192375,0.081705,19.109312,16.804543,0.586183,0.757813
min,1100015.0,41.0,0.363577,187.04,179.96,2.8105,1.9
25%,2511905.0,83.6,0.836934,244.7975,242.46,4.790958,4.1
50%,3145372.0,90.0,0.899818,257.935,254.38,5.209083,4.7
75%,4117404.0,95.1,0.95084,270.02,265.09,5.577333,5.2
max,5300108.0,100.0,1.0,355.68,327.68,8.056,7.9


In [43]:
# Re-check dtypes and row count after dropping marker rows and casting to float.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12024 entries, 0 to 12023
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   SG_UF                     12024 non-null  object 
 1   CO_MUNICIPIO              12024 non-null  int64  
 2   TAXA_APROVACAO            12024 non-null  float64
 3   INDICADOR_RENDIMENTO (P)  12024 non-null  float64
 4   NOTA_MATEMATICA           12024 non-null  float64
 5   NOTA_PORTUGUES            12024 non-null  float64
 6   NOTA_MEDIA (N)            12024 non-null  float64
 7   IDEB (N x P)              12024 non-null  float64
dtypes: float64(6), int64(1), object(1)
memory usage: 751.6+ KB


In [44]:
# WARNING: this path names the 'anos-iniciais' file, while this notebook loaded
# the 'anos-finais' data; enabling the line as-is would overwrite the other
# dataset with this one. Fix the path before re-enabling.
# df.to_csv('./data/ideb-anos-iniciais-2019.csv', index=False)

In [45]:
# Preview the first five rows before aggregating by municipality.
df.head(n=5)

Unnamed: 0,SG_UF,CO_MUNICIPIO,TAXA_APROVACAO,INDICADOR_RENDIMENTO (P),NOTA_MATEMATICA,NOTA_PORTUGUES,NOTA_MEDIA (N),IDEB (N x P)
0,RO,1100015,95.2,0.954601,273.59,268.37,5.699333,5.4
1,RO,1100015,88.9,0.888055,255.87,252.15,5.133667,4.6
2,RO,1100015,93.2,0.933872,270.02,265.1,5.585333,5.2
3,RO,1100023,92.0,0.921108,267.39,266.19,5.559667,5.1
4,RO,1100023,89.2,0.887481,247.56,251.95,4.991833,4.4


In [46]:
# Average every indicator over the rows that share a CO_MUNICIPIO code.
# SG_UF is removed first because it is a text column and cannot be averaged.
# CO_MUNICIPIO becomes the index of the resulting frame.
df = (
    df
    .drop(columns='SG_UF')
    .groupby(by=['CO_MUNICIPIO'])
    .mean()
)
df.head(n=5)

Unnamed: 0_level_0,TAXA_APROVACAO,INDICADOR_RENDIMENTO (P),NOTA_MATEMATICA,NOTA_PORTUGUES,NOTA_MEDIA (N),IDEB (N x P)
CO_MUNICIPIO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1100015,92.433333,0.925509,266.493333,261.873333,5.472778,5.066667
1100023,90.9,0.908209,260.346667,261.133333,5.358,4.866667
1100031,98.2,0.977905,242.44,241.04,4.724667,4.6
1100049,95.533333,0.956184,260.466667,251.436667,5.198389,5.0
1100056,96.7,0.967177,268.57,261.62,5.503167,5.3


In [47]:
# Confirm the row count and dtypes of the per-municipality aggregate.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5293 entries, 1100015 to 5300108
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   TAXA_APROVACAO            5293 non-null   float64
 1   INDICADOR_RENDIMENTO (P)  5293 non-null   float64
 2   NOTA_MATEMATICA           5293 non-null   float64
 3   NOTA_PORTUGUES            5293 non-null   float64
 4   NOTA_MEDIA (N)            5293 non-null   float64
 5   IDEB (N x P)              5293 non-null   float64
dtypes: float64(6)
memory usage: 289.5 KB


In [48]:
# Persist the per-municipality averages; the index (CO_MUNICIPIO) is written
# as the first CSV column so the municipality code is kept in the file.
df.to_csv('./data/ideb-finais-municipios.csv', index=True)