### Predição de Notas do Enem 2019 Baseado em Fatores Socio-Econômicos

In [None]:
#@markdown 0.1 Instalando dependências
%pip install dask
%pip install fsspec



In [31]:
#@markdown 0.2 Importações iniciais
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import dask.dataframe as dd


In [None]:
#@markdown 0.3 Autenticação no google drive (opcional)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#@title Fase 1: Importação dos Dados
#@markdown Salvando como df_enem
LOCALIZACAO_DOS_DADOS = '/content/drive/MyDrive/Enem2019/dados/MICRODADOS_ENEM_2019.csv'  #@param {type: "string"}
df_enem = dd.read_csv(LOCALIZACAO_DOS_DADOS, delimiter = ';', encoding = 'iso-8859-1',assume_missing=True)

In [None]:
#@title Fase 2: Preparação dos Dados
#@markdown 2.1 Selecionar campos a serem utilizados

#@markdown 2.2 Filtrar as informações em um dataset reduzido

COLUNAS_USADAS = [
 'CO_MUNICIPIO_RESIDENCIA',
 'NO_MUNICIPIO_RESIDENCIA',
 'SG_UF_RESIDENCIA',
 'NU_IDADE',
 'TP_SEXO',
 'TP_ESTADO_CIVIL',
 'TP_COR_RACA',
 'TP_NACIONALIDADE',
 'NO_MUNICIPIO_NASCIMENTO',
 'SG_UF_NASCIMENTO',
 'TP_ST_CONCLUSAO',
 'TP_ANO_CONCLUIU',
 'TP_ESCOLA',
 'TP_ENSINO',
 'IN_TREINEIRO',
 'CO_ESCOLA',
 'CO_MUNICIPIO_ESC',
 'NO_MUNICIPIO_ESC',
 'CO_UF_ESC',
 'SG_UF_ESC',
 'TP_DEPENDENCIA_ADM_ESC',
 'TP_LOCALIZACAO_ESC',
 'TP_SIT_FUNC_ESC',
 'IN_BAIXA_VISAO',
 'IN_CEGUEIRA',
 'IN_SURDEZ',
 'IN_DEFICIENCIA_AUDITIVA',
 'IN_SURDO_CEGUEIRA',
 'IN_DEFICIENCIA_FISICA',
 'IN_DEFICIENCIA_MENTAL',
 'IN_DEFICIT_ATENCAO',
 'IN_DISLEXIA',
 'IN_DISCALCULIA',
 'IN_AUTISMO',
 'IN_VISAO_MONOCULAR',
 'IN_OUTRA_DEF',
 'IN_GESTANTE',
 'IN_LACTANTE',
 'IN_IDOSO',
 'IN_ESTUDA_CLASSE_HOSPITALAR',
 'IN_SEM_RECURSO',
 'IN_BRAILLE',
 'IN_AMPLIADA_24',
 'IN_AMPLIADA_18',
 'IN_LEDOR',
 'IN_ACESSO',
 'IN_TRANSCRICAO',
 'IN_LIBRAS',
 'IN_TEMPO_ADICIONAL',
 'IN_LEITURA_LABIAL',
 'IN_MESA_CADEIRA_RODAS',
 'IN_MESA_CADEIRA_SEPARADA',
 'IN_APOIO_PERNA',
 'IN_GUIA_INTERPRETE',
 'IN_COMPUTADOR',
 'IN_CADEIRA_ESPECIAL',
 'IN_CADEIRA_CANHOTO',
 'IN_CADEIRA_ACOLCHOADA',
 'IN_PROVA_DEITADO',
 'IN_MOBILIARIO_OBESO',
 'IN_LAMINA_OVERLAY',
 'IN_PROTETOR_AURICULAR',
 'IN_MEDIDOR_GLICOSE',
 'IN_MAQUINA_BRAILE',
 'IN_SOROBAN',
 'IN_MARCA_PASSO',
 'IN_SONDA',
 'IN_MEDICAMENTOS',
 'IN_SALA_INDIVIDUAL',
 'IN_SALA_ESPECIAL',
 'IN_SALA_ACOMPANHANTE',
 'IN_MOBILIARIO_ESPECIFICO',
 'IN_MATERIAL_ESPECIFICO',
 'IN_NOME_SOCIAL',
 'CO_MUNICIPIO_PROVA',
 'NO_MUNICIPIO_PROVA',
 'CO_UF_PROVA',
 'SG_UF_PROVA',
 'TP_PRESENCA_CN',
 'TP_PRESENCA_CH',
 'TP_PRESENCA_LC',
 'TP_PRESENCA_MT',
 'NU_NOTA_CN',
 'NU_NOTA_CH',
 'NU_NOTA_LC',
 'NU_NOTA_MT',
 'TP_LINGUA',
 'TP_STATUS_REDACAO',
 'NU_NOTA_COMP1',
 'NU_NOTA_COMP2',
 'NU_NOTA_COMP3',
 'NU_NOTA_COMP4',
 'NU_NOTA_COMP5',
 'NU_NOTA_REDACAO',
 'Q001',
 'Q002',
 'Q003',
 'Q004',
 'Q005',
 'Q006',
 'Q007',
 'Q008',
 'Q009',
 'Q010',
 'Q011',
 'Q012',
 'Q013',
 'Q014',
 'Q015',
 'Q016',
 'Q017',
 'Q018',
 'Q019',
 'Q020',
 'Q021',
 'Q022',
 'Q023',
 'Q024',
 'Q025'
]

UF_PARA_FILTRAGEM = "PB" #@param {type: "string"}
df_enem_filtrado = df_enem[df_enem["SG_UF_PROVA"] == "PB"].compute()

In [None]:
#@markdown 2.3 Exportar os dados para re-uso.
ARQUIVO_EXPORTACAO = "df_enem_pb.csv" #@param {type: "string"}
df_enem_filtrado.to_csv(ARQUIVO_EXPORTACAO)

In [None]:
#@markdown 2.4 Criar atributo NOTA_MEDIA, que será usado para intuitos de classificação
df_enem_filtrado["NOTA_MEDIA"] = df_enem_filtrado[['NU_NOTA_COMP1',
 'NU_NOTA_CN',
 'NU_NOTA_CH',
 'NU_NOTA_LC',
 'NU_NOTA_MT',
 'NU_NOTA_REDACAO']].sum(axis=1) / 6

In [None]:
#@title 3. Classificação
#@markdown 3.1 Separando atributos para X e Y: Utilizando dados do questionário sócio-econômico como X, e Nota Média como Y

#@markdown 3.2 transformando os atributos do questionário em atributos numéricos

#@markdown 3.3 Para Y, agrupando os dados do valor de média
#@markdown Categorias: -> 0: entre 0~333, 1: entre 334 ~ 667, 2: entre 401 ~ 600, 3: entre 667 ~ 100



from sklearn.preprocessing import LabelEncoder

CATEGORIAS = ['Q001',
 'Q002',
 'Q003',
 'Q004',
 'Q005',
 'Q006',
 'Q007',
 'Q008',
 'Q009',
 'Q010',
 'Q011',
 'Q012',
 'Q013',
 'Q014',
 'Q015',
 'Q016',
 'Q017',
 'Q018',
 'Q019',
 'Q020',
 'Q021',
 'Q022',
 'Q023',
 'Q024',
 'Q025']

LE = LabelEncoder()
cats = []
for cat in CATEGORIAS:
  df_enem_filtrado[f'C{cat}'] = LE.fit_transform(df_enem_filtrado[cat])
  cats.append(f'C{cat}')

X = df_enem_filtrado.reset_index()[cats]
y = df_enem_filtrado.reset_index()["NOTA_MEDIA"] // 300

In [None]:
#@markdown 3.4 Verificando a matriz de Correlação entre os atributos 
X.corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,CQ001,CQ002,CQ003,CQ004,CQ005,CQ006,CQ007,CQ008,CQ009,CQ010,CQ011,CQ012,CQ013,CQ014,CQ015,CQ016,CQ017,CQ018,CQ019,CQ020,CQ021,CQ022,CQ023,CQ024,CQ025
CQ001,1.0,0.517048,0.499374,0.330584,-0.046786,0.340528,0.159974,0.307553,0.212702,0.271722,-0.044304,0.100351,0.208704,0.25784,0.119557,0.219091,0.070367,0.195875,0.258024,0.050989,0.22878,0.245588,0.16907,0.272235,0.180786
CQ002,0.517048,1.0,0.345421,0.460119,-0.043408,0.392332,0.172739,0.346654,0.252807,0.329871,0.000511,0.111839,0.222426,0.273118,0.126727,0.249383,0.070509,0.209513,0.273776,0.032049,0.242606,0.285412,0.178127,0.325533,0.21721
CQ003,0.499374,0.345421,1.0,0.517336,-0.06938,0.363229,0.17566,0.329833,0.20473,0.291158,-0.073841,0.104251,0.210532,0.292826,0.126739,0.244456,0.069671,0.210272,0.274846,0.047913,0.235417,0.249393,0.183777,0.29122,0.204756
CQ004,0.330584,0.460119,0.517336,1.0,-0.040878,0.405845,0.184885,0.358624,0.237303,0.339025,-0.029517,0.111233,0.22513,0.304579,0.135442,0.256228,0.075112,0.225748,0.290499,0.045708,0.247205,0.274943,0.191805,0.329379,0.215386
CQ005,-0.046786,-0.043408,-0.06938,-0.040878,1.0,0.028974,0.040635,0.024307,0.234709,0.045749,0.075541,0.044277,0.002698,-0.020485,-0.00161,-0.054862,0.002299,-0.017588,0.064383,0.017323,0.00044,0.216751,0.017808,-0.040344,-0.038351
CQ006,0.340528,0.392332,0.363229,0.405845,0.028974,1.0,0.438651,0.649386,0.450719,0.62096,-0.000366,0.220344,0.384205,0.439023,0.238012,0.383368,0.169321,0.435666,0.546672,0.125079,0.495508,0.4358,0.343385,0.577307,0.267377
CQ007,0.159974,0.172739,0.17566,0.184885,0.040635,0.438651,1.0,0.412079,0.248832,0.352519,-0.029156,0.177106,0.239847,0.18888,0.147079,0.181824,0.12812,0.223247,0.351987,0.090012,0.300929,0.17923,0.188908,0.278198,0.083607
CQ008,0.307553,0.346654,0.329833,0.358624,0.024307,0.649386,0.412079,1.0,0.543856,0.579498,0.009753,0.250991,0.379457,0.447003,0.223825,0.38555,0.152462,0.406022,0.563347,0.128163,0.438297,0.423308,0.315758,0.5068,0.251506
CQ009,0.212702,0.252807,0.20473,0.237303,0.234709,0.450719,0.248832,0.543856,1.0,0.420809,0.107489,0.237485,0.274433,0.343641,0.153421,0.303728,0.088754,0.266641,0.443885,0.159154,0.29385,0.491091,0.237724,0.3794,0.270361
CQ010,0.271722,0.329871,0.291158,0.339025,0.045749,0.62096,0.352519,0.579498,0.420809,1.0,0.01182,0.210564,0.344657,0.422226,0.211438,0.375831,0.143637,0.416726,0.498858,0.10921,0.402977,0.417214,0.29792,0.48632,0.251114


In [None]:
#@markdown 3.5 separando os dados entre treino e teste
from sklearn.model_selection import train_test_split

TAMANHO_TESTE = 0.2 #@param {type: "number"}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TAMANHO_TESTE, random_state=0)

f"Tamanho do dataset de treino: {len(X_train)}, dataset de testes {len(X_test)}"

'Tamanho do dataset de treino: 117740, dataset de testes 29436'

In [None]:
#@markdown 3.6 partindo as features em X_train e X_test e rodando o modelo
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=0, n_estimators=100)
clf.fit(X_train, y_train)
acuracia = clf.score(X_test, y_test)

f"acurácia do modelo com atributos filtrados {acuracia}"

'acurácia do modelo com atributos filtrados 0.7150428047289034'

In [None]:
#@markdown 3.7 executando testes com agrupamentos em granularidades diferentes, sendo n = 10, 5, 3 e 2.
GRANULARIDADE = [100,200,334,500]
N = [10,5, 3, 2]

TAMANHO_TESTE = 0.2 #@param {type: "number"}

resultado = []

for i in range(len(GRANULARIDADE)):
  y = df_enem_filtrado.reset_index()["NOTA_MEDIA"] // GRANULARIDADE[i]
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TAMANHO_TESTE, random_state=0)
  clf = RandomForestClassifier(random_state=0, n_estimators=100)
  clf.fit(X_train, y_train)
  acuracia = clf.score(X_test, y_test)
  resultado.append(["RF", N[i], acuracia])
  print(f"Acurácia do modelo RANDOM FOREST com agrupamento N= {N[i]}: {acuracia}")



Acurácia do modelo RANDOM FOREST com agrupamento N= 10: 0.39268242967794537
Acurácia do modelo RANDOM FOREST com agrupamento N= 5: 0.531390134529148
Acurácia do modelo RANDOM FOREST com agrupamento N= 3: 0.7175907052588667
Acurácia do modelo RANDOM FOREST com agrupamento N= 2: 0.8399918467183041


In [None]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

fig = plt.figure(figsize=(15, 10))
plot_tree(clf.estimators_[0], 
          filled=True, impurity=True, 
          rounded=True)