In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa
import  pyarrow.parquet as pq

In [2]:
# Relative location of the input Parquet file that is read into `df` below.
path_in = "../../Datasets/MICRODADOS_ENEM_2019_SAOPAULO_FILTERED.parquet"

In [3]:
# Load the Parquet file at `path_in` into a pandas DataFrame.
df = pd.read_parquet(path_in)

In [4]:
def vector_to_columns(df, column_prefix, subject, qtd_columns):
    """Explode a string column into one column per character.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the string column ``column_prefix + subject``.
    column_prefix : str
        Leading part of the column name (e.g. ``"TX_RESPOSTAS_"``).
    subject : str
        Trailing part of the column name (e.g. ``"CN"``).
    qtd_columns : int
        Label of the trailing empty piece to discard, i.e. the number of
        characters per string plus one.

    Returns
    -------
    pandas.DataFrame
        A new frame sharing ``df``'s index, with integer column labels
        ``1 .. qtd_columns - 1`` holding one character each. ``df`` itself
        is not modified.

    Raises
    ------
    KeyError
        If the source column is missing, or if the expanded frame has no
        column labelled ``qtd_columns`` (strings shorter than expected).
    """
    # Splitting on the empty pattern yields ['', c1, ..., cN, ''] for every
    # string, so the expanded frame has columns 0 .. N+1.
    # `n` is passed by keyword because pandas 2.0 made every argument of
    # `str.split` other than `pat` keyword-only; the former positional call
    # raises TypeError there.
    pieces = df[column_prefix + subject].str.split("", n=0, expand=True)

    # Discard the empty leading (0) and trailing (qtd_columns) pieces.
    return pieces.drop(columns=[0, qtd_columns])


def remove_extra_columns(dfRespostas, dfGabarito):
    """Collapse columns 1-10 of both frames into columns 1-5, in place.

    For every row whose answer in column 1 equals the string ``"9"``, the
    values of columns 6-10 are copied over columns 1-5 in both frames.
    Columns 6-10 are then dropped from both frames.

    Parameters
    ----------
    dfRespostas : pandas.DataFrame
        Answers, with integer column labels that include 1..10.
    dfGabarito : pandas.DataFrame
        Answer key with the same column labels; rows are selected by the
        mask computed from ``dfRespostas``.

    Returns
    -------
    tuple of pandas.DataFrame
        The same two (mutated) objects, ``(dfRespostas, dfGabarito)``.
    """
    # Computed once, before any column is overwritten.
    starts_with_nine = dfRespostas[1].eq("9")

    for target in range(1, 6):
        source = target + 5
        for frame in (dfRespostas, dfGabarito):
            frame.loc[starts_with_nine, target] = frame[source]

    surplus = [6, 7, 8, 9, 10]
    for frame in (dfRespostas, dfGabarito):
        frame.drop(columns=surplus, inplace=True)

    return dfRespostas, dfGabarito


def create_dict_columns(subject, qtd_columns):
    """Map positions ``1 .. qtd_columns - 1`` to names ``subject + str(position)``.

    Parameters
    ----------
    subject : str
        Prefix of every generated name.
    qtd_columns : int
        Exclusive upper bound of the positions.

    Returns
    -------
    dict
        ``{1: subject + "1", 2: subject + "2", ...}``; empty when
        ``qtd_columns <= 1``.
    """
    return {position: subject + str(position) for position in range(1, qtd_columns)}


def alternative_to_boolean(dfRespostas, dfGabarito, columnsNames):
    """Replace each answer by a correctness score, in place.

    For every column in ``columnsNames`` the value in ``dfRespostas``
    becomes ``1`` where it equals the value in ``dfGabarito``, ``NaN`` where
    the answer is missing, and ``0`` otherwise.

    Parameters
    ----------
    dfRespostas : pandas.DataFrame
        Answers; the listed columns are overwritten.
    dfGabarito : pandas.DataFrame
        Answer key with the same column names; left untouched.
    columnsNames : iterable
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``dfRespostas`` object that was passed in.
    """
    for name in columnsNames:
        answers = dfRespostas[name]
        is_correct = answers == dfGabarito[name]
        is_missing = pd.isna(answers)
        # The first matching condition wins; rows matching neither get 0.
        dfRespostas[name] = np.select([is_correct, is_missing], [1, np.nan], default=0)

    return dfRespostas


In [5]:
# Per-subject results keyed by subject code. The placeholder lists are
# replaced by DataFrames as the loop below runs.
arrayResp = {code: [] for code in ("CN", "MT", "CH", "LC")}

for subject in arrayResp:
    # vector_to_columns drops the column labelled `qtd_columns`, so this is
    # passed as 51 for "LC" and 46 for every other subject.
    qtd_columns = 51 if subject == "LC" else 46

    # Only rows with an id, an answer string and an answer key are kept.
    dfSubject = df[
        ["NU_INSCRICAO", "TX_RESPOSTAS_" + subject, "TX_GABARITO_" + subject]
    ].dropna()

    dfRespostas = vector_to_columns(dfSubject, "TX_RESPOSTAS_", subject, qtd_columns)
    dfGabarito = vector_to_columns(dfSubject, "TX_GABARITO_", subject, qtd_columns)

    if subject == "LC":
        # remove_extra_columns drops five columns, so the naming below uses
        # the same bound as the other subjects.
        dfRespostas, dfGabarito = remove_extra_columns(dfRespostas, dfGabarito)
        qtd_columns = 46

    columnsNames = [subject + str(i) for i in range(1, qtd_columns)]
    dfRespostas.columns = columnsNames
    dfGabarito.columns = columnsNames

    # Score the answers, then put the id back as the first column.
    dfRespostas = alternative_to_boolean(dfRespostas, dfGabarito, columnsNames)
    dfRespostas.insert(0, "NU_INSCRICAO", dfSubject["NU_INSCRICAO"])

    # Attach the subject's score and test code by matching on the id.
    dfRespostas = dfRespostas.join(
        df[["NU_INSCRICAO", "NU_NOTA_" + subject, "CO_PROVA_" + subject]].set_index("NU_INSCRICAO"),
        on="NU_INSCRICAO",
        how="left",
    )

    # Keep only rows whose score differs from 0.
    arrayResp[subject] = dfRespostas[dfRespostas["NU_NOTA_" + subject] != 0]

# Release the last iteration's intermediate frames.
del dfSubject, dfRespostas, dfGabarito

In [6]:
for subject, dfSubject in arrayResp.items():
    # Where the new dataset, filtered by subject area, will be saved.
    path = f'../../Datasets/BySubject/{subject}_2019_SAOPAULO.parquet'

    # Replace the index with a fresh default one before converting the
    # frame to an Arrow table.
    table = pa.Table.from_pandas(dfSubject.reset_index(drop=True))

    # Write the table as a Parquet dataset rooted at `path`.
    # NOTE(review): `write_to_dataset` treats `root_path` as a directory;
    # re-running this cell may add files next to existing ones instead of
    # replacing them - confirm against the pyarrow version in use.
    pq.write_to_dataset(table, root_path=path)
    