<h3>Rebuild ENEM's answers<br></h3>

Since some ENEM answers have been lost, you will rebuild them from the final average result by creating a model that predicts the answers each candidate marked.

In [1]:
import random
import sys

import numpy as np
import pandas as pd

# The project helpers are not installed as a package (could not manage to use a
# package version of the refactored modules), so their folders are pushed to the
# front of sys.path before importing them. The paths are relative, so the
# imports only resolve when the working directory is the notebook's folder.
sys.path.insert(0, '../src')
from send_answer import send_answer

# '../src/models' provides the grade regression (`predict`) and the metric (`score`)
sys.path.insert(0, '../src/models')
from regression import predict
from score import score

# allow up to 500 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 500)

In [2]:
# input data: both frames are indexed by the registration number (NU_INSCRICAO)
train = pd.read_csv('../data/raw/train.csv', index_col=0).set_index('NU_INSCRICAO')
test = pd.read_csv('../data/raw/test3.csv').set_index('NU_INSCRICAO')

# quick data clean-up
# Replace every literal '.' in the marked answers with '*'. The pattern is passed
# as plain text with regex=False so the result does not depend on the pandas
# version: the default of `regex` changed to False in pandas 2.0, which would turn
# the former '\.' pattern into a literal backslash-dot that never matches.
train['TX_RESPOSTAS_MT'] = train['TX_RESPOSTAS_MT'].str.replace('.', '*', regex=False)

# keep only the rows that actually have marked answers
train = train.dropna(subset=['TX_RESPOSTAS_MT'])

<h3>Prediction strategy</h3><br>
The underlying idea of the following code is to predict the marked answers for each performance quartile and its corresponding test version using a Markov chain.

_Reference materials:_
<ol>
    <li><a href="https://www.youtube.com/watch?v=eGFJ8vugIWA">Coding Train - Markov Chains</a></li>
    <li><a href="http://setosa.io/ev/markov-chains/">Markov Chains Visually Explained</a></li>
</ol>

In [3]:
# Fill the math grade of the test set with the output of the project's regression
# helper (described by the author as relying on the Quantile Transformation).
grade_prediction = predict(
    train.drop(columns='TX_RESPOSTAS_MT'),
    test.drop(columns='TX_RESPOSTAS_MT'),
)
test.loc[grade_prediction.index, 'NU_NOTA_MT'] = grade_prediction['NU_NOTA_MT']

# discard the training rows whose math grade is exactly 0
train = train[train['NU_NOTA_MT'] != 0]

# give the training set the same column layout as the test set, followed by the
# TX_GABARITO_MT column
train = train.loc[:, [*test.columns, 'TX_GABARITO_MT']].copy()

# separate the datasets in quartiles based on the math grade; the bins are computed
# over train and test together so both share the same cut points
quartiles = 4
merged_grades = pd.qcut(
    pd.concat([train['NU_NOTA_MT'], test['NU_NOTA_MT']]),
    q=quartiles,
    labels=False,
)

# attach the quartile label and an empty placeholder for the rebuilt answers
for frame in (train, test):
    frame['MT_QT'] = merged_grades.loc[frame.index].values
    frame['PREDICTION'] = ''

In [4]:
from random import choice
class MarkovCN:
    """Markov chain of a configurable order over sequences of symbols.

    ``states`` maps each context (a tuple of ``order`` consecutive symbols) to
    the list of symbols observed right after it. Repeated observations are
    kept, so a uniform draw from the list follows the observed frequencies.
    """

    def __init__(self, order = 3):
        """Create an untrained chain.

        Parameters
        ----------
        order : int
            Number of consecutive symbols that form a context.
        """
        self.states = {}
        self.order = order

    def train(self, elements):
        """Record every (context -> next symbol) transition found in ``elements``.

        Parameters
        ----------
        elements : sliceable sequence (the notebook passes strings)
            Sequences with ``order`` symbols or fewer contain no complete
            transition and leave the chain untouched.
        """
        # Only windows that are followed by a symbol are visited, so no truncated
        # context and no empty successor list is ever stored.
        for start in range(len(elements) - self.order):
            key = tuple(elements[start:start + self.order])
            self.states.setdefault(key, []).append(elements[start + self.order])

    def predict(self, elements):
        """Draw a successor for the context made of the last ``order`` symbols.

        Parameters
        ----------
        elements : sliceable sequence
            Only its last ``order`` symbols are used.

        Returns
        -------
        One of the symbols recorded after that context, chosen with
        ``random.choice``.

        Raises
        ------
        KeyError
            If the context has no recorded successor.
        """
        key = tuple(elements[-self.order:])
        successors = self.states.get(key)
        if not successors:
            raise KeyError(key)
        return choice(successors)

In [6]:
# set up the variables for the Markov Chains
order = 3  # number of letters to consider to train the Markov Chain
n_predictions = 5  # number of trailing letters to rebuild for every answer string
shift = n_predictions + order  # tail length used for training: context + rebuilt letters

# The chain draws letters with random.choice; seed the shared generator so a
# Restart & Run All reproduces the same predictions and the same score.
random_seed = 42
random.seed(random_seed)


def _complete_answer(model, context, group_answers, mode_cache, n_letters, order):
    """Generate ``n_letters`` letters by walking ``model`` from ``context``.

    Parameters
    ----------
    model : object exposing ``predict(context)`` that raises KeyError for a
        context it cannot continue.
    context : str
        Letters that precede the first letter to generate.
    group_answers : pandas.Series of str
        Training answers of the current group, used for the fallback.
    mode_cache : dict
        Maps a position to its fallback letter; shared by all rows of a group
        so each positional mode is computed at most once.
    n_letters : int
        Number of letters to generate.
    order : int
        Number of trailing letters kept as context between steps.

    Returns
    -------
    str
        The generated letters, in order.
    """
    generated = ''
    for position in range(n_letters):
        try:
            letter = model.predict(context)
        except (KeyError, ValueError):
            # Unseen context: fall back to the most frequent letter found at the
            # same position (counted from the end) in the group's answers.
            if position not in mode_cache:
                mode_cache[position] = group_answers.str[position - n_letters].mode()[0]
            letter = mode_cache[position]
        generated += letter
        # slide the context window over the newly generated letter
        context = (context + letter)[-order:]
    return generated


# one chain is trained per (math test code, performance quartile) pair of the training set
test_codes = train.CO_PROVA_MT.unique()
grade_quartiles = train.MT_QT.unique()
for cod in test_codes:
    for quartile in grade_quartiles:
        train_set = train.loc[(train.CO_PROVA_MT == cod) & (train.MT_QT == quartile)]
        test_set = test.loc[(test.CO_PROVA_MT == cod) & (test.MT_QT == quartile)]

        # Without training rows there is neither a chain nor a positional mode to
        # fall back on; the matching test rows keep their empty placeholder.
        if train_set.empty:
            continue

        train_answers = train_set['TX_RESPOSTAS_MT']

        # train the markov chain on the tail of every answer string of the group
        # (a discarded experiment also trained on the TX_GABARITO_MT tail, once per
        # quartile level, as an attempt to enforce higher grades)
        model = MarkovCN(order)
        for answers in train_answers:
            model.train(answers[-shift:])

        mode_cache = {}

        # training rows: rebuild the last letters from the letters right before them
        for i, answers in train_answers.items():
            train.loc[i, 'PREDICTION'] = _complete_answer(
                model, answers[-shift:-n_predictions], train_answers, mode_cache, n_predictions, order
            )

        # test rows: continue each answer string from its last letters
        for i, answers in test_set['TX_RESPOSTAS_MT'].items():
            test.loc[i, 'PREDICTION'] = _complete_answer(
                model, answers[-order:], train_answers, mode_cache, n_predictions, order
            )

print('Training set accuracy: %.2f' % (score(train.TX_RESPOSTAS_MT.str[-n_predictions:], train.PREDICTION)*100))

Training set accuracy: 23.47


In [None]:
# Submission frame: only the rebuilt answers, published under the original column
# name, with the index labels (NU_INSCRICAO) converted to str.
answer = (
    test[['PREDICTION']]
    .copy()
    .rename(index=str, columns={"PREDICTION": "TX_RESPOSTAS_MT"})
)
#send_answer(answer.reset_index(), 3)