In [1]:
import pandas as pd
import os
from tqdm import tqdm
import numpy as np
import pickle

In [2]:
def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
    """Return the student's ability estimate after one more observed answer.

    The current ``theta`` is shifted by the prediction error (observed outcome
    minus the predicted probability of a good answer), scaled by the step size
    that ``learning_rate_theta`` gives for ``nb_previous_answers``.

    Args:
        is_good_answer: Observed outcome of the answer (numeric).
        beta: Current difficulty estimate of the item.
        left_asymptote: Lower bound of the success-probability curve.
        theta: Current ability estimate of the student.
        nb_previous_answers: Number of answers already used to update ``theta``.

    Returns:
        The updated ability estimate.
    """
    predicted = probability_of_good_answer(theta, beta, left_asymptote)
    step_size = learning_rate_theta(nb_previous_answers)
    prediction_error = is_good_answer - predicted
    # A better-than-expected answer (positive error) raises the ability.
    return theta + step_size * prediction_error

def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
    """Return the item's difficulty estimate after one more observed answer.

    The current ``beta`` is shifted against the prediction error (observed
    outcome minus the predicted probability of a good answer), scaled by the
    step size that ``learning_rate_beta`` gives for ``nb_previous_answers``.

    Args:
        is_good_answer: Observed outcome of the answer (numeric).
        beta: Current difficulty estimate of the item.
        left_asymptote: Lower bound of the success-probability curve.
        theta: Current ability estimate of the student.
        nb_previous_answers: Number of answers already used to update ``beta``.

    Returns:
        The updated difficulty estimate.
    """
    predicted = probability_of_good_answer(theta, beta, left_asymptote)
    step_size = learning_rate_beta(nb_previous_answers)
    prediction_error = is_good_answer - predicted
    # A better-than-expected answer (positive error) lowers the difficulty.
    return beta - step_size * prediction_error

def learning_rate_theta(nb_answers):
    """Step size for ability (theta) updates.

    Computes ``0.3 / (1 + 0.01 * nb_answers)`` and never returns less
    than ``0.04``.
    """
    decayed_rate = 0.3 / (1 + 0.01 * nb_answers)
    return max(decayed_rate, 0.04)

def learning_rate_beta(nb_answers):
    """Step size for difficulty (beta) updates: ``1 / (1 + 0.05 * nb_answers)``."""
    damping = 1 + 0.05 * nb_answers
    return 1 / damping

def probability_of_good_answer(theta, beta, left_asymptote):
    """Predicted probability that the answer is good.

    The logistic curve ``sigmoid(theta - beta)`` is rescaled from ``(0, 1)``
    to ``(left_asymptote, 1)``.

    Args:
        theta: Ability estimate of the student.
        beta: Difficulty estimate of the item.
        left_asymptote: Lower bound of the returned probability.

    Returns:
        ``left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)``.
    """
    logistic_part = sigmoid(theta - beta)
    headroom = 1 - left_asymptote
    return left_asymptote + headroom * logistic_part

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated without overflow.

    The textbook form computes ``exp(-x)``, which overflows for very negative
    ``x`` (NumPy emits a RuntimeWarning, or raises when overflow errors are
    enabled). Here the denominator is handled in log space instead:
    ``log(1 + exp(-x)) == logaddexp(0, -x)``, so
    ``sigmoid(x) == exp(-logaddexp(0, -x))``.

    Args:
        x: A scalar or array-like accepted by NumPy ufuncs.

    Returns:
        The logistic of ``x``, as a NumPy scalar for scalar input or
        element-wise for array input.
    """
    # logaddexp(0, -x) is always >= 0, so the outer exp() can only underflow
    # towards 0.0; it can never overflow.
    return np.exp(-np.logaddexp(0, -x))

In [3]:
def estimate_parameters(answers_df, granularity_feature_name='assessmentItemID'):
    """Estimate student abilities (theta) and item difficulties (beta) in one pass.

    Every distinct student and item starts at 0 with zero answers counted.
    The rows of ``answers_df`` are then replayed in frame order; each row
    updates the matching student's theta and the matching item's beta, and
    both updates are computed from the values held *before* that row.

    Args:
        answers_df: DataFrame with columns ``userID``, ``left_asymptote``,
            ``answerCode`` and the column named by ``granularity_feature_name``.
        granularity_feature_name: Column whose values identify the "items"
            that receive a difficulty estimate.

    Returns:
        A tuple ``(student_parameters, item_parameters)``:
        ``student_parameters`` maps each user ID to
        ``{"theta": ..., "nb_answers": ...}`` and ``item_parameters`` maps each
        item ID to ``{"beta": ..., "nb_answers": ...}``.
    """
    item_parameters = {
        granularity_feature_value: {"beta": 0, "nb_answers": 0}
        for granularity_feature_value in np.unique(answers_df[granularity_feature_name])
    }
    student_parameters = {
        student_id: {"theta": 0, "nb_answers": 0}
        for student_id in np.unique(answers_df.userID)
    }

    print("Parameter estimation is starting...")

    answer_stream = zip(
        answers_df.userID.values,
        answers_df[granularity_feature_name].values,
        answers_df.left_asymptote.values,
        answers_df.answerCode.values,
    )

    # zip() has no length, so tqdm needs the row count passed explicitly to
    # show a percentage and an ETA instead of a bare iteration counter.
    for student_id, item_id, left_asymptote, answerCode in tqdm(
        answer_stream, total=len(answers_df)
    ):
        student = student_parameters[student_id]
        item = item_parameters[item_id]

        # Snapshot both estimates first so each update sees the pre-row values.
        theta = student["theta"]
        beta = item["beta"]

        item["beta"] = get_new_beta(
            answerCode, beta, left_asymptote, theta, item["nb_answers"],
        )
        student["theta"] = get_new_theta(
            answerCode, beta, left_asymptote, theta, student["nb_answers"],
        )

        item["nb_answers"] += 1
        student["nb_answers"] += 1

    print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
    return student_parameters, item_parameters

In [4]:
def update_parameters(answers_df, student_parameters, item_parameters, granularity_feature_name='assessmentItemID'):
    """Continue the estimation with additional answers.

    The rows of ``answers_df`` are replayed in frame order on top of existing
    estimates. Students or items that are not yet present in the dictionaries
    are added with an estimate of 0 and zero answers counted. Both updates for
    a row are computed from the values held *before* that row.

    Note:
        ``student_parameters`` and ``item_parameters`` are modified in place;
        the returned objects are the same dictionaries that were passed in.

    Args:
        answers_df: DataFrame with columns ``userID``, ``left_asymptote``,
            ``answerCode`` and the column named by ``granularity_feature_name``.
        student_parameters: Mapping of user ID to
            ``{'theta': ..., 'nb_answers': ...}``.
        item_parameters: Mapping of item ID to
            ``{'beta': ..., 'nb_answers': ...}``.
        granularity_feature_name: Column whose values identify the items.

    Returns:
        The tuple ``(student_parameters, item_parameters)``.
    """
    answer_stream = zip(
        answers_df.userID.values,
        answers_df[granularity_feature_name].values,
        answers_df.left_asymptote.values,
        answers_df.answerCode.values,
    )

    # zip() has no length, so tqdm needs the row count passed explicitly to
    # show a percentage and an ETA instead of a bare iteration counter.
    for student_id, item_id, left_asymptote, answerCode in tqdm(
        answer_stream, total=len(answers_df)
    ):
        if student_id not in student_parameters:
            student_parameters[student_id] = {'theta': 0, 'nb_answers': 0}
        if item_id not in item_parameters:
            item_parameters[item_id] = {'beta': 0, 'nb_answers': 0}

        student = student_parameters[student_id]
        item = item_parameters[item_id]

        # Snapshot both estimates first so each update sees the pre-row values.
        theta = student['theta']
        beta = item['beta']

        student['theta'] = get_new_theta(
            answerCode, beta, left_asymptote, theta, student['nb_answers']
        )
        item['beta'] = get_new_beta(
            answerCode, beta, left_asymptote, theta, item['nb_answers']
        )

        student['nb_answers'] += 1
        item['nb_answers'] += 1

    print(f"Theta & beta estimations on {granularity_feature_name} are updated.")

    return student_parameters, item_parameters

In [5]:
def estimate_probas(test_df, student_parameters, item_parameters, granularity_feature_name='assessmentItemID'):
    """Predict the probability of a good answer for every row of ``test_df``.

    Args:
        test_df: DataFrame with columns ``userID``, ``left_asymptote`` and the
            column named by ``granularity_feature_name``.
        student_parameters: Mapping of user ID to a dict holding ``'theta'``.
        item_parameters: Mapping of item ID to a dict holding ``'beta'``.
        granularity_feature_name: Column whose values identify the items.

    Returns:
        A list with one probability per row, in row order. A student or item
        missing from its mapping is scored with an estimate of 0.
    """
    def ability_of(student_id):
        if student_id in student_parameters:
            return student_parameters[student_id]['theta']
        return 0

    def difficulty_of(item_id):
        if item_id in item_parameters:
            return item_parameters[item_id]['beta']
        return 0

    rows = zip(
        test_df.userID.values,
        test_df[granularity_feature_name].values,
        test_df.left_asymptote.values,
    )

    return [
        probability_of_good_answer(ability_of(student_id), difficulty_of(item_id), left_asymptote)
        for student_id, item_id, left_asymptote in tqdm(rows)
    ]

In [6]:
def train_elo(train_df):
    """Estimate student and item parameters from ``train_df``.

    A constant ``left_asymptote`` column of 1/2 is written onto ``train_df``
    itself (the caller's frame gains the column), a short summary of the frame
    is printed, and ``estimate_parameters`` is run with its default
    granularity.

    Args:
        train_df: DataFrame of answers; must carry the columns that
            ``estimate_parameters`` reads besides ``left_asymptote``.

    Returns:
        The ``(student_parameters, item_parameters)`` tuple produced by
        ``estimate_parameters``.
    """
    # Same lower bound of the probability curve for every row.
    train_df['left_asymptote'] = 1/2

    for summary_line in (
        f"Dataset of shape {train_df.shape}",
        f"Columns are {list(train_df.columns)}",
    ):
        print(summary_line)

    return estimate_parameters(train_df)

In [7]:
def load_train_data(df):
    """Return a copy of ``df`` without the last row of each run of equal ``userID``.

    A row is dropped when the next row has a different ``userID`` (or when
    there is no next row). Rows are compared in frame order, so the frame is
    expected to be grouped by user already.

    Args:
        df: DataFrame with a ``userID`` column.

    Returns:
        A new DataFrame (independent copy) holding the remaining rows with
        their original index labels.
    """
    user_ids = df['userID']
    # shift(-1) aligns every row with the user of the row that follows it;
    # the final row is compared against a missing value and therefore dropped.
    is_last_of_run = user_ids.ne(user_ids.shift(-1))
    return df.loc[~is_last_of_run].copy()

In [8]:
# Read the interaction log. Only the three columns used by the Elo model are
# loaded, and answerCode is stored as int8.
data_dir = '/opt/ml/project/data/'
csv_file_path = os.path.join(data_dir, 'total_data.csv')
df = pd.read_csv(
    csv_file_path,
    dtype={'answerCode': 'int8'},
    usecols=['userID', 'assessmentItemID', 'answerCode'],
)

In [9]:
# Create the output directory before the long training pass, so a missing
# folder cannot make the run fail only at the very end when pickling.
os.makedirs('./assets', exist_ok=True)

# Training rows: everything except the last row of each user.
train_df = load_train_data(df)

student_parameters, item_parameters = train_elo(train_df)

# Persist both parameter dictionaries for later reuse.
with open('./assets/elo_student_parameters.pickle', 'wb') as f:
    pickle.dump(student_parameters, f)

with open('./assets/elo_item_parameters.pickle', 'wb') as f:
    pickle.dump(item_parameters, f)

Dataset of shape (2519258, 4)
Columns are ['userID', 'assessmentItemID', 'answerCode', 'left_asymptote']
Parameter estimation is starting...


2519258it [00:43, 57933.65it/s]


Theta & beta estimations on assessmentItemID are completed.


In [10]:
# Rows to predict are the ones whose answerCode is the placeholder -1.
test_df = df[df['answerCode'] == -1].copy()
test_df['left_asymptote']=1/2

preds = estimate_probas(test_df, student_parameters, item_parameters)

write_path = './output/elo_submission.csv'
# Make sure the target folder exists; open(..., 'w') does not create it.
os.makedirs(os.path.dirname(write_path), exist_ok=True)

with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    # `row_id` rather than `id`: a top-level loop variable named `id` would
    # shadow the builtin for the rest of the kernel session.
    for row_id, prediction in enumerate(preds):
        w.write('{},{}\n'.format(row_id, prediction))

print("Done")

744it [00:00, 134148.49it/s]

writing prediction : ./output/elo_submission.csv
Done



