In [1]:
import numpy as np 
import pandas as pd 
from tqdm import tqdm

ELO functions

In [2]:
def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
    """Return the student's ability estimate after one observed answer.

    Args:
        is_good_answer: observed outcome of the answer (compared against a probability).
        beta: current difficulty estimate of the item.
        left_asymptote: lower bound of the success probability, forwarded to
            ``probability_of_good_answer``.
        theta: current ability estimate of the student.
        nb_previous_answers: count fed to ``learning_rate_theta`` to size the step.

    Returns:
        ``theta`` shifted by the learning rate times (outcome - predicted probability).
    """
    predicted = probability_of_good_answer(theta, beta, left_asymptote)
    prediction_error = is_good_answer - predicted
    step_size = learning_rate_theta(nb_previous_answers)
    # A positive error (did better than predicted) raises the ability estimate.
    return theta + step_size * prediction_error

def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
    """Return the item's difficulty estimate after one observed answer.

    Args:
        is_good_answer: observed outcome of the answer (compared against a probability).
        beta: current difficulty estimate of the item.
        left_asymptote: lower bound of the success probability, forwarded to
            ``probability_of_good_answer``.
        theta: current ability estimate of the student.
        nb_previous_answers: count fed to ``learning_rate_beta`` to size the step.

    Returns:
        ``beta`` shifted by the learning rate times (outcome - predicted probability),
        with the opposite sign to the theta update.
    """
    predicted = probability_of_good_answer(theta, beta, left_asymptote)
    prediction_error = is_good_answer - predicted
    step_size = learning_rate_beta(nb_previous_answers)
    # A positive error (did better than predicted) lowers the difficulty estimate.
    return beta - step_size * prediction_error

def learning_rate_theta(nb_answers):
    """Step size for theta updates: 0.3 / (1 + 0.01 * nb_answers), never below 0.04."""
    minimum_rate = 0.04
    decayed_rate = 0.3 / (1 + 0.01 * nb_answers)
    # Same selection as max(decayed_rate, minimum_rate): the floor only wins when strictly larger.
    return minimum_rate if minimum_rate > decayed_rate else decayed_rate

def learning_rate_beta(nb_answers):
    """Step size for beta updates: 1 / (1 + 0.05 * nb_answers), with no lower floor."""
    decay = 1 + 0.05 * nb_answers
    return 1 / decay

def probability_of_good_answer(theta, beta, left_asymptote):
    """Predicted probability of a good answer for ability ``theta`` on difficulty ``beta``.

    The sigmoid of ``theta - beta`` is rescaled into the interval
    ``[left_asymptote, 1]``, so the result never drops below ``left_asymptote``.
    """
    ability_gap = theta - beta
    remaining_range = 1 - left_asymptote
    return left_asymptote + remaining_range * sigmoid(ability_gap)

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    Args:
        x: scalar or array-like accepted by numpy ufuncs.

    Returns:
        The logistic of ``x`` as a numpy scalar/array with values in [0, 1].
    """
    # logaddexp(0, -x) == log(1 + exp(-x)) but is computed without ever taking exp of a
    # large positive number, so very negative x yields 0.0 instead of an overflow
    # RuntimeWarning from np.exp(-x).
    return np.exp(-np.logaddexp(0, -x))

Parameter Estimation

In [3]:
def estimate_parameters(answers_df, granularity_feature_name='content_id'):
    """Fit student thetas and item betas from scratch over a frame of answers.

    Args:
        answers_df: frame with ``student_id``, ``left_asymptote``,
            ``answered_correctly`` and the ``granularity_feature_name`` column.
            Rows are processed in the order they appear.
        granularity_feature_name: column whose values identify the items.

    Returns:
        ``(student_parameters, item_parameters)``: dicts keyed by student id / item id,
        each value holding the estimate (``"theta"`` or ``"beta"``) and ``"nb_answers"``.
    """
    # Every id starts at 0 with no answers; np.unique also fixes a sorted key order.
    item_parameters = {
        item_id: {"beta": 0, "nb_answers": 0}
        for item_id in np.unique(answers_df[granularity_feature_name])
    }
    student_parameters = {
        student_id: {"theta": 0, "nb_answers": 0}
        for student_id in np.unique(answers_df.student_id)
    }

    print("Parameter estimation is starting...")

    # The per-answer update is identical to the incremental one, so reuse
    # update_parameters (defined in a later cell, resolved at call time) instead of
    # keeping a second copy of the loop that could drift out of sync.
    student_parameters, item_parameters = update_parameters(
        answers_df,
        student_parameters,
        item_parameters,
        granularity_feature_name=granularity_feature_name,
    )

    print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
    return student_parameters, item_parameters

Update Parameters

In [4]:
def update_parameters(answers_df, student_parameters, item_parameters, granularity_feature_name='content_id'):
    """Apply one update per answer row to existing student/item parameters.

    Students or items missing from the dicts are added with an estimate of 0 and
    ``nb_answers`` of 0. Both dicts are modified in place and also returned.

    Args:
        answers_df: frame with ``student_id``, ``left_asymptote``,
            ``answered_correctly`` and the ``granularity_feature_name`` column.
        student_parameters: dict of ``{'theta', 'nb_answers'}`` records by student id.
        item_parameters: dict of ``{'beta', 'nb_answers'}`` records by item id.
        granularity_feature_name: column whose values identify the items.

    Returns:
        The same ``(student_parameters, item_parameters)`` objects after the updates.
    """
    answer_rows = zip(
        answers_df.student_id.values,
        answers_df[granularity_feature_name].values,
        answers_df.left_asymptote.values,
        answers_df.answered_correctly.values,
    )

    for student_id, item_id, left_asymptote, answered_correctly in tqdm(answer_rows):
        student_record = student_parameters.setdefault(student_id, {'theta': 0, 'nb_answers': 0})
        item_record = item_parameters.setdefault(item_id, {'beta': 0, 'nb_answers': 0})

        # Snapshot both estimates first: each update must see the other's pre-answer value.
        theta = student_record['theta']
        beta = item_record['beta']

        student_record['theta'] = get_new_theta(
            answered_correctly, beta, left_asymptote, theta, student_record['nb_answers']
        )
        item_record['beta'] = get_new_beta(
            answered_correctly, beta, left_asymptote, theta, item_record['nb_answers']
        )

        student_record['nb_answers'] += 1
        item_record['nb_answers'] += 1

    return student_parameters, item_parameters

Probability Estimation

In [5]:
def estimate_probas(test_df, student_parameters, item_parameters, granularity_feature_name='content_id'):
    """Predict the success probability of every row of ``test_df``.

    Args:
        test_df: frame with ``student_id``, ``left_asymptote`` and the
            ``granularity_feature_name`` column.
        student_parameters: dict of records holding ``'theta'`` by student id.
        item_parameters: dict of records holding ``'beta'`` by item id.
        granularity_feature_name: column whose values identify the items.

    Returns:
        A list with one probability per row, in row order. Unknown students and
        unknown items fall back to an estimate of 0.
    """
    # Stand-in records for ids that have no fitted parameters.
    unseen_student = {'theta': 0}
    unseen_item = {'beta': 0}

    prediction_rows = zip(
        test_df.student_id.values,
        test_df[granularity_feature_name].values,
        test_df.left_asymptote.values,
    )

    return [
        probability_of_good_answer(
            student_parameters.get(student_id, unseen_student)['theta'],
            item_parameters.get(item_id, unseen_item)['beta'],
            left_asymptote,
        )
        for student_id, item_id, left_asymptote in tqdm(prediction_rows)
    ]

Main

In [6]:
# True: the next cell fits thetas/betas from the CSV data.
# False: the next cell loads previously saved parameter files instead.
compute_estimations = True
# Intended cap on the number of training rows to read (None = no cap).
# NOTE(review): confirm the loading cell actually passes this as `nrows`.
nb_rows_training = None

In [7]:
if compute_estimations:
    DATA_PATH = '../../data/'
    objective = "assessmentItemID"
    # Source column -> name expected by the ELO helpers. Columns are renamed by name rather
    # than by position because read_csv ignores the order of `usecols` and returns the
    # columns in file order, so a positional rename can silently mislabel them.
    column_names = {"userID": "student_id", objective: "content_id", "answerCode": "answered_correctly"}
    read_options = {"usecols": list(column_names), "dtype": {"answerCode": "int8"}}

    # nb_rows_training=None reads the whole training file; an integer caps it for quick runs.
    train_raw = pd.read_csv(DATA_PATH+"train_data.csv", nrows=nb_rows_training, **read_options)
    test = pd.read_csv(DATA_PATH+"test_data.csv", **read_options)

    # NOTE(review): the updates are order-dependent and rows are consumed in file order,
    # with every test row placed after all training rows. An earlier draft sorted by
    # userID/Timestamp; confirm the files are already chronological per student.
    training = (
        pd.concat([train_raw, test], ignore_index=True)
        .rename(columns=column_names)
        .loc[:, ["student_id", "content_id", "answered_correctly"]]
    )

    # Rows with answered_correctly == -1 are dropped (presumably the unlabeled rows to
    # predict; verify). .assign builds a new frame, avoiding a column write on a filtered
    # slice and the SettingWithCopyWarning that comes with it.
    # NOTE(review): left_asymptote is 1/100 here but format_test_df uses 1/4; confirm.
    training = training[training.answered_correctly != -1].assign(left_asymptote=1/100)

    print(f"Dataset of shape {training.shape}")
    print(f"Columns are {list(training.columns)}")

    student_parameters, item_parameters = estimate_parameters(training)
else:
    # Reuse previously exported parameters: one record per index value, keyed by column name.
    student_data = pd.read_csv('../input/thetas-20201217/thetas_20201217.csv', index_col='student_id')
    student_parameters = student_data.to_dict('index')
    print("Successfully read student parameter file and converted to dict.")

    content_data = pd.read_csv('../input/betas-content-id-20201217/betas_content_id_20201217.csv', index_col='content_id')
    item_parameters = content_data.to_dict('index')
    print("Successfully read item parameter file and converted to dict.")

Dataset of shape (2525956, 4)
Columns are ['student_id', 'content_id', 'answered_correctly', 'left_asymptote']
Parameter estimation is starting...


2525956it [00:17, 146140.15it/s]

Theta & beta estimations on content_id are completed.





In [10]:
# One row per item, columns taken from the record keys (beta, nb_answers).
# from_dict(orient='index') builds the rows directly instead of creating a frame with one
# column per item and transposing it, which also turned the nb_answers counts into floats.
item_df = (
    pd.DataFrame.from_dict(item_parameters, orient='index')
    .rename_axis('assessmentItemID')
    .reset_index()
)

In [11]:
# One row per student, columns taken from the record keys (theta, nb_answers).
# from_dict(orient='index') builds the rows directly instead of creating a frame with one
# column per student and transposing it, which also turned the nb_answers counts into floats.
user_df = (
    pd.DataFrame.from_dict(student_parameters, orient='index')
    .rename_axis('userID')
    .reset_index()
)

In [45]:
# Write the fitted parameters next to the input data, without the positional index column.
user_df.to_csv(
    path_or_buf='../../data/user_elo.csv',
    index=False,
)
item_df.to_csv(
    path_or_buf='../../data/item_elo.csv',
    index=False,
)

Submission

In [None]:
def format_test_df(test_df):
    """Return a new frame shaped for the ELO helpers; ``test_df`` is left untouched.

    Keeps only the rows whose ``content_type_id`` is 0, adds a constant
    ``left_asymptote`` column of 1/4 and renames ``user_id`` to ``student_id``.

    Args:
        test_df: frame with at least a ``content_type_id`` column.

    Returns:
        The filtered, augmented and renamed copy.
    """
    keep_row = test_df['content_type_id'] == 0
    # Boolean indexing already yields a new frame, and .assign/.rename return new frames
    # too, so no column is ever written onto a filtered slice (the pattern that raises
    # SettingWithCopyWarning) and the caller's frame is never modified.
    return (
        test_df[keep_row]
        .assign(left_asymptote=1/4)
        .rename(columns={'user_id': 'student_id'})
    )

In [None]:
# NOTE(review): `riiideducation` is not imported in any visible cell; unless it is
# imported elsewhere this cell fails with NameError.
env = riiideducation.make_env()
iter_test = env.iter_test()

# Previous batch, held back until the following batch delivers its outcomes.
previous_test_df = None

for (test_df, sample_prediction_df) in iter_test:
    if previous_test_df is not None:
        # The first row of the current batch holds the previous batch's outcomes as a string.
        # NOTE(review): eval() executes arbitrary code from the feed; if the value is a
        # plain list literal, ast.literal_eval would be the safer parser. Confirm before changing.
        previous_test_df['answered_correctly'] = eval(test_df["prior_group_answers_correct"].iloc[0])
        # Outcomes are attached before formatting, so they line up with the unfiltered batch.
        previous_test_df = format_test_df(previous_test_df)
        student_parameters, item_parameters = update_parameters(previous_test_df, student_parameters, item_parameters)

    # Keep an unformatted copy: the next iteration writes the outcomes onto it.
    previous_test_df = test_df.copy()
    test_df = format_test_df(test_df)
    # Predictions use the parameters as updated with all earlier batches.
    test_df['answered_correctly'] = estimate_probas(test_df, student_parameters, item_parameters)
    env.predict(test_df[['row_id', 'answered_correctly']])