In [1]:
import pandas as pd
import os
from tqdm import tqdm
import numpy as np
import pickle

In [2]:
def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
    """Return the student's ability estimate after observing one answer.

    The estimate moves by ``learning_rate_theta(nb_previous_answers)`` times
    the prediction error (observed outcome minus predicted probability), so a
    better-than-expected answer raises theta.

    Args:
        is_good_answer: Observed outcome of the answer (presumably 1 for a
            correct answer and 0 otherwise -- confirm against the data).
        beta: Current difficulty estimate of the item.
        left_asymptote: Lower bound of the predicted success probability.
        theta: Current ability estimate of the student.
        nb_previous_answers: Number of answers already used to update this
            student's theta; drives the learning-rate decay.

    Returns:
        The updated theta.
    """
    expected = probability_of_good_answer(theta, beta, left_asymptote)
    step_size = learning_rate_theta(nb_previous_answers)
    prediction_error = is_good_answer - expected
    return theta + step_size * prediction_error

def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
    """Return the item's difficulty estimate after observing one answer.

    Mirror image of the theta update: the prediction error is *subtracted*,
    so a better-than-expected answer lowers beta.

    Args:
        is_good_answer: Observed outcome of the answer (presumably 1 for a
            correct answer and 0 otherwise -- confirm against the data).
        beta: Current difficulty estimate of the item.
        left_asymptote: Lower bound of the predicted success probability.
        theta: Current ability estimate of the student.
        nb_previous_answers: Number of answers already used to update this
            item's beta; drives the learning-rate decay.

    Returns:
        The updated beta.
    """
    expected = probability_of_good_answer(theta, beta, left_asymptote)
    step_size = learning_rate_beta(nb_previous_answers)
    prediction_error = is_good_answer - expected
    return beta - step_size * prediction_error

def learning_rate_theta(nb_answers):
    """Return the step size for a theta update after ``nb_answers`` answers.

    The rate starts at 0.3, decays hyperbolically with the number of answers,
    and is never allowed to drop below 0.04.
    """
    floor = 0.04
    decayed = 0.3 / (1 + 0.01 * nb_answers)
    # Same tie-breaking as max(decayed, floor): keep `decayed` unless the
    # floor is strictly larger.
    return floor if floor > decayed else decayed

def learning_rate_beta(nb_answers):
    """Return the step size for a beta update after ``nb_answers`` answers.

    The rate starts at 1 and decays hyperbolically with the number of answers;
    unlike the theta rate there is no lower floor.
    """
    damping = 1 + 0.05 * nb_answers
    return 1 / damping

def probability_of_good_answer(theta, beta, left_asymptote):
    """Return the predicted probability of a good answer.

    A logistic curve in ``theta - beta`` is rescaled so that its range is
    ``(left_asymptote, 1)`` instead of ``(0, 1)``.

    Args:
        theta: Ability estimate of the student.
        beta: Difficulty estimate of the item.
        left_asymptote: Lower bound of the returned probability.
    """
    logistic = sigmoid(theta - beta)
    headroom = 1 - left_asymptote
    return left_asymptote + headroom * logistic

def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [3]:
def estimate_parameters(answers_df, granularity_feature_name='assessmentItemID'):
    """Estimate student and item parameters from scratch in a single pass.

    Every student starts at ``theta = 0`` and every item at ``beta = 0``.
    The rows of ``answers_df`` are then visited in their existing order and,
    for each row, the item's beta and the student's theta are both updated
    from the values they had *before* that row.

    Args:
        answers_df: DataFrame with the columns ``userID``, ``left_asymptote``,
            ``answerCode`` and the column named by ``granularity_feature_name``.
        granularity_feature_name: Column whose values identify the "item"
            that receives a beta.

    Returns:
        ``(student_parameters, item_parameters)``: two dicts keyed by student
        id / item id, each value being ``{"theta" | "beta": ..., "nb_answers": ...}``.
    """
    item_parameters = {
        granularity_feature_value: {"beta": 0, "nb_answers": 0}
        for granularity_feature_value in np.unique(answers_df[granularity_feature_name])
    }
    student_parameters = {
        student_id: {"theta": 0, "nb_answers": 0}
        for student_id in np.unique(answers_df.userID)
    }

    print("Parameter estimation is starting...")

    answer_stream = zip(
        answers_df.userID.values,
        answers_df[granularity_feature_name].values,
        answers_df.left_asymptote.values,
        answers_df.answerCode.values,
    )
    # zip() has no len(), so hand tqdm the row count explicitly; otherwise the
    # bar can only show a raw iteration counter with no percentage or ETA.
    for student_id, item_id, left_asymptote, answerCode in tqdm(
        answer_stream, total=len(answers_df)
    ):
        student_state = student_parameters[student_id]
        item_state = item_parameters[item_id]

        # Snapshot both estimates first so the two updates below are computed
        # from the same pre-answer values.
        theta = student_state["theta"]
        beta = item_state["beta"]

        item_state["beta"] = get_new_beta(
            answerCode, beta, left_asymptote, theta, item_state["nb_answers"],
        )
        student_state["theta"] = get_new_theta(
            answerCode, beta, left_asymptote, theta, student_state["nb_answers"],
        )

        item_state["nb_answers"] += 1
        student_state["nb_answers"] += 1

    print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
    return student_parameters, item_parameters


def update_parameters(answers_df, student_parameters, item_parameters, granularity_feature_name='assessmentItemID'):
    """Continue the online estimation on additional answers.

    Applies the same per-row theta/beta update as the initial estimation, but
    starts from the supplied parameter dicts. Students or items not yet
    present are inserted with a zero estimate and a zero answer count.

    Args:
        answers_df: DataFrame with the columns ``userID``, ``left_asymptote``,
            ``answerCode`` and the column named by ``granularity_feature_name``.
        student_parameters: Dict ``student_id -> {'theta', 'nb_answers'}``.
            Mutated in place.
        item_parameters: Dict ``item_id -> {'beta', 'nb_answers'}``.
            Mutated in place.
        granularity_feature_name: Column whose values identify the item.

    Returns:
        The same ``(student_parameters, item_parameters)`` objects that were
        passed in, after the updates.
    """
    answer_stream = zip(
        answers_df.userID.values,
        answers_df[granularity_feature_name].values,
        answers_df.left_asymptote.values,
        answers_df.answerCode.values,
    )
    # zip() has no len(), so hand tqdm the row count explicitly; otherwise the
    # bar can only show a raw iteration counter with no percentage or ETA.
    for student_id, item_id, left_asymptote, answerCode in tqdm(
        answer_stream, total=len(answers_df)
    ):
        # Cold start: unseen students/items begin from the neutral estimate.
        if student_id not in student_parameters:
            student_parameters[student_id] = {'theta': 0, 'nb_answers': 0}
        if item_id not in item_parameters:
            item_parameters[item_id] = {'beta': 0, 'nb_answers': 0}

        student_state = student_parameters[student_id]
        item_state = item_parameters[item_id]

        # Snapshot both estimates first so the two updates below are computed
        # from the same pre-answer values.
        theta = student_state['theta']
        beta = item_state['beta']

        student_state['theta'] = get_new_theta(
            answerCode, beta, left_asymptote, theta, student_state['nb_answers']
        )
        item_state['beta'] = get_new_beta(
            answerCode, beta, left_asymptote, theta, item_state['nb_answers']
        )

        student_state['nb_answers'] += 1
        item_state['nb_answers'] += 1

    print(f"Theta & beta estimations on {granularity_feature_name} are updated.")

    return student_parameters, item_parameters


def estimate_probas(test_df, student_parameters, item_parameters, granularity_feature_name='assessmentItemID'):
    """Predict the success probability for every row of ``test_df``.

    Parameters are only read, never updated. A student or item that is missing
    from the parameter dicts is scored with an estimate of 0.

    Args:
        test_df: DataFrame with the columns ``userID``, ``left_asymptote`` and
            the column named by ``granularity_feature_name``.
        student_parameters: Dict ``student_id -> {'theta', ...}``.
        item_parameters: Dict ``item_id -> {'beta', ...}``.
        granularity_feature_name: Column whose values identify the item.

    Returns:
        A list with one predicted probability per row, in row order.
    """
    probability_of_success_list = []

    rows = zip(
        test_df.userID.values,
        test_df[granularity_feature_name].values,
        test_df.left_asymptote.values,
    )
    # zip() has no len(), so hand tqdm the row count explicitly; otherwise the
    # bar can only show a raw iteration counter with no percentage or ETA.
    for student_id, item_id, left_asymptote in tqdm(rows, total=len(test_df)):
        if student_id in student_parameters:
            theta = student_parameters[student_id]['theta']
        else:
            theta = 0

        if item_id in item_parameters:
            beta = item_parameters[item_id]['beta']
        else:
            beta = 0

        probability_of_success_list.append(probability_of_good_answer(theta, beta, left_asymptote))

    return probability_of_success_list

In [4]:
# Directory holding the input CSV (also read later for the user-split pickle).
data_dir = '/opt/ml/project/data/'
csv_file_path = os.path.join(data_dir, 'total_data.csv')

# Load the answers and order them by user, then by time, so that row order
# within each user follows the Timestamp column; the index is renumbered to
# match the sorted order.
df = (
    pd.read_csv(csv_file_path, parse_dates=['Timestamp'])
    .sort_values(by=['userID', 'Timestamp'])
    .reset_index(drop=True)
)

In [5]:
def train_elo(train_df):
    """Fit student and item parameters on ``train_df``.

    Sets a constant ``left_asymptote`` of 0.5 on every row, prints the frame's
    shape and columns, then runs the from-scratch parameter estimation with
    its default item granularity.

    Note:
        The ``left_asymptote`` column is written onto ``train_df`` itself, so
        the caller's frame is modified.

    Returns:
        ``(student_parameters, item_parameters)`` as produced by
        ``estimate_parameters``.
    """
    train_df['left_asymptote'] = 0.5

    print(f"Dataset of shape {train_df.shape}")
    print(f"Columns are {list(train_df.columns)}")

    fitted_students, fitted_items = estimate_parameters(train_df)
    return fitted_students, fitted_items

In [6]:
def valid_update(df, cv_num):
    """Flag the validation rows of ``df`` for fold ``cv_num``.

    Users listed neither as train users nor as test users in the pickled user
    split are treated as validation users. For each of them:

    * ``is_valid`` is set on their last ``cv_num`` rows;
    * ``cv_idx`` is set on their ``cv_num``-th row counted from the end.

    Both columns are reset to False on every call before being filled, and
    ``df`` is modified in place (the same object is returned).

    Args:
        df: Answers DataFrame with a ``userID`` column; row order within each
            user determines which rows count as "last".
        cv_num: Number of trailing rows per validation user to flag.

    Returns:
        ``df`` with the boolean columns ``cv_idx`` and ``is_valid`` set.
    """
    # NOTE(review): the split file is always 'cv1_users.pickle', whatever
    # `cv_num` is -- confirm that a single user split for all folds is intended.
    users_file_path = os.path.join(data_dir, 'cv1_users.pickle')
    # pickle.load executes arbitrary code from the file; only use trusted files.
    with open(users_file_path, 'rb') as split_file:
        user_split = pickle.load(split_file)
    train_users = user_split['train_users']
    test_users = user_split['test_users']

    not_in_train = ~df['userID'].isin(train_users)
    not_in_test = ~df['userID'].isin(test_users)
    validation_rows = df[not_in_train & not_in_test]

    cv_idx = validation_rows.groupby('userID', as_index=False).nth(-cv_num).index
    valid_idx = validation_rows.groupby('userID').tail(cv_num).index

    df['cv_idx'] = False
    df['is_valid'] = False

    df.loc[cv_idx, 'cv_idx'] = True
    df.loc[valid_idx, 'is_valid'] = True

    return df

In [7]:
# Number of cross-validation folds; fold k holds out the last k rows of every
# validation user.
cv_len = 5

for cv_num in range(1, 1+cv_len):
    df = valid_update(df, cv_num)

    cond1 = df['is_valid'] == True
    # answerCode == -1 rows are kept out of training as well (presumably the
    # unlabeled rows to predict -- confirm against the dataset description).
    cond2 = df['answerCode'] == -1
    test_df = df[cond1|cond2].copy()
    train_df = df[~(cond1|cond2)].copy()

    # Sanity figures: train users, validation rows, answerCode == -1 rows, total rows.
    print(train_df.userID.nunique(), len(df[cond1]), len(df[cond2]), len(train_df) + len(test_df))

    student_parameters, item_parameters = train_elo(train_df)

    # Create the fold directory up front: open(..., 'wb') does not create
    # missing parent directories and would raise FileNotFoundError.
    output_dir = os.path.join('./assets3', f'cv{cv_num}')
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(output_dir, 'elo_student_parameters.pickle'), 'wb') as f:
        pickle.dump(student_parameters, f)

    with open(os.path.join(output_dir, 'elo_item_parameters.pickle'), 'wb') as f:
        pickle.dump(item_parameters, f)

7442 2007 744 2526700
Dataset of shape (2523949, 13)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'elapsedTime', 'testType', 'testNumber', 'questionNumber', 'cv_idx', 'is_valid', 'left_asymptote']
Parameter estimation is starting...


2523949it [00:39, 63618.57it/s]


Theta & beta estimations on assessmentItemID are completed.
7442 4014 744 2526700
Dataset of shape (2521942, 13)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'elapsedTime', 'testType', 'testNumber', 'questionNumber', 'cv_idx', 'is_valid', 'left_asymptote']
Parameter estimation is starting...


2521942it [00:34, 73709.02it/s]


Theta & beta estimations on assessmentItemID are completed.
7442 6021 744 2526700
Dataset of shape (2519935, 13)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'elapsedTime', 'testType', 'testNumber', 'questionNumber', 'cv_idx', 'is_valid', 'left_asymptote']
Parameter estimation is starting...


2519935it [00:33, 74501.37it/s]


Theta & beta estimations on assessmentItemID are completed.
7442 8028 744 2526700
Dataset of shape (2517928, 13)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'elapsedTime', 'testType', 'testNumber', 'questionNumber', 'cv_idx', 'is_valid', 'left_asymptote']
Parameter estimation is starting...


2517928it [00:34, 73833.29it/s]


Theta & beta estimations on assessmentItemID are completed.
7442 10035 744 2526700
Dataset of shape (2515921, 13)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'elapsedTime', 'testType', 'testNumber', 'questionNumber', 'cv_idx', 'is_valid', 'left_asymptote']
Parameter estimation is starting...


2515921it [00:34, 73391.43it/s]


Theta & beta estimations on assessmentItemID are completed.
