In [9]:
import numpy as np
import pandas as pd

import os

from tqdm import tqdm

In [10]:
def elo(df):
    """Add an Elo-style ``elo`` feature column to ``df`` and return it.

    A per-student parameter ``theta`` and a per-item parameter ``beta`` are
    estimated by walking the rows of ``df`` in their current order and nudging
    both parameters after every answer. Each row is then scored with
    ``sigmoid(theta - beta)`` using the *final* parameters.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``userID``, ``assessmentItemID`` and
        ``answerCode``. Rows are processed in the order given, so the caller
        is responsible for sorting them beforehand.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, mutated in place: a constant
        ``left_asymptote`` column (all zeros) and the new ``elo`` column are
        added.

    Notes
    -----
    * Rows whose ``answerCode`` is neither 0 nor 1 (for example a ``-1``
      placeholder for an unknown label) are skipped during estimation.
      Feeding such a value into the update would be read as "worse than a
      wrong answer" and would bias both ``theta`` and ``beta``. Those rows
      still receive an ``elo`` value.
    * NOTE(review): the score of a row is computed from parameters that were
      fitted on all rows, including that row's own label.
    """

    def sigmoid(x):
        """Logistic function."""
        return 1 / (1 + np.exp(-x))

    def probability_of_good_answer(theta, beta, left_asymptote):
        """Probability of a correct answer, with ``left_asymptote`` as floor."""
        return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)

    def learning_rate_theta(nb_answers):
        """Step size for theta: decays with the answer count, floored at 0.04."""
        return max(0.3 / (1 + 0.01 * nb_answers), 0.04)

    def learning_rate_beta(nb_answers):
        """Step size for beta: decays with the answer count."""
        return 1 / (1 + 0.05 * nb_answers)

    def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        """Move theta towards the observed outcome (up on a correct answer)."""
        return theta + learning_rate_theta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        """Move beta away from the observed outcome (down on a correct answer)."""
        return beta - learning_rate_beta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"):
        """Run the sequential update and return ``(student_parameters, item_parameters)``.

        Both results are dicts keyed by id; each value holds the current
        estimate and the number of answers that contributed to it.
        """
        item_parameters = {
            granularity_feature_value: {"beta": 0, "nb_answers": 0}
            for granularity_feature_value in np.unique(
                answers_df[granularity_feature_name]
            )
        }
        student_parameters = {
            student_id: {"theta": 0, "nb_answers": 0}
            for student_id in np.unique(answers_df.userID)
        }

        print("Parameter estimation is starting...", flush=True)

        for student_id, item_id, left_asymptote, answered_correctly in tqdm(
            zip(
                answers_df.userID.values,
                answers_df[granularity_feature_name].values,
                answers_df.left_asymptote.values,
                answers_df.answerCode.values,
            ),
            total=len(answers_df),
        ):
            # Only genuine 0/1 labels carry information; anything else
            # (e.g. -1 for "unknown") must not move the estimates.
            if answered_correctly not in (0, 1):
                continue

            student = student_parameters[student_id]
            item = item_parameters[item_id]

            # Both updates use the values from *before* this answer.
            theta = student["theta"]
            beta = item["beta"]

            item["beta"] = get_new_beta(
                answered_correctly, beta, left_asymptote, theta, item["nb_answers"]
            )
            student["theta"] = get_new_theta(
                answered_correctly, beta, left_asymptote, theta, student["nb_answers"]
            )

            item["nb_answers"] += 1
            student["nb_answers"] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
        return student_parameters, item_parameters

    # The update formulas take a guessing floor; it is fixed to zero here.
    df["left_asymptote"] = 0

    print(f"Dataset of shape {df.shape}")
    print(f"Columns are {list(df.columns)}")

    student_parameters, item_parameters = estimate_parameters(df)

    # Score every row with the final parameters of its student and item.
    df["elo"] = [
        sigmoid(student_parameters[student]["theta"] - item_parameters[item]["beta"])
        for student, item in zip(df.userID.values, df.assessmentItemID.values)
    ]

    return df

In [11]:
# Load train_data.csv and test_data.csv into DataFrames.
# dtype keys must match the CSV header exactly; the tag column is spelled
# "KnowledgeTag", so that is the key used here.
data_type = {"userID": "int16", "answerCode": "int8", "KnowledgeTag": "int16"}

df1 = pd.read_csv(
    "/opt/ml/input/data/train_data.csv", dtype=data_type, parse_dates=["Timestamp"]
)
df2 = pd.read_csv(
    "/opt/ml/input/data/test_data.csv", dtype=data_type, parse_dates=["Timestamp"]
)

# Tag each row with its origin (1 = train file, 2 = test file) so the two
# splits can be told apart after concatenation.
df1['dataset'] = 1
df2['dataset'] = 2

df = pd.concat([df1,df2])

# Order each user's rows chronologically and rebuild a clean 0..n-1 index.
df = df.sort_values(by=["userID", "Timestamp"]).reset_index(drop=True)


In [16]:
# Display the test-split frame (df2) as the cell output.
df2

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,dataset
0,3,A050023001,A050000023,1,2020-01-09 10:56:31,2626,2
1,3,A050023002,A050000023,1,2020-01-09 10:56:57,2626,2
2,3,A050023003,A050000023,0,2020-01-09 10:58:31,2625,2
3,3,A050023004,A050000023,0,2020-01-09 10:58:36,2625,2
4,3,A050023006,A050000023,0,2020-01-09 10:58:43,2623,2
...,...,...,...,...,...,...,...
260109,7439,A040130001,A040000130,0,2020-10-14 23:07:23,8832,2
260110,7439,A040130002,A040000130,1,2020-10-14 23:07:41,8832,2
260111,7439,A040130003,A040000130,1,2020-10-14 23:08:02,8244,2
260112,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244,2


In [12]:
# Apply the ELO function to the combined frame, then remove the
# "left_asymptote" column, which is no longer needed once "elo" is computed.
df = elo(df).drop(columns=["left_asymptote"])


Dataset of shape (2526700, 8)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'dataset', 'left_asymptote']
Parameter estimation is starting...


100%|██████████| 2526700/2526700 [01:03<00:00, 39845.04it/s]


Theta & beta estimations on assessmentItemID are completed.


In [13]:
import gc
# This call's return value is discarded: it is not the cell's last expression.
gc.get_count()
# Force a garbage-collection pass; the return value of gc.collect() is the cell output.
gc.collect()

183

In [14]:
# Show the garbage collector's current counts as the cell output.
gc.get_count()

(122, 0, 0)

In [15]:
# Save the frame with the newly added "elo" feature.
# Make sure the target directory exists before writing.
os.makedirs(name="/opt/ml/input/data/", exist_ok=True)
# Write the final result as CSV, without the index column.
df.to_csv(path_or_buf="/opt/ml/input/data/elo.csv", index=False)