In [15]:
import numpy as np
import pandas as pd
import tqdm

df = pd.read_csv("/opt/ml/input/data/train_test_data.csv")
df = df.sort_values(by=["userID", "Timestamp"]).reset_index(drop=True)

In [16]:
from datetime import date
import math
from tqdm import tqdm

def fe(df):
    df = add_last_problem(df)
    df = ELO_function(df)
    df = df.drop(['last_problem', 'left_asymptote'], axis=1)
    
    df['assessmentItemID0'] = df['assessmentItemID'].str[2]
    df['assessmentItemID1'] = df['assessmentItemID'].str[4:7]
    df['assessmentItemID2'] = df["assessmentItemID"].apply(lambda x: int(x[-2:]))

    df['month'] = df.Timestamp.str[5:7]
    df['hour'] = df.Timestamp.str[11:13]

    yy = df.Timestamp.str[:4]
    mm = df.Timestamp.str[5:7]
    dd = df.Timestamp.str[8:10]

    y = [int(y) for y in list(yy)]
    m = [int(m) for m in list(mm)]
    d = [int(d) for d in list(dd)]

    df['week'] = [date(i,j,k).weekday() for i,j,k in zip(y,m,d)]

    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    diff = df.loc[:, ['userID', 'Timestamp']].groupby('userID').diff().fillna(pd.Timedelta(seconds=0))
    diff = diff.fillna(pd.Timedelta(seconds=0))
    diff = diff['Timestamp'].apply(lambda x: x.total_seconds())

    df['elapsed'] = diff

    df["elapsed"] = df["elapsed"].apply(
        lambda x: x if x < 1800 and x >= 0 else 0)

    def elap(x):
        if x == 0:
            return 0
        elif x < 9:
            return 1
        elif x < 301:
            return 2
        elif x < 601:
            return 3
        elif x < 1201:
            return 4
        elif x < 1501:
            return 5
        elif x < 1801:
            return 6
        else:
            return 7

    df['elapsed_cate'] = df.elapsed.apply(lambda x : elap(x))
    
    correct_k = df.groupby(["KnowledgeTag"])["answerCode"].agg(["mean", "std"])
    correct_k.columns = ["tag_mean", "tag_std"]

    correct_a = df.groupby(["assessmentItemID0"])["answerCode"].agg(["mean", "std"])
    correct_a.columns = ["ass0_mean", "ass0_std"]
    correct_a1 = df.groupby(["assessmentItemID1"])["answerCode"].agg(["mean", "std"])
    correct_a1.columns = ["ass1_mean", "ass1_std"]
    correct_a2 = df.groupby(["assessmentItemID2"])["answerCode"].agg(["mean", "std"])
    correct_a2.columns = ["ass2_mean", "ass2_std"]

    df = pd.merge(df, correct_k, on=["KnowledgeTag"], how="left")
    df = pd.merge(df, correct_a, on=["assessmentItemID0"], how="left")
    df = pd.merge(df, correct_a1, on=["assessmentItemID1"], how="left")
    df = pd.merge(df, correct_a2, on=["assessmentItemID2"], how="left")
    
    df["user_correct_answer"] = (
        df.groupby("userID")["answerCode"]
        .transform(lambda x: x.cumsum().shift(1))
        .fillna(0)
    )
    df["user_total_answer"] = df.groupby("userID")["answerCode"].cumcount() + 1
    df["user_acc"] = (df["user_correct_answer"] / df["user_total_answer"])
    
    df['recAccuracy'] = -1

    group = df.groupby("userID")

    def set_rec_count(group):
        group.sort_values(by='Timestamp', axis=0, inplace=True)
        recAccuracy = group['answerCode'].rolling(10, min_periods=1).sum().shift(1)
        group['recCount'] = recAccuracy

        return group

    df['recCount'] = 0

    group = df.groupby("userID")

    group = group.apply(set_rec_count)
    group.reset_index(drop=True, inplace=True)

    df['recCount'] = group['recCount']
    df['recCount']= df['recCount'].fillna(0)
    df['recCount'] = df['recCount'].astype(int)
    
    def abc(x):
        if x > 10:
            return 10
        else:
            return x

    df['abc'] = df.user_total_answer.apply(lambda x:abc(x))
    df['recAccuracy'] = df['recCount']/df['abc']
    df = df.drop(['abc'], axis=1)
    
    df["solve_order"] = df.groupby(["userID", "testId"]).cumcount()
    df["solve_order"] = (
        df["solve_order"]
        - df["assessmentItemID2"] * (df["solve_order"] > df["assessmentItemID2"]).apply(int)
        + 1
    )
    
    return df

def ELO_function(df):
    def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        return theta + learning_rate_theta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        return beta - learning_rate_beta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def learning_rate_theta(nb_answers):
        return max(0.3 / (1 + 0.01 * nb_answers), 0.04)

    def learning_rate_beta(nb_answers):
        return 1 / (1 + 0.05 * nb_answers)

    def probability_of_good_answer(theta, beta, left_asymptote):
        return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"):
        item_parameters = {
            granularity_feature_value: {"beta": 0, "nb_answers": 0}
            for granularity_feature_value in np.unique(
                answers_df[granularity_feature_name]
            )
        }
        student_parameters = {
            student_id: {"theta": 0, "nb_answers": 0}
            for student_id in np.unique(answers_df.userID)
        }

        print("Parameter estimation is starting...")

        for student_id, item_id, left_asymptote, answered_correctly in tqdm(
            zip(
                answers_df.userID.values,
                answers_df[granularity_feature_name].values,
                answers_df.left_asymptote.values,
                answers_df.answerCode.values,
            )
        ):
            theta = student_parameters[student_id]["theta"]
            beta = item_parameters[item_id]["beta"]

            item_parameters[item_id]["beta"] = get_new_beta(
                answered_correctly,
                beta,
                left_asymptote,
                theta,
                item_parameters[item_id]["nb_answers"],
            )
            student_parameters[student_id]["theta"] = get_new_theta(
                answered_correctly,
                beta,
                left_asymptote,
                theta,
                student_parameters[student_id]["nb_answers"],
            )

            item_parameters[item_id]["nb_answers"] += 1
            student_parameters[student_id]["nb_answers"] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
        return student_parameters, item_parameters

    def gou_func(theta, beta):
        return 1 / (1 + np.exp(-(theta - beta)))

    df["left_asymptote"] = 0

    print(f"Dataset of shape {df.shape}")
    print(f"Columns are {list(df.columns)}")

    student_parameters, item_parameters = estimate_parameters(df)

    prob = [
        gou_func(student_parameters[student]["theta"], item_parameters[item]["beta"])
        for student, item in zip(df.userID.values, df.assessmentItemID.values)
    ]

    df["elo_prob"] = prob

    return df

def add_last_problem(df):
    new = []
    pre = df["testId"][0]
    for idx in df["testId"]:
        if pre != idx:
            new[-1] = -1
            pre = idx
        new.append(0)
    df["last_problem"] = new
    return df

df = fe(df)
df
#df.to_csv('../../../data/pkj_2.0.csv', index=False)

Dataset of shape (2525956, 8)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'last_problem', 'left_asymptote']


26429it [00:00, 131973.88it/s]

Parameter estimation is starting...


2525956it [00:17, 141907.01it/s]


Theta & beta estimations on assessmentItemID are completed.


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,elo_prob,assessmentItemID0,assessmentItemID1,assessmentItemID2,...,ass1_mean,ass1_std,ass2_mean,ass2_std,user_correct_answer,user_total_answer,user_acc,recAccuracy,recCount,solve_order
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,0.980777,6,001,1,...,0.708478,0.454477,0.749694,0.433190,0.0,1,0.000000,0.000000,0,1
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,0.973329,6,001,2,...,0.708478,0.454477,0.720312,0.448847,1.0,2,0.500000,0.500000,1,2
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,0.947325,6,001,3,...,0.708478,0.454477,0.688360,0.463164,2.0,3,0.666667,0.666667,2,3
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,0.974927,6,001,4,...,0.708478,0.454477,0.663597,0.472479,3.0,4,0.750000,0.750000,3,4
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,0.961414,6,001,5,...,0.708478,0.454477,0.599375,0.490026,4.0,5,0.800000,0.800000,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525951,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,0.295043,3,071,5,...,0.641616,0.479541,0.599375,0.490026,1.0,5,0.200000,0.200000,1,5
2525952,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,0.616699,4,165,1,...,0.709128,0.454192,0.749694,0.433190,1.0,6,0.166667,0.166667,1,1
2525953,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,0.600956,4,165,2,...,0.709128,0.454192,0.720312,0.448847,2.0,7,0.285714,0.285714,2,2
2525954,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,0.725808,4,165,3,...,0.709128,0.454192,0.688360,0.463164,3.0,8,0.375000,0.375000,3,3


In [17]:
# label encoding

cate = ['KnowledgeTag', 'month', 'hour', 'week', 'elapsed_cate',

'assessmentItemID0', 'assessmentItemID1', 'assessmentItemID2']

conti = ['elapsed' , 'tag_mean', 'tag_std', 'ass0_mean', 'ass0_std',

'ass1_mean', 'ass1_std', 'ass2_mean', 'ass2_std', 'user_correct_answer',

'user_total_answer', 'user_acc', 'recAccuracy', 'recCount',  'elo_prob',  'solve_order']


for i in cate:
    cate2label = {j:i for i,j in enumerate(df[i].unique())}
    df[i] = df[i].map(cate2label)

In [None]:
q = pd.read_csv('../../../data/pkj_elo.csv')
df['elo_prob'] = q['elo_prob']
df.to_csv('../../../data/pkj_2.0.csv', index=False)

In [None]:
# df = pd.read_csv("/opt/ml/input/data/train_data.csv")
# test = pd.read_csv('/opt/ml/input/data/test_data.csv')
# df = pd.concat([df, test])
# df = df.sort_values(by=["userID", "Timestamp"]).reset_index(drop=True)

# df = add_last_problem(df)
# # elo 추가
# df = ELO_function(df)
# df = df.drop(['last_problem', 'left_asymptote'], axis=1)
# df = df[df['answerCode']>= 0]