In [9]:
import pandas as pd
# NOTE(review): the variable is named `train`, but the file read here is
# 'input/data/test_data.csv'. The name is kept because a later cell calls fe(train).
train = pd.read_csv('input/data/test_data.csv')

In [10]:
import numpy as np

In [11]:
import math
import os
from datetime import date

from tqdm import tqdm

def fe(df):
    """Build the engineered feature table for an interaction log.

    The input must provide the columns read below: ``userID``,
    ``assessmentItemID``, ``testId``, ``answerCode``, ``KnowledgeTag`` and
    ``Timestamp``. ``Timestamp`` must hold strings laid out like
    ``'2020-01-09 10:56:31'`` because the month/hour features slice it by
    character position.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw interactions. The frame is copied before any column is added, so
        the caller's object is left unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns (``Timestamp`` converted to
        datetime) followed by, in this order: ``elo_prob``,
        ``assessmentItemID0/1/2``, ``month``, ``hour``, ``week``, ``elapsed``,
        ``elapsed_cate``, ``tag_*``, ``ass0_*``, ``ass1_*``, ``ass2_*``
        (mean/std of ``answerCode`` per key), ``user_correct_answer``,
        ``user_total_answer``, ``user_acc``, ``recAccuracy``, ``recCount`` and
        ``solve_order``.

    Notes
    -----
    * The per-key mean/std statistics are computed over every row of the
      frame passed in, including each row's own ``answerCode``.
    * ``user_correct_answer`` sums only the *earlier* rows of a user, while
      ``user_total_answer`` counts rows up to and *including* the current
      one; ``user_acc`` and ``recAccuracy`` divide one by the other as-is.
    """
    # add_last_problem() and ELO_function() attach columns to the frame they
    # are given; copying first keeps those columns out of the caller's frame.
    df = df.copy()
    df = add_last_problem(df)
    df = ELO_function(df)
    df = df.drop(['last_problem', 'left_asymptote'], axis=1)

    # Positional pieces of the item id (e.g. 'A050023001' -> '5', '023', 1).
    item_id = df['assessmentItemID']
    df['assessmentItemID0'] = item_id.str[2]
    df['assessmentItemID1'] = item_id.str[4:7]
    df['assessmentItemID2'] = item_id.str[-2:].astype('int64')

    # Calendar features. month/hour stay zero-padded strings taken from the
    # raw text; week is the Monday=0 weekday number.
    raw_timestamp = df['Timestamp']
    df['month'] = raw_timestamp.str[5:7]
    df['hour'] = raw_timestamp.str[11:13]
    parsed_timestamp = pd.to_datetime(raw_timestamp)
    df['week'] = parsed_timestamp.dt.weekday.astype('int64')
    df['Timestamp'] = parsed_timestamp

    # Seconds since the same user's previous row; the first row of a user and
    # any gap outside [0, 1800) seconds are recorded as 0.
    elapsed = (
        df.groupby('userID')['Timestamp'].diff().dt.total_seconds().fillna(0)
    )
    df['elapsed'] = elapsed.where((elapsed >= 0) & (elapsed < 1800), 0)

    # Bucket the gap: 0 -> 0, <9 -> 1, <301 -> 2, <601 -> 3, <1201 -> 4,
    # <1501 -> 5, <1801 -> 6, otherwise 7.
    bucket_edges = [9, 301, 601, 1201, 1501, 1801]
    elapsed_values = df['elapsed'].to_numpy()
    df['elapsed_cate'] = np.where(
        elapsed_values == 0, 0, np.digitize(elapsed_values, bucket_edges) + 1
    )

    # Mean / std of answerCode per grouping key, merged back onto every row.
    # The left merges also leave the frame with a fresh 0..n-1 index.
    for key, prefix in (
        ("KnowledgeTag", "tag"),
        ("assessmentItemID0", "ass0"),
        ("assessmentItemID1", "ass1"),
        ("assessmentItemID2", "ass2"),
    ):
        stats = df.groupby([key])["answerCode"].agg(["mean", "std"])
        stats.columns = [f"{prefix}_mean", f"{prefix}_std"]
        df = pd.merge(df, stats, on=[key], how="left")

    # Per-user running totals in row order (see Notes in the docstring).
    df["user_correct_answer"] = (
        df.groupby("userID")["answerCode"]
        .transform(lambda x: x.cumsum().shift(1))
        .fillna(0)
    )
    df["user_total_answer"] = df.groupby("userID")["answerCode"].cumcount() + 1
    df["user_acc"] = (df["user_correct_answer"] / df["user_total_answer"])

    # Sum of the user's previous (up to) 10 answerCode values in time order.
    # transform() keeps the original row labels, so the result is aligned back
    # by index even when the frame is not sorted by user/time; the stable sort
    # keeps rows with identical timestamps in their original order.
    recent_correct = (
        df[['userID', 'Timestamp', 'answerCode']]
        .sort_values('Timestamp', kind='stable')
        .groupby('userID')['answerCode']
        .transform(lambda s: s.rolling(10, min_periods=1).sum().shift(1))
        .fillna(0)
        .astype(int)
    )
    # recAccuracy is written first to keep the established column order.
    df['recAccuracy'] = recent_correct / df['user_total_answer'].clip(upper=10)
    df['recCount'] = recent_correct

    # 0-based position of the row inside its (userID, testId) group; when that
    # position exceeds the item number the item number is subtracted, then 1
    # is added. NOTE(review): presumably this restarts the count when a test
    # is attempted again — confirm the intent.
    position = df.groupby(["userID", "testId"]).cumcount()
    item_number = df["assessmentItemID2"]
    df["solve_order"] = (
        position - item_number * (position > item_number).astype('int64') + 1
    )

    return df

def ELO_function(df):
    """Add an Elo-style ``elo_prob`` column to ``df`` and return it.

    Every student starts with ``theta = 0`` and every item with ``beta = 0``.
    The rows are then visited once, in frame order; for each row the predicted
    probability of a correct answer is compared with ``answerCode`` and the
    student's theta / the item's beta are nudged in opposite directions by a
    learning rate that shrinks with the number of answers already seen.
    ``elo_prob`` is finally computed for every row from the *final* theta and
    beta values.

    The frame is modified in place: a constant ``left_asymptote`` column (0)
    and the ``elo_prob`` column are added. Progress messages are printed.
    """

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def probability_of_good_answer(theta, beta, left_asymptote):
        # Logistic curve lifted so that its lower bound is left_asymptote.
        return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)

    def learning_rate_theta(nb_answers):
        # Decays with experience but never drops below 0.04.
        return max(0.3 / (1 + 0.01 * nb_answers), 0.04)

    def learning_rate_beta(nb_answers):
        return 1 / (1 + 0.05 * nb_answers)

    def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"):
        # One mutable record per distinct item and per distinct student.
        item_parameters = {}
        for item_key in np.unique(answers_df[granularity_feature_name]):
            item_parameters[item_key] = {"beta": 0, "nb_answers": 0}

        student_parameters = {}
        for student_key in np.unique(answers_df.userID):
            student_parameters[student_key] = {"theta": 0, "nb_answers": 0}

        print("Parameter estimation is starting...")

        interactions = zip(
            answers_df.userID.values,
            answers_df[granularity_feature_name].values,
            answers_df.left_asymptote.values,
            answers_df.answerCode.values,
        )
        for student_id, item_id, left_asymptote, answered_correctly in tqdm(interactions):
            student = student_parameters[student_id]
            item = item_parameters[item_id]
            theta = student["theta"]
            beta = item["beta"]

            # Both updates use the prediction made with the pre-update values.
            error = answered_correctly - probability_of_good_answer(
                theta, beta, left_asymptote
            )
            item["beta"] = beta - learning_rate_beta(item["nb_answers"]) * error
            student["theta"] = theta + learning_rate_theta(student["nb_answers"]) * error

            item["nb_answers"] += 1
            student["nb_answers"] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
        return student_parameters, item_parameters

    df["left_asymptote"] = 0

    print(f"Dataset of shape {df.shape}")
    print(f"Columns are {list(df.columns)}")

    student_parameters, item_parameters = estimate_parameters(df)

    # Plain logistic of (final theta - final beta) for every row.
    elo_probabilities = []
    for student, item in zip(df.userID.values, df.assessmentItemID.values):
        theta = student_parameters[student]["theta"]
        beta = item_parameters[item]["beta"]
        elo_probabilities.append(sigmoid(theta - beta))

    df["elo_prob"] = elo_probabilities

    return df

def add_last_problem(df):
    """Flag rows that are directly followed by a different ``testId``.

    Adds a ``last_problem`` column to ``df`` in place and returns ``df``.
    A row gets -1 when the next row (in frame order) has a different
    ``testId`` and 0 otherwise. The final row has no successor and therefore
    always gets 0. Only ``testId`` is compared; ``userID`` is not consulted.

    The comparison is positional, so it works for any index (including one
    without the label 0) and for an empty frame.
    """
    test_ids = df["testId"].to_numpy()
    flags = np.zeros(len(test_ids), dtype=np.int64)
    if len(test_ids) > 1:
        # Compare every row with its successor; the last row keeps its 0.
        differs_from_next = test_ids[:-1] != test_ids[1:]
        flags[:-1] = np.where(differs_from_next, -1, 0)
    df["last_problem"] = flags
    return df



In [12]:
# Run the feature-engineering pipeline on the loaded frame and display the result.
df = fe(train)
df
# Optional CSV export of the engineered frame (currently disabled).
#df.to_csv('../../../pkj_elo.csv', index=False)

Dataset of shape (260114, 8)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'last_problem', 'left_asymptote']
Parameter estimation is starting...


260114it [00:01, 143602.83it/s]


Theta & beta estimations on assessmentItemID are completed.


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,elo_prob,assessmentItemID0,assessmentItemID1,assessmentItemID2,...,ass1_mean,ass1_std,ass2_mean,ass2_std,user_correct_answer,user_total_answer,user_acc,recAccuracy,recCount,solve_order
0,3,A050023001,A050000023,1,2020-01-09 10:56:31,2626,0.752296,5,023,1,...,0.598709,0.491388,0.747717,0.434382,0.0,1,0.000000,0.000000,0,1
1,3,A050023002,A050000023,1,2020-01-09 10:56:57,2626,0.416693,5,023,2,...,0.598709,0.491388,0.722324,0.448068,1.0,2,0.500000,0.500000,1,2
2,3,A050023003,A050000023,0,2020-01-09 10:58:31,2625,0.281076,5,023,3,...,0.598709,0.491388,0.693241,0.461459,2.0,3,0.666667,0.666667,2,3
3,3,A050023004,A050000023,0,2020-01-09 10:58:36,2625,0.382929,5,023,4,...,0.598709,0.491388,0.664376,0.473811,2.0,4,0.500000,0.500000,2,4
4,3,A050023006,A050000023,0,2020-01-09 10:58:43,2623,0.166432,5,023,6,...,0.598709,0.491388,0.560674,0.504929,2.0,5,0.400000,0.400000,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260109,7439,A040130001,A040000130,0,2020-10-14 23:07:23,8832,0.132075,4,130,1,...,0.578438,0.508151,0.747717,0.434382,8.0,12,0.666667,0.700000,7,1
260110,7439,A040130002,A040000130,1,2020-10-14 23:07:41,8832,0.410814,4,130,2,...,0.578438,0.508151,0.722324,0.448068,8.0,13,0.615385,0.600000,6,2
260111,7439,A040130003,A040000130,1,2020-10-14 23:08:02,8244,0.621880,4,130,3,...,0.578438,0.508151,0.693241,0.461459,9.0,14,0.642857,0.600000,6,3
260112,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244,0.760312,4,130,4,...,0.578438,0.508151,0.664376,0.473811,10.0,15,0.666667,0.600000,6,4


In [13]:
# List the columns of the engineered frame.
df.columns 

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'elo_prob', 'assessmentItemID0', 'assessmentItemID1',
       'assessmentItemID2', 'month', 'hour', 'week', 'elapsed', 'elapsed_cate',
       'tag_mean', 'tag_std', 'ass0_mean', 'ass0_std', 'ass1_mean', 'ass1_std',
       'ass2_mean', 'ass2_std', 'user_correct_answer', 'user_total_answer',
       'user_acc', 'recAccuracy', 'recCount', 'solve_order'],
      dtype='object')

In [14]:
# Persist each engineered feature column as its own .npy file under test_col/.
# np.save does not create missing directories, so make sure the target exists
# first; this keeps the cell runnable on a fresh checkout.
os.makedirs('test_col', exist_ok=True)
for feature_name in ['elapsed', 'elapsed_cate',
       'tag_mean', 'tag_std', 'ass0_mean', 'ass0_std', 'ass1_mean', 'ass1_std',
       'ass2_mean', 'ass2_std', 'user_correct_answer', 'user_total_answer',
       'user_acc', 'recAccuracy', 'recCount', 'solve_order']:
    np.save('test_col/' + feature_name + '.npy', df[feature_name].to_numpy())

In [15]:
# Sanity check: sorted distinct values of the per-user running sum of earlier answerCode values.
np.unique(df['user_correct_answer'])

array([0.000e+00, 1.000e+00, 2.000e+00, ..., 1.109e+03, 1.110e+03,
       1.111e+03])