In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

path = '/opt/ml/input/data/'

train = pd.read_csv(path + 'train_data.csv', parse_dates=["Timestamp"])
test = pd.read_csv(path + 'test_data.csv', parse_dates=["Timestamp"])

dat = pd.concat([train, test], axis = 0)
dat = dat.sort_values(by = ['userID', 'Timestamp'])


In [2]:
_train = dat[dat['answerCode'] >= 0]
_test = dat[dat['answerCode'] < 0]

In [3]:
# valid 제작하는 함수 새로 개편.
# 예전 valid 제작 함수는 1분 걸렸는데 0.6초만에 끗~
_train['train_valid'] = 0
_train.loc[_train.drop_duplicates(subset='userID', keep = 'last').index, 'train_valid'] = -1
_valid = _train[_train['train_valid'] == -1]
_train = _train[_train['train_valid'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _train['train_valid'] = 0


In [4]:
def elo(df):
    def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        return theta + learning_rate_theta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        return beta - learning_rate_beta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def learning_rate_theta(nb_answers):
        return max(0.3 / (1 + 0.01 * nb_answers), 0.04)

    def learning_rate_beta(nb_answers):
        return 1 / (1 + 0.05 * nb_answers)

    def probability_of_good_answer(theta, beta, left_asymptote):
        return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"):
        item_parameters = {
            granularity_feature_value: {"beta": 0, "nb_answers": 0}
            for granularity_feature_value in np.unique(
                answers_df[granularity_feature_name]
            )
        }
        student_parameters = {
            student_id: {"theta": 0, "nb_answers": 0}
            for student_id in np.unique(answers_df.userID)
        }

        print("Parameter estimation is starting...", flush=True)

        for student_id, item_id, left_asymptote, answered_correctly in tqdm(
            zip(
                answers_df.userID.values,
                answers_df[granularity_feature_name].values,
                answers_df.left_asymptote.values,
                answers_df.answerCode.values,
            ),
            total=len(answers_df),
        ):
            theta = student_parameters[student_id]["theta"]
            beta = item_parameters[item_id]["beta"]

            item_parameters[item_id]["beta"] = get_new_beta(
                answered_correctly,
                beta,
                left_asymptote,
                theta,
                item_parameters[item_id]["nb_answers"],
            )
            student_parameters[student_id]["theta"] = get_new_theta(
                answered_correctly,
                beta,
                left_asymptote,
                theta,
                student_parameters[student_id]["nb_answers"],
            )

            item_parameters[item_id]["nb_answers"] += 1
            student_parameters[student_id]["nb_answers"] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
        return student_parameters, item_parameters

    df["left_asymptote"] = 1/4

    print(f"Dataset of shape {df.shape}")
    print(f"Columns are {list(df.columns)}")

    student_parameters, item_parameters = estimate_parameters(df)

    
    t = [
        student_parameters[student]["theta"]
        for student, item in zip(df.userID.values, df.assessmentItemID.values)
    ]
    b = [
        item_parameters[item]["beta"]
        for student, item in zip(df.userID.values, df.assessmentItemID.values)
    ]
    #theta represents the global level of the student
    df['theta'] = t

    #beta represents the difficulty of the item 
    df['beta'] = b

    return df


In [5]:
# ELO Function 적용
_train = elo(_train)

# 필요없는 column 제거
_train = _train.drop(columns=["left_asymptote"])


Dataset of shape (2517453, 8)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'train_valid', 'left_asymptote']
Parameter estimation is starting...


100%|██████████| 2517453/2517453 [00:16<00:00, 152719.31it/s]


Theta & beta estimations on assessmentItemID are completed.


In [6]:
_train

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,train_valid,theta,beta
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,0,0.101186,-3.451272
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,0,0.101186,-3.103244
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,0,0.101186,-2.285213
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,0,0.101186,-3.174484
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,0,0.101186,-2.666981
...,...,...,...,...,...,...,...,...,...
2266580,7441,A030071004,A030000071,0,2020-06-05 06:49:57,438,0,-0.260698,0.336689
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,0,-0.260698,1.624803
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,0,-0.260698,-0.156949
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,0,-0.260698,-0.097456


In [7]:
users = _train.drop_duplicates(['userID'])[['userID','theta']]
items = _train.drop_duplicates(['assessmentItemID'])[['assessmentItemID','beta']]

In [8]:
users.to_csv('/opt/ml/input/data/FE/users_elo.csv', index=False)
items.to_csv('/opt/ml/input/data/FE/items_elo.csv', index=False)

In [9]:
_test = pd.merge(_test, users, on='userID', how='left')
_test = pd.merge(_test, items, on='assessmentItemID', how='left')
_test

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,theta,beta
0,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289,0.046952,0.931300
1,4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080,0.371375,0.724024
2,13,A070111008,A070000111,-1,2020-12-27 04:35:09,9660,-0.616772,2.251412
3,17,A090064006,A090000064,-1,2020-10-30 05:48:37,2611,1.211347,3.703508
4,26,A060135007,A060000135,-1,2020-10-23 11:44:18,1422,0.531571,3.060104
...,...,...,...,...,...,...,...,...
739,7395,A040122005,A040000122,-1,2020-09-08 02:05:20,10615,-1.581653,1.516570
740,7404,A030111005,A030000111,-1,2020-10-13 09:49:18,7636,-0.898962,-1.870949
741,7416,A050193004,A050000193,-1,2020-10-04 02:44:41,10402,-0.368780,-1.556544
742,7417,A050193004,A050000193,-1,2020-09-06 13:09:15,10402,-1.552496,-1.556544


In [10]:
_valid = pd.merge(_valid, users, on='userID', how='left')
_valid = pd.merge(_valid, items, on='assessmentItemID', how='left')
_valid

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,train_valid,theta,beta
0,0,A080129006,A080000129,0,2020-12-23 03:40:19,2725,-1,0.101186,7.510783
1,1,A090024003,A090000024,1,2020-05-07 23:26:04,10169,-1,1.734775,4.986012
2,1,A090074006,A090000074,1,2020-11-13 02:47:20,2648,-1,1.734775,1.681038
3,2,A030019003,A030000019,1,2020-03-21 05:01:12,419,-1,0.150922,-0.276852
4,2,A050139007,A050000139,0,2020-10-20 11:32:26,428,-1,0.150922,8.424690
...,...,...,...,...,...,...,...,...,...
8498,7437,A060003007,A060000003,0,2020-05-22 01:53:49,7226,-1,-1.252442,-1.503742
8499,7438,A030188005,A030000188,1,2020-10-19 10:28:29,1934,-1,-0.097728,0.691423
8500,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244,-1,-0.070211,-1.394714
8501,7440,A030197005,A030000197,0,2020-10-21 08:33:20,1984,-1,-1.105478,0.511546


In [11]:
dat = pd.concat([_train,_valid,_test])
dat = dat.sort_values(by = ['userID', 'Timestamp'])

In [12]:
dat

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,train_valid,theta,beta
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,0.0,0.101186,-3.451272
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,0.0,0.101186,-3.103244
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,0.0,0.101186,-2.285213
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,0.0,0.101186,-3.174484
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,0.0,0.101186,-2.666981
...,...,...,...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,0.0,-0.260698,1.624803
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,0.0,-0.260698,-0.156949
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,0.0,-0.260698,-0.097456
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,0.0,-0.260698,-0.887061


In [13]:
dat['elo'] = dat.apply(lambda x : 1 / (np.exp(-(x['theta']-x['beta']))),axis=1)

In [14]:
diff = dat.loc[:, ["userID", "Timestamp"]].groupby("userID").diff().fillna(pd.Timedelta(seconds=0))
diff['Timestamp'] = diff['Timestamp'].apply(pd.Timedelta.total_seconds)
diff['Timestamp'] = diff['Timestamp'].apply(lambda x: 0 if x >600 or x<0 else x)
dat['solve_time'] = diff['Timestamp']

In [15]:
# 시험 난이도(?)로 추청되는 특성 따로 분류
dat['b_category'] = dat['assessmentItemID'].str[2]
# 시험지로 추청되는 특성 따로 분류. 뒤에 것이 좋을 것 같은데 일단 제출을 한 것으로 써놓음.
dat['test_category'] = dat['assessmentItemID'].str[2] + dat['assessmentItemID'].str[4:7] # dat['assessmentItemID'].str[4:7]
dat['problem_id'] = dat['assessmentItemID'].str[-3:]

In [16]:
diff['Timestamp'] = dat['solve_time']
diff['Timestamp'] = pd.qcut(diff['Timestamp'],5)
diff['Timestamp'] = diff['Timestamp'].astype("str")
dat['category_st_qcut_5'] = diff['Timestamp']

In [17]:
dat['solve_time'] = dat['solve_time'].astype(int)

In [18]:
dat['last_answerCode'] = dat.groupby("userID")['answerCode'].shift(1).fillna(1)
dat['last_answerCode2'] = dat.groupby("userID")['answerCode'].shift(2).fillna(1)
dat['last_answerCode3'] = dat.groupby("userID")['answerCode'].shift(3).fillna(1)
dat['last_answerCode4'] = dat.groupby("userID")['answerCode'].shift(4).fillna(1)
dat['last_answerCode5'] = dat.groupby("userID")['answerCode'].shift(5).fillna(1)
dat['last_answerCode6'] = dat.groupby("userID")['answerCode'].shift(6).fillna(1)
dat['last_answerCode7'] = dat.groupby("userID")['answerCode'].shift(7).fillna(1)
dat['last_answerCode8'] = dat.groupby("userID")['answerCode'].shift(8).fillna(1)
dat['last_answerCode9'] = dat.groupby("userID")['answerCode'].shift(9).fillna(1)
dat['last_answerCode10'] = dat.groupby("userID")['answerCode'].shift(10).fillna(1)

In [19]:
dat['last_answerCode'] = dat['last_answerCode'].astype(int)
dat['last_answerCode2'] = dat['last_answerCode2'].astype(int)
dat['last_answerCode3'] = dat['last_answerCode3'].astype(int)
dat['last_answerCode4'] = dat['last_answerCode4'].astype(int)
dat['last_answerCode5'] = dat['last_answerCode5'].astype(int)
dat['last_answerCode6'] = dat['last_answerCode6'].astype(int)
dat['last_answerCode7'] = dat['last_answerCode7'].astype(int)
dat['last_answerCode8'] = dat['last_answerCode8'].astype(int)
dat['last_answerCode9'] = dat['last_answerCode9'].astype(int)
dat['last_answerCode10'] = dat['last_answerCode10'].astype(int)

In [None]:
dat.drop(['train_valid'], axis = 1, inplace = True) 

In [20]:
dat['year'] = dat['Timestamp'].dt.year # 연도 정보
dat['month'] =  dat['Timestamp'].dt.month # 월 정보
dat['day'] =  dat['Timestamp'].dt.day # 일 정보
dat['hour'] =  dat['Timestamp'].dt.hour # 시간 정보

dat.to_csv('/opt/ml/input/data/FE/FE_total_2_elo2.csv', index=False)

In [28]:
dat['elo'].isnull().value_counts()

False    2526700
Name: elo, dtype: int64