In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

path = '/opt/ml/input/data/'

train = pd.read_csv(path + 'train_data.csv', parse_dates=["Timestamp"])
test = pd.read_csv(path + 'test_data.csv', parse_dates=["Timestamp"])

dat = pd.concat([train, test], axis = 0)
dat = dat.sort_values(by = ['userID', 'Timestamp'])


In [2]:
_train = dat[dat['answerCode'] >= 0]
_test = dat[dat['answerCode'] < 0]

In [3]:
# valid 제작하는 함수 새로 개편.
# 예전 valid 제작 함수는 1분 걸렸는데 0.6초만에 끗~
_train['train_valid'] = 0
_train.loc[_train.drop_duplicates(subset='userID', keep = 'last').index, 'train_valid'] = -1
_valid = _train[_train['train_valid'] == -1]
_train = _train[_train['train_valid'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _train['train_valid'] = 0


In [4]:
def elo(df):
    def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        return theta + learning_rate_theta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        return beta - learning_rate_beta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def learning_rate_theta(nb_answers):
        return max(0.3 / (1 + 0.01 * nb_answers), 0.04)

    def learning_rate_beta(nb_answers):
        return 1 / (1 + 0.05 * nb_answers)

    def probability_of_good_answer(theta, beta, left_asymptote):
        return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"):
        item_parameters = {
            granularity_feature_value: {"beta": 0, "nb_answers": 0}
            for granularity_feature_value in np.unique(
                answers_df[granularity_feature_name]
            )
        }
        student_parameters = {
            student_id: {"theta": 0, "nb_answers": 0}
            for student_id in np.unique(answers_df.userID)
        }

        print("Parameter estimation is starting...", flush=True)

        for student_id, item_id, left_asymptote, answered_correctly in tqdm(
            zip(
                answers_df.userID.values,
                answers_df[granularity_feature_name].values,
                answers_df.left_asymptote.values,
                answers_df.answerCode.values,
            ),
            total=len(answers_df),
        ):
            theta = student_parameters[student_id]["theta"]
            beta = item_parameters[item_id]["beta"]

            item_parameters[item_id]["beta"] = get_new_beta(
                answered_correctly,
                beta,
                left_asymptote,
                theta,
                item_parameters[item_id]["nb_answers"],
            )
            student_parameters[student_id]["theta"] = get_new_theta(
                answered_correctly,
                beta,
                left_asymptote,
                theta,
                student_parameters[student_id]["nb_answers"],
            )

            item_parameters[item_id]["nb_answers"] += 1
            student_parameters[student_id]["nb_answers"] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
        return student_parameters, item_parameters
        




    df["left_asymptote"] = 0

    print(f"Dataset of shape {df.shape}")
    print(f"Columns are {list(df.columns)}")

    student_parameters, item_parameters = estimate_parameters(df)

    
    t = [
        student_parameters[student]["theta"]
        for student, item in zip(df.userID.values, df.assessmentItemID.values)
    ]
    b = [
        item_parameters[item]["beta"]
        for student, item in zip(df.userID.values, df.assessmentItemID.values)
    ]


    #theta represents the global level of the student
    df['elouser'] = t

    #beta represents the difficulty of the item 
    df['eloitem'] = b

    student_parameters, item_parameters = estimate_parameters(df,granularity_feature_name = "KnowledgeTag")


    b = [
        item_parameters[item]["beta"]
        for student, item in zip(df.userID.values, df.KnowledgeTag.values)
    ]
    

    #beta represents the difficulty of the item 
    df['elotag'] = b

    student_parameters, item_parameters = estimate_parameters(df,granularity_feature_name = "testId")


    b = [
        item_parameters[item]["beta"]
        for student, item in zip(df.userID.values, df.testId.values)
    ]
    

    #beta represents the difficulty of the item 
    df['elotest'] = b


    return df


In [5]:
# ELO Function 적용
_train = elo(_train)

# 필요없는 column 제거
_train = _train.drop(columns=["left_asymptote"])


Dataset of shape (2517453, 8)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'train_valid', 'left_asymptote']
Parameter estimation is starting...


100%|██████████| 2517453/2517453 [00:56<00:00, 44278.23it/s]


Theta & beta estimations on assessmentItemID are completed.
Parameter estimation is starting...


100%|██████████| 2517453/2517453 [00:56<00:00, 44210.22it/s]


Theta & beta estimations on KnowledgeTag are completed.
Parameter estimation is starting...


100%|██████████| 2517453/2517453 [00:57<00:00, 44100.18it/s]


Theta & beta estimations on testId are completed.


In [6]:
_train

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,train_valid,elouser,eloitem,elotag,elotest
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,0,0.604065,-3.354789,-2.988107,-2.915382
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,0,0.604065,-3.019619,-2.256471,-2.915382
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,0,0.604065,-2.311757,-2.256471,-2.915382
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,0,0.604065,-3.083097,-2.256471,-2.915382
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,0,0.604065,-2.637877,-2.256471,-2.915382
...,...,...,...,...,...,...,...,...,...,...,...
2266580,7441,A030071004,A030000071,0,2020-06-05 06:49:57,438,0,-0.099183,0.011644,-0.419509,-0.380745
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,0,-0.099183,0.924409,-0.419509,-0.380745
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,0,-0.099183,-0.412535,-0.576496,-0.444296
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,0,-0.099183,-0.346430,-0.576496,-0.444296


In [7]:
users = _train.drop_duplicates(['userID'])[['userID','elouser']]
items = _train.drop_duplicates(['assessmentItemID'])[['assessmentItemID','eloitem']]
tag = _train.drop_duplicates(['KnowledgeTag'])[['KnowledgeTag','elotag']]
testid = _train.drop_duplicates(['testId'])[['testId','elotest']]
#update = _train.drop_duplicates(['assessmentItemID'])[['assessmentItemID','eloitem']]

In [9]:
users.to_csv('/opt/ml/input/data/users_elo.csv', index=False)
items.to_csv('/opt/ml/input/data/items_elo.csv', index=False)
tag.to_csv('/opt/ml/input/data/tag_elo.csv', index=False)
testid.to_csv('/opt/ml/input/data/testid_elo.csv', index=False)

In [10]:
_test = pd.merge(_test, users, on='userID', how='left')
_test = pd.merge(_test, items, on='assessmentItemID', how='left')
_test = pd.merge(_test, tag, on='KnowledgeTag', how='left')
_test = pd.merge(_test, testid, on='testId', how='left')
_test

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,elouser,eloitem,elotag,elotest
0,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289,0.538307,0.631749,0.184383,0.007187
1,4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080,0.777092,0.643335,0.372203,0.045865
2,13,A070111008,A070000111,-1,2020-12-27 04:35:09,9660,-0.088830,1.358287,0.551662,0.751898
3,17,A090064006,A090000064,-1,2020-10-30 05:48:37,2611,1.601569,1.922164,1.053062,0.898570
4,26,A060135007,A060000135,-1,2020-10-23 11:44:18,1422,1.007924,1.603718,-0.172100,-0.240808
...,...,...,...,...,...,...,...,...,...,...
739,7395,A040122005,A040000122,-1,2020-09-08 02:05:20,10615,-1.159209,0.809970,-0.572445,-1.038277
740,7404,A030111005,A030000111,-1,2020-10-13 09:49:18,7636,-0.644115,-1.650815,-1.333114,-1.504015
741,7416,A050193004,A050000193,-1,2020-10-04 02:44:41,10402,-0.167283,-1.436450,-1.233924,-0.999234
742,7417,A050193004,A050000193,-1,2020-09-06 13:09:15,10402,-1.158855,-1.436450,-1.233924,-0.999234


In [11]:
_valid = pd.merge(_valid, users, on='userID', how='left')
_valid = pd.merge(_valid, items, on='assessmentItemID', how='left')
_valid = pd.merge(_test, tag, on='KnowledgeTag', how='left')
_valid = pd.merge(_test, testid, on='testId', how='left')
_valid

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,elouser,eloitem,elotag,elotest_x,elotest_y
0,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289,0.538307,0.631749,0.184383,0.007187,0.007187
1,4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080,0.777092,0.643335,0.372203,0.045865,0.045865
2,13,A070111008,A070000111,-1,2020-12-27 04:35:09,9660,-0.088830,1.358287,0.551662,0.751898,0.751898
3,17,A090064006,A090000064,-1,2020-10-30 05:48:37,2611,1.601569,1.922164,1.053062,0.898570,0.898570
4,26,A060135007,A060000135,-1,2020-10-23 11:44:18,1422,1.007924,1.603718,-0.172100,-0.240808,-0.240808
...,...,...,...,...,...,...,...,...,...,...,...
739,7395,A040122005,A040000122,-1,2020-09-08 02:05:20,10615,-1.159209,0.809970,-0.572445,-1.038277,-1.038277
740,7404,A030111005,A030000111,-1,2020-10-13 09:49:18,7636,-0.644115,-1.650815,-1.333114,-1.504015,-1.504015
741,7416,A050193004,A050000193,-1,2020-10-04 02:44:41,10402,-0.167283,-1.436450,-1.233924,-0.999234,-0.999234
742,7417,A050193004,A050000193,-1,2020-09-06 13:09:15,10402,-1.158855,-1.436450,-1.233924,-0.999234,-0.999234


In [12]:
dat = pd.concat([_train,_valid,_test])
dat = dat.sort_values(by = ['userID', 'Timestamp'])

In [13]:
dat

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,train_valid,elouser,eloitem,elotag,elotest,elotest_x,elotest_y
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,0.0,0.604065,-3.354789,-2.988107,-2.915382,,
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,0.0,0.604065,-3.019619,-2.256471,-2.915382,,
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,0.0,0.604065,-2.311757,-2.256471,-2.915382,,
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,0.0,0.604065,-3.083097,-2.256471,-2.915382,,
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,0.0,0.604065,-2.637877,-2.256471,-2.915382,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266580,7441,A030071004,A030000071,0,2020-06-05 06:49:57,438,0.0,-0.099183,0.011644,-0.419509,-0.380745,,
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,0.0,-0.099183,0.924409,-0.419509,-0.380745,,
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,0.0,-0.099183,-0.412535,-0.576496,-0.444296,,
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,0.0,-0.099183,-0.346430,-0.576496,-0.444296,,


In [29]:
dat['elo'] = dat.apply(lambda x : 1 / (1 + (np.exp(-(x['elouser']-x['eloitem'])))),axis=1)

In [15]:
diff = dat.loc[:, ["userID", "Timestamp"]].groupby("userID").diff().fillna(pd.Timedelta(seconds=0))
diff['Timestamp'] = diff['Timestamp'].apply(pd.Timedelta.total_seconds)
#diff['Timestamp'] = diff['Timestamp'].apply(lambda x: 0 if x >600 or x<0 else x)
dat['solve_time'] = diff['Timestamp']

In [16]:
# 시험 난이도(?)로 추청되는 특성 따로 분류
dat['b_category'] = dat['assessmentItemID'].str[2]
# 시험지로 추청되는 특성 따로 분류. 뒤에 것이 좋을 것 같은데 일단 제출을 한 것으로 써놓음.
dat['test_category'] = dat['assessmentItemID'].str[2] + dat['assessmentItemID'].str[4:7] # dat['assessmentItemID'].str[4:7]
dat['problem_id'] = dat['assessmentItemID'].str[-3:]

In [17]:
diff['Timestamp'] = dat['solve_time']
diff['Timestamp'] = pd.qcut(diff['Timestamp'],5)
diff['Timestamp'] = diff['Timestamp'].astype("str")
dat['category_st_qcut_5'] = diff['Timestamp']

In [18]:
dat['solve_time'] = dat['solve_time'].astype(int)

In [19]:
dat['last_answerCode'] = dat.groupby("userID")['answerCode'].shift(1).fillna(1)
dat['last_answerCode2'] = dat.groupby("userID")['answerCode'].shift(2).fillna(1)
dat['last_answerCode3'] = dat.groupby("userID")['answerCode'].shift(3).fillna(1)
dat['last_answerCode4'] = dat.groupby("userID")['answerCode'].shift(4).fillna(1)
dat['last_answerCode5'] = dat.groupby("userID")['answerCode'].shift(5).fillna(1)
dat['last_answerCode6'] = dat.groupby("userID")['answerCode'].shift(6).fillna(1)
dat['last_answerCode7'] = dat.groupby("userID")['answerCode'].shift(7).fillna(1)
dat['last_answerCode8'] = dat.groupby("userID")['answerCode'].shift(8).fillna(1)
dat['last_answerCode9'] = dat.groupby("userID")['answerCode'].shift(9).fillna(1)
dat['last_answerCode10'] = dat.groupby("userID")['answerCode'].shift(10).fillna(1)

In [20]:
dat['last_answerCode'] = dat['last_answerCode'].astype(int)
dat['last_answerCode2'] = dat['last_answerCode2'].astype(int)
dat['last_answerCode3'] = dat['last_answerCode3'].astype(int)
dat['last_answerCode4'] = dat['last_answerCode4'].astype(int)
dat['last_answerCode5'] = dat['last_answerCode5'].astype(int)
dat['last_answerCode6'] = dat['last_answerCode6'].astype(int)
dat['last_answerCode7'] = dat['last_answerCode7'].astype(int)
dat['last_answerCode8'] = dat['last_answerCode8'].astype(int)
dat['last_answerCode9'] = dat['last_answerCode9'].astype(int)
dat['last_answerCode10'] = dat['last_answerCode10'].astype(int)

In [21]:
dat.drop(['train_valid'], axis = 1, inplace = True) 

In [22]:
dat['year'] = dat['Timestamp'].dt.year # 연도 정보
dat['month'] =  dat['Timestamp'].dt.month # 월 정보
dat['day'] =  dat['Timestamp'].dt.day # 일 정보
dat['hour'] =  dat['Timestamp'].dt.hour # 시간 정보


In [23]:
dat.loc[dat['elotest'].isna(), 'elotest'] = dat.loc[dat['elotest'].isna(), 'elotest_x']

In [24]:
dat.drop(['elotest_x', 'elotest_y'], axis=1, inplace=True)

In [30]:
dat.to_csv('/opt/ml/input/data/FE_total_2_elo2.csv', index=False)

In [26]:
dat.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'elouser', 'eloitem', 'elotag', 'elotest', 'elo',
       'solve_time', 'b_category', 'test_category', 'problem_id',
       'category_st_qcut_5', 'last_answerCode', 'last_answerCode2',
       'last_answerCode3', 'last_answerCode4', 'last_answerCode5',
       'last_answerCode6', 'last_answerCode7', 'last_answerCode8',
       'last_answerCode9', 'last_answerCode10', 'year', 'month', 'day',
       'hour'],
      dtype='object')

In [27]:
sum(dat.isna().values)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [31]:
dat['elo']

0          0.981272
1          0.974009
2          0.948623
3          0.975569
4          0.962382
             ...   
2266580    0.472322
2266581    0.264328
2266582    0.577703
2266583    0.561499
2266584    0.690796
Name: elo, Length: 2518941, dtype: float64