In [93]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# Root directory holding the raw competition CSVs.
path = '/opt/ml/input/data/'

train = pd.read_csv(path + 'train_data.csv', parse_dates=["Timestamp"])
test = pd.read_csv(path + 'test_data.csv', parse_dates=["Timestamp"])

# ignore_index=True gives every row a unique label. Without it both files keep
# their own 0..n labels, so label-based selection (.loc[index]) further down
# would silently match rows from both files.
dat = pd.concat([train, test], axis = 0, ignore_index=True)
# Order each user's interactions chronologically.
dat = dat.sort_values(by = ['userID', 'Timestamp'])


In [94]:
# Rows with a known answer (answerCode >= 0) versus rows with a negative answerCode.
# Explicit copies: later cells add columns to these frames, and assigning into a
# slice of `dat` raises SettingWithCopyWarning.
_train = dat[dat['answerCode'] >= 0].copy()
_test = dat[dat['answerCode'] < 0].copy()

In [None]:
# Reworked validation split: the previous implementation took about a minute,
# this one finishes in well under a second.
# The last labelled row of every user becomes the validation row.
# A positional boolean mask is used instead of `.loc[<index labels>]` so that
# duplicated index labels can never mark additional rows as validation.
is_last_row_of_user = ~_train.duplicated(subset='userID', keep='last')
_train = _train.copy()
_train['train_valid'] = 0
_train.loc[is_last_row_of_user, 'train_valid'] = -1
# Copies keep later column assignments from touching a slice of another frame.
_valid = _train[_train['train_valid'] == -1].copy()
_train = _train[_train['train_valid'] == 0].copy()

In [96]:
def elo(df):
    """Attach Elo-style ability/difficulty estimates to ``df`` and return it.

    ``df`` must provide the columns ``userID``, ``assessmentItemID``,
    ``KnowledgeTag``, ``testId`` and ``answerCode``. Rows are processed in
    their current order, so the caller decides the chronology.

    The frame is modified in place (and also returned) with these columns,
    added in this order:

    * ``left_asymptote`` - constant 0, so the predicted probability reduces to
      ``sigmoid(theta - beta)``; the caller is expected to drop it afterwards.
    * ``elouser`` - final theta of the row's user from the item-level pass.
    * ``eloitem`` - final beta of the row's ``assessmentItemID``.
    * ``elotag``  - final beta of the row's ``KnowledgeTag``.
    * ``elotest`` - final beta of the row's ``testId``.

    The thetas estimated during the tag-level and test-level passes are
    discarded; only the betas of those passes are kept.
    """

    def learning_rate_theta(nb_answers):
        # Decays with the number of answers seen, floored at 0.04.
        return max(0.3 / (1 + 0.01 * nb_answers), 0.04)

    def learning_rate_beta(nb_answers):
        # Decays with the number of answers seen, no floor.
        return 1 / (1 + 0.05 * nb_answers)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def probability_of_good_answer(theta, beta, left_asymptote):
        return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)

    def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"):
        """Run one sequential pass over ``answers_df``.

        Returns ``(student_parameters, item_parameters)``: dicts keyed by user
        id / value of ``granularity_feature_name``, each holding the running
        estimate (``theta`` or ``beta``) and the ``nb_answers`` seen so far.
        """
        item_parameters = {
            key: {"beta": 0, "nb_answers": 0}
            for key in np.unique(answers_df[granularity_feature_name])
        }
        student_parameters = {
            student_id: {"theta": 0, "nb_answers": 0}
            for student_id in np.unique(answers_df.userID)
        }

        print("Parameter estimation is starting...", flush=True)

        rows = zip(
            answers_df.userID.values,
            answers_df[granularity_feature_name].values,
            answers_df.left_asymptote.values,
            answers_df.answerCode.values,
        )
        for student_id, item_id, left_asymptote, answered_correctly in tqdm(
            rows, total=len(answers_df)
        ):
            student = student_parameters[student_id]
            item = item_parameters[item_id]
            theta = student["theta"]
            beta = item["beta"]

            # Both updates use the prediction made from the *pre-update* theta
            # and beta, so the prediction error is computed once and shared.
            error = answered_correctly - probability_of_good_answer(
                theta, beta, left_asymptote
            )
            item["beta"] = beta - learning_rate_beta(item["nb_answers"]) * error
            student["theta"] = theta + learning_rate_theta(student["nb_answers"]) * error

            item["nb_answers"] += 1
            student["nb_answers"] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
        return student_parameters, item_parameters

    df["left_asymptote"] = 0

    print(f"Dataset of shape {df.shape}")
    print(f"Columns are {list(df.columns)}")

    # Item-level pass: the only pass whose student thetas are kept.
    student_parameters, item_parameters = estimate_parameters(df)

    # theta represents the global level of the student
    df["elouser"] = [
        student_parameters[student]["theta"] for student in df.userID.values
    ]
    # beta represents the difficulty of the item
    df["eloitem"] = [
        item_parameters[item]["beta"] for item in df.assessmentItemID.values
    ]

    # Coarser granularities: keep only the beta (difficulty) of each group.
    for feature_name, output_column in (
        ("KnowledgeTag", "elotag"),
        ("testId", "elotest"),
    ):
        _, group_parameters = estimate_parameters(
            df, granularity_feature_name=feature_name
        )
        df[output_column] = [
            group_parameters[group]["beta"] for group in df[feature_name].values
        ]

    return df


In [None]:
# Apply the Elo function to the training rows, then discard the helper
# column ("left_asymptote") that it leaves behind.
_train = elo(_train).drop(columns=["left_asymptote"])


In [None]:
# Display the training frame to check the Elo columns added by elo().
_train

In [None]:
# One-row-per-key lookup tables (first occurrence of each key in _train),
# used to carry the Elo values over to the other splits.
users = _train[['userID', 'elouser']].drop_duplicates(subset=['userID'])
items = _train[['assessmentItemID', 'eloitem']].drop_duplicates(subset=['assessmentItemID'])
tag = _train[['KnowledgeTag', 'elotag']].drop_duplicates(subset=['KnowledgeTag'])
testid = _train[['testId', 'elotest']].drop_duplicates(subset=['testId'])

In [None]:
# Write every lookup table to its CSV file, without the index column.
for lookup_frame, csv_path in (
    (users, '/opt/ml/input/data/FE/users_elo.csv'),
    (items, '/opt/ml/input/data/FE/items_elo.csv'),
    (tag, '/opt/ml/input/data/FE/tag_elo.csv'),
    (testid, '/opt/ml/input/data/FE/testid_elo.csv'),
):
    lookup_frame.to_csv(csv_path, index=False)

In [None]:
# Left-join the Elo lookups onto the test rows; keys that are absent from the
# lookup tables are left as NaN.
_test = (
    _test
    .merge(users, on='userID', how='left')
    .merge(items, on='assessmentItemID', how='left')
    .merge(tag, on='KnowledgeTag', how='left')
    .merge(testid, on='testId', how='left')
)
_test

In [None]:
_valid = pd.merge(_valid, users, on='userID', how='left')
_valid = pd.merge(_valid, items, on='assessmentItemID', how='left')
_valid = pd.merge(_test, tag, on='KnowledgeTag', how='left')
_valid = pd.merge(_test, testid, on='testId', how='left')
_valid

In [None]:
dat = pd.concat([_train,_valid,_test])
dat = dat.sort_values(by = ['userID', 'Timestamp'])

In [None]:
dat

In [None]:
dat['elo'] = dat.apply(lambda x : 1 / (1 + (np.exp(-(x['elouser']-x['eloitem'])))),axis=1)

In [None]:
diff = dat.loc[:, ["userID", "Timestamp"]].groupby("userID").diff().fillna(pd.Timedelta(seconds=0))
diff['Timestamp'] = diff['Timestamp'].apply(pd.Timedelta.total_seconds)
#diff['Timestamp'] = diff['Timestamp'].apply(lambda x: 0 if x >600 or x<0 else x)
dat['solve_time'] = diff['Timestamp']

In [None]:
# 시험 난이도(?)로 추청되는 특성 따로 분류
dat['b_category'] = dat['assessmentItemID'].str[2]
# 시험지로 추청되는 특성 따로 분류. 뒤에 것이 좋을 것 같은데 일단 제출을 한 것으로 써놓음.
dat['test_category'] = dat['assessmentItemID'].str[2] + dat['assessmentItemID'].str[4:7] # dat['assessmentItemID'].str[4:7]
dat['problem_id'] = dat['assessmentItemID'].str[-3:]

In [None]:
diff['Timestamp'] = dat['solve_time']
diff['Timestamp'] = pd.qcut(diff['Timestamp'],5)
diff['Timestamp'] = diff['Timestamp'].astype("str")
dat['category_st_qcut_5'] = diff['Timestamp']

In [None]:
dat['solve_time'] = dat['solve_time'].astype(int)

In [None]:
dat['last_answerCode'] = dat.groupby("userID")['answerCode'].shift(1).fillna(1)
dat['last_answerCode2'] = dat.groupby("userID")['answerCode'].shift(2).fillna(1)
dat['last_answerCode3'] = dat.groupby("userID")['answerCode'].shift(3).fillna(1)
dat['last_answerCode4'] = dat.groupby("userID")['answerCode'].shift(4).fillna(1)
dat['last_answerCode5'] = dat.groupby("userID")['answerCode'].shift(5).fillna(1)
dat['last_answerCode6'] = dat.groupby("userID")['answerCode'].shift(6).fillna(1)
dat['last_answerCode7'] = dat.groupby("userID")['answerCode'].shift(7).fillna(1)
dat['last_answerCode8'] = dat.groupby("userID")['answerCode'].shift(8).fillna(1)
dat['last_answerCode9'] = dat.groupby("userID")['answerCode'].shift(9).fillna(1)
dat['last_answerCode10'] = dat.groupby("userID")['answerCode'].shift(10).fillna(1)

In [None]:
dat['last_answerCode'] = dat['last_answerCode'].astype(int)
dat['last_answerCode2'] = dat['last_answerCode2'].astype(int)
dat['last_answerCode3'] = dat['last_answerCode3'].astype(int)
dat['last_answerCode4'] = dat['last_answerCode4'].astype(int)
dat['last_answerCode5'] = dat['last_answerCode5'].astype(int)
dat['last_answerCode6'] = dat['last_answerCode6'].astype(int)
dat['last_answerCode7'] = dat['last_answerCode7'].astype(int)
dat['last_answerCode8'] = dat['last_answerCode8'].astype(int)
dat['last_answerCode9'] = dat['last_answerCode9'].astype(int)
dat['last_answerCode10'] = dat['last_answerCode10'].astype(int)

In [None]:
dat.drop(['train_valid'], axis = 1, inplace = True) 

In [None]:
dat['year'] = dat['Timestamp'].dt.year # 연도 정보
dat['month'] =  dat['Timestamp'].dt.month # 월 정보
dat['day'] =  dat['Timestamp'].dt.day # 일 정보
dat['hour'] =  dat['Timestamp'].dt.hour # 시간 정보


In [None]:
dat.loc[dat['elotest'].isna(), 'elotest'] = dat.loc[dat['elotest'].isna(), 'elotest_x']

In [None]:
dat.drop(['elotest_x', 'elotest_y'], axis=1, inplace=True)

In [None]:
# Persist the combined feature table as CSV, without the index column.
dat.to_csv('/opt/ml/input/data/FE_total_2_elo2.csv', index=False)

In [None]:
# List the columns of the final feature table.
dat.columns

In [None]:
# Number of missing values per column, labelled by column name.
# (The builtin sum() over the 2-D array added the rows one by one in Python
# and lost the column labels.)
dat.isna().sum()

In [None]:
# Display the 'elo' column.
dat['elo']