In [6]:
import pandas as pd
import numpy as np
import math
import time

from tqdm import tqdm

# Narrow integer dtypes for the id / label columns to reduce memory use.
dtype = {"userID": "int16", "answerCode": "int8", "KnowledgeTag": "int16"}

DATA_PATH = "/opt/ml/input/data/total_data.csv"

# Parse the CSV once and order every user's interactions chronologically.
df = pd.read_csv(DATA_PATH, dtype=dtype, parse_dates=["Timestamp"])
df = df.sort_values(by=["userID", "Timestamp"]).reset_index(drop=True)

# `train` starts out identical to `df`; take an independent (deep) copy rather
# than reading and sorting the same file a second time.
train = df.copy()

In [7]:
# ELO
def ELO_function(df):
    """Add an Elo-style predicted-correctness column to ``df``.

    A single pass over the rows of ``df`` (in their current order) maintains
    one ability value (theta) per ``userID`` and one difficulty value (beta)
    per ``assessmentItemID``. Both start at 0 and are nudged after every row by
    the gap between the observed ``answerCode`` and the predicted probability
    of a correct answer, using step sizes that shrink as more answers have
    been seen.

    Side effects: ``df`` is modified in place. A constant ``left_asymptote``
    column (0) and an ``elo_prob`` column are added, and progress messages are
    printed.

    ``elo_prob`` is computed from the *final* theta / beta values, i.e. after
    every row (including the row itself) has contributed to the estimates.

    NOTE(review): the update treats ``answerCode`` as the observed outcome; if
    the frame can contain values other than 0/1 (e.g. a placeholder for
    unlabeled rows), those rows would still drive the updates -- confirm
    against the data.

    Returns:
        The same DataFrame object, with the two extra columns.
    """

    def logistic(z):
        # Standard sigmoid.
        return 1 / (1 + np.exp(-z))

    def expected_score(ability, difficulty, guess_floor):
        # Probability of a correct answer, bounded below by `guess_floor`.
        return guess_floor + (1 - guess_floor) * logistic(ability - difficulty)

    def ability_step(n_seen):
        # Student step size: decays with the number of answers, floored at 0.04.
        return max(0.3 / (1 + 0.01 * n_seen), 0.04)

    def difficulty_step(n_seen):
        # Item step size: decays with the number of answers, no floor.
        return 1 / (1 + 0.05 * n_seen)

    def fit(answers_df, granularity_feature_name="assessmentItemID"):
        # One state record per distinct item / student, all starting at zero.
        items = {
            key: {"beta": 0, "nb_answers": 0}
            for key in np.unique(answers_df[granularity_feature_name])
        }
        students = {
            uid: {"theta": 0, "nb_answers": 0}
            for uid in np.unique(answers_df.userID)
        }

        print("Parameter estimation is starting...")

        rows = zip(
            answers_df.userID.values,
            answers_df[granularity_feature_name].values,
            answers_df.left_asymptote.values,
            answers_df.answerCode.values,
        )
        for uid, key, guess_floor, outcome in tqdm(rows):
            student = students[uid]
            item = items[key]
            theta = student["theta"]
            beta = item["beta"]

            # Both updates use the same pre-update theta / beta, so the
            # prediction error only needs to be evaluated once per row.
            surprise = outcome - expected_score(theta, beta, guess_floor)

            item["beta"] = beta - difficulty_step(item["nb_answers"]) * surprise
            student["theta"] = theta + ability_step(student["nb_answers"]) * surprise

            item["nb_answers"] += 1
            student["nb_answers"] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
        return students, items

    # No guessing floor: the probability model reduces to a plain sigmoid.
    df["left_asymptote"] = 0

    print(f"Dataset of shape {df.shape}")
    print(f"Columns are {list(df.columns)}")

    students, items = fit(df)

    # Score every row with the final ability / difficulty estimates.
    df["elo_prob"] = [
        logistic(students[uid]["theta"] - items[key]["beta"])
        for uid, key in zip(df.userID.values, df.assessmentItemID.values)
    ]

    return df

In [8]:
def _answer_stats(df, key, label):
    """Return per-`key` mean and sum of `answerCode` as `<label>_mean` / `<label>_sum`."""
    stats = df.groupby(key)["answerCode"].agg(["mean", "sum"]).reset_index()
    stats.columns = [key, f"{label}_mean", f"{label}_sum"]
    return stats


def _mean_elapsed(df, key, column):
    """Return the per-`key` mean of `elap_time`, stored under the name `column`."""
    stats = df.groupby(key)["elap_time"].mean().reset_index()
    stats.columns = [key, column]
    return stats


def _prior_correct(df, keys):
    """Return, per row, the sum of `answerCode` over earlier rows of the same `keys` group."""
    return (
        df.groupby(keys)["answerCode"]
        .transform(lambda s: s.cumsum().shift(1))
        .fillna(0)
    )


def feature_engineering(df):
    """Derive the model features from the raw interaction log.

    Expects the columns ``userID``, ``assessmentItemID``, ``testId``,
    ``answerCode``, ``KnowledgeTag`` and a datetime ``Timestamp``. The input
    frame is not modified; a new frame is returned.

    Features added, in column order:

    * ``left_asymptote`` / ``elo_prob`` -- from ``ELO_function``.
    * ``hour``, ``dow`` -- hour of day and day of week of ``Timestamp``.
    * ``elap_time`` -- seconds until the same user's next row; 0 for a user's
      last row and for gaps that are negative or >= 700 seconds.
    * ``prefix`` / ``mid`` / ``suffix`` -- integers parsed from ``testId``
      characters 1-3 (integer-divided by 10), the last three characters of
      ``testId`` and the last three characters of ``assessmentItemID``.
    * ``<group>_mean`` / ``<group>_sum`` -- mean and sum of ``answerCode`` per
      test, tag, item, suffix, hour and day of week (over the whole frame).
    * ``<group>_elp`` / ``_elp_o`` / ``_elp_x`` -- mean ``elap_time`` per tag,
      item and suffix over all rows, rows with ``answerCode == 1`` and rows
      with ``answerCode == 0``.
    * ``user_correct_answer`` / ``user_total_answer`` / ``user_acc`` -- the
      user's running counts and accuracy over *earlier* rows only.
    * ``Prefix_o`` / ``PrefixCount`` / ``PrefixAcc`` -- the same, per
      (user, prefix).
    * ``PrefixElp`` / ``PrefixMElp`` -- running ``elap_time`` total per
      (user, prefix), and that total divided by ``max(PrefixCount, 1)``.
    * ``suffix_count`` / ``tag_count`` -- per test: largest suffix and number
      of distinct tags.
    * ``RepeatedTime`` -- ``log(1 + seconds)`` since the user's previous row
      with the same prefix; 0 for the first such row.
    * ``prior_KnowledgeTag_frequency`` -- how many earlier rows of the same
      user carry the same tag.

    Finally ``testId`` is replaced by an integer code.
    """
    # Order each user's interactions chronologically. Sorting out of place
    # keeps the caller's frame untouched (ELO_function adds columns to whatever
    # frame it is handed).
    df = df.sort_values(by=["userID", "Timestamp"])
    df = ELO_function(df)

    df["hour"] = df["Timestamp"].dt.hour
    df["dow"] = df["Timestamp"].dt.dayofweek

    # Time spent on a row = time until the same user's next row. diff(-1)
    # yields current - next, hence the negation.
    elapsed = -(
        df.groupby("userID")["Timestamp"]
        .diff(periods=-1)
        .fillna(pd.Timedelta(seconds=0))
        .dt.total_seconds()
    )
    # Gaps outside [0, 700) seconds are replaced by 0.
    df["elap_time"] = elapsed.where((elapsed >= 0) & (elapsed < 700), 0)

    df["prefix"] = df["testId"].str[1:4].astype(int) // 10
    df["mid"] = df["testId"].str[-3:].astype(int)
    df["suffix"] = df["assessmentItemID"].str[-3:].astype(int)

    # Mean / sum of answerCode per grouping. Everything is computed before any
    # merge; the left merges neither add nor drop rows.
    answer_groups = [
        ("testId", "test"),
        ("KnowledgeTag", "tag"),
        ("assessmentItemID", "ass"),
        ("suffix", "suffix"),
        ("hour", "hour"),
        ("dow", "dow"),
    ]
    answer_stats = [(key, _answer_stats(df, key, label)) for key, label in answer_groups]
    for key, stats in answer_stats:
        df = pd.merge(df, stats, on=[key], how="left")

    # Mean elapsed time per grouping, overall and split by answerCode 1 / 0.
    solved = df[df["answerCode"] == 1]
    missed = df[df["answerCode"] == 0]
    for key, label in [("KnowledgeTag", "tag"), ("assessmentItemID", "ass"), ("suffix", "suffix")]:
        elapsed_stats = [
            _mean_elapsed(df, key, f"{label}_elp"),
            _mean_elapsed(solved, key, f"{label}_elp_o"),
            _mean_elapsed(missed, key, f"{label}_elp_x"),
        ]
        for stats in elapsed_stats:
            df = pd.merge(df, stats, on=[key], how="left")

    # Per-user history up to (not including) the current row.
    df["user_correct_answer"] = _prior_correct(df, "userID")
    df["user_total_answer"] = df.groupby("userID")["answerCode"].cumcount()
    # 0 / 0 on a user's first row becomes NaN and is mapped to 0.
    df["user_acc"] = (df["user_correct_answer"] / df["user_total_answer"]).fillna(0)

    # The same history, per (user, prefix).
    df["Prefix_o"] = _prior_correct(df, ["userID", "prefix"])
    df["PrefixCount"] = df.groupby(["userID", "prefix"]).cumcount()
    df["PrefixAcc"] = (df["Prefix_o"] / df["PrefixCount"]).fillna(0)
    # Note: this running total includes the current row, whereas PrefixCount
    # counts earlier rows only.
    df["PrefixElp"] = (
        df.groupby(["userID", "prefix"])["elap_time"]
        .transform(lambda s: s.cumsum())
        .fillna(0)
    )
    # cumcount is never negative, so clip(lower=1) only replaces the 0 of the
    # first row in each group and avoids dividing by zero.
    df["PrefixMElp"] = df["PrefixElp"] / df["PrefixCount"].clip(lower=1)

    per_test = (
        df.groupby("testId")
        .agg({"suffix": "max", "KnowledgeTag": "nunique"})
        .reset_index()
    )
    per_test.columns = ["testId", "suffix_count", "tag_count"]
    df = pd.merge(df, per_test, on="testId", how="left")

    # Seconds since the same user's previous row with the same prefix. Rows are
    # still ordered by (userID, Timestamp) because left merges keep the order
    # of the left frame, so a grouped diff gives the gap directly; the first
    # row of each (user, prefix) group has no predecessor and gets 0.
    # The previous implementation masked the gap with a flag built from the
    # `prefix` column instead of the prefix-changed flag, which zeroed the
    # feature on every row with a non-zero prefix.
    repeat_gap = (
        df.groupby(["userID", "prefix"])["Timestamp"]
        .diff()
        .fillna(pd.Timedelta(seconds=0))
        .dt.total_seconds()
    )
    df["RepeatedTime"] = np.log1p(repeat_gap)

    df["prior_KnowledgeTag_frequency"] = df.groupby(["userID", "KnowledgeTag"]).cumcount()

    # Integer code: characters 1-3 of testId followed by its third-from-last
    # character.
    # NOTE(review): `mid` above uses the last three characters (`[-3:]`); this
    # takes a single character (`[-3]`). Kept as-is -- confirm which is intended.
    df["testId"] = (df["testId"].str[1:4] + df["testId"].str[-3]).astype(int)

    return df

# Build the feature table, zero-fill whatever nulls remain (a first-pass
# treatment of missing values), then persist the result as CSV.
train = feature_engineering(train).fillna(0)
train.to_csv("/opt/ml/input/data/FE_data.csv", index=False)

Dataset of shape (2526700, 8)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'dataset', 'left_asymptote']
Parameter estimation is starting...


2526700it [00:19, 132965.18it/s]


Theta & beta estimations on assessmentItemID are completed.
