# Transformer계열 모델에서 사용할 total data

In [1]:
import pandas as pd
import os
import random
import numpy as np
from tqdm import tqdm

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

## 1. Data load

In [2]:
data_dir = '../data/' # 경로는 상황에 맞춰서 수정해주세요!

# LOAD TRAINDATA
csv_file_path = os.path.join(data_dir, 'train_data.csv') # 데이터는 대회홈페이지에서 받아주세요 :)
train_data = pd.read_csv(csv_file_path)

# LOAD TESTDATA
test_csv_file_path = os.path.join(data_dir, 'test_data.csv')
test_data = pd.read_csv(test_csv_file_path)
test_data['answerCode'] = test_data['answerCode'].replace(-1, np.nan)

# # LEAVE LAST INTERACTION ONLY
# train_df = test_df[test_df['userID'] == test_df['userID'].shift(-1)]
# test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# df = pd.concat([df, train_df], ignore_index=True)

## 2. Outlier

In [3]:
userID = pd.unique(train_data.userID).tolist()
answer_list = train_data.groupby("userID")["answerCode"].apply(list)
answer_rate = np.array(userID)
answer_rate = answer_rate[:, np.newaxis].tolist()

In [4]:
# answer_rate -> userID, avg_acc, solved_count
for u in range(len(answer_list)):
    answer_rate[u].append(sum(answer_list[userID[u]])/len(answer_list[userID[u]]))
    answer_rate[u].append(len(answer_list[userID[u]]))

In [5]:
answer_rate_data = pd.DataFrame(data=answer_rate, index=userID, columns=['userID','answerRate','item_count'])

In [6]:
answer_rate_data

Unnamed: 0,userID,answerRate,item_count
0,0,0.630872,745
1,1,0.853162,933
2,2,0.612319,276
5,5,0.795918,833
6,6,0.442997,921
...,...,...,...
7436,7436,0.466667,15
7437,7437,0.375000,16
7438,7438,0.750000,16
7440,7440,0.400000,15


In [7]:
# 문제풀이 수가 30 이하인 유저 개수
small_solved = answer_rate_data[answer_rate_data['item_count']<=30].userID.tolist()
smallacc_bigacc = answer_rate_data[(answer_rate_data['answerRate']<=0.1) | (answer_rate_data['answerRate']>=0.95)].userID.tolist()
len(small_solved), len(smallacc_bigacc)   # userID

(594, 12)

In [8]:
print('original train data length : ', len(train_data))
for outlier in tqdm(small_solved):
    drop_idx = train_data[train_data['userID'] == outlier].index
    train_data.drop(inplace=True, axis=0, index=drop_idx)
print('processed train data length : ', len(train_data))

original train data length :  2266586


100%|██████████| 594/594 [02:10<00:00,  4.54it/s]

processed train data length :  2253503





In [9]:
print('original train data length : ', len(train_data))
for outlier in tqdm(smallacc_bigacc):
    drop_idx = train_data[train_data['userID'] == outlier].index
    train_data.drop(inplace=True, axis=0, index=drop_idx)
print('processed train data length : ', len(train_data))

original train data length :  2253503


100%|██████████| 12/12 [00:02<00:00,  5.01it/s]

processed train data length :  2253154





## 3. Feature Engineering

In [10]:
train_data['train'] = 1
test_data['train'] = 0

In [11]:
test_data['answerCode'] = test_data['answerCode'].replace(-1, np.nan)

In [12]:
total = pd.concat((train_data, test_data), axis=0)
total = total.sort_values('userID')
total.to_csv(data_dir + 'total_data.csv', index=False)

In [18]:
def feature_engineering(data_dir):
    total = pd.read_csv(os.path.join(data_dir, 'total_data.csv'), parse_dates=["Timestamp"])
    
    ## 유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    print('Timestamp')
    total['Timestamp'] = pd.to_datetime(total['Timestamp'])
    total.sort_values(by=['userID','Timestamp'], axis=0, inplace=True)#.reset_index(drop=True)
    
    ## 문제 푼 시간 재정의
    # 같은 문제 몇번째 푸는지
    print('same_item_cnt')
    total['same_item_cnt'] = total.groupby(['userID', 'assessmentItemID']).cumcount() + 1
    
    # elapsed
    print('prior elapsed')    # 직전 문제 풀이 소요 시간
    diff = total.loc[:, ['userID', 'Timestamp']].groupby('userID').diff().fillna(pd.Timedelta(seconds=0))
    diff = diff.fillna(pd.Timedelta(seconds=0))
    diff = diff['Timestamp'].apply(lambda x: x.total_seconds())
    total['prior_elapsed'] = diff
    total['prior_elapsed'] = total['prior_elapsed'].apply(lambda x: x if x < 600 and x>=0 else 0)

    print('current elapsed')    # 현재 문제 풀이 소요 시간
    diff = total.loc[:, ['userID', 'Timestamp']].groupby('userID').diff().shift(-1).fillna(method='ffill') # 마지막 문제 소요시간을 직전 문제 시간으로 채움
    diff = diff['Timestamp'].apply(lambda x: x.total_seconds())
    total['current_elapsed'] = diff
    total['current_elapsed'] = total['current_elapsed'].apply(lambda x: x if x < 600 and x>=0 else np.nan)
    total['current_elapsed'] = total['current_elapsed'].fillna(method='ffill')

    print('time class')
    total['timeClass'] = total['current_elapsed']//5 + 1

    print('day_diff')
    day_diff = total.loc[:,['userID','Timestamp']].groupby('userID').diff().fillna(pd.Timedelta(seconds=0))
    day_diff = day_diff['Timestamp'].apply(lambda x: x.days)
    total['day_diff'] = day_diff
    total['day_diff'] = total['day_diff'].apply(lambda x: x if x <= 3 and x>=0 else 4)   # 0-3은 그대로 / 나머지 1일로 클립


    # 대분류
    print('Bigcat')
    total['Bigcat'] = total['assessmentItemID'].str[2]
    total['Bigcat'] = total['Bigcat'].astype('category')



    # # 유저, assessmentItemID, same_item_cnt 구분했을 때 문제 푸는데 걸린 시간 > shift, fillna x
    # diff_shift = total.loc[:, ['userID', 'assessmentItemID', 'Timestamp', 'same_item_cnt']].groupby(['userID', 'testId', 'same_item_cnt']).diff().shift(-1)
    # diff_shift = diff_shift['Timestamp'].apply(lambda x: x.total_seconds())
    # total['solved_time_shift'] = diff_shift
    # # total['solved_time_shift'] = total.groupby(['userID', 'testId', 'same_item_cnt'])['solved_time_shift'].apply(lambda x:x.fillna(x.mean()))

    # # 맞은 사람의 문제별 평균 풀이시간
    # total = total.set_index('assessmentItemID')
    # total['Item_mean_solved_time'] = total[total['answerCode'] == 1].groupby('assessmentItemID')['solved_time_shift'].mean()
    # total = total.reset_index(drop = False)

    
    # 1. agg 값 구하기
    ## 1-1. 유저/문제/카테고리/태그별 평균 정답률
    print('acc_avg')
    total['user_avg'] = total.groupby('userID')['answerCode'].transform('mean')
    total['test_avg'] = total.groupby('testId')['answerCode'].transform('mean')
    total['item_avg'] = total.groupby('assessmentItemID')['answerCode'].transform('mean')
    total['Bigcat_avg'] = total.groupby('Bigcat')['answerCode'].transform('mean')
    total['tag_avg'] = total.groupby('KnowledgeTag')['answerCode'].transform('mean')

    ## 1-2. 유저/문제/카테고리/태그별 평균 풀이시간
    print('time_avg')
    total['user_time_avg'] = total.groupby('userID')['prior_elapsed'].transform('mean')
    total['test_time_avg'] = total.groupby('testId')['prior_elapsed'].transform('mean')
    total['item_time_avg'] = total.groupby('assessmentItemID')['prior_elapsed'].transform('mean')
    total['Bigcat_time_avg'] = total.groupby('Bigcat')['prior_elapsed'].transform('mean')
    total['tag_time_avg'] = total.groupby('KnowledgeTag')['prior_elapsed'].transform('mean')
     
    ## 1-3 유저/문제/카테고리/태그별 표준편차
    print('std')
    total['user_std'] = total.groupby('userID')['answerCode'].transform('std')
    total['test_std'] = total.groupby('testId')['answerCode'].transform('std')
    total['item_std'] = total.groupby('assessmentItemID')['answerCode'].transform('std')
    total['Bigcat_std'] = total.groupby('Bigcat')['answerCode'].transform('std')
    total['tag_std'] = total.groupby('KnowledgeTag')['answerCode'].transform('std')

    ## 1-4-1 유저별 누적 정답횟수
    print('correct_answer')
    total['user_correct_answer'] = total.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    
    ## 1-4-2 유저별 누적 정답률
    print('Cumacc')
    total['user_total_answer'] = total.groupby('userID')['answerCode'].cumcount()
    total['user_Cumacc'] = total['user_correct_answer']/total['user_total_answer'] # 누적정답률
    
    ## 1-4-3 유저의 카테고리별 누적 정답횟수/정답률
    print('Bigcat_Cumacc')
    total['user_Bigcat_correct_answer'] = total.groupby(['userID','Bigcat'])['answerCode'].transform(lambda x: x.cumsum().shift(1))
    total['user_Bigcat_total_answer'] = total.groupby(['userID','Bigcat'])['answerCode'].cumcount()
    total['user_Bigcat_Cumacc'] = total['user_Bigcat_correct_answer']/total['user_Bigcat_total_answer']

    ## 1-5. 현재 유저의 해당 문제지 평균 정답률/풀이시간
    print('current_avg')
    total['user_current_avg'] = total.groupby(['userID', 'testId', 'same_item_cnt'])['answerCode'].transform('mean')
    total['user_current_time_avg'] = total.groupby(['userID', 'testId', 'same_item_cnt'])['prior_elapsed'].transform('mean')

    # Bigcat_class
    print('Bigcat_class')
    total['Bigcat_class'] = round(total['Bigcat_avg']*10)
    
    # count
    print('count')
    total['assess_count'] = total.groupby('userID')['assessmentItemID'].cumcount()
    total['tag_count'] = total.groupby('userID')['KnowledgeTag'].cumcount()
    

    # 2. 문제 푼 순서 추가 > 상대적 순서?
    print('item_seq')
    total['item_seq'] = total.groupby(['userID', 'testId', 'same_item_cnt']).cumcount() +1
    total['item_seq'] = total['item_seq'].astype('category')

    # 2-1 유저/문제별 최근 정답횟수 
    print('retCount_correct_answer')
    total['user_retCount_correct_answer'] = total.groupby('userID')['answerCode'].transform(lambda x:x.rolling(10, min_periods=1).sum().shift(1))

    # 2-1 최근 정답률
    print('retCumacc')
    total['user_retCount'] = total.groupby('userID')['answerCode'].transform(lambda x:x.rolling(10, min_periods=1).count().shift(1))
    total['user_retCumacc'] = total['user_retCount_correct_answer']/total['user_retCount']


    # elo 위해서 다시 -1로 변환
    total['answerCode'] = total['answerCode'].fillna(0.5)

    
    total[['user_correct_answer',
           'user_Cumacc',
           'user_Bigcat_correct_answer', 'user_Bigcat_Cumacc', 
           'user_retCount_correct_answer',
           'user_retCount', 'user_retCumacc']] = total[['user_correct_answer',
                                                        'user_Cumacc',
                                                        'user_Bigcat_correct_answer', 'user_Bigcat_Cumacc', 
                                                        'user_retCount_correct_answer',
                                                        'user_retCount', 'user_retCumacc']].fillna(0)
    
    
    return total

In [19]:
total = feature_engineering(data_dir)
total

Timestamp
same_item_cnt
prior elapsed
current elapsed
time class
day_diff
Bigcat
acc_avg
time_avg
std
correct_answer
Cumacc
Bigcat_Cumacc
current_avg
Bigcat_class
count
item_seq
retCount_correct_answer
retCumacc


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,train,same_item_cnt,prior_elapsed,current_elapsed,...,user_Bigcat_Cumacc,user_current_avg,user_current_time_avg,Bigcat_class,assess_count,tag_count,item_seq,user_retCount_correct_answer,user_retCount,user_retCumacc
0,0,A060001001,A060000001,1.0,2020-03-24 00:17:11,7224,1,1,0.0,3.0,...,0.000000,1.00,6.0,7.0,0,0,1,0.0,0.0,0.0
491,0,A060001002,A060000001,1.0,2020-03-24 00:17:14,7225,1,1,3.0,8.0,...,1.000000,1.00,6.0,7.0,1,1,2,1.0,1.0,1.0
492,0,A060001003,A060000001,1.0,2020-03-24 00:17:22,7225,1,1,8.0,7.0,...,1.000000,1.00,6.0,7.0,2,2,3,2.0,2.0,1.0
493,0,A060001004,A060000001,1.0,2020-03-24 00:17:29,7225,1,1,7.0,7.0,...,1.000000,1.00,6.0,7.0,3,3,4,3.0,3.0,1.0
494,0,A060001005,A060000001,1.0,2020-03-24 00:17:36,7225,1,1,7.0,11.0,...,1.000000,1.00,6.0,7.0,4,4,5,4.0,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2513254,7439,A040130001,A040000130,0.0,2020-10-14 23:07:23,8832,0,1,0.0,18.0,...,0.727273,0.75,32.0,7.0,11,11,1,7.0,10.0,0.7
2513253,7439,A040130002,A040000130,1.0,2020-10-14 23:07:41,8832,0,1,18.0,21.0,...,0.666667,0.75,32.0,7.0,12,12,2,6.0,10.0,0.6
2513252,7439,A040130003,A040000130,1.0,2020-10-14 23:08:02,8244,0,1,21.0,89.0,...,0.692308,0.75,32.0,7.0,13,13,3,6.0,10.0,0.6
2513265,7439,A040130004,A040000130,1.0,2020-10-14 23:09:31,8244,0,1,89.0,32.0,...,0.714286,0.75,32.0,7.0,14,14,4,6.0,10.0,0.6


In [20]:
total.isna().sum()

userID                          0
assessmentItemID                0
testId                          0
answerCode                      0
Timestamp                       0
KnowledgeTag                    0
train                           0
same_item_cnt                   0
prior_elapsed                   0
current_elapsed                 0
timeClass                       0
day_diff                        0
Bigcat                          0
user_avg                        0
test_avg                        0
item_avg                        0
Bigcat_avg                      0
tag_avg                         0
user_time_avg                   0
test_time_avg                   0
item_time_avg                   0
Bigcat_time_avg                 0
tag_time_avg                    0
user_std                        0
test_std                        0
item_std                        0
Bigcat_std                      0
tag_std                         0
user_correct_answer             0
user_total_ans

In [21]:
total.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'train', 'same_item_cnt', 'prior_elapsed',
       'current_elapsed', 'timeClass', 'day_diff', 'Bigcat', 'user_avg',
       'test_avg', 'item_avg', 'Bigcat_avg', 'tag_avg', 'user_time_avg',
       'test_time_avg', 'item_time_avg', 'Bigcat_time_avg', 'tag_time_avg',
       'user_std', 'test_std', 'item_std', 'Bigcat_std', 'tag_std',
       'user_correct_answer', 'user_total_answer', 'user_Cumacc',
       'user_Bigcat_correct_answer', 'user_Bigcat_total_answer',
       'user_Bigcat_Cumacc', 'user_current_avg', 'user_current_time_avg',
       'Bigcat_class', 'assess_count', 'tag_count', 'item_seq',
       'user_retCount_correct_answer', 'user_retCount', 'user_retCumacc'],
      dtype='object')

In [22]:
total.day_diff.unique()

array([0, 2, 4, 1, 3], dtype=int64)

## 4. Elo

In [23]:
def elo(df, key):
    def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        return theta + learning_rate_theta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        return beta - learning_rate_beta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def learning_rate_theta(nb_answers):
        return max(0.3 / (1 + 0.01 * nb_answers), 0.04)

    def learning_rate_beta(nb_answers):
        return 1 / (1 + 0.05 * nb_answers)

    def probability_of_good_answer(theta, beta, left_asymptote):
        return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def estimate_parameters(answers_df, granularity_feature_name=key):
        item_parameters = {
            granularity_feature_value: {"beta": 0, "nb_answers": 0}
            for granularity_feature_value in np.unique(
                answers_df[granularity_feature_name]
            )
        }
        student_parameters = {
            student_id: {"theta": 0, "nb_answers": 0}
            for student_id in np.unique(answers_df.userID)
        }

        print("\nParameter estimation is starting...", flush=True)

        for student_id, item_id, left_asymptote, answered_correctly in tqdm(
            zip(
                answers_df['userID'].values,
                answers_df[granularity_feature_name].values,
                answers_df['left_asymptote'].values,
                answers_df['answerCode'].values,
            ),
            total=len(answers_df),
        ):
            theta = student_parameters[student_id]["theta"]
            beta = item_parameters[item_id]["beta"]

            item_parameters[item_id]["beta"] = get_new_beta(
                answered_correctly,
                beta,
                left_asymptote,
                theta,
                item_parameters[item_id]["nb_answers"],
            )
            student_parameters[student_id]["theta"] = get_new_theta(
                answered_correctly,
                beta,
                left_asymptote,
                theta,
                student_parameters[student_id]["nb_answers"],
            )

            item_parameters[item_id]["nb_answers"] += 1
            student_parameters[student_id]["nb_answers"] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.\n")
        return student_parameters, item_parameters

    def gou_func(theta, beta):
        return 1 / (1 + np.exp(-(theta - beta)))

    df["left_asymptote"] = 0

    print('======================================')
    print(f"Dataset of shape {df.shape}")
    # print(f"Columns are {list(df.columns)}")

    student_parameters, item_parameters = estimate_parameters(df)

    prob = [
        gou_func(student_parameters[student]["theta"], item_parameters[item]["beta"])
        for student, item in zip(df.userID.values, df[key].values)
    ]
    if key == 'assessmentItemID':
        df['elo'] = prob
    else:
        df[f"{key}_elo"] = prob
    df = df.drop(columns=["left_asymptote"])

    return df

In [24]:
total['userID'] = total['userID'].astype(np.int64)
total['answerCode'] = total['answerCode'].astype(np.int64)
total['KnowledgeTag'] = total['KnowledgeTag'].astype(np.int64)

In [25]:
total = total.sort_values(by=["userID", "Timestamp"]).reset_index(drop=True)
print('assessmentItemID_elo...')
total = elo(total, 'assessmentItemID')
print('KnowledgeTag_elo...')
total = elo(total, 'KnowledgeTag')
print('Bigcat_elo...')
total = elo(total, 'Bigcat')


assessmentItemID_elo...
Dataset of shape (2513268, 44)

Parameter estimation is starting...


100%|██████████| 2513268/2513268 [00:17<00:00, 146139.39it/s]


Theta & beta estimations on assessmentItemID are completed.

KnowledgeTag_elo...
Dataset of shape (2513268, 45)

Parameter estimation is starting...


100%|██████████| 2513268/2513268 [00:17<00:00, 145942.90it/s]


Theta & beta estimations on KnowledgeTag are completed.

Bigcat_elo...
Dataset of shape (2513268, 46)

Parameter estimation is starting...


100%|██████████| 2513268/2513268 [00:16<00:00, 153971.26it/s]


Theta & beta estimations on Bigcat are completed.



In [26]:
# save total_data
total.to_csv(data_dir + 'total_data.csv', index=False)