## 1.라이브러리 세팅

In [2]:
import numpy as np
import pandas as pd
import random
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

import pdb
import wandb
import seaborn as sns

from sklearn.model_selection import KFold, GroupKFold


In [3]:
# wandb.login()

In [4]:
%%time
dtype = {
    'userID': 'int16',
    'answerCode': 'int8',
    'KnowledgeTag': 'int16'
}   

# 데이터 경로 맞춰주세요!
# 혹시 코랩환경을 사용하신다면 왼쪽 폴더모양 아이콘을 눌러 "train_data.csv"를 드래그&드롭으로 업로드한 후 사용해주세요
DATA_PATH = '/opt/ml/input/data/'
df = pd.read_csv(DATA_PATH+'train_data.csv' , dtype=dtype, parse_dates=['Timestamp'])

CPU times: user 2.52 s, sys: 740 ms, total: 3.26 s
Wall time: 3.3 s


In [5]:
test_df = pd.read_csv(DATA_PATH+'test_data.csv' , dtype=dtype, parse_dates=['Timestamp'])
test_df = test_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
df['dataset'] = 1 #train_data
test_df['dataset'] = 2 #test_data
# test_df = test_df[(test_df['dataset'] == 2)&(test_df['answerCode'] != -1)] #테스트셋에 대해서는 제거
all_df = pd.concat([df,test_df],axis=0)

## 2. 피쳐 엔지니어링

In [6]:
day_dict = {'Tuesday': 0,
 'Thursday': 1,
 'Monday': 2,
 'Saturday': 3,
 'Friday': 4,
 'Wednesday': 5,
 'Sunday': 6}

def feature_engineering(df):
    df = df.copy()
    #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    df.sort_values(by=['userID','Timestamp'], inplace=True)
    
    #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer']/df['user_total_answer']
    
    # 문제 푸는데 걸린 시간
    # 10분이상 시간소요는 새로운 문제집을 시작한 것으로 판단
    diff = df.loc[:, ['userID', 'Timestamp']].groupby('userID').diff().fillna(pd.Timedelta(seconds=0))
    diff = diff.fillna(pd.Timedelta(seconds=0))
    diff = diff['Timestamp'].apply(lambda x: x.total_seconds())
    df['elapsed'] = diff
    df['elapsed'] = df['elapsed'].apply(lambda x: 0 if x>= 600 else x)
    # 문제 푸는데 걸린 누적 시간
    df['elapsed_cumsum'] = df.groupby('userID')['elapsed'].cumsum()
    #문제 푸는데 걸린 시간의 중앙값
    elapsed_med = df.groupby('userID')['elapsed'].agg(['median'])
    elapsed_med.columns = ['elapsed_med']
    #시간 쪼개기 + 요일
    df['month'] = pd.to_datetime(df.Timestamp).dt.month
    df['day'] = pd.to_datetime(df.Timestamp).dt.day
    df['hour'] = pd.to_datetime(df.Timestamp).dt.hour
    df['dayname'] = pd.to_datetime(df.Timestamp).dt.day_name().map(day_dict)
    
    #대분류/유저
    df['bigclass'] = df['testId'].apply(lambda x : x[2]).astype(int)
    # 유저별 대분류 문제 풀이시간
    bigclasstime = df.groupby(['userID','bigclass']).agg({'elapsed' : 'mean'}).reset_index()

    # 유저별 대분류 문제 횟수
    bigclassCount = df.groupby(['userID','bigclass'])['answerCode'].count().reset_index()
    # 유저별 대분류 문제 정답 횟수
    bigclasssum = df.groupby(['userID','bigclass'])['answerCode'].sum().reset_index()
    v = bigclasssum['answerCode'].values/bigclassCount['answerCode'].values
    bigclasstime['bigclass_acc'] = v
    bigclasstime['bigclass_count']  = bigclassCount['answerCode'].values
    bigclasstime['bigclass_sum'] = bigclasssum['answerCode'].values
    bigclass = bigclasstime.rename(columns = {'elapsed' : 'bigclasstime'})
    df = pd.merge(df,bigclass, on = ['userID','bigclass'],how = 'left')


    
    # testId와 KnowledgeTag의 전체 정답률은 한번에 계산
    # 아래 데이터는 제출용 데이터셋에 대해서도 재사용
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_t.columns = ["test_mean", "test_std", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_std', 'tag_sum']

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    df = pd.merge(df, elapsed_med, on =['userID'], how = 'left')
    df.fillna(0,inplace = True)
    # df.sort_values(by=['userID','Timestamp'], inplace=True)
    return df

In [7]:
#elo 함수
def elo(df,col):
    def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        return theta + learning_rate_theta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        return beta - learning_rate_beta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def learning_rate_theta(nb_answers):
        return max(0.3 / (1 + 0.01 * nb_answers), 0.04)

    def learning_rate_beta(nb_answers):
        return 1 / (1 + 0.05 * nb_answers)

    def probability_of_good_answer(theta, beta, left_asymptote):
        return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def estimate_parameters(answers_df, granularity_feature_name=col):
        item_parameters = {
            granularity_feature_value: {"beta": 0, "nb_answers": 0}
            for granularity_feature_value in np.unique(
                answers_df[granularity_feature_name]
            )
        }
        student_parameters = {
            student_id: {"theta": 0, "nb_answers": 0}
            for student_id in np.unique(answers_df.userID)
        }

        print("Parameter estimation is starting...", flush=True)

        for student_id, item_id, left_asymptote, answered_correctly in tqdm(
            zip(
                answers_df.userID.values,
                answers_df[granularity_feature_name].values,
                answers_df.left_asymptote.values,
                answers_df.answerCode.values,
            ),
            total=len(answers_df),
        ):
            theta = student_parameters[student_id]["theta"]
            beta = item_parameters[item_id]["beta"]

            item_parameters[item_id]["beta"] = get_new_beta(
                answered_correctly,
                beta,
                left_asymptote,
                theta,
                item_parameters[item_id]["nb_answers"],
            )
            student_parameters[student_id]["theta"] = get_new_theta(
                answered_correctly,
                beta,
                left_asymptote,
                theta,
                student_parameters[student_id]["nb_answers"],
            )

            item_parameters[item_id]["nb_answers"] += 1
            student_parameters[student_id]["nb_answers"] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
        return student_parameters, item_parameters

    def gou_func(theta, beta):
        return 1 / (1 + np.exp(-(theta - beta)))

    df["left_asymptote"] = 0

    print(f"Dataset of shape {df.shape}")
    print(f"Columns are {list(df.columns)}")

    student_parameters, item_parameters = estimate_parameters(df)

    prob = [
        gou_func(student_parameters[student]["theta"], item_parameters[item]["beta"])
        for student, item in zip(df.userID.values, df[col].values)
    ]

    df["elo_"+col] = prob

    return df

In [8]:
df2 = feature_engineering(all_df)
for col in ['assessmentItemID','testId','KnowledgeTag']:
    df2 = elo(df2,col)
df2.head()

Dataset of shape (2526700, 29)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'dataset', 'user_correct_answer', 'user_total_answer', 'user_acc', 'elapsed', 'elapsed_cumsum', 'month', 'day', 'hour', 'dayname', 'bigclass', 'bigclasstime', 'bigclass_acc', 'bigclass_count', 'bigclass_sum', 'test_mean', 'test_std', 'test_sum', 'tag_mean', 'tag_std', 'tag_sum', 'elapsed_med', 'left_asymptote']
Parameter estimation is starting...


100%|██████████| 2526700/2526700 [00:17<00:00, 147839.85it/s]


Theta & beta estimations on assessmentItemID are completed.
Dataset of shape (2526700, 30)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'dataset', 'user_correct_answer', 'user_total_answer', 'user_acc', 'elapsed', 'elapsed_cumsum', 'month', 'day', 'hour', 'dayname', 'bigclass', 'bigclasstime', 'bigclass_acc', 'bigclass_count', 'bigclass_sum', 'test_mean', 'test_std', 'test_sum', 'tag_mean', 'tag_std', 'tag_sum', 'elapsed_med', 'left_asymptote', 'elo_assessmentItemID']
Parameter estimation is starting...


100%|██████████| 2526700/2526700 [00:16<00:00, 153685.14it/s]


Theta & beta estimations on testId are completed.
Dataset of shape (2526700, 31)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'dataset', 'user_correct_answer', 'user_total_answer', 'user_acc', 'elapsed', 'elapsed_cumsum', 'month', 'day', 'hour', 'dayname', 'bigclass', 'bigclasstime', 'bigclass_acc', 'bigclass_count', 'bigclass_sum', 'test_mean', 'test_std', 'test_sum', 'tag_mean', 'tag_std', 'tag_sum', 'elapsed_med', 'left_asymptote', 'elo_assessmentItemID', 'elo_testId']
Parameter estimation is starting...


100%|██████████| 2526700/2526700 [00:16<00:00, 151449.21it/s]


Theta & beta estimations on KnowledgeTag are completed.


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,dataset,user_correct_answer,user_total_answer,user_acc,...,test_std,test_sum,tag_mean,tag_std,tag_sum,elapsed_med,left_asymptote,elo_assessmentItemID,elo_testId,elo_KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,1,0.0,0,0.0,...,0.212422,1429,0.957333,0.202239,718,14.0,0,0.980768,0.962544,0.96631
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1,1.0,1,1.0,...,0.212422,1429,0.917067,0.275818,3439,14.0,0,0.973315,0.962544,0.93246
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,1,2.0,2,1.0,...,0.212422,1429,0.917067,0.275818,3439,14.0,0,0.947292,0.962544,0.93246
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,1,3.0,3,1.0,...,0.212422,1429,0.917067,0.275818,3439,14.0,0,0.974914,0.962544,0.93246
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,1,4.0,4,1.0,...,0.212422,1429,0.917067,0.275818,3439,14.0,0,0.961391,0.962544,0.93246


## 3. Train/Test 데이터 셋 분리

## 3. Train_Test 분리

In [9]:
### train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
def custom_train_test_split(df, ratio=0.7, split=True):
    
    """
    사용자 기준으로 train_test_split
    """
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    # users.sort(key=lambda x: x[0])
    random.shuffle(users)
    
    max_train_data_len = ratio*len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)


    train = df[df['userID'].isin(user_ids)]
    test = df[df['userID'].isin(user_ids) == False]

    #test데이터셋은 각 유저의 마지막 interaction만 추출
    test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

In [12]:
# 유저별 분리
train_df = df2[df2['dataset'] == 1]
random.seed(42)
train, test = custom_train_test_split(train_df)

# 사용할 Feature 설정
FEATS = [
        'KnowledgeTag', 
         'user_correct_answer', 
         'user_total_answer', 
         'user_acc',
         'test_mean', 
         'test_sum', 
         'tag_mean',
         'tag_sum',
         'elapsed',
         'elapsed_cumsum',
         'month',
         'day',
         'hour',
         'dayname',
         'elapsed_med',
         'bigclass',
         'bigclasstime',
         'bigclass_acc',
         'bigclass_sum',
         'bigclass_count',
        'elo_assessmentItemID',
        'elo_testId',
        'elo_KnowledgeTag'
             ]
# X, y 값 분리
# y_train = train['answerCode']
# train = train.drop(['answerCode'], axis=1)

# train = train[train['userID'] != train['userID'].shift(-1)]
y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1)

y_test = test['answerCode']
test = test.drop(['answerCode'], axis=1)

## 4. 훈련 및 검증

In [14]:
#모델 생성
model = XGBClassifier(
    n_estimators = 500,
    booster = 'gbtree',
    objective = 'binary:logistic',
    random_state = 42,
tree_method='gpu_hist')

#모델 학습
model.fit(
    X=train[FEATS],
    y=y_train,
    eval_set=[(test[FEATS],y_test)],
    eval_metric="auc",
    early_stopping_rounds=100,
    verbose=100)

preds = model.predict_proba(test[FEATS])[:,1]
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')

# wandb.log({"valid_accuracy": acc})
# wandb.log({"valid_roc_auc": auc})

[0]	validation_0-auc:0.80962
[100]	validation_0-auc:0.84514
[200]	validation_0-auc:0.84679
[300]	validation_0-auc:0.84659
[325]	validation_0-auc:0.84636
VALID AUC : 0.8473186245279267 ACC : 0.772795216741405



In [117]:
# wandb.finish()

## Inference

In [24]:
# LOAD TESTDATA
test_df = pd.read_csv(DATA_PATH+'test_data.csv' , dtype=dtype, parse_dates=['Timestamp'])
test_df = test_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
# test_csv_file_path = os.path.join(DATA_PATH, 'test_data.csv')
# test_df = pd.read_csv(test_csv_file_path)

# FEATURE ENGINEERING
test_df = feature_engineering(test_df)
test_df = elo(test_df)

# LEAVE LAST INTERACTION ONLY
test_df_last = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# # DROP ANSWERCODE
# test_df = test_df.drop(['answerCode'], axis=1)

Dataset of shape (260114, 28)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'user_correct_answer', 'user_total_answer', 'user_acc', 'elapsed', 'elapsed_cumsum', 'month', 'day', 'hour', 'dayname', 'bigclass', 'bigclasstime', 'bigclass_acc', 'bigclass_count', 'bigclass_sum', 'test_mean', 'test_std', 'test_sum', 'tag_mean', 'tag_std', 'tag_sum', 'elapsed_med', 'left_asymptote']
Parameter estimation is starting...


100%|██████████| 260114/260114 [00:01<00:00, 150048.08it/s]


Theta & beta estimations on assessmentItemID are completed.


In [25]:
test_df_y = test_df_last['answerCode']
test_df_X = test_df_last.drop('answerCode',axis=1)

In [31]:
# MAKE PREDICTION
total_preds = model.predict_proba(test_df_X[FEATS])[:,1]

In [32]:
# SAVE OUTPUT
output_dir = '/opt/ml/input/output/LGBM'
write_path = os.path.join(output_dir, "XGBoost.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write('{},{}\n'.format(id,p))

writing prediction : /opt/ml/input/output/LGBM/XGBoost.csv
