# XGB 베이스라인

In [10]:
import pandas as pd
import os
import random
import numpy as np

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import wandb 
from tqdm import tqdm
from xgboost import XGBClassifier
from xgboost import plot_importance

In [12]:
pd.options.display.max_columns = 100

## 1. 데이터 로딩

In [14]:
#wandb.login()

In [15]:
data_dir = '/opt/ml/input/data/' # 경로는 상황에 맞춰서 수정해주세요!

# LOAD TRAINDATA
csv_file_path = os.path.join(data_dir, 'train_data.csv') # 데이터는 대회홈페이지에서 받아주세요 :)
train_data = pd.read_csv(csv_file_path)

# LOAD TESTDATA
test_csv_file_path = os.path.join(data_dir, 'test_data.csv')
test_data = pd.read_csv(test_csv_file_path)
# test_data['answerCode'] = test_data['answerCode'].replace(-1, np.nan)

# # LEAVE LAST INTERACTION ONLY
# train_df = test_df[test_df['userID'] == test_df['userID'].shift(-1)]
# test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# df = pd.concat([df, train_df], ignore_index=True)

## 2. Feature Engineering

In [16]:
def feature_engineering(train, test):
    ## train, test data concat
    train['train'] = 1
    test['train'] = 0
    total = pd.concat([train, test], ignore_index=True)
    # total['answerCode'] = total['answerCode'].fillna(0.5)
    total = elo(total)
    
    ## 유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    total.sort_values(by=['userID','Timestamp'], inplace=True)
    total['Timestamp'] = pd.to_datetime(total['Timestamp'])
    
    total['assessmentItemID'] = total['assessmentItemID'].astype('category')
    total['KnowledgeTag'] = total['KnowledgeTag'].astype('category')
    
    total['Bigcat'] = total['testId'].str[2]
    total['Bigcat'] = total['Bigcat'].astype('category')
    total['Bigcat_avg'] = total.groupby('Bigcat')['answerCode'].transform('mean')
    
    # total['past_count'] = total.groupby('userID').cumcount()
    
    diff = total.loc[:, ['userID', 'Timestamp']].groupby('userID').diff().fillna(pd.Timedelta(seconds=0))
    diff = diff.fillna(pd.Timedelta(seconds=0))
    diff = diff['Timestamp'].apply(lambda x: x.total_seconds())
    total['elapsed'] = diff

    categories = ["assessmentItemID",'KnowledgeTag',"Bigcat"]

    # # label encode your categorical columns
    le = LabelEncoder()
    for category in categories:
        total[category] = le.fit_transform(total[category])
    # total['Bigcat'] = le.fit_transform(total[category])

    ## train, test 나누어서 return
    train_df = total[total['train']==1]
    test_df = total[total['train']==0]
    test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]
    
    
    return train_df, test_df

In [17]:
# train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
random.seed(42)
def custom_train_test_split(df, ratio=0.7, split=True):
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.shuffle(users)
    
    max_train_data_len = ratio*len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)


    train = df[df['userID'].isin(user_ids)]
    test = df[df['userID'].isin(user_ids) == False]

    #test데이터셋은 각 유저의 마지막 interaction만 추출
    test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

In [18]:
def elo(df):
    def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        return theta + learning_rate_theta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        return beta - learning_rate_beta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def learning_rate_theta(nb_answers):
        return max(0.3 / (1 + 0.01 * nb_answers), 0.04)

    def learning_rate_beta(nb_answers):
        return 1 / (1 + 0.05 * nb_answers)

    def probability_of_good_answer(theta, beta, left_asymptote):
        return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"):
        item_parameters = {
            granularity_feature_value: {"beta": 0, "nb_answers": 0}
            for granularity_feature_value in np.unique(
                answers_df[granularity_feature_name]
            )
        }
        student_parameters = {
            student_id: {"theta": 0, "nb_answers": 0}
            for student_id in np.unique(answers_df.userID)
        }

        print("Parameter estimation is starting...", flush=True)

        for student_id, item_id, left_asymptote, answered_correctly in tqdm(
            zip(
                answers_df.userID.values,
                answers_df[granularity_feature_name].values,
                answers_df.left_asymptote.values,
                answers_df.answerCode.values,
            ),
            total=len(answers_df),
        ):
            theta = student_parameters[student_id]["theta"]
            beta = item_parameters[item_id]["beta"]

            item_parameters[item_id]["beta"] = get_new_beta(
                answered_correctly,
                beta,
                left_asymptote,
                theta,
                item_parameters[item_id]["nb_answers"],
            )
            student_parameters[student_id]["theta"] = get_new_theta(
                answered_correctly,
                beta,
                left_asymptote,
                theta,
                student_parameters[student_id]["nb_answers"],
            )

            item_parameters[item_id]["nb_answers"] += 1
            student_parameters[student_id]["nb_answers"] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
        return student_parameters, item_parameters

    def gou_func(theta, beta):
        return 1 / (1 + np.exp(-(theta - beta)))

    df["left_asymptote"] = 0

    print(f"Dataset of shape {df.shape}")
    print(f"Columns are {list(df.columns)}")

    student_parameters, item_parameters = estimate_parameters(df)

    prob = [
        gou_func(student_parameters[student]["theta"], item_parameters[item]["beta"])
        for student, item in zip(df.userID.values, df.assessmentItemID.values)
    ]

    df["elo"] = prob

    return df

## 3. Train/Test 데이터 셋 분리

In [19]:
# wandb.init(project="LGBM", config= config)

df, submission = feature_engineering(train_data, test_data)
# df = elo(df)

# 유저별 분리
train, test = custom_train_test_split(df)
print(train.columns)

# 사용할 Feature 설정
# FEATS = ['KnowledgeTag', 'user_correct_answer', 'user_total_answer', 
#          'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum', 'elapsed','Bigcat','smallcat']
# FEATS = ['KnowledgeTag', 'user_correct_answer', 'user_total_answer', 
#          'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum', 'elapsed', 'Bigcat', 'smallcat', 'item_num', 'item_seq', 'solved_time_shift']
# FEATS = ['KnowledgeTag', 'same_item_cnt', 'user_avg', 'item_avg', 'test_avg', 'tag_avg', 'user_time_avg', 'item_time_avg',
#        'test_time_avg', 'tag_time_avg', 'user_current_avg', 'user_current_time_avg', 'hour', 'item_num', 'Bigcat','Bigcat_avg']

FEATS = ['assessmentItemID', 'KnowledgeTag','Bigcat_avg','elapsed','elo']
# FEATS = ['KnowledgeTag', 'same_item_cnt', 'user_avg', 'item_avg', 'test_avg', 'tag_avg']

# X, y 값 분리
y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1)

y_test = test['answerCode']
test = test.drop(['answerCode'], axis=1)

Dataset of shape (2526700, 8)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'train', 'left_asymptote']
Parameter estimation is starting...


100%|██████████| 2526700/2526700 [00:16<00:00, 155566.00it/s]


Theta & beta estimations on assessmentItemID are completed.
Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'train', 'left_asymptote', 'elo', 'Bigcat',
       'Bigcat_avg', 'elapsed'],
      dtype='object')


In [20]:
model = XGBClassifier(
    objective="binary:logistic",
    booster="gbtree",
    learning_rate = 0.01,
    max_depth=7,
    min_child_weight=4,
    n_estimators=70,
    nthread=4,
    random_state=42,
    #enable_categorical=True
)

model.fit(train[FEATS], y_train,
    eval_set=[(test[FEATS] , y_test)],
    early_stopping_rounds=100,verbose=5)

y_pred_train = model.predict_proba(train[FEATS])[:,1]
y_pred_test = model.predict_proba(test[FEATS])[:,1]
acc = accuracy_score(y_test, np.where(y_pred_test >= 0.5, 1, 0))
auc = roc_auc_score( y_test,y_pred_test)
print(acc, auc)



[0]	validation_0-logloss:0.69031
[5]	validation_0-logloss:0.67698
[10]	validation_0-logloss:0.66482
[15]	validation_0-logloss:0.65372
[20]	validation_0-logloss:0.64358
[25]	validation_0-logloss:0.63430
[30]	validation_0-logloss:0.62577
[35]	validation_0-logloss:0.61793
[40]	validation_0-logloss:0.61075
[45]	validation_0-logloss:0.60410
[50]	validation_0-logloss:0.59798
[55]	validation_0-logloss:0.59240
[60]	validation_0-logloss:0.58726
[65]	validation_0-logloss:0.58253
[69]	validation_0-logloss:0.57901
0.7558545092177379 0.8180013913734844


In [21]:
# lgb_train = lgb.Dataset(train[FEATS], y_train)
# lgb_test = lgb.Dataset(test[FEATS], y_test)

# model = lgb.train(
#     {'learning_rate': 0.001, 
#       'objective': 'binary', 
#       'metric': 'binary_logloss',
#       'sub_feature': 0.5, 
#       'num_leaves': 10, 
#       'min_data': 50, 
#       'max_depth': 10}, 
#     lgb_train,
#     valid_sets=[lgb_train, lgb_test],
#     verbose_eval=100,
#     num_boost_round=2000,
#     early_stopping_rounds=100,
#     callbacks=[wandb.lightgbm.wandb_callback()] 
# )

# # wandb.lightgbm.log_summary(model, save_model_checkpoint=True)

# preds = model.predict(test[FEATS])
# acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
# auc = roc_auc_score(y_test, preds)
# wandb.log({"valid_accuracy": acc})
# wandb.log({"valid_roc_auc": auc})

# print(f'VALID AUC : {auc} ACC : {acc}\n')

In [22]:
a = elo(df)

Dataset of shape (2266586, 12)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'train', 'left_asymptote', 'elo', 'Bigcat', 'Bigcat_avg', 'elapsed']
Parameter estimation is starting...


100%|██████████| 2266586/2266586 [00:15<00:00, 146410.26it/s]


Theta & beta estimations on assessmentItemID are completed.


In [23]:
a[['userID','assessmentItemID','elo']].sort_values('elo')

Unnamed: 0,userID,assessmentItemID,elo
1629640,2857,6048,0.002898
2208384,5887,4396,0.002899
1629633,2857,6035,0.002928
2208444,5887,7848,0.003557
2042613,4400,4981,0.003640
...,...,...,...
1708427,3069,1044,0.998301
1168609,1814,1044,0.998342
275286,373,1388,0.998382
1168738,1814,1388,0.998527


In [24]:
df[:2]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,train,left_asymptote,elo,Bigcat,Bigcat_avg,elapsed
0,0,5354,A060000001,1,2020-03-24 00:17:11,556,1,0,0.97935,5,0.711898,0.0
1,0,5355,A060000001,1,2020-03-24 00:17:14,557,1,0,0.970579,5,0.711898,3.0


In [25]:
pd.options.display.max_columns = 100

In [27]:
df['userID'].nunique()

6698

In [30]:
submission[:2]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,train,left_asymptote,elo,Bigcat,Bigcat_avg,elapsed
2267621,3,4965,A050000133,-1,2020-10-26 13:13:57,469,0,0,0.431119,4,0.658649,46.0
2268292,4,7748,A070000146,-1,2020-12-27 02:47:54,781,0,0,0.508981,6,0.521167,23.0


In [29]:
df.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'train', 'left_asymptote', 'elo', 'Bigcat',
       'Bigcat_avg', 'elapsed'],
      dtype='object')

## 4. 훈련 및 검증

## 5. Inference

In [None]:
# # LOAD TESTDATA
# test_csv_file_path = os.path.join(data_dir, 'test_data.csv')
# test_df = pd.read_csv(test_csv_file_path)

# # FEATURE ENGINEERING
# test_df = feature_engineering(test_df)

# # LEAVE LAST INTERACTION ONLY
# test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# submission = submission[FEATS]
submission.head(10)


# # DROP ANSWERCODE
# test_df = test_df.drop(['answerCode'], axis=1)

Unnamed: 0,assessmentItemID,userID,testId,answerCode,Timestamp,KnowledgeTag,train,same_item_cnt,solved_time_shift,user_avg,...,test_std,tag_std,user_current_avg,user_current_time_avg,hour,month,item_num,item_seq,Bigcat,smallcat
2989,A050133008,3,A050000133,,2020-10-26 13:13:57,5289,0,1,,0.692754,...,0.47538,0.496517,0.857143,45.142857,13,10,8,8,5,133
3660,A070146008,4,A070000146,,2020-12-27 02:47:54,9080,0,1,,0.69403,...,0.476573,0.497938,0.857143,24.571429,2,12,8,8,7,146
10860,A070111008,13,A070000111,,2020-12-27 04:35:09,9660,0,1,,0.695289,...,0.498249,0.500051,0.428571,14.857143,4,12,8,8,7,111
15278,A090064006,17,A090000064,,2020-10-30 05:48:37,2611,0,1,,0.818904,...,0.497463,0.493738,1.0,76.0,5,10,6,6,9,64
23531,A060135007,26,A060000135,,2020-10-23 11:44:18,1422,0,1,,0.759067,...,0.48003,0.487891,0.666667,45.833333,11,10,7,7,6,135
26895,A020190005,29,A020000190,,2020-10-22 04:38:45,8097,0,1,,0.847597,...,0.438876,0.453456,1.0,26.0,4,10,5,5,2,190
39887,A040136005,45,A040000136,,2020-10-23 08:24:19,2107,0,1,,0.688827,...,0.448255,0.457104,0.25,25.0,8,10,5,5,4,136
47628,A040140005,53,A040000140,,2020-10-26 09:13:20,2110,0,1,,0.523121,...,0.452402,0.452832,0.5,10.25,9,10,5,5,4,140
51927,A070159007,58,A070000159,,2020-12-24 21:09:29,9122,0,1,,0.364198,...,0.49964,0.482075,0.142857,1.857143,21,12,7,8,7,159
57352,A070146008,64,A070000146,,2020-12-29 04:30:22,9080,0,1,,0.833727,...,0.476573,0.497938,1.0,1.714286,4,12,8,8,7,146


In [32]:
# MAKE PREDICTION
total_preds = model.predict_proba(submission[FEATS])[:,1]
total_preds[:10]

array([0.5309973 , 0.56643975, 0.38493556, 0.5535409 , 0.3897956 ,
       0.6840572 , 0.3577807 , 0.29596797, 0.3402987 , 0.49149314],
      dtype=float32)

In [None]:
# model: 이미 학습이 완료된 LGBMModel, or Booster
# FEATURES: model 학습에 사용된 모든 features
# PATH: 그림을 저장할 경로

# SPLIT
ax = plot_importance(model, max_num_features=len(FEATS), importance_type='split')
ax.set(title=f'Feature Importance (split)',
	xlabel='Feature Importance',
	ylabel='Features')
ax.figure.savefig(f'./output/fi_split.png', dpi=300)


# GAIN
ax = plot_importance(model, max_num_features=len(FEATS), importance_type='gain')
ax.set(title=f'Feature Importance (gain)',
	xlabel='Feature Importance',
	ylabel='Features')
ax.figure.savefig(f'./output/fi_gain.png', dpi=300)

In [35]:
# SAVE OUTPUT
output_dir = 'output/'
write_path = os.path.join(output_dir, "xgb1202_submission.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write('{},{}\n'.format(id,p))

writing prediction : output/xgb1202_submission.csv


###**콘텐츠 라이선스**

<font color='red'><b>**WARNING**</b></font> : **본 교육 콘텐츠의 지식재산권은 재단법인 네이버커넥트에 귀속됩니다. 본 콘텐츠를 어떠한 경로로든 외부로 유출 및 수정하는 행위를 엄격히 금합니다.** 다만, 비영리적 교육 및 연구활동에 한정되어 사용할 수 있으나 재단의 허락을 받아야 합니다. 이를 위반하는 경우, 관련 법률에 따라 책임을 질 수 있습니다.

