# XGB 베이스라인

In [12]:
import pandas as pd
import os
import random
import numpy as np

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
import wandb 
from tqdm import tqdm
from xgboost import XGBClassifier
from xgboost import plot_importance

In [13]:
pd.options.display.max_columns = 100

## 1. 데이터 로딩

In [14]:
wandb.login()

True

In [15]:
data_dir = '/opt/ml/input/data/' # 경로는 상황에 맞춰서 수정해주세요!

# LOAD TRAINDATA
csv_file_path = os.path.join(data_dir, 'train_data.csv') # 데이터는 대회홈페이지에서 받아주세요 :)
train_data = pd.read_csv(csv_file_path)

# LOAD TESTDATA
test_csv_file_path = os.path.join(data_dir, 'test_data.csv')
test_data = pd.read_csv(test_csv_file_path)
# test_data['answerCode'] = test_data['answerCode'].replace(-1, np.nan)

# # LEAVE LAST INTERACTION ONLY
# train_df = test_df[test_df['userID'] == test_df['userID'].shift(-1)]
# test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# df = pd.concat([df, train_df], ignore_index=True)

## 2. Feature Engineering

In [16]:
def feature_engineering(train, test):
    ## train, test data concat
    train['train'] = 1
    test['train'] = 0
    total = pd.concat([train, test], ignore_index=True)
    # total['answerCode'] = total['answerCode'].fillna(0.5)
    total = elo(total)
    
    ## 유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    total.sort_values(by=['userID','Timestamp'], inplace=True)
    total['Timestamp'] = pd.to_datetime(total['Timestamp'])
    
    total['assessmentItemID'] = total['assessmentItemID'].astype('category')
    total['KnowledgeTag'] = total['KnowledgeTag'].astype('category')

    #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산
    total['user_correct_answer'] = total.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    total['user_total_answer'] = total.groupby('userID')['answerCode'].cumcount()
    total['user_acc'] = total['user_correct_answer']/total['user_total_answer']

    total['Bigcat'] = total['testId'].str[2]
    total['Bigcat'] = total['Bigcat'].astype('category')
    total['Bigcat_avg'] = total.groupby('Bigcat')['answerCode'].transform('mean')
    
    # total['past_count'] = total.groupby('userID').cumcount()
    
    diff = total.loc[:, ['userID', 'Timestamp']].groupby('userID').diff().fillna(pd.Timedelta(seconds=0))
    diff = diff.fillna(pd.Timedelta(seconds=0))
    diff = diff['Timestamp'].apply(lambda x: x.total_seconds())
    total['elapsed'] = diff

    total['item_avg'] = total.groupby('assessmentItemID')['answerCode'].transform('mean')

    categories = ["assessmentItemID",'KnowledgeTag',"Bigcat"]

    # # label encode your categorical columns
    le = LabelEncoder()
    for category in categories:
        total[category] = le.fit_transform(total[category])
    # total['Bigcat'] = le.fit_transform(total[category])

    ## train, test 나누어서 return
    train_df = total[total['train']==1]
    test_df = total[total['train']==0]
    test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]
    
    
    return train_df, test_df

In [17]:
# train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
random.seed(42)
def custom_train_test_split(df, ratio=0.7, split=True):
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.shuffle(users)
    
    max_train_data_len = ratio*len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)


    train = df[df['userID'].isin(user_ids)]
    test = df[df['userID'].isin(user_ids) == False]

    #test데이터셋은 각 유저의 마지막 interaction만 추출
    test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

In [18]:
def elo(df):
    def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        return theta + learning_rate_theta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        return beta - learning_rate_beta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def learning_rate_theta(nb_answers):
        return max(0.3 / (1 + 0.01 * nb_answers), 0.04)

    def learning_rate_beta(nb_answers):
        return 1 / (1 + 0.05 * nb_answers)

    def probability_of_good_answer(theta, beta, left_asymptote):
        return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"):
        item_parameters = {
            granularity_feature_value: {"beta": 0, "nb_answers": 0}
            for granularity_feature_value in np.unique(
                answers_df[granularity_feature_name]
            )
        }
        student_parameters = {
            student_id: {"theta": 0, "nb_answers": 0}
            for student_id in np.unique(answers_df.userID)
        }

        print("Parameter estimation is starting...", flush=True)

        for student_id, item_id, left_asymptote, answered_correctly in tqdm(
            zip(
                answers_df.userID.values,
                answers_df[granularity_feature_name].values,
                answers_df.left_asymptote.values,
                answers_df.answerCode.values,
            ),
            total=len(answers_df),
        ):
            theta = student_parameters[student_id]["theta"]
            beta = item_parameters[item_id]["beta"]

            item_parameters[item_id]["beta"] = get_new_beta(
                answered_correctly,
                beta,
                left_asymptote,
                theta,
                item_parameters[item_id]["nb_answers"],
            )
            student_parameters[student_id]["theta"] = get_new_theta(
                answered_correctly,
                beta,
                left_asymptote,
                theta,
                student_parameters[student_id]["nb_answers"],
            )

            item_parameters[item_id]["nb_answers"] += 1
            student_parameters[student_id]["nb_answers"] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
        return student_parameters, item_parameters

    def gou_func(theta, beta):
        return 1 / (1 + np.exp(-(theta - beta)))

    df["left_asymptote"] = 0

    print(f"Dataset of shape {df.shape}")
    print(f"Columns are {list(df.columns)}")

    student_parameters, item_parameters = estimate_parameters(df)

    prob = [
        gou_func(student_parameters[student]["theta"], item_parameters[item]["beta"])
        for student, item in zip(df.userID.values, df.assessmentItemID.values)
    ]

    df["elo"] = prob

    return df

## 3. Train/Test 데이터 셋 분리

In [19]:
# wandb.init(project="LGBM", config= config)

df, submission = feature_engineering(train_data, test_data)
# df = elo(df)

# 유저별 분리
train, test = custom_train_test_split(df)
print(train.columns)

# 사용할 Feature 설정
# FEATS = ['KnowledgeTag', 'user_correct_answer', 'user_total_answer', 
#          'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum', 'elapsed','Bigcat','smallcat']
# FEATS = ['KnowledgeTag', 'user_correct_answer', 'user_total_answer', 
#          'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum', 'elapsed', 'Bigcat', 'smallcat', 'item_num', 'item_seq', 'solved_time_shift']
# FEATS = ['KnowledgeTag', 'same_item_cnt', 'user_avg', 'item_avg', 'test_avg', 'tag_avg', 'user_time_avg', 'item_time_avg',
#        'test_time_avg', 'tag_time_avg', 'user_current_avg', 'user_current_time_avg', 'hour', 'item_num', 'Bigcat','Bigcat_avg']
# FEATS = ['assessmentItemID', 'KnowledgeTag','user_acc','Bigcat_avg','elapsed','elo']
# FEATS = ['KnowledgeTag', 'same_item_cnt', 'user_avg', 'item_avg', 'test_avg', 'tag_avg']
FEATS = ['assessmentItemID', 'KnowledgeTag','Bigcat_avg','item_avg','elapsed','elo','answer_cnt']

# X, y 값 분리
y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1)

y_test = test['answerCode']
test = test.drop(['answerCode'], axis=1)

Dataset of shape (2526700, 8)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'train', 'left_asymptote']
Parameter estimation is starting...


100%|██████████| 2526700/2526700 [00:16<00:00, 149959.85it/s]


Theta & beta estimations on assessmentItemID are completed.
Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'train', 'left_asymptote', 'elo', 'user_correct_answer',
       'user_total_answer', 'user_acc', 'Bigcat', 'Bigcat_avg', 'elapsed'],
      dtype='object')


In [20]:
model = XGBClassifier(
    objective="binary:logistic",
    booster="gbtree",
    learning_rate = 0.01,
    max_depth=7,
    min_child_weight=4,
    n_estimators=70,
    nthread=4,
    random_state=42,
    #enable_categorical=True
)

model.fit(train[FEATS], y_train,
    eval_set=[(test[FEATS] , y_test)],
    early_stopping_rounds=100,verbose=5)

y_pred_train = model.predict_proba(train[FEATS])[:,1]
y_pred_test = model.predict_proba(test[FEATS])[:,1]
acc = accuracy_score(y_test, np.where(y_pred_test >= 0.5, 1, 0))
auc = roc_auc_score( y_test,y_pred_test)
print(acc, auc)



[0]	validation_0-logloss:0.69035
[5]	validation_0-logloss:0.67723
[10]	validation_0-logloss:0.66523
[15]	validation_0-logloss:0.65429
[20]	validation_0-logloss:0.64424
[25]	validation_0-logloss:0.63504
[30]	validation_0-logloss:0.62665
[35]	validation_0-logloss:0.61892
[40]	validation_0-logloss:0.61181
[45]	validation_0-logloss:0.60528
[50]	validation_0-logloss:0.59925
[55]	validation_0-logloss:0.59380
[60]	validation_0-logloss:0.58872
[65]	validation_0-logloss:0.58396
[69]	validation_0-logloss:0.58048
0.7498754359740907 0.8144961240310078


In [32]:
X = df.drop(['answerCode'], axis=1)
y = df['answerCode']

In [40]:
score_train, score_test = [], [] 
# 유저별 분리
fold = StratifiedKFold(n_splits = 5, shuffle =True)

for train_index , test_index in fold.split(X, y):
    X_train,X_test = X.iloc[train_index], X.iloc[test_index]
    y_train,y_test = y.iloc[train_index], y.iloc[test_index]

    model = XGBClassifier(
    objective="binary:logistic",
    booster="gbtree",
    learning_rate = 0.01,
    max_depth=7,
    min_child_weight=4,
    n_estimators=70,
    nthread=4,
    random_state=42,
    #enable_categorical=True
    )

    model.fit(X_train[FEATS], y_train,
    eval_set=[(X_test[FEATS] , y_test)],
    early_stopping_rounds=100,verbose=5)

    y_pred_train = model.predict_proba(X_train[FEATS])[:,1]
    y_pred_test = model.predict_proba(X_test[FEATS])[:,1]
    score_train.append(roc_auc_score( y_train,y_pred_train))
    score_test.append(roc_auc_score( y_test,y_pred_test))
    # make predictions on test

    acc = accuracy_score(y_test, np.where(y_pred_test >= 0.5, 1, 0))
    auc = roc_auc_score( y_test,y_pred_test)
    wandb.init(project = "XGBC")
    wandb.log({"valid_accuracy": acc})
    wandb.log({"valid_roc_auc": auc})

print('\n')
print('Mean training AUC:',np.mean(score_train))
print('Mean testing AUC:',np.mean(score_test))



[0]	validation_0-logloss:0.68958
[5]	validation_0-logloss:0.67276
[10]	validation_0-logloss:0.65749
[15]	validation_0-logloss:0.64360
[20]	validation_0-logloss:0.63094
[25]	validation_0-logloss:0.61938
[30]	validation_0-logloss:0.60881
[35]	validation_0-logloss:0.59912
[40]	validation_0-logloss:0.59023
[45]	validation_0-logloss:0.58207
[50]	validation_0-logloss:0.57456
[55]	validation_0-logloss:0.56765
[60]	validation_0-logloss:0.56129
[65]	validation_0-logloss:0.55542
[69]	validation_0-logloss:0.55106




[0]	validation_0-logloss:0.68958
[5]	validation_0-logloss:0.67279
[10]	validation_0-logloss:0.65756
[15]	validation_0-logloss:0.64370
[20]	validation_0-logloss:0.63106
[25]	validation_0-logloss:0.61951
[30]	validation_0-logloss:0.60896
[35]	validation_0-logloss:0.59929
[40]	validation_0-logloss:0.59043
[45]	validation_0-logloss:0.58229
[50]	validation_0-logloss:0.57481
[55]	validation_0-logloss:0.56792
[60]	validation_0-logloss:0.56156
[65]	validation_0-logloss:0.55571
[69]	validation_0-logloss:0.55136


0,1
valid_accuracy,▁
valid_roc_auc,▁

0,1
valid_accuracy,0.77158
valid_roc_auc,0.81679




[0]	validation_0-logloss:0.68960
[5]	validation_0-logloss:0.67288
[10]	validation_0-logloss:0.65770
[15]	validation_0-logloss:0.64390
[20]	validation_0-logloss:0.63131
[25]	validation_0-logloss:0.61982
[30]	validation_0-logloss:0.60931
[35]	validation_0-logloss:0.59969
[40]	validation_0-logloss:0.59087
[45]	validation_0-logloss:0.58277
[50]	validation_0-logloss:0.57534
[55]	validation_0-logloss:0.56850
[60]	validation_0-logloss:0.56218
[65]	validation_0-logloss:0.55636
[69]	validation_0-logloss:0.55203


0,1
valid_accuracy,▁
valid_roc_auc,▁

0,1
valid_accuracy,0.77127
valid_roc_auc,0.81562




[0]	validation_0-logloss:0.68958
[5]	validation_0-logloss:0.67274
[10]	validation_0-logloss:0.65746
[15]	validation_0-logloss:0.64357
[20]	validation_0-logloss:0.63090
[25]	validation_0-logloss:0.61933
[30]	validation_0-logloss:0.60875
[35]	validation_0-logloss:0.59906
[40]	validation_0-logloss:0.59018
[45]	validation_0-logloss:0.58202
[50]	validation_0-logloss:0.57452
[55]	validation_0-logloss:0.56762
[60]	validation_0-logloss:0.56124
[65]	validation_0-logloss:0.55536
[69]	validation_0-logloss:0.55099


0,1
valid_accuracy,▁
valid_roc_auc,▁

0,1
valid_accuracy,0.77107
valid_roc_auc,0.81471




[0]	validation_0-logloss:0.68959
[5]	validation_0-logloss:0.67282
[10]	validation_0-logloss:0.65761
[15]	validation_0-logloss:0.64378
[20]	validation_0-logloss:0.63117
[25]	validation_0-logloss:0.61966
[30]	validation_0-logloss:0.60913
[35]	validation_0-logloss:0.59948
[40]	validation_0-logloss:0.59063
[45]	validation_0-logloss:0.58251
[50]	validation_0-logloss:0.57504
[55]	validation_0-logloss:0.56817
[60]	validation_0-logloss:0.56183
[65]	validation_0-logloss:0.55599
[69]	validation_0-logloss:0.55165


0,1
valid_accuracy,▁
valid_roc_auc,▁

0,1
valid_accuracy,0.77258
valid_roc_auc,0.81655




Mean training AUC: 0.8164782021767565
Mean testing AUC: 0.8157452913843495


In [33]:
df[:2]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,train,left_asymptote,elo,user_correct_answer,user_total_answer,user_acc,Bigcat,Bigcat_avg,elapsed
0,0,5354,A060000001,1,2020-03-24 00:17:11,556,1,0,0.97935,,0,,5,0.711898,0.0
1,0,5355,A060000001,1,2020-03-24 00:17:14,557,1,0,0.970579,1.0,1,1.0,5,0.711898,3.0


In [25]:
pd.options.display.max_columns = 100

In [26]:
df['userID'].nunique()

6698

In [27]:
submission[:2]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,train,left_asymptote,elo,user_correct_answer,user_total_answer,user_acc,Bigcat,Bigcat_avg,elapsed
2267621,3,4965,A050000133,-1,2020-10-26 13:13:57,469,0,0,0.431119,717.0,1035,0.692754,4,0.658649,46.0
2268292,4,7748,A070000146,-1,2020-12-27 02:47:54,781,0,0,0.508981,465.0,670,0.69403,6,0.521167,23.0


In [28]:
df.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'train', 'left_asymptote', 'elo', 'user_correct_answer',
       'user_total_answer', 'user_acc', 'Bigcat', 'Bigcat_avg', 'elapsed'],
      dtype='object')

## 4. 훈련 및 검증

## 5. Inference

In [41]:
# # LOAD TESTDATA
# test_csv_file_path = os.path.join(data_dir, 'test_data.csv')
# test_df = pd.read_csv(test_csv_file_path)

# # FEATURE ENGINEERING
# test_df = feature_engineering(test_df)

# # LEAVE LAST INTERACTION ONLY
# test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# submission = submission[FEATS]
submission.head(10)


# # DROP ANSWERCODE
# test_df = test_df.drop(['answerCode'], axis=1)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,train,left_asymptote,elo,user_correct_answer,user_total_answer,user_acc,Bigcat,Bigcat_avg,elapsed
2267621,3,4965,A050000133,-1,2020-10-26 13:13:57,469,0,0,0.431119,717.0,1035,0.692754,4,0.658649,46.0
2268292,4,7748,A070000146,-1,2020-12-27 02:47:54,781,0,0,0.508981,465.0,670,0.69403,6,0.521167,23.0
2269609,13,7484,A070000111,-1,2020-12-27 04:35:09,820,0,0,0.30721,915.0,1316,0.695289,6,0.521167,8.0
2270869,17,9381,A090000064,-1,2020-10-30 05:48:37,309,0,0,0.447042,1031.0,1259,0.818904,8,0.45447,75.0
2271256,26,6231,A060000135,-1,2020-10-23 11:44:18,183,0,0,0.247744,293.0,386,0.759067,5,0.711898,17.0
2272110,29,1967,A020000190,-1,2020-10-22 04:38:45,680,0,0,0.86781,723.0,853,0.847597,1,0.737515,30.0
2273194,45,3759,A040000136,-1,2020-10-23 08:24:19,276,0,0,0.677722,746.0,1083,0.688827,3,0.679714,4.0
2273887,53,3779,A040000140,-1,2020-10-26 09:13:20,279,0,0,0.040234,362.0,692,0.523121,3,0.679714,20.0
2274698,58,7847,A070000159,-1,2020-12-24 21:09:29,792,0,0,0.182091,295.0,810,0.364198,6,0.521167,2.0
2275968,64,7748,A070000146,-1,2020-12-29 04:30:22,781,0,0,0.75509,1058.0,1269,0.833727,6,0.521167,2.0


In [42]:
# MAKE PREDICTION
total_preds = model.predict_proba(submission[FEATS])[:,1]
total_preds[:10]

array([0.5273881 , 0.55462515, 0.37222853, 0.5635203 , 0.40078378,
       0.68049055, 0.35328853, 0.3022312 , 0.33937305, 0.5462816 ],
      dtype=float32)

In [35]:
# model: 이미 학습이 완료된 LGBMModel, or Booster
# FEATURES: model 학습에 사용된 모든 features
# PATH: 그림을 저장할 경로

# SPLIT
ax = plot_importance(model, max_num_features=len(FEATS), importance_type='split')
ax.set(title=f'Feature Importance (split)',
	xlabel='Feature Importance',
	ylabel='Features')
ax.figure.savefig(f'./output/fi_split.png', dpi=300)


# GAIN
ax = plot_importance(model, max_num_features=len(FEATS), importance_type='gain')
ax.set(title=f'Feature Importance (gain)',
	xlabel='Feature Importance',
	ylabel='Features')
ax.figure.savefig(f'./output/fi_gain.png', dpi=300)

XGBoostError: [09:58:14] ../src/gbm/gbtree.h:338: Unknown feature importance type, expected one of: {"weight", "total_gain", "total_cover", "gain", "cover"}, got: split
Stack trace:
  [bt] (0) /opt/conda/envs/dkt/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x29da59) [0x7fa2cd28ea59]
  [bt] (1) /opt/conda/envs/dkt/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x2bb1ba) [0x7fa2cd2ac1ba]
  [bt] (2) /opt/conda/envs/dkt/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x2ce164) [0x7fa2cd2bf164]
  [bt] (3) /opt/conda/envs/dkt/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGBoosterFeatureScore+0x6e5) [0x7fa2cd127275]
  [bt] (4) /opt/conda/envs/dkt/lib/python3.10/lib-dynload/../../libffi.so.7(+0x69dd) [0x7fa3350939dd]
  [bt] (5) /opt/conda/envs/dkt/lib/python3.10/lib-dynload/../../libffi.so.7(+0x6067) [0x7fa335093067]
  [bt] (6) /opt/conda/envs/dkt/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x93ef) [0x7fa3350a23ef]
  [bt] (7) /opt/conda/envs/dkt/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x865a) [0x7fa3350a165a]
  [bt] (8) /opt/conda/envs/dkt/bin/python(_PyObject_MakeTpCall+0x25b) [0x4f7e1b]



In [43]:
# SAVE OUTPUT
output_dir = 'output/'
write_path = os.path.join(output_dir, "xgbacckfold_submission.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write('{},{}\n'.format(id,p))

writing prediction : output/xgbacckfold_submission.csv


###**콘텐츠 라이선스**

<font color='red'><b>**WARNING**</b></font> : **본 교육 콘텐츠의 지식재산권은 재단법인 네이버커넥트에 귀속됩니다. 본 콘텐츠를 어떠한 경로로든 외부로 유출 및 수정하는 행위를 엄격히 금합니다.** 다만, 비영리적 교육 및 연구활동에 한정되어 사용할 수 있으나 재단의 허락을 받아야 합니다. 이를 위반하는 경우, 관련 법률에 따라 책임을 질 수 있습니다.

