In [211]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
import sys
sys.path.append(r'../')
from feature_engineering import feature_engineering
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score

import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier

from datetime import datetime
from sklearn.inspection import permutation_importance
import time
import numpy as np

In [212]:
IS_CUSTOM = True

In [213]:
def load_xgb_data(basepath="../../data/"):
    """Load data for xgboost"""
    path1 = os.path.join(basepath, "train_data.csv")
    path2 = os.path.join(basepath, "test_data.csv")
    if IS_CUSTOM:
        print("Load custom...")
        path2 = os.path.join(basepath, "custom_test_data.csv")
    data1 = pd.read_csv(path1)
    data2 = pd.read_csv(path2)
    data = pd.concat([data1, data2])
    data = data.sort_values(["userID", "Timestamp"])
    data.drop_duplicates(
        subset=["userID", "assessmentItemID"], keep="last", inplace=True
    )
    data = feature_engineering(data)
    return data

xgb_data = load_xgb_data()

Load custom...


In [214]:
drops = ['year','day','minute', 'second']

In [215]:
xgb_data = xgb_data.drop(drops, axis=1)

In [216]:
def split_train_test(data):
    train = data[data["answerCode"]>=0]
    test = data[data["answerCode"]<0]
    # x_train, x_valid, y_train, y_valid = train_test_split(train.drop("answerCode", axis=1), train["answerCode"], test_size=0.2, shuffle=True, random_state=42)
    return train, test
_train, _test = split_train_test(xgb_data)
# x_train, x_valid, y_train, y_valid, test = split_train_test(xgb_data)

In [217]:
random.seed(42)
def train_test_split_mode_1(df:pd.DataFrame, ratio=0.8, split=True):
    users = list(zip(df["userID"].value_counts().index, df["userID"].value_counts()))
    random.shuffle(users)
    
    max_train_data_len = ratio * len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)

    train = df[df["userID"].isin(user_ids)]
    valid = df[df["userID"].isin(user_ids) == False]

    #valid데이터셋은 각 유저의 마지막 interaction만 추출
    valid = valid[valid["userID"] != valid["userID"].shift(-1)]
    return train, valid

In [218]:
"""
train_test_split_mode_2:
    train: train data
    valid: test data에서 마지막에서 두번째 데이터까지 사용
"""
def train_test_split_mode_2(train_df:pd.DataFrame, test_df:pd.DataFrame):
    valid = test_df[test_df["answerCode"] != -1]
    valid = valid[valid["userID"] != valid["userID"].shift(-1)]
    return train_df, test_df, valid

In [219]:

train, test, valid = train_test_split_mode_2(_train, _test)
x_train = train.drop(["answerCode"], axis=1)
y_train = train["answerCode"]
x_valid = valid.drop(["answerCode"], axis=1)
y_valid = valid["answerCode"]

In [220]:
DROPS = ["userID","assessmentItemID","testId","Timestamp"]

# XGB preprocessing

In [221]:
def xgb_preprocessing(data):
    data = data.drop(DROPS, axis=1)
    for col in data.columns:
        data[col] = data[col].astype(float)
    return data
x_train = xgb_preprocessing(x_train)
x_valid = xgb_preprocessing(x_valid)
test = xgb_preprocessing(test)

In [222]:
x_train.head()

Unnamed: 0,KnowledgeTag,userAnswerRate,userSolvedLen,testAnswerRate,testSolvedLen,tagAnswerRate,tagSolvedLen,month,hour,first3,timeConcentrationRate,timeConcentrationCount,timeConcentrationLevel,user_correct_answer,user_total_answer,user_acc,monthAnswerRate,monthSolvedCount
0,7224.0,0.630872,745.0,0.951701,1470.0,0.957823,735.0,3.0,0.0,60.0,0.650422,109984.0,2.0,,0.0,,0.681427,189115.0
1,7225.0,0.630872,745.0,0.951701,1470.0,0.916689,3673.0,3.0,0.0,60.0,0.650422,109984.0,2.0,1.0,1.0,1.0,0.681427,189115.0
2,7225.0,0.630872,745.0,0.951701,1470.0,0.916689,3673.0,3.0,0.0,60.0,0.650422,109984.0,2.0,2.0,2.0,1.0,0.681427,189115.0
3,7225.0,0.630872,745.0,0.951701,1470.0,0.916689,3673.0,3.0,0.0,60.0,0.650422,109984.0,2.0,3.0,3.0,1.0,0.681427,189115.0
4,7225.0,0.630872,745.0,0.951701,1470.0,0.916689,3673.0,3.0,0.0,60.0,0.650422,109984.0,2.0,4.0,4.0,1.0,0.681427,189115.0


# XGBRegressor

In [223]:
model=XGBRegressor(tree_method='gpu_hist', gpu_id=0, early_stopping_rounds=100)

param_grid={
     'booster': ['gbtree'], 
     'colsample_bylevel': [0.9], 
     'colsample_bytree': [0.8], 
     'gamma': [3], 
     'max_depth': [8], 
     'min_child_weight': [3], 
     'n_estimators': [i*300 for i in range(1,5)], 
     'nthread': [4], 
     'objective': ['binary:logistic'], 
     'random_state': [42], 
     'verbosity': [1]
     }
cv=KFold(n_splits=5, random_state=42, shuffle=True)

gcv=GridSearchCV(model, param_grid=param_grid, cv=cv, scoring='roc_auc', n_jobs=4, pre_dispatch=8, verbose=1)

In [224]:
gcv.fit(x_train,y_train,eval_set=[(x_train,y_train),(x_valid,y_valid)],verbose=True)
print('final params', gcv.best_params_)
print('best score', gcv.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[0]	validation_0-logloss:0.63066	validation_1-logloss:0.00000
[1]	validation_0-logloss:0.59922	validation_1-logloss:0.00000
[2]	validation_0-logloss:0.58182	validation_1-logloss:0.00000
[3]	validation_0-logloss:0.57083	validation_1-logloss:0.00000
[4]	validation_0-logloss:0.56327	validation_1-logloss:0.00000
[5]	validation_0-logloss:0.55915	validation_1-logloss:0.00000
[6]	validation_0-logloss:0.55554	validation_1-logloss:0.00000
[7]	validation_0-logloss:0.55319	validation_1-logloss:0.00000
[8]	validation_0-logloss:0.55160	validation_1-logloss:0.00000
[9]	validation_0-logloss:0.55037	validation_1-logloss:0.00000
[10]	validation_0-logloss:0.54975	validation_1-logloss:0.00000
[0]	validation_0-logloss:0.63075	validation_1-logloss:0.00000
[11]	validation_0-logloss:0.54853	validation_1-logloss:0.00000
[12]	validation_0-logloss:0.54766	validation_1-logloss:0.00000
[0]	validation_0-logloss:0.63072	validation_1-logloss:0.00000
[13]	val

In [225]:
# SAVE OUTPUT
model = gcv.best_estimator_
sub = pd.read_csv("/opt/ml/input/data/sample_submission.csv")
sub["prediction"]=model.predict(test.drop("answerCode",axis=1))

output_dir = "./output/"
file_name = f"XGB_grid_kfold_custom_submission_{datetime.now().microsecond}.csv"
write_path = os.path.join(output_dir, file_name)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, "w", encoding="utf8") as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(sub["prediction"]):
        w.write("{},{}\n".format(id, p))

writing prediction : ./output/XGB_grid_kfold_custom_submission_823483.csv


In [230]:
def get_accuracy(PRED_PATH = file_name):
    threshold=0.5
    ANSWER_PATH = "../../data/custom_answer.csv"

    submission_result = pd.read_csv(PRED_PATH)
    answer = pd.read_csv(ANSWER_PATH)

    y_pred, y = submission_result["prediction"], answer["prediction"]

    return f"accuracy_score: {accuracy_score(y,y_pred.apply(lambda x: 1 if x > threshold else 0))}\nroc  auc_score: {roc_auc_score(y,y_pred)}"

In [231]:
if IS_CUSTOM:
    print(get_accuracy("output/"+file_name))

accuracy_score: 0.6599462365591398
roc  auc_score: 0.7192600016093282
