In [1]:
import numpy as np
import pandas as pd

import os
import random

import warnings

warnings.filterwarnings(action='ignore')

path = '/opt/ml/input/data/'
train = pd.read_csv(path + 'train_data.csv')
test = pd.read_csv(path + 'test_data.csv')

In [2]:
# train, test 병합. 모든 데이터 활용하기.
dat = pd.concat([train, test], axis = 0)
dat = dat.sort_values(by = ['userID', 'Timestamp'])

# 시험 난이도(?)로 추청되는 특성 따로 분류
dat['b_category'] = dat['assessmentItemID'].str[2]
# 시험지로 추청되는 특성 따로 분류. 뒤에 것이 좋을 것 같은데 일단 제출을 한 것으로 써놓음.
dat['test_category'] = dat['assessmentItemID'].str[4:7] # dat['assessmentItemID'].str[2] + dat['assessmentItemID'].str[4:7]

# answerCode이 -1로 되어있는 문제를 맞추기 위해 사용.
_train = dat[dat['answerCode'] >= 0]
_test = dat[dat['answerCode'] < 0]

In [4]:
# 유저 마다 가장 마지막 문제를 맞추는 것을 기준으로 평가 하려함
# test 유저에 경우 가장 마지막 문제 전 문제를 맞추는 것을 기준으로 평가함.
user_final_time = _train.groupby('userID')['Timestamp'].max()
_train['train_valid'] = _train.apply(lambda x : -1 if x.Timestamp == user_final_time[x.userID] else x['answerCode'], axis = 1)
_valid = _train[_train['train_valid'] < 0]
_train = _train[_train['train_valid'] >= 0]

In [5]:

# 1. Count the total number of users
# 2. Choose the user number for train / valid set each
# 3. Find the first `user_id` for the validation set
# 4. Find the first `index` for the validation set
# 5. Split the dataset to Train / Validation set

def train_valid_split(df, train_size=0.90):
    # Total user number
    total_user = df.userID.nunique()

    # Total user indices
    total_user_index = df.userID.unique()

    # Divide the user number with ratio
    train_user = round(total_user * train_size)
    valid_user = total_user - train_user

    # Find the first user which located at validation set
    valid_user_id = total_user_index[train_user]

    # Find the first index of the validation set
    valid_index = df.query('userID == @valid_user_id').index[0]

    # Split the dataset to train and valid
    train = df.iloc[:valid_index]
    valid = df.iloc[valid_index:]

    report = f'''
        There is {total_user} number of users\n
        The ratio of train set is {train_size}\n
        User will be splited to {train_user} for train, {valid_user} for validation\n
        =======================================================================\n
        The first user ID of the validation set is {valid_user_id}\n
        The first index of the validation set is {valid_index}\n
        =======================================================================\n
        Data was be splited to {len(train)} for train, {len(valid)} for validation
    '''
    # print(valid)
    return train, valid

In [6]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
_test.drop(['Timestamp', 'testId', 'answerCode'], axis = 1, inplace = True)
_test['userID'] = _test['userID'].astype('str')
_test['KnowledgeTag'] = _test['KnowledgeTag'].astype('str')
_test['prediction'] = 0
for _ in range(10):
    train_idx, valid_idx = train_valid_split(_train)
    _train_value = train_idx['answerCode']
    train_idx.drop(['Timestamp', 'testId', 'train_valid', 'answerCode'], axis = 1, inplace = True)
    
    valid_value = valid_idx['answerCode']
    valid_idx.drop(['Timestamp', 'testId', 'train_valid', 'answerCode'], axis = 1, inplace = True)   

    # CatBoost에 적용하기 위해선 문자열 데이터로 변환 필요.
    train_idx['userID'] = train_idx['userID'].astype('str')
    train_idx['KnowledgeTag'] = train_idx['KnowledgeTag'].astype('str')

    valid_idx['userID'] = valid_idx['userID'].astype('str')
    valid_idx['KnowledgeTag'] = valid_idx['KnowledgeTag'].astype('str')
    
    cat_model = CatBoostClassifier(learning_rate=0.05, iterations=1000, task_type="GPU")
    cat_model.fit(train_idx, _train_value, early_stopping_rounds=100, cat_features=list(train_idx.columns) ,verbose=500)
    _valid_pred = cat_model.predict_proba(valid_idx)[:,1]
    print(roc_auc_score(valid_value, _valid_pred)) # auc
    print(accuracy_score(valid_value, np.where(_valid_pred >= 0.5, 1, 0))) # acc, 정확도

    # submission 제출하기 위한 코드

    _test_pred = cat_model.predict_proba(_test)[:,1]
    _test['prediction'] += _test_pred
_test['prediction'] /= 10
submission = _test['prediction'].reset_index(drop = True).reset_index()
submission.rename(columns = {'index':'id'}, inplace = True)
submission.to_csv('./output/cat_submission.csv', index = False)

0:	learn: 0.6764593	total: 56.6ms	remaining: 56.6s
500:	learn: 0.4666532	total: 32.1s	remaining: 32s
999:	learn: 0.4650090	total: 1m 2s	remaining: 0us
0.7385637030462846
0.6870424150496661
0:	learn: 0.6764596	total: 57ms	remaining: 56.9s
500:	learn: 0.4667303	total: 33.1s	remaining: 32.9s
999:	learn: 0.4650481	total: 1m 4s	remaining: 0us
0.7373464880567873
0.6861656277168384
0:	learn: 0.6764595	total: 57.4ms	remaining: 57.3s
500:	learn: 0.4667671	total: 33.9s	remaining: 33.7s
999:	learn: 0.4650620	total: 1m 5s	remaining: 0us
0.7364907479112387
0.6856772904681748
0:	learn: 0.6764594	total: 55.5ms	remaining: 55.4s
500:	learn: 0.4667409	total: 32.7s	remaining: 32.6s
999:	learn: 0.4650535	total: 1m 3s	remaining: 0us
0.7387511701784089
0.6876528366104956
0:	learn: 0.6764595	total: 56ms	remaining: 55.9s
500:	learn: 0.4667395	total: 32.3s	remaining: 32.2s
999:	learn: 0.4650201	total: 1m 2s	remaining: 0us
0.7369800439366109
0.6845489354617932
0:	learn: 0.6764594	total: 56.5ms	remaining: 56.5s


In [None]:
#!pip install catboost
# 가상환경에 패키지 설치 필요할듯
from catboost import CatBoostClassifier

# 하이퍼파라미터 수정할 여지가 많으니 추가나 수정하고 싶은 사람 수정해도 괜찮을듯.
# 일단 지난번에 사용했던 파라미터들 주석으로 들고 옴. 참고하면서 공부하면 좋을듯.

# params_cat = {
#     "task_type" : "GPU",
#     "devices" : '0',
#     "random_state": SEED,
#     'learning_rate': 0.04574578205475402, 
#     'bagging_temperature': 0.12172958098369972, 
#     'n_estimators': 2000, 
#     'max_depth': 8, 
#     'random_strength': 28, 
#     'l2_leaf_reg': 1.6285455533915874e-05, 
#     'min_child_samples': 18, 
#     'max_bin': 441, 
#     'od_type': 'Iter',
#     "cat_features" : list(train_ratings.drop(['rating'],axis = 1).columns),
# }

# model = CatBoostRegressor(**params_cat)

# 실제 적용한 모델은 learning_rate, iterations(=n_estimators, 아마도?)만 사용함.
cat_model = CatBoostClassifier(learning_rate=0.05, iterations=1000, task_type="GPU")
cat_model.fit(_train, _train_value, early_stopping_rounds=100, cat_features=list(_train.columns) ,verbose=500)