In [18]:
import pandas as pd
import numpy as np
import os
import random

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score


In [19]:
path = '/opt/ml/input/data/'
# The raw splits were read directly in an earlier iteration (kept for reference):
# # train = pd.read_csv(path + 'train_data.csv')
# # test = pd.read_csv(path + 'test_data.csv')

# Load the feature-engineered table, order it by user and timestamp, and add
# the helper column `tem` (all zeros) that the split step below writes to.
dat = (
    pd.read_csv(path + 'FE/FE_total.csv')
    .sort_values(by=['userID', 'Timestamp'])
    .assign(tem=0)
)

In [20]:
# Rows with a known answer (answerCode >= 0) form the history / training pool;
# rows marked with answerCode == -1 are the ones a prediction is needed for.
_train = dat.loc[dat['answerCode'].ge(0)]
_test = dat.loc[dat['answerCode'].eq(-1)]

# Data augmentation: peel off each user's last row as a held-out sample.
def data_argument(train):
    """Split ``train`` into (history, held-out last row per user).

    The input is not modified. Its index is discarded and replaced by a fresh
    0..n-1 range before splitting, so both returned frames are indexed by row
    position in ``train``.

    "Last" means last in the row order of ``train``; no sorting happens here,
    so the caller is responsible for ordering rows chronologically per user.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a ``userID`` column. A pre-existing ``tem`` column is
        optional and is overwritten.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``history``  - every row except each user's last one (``tem == 0``).
        ``held_out`` - exactly one row per user, the last one (``tem == -1``).
    """
    df = train.reset_index(drop=True)  # new frame; the caller's data is left untouched

    # True on the final occurrence of every userID.
    is_last = ~df.duplicated(subset='userID', keep='last')

    # Rewrite the marker column from scratch instead of relying on the caller
    # having initialised it to 0 (a missing or stale `tem` used to break the split).
    df['tem'] = np.where(is_last, -1, 0)

    return df[~is_last], df[is_last]

# Apply the split four times in a row: each round removes the users' latest
# remaining row from the history and keeps it as a held-out sample.
_folds = []
_history = _train
for _round in range(4):
    _history, _held_out = data_argument(_history)
    _folds.append((_history, _held_out))

# Round 1 yields the validation split; rounds 2-4 yield the training samples.
(
    (_train_x, _valid),
    (_train_x_1, _train_1),
    (_train_x_2, _train_2),
    (_train_x_3, _train_3),
) = _folds
del _folds, _history, _held_out, _round

In [21]:
def _first_mode(values):
    """Return the smallest most-frequent value of ``values``, or NaN if there is none."""
    modes = values.mode()
    return modes.iat[0] if len(modes) else np.nan


def data_merge(_train_x, _train_y):
    """Attach user-level and item-level aggregate features to the target rows.

    Parameters
    ----------
    _train_x : pandas.DataFrame
        History table the statistics are computed from. Reads the columns
        ``userID``, ``assessmentItemID``, ``answerCode``, ``solve_time`` and
        ``KnowledgeTag``.
    _train_y : pandas.DataFrame
        Rows to build features for (the user/item pairs to predict). Only
        ``userID``, ``assessmentItemID`` and ``answerCode`` are used.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``_train_y`` with the columns ``answerCode``,
        ``answer_mean``, ``answer_cnt``, ``time_mean``, ``tag_mode``,
        ``last3_mean``, ``item_mean`` and ``item_sum``. Users or items absent
        from ``_train_x`` get NaN in the corresponding feature columns.
    """
    # --- user-level features -------------------------------------------------
    by_user = _train_x.groupby('userID')
    user_df = by_user.agg(
        answer_mean=('answerCode', 'mean'),
        answer_cnt=('answerCode', 'count'),
        time_mean=('solve_time', 'mean'),
    )
    # Most frequent tag per user; ties resolve to the smallest tag because
    # Series.mode() returns its values sorted. This works for any tag dtype,
    # unlike a comparison against the string "<class 'numpy.int64'>".
    user_df['tag_mode'] = by_user['KnowledgeTag'].agg(_first_mode)
    # Mean answer over the last three history rows of each user (in frame order).
    user_df['last3_mean'] = (
        _train_x[['userID', 'answerCode']]
        .groupby('userID').tail(3)
        .groupby('userID')['answerCode'].mean()
    )
    user_df = user_df.reset_index()

    # --- item-level features -------------------------------------------------
    correct_k = (
        _train_x.groupby('assessmentItemID')['answerCode']
        .agg(item_mean='mean', item_sum='sum')
        .reset_index()
    )

    # Left joins keep every target row even when nothing is known about its
    # user or item.
    _train_y = pd.merge(_train_y[['userID', 'assessmentItemID', 'answerCode']], user_df, on=['userID'], how='left')
    _train_y = pd.merge(_train_y, correct_k, on=['assessmentItemID'], how='left')

    # The join keys are identifiers, not features.
    return _train_y.drop(['userID', 'assessmentItemID'], axis=1)

# Build one feature table per (history, target) pair. Each target set is
# paired with the history that remains after it was split off.
test, valid, train_1, train_2, train_3 = (
    data_merge(history, target)
    for history, target in (
        (_train, _test),
        (_train_x, _valid),
        (_train_x_1, _train_1),
        (_train_x_2, _train_2),
        (_train_x_3, _train_3),
    )
)

In [22]:
# Stack the three augmented feature tables into a single training set.
train = pd.concat([train_1, train_2, train_3])

# Separate features from the label once, for both splits.
train_features, train_labels = train.drop(columns=['answerCode']), train['answerCode']
valid_features, valid_labels = valid.drop(columns=['answerCode']), valid['answerCode']

model = LGBMClassifier()
model.fit(train_features, train_labels)

# Score on the training data (probability of class 1, thresholded at 0.5 for accuracy).
preds = model.predict_proba(train_features)[:, 1]
acc = accuracy_score(train_labels, (preds >= 0.5).astype(int))
auc = roc_auc_score(train_labels, preds)
print(f'TRAIN AUC : {auc} ACC : {acc}\n')

# Score on the held-out validation rows.
preds = model.predict_proba(valid_features)[:, 1]
acc = accuracy_score(valid_labels, (preds >= 0.5).astype(int))
auc = roc_auc_score(valid_labels, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')
# Validation scores are lower than training scores; the validation score is
# almost identical to the leaderboard score.
# The model overfits the training data => tune regularising hyper-parameters
# and possibly add more data augmentation?

TRAIN AUC : 0.867752911350715 ACC : 0.7841530054644809

VALID AUC : 0.7850297282254111 ACC : 0.7141897339424885



In [23]:
# Build and write the submission file.

# `prediction` is added to `test` below, so exclude it here as well; otherwise
# re-running this cell would pass the previous predictions to the model as an
# extra feature column.
test_features = test.drop(columns=['answerCode', 'prediction'], errors='ignore')
test_pred = model.predict_proba(test_features)[:, 1]
test['prediction'] = test_pred  # kept so the predictions remain available on `test`

# One row per test sample: a running id starting at 0 and the predicted
# probability of class 1.
submission = pd.DataFrame({'id': np.arange(len(test_pred)), 'prediction': test_pred})

submission_path = '../output/sequence_LGBM_submission3.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)  # to_csv does not create directories
submission.to_csv(submission_path, index = False)