In [27]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import os
import random
import warnings
warnings.filterwarnings('ignore')
base_feats = ['KnowledgeTag', 'user_correct_answer', 'user_total_answer', 
         'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum']

## 1. 데이터 로딩
data_dir = '/opt/ml/input/data' # 경로
train_file_path = os.path.join(data_dir, 'train_data.csv') # 데이터
test_file_path = os.path.join(data_dir, 'test_data.csv')
df_train = pd.read_csv(train_file_path)
df_test = pd.read_csv(test_file_path)
# df_test = df_test[df_test.answerCode!=-1]  # answer_code -1 제외
df_train.shape, df_test.shape
df = pd.read_csv(os.path.join(data_dir, 'train_test_last2.csv'))

day_dict = {'Tuesday': 0,
 'Thursday': 1,
 'Monday': 2,
 'Saturday': 3,
 'Friday': 4,
 'Wednesday': 5,
 'Sunday': 6}

def feature_engineering(df):
    df2 = df.copy()
    #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    df2.sort_values(by=['userID','Timestamp'], inplace=True)
    
    #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산
    df2['user_correct_answer'] = df2.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df2['user_total_answer'] = df2.groupby('userID')['answerCode'].cumcount()
    df2['user_acc'] = df2['user_correct_answer']/df2['user_total_answer']
    df2['month'] = pd.to_datetime(df2.Timestamp).dt.month
    df2['day'] = pd.to_datetime(df2.Timestamp).dt.day
    df2['hour'] = pd.to_datetime(df2.Timestamp).dt.hour
    df2['dayname'] = pd.to_datetime(df2.Timestamp).dt.day_name().map(day_dict)
    df2['big_category'] = df2.testId.map(lambda x:x[2]).astype(int)
    df2['problem_num'] = df2.assessmentItemID.map(lambda x: int(x[-3:]))
    df2['mid_category'] = df2.testId.map(lambda x: int(x[-3:]))

    # testId와 KnowledgeTag의 전체 정답률은 한번에 계산
    # 아래 데이터는 제출용 데이터셋에 대해서도 재사용
    correct_t = df2.groupby(['testId'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_t.columns = ["test_mean", "test_std", 'test_sum']
    correct_k = df2.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_std', 'tag_sum']

    df2 = pd.merge(df2, correct_t, on=['testId'], how="left")
    df2 = pd.merge(df2, correct_k, on=['KnowledgeTag'], how="left")
    
    return df2

train_fe = feature_engineering(df_train)
test_fe = feature_engineering(df_test)
train_fe.shape, test_fe.shape

((2266586, 6), (260114, 6))

((2266586, 22), (260114, 22))

In [2]:
test = df_test[df_test.answerCode==-1]

In [3]:
df[df.group==6]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,date,...,problem_num,mid_category,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum,group,valid
2266586,3,A050023001,A050000023,1,2020-01-09 10:56:31,2626,,0,,2020-01-09,...,1,23,0.473214,0.500400,106,0.587097,0.493952,91,6,0
2266587,3,A050023002,A050000023,1,2020-01-09 10:56:57,2626,1.0,1,1.000000,2020-01-09,...,2,23,0.473214,0.500400,106,0.587097,0.493952,91,6,0
2266588,3,A050023003,A050000023,0,2020-01-09 10:58:31,2625,2.0,2,1.000000,2020-01-09,...,3,23,0.473214,0.500400,106,0.588517,0.493284,123,6,0
2266589,3,A050023004,A050000023,0,2020-01-09 10:58:36,2625,2.0,3,0.666667,2020-01-09,...,4,23,0.473214,0.500400,106,0.588517,0.493284,123,6,0
2266590,3,A050023006,A050000023,0,2020-01-09 10:58:43,2623,2.0,4,0.500000,2020-01-09,...,6,23,0.473214,0.500400,106,0.581986,0.493803,252,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525951,7439,A040197006,A040000197,1,2020-08-21 07:39:45,2132,7.0,10,0.700000,2020-08-21,...,6,197,0.763158,0.426268,145,0.738903,0.439807,283,6,0
2525952,7439,A040130001,A040000130,0,2020-10-14 23:07:23,8832,8.0,11,0.727273,2020-10-14,...,1,130,0.612440,0.488363,128,0.589666,0.492644,194,6,0
2525953,7439,A040130002,A040000130,1,2020-10-14 23:07:41,8832,8.0,12,0.666667,2020-10-14,...,2,130,0.612440,0.488363,128,0.589666,0.492644,194,6,0
2525954,7439,A040130003,A040000130,1,2020-10-14 23:08:02,8244,9.0,13,0.692308,2020-10-14,...,3,130,0.612440,0.488363,128,0.748344,0.435409,113,6,0


In [4]:
# 사용할 Feature 설정
FEATS = ['KnowledgeTag', 'user_correct_answer', 'user_total_answer',
         'big_category',
         'mid_category',
         'problem_num',
         'month','day','dayname','hour',
         'user_acc',
         'test_mean',
         'test_sum',
         'test_std', 'tag_std',
         'tag_mean',
         'tag_sum',
        ]

params = {
#     "max_depth": 8,  # 8,
#     "min_data_in_leaf": 1000,
#     "bagging_fraction": 0.75,
    "objective": "binary",
#     "boosting": "gbdt",  # dart
#     "learning_rate": 0.01,  # 0.01,
#     "seed": 42,
#     "num_leaves": 80,  # 40,
#     "metric": "auc",
}

In [6]:
def fe(df):
    df2 = df.copy()
    df2['userID'] = df2.userID.map(userid2idx)
    df2['assessmentItemID'] = df2['assessmentItemID'].map(assid2idx)
    df2['testId'] = df2['testId'].map(testid2idx)
    return df2

In [7]:
x = df[df.group==6]
userid2idx = {k:v for v,k in enumerate(sorted(x.userID.unique()))}
assid2idx = {k:v for v,k in enumerate(sorted(x.assessmentItemID.unique()))}
testid2idx = {k:v for v,k in enumerate(sorted(x.testId.unique()))}
x = fe(x)

In [8]:
train = x.drop('answerCode', axis=1)
y_train = x[['answerCode']]

In [19]:
val = x[x.userID != x.userID.shift(-1)]

In [20]:
valid = val.drop('answerCode', axis=1)
y_valid = val[['answerCode']]

In [11]:
test = fe(test)

In [12]:
test.shape, x.shape

((744, 6), (259370, 25))

In [54]:
x[:5]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,date,...,problem_num,mid_category,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum,group,valid
2266586,0,4235,804,1,2020-01-09 10:56:31,2626,,0,,2020-01-09,...,1,23,0.473214,0.5004,106,0.587097,0.493952,91,6,0
2266587,0,4236,804,1,2020-01-09 10:56:57,2626,1.0,1,1.0,2020-01-09,...,2,23,0.473214,0.5004,106,0.587097,0.493952,91,6,0
2266588,0,4237,804,0,2020-01-09 10:58:31,2625,2.0,2,1.0,2020-01-09,...,3,23,0.473214,0.5004,106,0.588517,0.493284,123,6,0
2266589,0,4238,804,0,2020-01-09 10:58:36,2625,2.0,3,0.666667,2020-01-09,...,4,23,0.473214,0.5004,106,0.588517,0.493284,123,6,0
2266590,0,4240,804,0,2020-01-09 10:58:43,2623,2.0,4,0.5,2020-01-09,...,6,23,0.473214,0.5004,106,0.581986,0.493803,252,6,0


In [53]:
test[:10]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
1035,0,4965,914,-1,2020-10-26 13:13:57,5289
1706,1,7748,1306,-1,2020-12-27 02:47:54,9080
3023,2,7484,1271,-1,2020-12-27 04:35:09,9660
4283,3,9381,1526,-1,2020-10-30 05:48:37,2611
4670,4,6231,1109,-1,2020-10-23 11:44:18,1422
5524,5,1967,380,-1,2020-10-22 04:38:45,8097
6608,6,3759,719,-1,2020-10-23 08:24:19,2107
7301,7,3779,723,-1,2020-10-26 09:13:20,2110
8112,8,7847,1319,-1,2020-12-24 21:09:29,9122
9382,9,7748,1306,-1,2020-12-29 04:30:22,9080


In [13]:
set(test.assessmentItemID) - set(x.assessmentItemID)
set(test.testId) - set(x.testId)

set()

set()

In [14]:
FEATS = ['userID',
 'assessmentItemID',
 'testId',
 'KnowledgeTag',
 'user_correct_answer',
 'user_total_answer',
 'big_category',
 'mid_category',
 'problem_num',
 'month',
 'day',
 'dayname',
 'hour',
 'user_acc',
 'test_mean',
 'test_sum',
 'test_std',
 'tag_std',
 'tag_mean',
 'tag_sum']


In [15]:
model = lgb.LGBMClassifier(
    **params,
    n_estimators=3200,
    silent=-1,
)

## 500 epoch로 테스트 데이터로만 학습

In [24]:
model = lgb.LGBMClassifier(
    **params,
    n_estimators=500,
    silent=-1,
)

model.fit(
    X=train[FEATS],
    y=y_train,
    early_stopping_rounds=300,
    eval_set=[(train[FEATS], y_train), (valid[FEATS], y_valid)],
    eval_names=["train", "valid"],
    eval_metric="roc_auc",
    verbose=100,
)

[100]	train's binary_logloss: 0.526652	valid's binary_logloss: 0.586375
[200]	train's binary_logloss: 0.510716	valid's binary_logloss: 0.564206
[300]	train's binary_logloss: 0.498272	valid's binary_logloss: 0.545775
[400]	train's binary_logloss: 0.488178	valid's binary_logloss: 0.53046
[500]	train's binary_logloss: 0.478494	valid's binary_logloss: 0.519541


In [25]:
preds1 = model.predict_proba(valid[FEATS])[:, 1]
acc1 = accuracy_score(y_valid, np.where(preds1 >= 0.5, 1, 0))
auc1 = roc_auc_score(y_valid, preds1)
print(f'VALID AUC : {auc1} ACC : {acc1}\n')

VALID AUC : 0.8268582254961486 ACC : 0.7540322580645161



In [29]:
# LOAD TESTDATA
test_csv_file_path = os.path.join(data_dir, 'test_data.csv')
test_df = pd.read_csv(test_csv_file_path)

# FEATURE ENGINEERING
test_df = feature_engineering(test_df)

# LEAVE LAST INTERACTION ONLY
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# DROP ANSWERCODE
test_df = test_df.drop(['answerCode'], axis=1)

In [33]:
test_df = fe(test_df)

In [34]:
# MAKE PREDICTION
total_preds = model.predict_proba(test_df[FEATS])[:,1]

# SAVE OUTPUT
output_dir = 'output/'
write_path = os.path.join(output_dir, "submission_only_test.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    q = w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        q = w.write('{},{}\n'.format(id,p))

writing prediction : output/submission_only_test.csv


14

19

21

22

21

20

21

21

21

22

21

22

22

22

22

21

22

23

22

22

23

22

23

22

23

22

22

22

22

23

21

23

22

23

22

22

22

21

23

23

22

23

23

23

22

23

22

22

23

22

22

22

22

22

22

22

22

22

22

23

22

22

22

23

23

23

23

22

22

22

22

22

22

23

22

22

23

23

22

22

22

22

22

22

22

22

22

22

23

21

22

23

23

23

22

22

22

23

22

22

22

24

24

23

23

23

23

23

22

24

24

23

23

24

24

23

23

23

24

24

23

24

23

23

24

24

24

23

24

23

24

23

23

22

24

23

23

23

22

23

23

23

23

23

23

22

24

23

23

22

23

23

23

23

23

24

23

24

21

22

22

24

23

23

24

23

24

23

24

23

23

23

23

23

23

24

23

23

23

24

24

22

24

22

22

23

23

23

24

23

24

23

24

23

23

24

23

23

23

23

24

23

23

24

23

23

23

24

23

23

24

24

23

24

23

23

23

23

23

23

23

23

23

24

23

23

23

23

23

23

23

24

23

23

23

23

23

23

24

23

22

23

22

24

23

23

23

24

22

24

22

22

23

23

23

23

23

23

23

23

22

24

23

24

23

24

23

23

23

23

23

24

24

23

24

22

24

23

24

22

23

23

23

23

23

23

24

23

23

23

23

24

24

24

23

24

24

23

24

23

23

22

23

23

23

23

24

23

23

24

23

24

24

24

23

24

23

23

24

23

22

23

23

23

24

23

23

23

24

23

24

23

24

23

23

23

23

25

23

23

22

23

23

23

23

24

23

23

23

23

23

23

23

23

23

23

23

23

23

24

23

23

24

24

22

24

24

23

23

24

24

23

23

23

23

24

23

23

23

23

23

23

23

24

23

23

23

24

23

23

23

24

23

24

23

23

24

22

24

23

24

23

23

24

21

23

24

23

22

24

22

23

24

24

24

23

23

22

23

23

24

23

23

22

23

23

22

23

22

23

23

23

23

23

23

23

23

24

24

22

24

24

24

23

23

23

23

24

23

24

23

23

23

23

23

22

23

23

23

23

24

23

23

23

23

23

23

23

23

24

23

23

24

23

24

23

23

23

23

22

23

23

22

24

24

23

24

24

24

24

24

24

23

22

24

23

24

24

22

24

23

23

23

24

23

23

23

22

23

24

23

23

24

22

23

23

24

23

24

24

23

23

23

23

24

23

23

23

23

23

23

23

24

24

23

24

23

23

23

24

23

23

23

23

23

23

23

23

23

23

23

23

23

23

23

24

23

23

23

24

23

23

23

23

24

23

23

24

23

24

24

23

23

24

22

23

23

24

23

23

24

23

23

23

22

23

23

24

24

23

23

23

23

23

23

23

23

23

23

23

23

23

21

23

23

23

23

24

23

24

23

23

24

23

24

23

23

23

24

24

24

23

24

23

23

23

23

23

23

24

23

23

21

23

24

24

24

23

24

23

24

23

24

24

23

23

24

23

23

23

22

23

23

23

23

24

23

24

23

24

24

23

23

23

23

23

22

24

23

23

23

24

23

23

24

23

23

24

21

24

23

22

24

24

23

23

23

22

23

23

23

24

23

23

23

22

24

23

24

23

24

23

23

23

23

23

24

23

24

23

24

23

23

23

23

24

22

22

23

24

24

23

23

24

23

24

23

23

23

23

24

23

24

24

24

24

23

23

24

23

23

23

23

22

23

## 1200 epoch로 테스트 데이터로만 학습

In [35]:
model = lgb.LGBMClassifier(
    **params,
    n_estimators=1200,
    silent=-1,
)

model.fit(
    X=train[FEATS],
    y=y_train,
    early_stopping_rounds=300,
    eval_set=[(train[FEATS], y_train), (valid[FEATS], y_valid)],
    eval_names=["train", "valid"],
    eval_metric="roc_auc",
    verbose=100,
)

[100]	train's binary_logloss: 0.526652	valid's binary_logloss: 0.586375
[200]	train's binary_logloss: 0.510716	valid's binary_logloss: 0.564206
[300]	train's binary_logloss: 0.498272	valid's binary_logloss: 0.545775
[400]	train's binary_logloss: 0.488178	valid's binary_logloss: 0.53046
[500]	train's binary_logloss: 0.478494	valid's binary_logloss: 0.519541
[600]	train's binary_logloss: 0.470161	valid's binary_logloss: 0.511292
[700]	train's binary_logloss: 0.46297	valid's binary_logloss: 0.50426
[800]	train's binary_logloss: 0.456311	valid's binary_logloss: 0.4955
[900]	train's binary_logloss: 0.449875	valid's binary_logloss: 0.486935
[1000]	train's binary_logloss: 0.443391	valid's binary_logloss: 0.479238
[1100]	train's binary_logloss: 0.437531	valid's binary_logloss: 0.470946
[1200]	train's binary_logloss: 0.432351	valid's binary_logloss: 0.463831


In [36]:
preds1 = model.predict_proba(valid[FEATS])[:, 1]
acc1 = accuracy_score(y_valid, np.where(preds1 >= 0.5, 1, 0))
auc1 = roc_auc_score(y_valid, preds1)
print(f'VALID AUC : {auc1} ACC : {acc1}\n')

VALID AUC : 0.8742529425104058 ACC : 0.8091397849462365



In [37]:
# LOAD TESTDATA
test_csv_file_path = os.path.join(data_dir, 'test_data.csv')
test_df = pd.read_csv(test_csv_file_path)

# FEATURE ENGINEERING
test_df = feature_engineering(test_df)

# LEAVE LAST INTERACTION ONLY
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# DROP ANSWERCODE
test_df = test_df.drop(['answerCode'], axis=1)

In [38]:
test_df = fe(test_df)

In [39]:
# MAKE PREDICTION
total_preds = model.predict_proba(test_df[FEATS])[:,1]

# SAVE OUTPUT
output_dir = 'output/'
write_path = os.path.join(output_dir, "submission_only_test_1200_epoch.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    q = w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        q = w.write('{},{}\n'.format(id,p))

writing prediction : output/submission_only_test_1200_epoch.csv


## 앙상블 해보기

In [42]:
!readlink -ef ../ensemble/input

/opt/ml/input/code/ensemble/input


In [46]:
!python ../ensemble/ensemble.py --ENSEMBLE_FILES submission_lgbm_add_time_test_tag_std_big_mid_problem_number_3200epoch_valauc_7718,submission_only_test_1200_epoch --ENSEMBLE_WEIGHT 0.5,0.5 --RESULT_PATH /opt/ml/input/code/ensemble/input/

> /opt/ml/input/code/ensemble/ensemble.py(89)<module>()
-> main(args)
(Pdb) 
--KeyboardInterrupt--
(Pdb) 

## 1600 epoch로 테스트 데이터로만 학습

In [47]:
model = lgb.LGBMClassifier(
    **params,
    n_estimators=1600,
    silent=-1,
)

model.fit(
    X=train[FEATS],
    y=y_train,
    early_stopping_rounds=300,
    eval_set=[(train[FEATS], y_train), (valid[FEATS], y_valid)],
    eval_names=["train", "valid"],
    eval_metric="roc_auc",
    verbose=100,
)

[100]	train's binary_logloss: 0.526652	valid's binary_logloss: 0.586375
[200]	train's binary_logloss: 0.510716	valid's binary_logloss: 0.564206
[300]	train's binary_logloss: 0.498272	valid's binary_logloss: 0.545775
[400]	train's binary_logloss: 0.488178	valid's binary_logloss: 0.53046
[500]	train's binary_logloss: 0.478494	valid's binary_logloss: 0.519541
[600]	train's binary_logloss: 0.470161	valid's binary_logloss: 0.511292
[700]	train's binary_logloss: 0.46297	valid's binary_logloss: 0.50426
[800]	train's binary_logloss: 0.456311	valid's binary_logloss: 0.4955
[900]	train's binary_logloss: 0.449875	valid's binary_logloss: 0.486935
[1000]	train's binary_logloss: 0.443391	valid's binary_logloss: 0.479238
[1100]	train's binary_logloss: 0.437531	valid's binary_logloss: 0.470946
[1200]	train's binary_logloss: 0.432351	valid's binary_logloss: 0.463831
[1300]	train's binary_logloss: 0.427109	valid's binary_logloss: 0.459243
[1400]	train's binary_logloss: 0.422209	valid's binary_logloss: 0

In [48]:
preds1 = model.predict_proba(valid[FEATS])[:, 1]
acc1 = accuracy_score(y_valid, np.where(preds1 >= 0.5, 1, 0))
auc1 = roc_auc_score(y_valid, preds1)
print(f'VALID AUC : {auc1} ACC : {acc1}\n')

VALID AUC : 0.8878371359809221 ACC : 0.8158602150537635



In [49]:
# LOAD TESTDATA
test_csv_file_path = os.path.join(data_dir, 'test_data.csv')
test_df = pd.read_csv(test_csv_file_path)

# FEATURE ENGINEERING
test_df = feature_engineering(test_df)

# LEAVE LAST INTERACTION ONLY
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# DROP ANSWERCODE
test_df = test_df.drop(['answerCode'], axis=1)

In [50]:
test_df = fe(test_df)

In [51]:
# MAKE PREDICTION
total_preds = model.predict_proba(test_df[FEATS])[:,1]

# SAVE OUTPUT
output_dir = 'output/'
write_path = os.path.join(output_dir, "submission_only_test_1600_epoch.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    q = w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        q = w.write('{},{}\n'.format(id,p))

writing prediction : output/submission_only_test_1600_epoch.csv
