# LGBM을 활용한 베이스라인

In [35]:
import pandas as pd
import os
import random

## 1. 데이터 로딩

In [2]:
data_dir = '/opt/ml/input/data/train_dataset'
csv_file_path = os.path.join(data_dir, 'train_data.csv')
df = pd.read_csv(csv_file_path) 

## 2. Feature Engineering

In [39]:
df = df.fillna(0)
df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean_x,...,tag_mean_x,tag_sum_x,test_mean_y,test_sum_y,tag_mean_y,tag_sum_y,test_mean,test_sum,tag_mean,tag_sum
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,0.0,0,0.0,0.947683,...,0.955022,637,0.947683,1268,0.955022,637,0.947683,1268,0.955022,637
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.0,0.947683,...,0.913187,3040,0.947683,1268,0.913187,3040,0.947683,1268,0.913187,3040
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.0,0.947683,...,0.913187,3040,0.947683,1268,0.913187,3040,0.947683,1268,0.913187,3040
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.0,0.947683,...,0.913187,3040,0.947683,1268,0.913187,3040,0.947683,1268,0.913187,3040
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.0,0.947683,...,0.913187,3040,0.947683,1268,0.913187,3040,0.947683,1268,0.913187,3040


In [3]:
def feature_engineering(df):
    
    #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    df.sort_values(by=['userID','Timestamp'], inplace=True)
    
    #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer']/df['user_total_answer']

    # testId와 KnowledgeTag의 전체 정답률은 한번에 계산
    # 아래 데이터는 제출용 데이터셋에 대해서도 재사용
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    correct_t.columns = ["test_mean", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_sum']

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    
    return df

In [33]:
df = feature_engineering(df)

In [34]:
df[df['KnowledgeTag'] ==5289]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean_x,...,tag_mean_x,tag_sum_x,test_mean_y,test_sum_y,tag_mean_y,tag_sum_y,test_mean,test_sum,tag_mean,tag_sum
1930,2,A050131003,A050000131,0,2020-10-08 12:39:27,5289,159.0,252,0.630952,0.441158,...,0.560703,1658,0.441158,701,0.560703,1658,0.441158,701,0.560703,1658
13406,20,A050125003,A050000125,0,2020-10-15 07:48:15,5289,403.0,684,0.589181,0.549937,...,0.560703,1658,0.549937,870,0.560703,1658,0.549937,870,0.560703,1658
13407,20,A050125004,A050000125,0,2020-10-15 07:48:20,5289,403.0,685,0.588321,0.549937,...,0.560703,1658,0.549937,870,0.560703,1658,0.549937,870,0.560703,1658
13409,20,A050125006,A050000125,0,2020-10-15 07:48:29,5289,403.0,687,0.586608,0.549937,...,0.560703,1658,0.549937,870,0.560703,1658,0.549937,870,0.560703,1658
13996,21,A050126001,A050000126,0,2020-06-04 08:31:09,5289,446.0,556,0.802158,0.582908,...,0.560703,1658,0.582908,914,0.560703,1658,0.582908,914,0.560703,1658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2262210,7185,A050126005,A050000126,0,2020-10-23 12:16:53,5289,3.0,19,0.157895,0.582908,...,0.560703,1658,0.582908,914,0.560703,1658,0.582908,914,0.560703,1658
2263655,7265,A050133003,A050000133,1,2020-10-07 18:03:34,5289,3.0,10,0.300000,0.653970,...,0.560703,1658,0.653970,1219,0.560703,1658,0.653970,1219,0.560703,1658
2263656,7265,A050133004,A050000133,1,2020-10-07 18:07:26,5289,4.0,11,0.363636,0.653970,...,0.560703,1658,0.653970,1219,0.560703,1658,0.653970,1219,0.560703,1658
2263659,7265,A050133007,A050000133,0,2020-10-07 18:15:01,5289,7.0,14,0.500000,0.653970,...,0.560703,1658,0.653970,1219,0.560703,1658,0.653970,1219,0.560703,1658


## 3. Train/Test 데이터 셋 분리

In [5]:
# train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
random.seed(42)
def custom_train_test_split(df, ratio=0.7, split=True):
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))

    random.shuffle(users)
    
    max_train_data_len = ratio*len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)


    train = df[df['userID'].isin(user_ids)]
    test = df[df['userID'].isin(user_ids) == False]

    #test데이터셋은 각 유저의 마지막 interaction만 추출
    test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

In [6]:
# 유저별 분리
train, test = custom_train_test_split(df)

# 사용할 Feature 설정
FEATS = ['KnowledgeTag', 'user_correct_answer', 'user_total_answer', 
         'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum']

# X, y 값 분리
y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1)

y_test = test['answerCode']
test = test.drop(['answerCode'], axis=1)

In [7]:
# !pip install lightgbm

In [8]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np

In [9]:
# python lightgbm을 사용할 땐 데이터 변환이 필요
lgb_train = lgb.Dataset(train[FEATS], y_train)
lgb_test = lgb.Dataset(test[FEATS], y_test)

## 4. 훈련 및 검증

In [10]:
model = lgb.train(
                    {'objective': 'binary'}, 
                    lgb_train,
                    valid_sets=[lgb_train, lgb_test],
                    verbose_eval=100,
                    num_boost_round=500,
                    early_stopping_rounds=100
                )

preds = model.predict(test[FEATS])
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')

[LightGBM] [Info] Number of positive: 1039062, number of negative: 547095
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2032
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655081 -> initscore=0.641451
[LightGBM] [Info] Start training from score 0.641451
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.559559	valid_1's binary_logloss: 0.680841
[200]	training's binary_logloss: 0.556752	valid_1's binary_logloss: 0.679192
[300]	training's binary_logloss: 0.55447	valid_1's binary_logloss: 0.678525
[400]	training's binary_logloss: 0.552651	valid_1's binary_logloss: 0.677655
[500]	training's binary_logloss: 0.550829	valid_1's binary_logloss: 0.677497
Did not meet early stopping. Best iteration is:
[500]	training's binary_logloss: 0.550829	valid_1'

In [None]:
# INSTALL MATPLOTLIB IN ADVANCE
# _ = lgb.plot_importance(model)

## 5. Inference

In [26]:
# LOAD TESTDATA
test_csv_file_path = os.path.join(data_dir, 'test_data.csv')
test_df = pd.read_csv(test_csv_file_path)

# FEATURE ENGINEERING
test_df = feature_engineering(test_df)

# LEAVE LAST INTERACTION ONLY
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# # DROP ANSWERCODE
# test_df = test_df.drop(['answerCode'], axis=1)

In [31]:
test_df[test_df['answerCode'] <0]
# test_df[test_df['KnowledgeTag'] == 9125]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum
1035,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289,717.0,1035,0.692754,0.661765,90,0.542662,159
1706,4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080,465.0,670,0.694030,0.740385,77,0.565693,155
3023,13,A070111008,A070000111,-1,2020-12-27 04:35:09,9660,915.0,1316,0.695289,0.417857,117,0.446753,172
4283,17,A090064006,A090000064,-1,2020-10-30 05:48:37,2611,1031.0,1259,0.818904,0.625000,30,0.514286,36
4670,26,A060135007,A060000135,-1,2020-10-23 11:44:18,1422,293.0,386,0.759067,0.678571,133,0.602767,305
...,...,...,...,...,...,...,...,...,...,...,...,...,...
260052,7395,A040122005,A040000122,-1,2020-09-08 02:05:20,10615,7.0,23,0.304348,0.753846,147,0.654902,167
260067,7404,A030111005,A030000111,-1,2020-10-13 09:49:18,7636,7.0,14,0.500000,0.866667,156,0.834661,419
260082,7416,A050193004,A050000193,-1,2020-10-04 02:44:41,10402,7.0,14,0.500000,0.750000,75,0.792517,233
260097,7417,A050193004,A050000193,-1,2020-09-06 13:09:15,10402,2.0,14,0.142857,0.750000,75,0.792517,233


In [None]:
# MAKE PREDICTION
total_preds = model.predict(test_df[FEATS])

In [None]:
# SAVE OUTPUT
output_dir = 'output/'
write_path = os.path.join(output_dir, "output.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)    
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write('{},{}\n'.format(id,p))