# Feature Engineering
메모리 (Memory)
순서 데이터(Sequence Data)가 주어질 경우 순서 모델(Sequence Model, [ex: LSTM, Transformer, etc])을 사용하며 이 모델은 메모리(Memory)를 내장하고 있어 이전/이후에 입력된 데이터들을 기억하고 참조한다.

주어진 데이터 이전/이후의 데이터들을 포함하는 메모리(Memory)를 feature로 포함시킴으로서 순서 모델(Sequence Model)를 사용하지 않고 일반적인 지도 학습 모델(ex: Light GBM, NN)을 사용하여 훈련할 수 있다

이동 (Shift)
누적 합 (Cumsum)
누적 총계 (Cumcount)
이동 평균 (Rolling Mean)

## 1.라이브러리 세팅

In [3]:
import numpy as np
import pandas as pd
import random
import os
import warnings
warnings.filterwarnings("ignore")

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [9]:
%%time
dtype = {
    'userID': 'int16',
    'answerCode': 'int8',
    'KnowledgeTag': 'int16'
}   

# 데이터 경로 맞춰주세요!
# 혹시 코랩환경을 사용하신다면 왼쪽 폴더모양 아이콘을 눌러 "train_data.csv"를 드래그&드롭으로 업로드한 후 사용해주세요
DATA_PATH = '/opt/ml/input/data/'
df = pd.read_csv(DATA_PATH+'train_data.csv' , dtype=dtype, parse_dates=['Timestamp'])

CPU times: user 2.77 s, sys: 0 ns, total: 2.77 s
Wall time: 2.77 s


## 2.피쳐 이름 변경

In [10]:
day_dict = {'Tuesday': 0,
 'Thursday': 1,
 'Monday': 2,
 'Saturday': 3,
 'Friday': 4,
 'Wednesday': 5,
 'Sunday': 6}

def feature_engineering(df):
    df = df.copy()
    #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    df.sort_values(by=['userID','Timestamp'], inplace=True)
    
    #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer']/df['user_total_answer']
    
    # 문제 푸는데 걸린 시간
    # 10분이상 시간소요는 새로운 문제집을 시작한 것으로 판단
    diff = df.loc[:, ['userID', 'Timestamp']].groupby('userID').diff().fillna(pd.Timedelta(seconds=0))
    diff = diff.fillna(pd.Timedelta(seconds=0))
    diff = diff['Timestamp'].apply(lambda x: x.total_seconds())
    df['elapsed'] = diff
    df['elapsed'] = df['elapsed'].apply(lambda x: 0 if x>= 600 else x)
    # 문제 푸는데 걸린 누적 시간
    df['elapsed_cumsum'] = df.groupby('userID')['elapsed'].cumsum()

    #문제 푸는데 걸린 시간의 중앙값
    elapsed_med = df.groupby('userID')['elapsed'].agg(['median'])
    elapsed_med.columns = ['elapsed_med']
    

    #시간 쪼개기 + 요일
    df['month'] = pd.to_datetime(df.Timestamp).dt.month
    df['day'] = pd.to_datetime(df.Timestamp).dt.day
    df['hour'] = pd.to_datetime(df.Timestamp).dt.hour
    df['dayname'] = pd.to_datetime(df.Timestamp).dt.day_name().map(day_dict)
    
    # testId와 KnowledgeTag의 전체 정답률은 한번에 계산
    # 아래 데이터는 제출용 데이터셋에 대해서도 재사용
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_t.columns = ["test_mean", "test_std", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_std', 'tag_sum']

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    df = pd.merge(df, elapsed_med, on =['userID'], how = 'left')
    df.fillna(0,inplace = True)
    return df

In [11]:
df2 = feature_engineering(df)
df2.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,elapsed,...,day,hour,dayname,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum,elapsed_med
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,0.0,0,0.0,0.0,...,24,0,0,0.947683,0.222749,1268,0.955022,0.20741,637,14.0
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.0,3.0,...,24,0,0,0.947683,0.222749,1268,0.913187,0.281603,3040,14.0
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.0,8.0,...,24,0,0,0.947683,0.222749,1268,0.913187,0.281603,3040,14.0
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.0,7.0,...,24,0,0,0.947683,0.222749,1268,0.913187,0.281603,3040,14.0
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.0,7.0,...,24,0,0,0.947683,0.222749,1268,0.913187,0.281603,3040,14.0


## 3. Train/Test 데이터 셋 분리

In [22]:
# train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
random.seed(42)
def custom_train_test_split(df, ratio=0.7, split=True):
    
    """
    사용자 기준으로 train_test_split
    """
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    # users.sort(key=lambda x: x[0])
    random.shuffle(users)
    
    max_train_data_len = ratio*len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)


    train = df[df['userID'].isin(user_ids)]
    test = df[df['userID'].isin(user_ids) == False]

    #test데이터셋은 각 유저의 마지막 interaction만 추출
    test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

In [23]:
# 유저별 분리
train, test = custom_train_test_split(df2)

# 사용할 Feature 설정
FEATS = ['KnowledgeTag', 
         'user_correct_answer', 
         'user_total_answer', 
         'user_acc',
         'test_mean', 
         'test_sum', 
         'tag_mean',
         'tag_sum',
         'elapsed',
         'elapsed_cumsum',
         'month',
         'day',
         'hour',
         'dayname',
         'elapsed_med'
        
        ]

# X, y 값 분리
y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1)

y_test = test['answerCode']
test = test.drop(['answerCode'], axis=1)

In [24]:
test

Unnamed: 0,userID,assessmentItemID,testId,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,elapsed,elapsed_cumsum,...,day,hour,dayname,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum,elapsed_med
744,0,A080129006,A080000129,2020-12-23 03:40:19,2725,470.0,744,0.631720,5.0,23563.0,...,23,3,5,0.360111,0.480143,780,0.301939,0.459258,436,14.0
2786,5,A080138007,A080000138,2020-12-11 22:48:28,8431,662.0,832,0.795673,0.0,53131.0,...,11,22,4,0.504405,0.500118,916,0.491346,0.500166,511,34.0
3707,6,A030145005,A030000145,2020-10-26 09:52:14,7817,408.0,920,0.443478,24.0,36391.0,...,26,9,2,0.628679,0.483340,833,0.617073,0.486160,2530,10.0
4519,7,A090052006,A090000052,2020-11-02 02:02:59,2600,492.0,811,0.606658,4.0,44300.0,...,2,2,2,0.448326,0.497504,616,0.350352,0.477290,398,15.0
5922,9,A070158008,A070000158,2020-12-28 21:08:31,9797,590.0,866,0.681293,36.0,46934.0,...,28,21,2,0.680398,0.466654,479,0.537484,0.498674,1649,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266199,7415,A020171006,A020000171,2020-10-26 05:17:20,8131,4.0,15,0.266667,15.0,508.0,...,26,5,2,0.721631,0.448328,1221,0.696643,0.459766,2781,13.0
2266277,7421,A040161007,A040000161,2020-10-22 06:33:19,2118,12.0,17,0.705882,13.0,1207.0,...,22,6,1,0.600856,0.489853,1123,0.703704,0.456800,912,19.0
2266381,7427,A040187005,A040000187,2020-10-15 02:53:08,2129,15.0,19,0.789474,13.0,701.0,...,15,2,1,0.955474,0.206335,1309,0.927544,0.259276,3546,26.5
2266469,7432,A060177007,A060000177,2020-10-26 08:09:52,1578,8.0,18,0.444444,107.0,879.0,...,26,8,2,0.720437,0.448938,1054,0.660964,0.473751,425,12.0


In [25]:
lgb_train = lgb.Dataset(train[FEATS], y_train)
lgb_test = lgb.Dataset(test[FEATS], y_test)

## 4. 훈련 및 검증

In [26]:
model = lgb.train(
    {'objective': 'binary'}, 
    lgb_train,
    valid_sets=[lgb_train,lgb_test],
    verbose_eval=100,
    early_stopping_rounds=100,
    num_boost_round=500
)

preds = model.predict(test[FEATS])
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')

[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2741
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.521947	valid_1's binary_logloss: 0.625212
[200]	training's binary_logloss: 0.51749	valid_1's binary_logloss: 0.62137
[300]	training's binary_logloss: 0.513875	valid_1's binary_logloss: 0.620565
[400]	training's binary_logloss: 0.510876	valid_1's binary_logloss: 0.619878
[500]	training's binary_logloss: 0.508285	valid_1's binary_logloss: 0.619564
Did not meet early stopping. Best iteration is:
[500]	training's binary_logloss: 0.508285	valid_1'

[baseline]
- VALID AUC : 0.6868241736037339 ACC : 0.587942202291978
        
[문제풀이 시간 추가]
- VALID AUC : 0.7395128748367014 ACC : 0.6507224713502741

[누적 풀이시간 추가] 성능하락 → 당시에 문제푸는 시간이 얼마나 걸렸느냐가 중요하지 이때까지 몇시간동안 공부했냐는 안중요한 듯?

- VALID AUC : 0.7390277302636187 ACC : 0.6522172396611858

[시간 쪼개기] 미세한 성능 향상

- VALID AUC : 0.740232589311615 ACC : 0.6542102640757349

### Ver2 - valid_set 고정
[baseline]
- VALID AUC : 0.6916288131757744 ACC : 0.5795076513639388

[문제 풀이 시간 추가]
- VALID AUC : 0.747883430697164 ACC : 0.6544688400975827

[문제 풀이 시간 + 누적 시간]
- VALID AUC : 0.7455611673251767 ACC : 0.6551341760922599

[문제풀이시간 + 풀이시간_median]

- VALID AUC : 0.744333015051247 ACC : 0.6540252827677977

[문제풀이시간 + 날짜 쪼개기]

- VALID AUC : 0.7413914025989795 ACC : 0.6535817254380129

[전부 추가]

- VALID AUC : 0.7411653066743833 ACC : 0.6522510534486582

'''흐음'''

## Inference

In [27]:
# LOAD TESTDATA
test_df = pd.read_csv(DATA_PATH+'/'+'test_data.csv' , dtype=dtype, parse_dates=['Timestamp'])
test_df = test_df.sort_values(by=['userID', 'Timestamp','assessmentItemID']).reset_index(drop=True)
# test_csv_file_path = os.path.join(DATA_PATH, 'test_data.csv')
# test_df = pd.read_csv(test_csv_file_path)

# FEATURE ENGINEERING
test_df = feature_engineering(test_df)

# LEAVE LAST INTERACTION ONLY
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# # DROP ANSWERCODE
# test_df = test_df.drop(['answerCode'], axis=1)

In [28]:
test_df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,elapsed,...,day,hour,dayname,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum,elapsed_med
1035,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289,717.0,1035,0.692754,46.0,...,26,13,2,0.661765,0.490209,90,0.542662,0.505845,159,25.0
1706,4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080,465.0,670,0.694030,23.0,...,27,2,6,0.740385,0.539601,77,0.565693,0.552442,155,38.0
3023,13,A070111008,A070000111,-1,2020-12-27 04:35:09,9660,915.0,1316,0.695289,8.0,...,27,4,6,0.417857,0.501291,117,0.446753,0.518307,172,20.0
4283,17,A090064006,A090000064,-1,2020-10-30 05:48:37,2611,1031.0,1259,0.818904,75.0,...,30,5,4,0.625000,0.530957,30,0.514286,0.531415,36,34.5
4670,26,A060135007,A060000135,-1,2020-10-23 11:44:18,1422,293.0,386,0.759067,17.0,...,23,11,4,0.678571,0.479048,133,0.602767,0.493836,305,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260052,7395,A040122005,A040000122,-1,2020-09-08 02:05:20,10615,7.0,23,0.304348,2.0,...,8,2,0,0.753846,0.443653,147,0.654902,0.484530,167,2.5
260067,7404,A030111005,A030000111,-1,2020-10-13 09:49:18,7636,7.0,14,0.500000,107.0,...,13,9,0,0.866667,0.356895,156,0.834661,0.377186,419,15.0
260082,7416,A050193004,A050000193,-1,2020-10-04 02:44:41,10402,7.0,14,0.500000,24.0,...,4,2,6,0.750000,0.479372,75,0.792517,0.446234,233,14.0
260097,7417,A050193004,A050000193,-1,2020-09-06 13:09:15,10402,2.0,14,0.142857,21.0,...,6,13,6,0.750000,0.479372,75,0.792517,0.446234,233,21.0


In [29]:
# MAKE PREDICTION
total_preds = model.predict(test_df[FEATS])

In [30]:
total_preds

array([0.77238646, 0.84119066, 0.35362747, 0.85127323, 0.70782224,
       0.7811658 , 0.33514096, 0.68813068, 0.1286457 , 0.94885811,
       0.74834478, 0.80742716, 0.94849761, 0.64587623, 0.81047939,
       0.92640262, 0.22172223, 0.90543599, 0.76135094, 0.64793502,
       0.82486558, 0.52503033, 0.58689051, 0.68327898, 0.29318806,
       0.8557476 , 0.88095462, 0.89398695, 0.61364283, 0.84985637,
       0.70189476, 0.72417684, 0.73331087, 0.21942133, 0.66054904,
       0.80169934, 0.82646719, 0.60562091, 0.67542135, 0.29101715,
       0.68981475, 0.2393136 , 0.22120175, 0.64339729, 0.50338169,
       0.7890489 , 0.8236402 , 0.21319412, 0.93906127, 0.60909706,
       0.64538714, 0.20146733, 0.67456561, 0.31930596, 0.69676991,
       0.77754366, 0.61267871, 0.92807227, 0.57832512, 0.23159641,
       0.79565966, 0.89768398, 0.41721787, 0.26647983, 0.33843439,
       0.52228815, 0.7637267 , 0.28501054, 0.21011674, 0.68933458,
       0.78173827, 0.84169592, 0.74171795, 0.42896503, 0.75117

In [32]:
# SAVE OUTPUT
output_dir = '/opt/ml/input/output/LGBM'
write_path = os.path.join(output_dir, "submission.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write('{},{}\n'.format(id,p))

writing prediction : /opt/ml/input/output/LGBM/submission.csv
