# Feature Engineering

## 1. 라이브러리 세팅

In [155]:
import numpy as np
import pandas as pd
import random
import os
import warnings
warnings.filterwarnings("ignore")

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

import pdb


In [82]:
%%time
# Explicit narrow integer dtypes for the id / label / tag columns.
dtype = dict(userID='int16', answerCode='int8', KnowledgeTag='int16')

# Point DATA_PATH at your own data directory.
# On Colab: click the folder icon on the left, drag & drop "train_data.csv"
# to upload it, then use the uploaded location here.
DATA_PATH = '/opt/ml/input/data/'
train_csv = DATA_PATH + 'train_data.csv'
df = pd.read_csv(
    train_csv,
    parse_dates=['Timestamp'],
    dtype=dtype,
)

CPU times: user 2.47 s, sys: 136 ms, total: 2.61 s
Wall time: 2.61 s


## 2. 피쳐 엔지니어링

In [180]:
def feature_engineering(df, session_gap_seconds=600):
    """Add user, category, test and tag level features to an interaction log.

    Args:
        df: DataFrame with at least the columns ``userID``, ``testId``,
            ``answerCode``, ``Timestamp`` (datetime) and ``KnowledgeTag``.
            The input frame is not modified.
        session_gap_seconds: A gap between two consecutive interactions of
            the same user that is at least this many seconds is treated as
            the start of a new workbook and its elapsed time is set to 0.

    Returns:
        A new DataFrame sorted by (userID, Timestamp) with a fresh
        RangeIndex and these columns appended, in order:
        ``user_correct_answer``, ``user_total_answer``, ``user_acc``,
        ``elapsed``, ``bigclass``, ``bigclasstime``, ``bigclass_acc``,
        ``bigclass_count``, ``bigclass_sum``, ``test_mean``, ``test_std``,
        ``test_sum``, ``tag_mean``, ``tag_std``, ``tag_sum``.
        Every remaining NaN is replaced by 0.
    """
    # Sort so that each user's rows form one time-ordered sequence.
    # sort_values returns a new frame, so the caller's frame stays untouched.
    df = df.sort_values(by=['userID', 'Timestamp'])

    # Running per-user counters that only look at *earlier* rows:
    # shift(1) drops the current answer, cumcount() counts previous rows.
    answers_by_user = df.groupby('userID')['answerCode']
    prior_correct = answers_by_user.transform(lambda s: s.cumsum().shift(1))
    prior_total = answers_by_user.cumcount()
    df['user_correct_answer'] = prior_correct
    df['user_total_answer'] = prior_total
    # First row of every user is NaN / 0 -> NaN; it becomes 0 in the final fillna.
    df['user_acc'] = df['user_correct_answer'] / df['user_total_answer']

    # Seconds since the same user's previous interaction (0 for the first one).
    elapsed = (
        df.groupby('userID')['Timestamp']
        .diff()
        .dt.total_seconds()
        .fillna(0)
    )
    # Long gaps are assumed to mean a new workbook was started, not solving time.
    df['elapsed'] = elapsed.where(elapsed < session_gap_seconds, 0)

    # Major category: the third character of testId, as an integer.
    df['bigclass'] = df['testId'].str[2].astype(int)

    # Per (user, major category): mean elapsed time, answer count, correct count.
    # NOTE(review): these aggregates (and the test/tag ones below) are computed
    # over every row of the frame, including each row's own answerCode and any
    # later rows, so they leak the label into the features. If the frame holds
    # placeholder labels for unlabeled rows (e.g. -1 -- TODO confirm), those
    # values are included in the statistics as well.
    bigclass = (
        df.groupby(['userID', 'bigclass'])
        .agg(
            bigclasstime=('elapsed', 'mean'),
            bigclass_count=('answerCode', 'count'),
            bigclass_sum=('answerCode', 'sum'),
        )
        .reset_index()
    )
    bigclass['bigclass_acc'] = bigclass['bigclass_sum'] / bigclass['bigclass_count']
    bigclass = bigclass[
        ['userID', 'bigclass', 'bigclasstime', 'bigclass_acc',
         'bigclass_count', 'bigclass_sum']
    ]
    df = pd.merge(df, bigclass, on=['userID', 'bigclass'], how='left')

    # Overall answer statistics per testId and per KnowledgeTag.
    # A left merge on unique keys neither adds rows nor touches answerCode,
    # so the tag statistics are unaffected by merging the test ones first.
    for key, prefix in (('testId', 'test'), ('KnowledgeTag', 'tag')):
        stats = df.groupby([key])['answerCode'].agg(['mean', 'std', 'sum'])
        stats.columns = [f'{prefix}_mean', f'{prefix}_std', f'{prefix}_sum']
        df = pd.merge(df, stats, on=[key], how='left')

    # NaN arises for first-row user features and single-row std values.
    return df.fillna(0)

In [178]:
# Build the engineered feature table from the raw training log and preview
# its first rows.
df2 = feature_engineering(df)
df2.head()

      userID assessmentItemID      testId  answerCode           Timestamp   
0          0       A060001001  A060000001           1 2020-03-24 00:17:11  \
1          0       A060001002  A060000001           1 2020-03-24 00:17:14   
2          0       A060001003  A060000001           1 2020-03-24 00:17:22   
3          0       A060001004  A060000001           1 2020-03-24 00:17:29   
4          0       A060001005  A060000001           1 2020-03-24 00:17:36   
...      ...              ...         ...         ...                 ...   
1031       1       A040022003  A040000022           1 2020-05-07 09:59:53   
1032       1       A090024001  A090000024           1 2020-05-07 23:22:50   
1033       1       A090024002  A090000024           1 2020-05-07 23:23:49   
1034       1       A090024003  A090000024           1 2020-05-07 23:26:04   
1035       1       A090024004  A090000024           1 2020-05-07 23:27:24   

      KnowledgeTag  user_correct_answer  user_total_answer  user_acc  elaps

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,elapsed,...,bigclasstime,bigclass_acc,bigclass_count,bigclass_sum,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,0.0,0,0.0,0.0,...,31.254335,0.791908,346,274,0.947683,0.222749,1268,0.955022,0.20741,637
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.0,3.0,...,31.254335,0.791908,346,274,0.947683,0.222749,1268,0.913187,0.281603,3040
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.0,8.0,...,31.254335,0.791908,346,274,0.947683,0.222749,1268,0.913187,0.281603,3040
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.0,7.0,...,31.254335,0.791908,346,274,0.947683,0.222749,1268,0.913187,0.281603,3040
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.0,7.0,...,31.254335,0.791908,346,274,0.947683,0.222749,1268,0.913187,0.281603,3040


## 3. Train/Test 데이터 셋 분리

In [140]:
# Train and test sets must be split by user: all rows of one user go to the
# same side.
def custom_train_test_split(df, ratio=0.7, split=True):
    """Split an interaction log into train / test sets on user boundaries.

    Users are shuffled with the module-level ``random`` generator (seed it
    beforehand for a reproducible split) and added to the train set one by
    one. The first user whose rows would push the train set past
    ``ratio * len(df)`` rows stops the loop; that user and every user after
    it in the shuffled order form the test set.

    Args:
        df: DataFrame with a ``userID`` column. For the test set to hold each
            user's *last* interaction, rows of a user must be contiguous and
            in time order, because the filter keeps every row whose next row
            belongs to a different user.
        ratio: Upper bound on the fraction of rows placed in the train set.
        split: Unused; kept so existing callers keep working.

    Returns:
        ``(train, test)`` where ``train`` holds all rows of the train users
        and ``test`` holds one row per run of consecutive rows of each
        remaining user.
    """
    # Count rows per user once; items() yields (user_id, row_count) pairs.
    users = list(df['userID'].value_counts().items())
    random.shuffle(users)

    max_train_data_len = ratio * len(df)
    sum_of_train_data = 0
    user_ids = []

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)

    in_train = df['userID'].isin(user_ids)
    train = df[in_train]
    test = df[~in_train]

    # Keep only the row right before the userID changes (or the frame ends).
    test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

In [141]:
# Fix the seed of Python's RNG so the shuffled user split is reproducible.
random.seed(42)
# Split on user boundaries.
train, test = custom_train_test_split(df2)

# Columns fed to the model.
FEATS = [
    'KnowledgeTag',
    'user_correct_answer',
    'user_total_answer',
    'user_acc',
    'test_mean',
    'test_sum',
    'tag_mean',
    'tag_sum',
    'elapsed',
    'bigclass',
    'bigclasstime',
    'bigclass_acc',
    'bigclass_sum',
    'bigclass_count',
]

# Separate the target (y) from the feature frames (X).
y_train = train['answerCode']
y_test = test['answerCode']
train = train.drop(columns=['answerCode'])
test = test.drop(columns=['answerCode'])

In [142]:
# Display the validation targets (one row per held-out user).
y_test

708        0
2566       1
3401       0
4519       1
5922       1
          ..
2266199    1
2266277    0
2266381    1
2266464    1
2266585    1
Name: answerCode, Length: 2007, dtype: int8

In [143]:
# Wrap the selected feature columns and their targets as LightGBM datasets.
lgb_train = lgb.Dataset(data=train[FEATS], label=y_train)
lgb_test = lgb.Dataset(data=test[FEATS], label=y_test)

## 4. 훈련 및 검증

In [144]:
# Train a binary classifier for up to 500 boosting rounds.
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train were
# removed in LightGBM 4.0, while the callbacks work on both 3.3+ and 4.x.
model = lgb.train(
    {'objective': 'binary'},
    lgb_train,
    valid_sets=[lgb_train, lgb_test],
    num_boost_round=500,
    callbacks=[
        # Stop when the validation score has not improved for 100 rounds.
        lgb.early_stopping(stopping_rounds=100),
        # Print the evaluation results every 100 rounds.
        lgb.log_evaluation(period=100),
    ],
)

# Score the held-out rows: accuracy at a 0.5 threshold, AUC on raw scores.
preds = model.predict(test[FEATS])
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')

[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3316
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.488447	valid_1's binary_logloss: 0.57548
[200]	training's binary_logloss: 0.484468	valid_1's binary_logloss: 0.571101
[300]	training's binary_logloss: 0.48179	valid_1's binary_logloss: 0.568642
[400]	training's binary_logloss: 0.479484	valid_1's binary_logloss: 0.567301
[500]	training's binary_logloss: 0.477019	valid_1's binary_logloss: 0.565792
Did not meet early stopping. Best iteration is:
[500]	training's binary_logloss: 0.477019	valid_1'

## 5. Inference

In [165]:
# LOAD TESTDATA
# Read the submission log and order it per user by time with a clean index.
test_csv = DATA_PATH + 'test_data.csv'
test_df = pd.read_csv(test_csv, parse_dates=['Timestamp'], dtype=dtype)
test_df = test_df.sort_values(by=['userID', 'Timestamp'])
test_df = test_df.reset_index(drop=True)

# FEATURE ENGINEERING
# NOTE(review): feature_engineering derives its per-category / per-test /
# per-tag statistics from the frame it receives, so here they are computed
# from test_df rather than from the training data -- confirm this is intended.
test_df = feature_engineering(test_df)

# LEAVE LAST INTERACTION ONLY
# A row is kept when the following row belongs to another user (or is absent).
next_user = test_df['userID'].shift(-1)
test_df = test_df[test_df['userID'].ne(next_user)]

# Dropping answerCode is intentionally disabled; the statement was:
# test_df = test_df.drop(['answerCode'], axis=1)

In [166]:
# Inspect the rows of test_df that belong to user 3.
test_df[test_df['userID'] == 3]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,3,A050023001,A050000023,1,2020-01-09 10:56:31,2626
1,3,A050023002,A050000023,1,2020-01-09 10:56:57,2626
2,3,A050023003,A050000023,0,2020-01-09 10:58:31,2625
3,3,A050023004,A050000023,0,2020-01-09 10:58:36,2625
4,3,A050023006,A050000023,0,2020-01-09 10:58:43,2623
...,...,...,...,...,...,...
1031,3,A050133004,A050000133,1,2020-10-26 13:12:11,5289
1032,3,A050133005,A050000133,1,2020-10-26 13:12:36,5288
1033,3,A050133006,A050000133,1,2020-10-26 13:12:52,5288
1034,3,A050133007,A050000133,0,2020-10-26 13:13:11,5289


In [183]:
# MAKE PREDICTION
# Score the rows kept in test_df using the same feature columns as training.
total_preds = model.predict(test_df[FEATS])

In [153]:
# SAVE OUTPUT
# Write the predictions as a two-column CSV: running row id, predicted score.
output_dir = '/opt/ml/input/output/LGBM'
write_path = os.path.join(output_dir, "submission.csv")
# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    # row_id (not `id`) avoids shadowing the builtin; writelines batches the rows.
    w.writelines(
        '{},{}\n'.format(row_id, p) for row_id, p in enumerate(total_preds)
    )

writing prediction : /opt/ml/input/output/LGBM/submission.csv
