# LGBM을 활용한 베이스라인

In [1]:
import pandas as pd
import os
import random
import numpy as np

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

## 1. 데이터 로딩

In [2]:
# Directory that holds the competition CSV files; adjust it to your environment.
data_dir = '../data/'

# Input file locations (download the data from the competition homepage).
csv_file_path = os.path.join(data_dir, 'train_data.csv')
test_csv_file_path = os.path.join(data_dir, 'test_data.csv')

# Load the training and test interaction logs.
train_data = pd.read_csv(csv_file_path)
test_data = pd.read_csv(test_csv_file_path)

# Turn every -1 label in the test set into NaN
# (presumably -1 marks the answers to be predicted -- confirm against the data description).
test_data['answerCode'] = test_data['answerCode'].replace({-1: np.nan})

# # LEAVE LAST INTERACTION ONLY
# train_df = test_df[test_df['userID'] == test_df['userID'].shift(-1)]
# test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# df = pd.concat([df, train_df], ignore_index=True)

## 2. Feature Engineering

In [3]:
# Tag every row with its origin so the combined frame can be split again later.
train_data['train'] = 1
test_data['train'] = 0

# Stack train and test rows into one frame and group each user's rows together.
total = pd.concat((train_data, test_data), axis=0)
total = total.sort_values('userID')

# Build the path with os.path.join (as feature_engineering does when it reads
# the file back) so the write and read locations agree even when data_dir has
# no trailing path separator.
total.to_csv(os.path.join(data_dir, 'total_data.csv'), index=False)

In [21]:
def _prior_cumsum(grouped):
    """Return, for every row, the sum of the earlier values in its group.

    The first row of each group gets NaN (nothing precedes it).  Missing
    values are counted as 0 before accumulating: a plain ``cumsum`` leaves NaN
    at the position of a missing value, and the following ``shift`` would move
    that NaN onto the next row of the group.
    """
    return grouped.transform(lambda s: s.fillna(0).cumsum().shift(1))


def _prior_rolling_sum(grouped, window=5):
    """Return the sum over the previous ``window`` rows of each group."""
    return grouped.transform(lambda s: s.rolling(window, min_periods=1).sum().shift(1))


def _prior_rolling_count(grouped, window=5):
    """Return the number of non-missing values in the previous ``window`` rows of each group."""
    return grouped.transform(lambda s: s.rolling(window, min_periods=1).count().shift(1))


def feature_engineering(data_dir):
    """Read ``total_data.csv`` from ``data_dir`` and derive the model features.

    The file must contain the columns ``userID``, ``assessmentItemID``,
    ``testId``, ``answerCode``, ``Timestamp``, ``KnowledgeTag`` and ``train``.
    Progress is reported by printing the name of each feature group.

    Args:
        data_dir: Directory that contains ``total_data.csv``.

    Returns:
        A tuple ``(train_df, test_df, total)``:
        ``train_df`` is a copy of the rows with ``train == 1``;
        ``test_df`` is a copy of the rows with ``train == 0`` whose following
        row belongs to a different user (the last row of each user);
        ``total`` is the full feature-augmented frame sorted by user and time.
    """
    total = pd.read_csv(os.path.join(data_dir, 'total_data.csv'), parse_dates=["Timestamp"])

    # Order rows by user and time so that cumulative/rolling features follow
    # each user's sequence.
    print('Timestamp')
    # read_csv leaves the column as text when it cannot parse it; converting
    # again guarantees a datetime column (or an error) from here on.
    total['Timestamp'] = pd.to_datetime(total['Timestamp'])
    total.sort_values(by=['userID', 'Timestamp'], inplace=True)

    # How many times the user has seen this item so far (1 for the first attempt).
    print('same_item_cnt')
    total['same_item_cnt'] = total.groupby(['userID', 'assessmentItemID']).cumcount() + 1

    # Seconds since the user's previous interaction; the first interaction of a
    # user gets 0.  Gaps outside [0, 650) seconds are reset to 0.
    print('elapsed')
    seconds = total.groupby('userID')['Timestamp'].diff().dt.total_seconds().fillna(0)
    total['elapsed'] = seconds.where((seconds >= 0) & (seconds < 650), 0)

    # Top-level category: the third character of the item id.
    print('Bigcat')
    total['Bigcat'] = total['assessmentItemID'].str[2]
    total['Bigcat'] = total['Bigcat'].astype('category')

    # TODO: an earlier draft derived a per-item mean solve time among correct
    # answers; it was disabled and is not computed here.

    # 1. Aggregates
    ## 1-1. Mean correctness per user / item / category / tag
    print('acc_avg')
    total['user_avg'] = total.groupby('userID')['answerCode'].transform('mean')
    total['item_avg'] = total.groupby('assessmentItemID')['answerCode'].transform('mean')
    total['Bigcat_avg'] = total.groupby('Bigcat')['answerCode'].transform('mean')
    total['tag_avg'] = total.groupby('KnowledgeTag')['answerCode'].transform('mean')

    ## 1-2. Mean elapsed time per user / item / category / tag
    print('time_avg')
    total['user_time_avg'] = total.groupby('userID')['elapsed'].transform('mean')
    total['item_time_avg'] = total.groupby('assessmentItemID')['elapsed'].transform('mean')
    total['Bigcat_time_avg'] = total.groupby('Bigcat')['elapsed'].transform('mean')
    total['tag_time_avg'] = total.groupby('KnowledgeTag')['elapsed'].transform('mean')

    ## 1-3. Standard deviation of correctness per user / item / category / tag
    print('std')
    total['user_std'] = total.groupby('userID')['answerCode'].transform('std')
    total['item_std'] = total.groupby('assessmentItemID')['answerCode'].transform('std')
    total['Bigcat_std'] = total.groupby('Bigcat')['answerCode'].transform('std')
    total['tag_std'] = total.groupby('KnowledgeTag')['answerCode'].transform('std')

    ## 1-4-1. Number of correct answers seen before this row, per user / item
    print('correct_answer')
    total['user_correct_answer'] = _prior_cumsum(total.groupby('userID')['answerCode'])
    total['item_correct_answer'] = _prior_cumsum(total.groupby('assessmentItemID')['answerCode'])

    ## 1-4-2. Cumulative accuracy per user (NaN on the user's first row: 0 prior attempts)
    print('Cumacc')
    total['user_total_answer'] = total.groupby('userID')['answerCode'].cumcount()
    total['user_Cumacc'] = total['user_correct_answer'] / total['user_total_answer']

    ## 1-4-3. Cumulative correct count / accuracy per user within each category
    print('Bigcat_Cumacc')
    total['user_Bigcat_correct_answer'] = _prior_cumsum(
        total.groupby(['userID', 'Bigcat'])['answerCode']
    )
    total['user_Bigcat_total_answer'] = total.groupby(['userID', 'Bigcat'])['answerCode'].cumcount()
    total['user_Bigcat_Cumacc'] = total['user_Bigcat_correct_answer'] / total['user_Bigcat_total_answer']

    ## 1-5. Mean correctness / elapsed time of the user on the current test attempt
    print('current_avg')
    total['user_current_avg'] = total.groupby(['userID', 'testId', 'same_item_cnt'])['answerCode'].transform('mean')
    total['user_current_time_avg'] = total.groupby(['userID', 'testId', 'same_item_cnt'])['elapsed'].transform('mean')

    # 2. Position of the item within the user's current test attempt (1-based)
    print('item_seq')
    total['item_seq'] = total.groupby(['userID', 'testId', 'same_item_cnt']).cumcount() + 1
    total['item_seq'] = total['item_seq'].astype('category')

    # 2-1. Correct answers among the previous five rows, per user / item
    print('retCount_correct_answer')
    total['user_retCount_correct_answer'] = _prior_rolling_sum(total.groupby('userID')['answerCode'])
    total['item_retCount_correct_answer'] = _prior_rolling_sum(total.groupby('assessmentItemID')['answerCode'])

    # 2-2. Accuracy over the previous five rows, per user / item
    print('retCumacc')
    total['user_retCount'] = _prior_rolling_count(total.groupby('userID')['answerCode'])
    total['user_retCumacc'] = total['user_retCount_correct_answer'] / total['user_retCount']

    total['item_retCount'] = _prior_rolling_count(total.groupby('assessmentItemID')['answerCode'])
    total['item_retCumacc'] = total['item_retCount_correct_answer'] / total['item_retCount']

    # Split back into train and test.  Copies are returned so that callers can
    # add or modify columns without touching (or being warned about) `total`.
    train_df = total[total['train'] == 1].copy()
    test_df = total[total['train'] == 0]
    # Keep only the last row of each user in the test split.
    test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)].copy()

    return train_df, test_df, total

In [22]:
# Build the features, then render the train and test frames in the notebook.
train_df, test_df, total = feature_engineering(data_dir)
for frame in (train_df, test_df):
    display(frame)

Timestamp
same_item_cnt
elapsed
Bigcat
acc_avg
time_avg
std
correct_answer
Cumacc
Bigcat_Cumacc
current_avg
item_seq
retCount_correct_answer
retCumacc


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,train,same_item_cnt,elapsed,Bigcat,...,user_Bigcat_Cumacc,user_current_avg,user_current_time_avg,item_seq,user_retCount_correct_answer,item_retCount_correct_answer,user_retCount,user_retCumacc,item_retCount,item_retCumacc
0,0,A060001001,A060000001,1.0,2020-03-24 00:17:11,7224,1,1,0.0,6,...,,1.0,6.0,1,,,,,,
491,0,A060001002,A060000001,1.0,2020-03-24 00:17:14,7225,1,1,3.0,6,...,1.00,1.0,6.0,2,1.0,,1.0,1.00,,
492,0,A060001003,A060000001,1.0,2020-03-24 00:17:22,7225,1,1,8.0,6,...,1.00,1.0,6.0,3,2.0,,2.0,1.00,,
493,0,A060001004,A060000001,1.0,2020-03-24 00:17:29,7225,1,1,7.0,6,...,1.00,1.0,6.0,4,3.0,,3.0,1.00,,
494,0,A060001005,A060000001,1.0,2020-03-24 00:17:36,7225,1,1,7.0,6,...,1.00,1.0,6.0,5,4.0,,4.0,1.00,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526693,7441,A030071005,A030000071,0.0,2020-06-05 06:50:21,438,1,1,24.0,3,...,0.25,0.2,44.0,5,1.0,2.0,4.0,0.25,5.0,0.4
2526691,7441,A040165001,A040000165,1.0,2020-08-21 01:06:39,8836,1,1,0.0,4,...,,1.0,32.5,1,1.0,5.0,5.0,0.20,5.0,1.0
2526692,7441,A040165002,A040000165,1.0,2020-08-21 01:06:50,8836,1,1,11.0,4,...,1.00,1.0,32.5,2,2.0,4.0,5.0,0.40,5.0,0.8
2526699,7441,A040165003,A040000165,1.0,2020-08-21 01:07:36,8836,1,1,46.0,4,...,1.00,1.0,32.5,3,3.0,4.0,5.0,0.60,5.0,0.8


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,train,same_item_cnt,elapsed,Bigcat,...,user_Bigcat_Cumacc,user_current_avg,user_current_time_avg,item_seq,user_retCount_correct_answer,item_retCount_correct_answer,user_retCount,user_retCumacc,item_retCount,item_retCumacc
2633,3,A050133008,A050000133,,2020-10-26 13:13:57,5289,0,1,46.0,5,...,0.655052,0.857143,39.500000,8,4.0,,5.0,0.8,,
3217,4,A070146008,A070000146,,2020-12-27 02:47:54,9080,0,1,23.0,7,...,0.703529,0.857143,21.500000,8,4.0,,5.0,0.8,,
10507,13,A070111008,A070000111,,2020-12-27 04:35:09,9660,0,1,8.0,7,...,0.390593,0.428571,13.000000,8,2.0,0.0,5.0,0.4,1.0,0.00
15278,17,A090064006,A090000064,,2020-10-30 05:48:37,2611,0,1,75.0,9,...,0.924757,1.000000,63.333333,6,5.0,,5.0,1.0,,
23388,26,A060135007,A060000135,,2020-10-23 11:44:18,1422,0,1,17.0,6,...,0.817365,0.666667,39.285714,7,3.0,0.0,5.0,0.6,2.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525915,7395,A040122005,A040000122,,2020-09-08 02:05:20,10615,0,1,2.0,4,...,0.111111,0.250000,2.400000,5,1.0,1.0,5.0,0.2,5.0,0.20
2526074,7404,A030111005,A030000111,,2020-10-13 09:49:18,7636,0,1,107.0,3,...,0.500000,0.500000,43.800000,5,2.0,5.0,5.0,0.4,5.0,1.00
2526268,7416,A050193004,A050000193,,2020-10-04 02:44:41,10402,0,1,24.0,5,...,0.666667,0.666667,11.750000,4,2.0,4.0,5.0,0.4,5.0,0.80
2526297,7417,A050193004,A050000193,,2020-09-06 13:09:15,10402,0,1,21.0,5,...,0.666667,0.666667,15.750000,4,2.0,3.0,5.0,0.4,4.0,0.75


In [24]:
# List the column names of the training frame returned by feature_engineering.
train_df.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'train', 'same_item_cnt', 'elapsed', 'Bigcat',
       'user_avg', 'item_avg', 'Bigcat_avg', 'tag_avg', 'user_time_avg',
       'item_time_avg', 'Bigcat_time_avg', 'tag_time_avg', 'user_std',
       'item_std', 'Bigcat_std', 'tag_std', 'user_correct_answer',
       'item_correct_answer', 'user_total_answer', 'user_Cumacc',
       'user_Bigcat_correct_answer', 'user_Bigcat_total_answer',
       'user_Bigcat_Cumacc', 'user_current_avg', 'user_current_time_avg',
       'item_seq', 'user_retCount_correct_answer',
       'item_retCount_correct_answer', 'user_retCount', 'user_retCumacc',
       'item_retCount', 'item_retCumacc'],
      dtype='object')

In [26]:
# Save the feature-augmented frame.
# NOTE: this overwrites the same total_data.csv that feature_engineering reads.
# The path is built with os.path.join so it matches the location used when
# reading, even if data_dir has no trailing path separator.
total.to_csv(os.path.join(data_dir, 'total_data.csv'), index=False)

- `user_avg` : 유저별 평균 정답률
- `user_time_avg` : 유저별 평균 풀이시간