## Data loading & feature engineering

In [5]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
plt.style.use('seaborn')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# import missingno
import pandas as pd
pd.set_option('display.min_rows', 500)
import warnings
warnings.filterwarnings('ignore')

import sys
import os
sys.path.append(os.path.dirname("/opt/ml/dkt_team/code/FeatrueEngineering"))
from LGBM.utils import apply_elo_granularity_feature_name

def concat_and_export(train_fe, test_fe, data_dir='/opt/ml/input/data'):
    """Stack the train and test feature frames and pickle the result.

    Each input is tagged with a ``kind`` column ('train' / 'test') so the
    combined frame can be split again later. The tagging is done on copies,
    so the caller's frames are left untouched.

    Args:
        train_fe: DataFrame of engineered training rows.
        test_fe: DataFrame of engineered test rows.
        data_dir: Directory the pickle is written to. Defaults to the
            directory this notebook has always used.

    Returns:
        None. The combined frame is written to
        ``{data_dir}/after_fe_train_test_cumsum.pkl`` and the path is printed.
    """
    df = pd.concat([train_fe.assign(kind='train'), test_fe.assign(kind='test')])
    write_path = f'{data_dir}/after_fe_train_test_cumsum.pkl'
    # DataFrame.to_pickle has no `index` keyword (that belongs to to_csv);
    # passing it raised TypeError, so the file was never written.
    df.to_pickle(write_path)
    print(f"Write: {write_path}")
    
def export(df, output='after_fe_train_test.pkl', data_dir='/opt/ml/input/data'):
    """Pickle a DataFrame into the data directory and print where it went.

    Args:
        df: DataFrame to serialize.
        output: File name of the pickle inside ``data_dir``.
        data_dir: Target directory. Defaults to the directory this notebook
            has always used, so existing calls behave exactly as before.

    Returns:
        None.
    """
    write_path = f'{data_dir}/{output}'
    df.to_pickle(write_path)
    print(f"Write: {write_path}")
    
# Directory holding the raw CSV files (absolute path, specific to the training machine).
path='/opt/ml/dkt_team/data'
# Raw train / test logs; both are concatenated further down before feature engineering.
train = pd.read_csv(f"{path}/train_data.csv")
test = pd.read_csv(f"{path}/test_data.csv")

# Integer code for each weekday name as returned by `Series.dt.day_name()`.
# The codes follow the position in this list, not calendar order.
day_dict = {
    weekday: code
    for code, weekday in enumerate(
        ['Tuesday', 'Thursday', 'Monday', 'Saturday', 'Friday', 'Wednesday', 'Sunday']
    )
}

def _bucket_codes(seconds, edges):
    """Bin durations into labelled intervals and return integer bucket codes.

    Buckets are: exactly 0, then ``(edges[i], edges[i+1]]`` for each adjacent
    pair, then everything strictly above ``edges[-1]``. Values matching none of
    these (e.g. negatives) keep the empty-string label.

    The returned codes are the rank of each label among the labels that
    actually occur, sorted as strings, so they depend on the data.

    Args:
        seconds: Numeric Series of durations.
        edges: Ascending list of bucket boundaries.

    Returns:
        Series of integer codes aligned with ``seconds``.
    """
    labels = pd.Series('', index=seconds.index, dtype=object)
    labels.loc[seconds == 0] = "0 - [0,0]"
    for rank, (lo, hi) in enumerate(zip(edges[:-1], edges[1:]), start=1):
        labels.loc[(seconds > lo) & (seconds <= hi)] = f"{rank} - ({lo}, {hi}]"
    # Strictly greater: a value equal to the last edge belongs to the closed
    # interval ending there. The previous `>=` silently moved it into the
    # open-ended bucket, contradicting both labels.
    labels.loc[seconds > edges[-1]] = f"{len(edges)} - ({edges[-1]}, )"
    code_of = {label: code for code, label in enumerate(sorted(labels.unique()))}
    return labels.map(code_of)


def feature_engineering(df):
    """Build the per-interaction feature table used by the LGBM models.

    Reads the columns ``userID``, ``assessmentItemID``, ``testId``,
    ``answerCode``, ``Timestamp`` and ``KnowledgeTag`` and returns a new frame
    (the input is not modified) sorted by ``userID`` then ``Timestamp``, with
    these groups of columns appended:

    * label encodings: ``uidIdx``, ``assIdx``, ``testIdx``
    * per-user running totals that exclude the current row:
      ``user_correct_answer``, ``user_total_answer``, ``user_acc``
    * calendar parts: ``month``, ``day``, ``hour``, ``dayname``
    * ID-derived categories: ``big_category``, ``problem_num``, ``mid_category``
    * global answer statistics per test / tag / big category
      (``*_mean``, ``*_std``, ``*_sum``)
    * solve-time features: ``Timestamp2``, ``solvetime``, ``solvesec``,
      ``solvesec_3600``, ``time_category``, ``solvesec_cumsum``,
      ``solvecumsum_category``
    * big-category features, global and per user
    * whatever columns ``apply_elo_granularity_feature_name`` adds

    Args:
        df: Raw interaction log as a DataFrame.

    Returns:
        DataFrame with the original columns plus the features above.
    """
    sort_keys = ['userID', 'Timestamp']

    # Label encoders: position of each value in the sorted list of unique values.
    uid2idx = {v: k for k, v in enumerate(sorted(df.userID.unique()))}
    ass2idx = {v: k for k, v in enumerate(sorted(df.assessmentItemID.unique()))}
    test2idx = {v: k for k, v in enumerate(sorted(df.testId.unique()))}

    # Order rows per user in time so cumulative features follow the sequence.
    # sort_values returns a new frame, so the caller's frame stays untouched.
    df2 = df.sort_values(by=sort_keys)

    df2['uidIdx'] = df2.userID.map(uid2idx)
    df2['assIdx'] = df2.assessmentItemID.map(ass2idx)
    df2['testIdx'] = df2.testId.map(test2idx)

    # Running per-user counts; shift(1) keeps the current answer out of its own feature.
    by_user = df2.groupby('userID')['answerCode']
    df2['user_correct_answer'] = by_user.transform(lambda x: x.cumsum().shift(1))
    df2['user_total_answer'] = by_user.cumcount()
    df2['user_acc'] = df2['user_correct_answer'] / df2['user_total_answer']

    # Parse the timestamp strings once for all calendar features.
    timestamps = pd.to_datetime(df2.Timestamp)
    df2['month'] = timestamps.dt.month
    df2['day'] = timestamps.dt.day
    df2['hour'] = timestamps.dt.hour
    df2['dayname'] = timestamps.dt.day_name().map(day_dict)

    # Categories encoded inside the ID strings: third character of testId,
    # and the last three characters of the item / test IDs.
    df2['big_category'] = df2.testId.str[2].astype(int)
    df2['problem_num'] = df2.assessmentItemID.str[-3:].astype(int)
    df2['mid_category'] = df2.testId.str[-3:].astype(int)

    # Global answer statistics per test, knowledge tag and big category.
    # NOTE(review): these are computed over every row passed in. If the frame
    # contains rows whose answerCode is a placeholder rather than a real label,
    # those rows are included in the statistics - confirm this is intended.
    for key, prefix in (('testId', 'test'), ('KnowledgeTag', 'tag'), ('big_category', 'big')):
        stats = df2.groupby([key])['answerCode'].agg(['mean', 'std', 'sum'])
        stats.columns = [f'{prefix}_mean', f'{prefix}_std', f'{prefix}_sum']
        df2 = pd.merge(df2, stats, on=[key], how="left")

    # Time since the same user's previous row (0 for the first row), plus a
    # version capped at one hour. The merges above rebuilt the row index, so the
    # timestamps are parsed again from the column instead of reusing the series
    # parsed before the merges.
    df2['Timestamp2'] = pd.to_datetime(df2.Timestamp)
    df2['solvetime'] = df2.groupby('userID')['Timestamp2'].diff().fillna(pd.Timedelta(seconds=0))
    df2['solvesec'] = df2.solvetime.dt.total_seconds()
    df2['solvesec_3600'] = df2.solvesec.clip(upper=3600)

    df2['time_category'] = _bucket_codes(df2.solvesec, [0, 5, 7, 10, 60, 600, 1200, 2400, 3600])

    # Cumulative capped solve time per (user, test), wrapped modulo 3601 so the
    # value stays within [0, 3600]; buckets above 3600 are therefore never hit.
    df2['solvesec_cumsum'] = df2.groupby(['userID', 'testId'])['solvesec_3600'].cumsum() % 3601
    df2['solvecumsum_category'] = _bucket_codes(
        df2.solvesec_cumsum, [0, 5, 7, 10, 60, 600, 1200, 2400, 3600, 7200]
    )

    # Global big-category accuracy / spread: the same numbers as big_mean and
    # big_std merged in above, kept under their established column names.
    df2['big_category_acc'] = df2['big_mean']
    df2['big_category_std'] = df2['big_std']

    # Per-user, per-big-category features. The 'cumconut' spelling is the
    # established column name and is kept as is.
    by_user_big = df2.groupby(['userID', 'big_category']).answerCode
    df2['big_category_cumconut'] = by_user_big.cumcount()
    # transform keeps rows in place, avoiding two full-frame inner merges.
    df2['big_category_user_acc'] = by_user_big.transform('mean')
    df2['big_category_user_std'] = by_user_big.transform('std')

    # Correct answers so far in this category (current row excluded) and its log1p.
    df2['big_category_answer'] = by_user_big.transform(lambda x: x.cumsum().shift(1))
    df2['big_category_answer_log1p'] = np.log1p(df2.big_category_answer)

    # Project helper; it returns the frame that is carried forward from here.
    df2 = apply_elo_granularity_feature_name(df2, "assessmentItemID")

    return df2.sort_values(by=sort_keys)

# Tag every row with its origin so the combined frame can be split back apart later.
# Note that this adds the 'kind' column to the loaded train / test frames in place.
train['kind']='train'
test['kind']='test'
# Features are computed over train and test together, so group-level statistics
# see both sets. pd.concat is called without ignore_index, so index labels repeat.
df = pd.concat([train,test])
df = feature_engineering(df)

Dataset of shape (2526700, 44)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'kind', 'uidIdx', 'assIdx', 'testIdx', 'user_correct_answer', 'user_total_answer', 'user_acc', 'month', 'day', 'hour', 'dayname', 'big_category', 'problem_num', 'mid_category', 'test_mean', 'test_std', 'test_sum', 'tag_mean', 'tag_std', 'tag_sum', 'big_mean', 'big_std', 'big_sum', 'Timestamp2', 'solvetime', 'solvesec', 'solvesec_3600', 'time_category', 'solvesec_cumsum', 'solvecumsum_category', 'big_category_acc', 'big_category_std', 'big_category_cumconut', 'big_category_user_acc', 'big_category_user_std', 'big_category_answer', 'big_category_answer_log1p', 'left_asymptote']
Parameter estimation is starting...


100%|██████████| 2526700/2526700 [00:22<00:00, 110640.35it/s]


Theta & beta estimations on assessmentItemID are completed.


In [9]:
# Summarize the columns and dtypes of the engineered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2526700 entries, 0 to 2526699
Data columns (total 45 columns):
 #   Column                     Dtype          
---  ------                     -----          
 0   userID                     int64          
 1   assessmentItemID           object         
 2   testId                     object         
 3   answerCode                 int64          
 4   Timestamp                  object         
 5   KnowledgeTag               int64          
 6   kind                       object         
 7   uidIdx                     int64          
 8   assIdx                     int64          
 9   testIdx                    int64          
 10  user_correct_answer        float64        
 11  user_total_answer          int64          
 12  user_acc                   float64        
 13  month                      int64          
 14  day                        int64          
 15  hour                       int64          
 16  dayname           

In [6]:
# Split the engineered frame back into its train / test parts using the 'kind' tag.
df1 = df[df['kind']=='train']
df2 = df[df['kind']=='test']

In [7]:
# Persist the train-side engineered features.
write_path="/opt/ml/input/data/df1_FE.pkl"
df1.to_pickle(write_path)

In [8]:
# Persist the test-side engineered features.
write_path="/opt/ml/input/data/df2_FE.pkl"
df2.to_pickle(write_path)

## 내보내기

In [11]:
# NOTE(review): this rebinds df2 (the test split in an earlier cell) to a copy of the
# full train+test frame, so the meaning of df2 depends on which cells have run.
df2 = df.copy()
# Save the combined engineered frame under its own file name.
export(df2, output='base_lgbm2.pkl')

Write: /opt/ml/input/data/base_lgbm2.pkl
