# Add `previous_` features

`previous`는 어떤 사람이 어떤 테스트를 볼 때, 바로 직전에 푼 문제 (하나 전 row)와 지금 푸는 문제 (현재 row) 사이에 어떤 관계가 있는지 feature로 나타낸다. 

예를 들어 현재 푸는 문제의 번호가 바로 직전에 푼 문제 번호의 다음인지 (즉, 문제를 순서대로 풀고 있는지) 아니면 직전에 뒤의 문제에서 현재 앞의 문제로 이동한 것인지 등을 나타낸다. 

In [14]:
import os, sys
from pathlib import Path
from glob import glob

In [15]:
BASE_PATH = Path('.').resolve().parent
BASE_PATH

PosixPath('/opt/ml')

In [16]:
if BASE_PATH.as_posix() not in sys.path:
    sys.path.append(BASE_PATH.as_posix())

In [17]:
sys.path

['/opt/ml/jaepil',
 '/opt/conda/lib/python37.zip',
 '/opt/conda/lib/python3.7',
 '/opt/conda/lib/python3.7/lib-dynload',
 '',
 '/opt/conda/lib/python3.7/site-packages',
 '/opt/conda/lib/python3.7/site-packages/transformers-4.6.1-py3.8.egg',
 '/opt/conda/lib/python3.7/site-packages/IPython/extensions',
 '/opt/ml/.ipython',
 '/opt/ml']

In [18]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
import numpy as np
import pandas as pd

In [20]:
import numba
import dask.dataframe as dd

In [21]:
import json
import pickle

In [22]:
from tqdm import tqdm, trange
# from tqdm.notebook import tqdm
from time import time

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.__version__

'1.6.0'

In [24]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device # Always check cudatoolkit version!

device(type='cuda', index=0)

In [25]:
from collections import defaultdict

## Load data

`['user_id', 'Timestamp']`로 sorting되고

`Timestamp`가 `datetime64` type으로 변환되었으며

`testId`에서 대분류인 `test_cat`을 뽑아내고

`assessmentItemID`에서 문제번호인 `question_num`을 뽑아낸

상태로 저장된 `train_df` 피클을 불러온다. 

In [91]:
train_df = pd.read_pickle('train_df.pkl')

## `previous` feature

우선 각 학생의 각 테스트 기준으로 previous를 구할 것이므로 user와 test로 groupby해준다. 

In [92]:
stu_test_groupby = train_df.groupby(['userID', 'testId'])

한 학생이 두 번 이상 같은 테스트를 본 경우 어느 row에서 끊어줘야 하는지 알 수 있게 하기 위해 단순 Timestamp diff를 한다. (즉, 진짜 delta를 보려는 용도가 아니라 끊어줄 지점을 알기 위한 delta)

In [93]:
%%time

train_df.loc[:, "delta"] = stu_test_groupby['Timestamp'].diff()

CPU times: user 1min 44s, sys: 1.24 s, total: 1min 46s
Wall time: 1min 46s


### `is_previous_ordered` 

현재 문제가 이전에 푼 문제의 바로 직후 문제번호를 가지는지 확인한다. 

In [94]:
train_df['q_num_prev'] = train_df.question_num.shift()

In [95]:
def is_previous_ordered(row):
    q_num = row.question_num
    q_num_prev = row.q_num_prev
    delta = row.delta
    delta_thres = 1 # hour
    
    if pd.isnull(delta) or delta > pd.Timedelta(hours=1):
        return -1
    elif q_num == q_num_prev + 1:
        return 1
    else:
        return 0
        

In [96]:
train_df['is_previous_ordered'] =  train_df.apply(lambda row: is_previous_ordered(row), axis=1)
train_df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,test_cat,question_num,delta,q_num_prev,is_previous_ordered
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,6,1,NaT,,-1
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,6,2,0 days 00:00:03,1.0,1
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,6,3,0 days 00:00:08,2.0,1
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,6,4,0 days 00:00:07,3.0,1
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,6,5,0 days 00:00:07,4.0,1


In [97]:
def is_previous_decreasing(row):
    q_num = row.question_num
    q_num_prev = row.q_num_prev
    delta = row.delta
    delta_thres = 1 # hour
    
    if pd.isnull(delta) or delta > pd.Timedelta(hours=1):
        return -1
    elif q_num < q_num_prev:
        return 1
    else:
        return 0

In [98]:
train_df['is_previous_decreasing'] = train_df.apply(lambda row: is_previous_decreasing(row), axis=1)
train_df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,test_cat,question_num,delta,q_num_prev,is_previous_ordered,is_previous_decreasing
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,6,1,NaT,,-1,-1
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,6,2,0 days 00:00:03,1.0,1,0
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,6,3,0 days 00:00:08,2.0,1,0
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,6,4,0 days 00:00:07,3.0,1,0
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,6,5,0 days 00:00:07,4.0,1,0


In [99]:
train_df['is_prev_ord_shift'] = train_df.is_previous_ordered.shift()
train_df['is_prev_dec_shift'] = train_df.is_previous_decreasing.shift()

In [100]:
train_df.head(10)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,test_cat,question_num,delta,q_num_prev,is_previous_ordered,is_previous_decreasing,is_prev_ord_shift,is_prev_dec_shift
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,6,1,NaT,,-1,-1,,
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,6,2,0 days 00:00:03,1.0,1,0,-1.0,-1.0
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,6,3,0 days 00:00:08,2.0,1,0,1.0,0.0
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,6,4,0 days 00:00:07,3.0,1,0,1.0,0.0
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,6,5,0 days 00:00:07,4.0,1,0,1.0,0.0
5,0,A060001007,A060000001,1,2020-03-24 00:17:47,7225,6,7,0 days 00:00:11,5.0,0,0,1.0,0.0
6,0,A060003001,A060000003,0,2020-03-26 05:52:03,7226,6,1,NaT,7.0,-1,-1,0.0,0.0
7,0,A060003002,A060000003,1,2020-03-26 05:52:10,7226,6,2,0 days 00:00:07,1.0,1,0,-1.0,-1.0
8,0,A060003003,A060000003,1,2020-03-26 05:53:14,7226,6,3,0 days 00:01:04,2.0,1,0,1.0,0.0
9,0,A060003004,A060000003,1,2020-03-26 05:53:29,7226,6,4,0 days 00:00:15,3.0,1,0,1.0,0.0


In [101]:
def is_probably_easy(row):
    delta = row.delta
    delta_thres = 1 # hour
    
    is_prev_ord = row.is_previous_ordered
    is_prev_dec = row.is_previous_decreasing
    is_prev_ord_shift = row.is_prev_ord_shift
    is_prev_dec_shift = row.is_prev_dec_shift
    
    case = (is_prev_ord_shift, is_prev_dec_shift, is_prev_ord, is_prev_dec)
    
    probably_easy_l = [
        (np.nan, np.nan, -1, -1),
        (-1, -1, 1, 0),
        (1, 0, 1, 0),
        (1, 0, 0, 0),
    ]
    
    if pd.isnull(delta) or delta > pd.Timedelta(hours=1):
        return -1
    elif case in probably_easy_l:
        return 1
    else:
        return 0

In [102]:
train_df['is_probably_easy'] = train_df.apply(lambda row: is_probably_easy(row), axis=1)

In [103]:
train_df.drop(labels=['delta', 'q_num_prev', 'is_previous_ordered', 'is_previous_decreasing', 'is_prev_ord_shift', 'is_prev_dec_shift'], axis=1, inplace=True)

In [106]:
train_df[train_df.is_probably_easy == 0]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,test_cat,question_num,is_probably_easy
54,0,A080004007,A080000004,0,2020-04-18 00:50:48,1356,8,7,0
78,0,A080008001,A080000008,1,2020-04-24 01:12:12,4673,8,1,0
79,0,A080008002,A080000008,1,2020-04-24 01:12:50,1444,8,2,0
104,0,A080012004,A080000012,1,2020-04-30 00:57:57,4671,8,4,0
105,0,A080012005,A080000012,1,2020-04-30 00:58:00,4668,8,5,0
...,...,...,...,...,...,...,...,...,...
2266527,7436,A030019004,A030000019,0,2020-04-08 12:07:06,419,3,4,0
2266528,7436,A030019002,A030000019,1,2020-04-08 12:07:16,7321,3,2,0
2266529,7436,A030019001,A030000019,0,2020-04-08 12:07:21,331,3,1,0
2266565,7440,A050096004,A050000096,1,2020-08-19 04:58:27,5267,5,4,0


In [107]:
train_df.iloc[30:60]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,test_cat,question_num,is_probably_easy
30,0,A080002004,A080000002,1,2020-04-06 00:37:58,1397,8,4,1
31,0,A080002005,A080000002,0,2020-04-06 00:38:01,1396,8,5,1
32,0,A080002006,A080000002,1,2020-04-06 00:38:05,1395,8,6,1
33,0,A060009001,A060000009,1,2020-04-07 01:42:13,7230,6,1,-1
34,0,A060009002,A060000009,1,2020-04-07 01:42:40,7230,6,2,1
35,0,A060009003,A060000009,1,2020-04-07 01:43:31,7230,6,3,1
36,0,A060009004,A060000009,1,2020-04-07 01:45:29,7230,6,4,1
37,0,A060009005,A060000009,1,2020-04-07 01:46:25,7230,6,5,1
38,0,A060009006,A060000009,1,2020-04-07 01:46:59,7230,6,6,1
39,0,A060009007,A060000009,0,2020-04-07 01:47:27,7230,6,7,1
