# Timestamp `delta`

In [1]:
import os, sys
from pathlib import Path
from glob import glob

In [2]:
# Parent of the (symlink-resolved) current working directory; shown as the cell output.
BASE_PATH = Path.cwd().resolve().parent
BASE_PATH

PosixPath('/opt/ml')

In [3]:
# Register BASE_PATH on sys.path exactly once, so re-running this cell
# does not append duplicate entries.
base_path_str = BASE_PATH.as_posix()
if base_path_str not in sys.path:
    sys.path.append(base_path_str)

In [4]:
sys.path

['/opt/ml/jaepil',
 '/opt/conda/lib/python37.zip',
 '/opt/conda/lib/python3.7',
 '/opt/conda/lib/python3.7/lib-dynload',
 '',
 '/opt/conda/lib/python3.7/site-packages',
 '/opt/conda/lib/python3.7/site-packages/transformers-4.6.1-py3.8.egg',
 '/opt/conda/lib/python3.7/site-packages/IPython/extensions',
 '/opt/ml/.ipython',
 '/opt/ml']

In [5]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
import numpy as np
import pandas as pd

In [19]:
import datetime

In [7]:
import numba
import dask.dataframe as dd

In [8]:
import json
import pickle

In [9]:
from tqdm import tqdm, trange
# from tqdm.notebook import tqdm
from time import time

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.__version__

'1.6.0'

In [11]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device  # Always check cudatoolkit version!

device(type='cuda', index=0)

In [12]:
from collections import defaultdict

## Load data

`['userID', 'Timestamp']`로 sorting되고

`Timestamp`가 `datetime64` type으로 변환되었으며

`testId`에서 대분류인 `test_cat`을 뽑아내고

`assessmentItemID`에서 문제번호인 `question_num`을 뽑아낸

상태로 저장된 `train_df` 피클을 불러온다. 

In [65]:
# Load the pickled training frame; the relative path is resolved against the
# current working directory.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
train_df = pd.read_pickle('train_df.pkl')

In [66]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2266586 entries, 0 to 2266585
Data columns (total 8 columns):
 #   Column            Dtype         
---  ------            -----         
 0   userID            int64         
 1   assessmentItemID  object        
 2   testId            object        
 3   answerCode        int64         
 4   Timestamp         datetime64[ns]
 5   KnowledgeTag      int64         
 6   test_cat          int64         
 7   question_num      int64         
dtypes: datetime64[ns](1), int64(5), object(2)
memory usage: 138.3+ MB


## groupby로 timedelta (`delta`) 구하기

각 유저가 각 테스트를 풀 때 한 문제에 얼마나 걸리는지 `Timestamp`를 diff해서 구한다. 

그런데 이전에 살펴봤듯, 한 유저가 어떤 테스트를 2번 이상 풀게 되면 diff된 timedelta가 말도 안 되게 큰 경우가 생긴다. 
- 어떤 테스트를 3월에 한 번 보고 6월에 또 보면 `['userID', 'testId']`로 groupby했을 때 3월에 본 기록이랑 6월에 본 기록이 함께 붙은채로 나온다. 
- 때문에 그냥 Timestamp diff를 하면 3월에 테스트의 마지막 문제를 푼 시간과 6월에 테스트의 첫번째 문제를 푼 시간이 서로 diff된다. 

따라서 간단하게 해결하기 위해, 1시간보다 큰 값은 같은 테스트를 다시 풀기 시작한 첫 문제라고 가정하고 걸린 시간(`delta`)을 0 seconds로 바꿔준다. 

In [67]:
%%time

# Group the rows per (user, test) pair and take the difference between
# consecutive timestamps inside each group. The first row of every group has
# no predecessor, so its value is NaT.
stu_test_groupby = train_df.groupby(by=['userID', 'testId'])
train_df['delta'] = stu_test_groupby.Timestamp.diff()

CPU times: user 1min 39s, sys: 348 ms, total: 1min 39s
Wall time: 1min 39s


In [72]:
%%time

# The first row of each (userID, testId) group has no previous timestamp, so
# diff() leaves NaT there; count those rows as taking 0 seconds.
# Assign the result back to the column rather than calling
# fillna(inplace=True) on an attribute access, which may operate on a
# temporary object and leave train_df unchanged.
train_df['delta'] = train_df['delta'].fillna(pd.Timedelta(seconds=0))

# A gap longer than one hour is assumed to be the first question of a repeated
# attempt at the same test, so it is reset to 0 seconds.
# This must be a single .loc[mask, column] assignment: the chained form
# train_df[mask].loc[:, 'delta'] = ... writes into a temporary copy and never
# modifies train_df.
train_df.loc[train_df['delta'] > pd.Timedelta(hours=1), 'delta'] = pd.Timedelta(seconds=0)

CPU times: user 48 ms, sys: 72 ms, total: 120 ms
Wall time: 120 ms


In [69]:
train_df.delta

0         0 days 00:00:00
1         0 days 00:00:03
2         0 days 00:00:08
3         0 days 00:00:07
4         0 days 00:00:07
                ...      
2266581   0 days 00:00:24
2266582   0 days 00:00:00
2266583   0 days 00:00:11
2266584   0 days 00:00:46
2266585   0 days 00:01:13
Name: delta, Length: 2266586, dtype: timedelta64[ns]

In [70]:
train_df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,test_cat,question_num,delta
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,6,1,0 days 00:00:00
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,6,2,0 days 00:00:03
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,6,3,0 days 00:00:08
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,6,4,0 days 00:00:07
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,6,5,0 days 00:00:07
...,...,...,...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,3,5,0 days 00:00:24
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,4,1,0 days 00:00:00
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,4,2,0 days 00:00:11
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,4,3,0 days 00:00:46


In [71]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2266586 entries, 0 to 2266585
Data columns (total 9 columns):
 #   Column            Dtype          
---  ------            -----          
 0   userID            int64          
 1   assessmentItemID  object         
 2   testId            object         
 3   answerCode        int64          
 4   Timestamp         datetime64[ns] 
 5   KnowledgeTag      int64          
 6   test_cat          int64          
 7   question_num      int64          
 8   delta             timedelta64[ns]
dtypes: datetime64[ns](1), int64(5), object(2), timedelta64[ns](1)
memory usage: 155.6+ MB


이전과 같이 한 사람이 테스트를 2번 이상 보는 경우를 확인해보자. 

같은 테스트를 다시 풀기 시작할 때 첫 문제의 delta가 정상적으로 0초로 들어간 것을 알 수 있다. 

In [73]:
%%time
# Materialise every (userID, testId) group as a (key, frame) pair.
stu_test_groupby = train_df.groupby(by=['userID', 'testId'])
stu_test_df_l = list(stu_test_groupby)

# Keep only the groups whose earliest and latest timestamps fall on different
# calendar dates, i.e. the same test was worked on across more than one day.
more_than_1_test_df_l = list(filter(
    lambda pair: pair[1].Timestamp.min().date() != pair[1].Timestamp.max().date(),
    stu_test_df_l,
))
len(more_than_1_test_df_l)

CPU times: user 2min 44s, sys: 1.11 s, total: 2min 45s
Wall time: 2min 45s


9492

In [74]:
more_than_1_test_df_l[0][1]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,test_cat,question_num,delta
1166,1,A040155001,A040000155,1,2020-06-21 22:57:14,2111,4,1,0 days 00:00:00
1167,1,A040155002,A040000155,1,2020-06-21 22:57:55,2111,4,2,0 days 00:00:41
1168,1,A040155003,A040000155,0,2020-06-21 22:58:33,2111,4,3,0 days 00:00:38
1169,1,A040155004,A040000155,1,2020-06-21 22:58:49,2111,4,4,0 days 00:00:16
1170,1,A040155005,A040000155,0,2020-06-21 23:00:10,2111,4,5,0 days 00:01:21
1171,1,A040155006,A040000155,0,2020-06-21 23:01:44,2111,4,6,0 days 00:01:34
1397,1,A040155001,A040000155,1,2020-08-17 09:56:27,2111,4,1,0 days 00:00:00
1398,1,A040155002,A040000155,1,2020-08-17 09:57:06,2111,4,2,0 days 00:00:39
1399,1,A040155003,A040000155,1,2020-08-17 09:58:03,2111,4,3,0 days 00:00:57
1400,1,A040155004,A040000155,1,2020-08-17 09:58:29,2111,4,4,0 days 00:00:26
