## 1. Data Load

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme(color_codes=True)
import missingno as msno
import time
from datetime import datetime
from tqdm import tqdm

In [10]:
%%time
dtype = {
    'userID': 'int16',
    'answerCode': 'int8',
    'KnowledgeTag': 'int16'
}   

# 데이터 경로 맞춰주세요!
DATA_PATH = './input/data/train_dataset/train_data.csv'

df = pd.read_csv(DATA_PATH)
# df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)

CPU times: user 2.71 s, sys: 232 ms, total: 2.94 s
Wall time: 2.97 s


In [12]:
# Preview the freshly loaded frame (the notebook renders the last expression).
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225
...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836


### Time convert

In [4]:
def convert_time(s):
    """Convert a 'YYYY-MM-DD HH:MM:SS' string to integer epoch seconds.

    The wall-clock value is interpreted as UTC, so the result does not depend
    on the timezone or DST rules of the machine running the notebook (the
    previous ``time.mktime`` based version used the host's local time).

    Args:
        s: Timestamp text in the exact format ``%Y-%m-%d %H:%M:%S``.

    Returns:
        Whole seconds elapsed since 1970-01-01 00:00:00, as an ``int``.

    Raises:
        ValueError: If ``s`` does not match the expected format.
    """
    parsed = datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
    # Subtracting the naive epoch keeps the arithmetic timezone-free.
    return int((parsed - datetime(1970, 1, 1)).total_seconds())

# Replace the textual timestamps with integer epoch seconds, value by value.
df['Timestamp'] = df['Timestamp'].map(convert_time)

In [5]:
# Order the interactions chronologically within each user.
# The original row labels are kept (the index is not reset).
df = df.sort_values(['userID', 'Timestamp'])

## 2. 사용자별 누적 문제 풀이 갯수 & 정답률

In [6]:
# Per-user running statistics that only look at *earlier* rows, so a row's
# own answer never contributes to its features.
user_answers = df[['userID', 'answerCode']].groupby('userID')

# Correct answers the user gave before this row (NaN on the user's first row).
user_cumsum = user_answers.cumsum().groupby(df['userID']).shift(1)
# Interactions the user had before this row (0 on the user's first row).
user_cumcount = user_answers.cumcount().to_frame('answerCode')
# Running accuracy; NaN on the first row because there is no history yet.
user_ans = user_cumsum / user_cumcount

# The intermediates are one-column frames; pick the column explicitly.
df['user_ans'] = user_ans['answerCode']
df['user_cnt'] = user_cumcount['answerCode']

In [9]:
# Preview the frame after adding the per-user features.
# NOTE(review): the output recorded under this cell has no user_ans / user_cnt
# columns and still shows textual timestamps, so it appears to come from a
# different run -- re-execute before relying on it.
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,3,A050023001,A050000023,1,2020-01-09 10:56:31,2626
1,3,A050023002,A050000023,1,2020-01-09 10:56:57,2626
2,3,A050023003,A050000023,0,2020-01-09 10:58:31,2625
3,3,A050023004,A050000023,0,2020-01-09 10:58:36,2625
4,3,A050023006,A050000023,0,2020-01-09 10:58:43,2623
...,...,...,...,...,...,...
260109,7439,A040130001,A040000130,0,2020-10-14 23:07:23,8832
260110,7439,A040130002,A040000130,1,2020-10-14 23:07:41,8832
260111,7439,A040130003,A040000130,1,2020-10-14 23:08:02,8244
260112,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244


## 3. Elapsed time

In [8]:
# Seconds from this interaction to the same user's next one; NaN on each
# user's last row. The look-ahead is done per user, so the result does not
# rely on every user's rows being stored contiguously.
df['elapsed_time'] = df.groupby('userID')['Timestamp'].shift(-1) - df['Timestamp']

In [9]:
# Preview the frame with the new elapsed_time column.
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_ans,user_cnt,elapsed_time
0,3,A050023001,A050000023,1,1578567391,2626,,0,26.0
1,3,A050023002,A050000023,1,1578567417,2626,1.000000,1,94.0
2,3,A050023003,A050000023,0,1578567511,2625,1.000000,2,5.0
3,3,A050023004,A050000023,0,1578567516,2625,0.666667,3,7.0
4,3,A050023006,A050000023,0,1578567523,2623,0.500000,4,3.0
...,...,...,...,...,...,...,...,...,...
260109,7439,A040130001,A040000130,0,1602716843,8832,0.727273,11,18.0
260110,7439,A040130002,A040000130,1,1602716861,8832,0.666667,12,21.0
260111,7439,A040130003,A040000130,1,1602716882,8244,0.692308,13,89.0
260112,7439,A040130004,A040000130,1,1602716971,8244,0.714286,14,32.0


## 4. assessmentItemID -> Category, Number

In [10]:
# Category code = third character of the item ID (e.g. 'A050023001' -> '5').
# The vectorized string accessor replaces a per-row Python lambda and leaves
# missing IDs as NaN instead of raising.
df['category'] = df['assessmentItemID'].str[2]

In [11]:
# Check the first rows after adding the category column.
df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_ans,user_cnt,elapsed_time,category
0,3,A050023001,A050000023,1,1578567391,2626,,0,26.0,5
1,3,A050023002,A050000023,1,1578567417,2626,1.0,1,94.0,5
2,3,A050023003,A050000023,0,1578567511,2625,1.0,2,5.0,5
3,3,A050023004,A050000023,0,1578567516,2625,0.666667,3,7.0,5
4,3,A050023006,A050000023,0,1578567523,2623,0.5,4,3.0,5


### Category별 누적 정답률

In [12]:
# Running statistics per (user, category) pair, based on earlier rows only.
cate_answers = df[['userID', 'answerCode', 'category']].groupby(['userID', 'category'])

# Correct answers the user gave in this category before this row
# (NaN on the pair's first row).
cate_cumsum = cate_answers.cumsum().groupby([df['userID'], df['category']]).shift(1)
# Interactions the user had in this category before this row (0 on the first).
cate_cumcount = cate_answers.cumcount().to_frame('answerCode')
# Running accuracy within the category; NaN while there is no history.
cate_ans = cate_cumsum / cate_cumcount

# The intermediates are one-column frames; pick the column explicitly.
df['cate_ans'] = cate_ans['answerCode']
df['cate_cnt'] = cate_cumcount['answerCode']

In [13]:
# Check the first rows after adding the per-category features.
df.head(10)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_ans,user_cnt,elapsed_time,category,cate_ans,cate_cnt
0,3,A050023001,A050000023,1,1578567391,2626,,0,26.0,5,,0
1,3,A050023002,A050000023,1,1578567417,2626,1.0,1,94.0,5,1.0,1
2,3,A050023003,A050000023,0,1578567511,2625,1.0,2,5.0,5,1.0,2
3,3,A050023004,A050000023,0,1578567516,2625,0.666667,3,7.0,5,0.666667,3
4,3,A050023006,A050000023,0,1578567523,2623,0.5,4,3.0,5,0.5,4
5,3,A050023007,A050000023,0,1578567526,2623,0.4,5,25.0,5,0.4,5
6,3,A050023005,A050000023,0,1578567551,192,0.333333,6,352664.0,5,0.333333,6
7,3,A050027001,A050000027,0,1578920215,3691,0.285714,7,151.0,5,0.285714,7
8,3,A050027002,A050000027,0,1578920366,3691,0.25,8,63.0,5,0.25,8
9,3,A050027003,A050000027,0,1578920429,3682,0.222222,9,17.0,5,0.222222,9


In [14]:
# Short item code: first character plus the last six characters of the item ID
# (e.g. 'A050023001' -> 'A023001'), built with vectorized string slicing.
df['number'] = df['assessmentItemID'].str[0] + df['assessmentItemID'].str[-6:]

## 5. testId별 누적 정답률

In [15]:
# Correct answers the user gave within the same test before this row
# (NaN on the first row of each (user, test) pair). Kept as a one-column
# frame named 'answerCode'.
test_cumsum = (
    df[['userID', 'answerCode', 'testId']]
    .groupby(['userID', 'testId'])
    .cumsum()
    .groupby([df['userID'], df['testId']])
    .shift(1)
)

In [16]:
# Interactions the user had within the same test before this row (0 on the
# first row of each (user, test) pair). Kept as a one-column frame named
# 'answerCode' so it lines up with test_cumsum in the division that follows.
test_cumcount = (
    df[['userID', 'answerCode', 'testId']]
    .groupby(['userID', 'testId'])
    .cumcount()
    .to_frame('answerCode')
)

In [17]:
# Running accuracy within each (user, test) pair; NaN where the prior count is 0.
test_ans = test_cumsum.div(test_cumcount)

# All three intermediates are one-column frames keyed 'answerCode'.
df['test_ans'] = test_ans['answerCode']
df['test_cnt'] = test_cumcount['answerCode']
df['test_cumsum'] = test_cumsum['answerCode']

In [18]:
# Check the first rows after adding the per-test features.
df.head(20)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_ans,user_cnt,elapsed_time,category,cate_ans,cate_cnt,number,test_ans,test_cnt,test_cumsum
0,3,A050023001,A050000023,1,1578567391,2626,,0,26.0,5,,0,A023001,,0,
1,3,A050023002,A050000023,1,1578567417,2626,1.0,1,94.0,5,1.0,1,A023002,1.0,1,1.0
2,3,A050023003,A050000023,0,1578567511,2625,1.0,2,5.0,5,1.0,2,A023003,1.0,2,2.0
3,3,A050023004,A050000023,0,1578567516,2625,0.666667,3,7.0,5,0.666667,3,A023004,0.666667,3,2.0
4,3,A050023006,A050000023,0,1578567523,2623,0.5,4,3.0,5,0.5,4,A023006,0.5,4,2.0
5,3,A050023007,A050000023,0,1578567526,2623,0.4,5,25.0,5,0.4,5,A023007,0.4,5,2.0
6,3,A050023005,A050000023,0,1578567551,192,0.333333,6,352664.0,5,0.333333,6,A023005,0.333333,6,2.0
7,3,A050027001,A050000027,0,1578920215,3691,0.285714,7,151.0,5,0.285714,7,A027001,,0,
8,3,A050027002,A050000027,0,1578920366,3691,0.25,8,63.0,5,0.25,8,A027002,0.0,1,0.0
9,3,A050027003,A050000027,0,1578920429,3682,0.222222,9,17.0,5,0.222222,9,A027003,0.0,2,0.0


## 6. KnowledgeTag별 누적 정답률

In [19]:
# Running statistics per (user, KnowledgeTag) pair, based on earlier rows only.
tag_answers = df[['userID', 'answerCode', 'KnowledgeTag']].groupby(['userID', 'KnowledgeTag'])

# Correct answers the user gave on this tag before this row
# (NaN on the pair's first row).
tag_cumsum = tag_answers.cumsum().groupby([df['userID'], df['KnowledgeTag']]).shift(1)
# Interactions the user had on this tag before this row (0 on the first).
tag_cumcount = tag_answers.cumcount().to_frame('answerCode')
# Running accuracy on the tag; NaN while there is no history.
tag_ans = tag_cumsum / tag_cumcount

# The intermediates are one-column frames; pick the column explicitly.
df['tag_ans'] = tag_ans['answerCode']
df['tag_cnt'] = tag_cumcount['answerCode']

In [20]:
# Check the first rows after adding the per-tag features.
df.head(20)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_ans,user_cnt,elapsed_time,category,cate_ans,cate_cnt,number,test_ans,test_cnt,test_cumsum,tag_ans,tag_cnt
0,3,A050023001,A050000023,1,1578567391,2626,,0,26.0,5,,0,A023001,,0,,,0
1,3,A050023002,A050000023,1,1578567417,2626,1.0,1,94.0,5,1.0,1,A023002,1.0,1,1.0,1.0,1
2,3,A050023003,A050000023,0,1578567511,2625,1.0,2,5.0,5,1.0,2,A023003,1.0,2,2.0,,0
3,3,A050023004,A050000023,0,1578567516,2625,0.666667,3,7.0,5,0.666667,3,A023004,0.666667,3,2.0,0.0,1
4,3,A050023006,A050000023,0,1578567523,2623,0.5,4,3.0,5,0.5,4,A023006,0.5,4,2.0,,0
5,3,A050023007,A050000023,0,1578567526,2623,0.4,5,25.0,5,0.4,5,A023007,0.4,5,2.0,0.0,1
6,3,A050023005,A050000023,0,1578567551,192,0.333333,6,352664.0,5,0.333333,6,A023005,0.333333,6,2.0,,0
7,3,A050027001,A050000027,0,1578920215,3691,0.285714,7,151.0,5,0.285714,7,A027001,,0,,,0
8,3,A050027002,A050000027,0,1578920366,3691,0.25,8,63.0,5,0.25,8,A027002,0.0,1,0.0,0.0,1
9,3,A050027003,A050000027,0,1578920429,3682,0.222222,9,17.0,5,0.222222,9,A027003,0.0,2,0.0,,0


### KnowledgeTag별 평균 정답률

In [21]:
# Mean answerCode per KnowledgeTag over every row of df: a one-column lookup
# table indexed by tag.
tag_group = df.groupby('KnowledgeTag')[['answerCode']].mean()

In [22]:
# Spot-check the lookup table for tag 23.
tag_group.loc[23, 'answerCode']

0.67

In [23]:
def tag_mean(x):
    """Return the mean answerCode stored in ``tag_group`` for tag ``x``.

    Reads the notebook-level ``tag_group`` table; a tag that is not in its
    index raises ``KeyError``.
    """
    return tag_group.loc[x, 'answerCode']

In [24]:
# Attach each row's tag-level mean answerCode. Mapping through the lookup
# column is one vectorized pass, replacing a Python-level .loc lookup per row.
df['tag_mean'] = df['KnowledgeTag'].map(tag_group['answerCode'])

In [25]:
# List every column now present on the engineered frame.
df.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'user_ans', 'user_cnt', 'elapsed_time', 'category',
       'cate_ans', 'cate_cnt', 'number', 'test_ans', 'test_cnt', 'test_cumsum',
       'tag_ans', 'tag_cnt', 'tag_mean'],
      dtype='object')

In [16]:
# Second input: a separately prepared feature table. Its displayed columns
# include relElapsedTime and datasetType, which are used further down.
DATA_PATH_3 = './input/data/train_dataset/tuning_data.csv'

data = pd.read_csv(DATA_PATH_3)

In [17]:
# Keep only rows whose datasetType is 0; the surviving rows keep their
# original index labels.
# NOTE(review): presumably 0 marks the training split -- confirm.
data = data.loc[data['datasetType'] == 0]

In [18]:
# Preview the filtered feature table.
data

Unnamed: 0,userID,testCategory,testID,assmtID,assmtCorrectRate,assmtETMedian,cumInteraction,cumCorrectRate,retry,timestamp,unixTime,relTime2Last,elapsedTime,_elapsedTime,relElapsedTime,knowledgeTag,cumTag,cumTagCorrectRate,answerCode,datasetType
0,0,60,A060000001,A060001001,0.984000,6,0,0.500000,0,2020-03-24 00:17:11,1585009031,-23685788,3,3,0.500000,7224,0,0.500000,1,0
1,0,60,A060000001,A060001002,0.968000,14,1,1.000000,0,2020-03-24 00:17:14,1585009034,-23685785,8,8,0.571429,7225,0,0.500000,1,0
2,0,60,A060000001,A060001003,0.916000,10,2,1.000000,0,2020-03-24 00:17:22,1585009042,-23685777,7,7,0.700000,7225,1,1.000000,1,0
3,0,60,A060000001,A060001004,0.972000,11,3,1.000000,0,2020-03-24 00:17:29,1585009049,-23685770,7,7,0.636364,7225,2,1.000000,1,0
4,0,60,A060000001,A060001005,0.948000,21,4,1.000000,0,2020-03-24 00:17:36,1585009056,-23685763,11,11,0.523810,7225,3,1.000000,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266581,7441,30,A030000071,A030071005,0.446667,46,4,0.292486,0,2020-06-05 06:50:21,1591339821,-6632308,89,-2,1.934783,438,4,0.292486,0,0
2266582,7441,40,A040000165,A040165001,0.643333,17,5,0.253253,0,2020-08-21 01:06:39,1597971999,-130,11,11,0.647059,8836,0,0.500000,1,0
2266583,7441,40,A040000165,A040165002,0.640000,14,6,0.374161,0,2020-08-21 01:06:50,1597972010,-119,46,46,3.285714,8836,1,1.000000,1,0
2266584,7441,40,A040000165,A040165003,0.786667,78,7,0.460983,0,2020-08-21 01:07:36,1597972056,-73,73,73,0.935897,8836,2,1.000000,1,0


In [19]:
# Third input: a CSV whose columns match the engineered df built above.
# NOTE(review): no cell in this notebook writes this file -- presumably it was
# saved from df by a cell that has since been removed; confirm its provenance.
DATA_PATH_4 = './input/data/train_dataset/tuning_train_data.csv'

data2 = pd.read_csv(DATA_PATH_4)

In [20]:
# Preview the reloaded engineered features.
data2

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_ans,user_cnt,elapsed_time,category,cate_ans,cate_cnt,number,test_ans,test_cnt,test_cumsum,tag_ans,tag_cnt,tag_mean
0,0,A060001001,A060000001,1,1585009031,7224,,0,3.0,6,,0,A001001,,0,,,0,0.955022
1,0,A060001002,A060000001,1,1585009034,7225,1.000000,1,8.0,6,1.00,1,A001002,1.00,1,1.0,,0,0.913187
2,0,A060001003,A060000001,1,1585009042,7225,1.000000,2,7.0,6,1.00,2,A001003,1.00,2,2.0,1.00,1,0.913187
3,0,A060001004,A060000001,1,1585009049,7225,1.000000,3,7.0,6,1.00,3,A001004,1.00,3,3.0,1.00,2,0.913187
4,0,A060001005,A060000001,1,1585009056,7225,1.000000,4,11.0,6,1.00,4,A001005,1.00,4,4.0,1.00,3,0.913187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,1591339821,438,0.250000,4,6632178.0,3,0.25,4,A071005,0.25,4,1.0,0.25,4,0.689706
2266582,7441,A040165001,A040000165,1,1597971999,8836,0.200000,5,11.0,4,,0,A165001,,0,,,0,0.697874
2266583,7441,A040165002,A040000165,1,1597972010,8836,0.333333,6,46.0,4,1.00,1,A165002,1.00,1,1.0,1.00,1,0.697874
2266584,7441,A040165003,A040000165,1,1597972056,8836,0.428571,7,73.0,4,1.00,2,A165003,1.00,2,2.0,1.00,2,0.697874


In [21]:
# Copy relElapsedTime from `data` onto `data2`. The two frames come from
# different files and are matched purely by row label, so first verify that
# the aligned rows really describe the same (user, item) interaction; a
# mismatch or a missing label would otherwise yield silently wrong features.
tuning_aligned = data.reindex(data2.index)
assert (tuning_aligned['userID'] == data2['userID']).all(), \
    'userID differs between data and data2 on aligned rows'
assert (tuning_aligned['assmtID'] == data2['assessmentItemID']).all(), \
    'item ID differs between data and data2 on aligned rows'

data2['relElapsedTime'] = tuning_aligned['relElapsedTime']

In [22]:
# Confirm that relElapsedTime is now among the columns.
data2.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'user_ans', 'user_cnt', 'elapsed_time', 'category',
       'cate_ans', 'cate_cnt', 'number', 'test_ans', 'test_cnt', 'test_cumsum',
       'tag_ans', 'tag_cnt', 'tag_mean', 'relElapsedTime'],
      dtype='object')

In [24]:
# Preview the combined feature table that gets written out.
data2

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_ans,user_cnt,elapsed_time,category,cate_ans,cate_cnt,number,test_ans,test_cnt,test_cumsum,tag_ans,tag_cnt,tag_mean,relElapsedTime
0,0,A060001001,A060000001,1,1585009031,7224,,0,3.0,6,,0,A001001,,0,,,0,0.955022,0.500000
1,0,A060001002,A060000001,1,1585009034,7225,1.000000,1,8.0,6,1.00,1,A001002,1.00,1,1.0,,0,0.913187,0.571429
2,0,A060001003,A060000001,1,1585009042,7225,1.000000,2,7.0,6,1.00,2,A001003,1.00,2,2.0,1.00,1,0.913187,0.700000
3,0,A060001004,A060000001,1,1585009049,7225,1.000000,3,7.0,6,1.00,3,A001004,1.00,3,3.0,1.00,2,0.913187,0.636364
4,0,A060001005,A060000001,1,1585009056,7225,1.000000,4,11.0,6,1.00,4,A001005,1.00,4,4.0,1.00,3,0.913187,0.523810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,1591339821,438,0.250000,4,6632178.0,3,0.25,4,A071005,0.25,4,1.0,0.25,4,0.689706,1.934783
2266582,7441,A040165001,A040000165,1,1597971999,8836,0.200000,5,11.0,4,,0,A165001,,0,,,0,0.697874,0.647059
2266583,7441,A040165002,A040000165,1,1597972010,8836,0.333333,6,46.0,4,1.00,1,A165002,1.00,1,1.0,1.00,1,0.697874,3.285714
2266584,7441,A040165003,A040000165,1,1597972056,8836,0.428571,7,73.0,4,1.00,2,A165003,1.00,2,2.0,1.00,2,0.697874,0.935897


In [23]:
# Write the combined features to disk without the index column; an existing
# file at this path is overwritten.
data2.to_csv('./input/data/train_dataset/tuning_data2.csv', index=False)