In [27]:
import pandas as pd
import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F 
import copy
import math

from tqdm import tqdm

In [28]:
dtype = {
    'userID': 'int16',
    'answerCode': 'int8',
    'KnowledgeTag': 'int16'
}   

In [29]:
TEST_DATA_PATH = '/opt/ml/input/data/train_dataset/test_all_three.csv'
test_df = pd.read_csv(TEST_DATA_PATH, dtype=dtype, parse_dates=['Timestamp'])
test_df = test_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)

In [30]:
test_df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,elapsed
0,3,A050023001,A050000023,1,2020-01-09 10:56:31,2626,26.0
1,3,A050023002,A050000023,1,2020-01-09 10:56:57,2626,94.0
2,3,A050023003,A050000023,0,2020-01-09 10:58:31,2625,5.0
3,3,A050023004,A050000023,0,2020-01-09 10:58:36,2625,7.0
4,3,A050023006,A050000023,0,2020-01-09 10:58:43,2623,3.0


In [31]:
test_df['UID']=test_df['userID'].apply(lambda x:str(x).zfill(6))+test_df['testId']

In [32]:
test_df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,elapsed,UID
0,3,A050023001,A050000023,1,2020-01-09 10:56:31,2626,26.0,000003A050000023
1,3,A050023002,A050000023,1,2020-01-09 10:56:57,2626,94.0,000003A050000023
2,3,A050023003,A050000023,0,2020-01-09 10:58:31,2625,5.0,000003A050000023
3,3,A050023004,A050000023,0,2020-01-09 10:58:36,2625,7.0,000003A050000023
4,3,A050023006,A050000023,0,2020-01-09 10:58:43,2623,3.0,000003A050000023


In [36]:
#userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,elapsed
group = test_df[list(test_df.columns)].groupby('UID').apply(
                lambda r: (
                    r['answerCode'].values,
                    r['assessmentItemID'].values, 
                    r['testId'].values,
                    r['Timestamp'].values,
                    r['KnowledgeTag'].values,
                    r['elapsed'].values,
                )
            )

In [37]:
group[0]

(array([1, 1, 1, 1, 0], dtype=int8),
 array(['A020001001', 'A020001002', 'A020001003', 'A020001004',
        'A020001005'], dtype=object),
 array(['A020000001', 'A020000001', 'A020000001', 'A020000001',
        'A020000001'], dtype=object),
 array(['2020-03-12T00:59:09.000000000', '2020-03-12T00:59:25.000000000',
        '2020-03-12T00:59:34.000000000', '2020-03-12T01:00:19.000000000',
        '2020-03-12T01:01:13.000000000'], dtype='datetime64[ns]'),
 array([7916, 7916, 7916, 7916, 7916], dtype=int16),
 array([16.       ,  9.       , 45.       , 54.       , 45.7140266]))

In [40]:
for idx,g in enumerate(group):
    if idx<15:
        print(g)

(array([1, 1, 1, 1, 0], dtype=int8), array(['A020001001', 'A020001002', 'A020001003', 'A020001004',
       'A020001005'], dtype=object), array(['A020000001', 'A020000001', 'A020000001', 'A020000001',
       'A020000001'], dtype=object), array(['2020-03-12T00:59:09.000000000', '2020-03-12T00:59:25.000000000',
       '2020-03-12T00:59:34.000000000', '2020-03-12T01:00:19.000000000',
       '2020-03-12T01:01:13.000000000'], dtype='datetime64[ns]'), array([7916, 7916, 7916, 7916, 7916], dtype=int16), array([16.       ,  9.       , 45.       , 54.       , 45.7140266]))
(array([1, 1, 1, 1, 1], dtype=int8), array(['A020005001', 'A020005002', 'A020005003', 'A020005004',
       'A020005005'], dtype=object), array(['A020000005', 'A020000005', 'A020000005', 'A020000005',
       'A020000005'], dtype=object), array(['2020-03-03T07:37:10.000000000', '2020-03-03T07:37:32.000000000',
       '2020-03-03T07:37:38.000000000', '2020-03-03T07:37:44.000000000',
       '2020-03-03T07:38:17.000000000'], dtype=

In [41]:
z_d = test_df.groupby(['UID'])['answerCode'].agg(['min']).to_dict()

In [49]:
test_df["min"]=test_df.UID.map(z_d['min'])

In [50]:
test_df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,elapsed,UID,min
0,3,A050023001,A050000023,1,2020-01-09 10:56:31,2626,26.0,000003A050000023,0
1,3,A050023002,A050000023,1,2020-01-09 10:56:57,2626,94.0,000003A050000023,0
2,3,A050023003,A050000023,0,2020-01-09 10:58:31,2625,5.0,000003A050000023,0
3,3,A050023004,A050000023,0,2020-01-09 10:58:36,2625,7.0,000003A050000023,0
4,3,A050023006,A050000023,0,2020-01-09 10:58:43,2623,3.0,000003A050000023,0


In [61]:
ret=test_df[test_df['min']==-1]
ret.reset_index(drop=True)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,elapsed,UID,min
0,3,A050133001,A050000133,1,2020-10-26 13:08:41,5288,47.00000,000003A050000133,-1
1,3,A050133002,A050000133,1,2020-10-26 13:09:28,5288,54.00000,000003A050000133,-1
2,3,A050133003,A050000133,1,2020-10-26 13:10:22,5289,109.00000,000003A050000133,-1
3,3,A050133004,A050000133,1,2020-10-26 13:12:11,5289,25.00000,000003A050000133,-1
4,3,A050133005,A050000133,1,2020-10-26 13:12:36,5288,16.00000,000003A050000133,-1
...,...,...,...,...,...,...,...,...,...
4832,7439,A040130001,A040000130,0,2020-10-14 23:07:23,8832,18.00000,007439A040000130,-1
4833,7439,A040130002,A040000130,1,2020-10-14 23:07:41,8832,21.00000,007439A040000130,-1
4834,7439,A040130003,A040000130,1,2020-10-14 23:08:02,8244,89.00000,007439A040000130,-1
4835,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244,32.00000,007439A040000130,-1


In [62]:
len(ret.groupby("UID"))

744

In [63]:
ret.head(50)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,elapsed,UID,min
1028,3,A050133001,A050000133,1,2020-10-26 13:08:41,5288,47.0,000003A050000133,-1
1029,3,A050133002,A050000133,1,2020-10-26 13:09:28,5288,54.0,000003A050000133,-1
1030,3,A050133003,A050000133,1,2020-10-26 13:10:22,5289,109.0,000003A050000133,-1
1031,3,A050133004,A050000133,1,2020-10-26 13:12:11,5289,25.0,000003A050000133,-1
1032,3,A050133005,A050000133,1,2020-10-26 13:12:36,5288,16.0,000003A050000133,-1
1033,3,A050133006,A050000133,1,2020-10-26 13:12:52,5288,19.0,000003A050000133,-1
1034,3,A050133007,A050000133,0,2020-10-26 13:13:11,5289,46.0,000003A050000133,-1
1035,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289,89.076807,000003A050000133,-1
1699,4,A070146001,A070000146,1,2020-12-27 02:45:02,9079,11.0,000004A070000146,-1
1700,4,A070146002,A070000146,1,2020-12-27 02:45:13,9080,33.0,000004A070000146,-1


In [64]:
del ret['min']

In [65]:
ret.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,elapsed,UID
1028,3,A050133001,A050000133,1,2020-10-26 13:08:41,5288,47.0,000003A050000133
1029,3,A050133002,A050000133,1,2020-10-26 13:09:28,5288,54.0,000003A050000133
1030,3,A050133003,A050000133,1,2020-10-26 13:10:22,5289,109.0,000003A050000133
1031,3,A050133004,A050000133,1,2020-10-26 13:12:11,5289,25.0,000003A050000133
1032,3,A050133005,A050000133,1,2020-10-26 13:12:36,5288,16.0,000003A050000133


In [66]:
ret.to_csv('/opt/ml/input/data/train_dataset/test_uid.csv',index=False)

In [67]:
TRAIN_DATA_PATH = '/opt/ml/input/data/train_dataset/train_all_three.csv'
train_df = pd.read_csv(TRAIN_DATA_PATH, dtype=dtype, parse_dates=['Timestamp'])
train_df = train_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)

In [68]:
train_df['UID']=train_df['userID'].apply(lambda x:str(x).zfill(6))+train_df['testId']

In [69]:
train_df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,elapsed,UID
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,3.0,000000A060000001
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,8.0,000000A060000001
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,7.0,000000A060000001
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,7.0,000000A060000001
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,11.0,000000A060000001


In [70]:
train_df.to_csv('/opt/ml/input/data/train_dataset/train_uid.csv',index=False)