In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

from torch import nn
import torch
import torch.nn.functional as F
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Config:
    """Notebook-wide settings: compute device, model sizes and data file paths."""

    # Prefer the GPU when one is visible, otherwise fall back to the CPU so the
    # notebook does not fail on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Presumably the maximum interaction-sequence length fed to the model -- TODO confirm.
    MAX_SEQ = 100
    # Presumably the embedding width -- TODO confirm against the model definition.
    EMBED_DIMS = 512
    # Encoder and decoder share the same head count and layer count.
    ENC_HEADS = DEC_HEADS = 8
    NUM_ENCODER = NUM_DECODER = 4
    BATCH_SIZE = 32
    # Raw input CSV files read by the preprocessing cell.
    TRAIN_FILE = "/opt/ml/input/data/train_data.csv"
    TEST_FILE = "/opt/ml/input/data/test_data.csv"
    # NOTE(review): these look like vocabulary sizes for exercise / category
    # embeddings -- verify against the model code and the actual ID ranges.
    TOTAL_EXE = 13523
    TOTAL_CAT = 10000



# 1. 데이터 불러오기

기존 SAINT : train의 Column 종류

row_id, timestamp, user_id, content_id, content_type_id,

task_container_id, user_answer, answered_correctly, prior_question_elapsed_time, prior_question_had_explanation

우리 DKT

userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag

공통적으로 user id와 timestamp가 있고,

콘텐츠 측면에서 나누면 우리 DKT 데이터는 3개 정도로 분류될 것 같고,

SAINT+이기 때문에 time에 관해서 더 프로세싱하면 될 것 같다. (elapsed time과 lag time은 데이터상으로 구별하기 힘들다.)


전처리


'timestamp': 'int64',  -> Timestamp

'user_id': 'int32',  -> userID

'content_id': 'int16' -> assessmentItemID

'answered_correctly': 'int8',   -> answerCode



"content_type_id": "int8", -> 0으로 생성



"prior_question_elapsed_time": "float32", -> Timestamp 변형

"task_container_id": "int16" -> 누적 개수 (새로 생성 필요)



In [3]:
print("loading csv.....")
train_df = pd.read_csv(Config.TRAIN_FILE)
test_df = pd.read_csv(Config.TEST_FILE)
print("shape of (train) dataframe :", train_df.shape)

# Basic preprocessing.
# columns : userID assessmentItemID testId answerCode Timestamp KnowledgeTag grade


def _add_basic_features(df):
    """Return a new frame with ``grade``/``testId`` derived and ``Timestamp`` parsed.

    ``grade`` is the character at position 2 of ``assessmentItemID`` cast to int.
    ``testId`` is that same character followed by characters 4..6 of the ID.
    ``Timestamp`` is converted with ``pd.to_datetime``.
    """
    item_id = df['assessmentItemID']
    return df.assign(
        grade=item_id.str[2].astype('int'),
        testId=item_id.str[2] + item_id.str[4:7],
        Timestamp=pd.to_datetime(df['Timestamp']),
    )


def _build_index(values):
    """Map every distinct value to its order of first appearance (0-based)."""
    return {value: idx for idx, value in enumerate(values.unique())}


def _encode_ids(df):
    """Return a new frame whose ID columns are replaced by the train-derived indices.

    Values that do not occur in the training vocabulary become NaN (``Series.map``
    behaviour) -- NOTE(review): confirm the test set has no unseen IDs.
    """
    return df.assign(
        assessmentItemID=df['assessmentItemID'].map(ItemID2idx),
        testId=df['testId'].map(testId2idx),
        KnowledgeTag=df['KnowledgeTag'].map(Tag2idx),
    )


def _add_elapsed_time(df):
    """Return a new frame with a float ``prior_question_elapsed_time`` column.

    For each row the value is the number of seconds until the *next* row's
    ``Timestamp`` when the next row belongs to the same ``userID``; otherwise
    (user changes, or last row of the frame) it is 0. Rows are compared with
    their immediate neighbour, exactly like the former row-by-row loop, but the
    computation is vectorized, so it finishes in seconds instead of iterating
    over millions of rows and never writes floats into an integer column.
    """
    user = df['userID']
    followed_by_same_user = user.eq(user.shift(-1))
    seconds_to_next = (df['Timestamp'].shift(-1) - df['Timestamp']).dt.total_seconds()
    return df.assign(
        prior_question_elapsed_time=seconds_to_next.where(followed_by_same_user, 0.0)
    )


train_df = _add_basic_features(train_df)
test_df = _add_basic_features(test_df)

# Vocabularies are built from the training set only.
ItemID2idx = _build_index(train_df['assessmentItemID'])  # 9453 entries
testId2idx = _build_index(train_df['testId'])  # 1536 entries
Tag2idx = _build_index(train_df['KnowledgeTag'])  # 911 entries

# Alternative (disabled): build the vocabularies from the test set instead.
# ItemID2idx = {v:k for k,v in enumerate(test_df['assessmentItemID'].unique())} # 9453 entries
# testId2idx = {v:k for k,v in enumerate(test_df['testId'].unique())} # 1536 entries
# Tag2idx = {v:k for k,v in enumerate(test_df['KnowledgeTag'].unique())} # 911 entries

train_df = _encode_ids(train_df)
test_df = _encode_ids(test_df)

# # Create required column - 'content_type_id'
# train_df['content_type_id'] = 0
# test_df['content_type_id'] = 0

# Create required column - 'prior_question_elapsed_time', then drop the raw
# Timestamp, which is no longer needed once the gaps are computed.
train_df = _add_elapsed_time(train_df).drop(['Timestamp'], axis=1)
test_df = _add_elapsed_time(test_df).drop(['Timestamp'], axis=1)

# # Create required column - 'task_container_id'
# train_df['task_container_id'] = 0
# test_df['task_container_id'] = 0

# for i in range(1,len(train_df)):
#     if train_df.at[i,'userID'] == train_df.at[i-1,'userID']:
#         train_df.at[i,'task_container_id'] = train_df.at[i-1,'task_container_id'] + 1
#     if i %100000 == 0:
#         print(f'train task process  {i}/{len(train_df)} done ')

# for i in range(1,len(test_df)):
#     if test_df.at[i,'userID'] == test_df.at[i-1,'userID']:
#         test_df.at[i,'task_container_id'] = test_df.at[i-1,'task_container_id'] + 1
#     if i %100000 == 0:
#         print(f'test task process  {i}/{len(test_df)} done ')

# # Keep only what SAINT needs
# train_df = train_df.drop(['testId','KnowledgeTag','grade'], axis=1)
# test_df = test_df.drop(['testId','KnowledgeTag','grade'], axis=1)

# # change to SAINT Input
# train_df  = train_df.rename(columns = {'userID': 'user_id', 'Timestamp':'timestamp', 'assessmentItemID' : 'content_id',
#                            'answerCode' : 'answered_correctly'  })
# test_df  = test_df.rename(columns = {'userID': 'user_id', 'Timestamp':'timestamp', 'assessmentItemID' : 'content_id',
#                            'answerCode' : 'answered_correctly'  })


loading csv.....
shape of (train) dataframe : (2266586, 6)
train elapsetime process  0/2266586 done 
train elapsetime process  100000/2266586 done 


In [None]:
# Data augmentation: set the number of augmentation rounds in `aug`.
aug = 5


def _augment_by_truncation(df, rounds, min_offset=7442):
    """Return ``df`` followed by ``rounds`` progressively truncated copies.

    In every round the last row of each ``userID`` is removed from the previous
    round's copy and the remaining rows get their ``userID`` shifted by a fixed
    offset, so each truncated history looks like a brand-new user. The offset is
    at least ``min_offset`` (the value previously hard-coded) and is raised to
    ``max(userID) + 1`` when needed so shifted IDs can never collide with real ones.

    Rows are selected positionally via ``duplicated(keep="last")`` (True for every
    row except each user's last one), so the result is also correct when ``df``
    carries duplicate index labels, e.g. when the cell is re-run on an already
    augmented frame. All pieces are concatenated once at the end instead of
    re-copying the growing frame in every round.
    """
    max_user_id = df['userID'].max()
    offset = min_offset if pd.isna(max_user_id) else max(min_offset, int(max_user_id) + 1)

    pieces = [df]
    truncated = df
    for round_no in range(1, rounds + 1):
        print(f'START {round_no}th AUGMENTATION')
        not_last_row = truncated.duplicated(subset=["userID"], keep="last")
        truncated = truncated[not_last_row].assign(userID=lambda d: d['userID'] + offset)
        pieces.append(truncated)
        print(f'END   {round_no}th AUGMENTATION')
    return pd.concat(pieces, axis=0)


print(f'======origin length      : {len(train_df)}======')
# NOTE: this rebinds `train_df`; re-running the cell augments the already
# augmented frame again. Re-run the preprocessing cell first for a clean result.
train_df = _augment_by_truncation(train_df, aug)
print(f'======after augmentation : {len(train_df)}======')

START 5th AUGMENTATION
END   5th AUGMENTATION
START 6th AUGMENTATION
END   6th AUGMENTATION
START 7th AUGMENTATION
END   7th AUGMENTATION
START 8th AUGMENTATION
END   8th AUGMENTATION
START 9th AUGMENTATION
END   9th AUGMENTATION


In [18]:
# Write both frames to CSV without the index column. The training file name
# records the augmentation factor `aug`.
for frame, out_path in (
    (train_df, '/opt/ml/input/data/train_saint_dkt_x' + str(aug) + '.csv'),
    (test_df, '/opt/ml/input/data/test_saint_dkt.csv'),
):
    frame.to_csv(out_path, index=False)