In [1]:
# dkt 가상 환경에서 진행.
import numpy as np
import pandas as pd

import math
import time
from datetime import datetime
from sklearn.metrics import accuracy_score, roc_auc_score

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

try:
    from transformers.modeling_bert import BertConfig, BertEncoder, BertModel
except:
    from transformers.models.bert.modeling_bert import (
        BertConfig,
        BertEncoder,
        BertModel,
    )

import os
import random

import warnings

warnings.filterwarnings(action='ignore')


path = '/opt/ml/input/data/'

train_path = os.path.join(path, 'train_data.csv')
test_path = os.path.join(path, 'test_data.csv')

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

class args: 
    gpu_idx = 0
    device = torch.device("cuda:{}".format(gpu_idx) if torch.cuda.is_available() else "cpu")
    seed = 64
    max_seq_len = 20 # 최대 문장 길이.
    num_workers = 1
    batch_size = 128
    n_epochs = 10
    hidden_dim = 8
    n_layers = 3
    lr = 0.01
    clip_grad = 10
    log_steps = 200
    augmentation = 50 # 데이터 증강 횟수.
    

In [2]:
# 기본적인 데이터 전처리 부분.
train['grade'] = (train['assessmentItemID'].str[2]).astype('int')
test['grade'] = (test['assessmentItemID'].str[2]).astype('int')
train['testId'] = (train['assessmentItemID'].str[2] + train['assessmentItemID'].str[4:7])
test['testId'] = test['assessmentItemID'].str[2] + test['assessmentItemID'].str[4:7]
train["Timestamp"] = pd.to_datetime(train["Timestamp"])
test["Timestamp"] = pd.to_datetime(test["Timestamp"])

ItemID2idx = {v:k for k,v in enumerate(train['assessmentItemID'].unique())} # 9453개
testId2idx = {v:k for k,v in enumerate(train['testId'].unique())} # 1536개
Tag2idx = {v:k for k,v in enumerate(train['KnowledgeTag'].unique())} # 911개

args.n_item = train['assessmentItemID'].nunique()
args.n_test = train['testId'].nunique()
args.n_tag = train['KnowledgeTag'].nunique()
args.n_grade = train['grade'].nunique()


train['assessmentItemID'] = train['assessmentItemID'].map(ItemID2idx)
train['testId'] = train['testId'].map(testId2idx)
train['KnowledgeTag'] = train['KnowledgeTag'].map(Tag2idx)

test['assessmentItemID'] = test['assessmentItemID'].map(ItemID2idx)
test['testId'] = test['testId'].map(testId2idx)
test['KnowledgeTag'] = test['KnowledgeTag'].map(Tag2idx)

train.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,grade
0,0,0,0,1,2020-03-24 00:17:11,0,6
1,0,1,0,1,2020-03-24 00:17:14,1,6
2,0,2,0,1,2020-03-24 00:17:22,1,6
3,0,3,0,1,2020-03-24 00:17:29,1,6
4,0,4,0,1,2020-03-24 00:17:36,1,6


In [3]:
# 데이터 증강 부분. 선태 코드 참고했으.
# 근데 이런식으로 증강하게 되면 이전에 1문제 풀었는데 이번 문제 맞추라는 식의 데이터가 생김
# test 데이터는 내가 알기로 1개 빼고는 이전 푼 문제가 15문제니 만큼 이에 잘 맞추는 식으로 변형해도 좋을듯.
# 즉 증강은 이대로 하되 후처리를 통해 이전에 1~5문제 등 적게 푼 사람 걸러내도 좋을듯.(귀찮아서 안한거 아님. 아마도.)
train_origin = train.copy()
train_new = train.copy()
for i in range(args.augmentation):
    tem = train_origin.drop_duplicates(subset = ["userID"],keep = "last")
    train_origin = train_origin.drop(index=tem.index)
    train_origin['userID'] += train_origin['userID'].nunique()
    train_new = pd.concat([train_new, train_origin], axis = 0)
train_new

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,grade
0,0,0,0,1,2020-03-24 00:17:11,0,6
1,0,1,0,1,2020-03-24 00:17:14,1,6
2,0,2,0,1,2020-03-24 00:17:22,1,6
3,0,3,0,1,2020-03-24 00:17:29,1,6
4,0,4,0,1,2020-03-24 00:17:36,1,6
...,...,...,...,...,...,...,...
2259582,318988,8051,1301,1,2020-06-06 02:53:11,689,7
2259583,318988,8055,1301,1,2020-06-06 02:53:14,690,7
2259584,318988,8057,1301,1,2020-06-06 02:53:18,682,7
2259585,318988,8058,1301,1,2020-06-06 02:53:22,691,7


In [4]:
# 베이스라인과 유사. 쓰는 데이터 조금 달라짐.
columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag", 'grade']
train_data = (
            train_new[columns] # train_new : 데이터 증강.
            .groupby("userID")
            .apply(
                lambda r: (
                    r["testId"].values,
                    r["assessmentItemID"].values,
                    r["KnowledgeTag"].values,
                    r["grade"].values,
                    r["answerCode"].values,
                )
            )
        ).values

test_data = (
            test[columns]
            .groupby("userID")
            .apply(
                lambda r: (
                    r["testId"].values,
                    r["assessmentItemID"].values,
                    r["KnowledgeTag"].values,
                    r["grade"].values,
                    r["answerCode"].values,
                )
            )
        ).values

In [5]:
ratio = 0.9 # 데이터 증강 했으니 10%만 써도 test 데이터 개수가 충분한듯.

random.seed(args.seed)
random.shuffle(train_data)

size = int(len(train_data) * ratio)
valid_data = train_data[size:]
train_data = train_data[:size]

In [6]:
class DKTDataset(torch.utils.data.Dataset):
    def __init__(self, data, args):
        self.data = data
        self.args = args

    def __getitem__(self, index):
        row = self.data[index]

        # 각 data의 sequence length
        seq_len = len(row[0])

        test, item, tag, grade, correct = row[0], row[1], row[2], row[3], row[4]

        cate_cols = [test, item, tag, grade, correct]

        # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다
        # max seq len 길이를 잘 조절하는 것도 관건 (실험을 통해 해결해야할 문제)
        # defalut가 20인데 이전 문제 정보를 더 사용하고 싶진 않은가? + 너무 키우면 이전 문제 정보를 너무 많이 사용하나?
        if seq_len > self.args.max_seq_len:
            for i, col in enumerate(cate_cols):
                cate_cols[i] = col[-self.args.max_seq_len :]
            mask = np.ones(self.args.max_seq_len, dtype=np.int16)
        else:
            mask = np.zeros(self.args.max_seq_len, dtype=np.int16)
            mask[-seq_len:] = 1

        # mask도 columns 목록에 포함시킴, mask는 이전 문제 정보가 max seq len(20) 보다 작으면 이를 알려주는 변수.
        cate_cols.append(mask)

        # np.array -> torch.tensor 형변환
        for i, col in enumerate(cate_cols):
            cate_cols[i] = torch.tensor(col)

        return cate_cols

    def __len__(self):
        return len(self.data)

train_dataset = DKTDataset(train_data, args)
valid_dataset = DKTDataset(valid_data, args)
test_dataset = DKTDataset(test_data, args)

In [7]:
def collate(batch):
    col_n = len(batch[0]) # column 요소의 수 (defalut : 6, mask까지)
    col_list = [[] for _ in range(col_n)]
    # 입력된 길이의 수. (미리 정해놓은 만큼 DKTDataset에서 이미 조정함)
    max_seq_len = len(batch[0][-1]) 

    # batch의 값들을 각 column끼리 그룹화
    for row in batch:
        for i, col in enumerate(row):
            pre_padded = torch.zeros(max_seq_len)
            pre_padded[-len(col) :] = col
            col_list[i].append(pre_padded)

    for i, _ in enumerate(col_list):
        # torch.stack : [tensor(20), tensor(20), ..] => tensor(batch_size, 20)
        col_list[i] = torch.stack(col_list[i])

    return tuple(col_list) # column수 * tensor(batch_size, max_seq_len)


train_loader = torch.utils.data.DataLoader(
    train_dataset,
    num_workers=args.num_workers,
    shuffle=True,
    batch_size=args.batch_size,
    pin_memory=False,
    collate_fn=collate,
)

valid_loader = torch.utils.data.DataLoader(
    valid_dataset,
    num_workers=args.num_workers,
    shuffle=False,
    batch_size=args.batch_size,
    pin_memory=False,
    collate_fn=collate,
)

test_loader = torch.utils.data.DataLoader(
    test_dataset,
    num_workers=args.num_workers,
    shuffle=False,
    batch_size=args.batch_size,
    pin_memory=False,
    collate_fn=collate,
)

args.total_steps = int(math.ceil(len(train_loader.dataset) / args.batch_size)) * (
        args.n_epochs
    )
args.warmup_steps = args.total_steps // 10 # 이 변수가 무엇인지 잘 모르겠음.

In [None]:
# # 이 모델은 베이스라인 LSTM 모델
# class MODEL(nn.Module):
#     def __init__(self, args):
#         super(MODEL, self).__init__()
#         self.args = args

#         self.hidden_dim = self.args.hidden_dim
#         self.n_layers = self.args.n_layers

#         # Embedding

#         self.embedding_item = nn.Embedding(self.args.n_item + 1, self.hidden_dim // 3)#self.hidden_dim // 4)
#         self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3)#self.hidden_dim // 4)
#         self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3)#self.hidden_dim // 4)
#         self.embedding_grade = nn.Embedding(self.args.n_grade + 1, 2) # self.hidden_dim // 4)

#         self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3)

#         # embedding combination projection
#         self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4+2, self.hidden_dim)

#         self.lstm = nn.LSTM(
#             self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True
#         )

#         # Fully connected layer
#         self.fc = nn.Linear(self.hidden_dim, 1)

#     def forward(self, input):

#         test, item, tag, grade, interaction, mask, _ = input

#         batch_size = interaction.size(0)

#         # Embedding
#         embed_interaction = self.embedding_interaction(interaction)
#         embed_item = self.embedding_item(item)
#         embed_test = self.embedding_test(test)
#         embed_tag = self.embedding_tag(tag)
#         embed_grade = self.embedding_grade(grade)

#         embed = torch.cat(
#             [
#                 embed_interaction,
#                 embed_item,
#                 embed_test,
#                 embed_tag,
#                 embed_grade,
#             ],
#             2,
#         )

#         X = self.comb_proj(embed)

#         # X : (batch * max_seq_len * hidden_dim)
#         # out : (batch * max_seq_len * hidden_dim)
#         out, _ = self.lstm(X)
#         out = out.contiguous().view(batch_size, -1, self.hidden_dim)
#         out = self.fc(out).view(batch_size, -1)
#         return out[:,-1]

In [81]:
# 이 모델은 내가 실험하고 있는 모델.
# 내(성연)가 슬랙에 올린 사진과 함께 보면 좋을 듯.

try:
    from transformers.modeling_bert import BertConfig, BertEncoder, BertModel
except:
    from transformers.models.bert.modeling_bert import (
        BertConfig,
        BertEncoder,
        BertModel,
    )

# 하이퍼 파라미터 내가 실험하고 싶어서 여기서 바꿔줌.
args.hidden_dim = 32
args.lr = 0.001
args.dropout = 0.2
args.n_layers = 2
args.n_heads = 2

class MODEL(nn.Module):
    def __init__(self, args):
        super(MODEL, self).__init__()
        self.args = args

        self.hidden_dim = self.args.hidden_dim
        self.n_layers = self.args.n_layers
        self.dropout = self.args.dropout

        # Embedding (변수 unique 개수, 임베딩 벡터 크기)
        # correct는 중요한 정보(해당 문제 맞았는지 틀렸는지)이기 때문에 임베딩 결과 크기를 크게 잡음.
        # 임베딩 크기를 item은 종류가 많아 풍부한 표현을 위해 hidden_dim
        # test와 tag는 이보다 떨어지기 때문에 hidden_dim // 2.
        # grade는 몇개 없기 때문에(9개?) 고정된 임베딩 벡터 크기 값 3 사용
        self.embedding_correct = nn.Embedding(3, self.hidden_dim) 
        self.embedding_item = nn.Embedding(self.args.n_item + 1, self.hidden_dim)
        self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 2)
        self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 2)
        self.embedding_grade = nn.Embedding(self.args.n_grade + 1, 3) 

        # embedding combination projection

        # 시험 문제 정보 만들기(임베딩 벡터 concat.)
        # 임베딩 벡터 concat 크기는 self.hidden_dim + (self.hidden_dim // 2) * 2 + 3.
        self.comb_proj = nn.Sequential(
            nn.Linear(self.hidden_dim * 2 + 3, self.hidden_dim) ,  
            nn.ReLU(), 
            nn.Dropout(p=self.dropout), 
        )

        # BERT(트랜스포머) 모델 사용.
        # max_seq_len - '1' 인 이유? 
        # => 현재 푼 문제가 맞았는지 틀렸는지 판단. 모델에서는 현재 푼 문제를 고려하고 싶지 않았음.
        # => 여기서 추출하고 싶었던 것은 유저의 과거 문제 풀이 기록을 통해 어느정도 실력을 가지고 있는지 뽑아내고싶음.
        self.config = BertConfig(
            3,  # not used
            hidden_size=self.hidden_dim, # 입력/출력 히든 레이어 길이.
            num_hidden_layers=self.args.n_layers, # n_layers 수(1) head 당 몇 번 반복?
            num_attention_heads=self.args.n_heads, # head 수(1) 같은 행위 몇 번 반복?
            max_position_embeddings=self.args.max_seq_len - 1, # 최대 문제 길이 개수.
        )

        # Bert Layer
        self.encoder = BertModel(self.config)
        

        # 혹시나 LSTM을 사용하고 싶다면 이거 사용하면 됨.
        # self.lstm = nn.LSTM(
        #     self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True
        # )

        # Fully connected layer
        self.fc = nn.Linear(self.hidden_dim, 1) # self.hidden_dim

    def forward(self, input):
        # test, item, tag, grade, correct, mask, ansewer.
        test, item, tag, grade, correct, mask, _ = input

        batch_size = test.size(0)

        # Embedding
        embed_correct = self.embedding_correct(correct[:,:-1]) # 마지막 값은 현재 문제 정보. 들어가면 안됨.
        embed_item = self.embedding_item(item)
        embed_test = self.embedding_test(test)
        embed_tag = self.embedding_tag(tag)
        embed_grade = self.embedding_grade(grade)


        embed = torch.cat(
            [
                embed_item,
                embed_test,
                embed_tag,
                embed_grade
            ],
            2,
        )

        embed = self.comb_proj(embed)

        # 문제 정보(현재 문제 제외)와 맞췄는지 정보를 결합해 줌.
        # 해당 문제를 맞췄는지, 틀렸는지 정보를 통해 해당 학생의 실력을 embed_before에 전달.
        embed_before = torch.mul(embed[:,:-1,:], embed_correct)

        # mul이 아니고 concat 한 후 linear layer 통과 시킬수도. 아래 두 코드. 
        # embed_cat = torch.cat([embed[:,:-1,:], embed_correct], 2)
        # embed_before = self.label_proj(embed_crt) # B X (S-1) X H

        # (max_seq_len -1) 만큼 학생 실력 정보를 트랜스포머 인코딩 부분을 통해 상호작용 구하기.
        # 이건 맞췄고, 저 실력은 부족하고.. 를 통해 학생의 최종 능력치(out) 구하기.
        encoded_layers = self.encoder(inputs_embeds=embed_before, attention_mask=mask[:,:-1])
        out = encoded_layers[0] # (batch * max_seq_len * hidden_dim)

        # BERT 대신 lstm 사용시 코드.
        # out, _ = self.lstm(embed_before)

        # 학생의 최종 능력치와 현재 문제에 대한 정보 벡터 곱하기.
        out = torch.mul(out[:,-1,:], embed[:,-1,:])
        # 최종 분류(맞췄냐? 틀렸냐?)을 위한 선형 레이어.
        out = self.fc(out).view(batch_size, -1) # (batch_size, hidden_dim)
        return out

In [82]:
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

model = MODEL(args)
model.to(args.device)
optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.01)
scheduler = ReduceLROnPlateau(
            optimizer, patience=10, factor=0.5, mode="max", verbose=True
)
loss_f = nn.BCEWithLogitsLoss(reduction="none")

In [83]:
def process_batch(batch, swichs = True): # True(내가만든모델), False(베이스라인 LSTM) 
    """
    Args:
        batch : tuple(5, test/question/tag/correct/mask)
        tuple(test 등) : (batch_size : 64, max_seq_len : 20)
    Returns:
        tuple(test, question, tag, correct, mask, interaction)
        tuple(test 등) : (batch_size : 64, max_seq_len : 20), masking을 잘 해줌.
    """
    test, item, tag, grade, correct, mask = batch
    #test, question, tag, correct, mask = batch

    label = correct[:, -1] # [batch_size(64)]

    # change to float
    mask = mask.float()
    correct = correct.float()
    
    #  test_id, question_id, tag
    test = ((test + 1) * mask).int()
    item = ((item + 1) * mask).int()
    tag = ((tag + 1) * mask).int()
    grade = ((grade) * mask).int() # grade는 0이 없음. 1부터 시작.


    if swichs == False:
        # interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용
        interaction = correct + 1  # 패딩을 위해 correct값에 1을 더해준다.
        interaction = interaction.roll(shifts=1, dims=1)
        interaction_mask = mask.roll(shifts=1, dims=1)
        interaction_mask[:, 0] = 0
        interaction = (interaction * interaction_mask).to(torch.int64)

        return (test, item, tag, grade, interaction, mask, label)

    else:
        # 베이스라인 대비 추가. correct 0 : 기록x, 1 : 못푼문제, 2 : 푼문제, 3 : 풀었는지 맞출문제.
        correct = ((correct + 1) * mask).int()
        correct[:, -1] = 3

        return (test, item, tag, grade, correct, mask, label)

def get_metric(targets, preds):
    auc = roc_auc_score(targets, preds)
    acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0))

    return auc, acc

In [84]:
def _train(train_loader, model, optimizer, scheduler, args):
    model.train()

    total_preds = []
    total_targets = []
    losses = []
    for step, batch in enumerate(train_loader):
        # process_batch return 값 : tuple. (요소 별 튜플)
        # input : list(columns(6) * batch_size * max_seq_len)
        input = list(map(lambda t: t.to(args.device), process_batch(batch)))
        #print(input[-3][:,-1])
        #break

        preds = (model(input))[:,-1]
        targets = input[-1] 

        #preds = (model(input))[:,-1].squeeze()
        #targets = input[-1].squeeze() 

        loss = loss_f(preds, targets)
        loss = torch.mean(loss)
        loss.backward()
        # clip_grad_norm_ : 그래디언트(기울기) 소실/폭파 문제 개선을 위해 사용했데.
        # 역전파 기울기 최댓 임계값 설정해서 이 값 안 넘게 하는 듯.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
        optimizer.step()
        optimizer.zero_grad()

        if step % args.log_steps == 0:
            print(f"Training steps: {step} Loss: {str(loss.item())}")

        # predictions
        #preds = preds[:, -1]
        #targets = targets[:, -1]

        total_preds.append(preds.detach())
        total_targets.append(targets.detach())
        
        losses.append(loss)

    total_preds = torch.concat(total_preds).cpu().numpy()
    # 밑 1줄 베이스라인 대비 추가 / pred 값에 시그모이드 함수 적용. 
    total_preds = 1 / (1 + np.exp(-total_preds))
    total_targets = torch.concat(total_targets).cpu().numpy()

    # Train AUC / ACC
    auc, acc = get_metric(total_targets, total_preds)
    loss_avg = sum(losses) / len(losses)
    print(f"TRAIN AUC : {auc} ACC : {acc}")
    #breakpoint()
    return auc, acc, loss_avg


def validate(valid_loader, model, args):
    model.eval()

    total_preds = []
    total_targets = []
    for step, batch in enumerate(valid_loader):
        input = list(map(lambda t: t.to(args.device), process_batch(batch)))

        preds = (model(input))[:,-1]#.squeeze()
        targets = input[-1]

        #preds = (model(input))[:,-1].squeeze()
        #targets = input[-1].squeeze() # correct, [:,-1].unsqueeze(1)


        # predictions
        #preds = preds[:, -1]
        #targets = targets[:, -1]

        total_preds.append(preds.detach())
        total_targets.append(targets.detach())

    total_preds = torch.concat(total_preds).cpu().numpy()
    # 밑 1줄 베이스라인 대비 추가 / pred 값에 시그모이드 함수 적용. 
    total_preds = 1 / (1 + np.exp(-total_preds))
    total_targets = torch.concat(total_targets).cpu().numpy()

    # Train AUC / ACC
    auc, acc = get_metric(total_targets, total_preds)

    print(f"VALID AUC : {auc} ACC : {acc}\n")

    return auc, acc

아래 코드는 모델이 학습이 잘 되는지, 안되고 있는건 아닌지 확인을 위한 코드임.

In [85]:
for epoch in range(args.n_epochs): # 2

    print(f"Start Training: Epoch {epoch + 1}")

    ### TRAIN
    train_auc, train_acc, train_loss = _train(
        train_loader, model, optimizer, scheduler, args
    )
    auc, acc = validate(valid_loader, model, args)

    scheduler.step(auc)

Start Training: Epoch 1
Training steps: 0 Loss: 0.7157275676727295
Training steps: 200 Loss: 0.6145651340484619
Training steps: 400 Loss: 0.5741449594497681
Training steps: 600 Loss: 0.5806995630264282
Training steps: 800 Loss: 0.551110029220581
Training steps: 1000 Loss: 0.5278575420379639
Training steps: 1200 Loss: 0.6112567186355591
Training steps: 1400 Loss: 0.5412204265594482
Training steps: 1600 Loss: 0.5908793210983276
Training steps: 1800 Loss: 0.56856369972229
Training steps: 2000 Loss: 0.6467379331588745
TRAIN AUC : 0.7153236816968384 ACC : 0.6996266933497537
VALID AUC : 0.7447102834005279 ACC : 0.7155029093931837

Start Training: Epoch 2
Training steps: 0 Loss: 0.5916007161140442
Training steps: 200 Loss: 0.6208318471908569
Training steps: 400 Loss: 0.6226291656494141
Training steps: 600 Loss: 0.5577549934387207
Training steps: 800 Loss: 0.5783360004425049
Training steps: 1000 Loss: 0.578928530216217
Training steps: 1200 Loss: 0.5688868165016174
Training steps: 1400 Loss: 0.

In [60]:
for step, batch in enumerate(train_loader):
    # process_batch return 값 : tuple. (요소 별 튜플)
    # input : list(columns(6) * batch_size * max_seq_len)
    input = list(map(lambda t: t.to(args.device), process_batch(batch)))
    #print(input[-3][:,-1])
    #break

    preds = (model(input))[:,-1]#.squeeze()
    break
preds

tensor([ 0.6818,  1.3192, -0.8071,  1.3209,  0.9297,  0.6129,  1.1596, -0.6660,
         0.9773, -0.7839,  0.9905,  1.3501,  0.1142,  0.6664,  0.2785,  1.1624,
         1.2937,  1.2306,  0.7446, -0.6876, -0.8383, -0.8069,  1.0830,  1.3812,
         0.4079,  0.7446,  0.7422,  1.2916, -0.6315, -0.3606, -0.0268,  1.2981,
         1.3370, -0.4256,  0.0062,  0.3117,  1.3066,  1.1858,  1.3053,  1.1688,
        -0.3958,  0.7714, -0.8350,  1.4001, -0.0113,  1.2799,  0.4233,  0.1882,
        -0.3084, -0.3155,  0.5510,  0.8142,  1.1236,  1.2021,  1.3248, -0.3239,
         0.8269, -0.6694,  1.3602,  1.3177,  1.1615,  0.8308,  1.0613,  0.8455,
         0.1342, -0.5653,  1.1031,  0.6791,  1.3886, -0.6648,  1.3089, -0.1950,
        -0.1703,  1.3375,  0.9260,  1.1990,  1.3306,  1.3852,  1.3062, -0.8182,
        -0.7366,  0.3571,  1.0242,  0.8608, -0.6943,  1.3802,  1.0823, -0.7591,
        -0.6912,  0.2458, -0.4940, -0.6589,  1.2750,  0.2347,  1.3398,  1.3513,
         1.2572,  0.8112,  0.7041,  1.25

In [62]:
model.encoder()

MODEL(
  (embedding_correct): Embedding(3, 16)
  (embedding_item): Embedding(9455, 16)
  (embedding_test): Embedding(1538, 8)
  (embedding_tag): Embedding(913, 8)
  (embedding_grade): Embedding(10, 3)
  (comb_proj): Sequential(
    (0): Linear(in_features=35, out_features=16, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(3, 16, padding_idx=0)
      (position_embeddings): Embedding(19, 16)
      (token_type_embeddings): Embedding(2, 16)
      (LayerNorm): LayerNorm((16,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=16, out_features=16, bias=True)
              (key): Linear(in_features=16, out_features=16, bias=True)
              (v

In [87]:
t = model.encoder(inputs_embeds=torch.rand([2, 19, 32]).to(args.device), attention_mask=torch.ones([2,19]).to(args.device))
t[0].size()

torch.Size([2, 19, 32])

In [16]:
model.encoder

MODEL(
  (embedding_correct): Embedding(3, 16)
  (embedding_item): Embedding(9455, 16)
  (embedding_test): Embedding(1538, 8)
  (embedding_tag): Embedding(913, 8)
  (embedding_grade): Embedding(10, 3)
  (comb_proj): Sequential(
    (0): Linear(in_features=35, out_features=16, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (lstm): LSTM(16, 16, num_layers=3, batch_first=True)
  (fc): Linear(in_features=16, out_features=1, bias=True)
)

여기까지가 모델 테스트 공간.

In [131]:
# 최종 예측 코드.
model.eval()

total_preds = []

for step, batch in enumerate(test_loader):
    input = list(map(lambda t: t.to(args.device), process_batch(batch)))
    #print(input[-3][:, -1])
    #break
    preds = (model(input))[:,-1]#.squeeze()

    # predictions
    preds = preds.cpu().detach().numpy()
    preds = 1 / (1 + np.exp(-preds))
    total_preds += list(preds)

write_path = os.path.join('/opt/ml/input/code/dkt/output', "ksy_lstm_submission.csv")
# if not os.path.exists(args.output_dir):
#     os.makedirs(args.output_dir)
with open(write_path, "w", encoding="utf8") as w:
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write("{},{}\n".format(id, p))