In [1]:
import torch

# GPU 사용 가능 -> True, GPU 사용 불가 -> False
print(torch.cuda.is_available())

True


In [2]:
import os
import sys
import gc
import re

import random
import easydict
import tarfile

from tqdm import notebook
from collections import OrderedDict

import time
import datetime
from datetime import datetime

import pandas as pd
import numpy as np

import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam, AdamW

from torch.optim.lr_scheduler import ReduceLROnPlateau

from transformers import get_linear_schedule_with_warmup
from transformers import get_cosine_schedule_with_warmup

import scipy.stats

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [3]:
def elapsed(df) :
    diff_train = df.loc[:, ['userID','Timestamp']].groupby('userID').diff().shift(-1)
    diff_train = diff_train['Timestamp'].apply(lambda x : x.total_seconds())
    df['elapsed'] = diff_train
    
    df.groupby('userID').apply(lambda x :x.iloc[:-1])

    # 한 시간이 지나면 outlier로 처리
    outlier = 1*3600
    non_outlier = df[df['elapsed'] <= outlier]
    # outlier에 해당하지 않는 row로 재구성 한 후 각 태그의 평균처리
    mean_elapsed = non_outlier.groupby('KnowledgeTag')['elapsed'].mean()
    df.loc[df['elapsed'] > outlier, 'elapsed'] = df[df['elapsed'] > outlier].apply(lambda x: mean_elapsed.get(x['KnowledgeTag'], x['elapsed']), axis=1)
    df['elapsed'] = df['elapsed'].fillna(0)
    return df

def cumsum(df) :
    # 누적합
    _cumsum = df.loc[:, ['userID', 'answerCode']].groupby('userID').agg({'answerCode': 'cumsum'})
    # 누적갯수
    _cumcount = df.loc[:, ['userID', 'answerCode']].groupby('userID').agg({'answerCode': 'cumcount'}) + 1

    cum_ans = _cumsum / _cumcount
    df['cumulative'] = cum_ans['answerCode']

    df['paper_number'] = df['assessmentItemID'].apply(lambda x: x[7:]) # assessmentItemID의 뒤에 3자리를 의미 -> 각 시험지 별로 문제번호
    # item 열을 int16으로 변경
    df["paper_number"] = df["paper_number"].astype("int16")
    
    return df

def avg_percent(x) :
    return np.sum(x) / len(x)

def type_percent(df) :
    # 위에서 처리한 type을 변환하여 각각의 정답률 처리

    df['KnowledgeTag_percent'] = df.groupby('KnowledgeTag')['answerCode'].transform(avg_percent)

    return df

In [4]:
class Preprocess:
    def __init__(self,args):
        self.args = args
        self.train_data = None

    def get_train_data(self):
        return self.train_data

    def split_data(self, data, ratio=0.7, shuffle=True, seed=0):
        """
        split data into two parts with a given ratio.
        """
        if shuffle:
            random.seed(seed) # fix to default seed 0
            random.shuffle(data)

        size = int(len(data) * ratio)
        data_1 = data[:size]
        data_2 = data[size:]

        return data_1, data_2

    def __save_labels(self, encoder, name):
        le_path = os.path.join(self.args.data_dir, name + '_classes.npy')
        np.save(le_path, encoder.classes_)

    def __preprocessing(self, df):
        #con_col에 대한 전처리
        ###TODO: con_col에 대한 전처리 코드 추가
        con_cols= ["elapsed", "KnowledgeTag_percent", "cumulative", "paper_number"]
        df = elapsed(df)
        df = cumsum(df)
        df = type_percent(df)
        #################################CUSTUM#############################################
        
        #cate_col에 대한 전처리
        cate_cols = ['assessmentItemID', 'testId', 'KnowledgeTag']
        for col in cate_cols:

            #For UNKNOWN class
            #마지막을 nan값으로 준 이유는 마스킹 때문이라고 생각
            a = df[col].unique().tolist() + [np.nan]

            le = LabelEncoder()
            le.fit(a)
            df[col] = le.transform(df[col])
            self.__save_labels(le, col)

        # def convert_time(s):
        #     timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple())
        #     return int(timestamp)

        # df['Timestamp'] = df['Timestamp'].apply(convert_time)
        
        return df

    def load_data_from_file(self, file_name):
       
        #################custun#########################
        dtype = {
            'userID': 'int16',
            'answerCode': 'int8',
            'KnowledgeTag': 'int16'
        }
        ######################################################
        csv_file_path = os.path.join(self.args.data_dir, file_name)
        df = pd.read_csv(csv_file_path,dtype=dtype, parse_dates=['Timestamp'])
        df = self.__preprocessing(df)
        print("elapsed nan값의 개수:",df["elapsed"].isna().sum())
        print("KnowledgeTag_percent nan값의 개수:",df["KnowledgeTag_percent"].isna().sum())
        print("cumulative nan값의 개수:",df["cumulative"].isna().sum())
        # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용
        self.args.n_questions = df['assessmentItemID'].nunique()
        self.args.n_test = df['testId'].nunique()
        self.args.n_tag = df['KnowledgeTag'].nunique() 
        
        df = df.sort_values(by=['userID','Timestamp'], axis=0)
        #기존 columns
        #columns = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag']
        # group = df[columns].groupby('userID').apply(
        #     lambda r: (
        #         r['testId'].values,
        #         r['assessmentItemID'].values,
        #         r['KnowledgeTag'].values,
        #         r['answerCode'].values
        #     )
        # )

        #columns 추가
        columns = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag',"elapsed", "KnowledgeTag_percent", "cumulative", "paper_number"]

        group = df[columns].groupby('userID').apply(
                lambda r: (
                    r['testId'].values,
                    r['assessmentItemID'].values,
                    r['KnowledgeTag'].values,
                    r['answerCode'].values,
                    r['elapsed'].values,
                    r['KnowledgeTag_percent'].values,
                    r['cumulative'].values,
                    r['paper_number'].values,
                )
            )
        return group.values

    def load_train_data(self, file_name):
        self.train_data = self.load_data_from_file(file_name)


In [5]:
class DKTDataset(torch.utils.data.Dataset):
    def __init__(self, data, args):
        self.data = data
        self.args = args

    def __getitem__(self, index):
        row = self.data[index]

        # 각 data의 sequence length
        seq_len = len(row[0])
        
        #original
        #test, question, tag, correct = row[0], row[1], row[2], row[3]
        #custum
        test, question, tag, correct, elapsed, KnowledgeTag_percent, cumulative, paper_number = row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]

        cate_cols = [test, question, tag, correct]
        
        #custum
        cont_cols = [elapsed, KnowledgeTag_percent, cumulative, paper_number]
        if seq_len > self.args.max_seq_len:
            # cate_col
            for i, col in enumerate(cate_cols):
                cate_cols[i] = col[-self.args.max_seq_len:]
            mask = np.ones(self.args.max_seq_len, dtype=np.int16)
            # cont_col
            for i, col in enumerate(cont_cols):
                cont_cols[i] = col[-self.args.max_seq_len:]
        else:
            mask = np.zeros(self.args.max_seq_len, dtype=np.int16)
            mask[:seq_len] = 1
        
        
        # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다
        # if seq_len > self.args.max_seq_len:
        #     for i, col in enumerate(cate_cols):
        #         cate_cols[i] = col[-self.args.max_seq_len:]
        #     mask = np.ones(self.args.max_seq_len, dtype=np.int16)
        # else:
        #     mask = np.zeros(self.args.max_seq_len, dtype=np.int16)
        #     mask[:seq_len] = 1

        # mask도 columns 목록에 포함시킴
        cate_cols.append(mask)

        #custum
        cate_cont_cols = []
        cate_cont_cols.extend(cate_cols)
        cate_cont_cols.extend(cont_cols)
        ############################################
        #original
        # np.array -> torch.tensor 형변환
        # for i, col in enumerate(cate_cols):
        #     cate_cols[i] = torch.tensor(col)
        # return cate_cols
        #custum
        for i, col in enumerate(cate_cont_cols):
            cate_cont_cols[i] = torch.tensor(col)
        return cate_cont_cols

    def __len__(self):
        return len(self.data)


In [6]:
from torch.nn.utils.rnn import pad_sequence

#padding을 위한 함수
def collate(batch):
    col_n = len(batch[0])
    col_list = [[] for _ in range(col_n)]
    # print("column 개수",col_n)
    # batch의 값들을 각 column끼리 그룹화
    for row in batch:
        for i, col in enumerate(row):
            col_list[i].append(col)

    # 각 column의 값들을 대상으로 padding 진행
    # pad_sequence([[1, 2, 3], [3, 4]]) -> [[1, 2, 3],
    #                                       [3, 4, 0]]
    for i, col_batch in enumerate(col_list):
        col_list[i] = pad_sequence(col_batch, batch_first=True)

    # mask의 경우 max_seq_len을 기준으로 길이가 설정되어있다.
    # 만약 다른 column들의 seq_len이 max_seq_len보다 작다면
    # 이 길이에 맞추어 mask의 길이도 조절해준다
    col_seq_len = col_list[0].size(1)
    mask_seq_len = col_list[-1].size(1)
    if col_seq_len < mask_seq_len:
        col_list[-1] = col_list[-1][:, :col_seq_len]

    return tuple(col_list)


def get_loaders(args, train, valid):

    pin_memory = False

    trainset = DKTDataset(train, args)
    valset = DKTDataset(valid, args)

    train_loader = torch.utils.data.DataLoader(trainset, shuffle=True,
                                               batch_size=args.batch_size,
                                               pin_memory=pin_memory,
                                               collate_fn=collate)

    valid_loader = torch.utils.data.DataLoader(valset, shuffle=False,
                                               batch_size=args.batch_size,
                                               pin_memory=pin_memory,
                                               collate_fn=collate)

    return train_loader, valid_loader

In [7]:
def slidding_window(data, args):
    window_size = args.max_seq_len
    stride = args.stride

    augmented_datas = []
    for row in data:
        seq_len = len(row[0])

        # 만약 window 크기보다 seq len이 같거나 작으면 augmentation을 하지 않는다
        if seq_len <= window_size:
            augmented_datas.append(row)
        else:
            total_window = ((seq_len - window_size) // stride) + 1

            # 앞에서부터 slidding window 적용
            for window_i in range(total_window):
                # window로 잘린 데이터를 모으는 리스트
                window_data = []
                for col in row:
                    window_data.append(col[window_i*stride:window_i*stride + window_size])

                # Shuffle
                # 마지막 데이터의 경우 shuffle을 하지 않는다
                if args.shuffle and window_i + 1 != total_window:
                    shuffle_datas = shuffle(window_data, window_size, args)
                    augmented_datas += shuffle_datas
                else:
                    augmented_datas.append(tuple(window_data))

            # slidding window에서 뒷부분이 누락될 경우 추가
            total_len = window_size + (stride * (total_window - 1))
            if seq_len != total_len:
                window_data = []
                for col in row:
                    window_data.append(col[-window_size:])
                augmented_datas.append(tuple(window_data))


    return augmented_datas

def shuffle(data, data_size, args):
    shuffle_datas = []
    for i in range(args.shuffle_n):
        # shuffle 횟수만큼 window를 랜덤하게 계속 섞어서 데이터로 추가
        shuffle_data = []
        random_index = np.random.permutation(data_size)
        for col in data:
            shuffle_data.append(col[random_index])
        shuffle_datas.append(tuple(shuffle_data))
    return shuffle_datas


In [8]:
def data_augmentation(data, args):
    if args.window == True:
        data = slidding_window(data, args)

    return data

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import copy
import math
from sklearn.preprocessing import StandardScaler

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.scale = nn.Parameter(torch.ones(1))
        
        # input embedding
        pe = torch.zeros(max_len, d_model) ## max_len X hidden_dim
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) #0부터 sequence 길이만큼 position 값 생성, 1 X max_len
        div_term = torch.exp(torch.arange(
            0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.scale * self.pe[:x.size(0), :]
        return self.dropout(x)

class Saint(nn.Module):

    def __init__(self, args):
        super(Saint, self).__init__()
        torch.autograd.set_detect_anomaly(True)
        self.args = args
        self.device = args.device

        self.hidden_dim = self.args.hidden_dim
        # self.dropout = self.args.dropout
        self.dropout = 0.

        ### Embedding
        # ENCODER embedding
        self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim//3)
        self.embedding_question = nn.Embedding(self.args.n_questions + 1, self.hidden_dim//3)
        self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3)

        # encoder combination projection
        # original
        # self.enc_comb_proj = nn.Linear((self.hidden_dim//3)*3, self.hidden_dim)
        # custum
        self.enc_cate_comb_proj = nn.Linear((self.hidden_dim//3)*3, self.hidden_dim//2)
        self.enc_cont_comb_proj = nn.Linear(2, self.hidden_dim//2) ## 임시로 현재 3개로 지정, 추후 코드 변경 필요, cont column의 개수임
        # batchnorm 추가
        # self.cont_bn = nn.BatchNorm1d(3)
        # # 재수정
        # self.enc_cate_comb_proj = nn.Sequential(nn.ReLU(),
        #                                         nn.Linear((self.hidden_dim//3)*3, self.hidden_dim//2),
        #                                         nn.LayerNorm(self.hidden_dim//2))
        # self.enc_cont_comb_proj = nn.Sequential(nn.ReLU(),
        #                                         nn.Linear(3, self.hidden_dim//2),
        #                                         nn.LayerNorm(self.hidden_dim//2))
        ###########################################################################
        
        # DECODER embedding
        # interaction은 현재 correct으로 구성되어있다. correct(1, 2) + padding(0)
        self.embedding_interaction = nn.Embedding(3, self.hidden_dim//3)

        # decoder combination projection
        # original
        # self.dec_comb_proj = nn.Linear((self.hidden_dim//3)*4, self.hidden_dim)
        # custum
        self.dec_cate_comb_proj = nn.Linear((self.hidden_dim//3)*4, self.hidden_dim//2)
        self.dec_cont_comb_proj = nn.Linear(2, self.hidden_dim//2)## 임시로 현재 3개로 지정, 추후 코드 변경 필요, cont column의 개수임
        # 재수정
        # self.dec_cate_comb_proj = nn.Sequential(nn.ReLU(),
        #                                         nn.Linear((self.hidden_dim//3)*4, self.hidden_dim//2),
        #                                         nn.LayerNorm(self.hidden_dim//2))
        # self.dec_cont_comb_proj = nn.Sequential(nn.ReLU(),
        #                                         nn.Linear(3, self.hidden_dim//2),
        #                                         nn.LayerNorm(self.hidden_dim//2))
        ###########################################################################
        # Positional encoding
        self.pos_encoder = PositionalEncoding(self.hidden_dim, self.dropout, self.args.max_seq_len)
        self.pos_decoder = PositionalEncoding(self.hidden_dim, self.dropout, self.args.max_seq_len)
        # cate data에만 positional encoding 적용하는 코드로 임시 수정
        # self.pos_encoder = PositionalEncoding(self.hidden_dim//2, self.dropout, self.args.max_seq_len)
        # self.pos_decoder = PositionalEncoding(self.hidden_dim//2, self.dropout, self.args.max_seq_len)
        
        self.transformer = nn.Transformer(
            d_model=self.hidden_dim,
            nhead=self.args.n_heads,
            num_encoder_layers=self.args.n_layers,
            num_decoder_layers=self.args.n_layers,
            dim_feedforward=self.hidden_dim,
            dropout=self.dropout,
            activation='relu')

        self.fc = nn.Linear(self.hidden_dim, 1)
        self.activation = nn.Sigmoid()

        self.enc_mask = None
        self.dec_mask = None
        self.enc_dec_mask = None

    def get_mask(self, seq_len):
        mask = torch.from_numpy(np.triu(np.ones((seq_len, seq_len)), k=1))

        return mask.masked_fill(mask==1, float('-inf'))

    def forward(self, input):
        #original
        #test, question, tag, _, mask, interaction, _ = input
        #custum
        test, question, tag, _, mask, elapsed, KnowledgeTag_percent, cumulative, paper_number, interaction, _ = input


        batch_size = interaction.size(0)
        seq_len = interaction.size(1)

        # 신나는 embedding
        # ENCODER
        embed_test = self.embedding_test(test)
        embed_question = self.embedding_question(question)
        embed_tag = self.embedding_tag(tag)
        # print("embed_tag size:", embed_tag.size())
        # original
        # embed_enc = torch.cat([embed_test,
        #                        embed_question,
        #                        embed_tag,], 2)
        # embed_enc = self.enc_comb_proj(embed_enc)
        
        #custum
        
        # print("embed_test size:", embed_test.size())
        # print("embed_question size:", embed_question.size())
        # print("embed_tag size:", embed_tag.size())
        
        cate_embed_enc = torch.cat([embed_test,
                               embed_question,
                               embed_tag,], 2)
    
        cate_embed_enc = self.enc_cate_comb_proj(cate_embed_enc)
        # Positional encoding
        # cate_embed_enc = self.pos_encoder(cate_embed_enc)
        
        cont_embed_enc = torch.cat([#elapsed,
                               KnowledgeTag_percent,
                               cumulative,], 1)
        
        cont_embed_enc = cont_embed_enc.view(batch_size, seq_len, -1) # (batch_size , seq_len, cont_col.size())
        # cont_embed_enc = self.cont_bn(cont_embed_enc.view(-1, cont_embed_enc.size(-1))) # batchnorm 1d
        # cont_embed_enc = cont_embed_enc.view(batch_size, -1, cont_embed_enc.size(-1)) # 다시 원래대로(batch_size , seq_len, cont_col.size())
        # print(cont_embed_enc.size())
        cont_embed_enc = self.enc_cont_comb_proj(cont_embed_enc)

        seq_emb_enc = torch.cat([cate_embed_enc,cont_embed_enc],2)
        # print("cont_embed_enc",cont_embed_enc)
#########################################################################################
        # DECODER
        embed_test = self.embedding_test(test)
        embed_question = self.embedding_question(question)
        embed_tag = self.embedding_tag(tag)

        embed_interaction = self.embedding_interaction(interaction)
       
        # cate data
        cate_embed_dec = torch.cat([embed_test,
                               embed_question,
                               embed_tag,
                               embed_interaction], 2)

        cate_embed_dec = self.dec_cate_comb_proj(cate_embed_dec)
        # Positional encoding
        # cate_embed_dec = self.pos_decoder(cate_embed_dec)

        # cont data
        cont_embed_dec = torch.cat([#elapsed,
                               KnowledgeTag_percent,
                               cumulative,], 1)
        
        cont_embed_dec = cont_embed_dec.view(batch_size, seq_len, -1)
        # cont_embed_dec = self.cont_bn(cont_embed_dec.view(-1, cont_embed_dec.size(-1))) # batchnorm 1d
        # cont_embed_dec = cont_embed_dec.view(batch_size, -1, cont_embed_dec.size(-1)) # 다시 원래대로(batch_size , seq_len, cont_col.size())
        # print(cont_embed_dec.size())
        cont_embed_dec = self.dec_cont_comb_proj(cont_embed_dec)
        # print("cont_embed_dec:",cont_embed_dec)
        seq_emb_dec = torch.cat([cate_embed_dec, cont_embed_dec],2)

        # ATTENTION MASK 생성
        # encoder하고 decoder의 mask는 가로 세로 길이가 모두 동일하여
        # 사실 이렇게 3개로 나눌 필요가 없다
        if self.enc_mask is None or self.enc_mask.size(0) != seq_len:
            self.enc_mask = self.get_mask(seq_len).to(self.device).to(torch.float32)

        if self.dec_mask is None or self.dec_mask.size(0) != seq_len:
            self.dec_mask = self.get_mask(seq_len).to(self.device).to(torch.float32)

        if self.enc_dec_mask is None or self.enc_dec_mask.size(0) != seq_len:
            self.enc_dec_mask = self.get_mask(seq_len).to(self.device).to(torch.float32)

        #original
        # embed_enc = embed_enc.permute(1, 0, 2)
        # embed_dec = embed_dec.permute(1, 0, 2)

        # Positional encoding
        # embed_enc = self.pos_encoder(embed_enc)
        # embed_dec = self.pos_decoder(embed_dec)

        # out = self.transformer(embed_enc, embed_dec,
        #                        src_mask=self.enc_mask,
        #                        tgt_mask=self.dec_mask,
        #                        memory_mask=self.enc_dec_mask)
        #custum
        seq_emb_enc = seq_emb_enc.permute(1, 0, 2)
        seq_emb_dec = seq_emb_dec.permute(1, 0, 2)

        # Positional encoding custum
        seq_emb_enc = self.pos_encoder(seq_emb_enc)
        seq_emb_dec = self.pos_decoder(seq_emb_dec)

        # print("seq_emb_enc:",seq_emb_enc)
        # print("seq_emb_dec:",seq_emb_dec)
        # print("self.enc_mask:",self.enc_mask)
        # print("seq_emb_enc shape:",seq_emb_enc.shape)
        # print("seq_emb_dec shape:",seq_emb_dec.shape)
        # print("self.enc_dec_mask:",self.enc_dec_mask)
        # std = StandardScaler()
        # std.fit(seq_emb_enc)
        # seq_emb_enc = std.transform(seq_emb_enc).to(torch.float32).to(self.args.device)

        # std = StandardScaler()
        # std.fit(seq_emb_dec)
        # seq_emb_dec = std.transform(seq_emb_dec).to(torch.float32).to(self.args.device)

        # nan 값 체크
        # nan_mask = torch.isnan(seq_emb_enc)
        # nan_count = torch.sum(nan_mask).item()
        # print("seq_emb_enc nan?",nan_count)
        # print("seq_emb_enc shape",seq_emb_enc.shape)
        # print(seq_emb_enc)
        # nan_mask = torch.isnan(seq_emb_dec)
        # nan_count = torch.sum(nan_mask).item()
        # print("dec_emb_enc nan?",nan_count)

        out = self.transformer(seq_emb_enc, seq_emb_dec,
                               src_mask=self.enc_mask,
                               tgt_mask=self.dec_mask,
                               memory_mask=self.enc_dec_mask)
        ###################################################################
        # print("transformer output:",out)
        out = out.permute(1, 0, 2)
        out = out.contiguous().view(batch_size, -1, self.hidden_dim)
        out = self.fc(out)

        preds = self.activation(out).view(batch_size, -1)
        
        return preds

In [10]:
def get_optimizer(model, args):
    if args.optimizer == 'adam':
        optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.0)
    if args.optimizer == 'adamW':
        optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.0)

    # 모든 parameter들의 grad값을 0으로 초기화
    optimizer.zero_grad()

    return optimizer


In [11]:
def get_scheduler(optimizer, args):
    if args.scheduler == 'plateau':
        scheduler = ReduceLROnPlateau(optimizer, patience=10, factor=0.5, mode='max', verbose=True)
    elif args.scheduler == 'linear_warmup':
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=args.warmup_steps,
                                                    num_training_steps=args.total_steps)
    return scheduler

In [12]:
def get_criterion(pred, target):
    loss = nn.BCELoss(reduction="none")
    return loss(pred, target)


In [13]:
def get_metric(targets, preds):
    auc = roc_auc_score(targets, preds)
    acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0))

    return auc, acc


In [14]:
def get_model(args):
    """
    Load model and move tensors to a given devices.
    """
    if args.model == 'lstm': model = LSTM(args)
    if args.model == 'bert': model = Bert(args)
    if args.model == 'last_query': model = LastQuery(args)
    if args.model == 'saint': model = Saint(args)
    if args.model == 'tfixup': model = FixupEncoder(args)

    model.to(args.device)

    return model

In [15]:
# 배치 전처리
def process_batch(batch, args):

    test, question, tag, correct, mask, elapsed, KnowledgeTag_percent, cumulative, paper_number = batch
    # print("batch_test_shape : ", test.shape)
    # print("batch_question_shape : ", question.shape)
    # print("batch_tag_shape : ", tag.shape)
    # print("batch_correct_shape : ", correct.shape)
    # print("batch_mask_shape : ", mask.shape)
 
    # change to float
    mask = mask.type(torch.FloatTensor)
    correct = correct.type(torch.FloatTensor)

    #  interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용
    #  saint의 경우 decoder에 들어가는 input이다
    interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다.
    interaction = interaction.roll(shifts=1, dims=1) #dim 1에 해당하는 값을 1씩 이동
    interaction[:, 0] = 0 # set padding index to the first sequence
    interaction = (interaction * mask).to(torch.int64)


    #  test_id, question_id, tag
    test = ((test + 1) * mask).to(torch.int64)
    question = ((question + 1) * mask).to(torch.int64)
    tag = ((tag + 1) * mask).to(torch.int64)

    # gather index
    # 마지막 sequence만 사용하기 위한 index
    gather_index = torch.tensor(np.count_nonzero(mask, axis=1))
    gather_index = gather_index.view(-1, 1) - 1


    # device memory로 이동
    test = test.to(args.device)
    question = question.to(args.device)


    tag = tag.to(args.device)
    correct = correct.to(args.device)
    mask = mask.to(args.device)

    interaction = interaction.to(args.device)
    gather_index = gather_index.to(args.device)

    ############custum##############
    #cont 추가
    elapsed = elapsed.to(torch.float32)
    KnowledgeTag_percent = KnowledgeTag_percent.to(torch.float32)
    cumulative = cumulative.to(torch.float32)
    paper_number = paper_number.to(torch.float32)

    elapsed = elapsed.to(args.device)
    KnowledgeTag_percent = KnowledgeTag_percent.to(args.device)
    cumulative = cumulative.to(args.device)
    paper_number = paper_number.to(args.device)

    #original
    # return (test, question,
    #         tag, correct, mask,
    #         interaction, gather_index)

    #custum
    return (test, question,
            tag, correct, mask, 
            elapsed, KnowledgeTag_percent, cumulative, paper_number,
            interaction, gather_index)


In [16]:
# loss계산하고 parameter update!
def compute_loss(preds, targets, index):
    """
    Args :
        preds   : (batch_size, max_seq_len)
        targets : (batch_size, max_seq_len)
        index    : (batch_size, max_seq_len)

        만약 전체 sequence 길이가 max_seq_len보다 작다면 해당 길이로 진행
    """
    loss = get_criterion(preds, targets)
    loss = torch.gather(loss, 1, index)
    loss = torch.mean(loss)

    return loss


In [17]:
def get_gradient(model):
    gradient = []

    for name, param in model.named_parameters():
        grad = param.grad
        if grad != None:
            gradient.append(grad.cpu().numpy().astype(np.float16))
            # gradient.append(grad.clone().detach())
        else:
            gradient.append(None)

    return gradient

In [18]:
def train(train_loader, model, optimizer, scheduler, args, gradient=False):
    model.train()

    total_preds = []
    total_targets = []
    for step, batch in enumerate(train_loader):
        input = process_batch(batch, args)
        
        preds = model(input)
        targets = input[3] # correct
        index = input[-1] # gather index
        # print("preds shape",preds)
        # print("targets shape",targets.dtype)
        # print("index shape",index.dtype)
        loss = compute_loss(preds, targets, index)
        loss.backward()

        # save gradient distribution
        if gradient:
            args.n_iteration += 1
            args.gradient[f'iteration_{args.n_iteration}'] = get_gradient(model)

        # grad clip
        if args.clip_grad:
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)

        optimizer.step()
        optimizer.zero_grad()

        # warmup scheduler
        if args.scheduler == 'linear_warmup':
            scheduler.step()

        # predictions
        preds = preds.gather(1, index).view(-1)
        targets = targets.gather(1, index).view(-1)

        if args.device == 'cuda':
            preds = preds.to('cpu').detach().numpy()
            targets = targets.to('cpu').detach().numpy()
        else: # cpu
            preds = preds.detach().numpy()
            targets = targets.detach().numpy()

        total_preds.append(preds)
        total_targets.append(targets)

    total_preds = np.concatenate(total_preds)
    total_targets = np.concatenate(total_targets)

    # Train AUC / ACC
    auc, acc = get_metric(total_targets, total_preds)

    return auc, acc


In [19]:
def validate(valid_loader, model, args):
    model.eval()

    total_preds = []
    total_targets = []
    for step, batch in enumerate(valid_loader):
        input = process_batch(batch, args)

        preds = model(input)
        targets = input[3] # correct
        index = input[-1] # gather index

        # predictions
        preds = preds.gather(1, index).view(-1)
        targets = targets.gather(1, index).view(-1)

        if args.device == 'cuda':
            preds = preds.to('cpu').detach().numpy()
            targets = targets.to('cpu').detach().numpy()
        else: # cpu
            preds = preds.detach().numpy()
            targets = targets.detach().numpy()

        total_preds.append(preds)
        total_targets.append(targets)

    total_preds = np.concatenate(total_preds)
    total_targets = np.concatenate(total_targets)

    # Train AUC / ACC
    auc, acc = get_metric(total_targets, total_preds)

    return auc, acc, total_preds, total_targets


In [20]:
def run(args, train_data, valid_data, gradient=False):

    # 캐시 메모리 비우기 및 가비지 컬렉터 가동!
    torch.cuda.empty_cache()
    gc.collect()

    # augmentation
    augmented_train_data = data_augmentation(train_data, args)
    if len(augmented_train_data) != len(train_data):
        print(f"Data Augmentation applied. Train data {len(train_data)} -> {len(augmented_train_data)}\n")

    train_loader, valid_loader = get_loaders(args, augmented_train_data, valid_data)

    # only when using warmup scheduler
    args.total_steps = int(len(train_loader.dataset) / args.batch_size) * (args.n_epochs)
    args.warmup_steps = args.total_steps // 10

    model = get_model(args)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    # 🌟 분석에 사용할 값 저장 🌟
    report = OrderedDict()

    # gradient step 분석에 사용할 변수
    if gradient:
        args.n_iteration = 0
        args.gradient = OrderedDict()

        # 모델의 gradient값을 가리키는 모델 명 저장
        args.gradient['name'] = [name for name, _ in model.named_parameters()]

    best_auc = -1
    best_auc_epoch = -1
    best_acc = -1
    best_acc_epoch = -1
    for epoch in notebook.tqdm(range(args.n_epochs)):
        epoch_report = {}

        ### TRAIN
        train_start_time = time.time()
        train_auc, train_acc = train(train_loader, model, optimizer, scheduler, args, gradient)
        train_time = time.time() - train_start_time

        epoch_report['train_auc'] = train_auc
        epoch_report['train_acc'] = train_acc
        epoch_report['train_time'] = train_time

        ### VALID
        valid_start_time = time.time()
        valid_auc, valid_acc, preds, targets = validate(valid_loader, model, args)
        valid_time = time.time() - valid_start_time

        epoch_report['valid_auc'] = valid_auc
        epoch_report['valid_acc'] = valid_acc
        epoch_report['valid_time'] = valid_time

        # save lr
        epoch_report['lr'] = optimizer.param_groups[0]['lr']


        # 🌟 save it to report 🌟
        report[f'{epoch + 1}'] = epoch_report


        ### TODO: model save or early stopping
        if valid_auc > best_auc:
            best_auc = valid_auc
            best_auc_epoch = epoch + 1

        if valid_acc > best_acc:
            best_acc = valid_acc
            best_acc_epoch = epoch + 1

        # scheduler
        if args.scheduler == 'plateau':
            scheduler.step(best_auc)

    # save best records
    report['best_auc'] = best_auc
    report['best_auc_epoch'] = best_auc_epoch
    report['best_acc'] = best_acc
    report['best_acc_epoch'] = best_acc_epoch

    # save gradient informations
    if gradient:
        report['gradient'] = args.gradient
        del args.gradient
        del args['gradient']

    return report


In [21]:
config = {}
DATA_PATH = './data'
FILE_PATH = 'train_data.csv'
# 설정
config['seed'] = 42
config['device'] = "cuda" if torch.cuda.is_available() else "cpu"
config['data_dir'] = DATA_PATH

# 데이터
config['max_seq_len'] = 300

# 데이터 증강 (Data Augmentation)
config['window'] = False
config['stride'] = config['max_seq_len']
config['shuffle'] = False
config['shuffle_n'] = 2

# 모델
config['hidden_dim'] = 128
config['n_layers'] = 1
config['dropout'] = 0.0
config['n_heads'] = 4

# T Fixup
config['Tfixup'] = False
config['layer_norm'] = True

# 훈련
config['n_epochs'] = 10
config['batch_size'] = 64
config['lr'] = 0.0001
config['clip_grad'] = 2.0

### 중요 ###
config['model'] = 'saint'
config['optimizer'] = 'adam'
config['scheduler'] = 'plateau'

args = easydict.EasyDict(config)

In [22]:
preprocess = Preprocess(args)
preprocess.load_train_data(FILE_PATH)

train_data = preprocess.get_train_data()
train_data, valid_data = preprocess.split_data(train_data)

elapsed nan값의 개수: 0
KnowledgeTag_percent nan값의 개수: 0
cumulative nan값의 개수: 0


In [23]:
print(f"훈련(train) 데이터 준비 완료 : {len(train_data)} 개")
print(f"검증(valid) 데이터 준비 완료 : {len(valid_data)} 개")

훈련(train) 데이터 준비 완료 : 4688 개
검증(valid) 데이터 준비 완료 : 2010 개


In [24]:
# 배치 단위로 주어지는 데이터를 살펴보자
for i,group in enumerate(np.asarray(train_data)):
     a,b,c,d,e,f,g,h = np.asarray(group)
     print(f"testID len : {len(a)}")
     print(f"assessmentItemID len : {len(b)}")
     print(f"KnowledgeTag size : {len(c)}")
     print(f"target size : {len(d)}")
     print(f"target size : {len(e)}")
     print(f"target size : {len(f)}")
     print(f"target size : {len(g)}")
     print(f"target size : {len(h)}")
     if i ==3:
          break

testID len : 24
assessmentItemID len : 24
KnowledgeTag size : 24
target size : 24
target size : 24
target size : 24
target size : 24
target size : 24
testID len : 573
assessmentItemID len : 573
KnowledgeTag size : 573
target size : 573
target size : 573
target size : 573
target size : 573
target size : 573
testID len : 74
assessmentItemID len : 74
KnowledgeTag size : 74
target size : 74
target size : 74
target size : 74
target size : 74
target size : 74
testID len : 110
assessmentItemID len : 110
KnowledgeTag size : 110
target size : 110
target size : 110
target size : 110
target size : 110
target size : 110


In [25]:
# 출처 : https://www.kaggle.com/bminixhofer/a-validation-framework-impact-of-the-random-seed

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True



In [26]:
# report에서 auc및 실행 시간 정보 얻기
def time_auc(report, n_epoch=10):
    total_time = 0
    for epoch in range(1, n_epoch + 1):
        result = report[str(epoch)]
        total_time += result['train_time']
        total_time += result['valid_time']

    return total_time, report['best_auc'], report['best_acc']

In [27]:
# seed 설정
seed_everything(args.seed)

# Gradient 분포도 체크할 것이므로 True로 표시
report = run(args, train_data, valid_data, gradient=True)
total_time, auc, acc = time_auc(report)

print(f"Cost Time : {total_time} sec, best AUC : {auc}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))


Cost Time : 134.3642065525055 sec, best AUC : 0.7089434184204289


In [28]:
print(f"AUC : {report['best_auc']} at epoch {report['best_auc_epoch']}")
print(f"ACC : {report['best_acc']} at epoch {report['best_acc_epoch']}")

AUC : 0.7089434184204289 at epoch 9
ACC : 0.6651741293532338 at epoch 9
