In [1]:
import argparse

import os
import numpy as np
import pandas as pd
from datetime import datetime
import time
import tqdm
import random
import gc

import pdb
import wandb

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import QuantileTransformer

import torch
import torch.nn as nn
from transformers.models.bert.modeling_bert import BertConfig, BertEncoder, BertModel
from transformers import GPT2Config, GPT2Model
from transformers import XLMRobertaModel, XLMRobertaConfig 

from dkt.dataloader import Preprocess
from dkt import trainer
from dkt.utils import setSeeds, increment_path, delete_model
from dkt.optimizer import get_optimizer
from dkt.scheduler import get_scheduler
from dkt.trainer import compute_loss, update_params, get_lr, save_checkpoint
from dkt.metric import get_metric
from dkt.criterion import get_criterion
from dkt.model import *

In [2]:
namespace = {
    'seed' : 42,
    'device' : 'cuda',
    'data_dir' : '/opt/ml/input/data/train_dataset',
    'asset_dir' : '/opt/ml/asset/',
    'file_name' : 'train_data.csv',
    'model_dir' : '/opt/ml/models/',
    'model_name' : 'model.pt',
    'output_dir' : '/opt/ml/output/',
    'test_file_name' : 'test_data.csv',
    'max_seq_len' : 128,
    'window' : True,
    'shuffle' : False,
    'shuffle_n' : 2,
    'num_workers' : 1,
    'hidden_dim' : 512,
    'n_layers' : 2,
    'n_heads' : 2,
    'drop_out' : 0.2,
    'n_epochs' : 200,
    'batch_size' : 64,
    'lr' : 0.0001,
    'clip_grad' : 10,
    'patience' : 15,
    'log_steps' : 50,
    'model' : 'lstm',
    'optimizer' : 'adam',
    'scheduler' : 'plateau'
}

args = argparse.Namespace(**namespace)
device = "cuda" if torch.cuda.is_available() else "cpu"
args.stride = args.max_seq_len
args.device = device
setSeeds(args.seed)

## preprocess

In [3]:
class Preprocess:
    def __init__(self, args):
        self.args = args
        self.train_data = None
        self.test_data = None
        

    def get_train_data(self):
        return self.train_data

    def get_test_data(self):
        return self.test_data

    def split_data(self, data, ratio=0.7, shuffle=True, seed=0):
        """
        split data into two parts with a given ratio.
        """
        if shuffle:
            random.seed(seed) # fix to default seed 0
            random.shuffle(data)

        size = int(len(data) * ratio)
        data_1 = data[:size]
        data_2 = data[size:]

        return data_1, data_2

    def __save_labels(self, encoder, name):
        le_path = os.path.join(self.args.asset_dir, name + '_classes.npy')
        np.save(le_path, encoder.classes_)

    def __preprocessing(self, df, is_train = True):
        cate_cols = ['assessmentItemID', 'testId', 'KnowledgeTag']

        if not os.path.exists(self.args.asset_dir):
            os.makedirs(self.args.asset_dir)
            
        for col in cate_cols:
            
            le = LabelEncoder()
            if is_train:
                #For UNKNOWN class
                a = df[col].unique().tolist() + ['unknown']
                le.fit(a)
                self.__save_labels(le, col)
            else:
                label_path = os.path.join(self.args.asset_dir,col+'_classes.npy')
                le.classes_ = np.load(label_path)
                
                df[col] = df[col].apply(lambda x: x if str(x) in le.classes_ else 'unknown')

            #모든 컬럼이 범주형이라고 가정
            df[col]= df[col].astype(str)
            test = le.transform(df[col])
            df[col] = test
            

        # def convert_time(s):
        #     timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple())
        #     return int(timestamp)

        # df['Timestamp'] = df['Timestamp'].apply(convert_time)
        
        return df

    def __feature_engineering(self, df):
        # 큰 카테고리
        df['grade'] = df['testId'].str[2].astype(int)
        
        # 이전 문제 소모시간 추가
        df['tmp_index'] = df.index
        tmp_df = df[['userID', 'testId', 'Timestamp', 'tmp_index']].shift(1)
        tmp_df['tmp_index'] += 1
        tmp_df = tmp_df.rename(columns={'Timestamp':'prior_timestamp'})
        df = df.merge(tmp_df, how='left', on=['userID', 'testId', 'tmp_index'])
        df['prior_elapsed'] = (df.Timestamp - df.prior_timestamp).dt.seconds

        upper_bound = df['prior_elapsed'].quantile(0.98) # outlier 설정
        median = df[df['prior_elapsed'] <= upper_bound]['prior_elapsed'].median() 
        df.loc[df['prior_elapsed'] > upper_bound, 'prior_elapsed'] = median 
        df['prior_elapsed'] = df['prior_elapsed'].fillna(median) # 빈값 채우기

        df['prior_elapsed'] = np.log1p(df['prior_elapsed']) #
        df['prior_elapsed'] = QuantileTransformer(output_distribution='normal').fit_transform(df.prior_elapsed.values.reshape(-1,1)).reshape(-1) 

        # 문제 평균 소모시간 추가
        assess_time = df.groupby('assessmentItemID').prior_elapsed.mean()
        assess_time.name = 'mean_elapsed'
        df = df.merge(assess_time, how='left', on=['assessmentItemID'])

        # 테스트 평균 소모시간 추가
        test_time = df.groupby('testId').prior_elapsed.mean()
        test_time.name = 'test_time'
        df = df.merge(test_time, how='left', on=['testId'])

        # 대분류별 평균 소모시간 추가
        grade_time = df.groupby('grade').prior_elapsed.mean()
        grade_time.name = 'grade_time'
        df = df.merge(grade_time, how='left', on=['grade'])

        # 수치형 로그
        # df['mean_elapsed'] = np.log1p(df['mean_elapsed'])
        # df['test_time'] = np.log1p(df['test_time'])
        # df['grade_time'] = np.log1p(df['grade_time'])

        # user&태그별 누적 카운트
        # df['tag_cumCount'] = df.groupby(['userID', 'KnowledgeTag']).cumcount()
        # df['tag_cumCount'] = np.log1p(df['tag_cumCount'])

        # user&태그별 누적 정답횟수
        df['tag_cumAnswer'] = df.groupby(['userID', 'KnowledgeTag']).answerCode.cumsum() - df['answerCode']
        df['tag_cumAnswer'] = np.log1p(df['tag_cumAnswer'])

        #TODO
        def percentile(s):
            return np.sum(s) / len(s)

        df = df.reset_index()
        # 큰 카테고리
        df['big_features'] = df['testId'].apply(lambda x : x[2]).astype(int)

        # 큰 카테고리별 정답률
        stu_groupby = df.groupby('big_features').agg({
        'assessmentItemID': 'count',
        'answerCode': percentile
        }).rename(columns = {'answerCode' : 'answer_rate'})

        # tag별 정답률
        stu_tag_groupby = df.groupby(['big_features', 'KnowledgeTag']).agg({
        'assessmentItemID': 'count',
        'answerCode': percentile
        }).rename(columns = {'answerCode' : 'answer_rate'})

        # 시험지별 정답률
        stu_test_groupby = df.groupby(['big_features', 'testId']).agg({
        'assessmentItemID': 'count',
        'answerCode': percentile
        }).rename(columns = {'answerCode' : 'answer_rate'})

        # 문항별 정답률
        stu_assessment_groupby = df.groupby(['big_features', 'assessmentItemID']).agg({
        'assessmentItemID': 'count',
        'answerCode': percentile
        }).rename(columns = {'assessmentItemID' : 'assessment_count', 'answerCode' : 'answer_rate'})

        df = df.sort_values(by='index', axis=0)

        # 정답 - 큰 카테고리별 정답률 
        '''ex)
        맞은 문제의 큰 카테고리별 정답률이 0.7 이면 1 - 0.7 = 0.3이 됨)
        틀린 문제의 큰 카테고리별 정답률이 0.7 이면 0 - 0.7 = -0.7이 됨)
        '''
        temp = pd.merge(df, stu_groupby.reset_index()[['big_features', 'answer_rate']], on = ['big_features'])
        temp = temp.sort_values(by='index', axis=0).set_index('index')
        df['big_mean'] = temp['answer_rate']
        df['answer_delta'] = temp['answerCode'] - temp['answer_rate']

        # 정답 - 태그별 정답률
        temp = pd.merge(df, stu_tag_groupby.reset_index()[['answer_rate', 'big_features', 'KnowledgeTag']], on = ['big_features', 'KnowledgeTag'])
        temp = temp.sort_values(by='index', axis=0).set_index('index')
        df['tag_mean'] = temp['answer_rate']
        df['tag_delta'] = temp['answerCode'] - temp['answer_rate']

        # 정답 - 시험별 정답률
        temp = pd.merge(df, stu_test_groupby.reset_index()[['answer_rate', 'big_features', 'testId']], on = ['big_features', 'testId'])
        temp = temp.sort_values(by='index', axis=0).set_index('index')
        df['test_mean'] = temp['answer_rate']
        df['test_delta'] = temp['answerCode'] - temp['answer_rate']

        # 정답 - 문항별 정답률
        temp = pd.merge(df, stu_assessment_groupby.reset_index()[['answer_rate', 'big_features', 'assessmentItemID']], on = ['big_features', 'assessmentItemID'])
        temp = temp.sort_values(by='index', axis=0).set_index('index')
        df['assess_mean'] = temp['answer_rate']
        df['assess_delta'] = temp['answerCode'] - temp['answer_rate']


        return df

    def load_data_from_file(self, file_name, is_train=True):
        csv_file_path = os.path.join(self.args.data_dir, file_name)
        df = pd.read_csv(csv_file_path, parse_dates=['Timestamp'])#, nrows=100000)
        df = self.__feature_engineering(df)
        df = self.__preprocessing(df, is_train)

        # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용

                
        self.args.n_questions = len(np.load(os.path.join(self.args.asset_dir,'assessmentItemID_classes.npy')))
        self.args.n_test = len(np.load(os.path.join(self.args.asset_dir,'testId_classes.npy')))
        self.args.n_tag = len(np.load(os.path.join(self.args.asset_dir,'KnowledgeTag_classes.npy')))
        self.args.n_big_features = 9


        df = df.sort_values(by=['userID','Timestamp'], axis=0)
        columns = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag', 'big_features', 'answer_delta', 'tag_delta', 'test_delta', 'assess_delta', 'big_mean', 'tag_mean', 'test_mean', 'assess_mean', 'prior_elapsed', 'mean_elapsed', 'test_time', 'grade_time', 'tag_cumAnswer']

        group = df[columns].groupby('userID').apply(
                lambda r: (
                    r['testId'].values, 
                    r['assessmentItemID'].values,
                    r['KnowledgeTag'].values,
                    r['answerCode'].values,
                    r['big_features'].values,
                    r['answer_delta'].values,
                    r['tag_delta'].values,
                    r['test_delta'].values,
                    r['assess_delta'].values,
                    r['big_mean'].values,
                    r['tag_mean'].values,
                    r['test_mean'].values,
                    r['assess_mean'].values,
                    r['prior_elapsed'].values,
                    r['mean_elapsed'].values,
                    r['test_time'].values,
                    r['grade_time'].values,
                    r['tag_cumAnswer'].values,
                )
            )

        return group.values

    def load_train_data(self, file_name):
        self.train_data = self.load_data_from_file(file_name)

    def load_test_data(self, file_name):
        self.test_data = self.load_data_from_file(file_name, is_train= False)


# Data augmentation
def slidding_window(data, args):
    window_size = args.max_seq_len
    stride = args.stride

    augmented_datas = []
    for row in data:
        seq_len = len(row[0])

        # 만약 window 크기보다 seq len이 같거나 작으면 augmentation을 하지 않는다
        if seq_len <= window_size:
            augmented_datas.append(row)
        else:
            total_window = ((seq_len - window_size) // stride) + 1
            
            # 앞에서부터 slidding window 적용
            for window_i in range(total_window):
                # window로 잘린 데이터를 모으는 리스트
                window_data = []
                for col in row:
                    window_data.append(col[window_i*stride:window_i*stride + window_size])

                # Shuffle
                # 마지막 데이터의 경우 shuffle을 하지 않는다
                if args.shuffle and window_i + 1 != total_window:
                    shuffle_datas = shuffle(window_data, window_size, args)
                    augmented_datas += shuffle_datas
                else:
                    augmented_datas.append(tuple(window_data))

            # slidding window에서 뒷부분이 누락될 경우 추가
            total_len = window_size + (stride * (total_window - 1))
            if seq_len != total_len:
                window_data = []
                for col in row:
                    window_data.append(col[-window_size:])
                augmented_datas.append(tuple(window_data))


    return augmented_datas


def shuffle(data, data_size, args):
    shuffle_datas = []
    for i in range(args.shuffle_n):
        # shuffle 횟수만큼 window를 랜덤하게 계속 섞어서 데이터로 추가
        shuffle_data = []
        random_index = np.random.permutation(data_size)
        for col in data:
            shuffle_data.append(col[random_index])
        shuffle_datas.append(tuple(shuffle_data))
    return shuffle_datas

def data_augmentation(data, args):
    if args.window == True:
        data = slidding_window(data, args)

    return data

In [4]:
preprocess = Preprocess(args)
preprocess.load_train_data(args.file_name)
train_data = preprocess.get_train_data()

train_data, valid_data = preprocess.split_data(train_data)

In [5]:
class DKTDataset(torch.utils.data.Dataset):
    def __init__(self, data, args):
        self.data = data
        self.args = args

    def __getitem__(self, index):
        row = self.data[index]

        # 각 data의 sequence length
        seq_len = len(row[0])

        test, question, tag, correct, big_features, answer_delta, tag_delta, test_delta, assess_delta, big_mean, tag_mean, test_mean, assess_mean, prior_elapsed, mean_elapsed, test_time, grade_time, tag_cumAnswer = row
        

        cate_cols = [test, question, tag, correct, big_features]
        cont_cols = [answer_delta, tag_delta, test_delta, assess_delta, big_mean, tag_mean, test_mean, assess_mean, prior_elapsed, mean_elapsed, test_time, grade_time, tag_cumAnswer]
        
        # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다
        if seq_len > self.args.max_seq_len:
            for i, col in enumerate(cate_cols):
                cate_cols[i] = col[-self.args.max_seq_len:]
            mask = np.ones(self.args.max_seq_len, dtype=np.int16)
        else:
            mask = np.zeros(self.args.max_seq_len, dtype=np.int16)
            mask[-seq_len:] = 1

        # mask도 columns 목록에 포함시킴
        cate_cols.append(mask)

        if seq_len > self.args.max_seq_len:
            for i, col in enumerate(cont_cols):
                cont_cols[i] = col[-self.args.max_seq_len:]

        # np.array -> torch.tensor 형변환
        for i, col in enumerate(cate_cols):
            cate_cols[i] = torch.tensor(col)

        # np.array -> torch.tensor 형변환
        for i, col in enumerate(cont_cols):
            cont_cols[i] = torch.tensor(col)

        return cate_cols, cont_cols

    def __len__(self):
        return len(self.data)


from torch.nn.utils.rnn import pad_sequence

def collate(batch):
    cate_col_n = len(batch[0][0])
    cont_col_n = len(batch[0][1])

    cate_col_list = [[] for _ in range(cate_col_n)]
    cont_col_list = [[] for _ in range(cont_col_n)]

    max_seq_len = len(batch[0][0][-1])

        
    # batch의 값들을 각 column끼리 그룹화
    for row in batch:
        for i, col in enumerate(row[0]):
            pre_padded = torch.zeros(max_seq_len)
            pre_padded[-len(col):] = col
            cate_col_list[i].append(pre_padded)
        for i, col in enumerate(row[1]):
            pre_padded = torch.zeros(max_seq_len)
            pre_padded[-len(col):] = col
            cont_col_list[i].append(pre_padded)


    for i, _ in enumerate(cate_col_list):
        cate_col_list[i] =torch.stack(cate_col_list[i])
    
    for i, _ in enumerate(cont_col_list):
        cont_col_list[i] =torch.stack(cont_col_list[i])

    return tuple(cate_col_list), tuple(cont_col_list)


def get_loaders(args, train, valid):

    pin_memory = False
    train_loader, valid_loader = None, None
    
    if train is not None:
        trainset = DKTDataset(train, args)
        train_loader = torch.utils.data.DataLoader(trainset, num_workers=args.num_workers, shuffle=True,
                            batch_size=args.batch_size, pin_memory=pin_memory, collate_fn=collate)
    if valid is not None:
        valset = DKTDataset(valid, args)
        valid_loader = torch.utils.data.DataLoader(valset, num_workers=args.num_workers, shuffle=False,
                            batch_size=args.batch_size, pin_memory=pin_memory, collate_fn=collate)

    return train_loader, valid_loader

In [6]:
train_loader, valid_loader = get_loaders(args, train_data, valid_data)

In [7]:
for batch in train_loader:
    break

In [8]:
def process_batch(batch, args):

    (test, question, tag, correct, big_features, mask), cont_features = batch    
    
    # change to float
    mask = mask.type(torch.FloatTensor)
    correct = correct.type(torch.FloatTensor)
    big_features = big_features.type(torch.FloatTensor)

    temp = []

    # interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용
    interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다.
    interaction = interaction.roll(shifts=1, dims=1)
    interaction_mask = mask.roll(shifts=1, dims=1)
    interaction_mask[:, 0] = 0
    interaction = (interaction * interaction_mask).to(torch.int64)
    # print(interaction)
    # exit()
    #  test_id, question_id, tag
    test = ((test + 1) * mask).to(torch.int64)
    question = ((question + 1) * mask).to(torch.int64)
    tag = ((tag + 1) * mask).to(torch.int64)

    big_features = (big_features * mask).to(torch.int64)

    # gather index
    # 마지막 sequence만 사용하기 위한 index
    gather_index = torch.tensor(np.count_nonzero(mask, axis=1))
    gather_index = gather_index.view(-1, 1) - 1


    for i, cont_feature in enumerate(cont_features):
        cont_feature = cont_feature.type(torch.FloatTensor)
        if i < 4:
            cont_feature = cont_feature.roll(shifts=1, dims=1)
            cont_feature[:, 0] = 0 # set padding index to the first sequence
            cont_feature = (cont_feature * interaction_mask).unsqueeze(-1)
        else:
            cont_feature = (cont_feature * mask).unsqueeze(-1)
        temp.append(cont_feature)
    
    # device memory로 이동
    test = test.to(args.device)
    question = question.to(args.device)
    tag = tag.to(args.device)
    correct = correct.to(args.device)
    mask = mask.to(args.device)
    interaction = interaction.to(args.device)
    big_features = big_features.to(args.device)
    gather_index = gather_index.to(args.device)

    cont_features = torch.cat(temp, dim=-1).to(args.device)

    return (test, question,
            tag, correct, mask,
            interaction, big_features, gather_index), cont_features

In [9]:
input = process_batch(batch, args)

In [10]:
args.n_cate = 5
args.n_cont = 13

In [11]:
def compute_loss(preds, targets):
    """
    Args :
        preds   : (batch_size, max_seq_len)
        targets : (batch_size, max_seq_len)

    """
    loss = get_criterion(preds, targets)
    #마지막 시퀀드에 대한 값만 loss 계산
    # loss = loss[:,-1]
    # loss = loss[:, :-1]
    loss = torch.mean(loss)
    return loss

In [12]:
def train(train_loader, model, optimizer, scheduler, args):
    model.train()

    total_preds = []
    total_targets = []
    losses = []
    for step, batch in enumerate(train_loader):
        input = process_batch(batch, args)
        preds = model(input)
        targets = input[0][3] # correct

        loss = compute_loss(preds, targets)
        update_params(loss, model, optimizer, args)
        if args.scheduler != 'plateau':
            scheduler.step()

        if step % args.log_steps == 0:
            print(f"Training steps: {step} Loss: {str(loss.item())}")
        
        # predictions
        preds = preds[:,-1]
        targets = targets[:,-1]

        if args.device == 'cuda':
            preds = preds.to('cpu').detach().numpy()
            targets = targets.to('cpu').detach().numpy()
        else: # cpu
            preds = preds.detach().numpy()
            targets = targets.detach().numpy()
        
        total_preds.append(preds)
        total_targets.append(targets)
        losses.append(loss)
      

    total_preds = np.concatenate(total_preds)
    total_targets = np.concatenate(total_targets)

    # Train AUC / ACC
    auc, acc = get_metric(total_targets, total_preds)
    loss_avg = sum(losses)/len(losses)
    print(f'TRAIN AUC : {auc} ACC : {acc}')
    return auc, acc, loss_avg

def validate(valid_loader, model, args):
    model.eval()

    total_preds = []
    total_targets = []
    for step, batch in enumerate(valid_loader):
        input = process_batch(batch, args)

        preds = model(input)
        targets = input[0][3] # correct


        # predictions
        preds = preds[:,-1]
        targets = targets[:,-1]
    
        if args.device == 'cuda':
            preds = preds.to('cpu').detach().numpy()
            targets = targets.to('cpu').detach().numpy()
        else: # cpu
            preds = preds.detach().numpy()
            targets = targets.detach().numpy()

        total_preds.append(preds)
        total_targets.append(targets)

    total_preds = np.concatenate(total_preds)
    total_targets = np.concatenate(total_targets)

    # Train AUC / ACC
    auc, acc = get_metric(total_targets, total_preds)
    
    print(f'VALID AUC : {auc} ACC : {acc}\n')

    return auc, acc, total_preds, total_targets

In [18]:
class GPT2(nn.Module):
    
    def __init__(self, args):
        super(GPT2, self).__init__()
        self.args = args
        self.device = args.device

        # Defining some parameters
        self.hidden_dim = self.args.hidden_dim
        self.n_layers = self.args.n_layers

        # Embedding 
        # interaction은 현재 correct으로 구성되어있다. correct(1, 2) + padding(0)
        self.embedding_interaction = nn.Embedding(3, self.hidden_dim//3)
        self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim//3)
        self.embedding_question = nn.Embedding(self.args.n_questions + 1, self.hidden_dim//3)
        self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3)
        # 큰 카테고리 embedding 추가
        self.embedding_big = nn.Embedding(self.args.n_big_features + 1, self.hidden_dim//3)

        # embedding combination projection
        self.comb_proj = nn.Sequential(
            nn.Linear((self.hidden_dim//3)*5, self.hidden_dim//2),
            nn.LayerNorm(self.hidden_dim//2)
        )

        # cont features
        self.cont_embed = nn.Sequential(
            nn.Linear(self.args.n_cont, self.hidden_dim//2),
            nn.LayerNorm(self.hidden_dim//2)
        )

        # Bert config
        self.config = GPT2Config( 
            3, # not used
            n_embd=self.hidden_dim,
            n_layer=self.args.n_layers,
            n_head=self.args.n_heads,
            n_positions=self.args.max_seq_len          
        )

        # Defining the layers
        # Bert Layer
        self.encoder = GPT2Model(self.config)  

        # Fully connected layer
        self.fc = nn.Linear(self.args.hidden_dim, 1)
        self.dropout = nn.Dropout(p=args.drop_out)
       
        self.activation = nn.Sigmoid()


    def forward(self, input):
        (test, question, tag, _, mask, interaction, big_features, _), cont_features = input

        batch_size = interaction.size(0)

        # 신나는 embedding
        embed_interaction = self.embedding_interaction(interaction)
        embed_test = self.embedding_test(test)
        embed_question = self.embedding_question(question)
        embed_tag = self.embedding_tag(tag)
        embed_big = self.embedding_big(big_features)
        

        embed = torch.cat([embed_interaction,
                           embed_test,
                           embed_question,
                           embed_big,
                           embed_tag,
                           ], 2)

        cate_embed = self.comb_proj(embed)
        cont_embed = self.cont_embed(cont_features)

        # cate변수와 cont변수를 concat해서 bert의 input에 넣어줌        
        X = torch.cat([cate_embed, cont_embed], 2)

        # Bert
        encoded_layers = self.encoder(inputs_embeds=X, attention_mask=mask)
        out = encoded_layers.last_hidden_state
        out = out.contiguous().view(batch_size, -1, self.hidden_dim)
        out = self.dropout(self.fc(out))
        preds = self.activation(out).view(batch_size, -1)

        return preds

In [119]:
# args.hidden_dim = 512
args.optimizer = 'adamW'

In [19]:
args.model = 'gpt2'

if args.model == 'customlstm':
    model = CustomLSTM(args)
elif args.model == 'customlstmattn':
    model = CustomLSTMATTN(args)
elif args.model == 'custombert':
    model = CustomBert(args)
elif args.model == 'customxlm':
    model = CustomXlmRoberta(args)
elif args.model == 'customsaint':
    model = CustomSaint(args)
elif args.model == 'customlastquery':
    model = CustomLastQuery(args)
elif args.model == 'gpt2':
    model = GPT2(args)
else:
    raise NotImplementedError
model.to(device)
print()




In [20]:
# /args.scheduler = 'plateau'

In [21]:
# only when using warmup scheduler
args.total_steps = int(len(train_loader.dataset) / args.batch_size) * (args.n_epochs)
args.warmup_steps = args.total_steps // 10
        
optimizer = get_optimizer(model, args)
scheduler = get_scheduler(optimizer, args)

best_auc = -1
early_stopping_counter = 0

for epoch in range(1):

    print(f"Start Training: Epoch {epoch + 1}")
    
    ### TRAIN
    train_auc, train_acc, train_loss = train(train_loader, model, optimizer, scheduler, args)
    
    ### VALID
    auc, acc,_ , _ = validate(valid_loader, model, args)

    lr = get_lr(optimizer)
    ### TODO: model save or early stopping
    # wandb.log({"epoch": epoch, "train_loss": train_loss, "train_auc": train_auc, "train_acc":train_acc,
    #             "valid_auc":auc, "valid_acc":acc, "lr":lr})
    if auc > best_auc:
        best_auc = auc
        # # torch.nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다.
        # model_to_save = model.module if hasattr(model, 'module') else model
        # delete_model(args.model_dir)
        # save_checkpoint({
        #     'epoch': epoch + 1,
        #     'state_dict': model_to_save.state_dict(),
        #     },
        #     args.model_dir, f'model_{epoch + 1}.pt',
        # )
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= args.patience:
            print(f'EarlyStopping counter: {early_stopping_counter} out of {args.patience}')
            break

    # scheduler
    if args.scheduler == 'plateau':
        scheduler.step(best_auc)

Start Training: Epoch 1
Training steps: 0 Loss: 0.6987665891647339
Training steps: 50 Loss: 0.467035174369812
TRAIN AUC : 0.731706516596211 ACC : 0.6512372013651877
VALID AUC : 0.7932390521898596 ACC : 0.7059701492537314



In [23]:
args.add_features = ["bigfeature", "answer_delta","tag_delta","test_delta","assess_delta", "big_mean", 'tag_mean', 'test_mean', "assess_mean",'prior_elapsed', 'mean_elapsed', 'test_time', 'grade_time', 'tag_cumAnswer']
name = 'fe9_maxseq128_hiddendim512_gpt2'

args.model_dir = '/opt/ml/models/'
args.model_dir = increment_path(os.path.join(args.model_dir, args.model))
args.save_path = args.model_dir.split('/')[-1]
os.makedirs(args.model_dir, exist_ok=True)

wandb.login()

wandb.init(project='dkt', config=vars(args))
wandb.run.name = name

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcha-no[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.31 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [24]:
print(args.model)

if args.model == 'customlstm':
    model = CustomLSTM(args)
elif args.model == 'customlstmattn':
    model = CustomLSTMATTN(args)
elif args.model == 'custombert':
    model = CustomBert(args)
elif args.model == 'customxlm':
    model = CustomXlmRoberta(args)
elif args.model == 'customsaint':
    model = CustomSaint(args)
elif args.model == 'customlastquery':
    model = CustomLastQuery(args)
elif args.model == 'gpt2':
    model = GPT2(args)
else:
    raise NotImplementedError
model.to(device)
print()

gpt2



In [125]:
torch.cuda.empty_cache()
gc.collect()

# augmentation
augmented_train_data = data_augmentation(train_data, args)
if len(augmented_train_data) != len(train_data):
    print(f"Data Augmentation applied. Train data {len(train_data)} -> {len(augmented_train_data)}\n")

train_loader, valid_loader = get_loaders(args, augmented_train_data, valid_data)

Data Augmentation applied. Train data 4688 -> 10601



In [25]:
# only when using warmup scheduler
args.total_steps = int(len(train_loader.dataset) / args.batch_size) * (args.n_epochs)
args.warmup_steps = args.total_steps // 10
        
optimizer = get_optimizer(model, args)
scheduler = get_scheduler(optimizer, args)

best_auc = -1
early_stopping_counter = 0

for epoch in range(args.n_epochs):

    print(f"Start Training: Epoch {epoch + 1}")
    
    ### TRAIN
    train_auc, train_acc, train_loss = train(train_loader, model, optimizer, scheduler, args)
    
    ### VALID
    auc, acc,_ , _ = validate(valid_loader, model, args)

    lr = get_lr(optimizer)
    ### TODO: model save or early stopping
    wandb.log({"epoch": epoch, "train_loss": train_loss, "train_auc": train_auc, "train_acc":train_acc,
                "valid_auc":auc, "valid_acc":acc, "lr":lr})
    if auc > best_auc:
        best_auc = auc
        # # torch.nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다.
        model_to_save = model.module if hasattr(model, 'module') else model
        delete_model(args.model_dir)
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model_to_save.state_dict(),
            },
            args.model_dir, f'model_{epoch + 1}.pt',
        )
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= args.patience:
            print(f'EarlyStopping counter: {early_stopping_counter} out of {args.patience}')
            break

    # scheduler
    if args.scheduler == 'plateau':
        scheduler.step(best_auc)

wandb.finish()


Start Training: Epoch 1
Training steps: 0 Loss: 0.647724986076355
Training steps: 50 Loss: 0.459611177444458
TRAIN AUC : 0.7335956466683138 ACC : 0.6548634812286689
VALID AUC : 0.7953456495887474 ACC : 0.7044776119402985

saving model ...
Start Training: Epoch 2
Training steps: 0 Loss: 0.4702879786491394
Training steps: 50 Loss: 0.48483526706695557
TRAIN AUC : 0.7587215689206104 ACC : 0.6708617747440273
VALID AUC : 0.7978906255423943 ACC : 0.7243781094527363

saving model ...
Start Training: Epoch 3
Training steps: 0 Loss: 0.4561827480792999
Training steps: 50 Loss: 0.44875749945640564
TRAIN AUC : 0.756398126732713 ACC : 0.66339590443686
VALID AUC : 0.8027752734658982 ACC : 0.7233830845771144

saving model ...
Start Training: Epoch 4
Training steps: 0 Loss: 0.4813230037689209
Training steps: 50 Loss: 0.4687924087047577
TRAIN AUC : 0.7591796752140337 ACC : 0.6781143344709898
VALID AUC : 0.8016049415776272 ACC : 0.7169154228855721

Start Training: Epoch 5
Training steps: 0 Loss: 0.432658

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,79.0
train_loss,0.44601
train_auc,0.78203
train_acc,0.69027
valid_auc,0.81024
valid_acc,0.7393
lr,1e-05
_runtime,806.0
_timestamp,1623226836.0
_step,79.0


0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_loss,█▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_auc,▁▄▅▅▆▅▅▆▇▅▇▆▇▆▆▆▇▇▇▇▇▇▇██▇▇▇▇▆▇▇▇▇▇▇▇█▇█
train_acc,▁▂▅▄▆▄▄▅▆▆▆▆▇▅▅▅▇▆▆█▆▆▆▇▆▆▆▆▇▆▆█▇▇▇▆▇█▆▇
valid_auc,▁▄▄▄▅▅▅▅▆▅▆▆▆▆▇▇█▇▇▇▇▇▇▇▇▇▇█▇▇▇▇████████
valid_acc,▁▄▃▅▄▅▅▃▅▄▄▄▅▅▆▆▅▆▅▆▅▆▇▆▅▆▆▆▆▆▆▇█▇▇▇▇▇▇▇
lr,██████████████████████▄▄▄▄▄▄▄▄▂▂▂▂▂▂▂▂▁▁
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███


In [52]:
preprocess = Preprocess(args)
preprocess.load_test_data(args.test_file_name)
test_data = preprocess.get_test_data()

In [53]:
_, test_loader = get_loaders(args, None, test_data)

In [54]:
for batch in test_loader:
    break

input = process_batch(batch, args)

In [55]:
def load_model(args):    
    model_path = args.model_dir
    print("Loading Model from:", args.model_dir)
    load_state = torch.load(model_path)

    if args.model == 'customlstm':
        model = CustomLSTM(args)
    elif args.model == 'customlstmattn':
        model = CustomLSTMATTN(args)
    elif args.model == 'custombert':
        model = CustomBert(args)
    elif args.model == 'customxlm':
        model = CustomXlmRoberta(args)
    elif args.model == 'customsaint':
        model = CustomSaint(args)
    elif args.model == 'customlastquery':
        model = CustomLastQuery(args)
    else:
        raise NotImplementedError

    # 1. load model state
    model.load_state_dict(load_state['state_dict'], strict=True)
    
    print("Loading Model from:", model_path, "...Finished.")
    return model

In [56]:
args.model_dir = '/opt/ml/models'
args.model = 'custombert'
args.model_name = 'custombert15/model_41.pt'
args.model_dir = os.path.join(args.model_dir, args.model_name)
model = load_model(args)
model.to(device)
print()

Loading Model from: /opt/ml/models/custombert15/model_41.pt
Loading Model from: /opt/ml/models/custombert15/model_41.pt ...Finished.



In [57]:
def inference(args, test_data):
    model.eval()
    _, test_loader = get_loaders(args, None, test_data)
    
    
    total_preds = []
    
    for step, batch in enumerate(test_loader):
        input = process_batch(batch, args)

        preds = model(input)
        

        # predictions
        preds = preds[:,-1]
        

        if args.device == 'cuda':
            preds = preds.to('cpu').detach().numpy()
        else: # cpu
            preds = preds.detach().numpy()
            
        total_preds+=list(preds)
    
    output_path = os.path.dirname(args.model_name)
    os.makedirs(os.path.join(args.output_dir, output_path), exist_ok=True)
    write_path = os.path.join(args.output_dir, output_path, "output.csv")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)    
    print(write_path)
    with open(write_path, 'w', encoding='utf8') as w:
        print("writing prediction : {}".format(write_path))
        w.write("id,prediction\n")
        for id, p in enumerate(total_preds):
            w.write('{},{}\n'.format(id,p))

In [58]:
inference(args, test_data)

/opt/ml/output/custombert15/output.csv
writing prediction : /opt/ml/output/custombert15/output.csv
