In [4]:
import os
import pandas as pd
import numpy as np


import time
from datetime import datetime
import random

import pickle

import matplotlib.pyplot as plt
import os, random, torch
import numpy as np

In [5]:
def setSeeds(seed = 42):
    # 랜덤 시드를 설정하여 매 코드를 실행할 때마다 동일한 결과를 얻게 합니다.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)    
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [6]:
setSeeds(seed = 42)

In [7]:
%%time
dtype = {
    'userID': 'int16',
    'answerCode': 'int8',
    'KnowledgeTag': 'int16'
}   

# 데이터 경로 
train_path = '../input/data/train_dataset/train_data.csv'
test_path = '../input/data/train_dataset/test_data.csv'

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 12.6 µs


In [8]:
%%time
#데이터를 날짜/시간 포맷에 대한 명시적인 설정없이 그냥 pandas의 read_csv() 함수로 읽어와서 DataFrame을 만들 경우 아래와 같이 object 데이터 형태로 불어오게 됩니다

df = pd.read_csv(train_path)
df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
test_df = pd.read_csv(test_path)

CPU times: user 10.5 s, sys: 696 ms, total: 11.1 s
Wall time: 11.1 s


In [9]:
total_df = pd.concat([df, test_df[test_df.answerCode!=-1]], ignore_index=True)

### train, val split

In [10]:
# 임시 split function
def get_train_val_row_ids(df, test_df):
    train_size = int(len(df.groupby('userID'))*0.8)
    val_size = len(df.groupby('userID')) - train_size
    print(f'train_size = {train_size}, val_size = {val_size}')
    
    end_id_by_user = df.groupby('userID').apply(lambda x: x.index.values[-1]).reset_index()
    test_end_id_by_user = test_df.groupby('userID').apply(lambda x: x.index.values[-1]).reset_index()

    train_last = df.loc[end_id_by_user[0]]
    test_last = test_df.loc[test_end_id_by_user[0]]

    test_item_set = set(test_last.assessmentItemID.unique())

    user_id_lst = []
    for end_id in end_id_by_user[0]:
        if df.loc[end_id].assessmentItemID in test_item_set:
            user_id_lst.append(df.loc[end_id].userID)

    random.seed(0)
    random.shuffle(user_id_lst)
    print(f'test set 과 같은 문제를 예측하는 user 수 = {len(user_id_lst)}')
    val_user_ids = random.sample(user_id_lst, val_size)
    print(val_user_ids[:5])  

    train_user_ids = list(set(df.userID.unique())-set(val_user_ids))
    print(train_user_ids[:5])

    # 학습 과정에서 학습 샘플을 생성하기 위해서 필요한 유저별 row index를 저장
    row_ids_by_user_id = df.groupby('userID').apply(lambda x: x.index.values)
    row_ids_by_user_id = row_ids_by_user_id.reset_index()

    train_ids = []
    for ids in row_ids_by_user_id[row_ids_by_user_id.userID.isin(train_user_ids)][0]:
        train_ids.extend(ids)
    print(f'train_ids_len = {len(train_ids)}')

    val_ids = []
    for ids in row_ids_by_user_id[row_ids_by_user_id.userID.isin(val_user_ids)][0]:
        val_ids.extend(ids)
    print(f'val_ids_len = {len(val_ids)}')
    
    return train_ids, val_ids


In [11]:
# mktime: 시간의 부동 소수점을 나타내는 데 사용 시간 (초)을 리턴
def convert_time(s):
    timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple())
    return int(timestamp)

In [12]:
# -- 시간 feature
df['time_stamp'] = df['Timestamp'].apply(convert_time)

In [13]:
def feature_engineering(df, total_df):
    # 학습 과정에서 학습 샘플을 생성하기 위해서 필요한 유저별 row index를 저장
    row_ids_by_user_id = df.groupby('userID').apply(lambda x: x.index.values)
    
    # 학습 과정에서 학습 샘플을 생성하기 위해서 필요한 유저별 시작 row index를 저장
    start_row_id_by_user_id = df.groupby('userID').apply(lambda x:x.index.values[0])
    
    df['day'] = df.Timestamp.apply(lambda x:x.split()[0])
    
    # -- 정확도 feature, 이거는 전체 데이터 필요할 듯
    acc_by_test_id = total_df.groupby('testId').answerCode.mean()
    acc_by_assessment_item_id = total_df.groupby('assessmentItemID').answerCode.mean()
    acc_by_tag = total_df.groupby('KnowledgeTag').answerCode.mean()
    df['acc_avg_by_test_id'] = df['testId'].map(acc_by_test_id)
    df['acc_avg_by_assessment_item_id'] = df['assessmentItemID'].map(acc_by_assessment_item_id)
    df['acc_avg_by_tag'] = df['KnowledgeTag'].map(acc_by_tag)
    
    
    # -- 상대적 feature
    df['relative_answered_correctly'] = df['answerCode'] - df['acc_avg_by_assessment_item_id']
    
    
    # -- 과거 feature
    # user 별 이전 문제까지 정답 횟수
    df['prior_acc_count'] = df.groupby('userID')['answerCode'].cumsum().shift(fill_value=0)

    # 이전까지 푼 문제 수
    df['content'] = [1]*len(df)
    df['prior_quest_count'] = df.groupby('userID')['content'].cumsum().shift(fill_value=0)

    # 이전 문제까지의 정답률
    df['prior_acc'] = (df['prior_acc_count'] / df['prior_quest_count']).fillna(0)
    
    # 이전 문제 상대적인(relative) 정답률
    df['prior_relative_acc_sum'] = df.groupby('userID')['relative_answered_correctly'].cumsum().shift(fill_value=0)
    df['prior_relative_accuracy'] = (df['prior_relative_acc_sum'] / df['prior_quest_count']).fillna(0)
    
    # 각 문제 종류별(tag)로 이전에 몇번 풀었는지
    # 1번 문제 3번 풀었다 / 3번 문제 1번 풀었다 ..etc
    df['prior_tags_frequency'] = df.groupby(['userID', 'KnowledgeTag']).cumcount()

    user_start_idx = df['userID'].diff() > 0
    features = ['prior_acc_count',
                'prior_quest_count',
                'prior_acc',
                'prior_relative_acc_sum',
                'prior_relative_accuracy']
    # 각 학생의 첫 row는 prior feature의 값을 0으로 초기화한다
    df.loc[user_start_idx, features] = 0
    
    
    # -- 바로전 feature 생성
    # 각 시험지 문제별 마지막으로 푼 시간
    prev_timestamp_ac = df.groupby(['userID', 'testId','day'])[['time_stamp']].shift()

    # 각 문제 종류별 마지막으로 풀기 시작한 시점으로부터 지난 시간, elapsed_time
    # 해당 문제 종류를 마지막으로 푼 시점으로부터 시간이 오래 지날수록 문제를 맞추기 힘들 것이다
    df['diff_time_btw_item'] = (df['time_stamp'] - prev_timestamp_ac['time_stamp'])

    # nan값은 [ diff_time_btw_content_ids ] 데이터 중 0 imputation을 한다
    max_diff_time_btw_tags = df['diff_time_btw_item'].max()
    df['diff_time_btw_item'] = df['diff_time_btw_item'].fillna(0)
    
    # 각 시험지 문제 종류별 마지막으로 풀었을때 정답 여부
    prev_correct_ac = df.groupby(['userID', 'testId'])[['answerCode']].shift()        
    df['prev_answered_correctly'] = prev_correct_ac['answerCode'].fillna(0)
    
    # 로그 스케일을 적용해야하는 데이터들은 어떤거지??
    # 최소값이 0인 피처들은 로그 스케일을 하기 전에 +1을 해야할 듯 -> log1p가 그 역할을 함
    log1p_cols = ['time_stamp',
                  #'elapsed_time',
                  'prior_tags_frequency',
                  'diff_time_btw_item'
                 ]

    df[log1p_cols] = np.log1p(df[log1p_cols]+1)

    return df

In [14]:
df = feature_engineering(df, total_df)

### 수치형 feature

In [15]:
cont_cols = ['time_stamp',
            'diff_time_btw_item', 
            'prior_tags_frequency', 
            'prior_acc', 
            'acc_avg_by_test_id', 
            'acc_avg_by_assessment_item_id', 
            'acc_avg_by_tag', 
            #'lag_time', 
            #'prior_relative_acc_sum', 로그 스케일 적용 안됨
            'prior_relative_accuracy', 
            'prev_answered_correctly', 
            #'relative_answered_correctly', # 로그 스케일 안됨
            'answerCode']
# train_df[cont_cols] = train_df[cont_cols].astype(np.float32)
# val_df[cont_cols] = val_df[cont_cols].astype(np.float32) 

In [16]:
import os
from datetime import datetime
import time
import tqdm
import pandas as pd
import random
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch

In [17]:
def save_labels(encoder, name):
    le_path = os.path.join('./asset', name + '_classes.npy')
    np.save(le_path, encoder.classes_)

In [18]:
def preprocessing(df, is_train = True):
    cate_cols = ['assessmentItemID', 'testId', 'KnowledgeTag']
    
    asset_dir = './asset'
    if not os.path.exists(asset_dir):
        os.makedirs(asset_dir)

    for col in cate_cols:
        le = LabelEncoder()
        if is_train:
            #For UNKNOWN class
            a = df[col].unique().tolist() + ['unknown']
            le.fit(a)
            save_labels(le, col)
        else:
            label_path = os.path.join(asset_dir,col+'_classes.npy')
            le.classes_ = np.load(label_path)

            df[col] = df[col].apply(lambda x: x if x in le.classes_ else 'unknown')

        #모든 컬럼이 범주형이라고 가정
        df[col] = df[col].astype(str)
        test = le.transform(df[col])
        df[col] = test
    return df

In [19]:
df = df.sort_values(by=['userID','Timestamp'], axis=0)
columns = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag']
cont_cols = ['time_stamp',
            'diff_time_btw_item', 
            'prior_tags_frequency', 
            'prior_acc', 
            'acc_avg_by_test_id', 
            'acc_avg_by_assessment_item_id', 
            'acc_avg_by_tag', 
            #'lag_time', 
            #'prior_relative_acc_sum', 로그 스케일 적용 안됨
            'prior_relative_accuracy', 
            'prev_answered_correctly'
            #'relative_answered_correctly', # 로그 스케일 안됨
            ]

In [20]:
# 여기서 train, val 나눔
train_ids, val_ids = get_train_val_row_ids(df, test_df)
df = preprocessing(df, is_train=True)

train_size = 5358, val_size = 1340
test set 과 같은 문제를 예측하는 user 수 = 4366
[6329, 6370, 328, 7030, 6268]
[0, 1, 2, 5, 6]
train_ids_len = 1818587
val_ids_len = 447999


In [21]:
train_df = df.loc[train_ids].reset_index()
val_df = df.loc[val_ids].reset_index()

In [22]:
print(len(train_df.groupby('userID')))
print(len(val_df.groupby('userID')))

5358
1340


In [23]:
train_group = train_df[columns+cont_cols].groupby('userID').apply(
        lambda r: (
            r['testId'].values, 
            r['assessmentItemID'].values,
            r['KnowledgeTag'].values,
            r['answerCode'].values,
            r['time_stamp'].values,
            r['diff_time_btw_item'].values,
            r['prior_tags_frequency'].values,
            r['prior_acc'].values,
            r['acc_avg_by_test_id'].values,
            r['acc_avg_by_assessment_item_id'].values,
            r['acc_avg_by_tag'].values,
            r['prior_relative_accuracy'].values,
            r['prev_answered_correctly'].values,
            
        )
    )

In [24]:
val_group = val_df[columns+cont_cols].groupby('userID').apply(
        lambda r: (
            r['testId'].values, 
            r['assessmentItemID'].values,
            r['KnowledgeTag'].values,
            r['answerCode'].values,
            r['time_stamp'].values,
            r['diff_time_btw_item'].values,
            r['prior_tags_frequency'].values,
            r['prior_acc'].values,
            r['acc_avg_by_test_id'].values,
            r['acc_avg_by_assessment_item_id'].values,
            r['acc_avg_by_tag'].values,
            r['prior_relative_accuracy'].values,
            r['prev_answered_correctly'].values,
            
        )
    )

In [25]:
train_data = train_group.values
valid_data = val_group.values

In [42]:
import pickle
with open('train_data.txt', 'wb') as f:
    pickle.dump(train_data, f)
with open('valid_data.txt', 'wb') as f:
    pickle.dump(valid_data, f)

In [26]:
class CustomDKTDataset(torch.utils.data.Dataset):
    def __init__(self, data, args):
        self.data = data
        self.args = args

    def __getitem__(self, index):
        row = self.data[index]

        # 각 data의 sequence length
        seq_len = len(row[0])

        test, question, tag, correct = row[0], row[1], row[2], row[3]
        cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9 = row[4], row[5], row[6], row[7],row[8], row[9], row[10], row[11],row[12]
        
        cont_cols = [cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9]
        cate_cols = [test, question, tag, correct]

        # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다
        if seq_len > self.args.max_seq_len:
            for i, col in enumerate(cate_cols):
                cate_cols[i] = col[-self.args.max_seq_len:]
            mask = np.ones(self.args.max_seq_len, dtype=np.int16)
            
            for i, col in enumerate(cont_cols):
                cont_cols[i] = col[-self.args.max_seq_len:]
        else:
            mask = np.zeros(self.args.max_seq_len, dtype=np.int16)
            mask[-seq_len:] = 1

        # mask도 columns 목록에 포함시킴
        cate_cols.append(mask)

        # np.array -> torch.tensor 형변환
        for i, col in enumerate(cate_cols):
            cate_cols[i] = torch.tensor(col)
        
        for i, col in enumerate(cont_cols):
            cont_cols[i] = torch.tensor(col)

        return cate_cols+cont_cols

    def __len__(self):
        return len(self.data)



train_loader, valid_loader = get_loaders(args, train_data, valid_data)
    

In [27]:
def collate(batch):
    col_n = len(batch[0])
    col_list = [[] for _ in range(col_n)]
    max_seq_len = len(batch[0][-1])

    # batch의 값들을 각 column끼리 그룹화
    for row in batch:
        for i, col in enumerate(row):
            pre_padded = torch.zeros(max_seq_len)
            pre_padded[-len(col):] = col  # 앞부분이 0으로 padding됨
            col_list[i].append(pre_padded)


    for i, _ in enumerate(col_list):
        col_list[i] =torch.stack(col_list[i])
    
    return tuple(col_list)

In [28]:
import os
#from args import parse_args
import argparse
from dkt.dataloader import Preprocess
from dkt import trainer
import torch
from dkt.utils import setSeeds
import wandb

In [29]:
def parse_args(mode='train'):
    parser = argparse.ArgumentParser()

    
    parser.add_argument('--seed', default=42, type=int, help='seed')
    
    parser.add_argument('--device', default='cpu', type=str, help='cpu or gpu')

    parser.add_argument('--data_dir', default='/opt/ml/input/data/train_dataset', type=str, help='data directory')
    parser.add_argument('--asset_dir', default='asset/', type=str, help='data directory')
    
    parser.add_argument('--file_name', default='train_data.csv', type=str, help='train file name')
    
    parser.add_argument('--model_dir', default='models/', type=str, help='model directory')
    parser.add_argument('--model_name', default='model.pt', type=str, help='model file name')

    parser.add_argument('--output_dir', default='output/', type=str, help='output directory')
    parser.add_argument('--test_file_name', default='test_data.csv', type=str, help='test file name')
    
    parser.add_argument('--max_seq_len', default=20, type=int, help='max sequence length')
    parser.add_argument('--num_workers', default=4, type=int, help='number of workers')

    # 모델
    parser.add_argument('--hidden_dim', default=64, type=int, help='hidden dimension size')
    parser.add_argument('--n_layers', default=2, type=int, help='number of layers')
    parser.add_argument('--n_heads', default=2, type=int, help='number of heads')
    parser.add_argument('--drop_out', default=0.2, type=float, help='drop out rate')
    
    # 훈련
    parser.add_argument('--n_epochs', default=20, type=int, help='number of epochs')
    parser.add_argument('--batch_size', default=64, type=int, help='batch size')
    parser.add_argument('--lr', default=0.0001, type=float, help='learning rate')
    parser.add_argument('--clip_grad', default=10, type=int, help='clip grad')
    parser.add_argument('--patience', default=5, type=int, help='for early stopping')
    

    parser.add_argument('--log_steps', default=50, type=int, help='print log per n steps')
    

    ### 중요 ###
    parser.add_argument('--model', default='lstm', type=str, help='model type')
    parser.add_argument('--optimizer', default='adam', type=str, help='optimizer type')
    parser.add_argument('--scheduler', default='plateau', type=str, help='scheduler type')
    
    args = parser.parse_args([])

    return args

In [30]:
args = parse_args(mode='train')
device = "cuda" if torch.cuda.is_available() else "cpu"
args.device = device
print(device)

cuda


In [31]:
# 배치 전처리
def process_batch(batch, args):
    #print('<<<<< process_batch >>>')

    test, question, tag, correct, mask = batch[:5]   # [batch_size(64), max_seq_len(20)]
    cont = batch[5:]
    
    # change to float
    mask = mask.type(torch.FloatTensor)
    correct = correct.type(torch.FloatTensor)

    # interaction: 과거 정답 여부를 다음 시퀀스에 추가적인 feature로 사용하게끔 한칸 시프트 해준 feature
    #  interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용
    #    saint의 경우 decoder에 들어가는 input이다
    interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다. (정답 2, 오답 1)
    interaction = interaction.roll(shifts=1, dims=1)
    interaction_mask = mask.roll(shifts=1, dims=1)
    interaction_mask[:, 0] = 0
    interaction = (interaction * interaction_mask).to(torch.int64)  # 가장 마지막으로 푼 문제를 제외하고 정답 2, 오답 1
    # print(interaction)
    # exit()
    #  test_id, question_id, tag
    test = ((test + 1) * mask).to(torch.int64)
    question = ((question + 1) * mask).to(torch.int64)
    tag = ((tag + 1) * mask).to(torch.int64)


    # device memory로 이동

    test = test.to(args.device)
    question = question.to(args.device)


    tag = tag.to(args.device)
    correct = correct.to(args.device)
    mask = mask.to(args.device)

    interaction = interaction.to(args.device)
    
    cont_features = torch.cat(cont, 1).view(args.batch_size, args.max_seq_len,-1)
    cont_features = cont_features.to(args.device)

    return (test, question,
            tag, correct, mask,
            interaction, cont_features)

In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
import numpy as np
import copy
import math

try:
    from transformers.modeling_bert import BertConfig, BertEncoder, BertModel    
except:
    from transformers.models.bert.modeling_bert import BertConfig, BertEncoder, BertModel    


In [33]:
# 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용
args.n_questions = len(np.load(os.path.join(args.asset_dir,'assessmentItemID_classes.npy')))
args.n_test = len(np.load(os.path.join(args.asset_dir,'testId_classes.npy')))
args.n_tag = len(np.load(os.path.join(args.asset_dir,'KnowledgeTag_classes.npy')))



In [None]:
class LSTM(nn.Module):

    def __init__(self, args):
        super(LSTM, self).__init__()
        self.args = args
        self.device = args.device

        self.hidden_dim = self.args.hidden_dim
        self.n_layers = self.args.n_layers
        
        self.cont_col_size = 8

        # Embedding 
        # interaction은 현재 correct로 구성되어있다. correct(1, 2) + padding(0)
        self.embedding_interaction = nn.Embedding(3, self.hidden_dim//3)
        self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim//3)
        self.embedding_question = nn.Embedding(self.args.n_questions + 1, self.hidden_dim//3)
        self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3)


        # embedding combination projection
        self.cate_proj =  nn.Sequential(
            nn.Linear((self.hidden_dim//3)*4, self.hidden_dim),
            nn.LayerNorm(self.hidden_dim),
        )
        
        # continuous
        self.cont_bn = nn.BatchNorm1d(self.cont_col_size)
        self.cont_emb = nn.Sequential(
            nn.Linear(self.cont_col_size, self.hidden_dim),
            nn.LayerNorm(self.hidden_dim),
        )
        
        # combination
        self.comb_proj = nn.Sequential(
            nn.ReLU(),
            nn.Linear(self.hidden_dim*2, self.hidden_dim),
            nn.LayerNorm(self.hidden_dim),
        )


        self.lstm = nn.LSTM(self.hidden_dim,
                            self.hidden_dim,
                            self.n_layers,
                            batch_first=True)
        
        # Fully connected layer
        self.fc = nn.Linear(self.hidden_dim, 1)

        self.activation = nn.Sigmoid()

    def init_hidden(self, batch_size):
        h = torch.zeros(
            self.n_layers,
            batch_size,
            self.hidden_dim)
        h = h.to(self.device)

        c = torch.zeros(
            self.n_layers,
            batch_size,
            self.hidden_dim)
        c = c.to(self.device)

        return (h, c)

    def forward(self, input):

        test, question, tag, _, mask, interaction, cont_x = input
        self.cont_col_size = cont_x.size(-1)

        batch_size = interaction.size(0)

        # Embedding

        embed_interaction = self.embedding_interaction(interaction)
        embed_test = self.embedding_test(test)
        embed_question = self.embedding_question(question)
        embed_tag = self.embedding_tag(tag)
        

        cate_emb = torch.cat([embed_interaction,
                           embed_test,
                           embed_question,
                           embed_tag,], 2)
        
        cate_emb = self.cate_proj(cate_emb)
        
        # continuous
        cont_x = self.cont_bn(cont_x.view(-1, cont_x.size(-1))).view(batch_size, -1, cont_x.size(-1))
        cont_emb = self.cont_emb(cont_x.view(batch_size, args.max_seq_len, -1))        
        
        # combination
        seq_emb = torch.cat([cate_emb, cont_emb], 2)        
        X = self.comb_proj(seq_emb)   
        #X = self.comb_proj(embed)

        hidden = self.init_hidden(batch_size)
        out, hidden = self.lstm(X, hidden)
        out = out.contiguous().view(batch_size, -1, self.hidden_dim)  # .contiguous(): 새로운 텐서를 반환

        out = self.fc(out)
        preds = self.activation(out).view(batch_size, -1)

        return preds

In [35]:
import os
import torch
import numpy as np

#from dkt.dataloader import get_loaders
from dkt.optimizer import get_optimizer
from dkt.scheduler import get_scheduler
from dkt.criterion import get_criterion
from dkt.metric import get_metric
from dkt.model import LSTM, LSTMATTN, Bert, Saint

from dkt.trainer import get_lr, inference, get_model, compute_loss, update_params, save_checkpoint, load_model

import wandb
import time
import datetime
import gc


In [36]:
MODEL_DIR = 'models/'
os.makedirs(MODEL_DIR, exist_ok=True)

In [37]:
def get_loaders(args, train, valid):

    pin_memory = True  # False
    train_loader, valid_loader = None, None
    
    if train is not None:
        trainset = CustomDKTDataset(train, args)
        train_loader = torch.utils.data.DataLoader(trainset, num_workers=args.num_workers, shuffle=True,
                            batch_size=args.batch_size, pin_memory=pin_memory, collate_fn=collate)
    if valid is not None:
        valset = CustomDKTDataset(valid, args)
        valid_loader = torch.utils.data.DataLoader(valset, num_workers=args.num_workers, shuffle=False,
                            batch_size=args.batch_size, pin_memory=pin_memory, collate_fn=collate)

    return train_loader, valid_loader

In [38]:

def train(train_loader, model, optimizer, args):
    model.train()

    total_preds = []
    total_targets = []
    losses = []
    for step, batch in enumerate(train_loader):
        input = process_batch(batch, args)
        preds = model(input)
        targets = input[3] # correct


        loss = compute_loss(preds, targets)
        update_params(loss, model, optimizer, args)


        if step % args.log_steps == 0:
            print(f"Training steps: {step} Loss: {str(loss.item())}")
        
        # predictions
        preds = preds[:,-1]
        targets = targets[:,-1]

        if args.device == 'cuda':
            preds = preds.to('cpu').detach().numpy()
            targets = targets.to('cpu').detach().numpy()
        else: # cpu
            preds = preds.detach().numpy()
            targets = targets.detach().numpy()
        
        total_preds.append(preds)
        total_targets.append(targets)
        losses.append(loss)
      

    total_preds = np.concatenate(total_preds)
    total_targets = np.concatenate(total_targets)

    # Train AUC / ACC
    auc, acc = get_metric(total_targets, total_preds)
    loss_avg = sum(losses)/len(losses)
    print(f'TRAIN AUC : {auc} ACC : {acc}')
    return auc, acc, loss_avg
    

def validate(valid_loader, model, args):
    model.eval()

    total_preds = []
    total_targets = []
    for step, batch in enumerate(valid_loader):
        input = process_batch(batch, args)

        preds = model(input)
        targets = input[3] # correct


        # predictions
        preds = preds[:,-1]
        targets = targets[:,-1]
    
        if args.device == 'cuda':
            preds = preds.to('cpu').detach().numpy()
            targets = targets.to('cpu').detach().numpy()
        else: # cpu
            preds = preds.detach().numpy()
            targets = targets.detach().numpy()

        total_preds.append(preds)
        total_targets.append(targets)

    total_preds = np.concatenate(total_preds)
    total_targets = np.concatenate(total_targets)

    # Train AUC / ACC
    auc, acc = get_metric(total_targets, total_preds)
    
    print(f'VALID AUC : {auc} ACC : {acc}\n')

    return auc, acc, total_preds, total_targets


In [39]:
def run(args, train_data, valid_data):
    train_loader, valid_loader = get_loaders(args, train_data, valid_data)
    
    # only when using warmup scheduler
    args.total_steps = int(len(train_loader.dataset) / args.batch_size) * (args.n_epochs)
    args.warmup_steps = args.total_steps // 10
            
    #model = get_model(args)
    model = LSTM(args)
    model.to(args.device)
    
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    best_auc = -1
    early_stopping_counter = 0
    for epoch in range(args.n_epochs):

        print(f"Start Training: Epoch {epoch + 1}")
        start = time.time()
        ### TRAIN
        train_auc, train_acc, train_loss = train(train_loader, model, optimizer, args)
        
        ### VALID
        auc, acc,_ , _ = validate(valid_loader, model, args)

        sec = time.time() - start
        times = str(datetime.timedelta(seconds=sec)).split(".")
        times = times[0]
        print(f'<<<<<<<<<<  {epoch + 1} EPOCH spent : {times}  >>>>>>>>>>')

        ### TODO: model save or early stopping
        wandb.log({"epoch": epoch, "train_loss": train_loss, "train_auc": train_auc, "train_acc":train_acc,
                  "valid_auc":auc, "valid_acc":acc, "Learning_rate": get_lr(optimizer),})
        
        if auc > best_auc:
            best_auc = auc
            # torch.nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다.
            model_to_save = model.module if hasattr(model, 'module') else model
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model_to_save.state_dict(),
                },
                args.model_dir, 'model.pt',
            )
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= args.patience:
                print(f'EarlyStopping counter: {early_stopping_counter} out of {args.patience}')
                break

        # scheduler
        if args.scheduler == 'plateau':
            scheduler.step(best_auc)
        else:
            scheduler.step()



In [40]:
wandb.init(project='dkt', config=vars(args), tags=['lstm'], name='lstm')

[34m[1mwandb[0m: Currently logged in as: [33mdhh0[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.31 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [41]:
run(args, train_data, valid_data)

Start Training: Epoch 1
torch.Size([64, 180])
torch.Size([64, 20, 9])
Training steps: 0 Loss: 0.6896703243255615
torch.Size([64, 180])
torch.Size([64, 20, 9])
torch.Size([64, 180])
torch.Size([64, 20, 9])
torch.Size([64, 180])
torch.Size([64, 20, 9])
torch.Size([64, 180])
torch.Size([64, 20, 9])
torch.Size([64, 180])
torch.Size([64, 20, 9])
torch.Size([64, 180])
torch.Size([64, 20, 9])
torch.Size([64, 180])
torch.Size([64, 20, 9])
torch.Size([64, 180])
torch.Size([64, 20, 9])
torch.Size([64, 180])
torch.Size([64, 20, 9])
torch.Size([64, 180])
torch.Size([64, 20, 9])
torch.Size([64, 180])
torch.Size([64, 20, 9])


RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 185, in _worker_loop
    data = fetcher.fetch(index)
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "<ipython-input-27-9ff0f4f30542>", line 17, in collate
    pre_padded[-len(col):] = col  # 앞부분이 0으로 padding됨
RuntimeError: The expanded size of the tensor (17) must match the existing size (20) at non-singleton dimension 0.  Target sizes: [17].  Tensor sizes: [20]


In [49]:
tt = torch.rand([3,11])

In [51]:
tt.unsqueeze(-1).view([3,5,-1])

RuntimeError: shape '[3, 5, -1]' is invalid for input of size 33

## preds = model(input)

In [57]:
preds.shape

torch.Size([64, 20])

In [150]:
cont_features = input[6]
print(cont_features.shape)
print(cont_features.size(-1))

torch.Size([64, 20, 8])
8


In [169]:
# continuous
cont_col_size = cont_features.size(-1)
print(cont_col_size)
cont_bn = nn.BatchNorm1d(cont_col_size)
cont_bn.to(args.device)

8


BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)

In [170]:
# batchnorm 1d 적용
cont_bn_x = cont_bn(cont_features.view(-1, cont_features.size(-1)))
cont_bn_x.size()

torch.Size([1280, 8])

In [172]:
# batchnorm 적용 이후 원래 사이즈 복구
cont_bn_x = cont_bn_x.view(args.batch_size, -1, cont_features.size(-1))
cont_bn_x.size()

torch.Size([64, 20, 8])

In [186]:
# 범주형과는 다르게 embedding없이 바로 projection을 통해 원하는 사이즈로 줄인다
# 여기서는 embedding이라고 부른다
# [16, 16, 36] -> [16, 16, 128]
lin = nn.Linear(cont_col_size, args.hidden_dim)
ln =  nn.LayerNorm(args.hidden_dim)
# cont_emb = nn.Sequential(nn.Linear(cont_col_size, args.hidden_dim),
#                          nn.LayerNorm(args.hidden_dim))

In [None]:
lin.to(args.device)

In [178]:
args.hidden_dim

64

In [None]:
cont_emb.to(args.device)
cont_embed_x = cont_emb(cont_bn_x)
cont_embed_x.size()

In [None]:
cont_emb = nn.Sequential(
    nn.Linear(cont_col_size*cfg.n_rows_per_step, self.hidden_dim),
    nn.LayerNorm(self.hidden_dim),
)

cuda


In [33]:
def load_data_from_file(self, file_name, is_train=True):
    csv_file_path = os.path.join(self.args.data_dir, file_name)
    df = pd.read_csv(csv_file_path)#, nrows=100000)
    df = self.__feature_engineering(df)
    df = self.__preprocessing(df, is_train)

    # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용
    self.args.n_questions = len(np.load(os.path.join(self.args.asset_dir,'assessmentItemID_classes.npy')))
    self.args.n_test = len(np.load(os.path.join(self.args.asset_dir,'testId_classes.npy')))
    self.args.n_tag = len(np.load(os.path.join(self.args.asset_dir,'KnowledgeTag_classes.npy')))



    df = df.sort_values(by=['userID','Timestamp'], axis=0)
    columns = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag']
    group = df[columns].groupby('userID').apply(
            lambda r: (
                r['testId'].values, 
                r['assessmentItemID'].values,
                r['KnowledgeTag'].values,
                r['answerCode'].values
            )
        )

    return group.values

In [None]:
preprocess = Preprocess(args)