In [1]:
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import KFold

from copy import deepcopy

from gensim.models import Word2Vec

import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/PyTorch.
warnings.filterwarnings(action='ignore')
# Print tensors in scientific notation.
torch.set_printoptions(sci_mode=True)

In [2]:
import gc

# Run a garbage-collection pass, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [3]:
def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, current CUDA device and all CUDA devices), then puts cuDNN into
    deterministic mode with autotuning switched off.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU, current GPU, and every GPU (multi-GPU runs).
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    # Trade cuDNN speed for run-to-run reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# 데이터 전처리

In [4]:
class MakeDataset():
    """Load the interaction logs, engineer features and build user-level folds.

    ``train_data.csv`` and ``test_data.csv`` are read from ``DATA_PATH``. The
    code reads the columns ``userID``, ``assessmentItemID``, ``testId``,
    ``answerCode``, ``Timestamp`` and ``KnowledgeTag``. Rows whose
    ``answerCode`` is ``-1`` are treated as unlabeled and are excluded from
    every statistic and from ``all_df``.

    Attributes:
        train_df, test_df: Feature-augmented frames (``test_df`` keeps the
            unlabeled rows).
        all_df: ``train_df`` plus the labeled rows of ``test_df``.
        assessmentItemID2idx: Mapping from item ID to a 0-based index.
        num_assessmentItemID, num_testId, num_KnowledgeTag,
        num_large_paper_number, num_hour, num_dayofweek: Vocabulary sizes.
        oof_user_set: ``{fold: [userID, ...]}`` validation users per fold.
    """

    def __init__(self, DATA_PATH):
        self.preporcessing(DATA_PATH)
        self.oof_user_set = self.split_data()

    def split_data(self):
        """Split the users of ``all_df`` into 5 shuffled folds.

        Returns:
            dict: fold index -> list of validation ``userID`` values.
        """
        user_list = self.all_df['userID'].unique().tolist()
        kf = KFold(n_splits = 5, random_state = 22, shuffle = True)

        oof_user_set = {}
        for idx, (_, valid_positions) in enumerate(kf.split(user_list)):
            # KFold yields positional indices into ``user_list``; translate
            # them to the actual user IDs that get_oof_data compares against.
            oof_user_set[idx] = [user_list[pos] for pos in valid_positions]

        return oof_user_set

    @staticmethod
    def _read_log(path):
        """Read one CSV log and order it chronologically within each user."""
        dtype = {
            'userID': 'int16',
            'answerCode': 'int8',
            'KnowledgeTag': 'int16'
        }
        df = pd.read_csv(path, dtype=dtype, parse_dates=['Timestamp'])
        return df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)

    @staticmethod
    def _add_now_elapsed(df, max_seconds = 650):
        """Add ``now_elapsed``: seconds since the same user's previous row.

        The first row of every user, negative gaps and gaps of
        ``max_seconds`` or more are all set to 0.
        """
        elapsed = df.groupby('userID')['Timestamp'].diff().dt.total_seconds().fillna(0.0)
        in_range = (elapsed >= 0) & (elapsed < max_seconds)
        df['now_elapsed'] = elapsed.where(in_range, 0.0)
        return df

    @staticmethod
    def _get_val2idx(val_list):
        """Map each value to its position in ``val_list``."""
        return {val: idx for idx, val in enumerate(val_list)}

    @staticmethod
    def _encode(values, val2idx, name):
        """Translate a column through ``val2idx``; unknown values raise KeyError."""
        encoded = values.map(val2idx)
        missing = encoded.isna()
        if missing.any():
            unseen = values[missing].unique().tolist()
            raise KeyError(f'{name}: values missing from the vocabulary: {unseen[:5]}')
        return encoded.astype('int64')

    def preporcessing(self, DATA_PATH):
        """Read both CSV files and attach every engineered feature.

        Args:
            DATA_PATH: Directory containing ``train_data.csv`` and
                ``test_data.csv``.
        """
        train_df = self._read_log(os.path.join(DATA_PATH, 'train_data.csv'))
        test_df = self._read_log(os.path.join(DATA_PATH, 'test_data.csv'))

        for df in (train_df, test_df):
            # Characters 1..3 of the item ID.
            # NOTE(review): presumably a grade / category code -- confirm.
            df['large_paper_number'] = df['assessmentItemID'].str[1:4]
            # Time spent before this interaction.
            self._add_now_elapsed(df)

        # Labeled interactions from both files drive the statistics and vocabularies.
        labeled = pd.concat([train_df, test_df])
        labeled = labeled[labeled['answerCode'] != -1].reset_index(drop = True)
        correct = labeled[labeled['answerCode'] == 1]

        # Select the column *before* aggregating: aggregating the whole frame
        # raises TypeError on the string columns with pandas >= 2.0.
        by_item = labeled.groupby('assessmentItemID')
        by_item_correct = correct.groupby('assessmentItemID')
        item_stats = {
            # Per-item answer rate and its standard deviation.
            'assessmentItemID_mean_answerCode': by_item['answerCode'].mean(),
            'assessmentItemID_std_answerCode': by_item['answerCode'].std(),
            # Per-item elapsed time (mean / std) over correct answers only.
            'assessmentItemID_mean_now_elapsed': by_item_correct['now_elapsed'].mean(),
            'assessmentItemID_std_now_elapsed': by_item_correct['now_elapsed'].std(),
        }
        # NOTE(review): items with a single labeled row (std) or without any
        # correct answer (elapsed stats) end up as NaN here, as before.
        for df in (train_df, test_df):
            for col, stat in item_stats.items():
                df[col] = df['assessmentItemID'].map(stat)

            # Hour of day and day of week of the interaction.
            df['hour'] = df['Timestamp'].dt.hour
            df['dayofweek'] = df['Timestamp'].dt.dayofweek

        # Convert the categorical columns to 0-based indices.
        vocab = {
            col: self._get_val2idx(labeled[col].unique().tolist())
            for col in ('assessmentItemID', 'testId', 'KnowledgeTag', 'large_paper_number')
        }
        for df in (train_df, test_df):
            for col, val2idx in vocab.items():
                df[col + '2idx'] = self._encode(df[col], val2idx, col)

        self.assessmentItemID2idx = vocab['assessmentItemID']
        self.train_df, self.test_df = train_df, test_df
        self.all_df = pd.concat([train_df, test_df[test_df['answerCode'] != -1]]).reset_index(drop=True)
        self.num_assessmentItemID = len(vocab['assessmentItemID'])
        self.num_testId = len(vocab['testId'])
        self.num_KnowledgeTag = len(vocab['KnowledgeTag'])
        self.num_large_paper_number = len(vocab['large_paper_number'])
        self.num_hour = 24
        self.num_dayofweek = 7

    def get_oof_data(self, oof):
        """Build the train / validation frames for fold ``oof``.

        Validation users contribute their full history to ``valid`` and the
        history without the final row to ``train``; all other users go to
        ``train`` unchanged.

        Args:
            oof: Key of ``self.oof_user_set``.

        Returns:
            tuple: ``(train, valid)`` DataFrames with a fresh index.
        """
        # A set makes the per-user membership test O(1).
        val_users = set(self.oof_user_set[oof])

        train = []
        valid = []
        for userID, df in self.all_df.groupby('userID'):
            if userID in val_users:
                train.append(df.iloc[:-1, :])
                valid.append(df.copy())
            else:
                train.append(df)

        train = pd.concat(train).reset_index(drop = True)
        valid = pd.concat(valid).reset_index(drop = True)

        return train, valid

    def get_test_data(self):
        """Return a copy of the processed test frame."""
        return self.test_df.copy()

In [5]:
class CustomDataset(Dataset):
    """Per-user sequence dataset yielding aligned "past" and "now" features.

    Every sample is one sequence of interactions. ``past_*`` holds steps
    ``0 .. T-2`` and ``now_*`` holds steps ``1 .. T-1``, so position ``i`` of
    ``now_*`` is the step that follows position ``i`` of ``past_*``.

    Args:
        df: Frame with a ``userID`` column, an ``answerCode`` column and all
            columns named in ``cat_cols`` / ``num_cols``.
        cat_cols: Categorical feature columns.
        num_cols: Numerical feature columns.
        max_len: Maximum sequence length. Without augmentation only the last
            ``max_len`` rows of a user are used (all rows when falsy).
        window: Step between consecutive augmentation windows.
        data_augmentation: When true, each user is expanded into several
            windows of at most ``max_len`` rows taken from the end of the
            history backwards in steps of ``window``.

    Raises:
        ValueError: If ``data_augmentation`` is set without a positive
            ``max_len`` and ``window``.
    """

    def __init__(
        self, 
        df,
        cat_cols = ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx', 'hour', 'dayofweek'],
        num_cols = ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode'],
        max_len = None,
        window = None,
        data_augmentation = False,
        ):

        if data_augmentation:
            # Fail early with a clear message instead of a TypeError (None)
            # or an endless loop (window <= 0) inside the windowing code.
            if max_len is None or max_len <= 0:
                raise ValueError('data_augmentation requires a positive max_len')
            if window is None or window <= 0:
                raise ValueError('data_augmentation requires a positive window')

        self.cat_cols = cat_cols
        self.num_cols = num_cols
        self.get_df = df.groupby('userID')
        self.user_list = df['userID'].unique().tolist()
        self.max_len = max_len
        self.window = window
        self.data_augmentation = data_augmentation
        if self.data_augmentation:
            self.cat_feature_list, self.num_feature_list, self.answerCode_list = self._data_augmentation()

    def __len__(self):
        """Number of windows (augmented) or number of users (otherwise)."""
        if self.data_augmentation:
            return len(self.cat_feature_list)
        return len(self.user_list)

    def __getitem__(self, idx):
        """Return the past/now feature arrays of sample ``idx`` as a dict."""
        if self.data_augmentation:
            cat_feature = self.cat_feature_list[idx]
            num_feature = self.num_feature_list[idx]
            answerCode = self.answerCode_list[idx]
        else:
            user_df = self.get_df.get_group(self.user_list[idx])
            if self.max_len:
                user_df = user_df.iloc[-self.max_len:, :]
            cat_feature = user_df[self.cat_cols].values
            num_feature = user_df[self.num_cols].values
            answerCode = user_df['answerCode'].values

        return {
            'past_cat_feature' : cat_feature[:-1], 
            'past_num_feature' : num_feature[:-1], 
            'past_answerCode' : answerCode[:-1], 
            'now_cat_feature' : cat_feature[1:], 
            'now_num_feature' : num_feature[1:], 
            'now_answerCode' : answerCode[1:]
            }

    def _window_bounds(self, length):
        """Return ``(start, stop)`` row bounds of the windows for one user.

        Windows are listed newest first. Histories of at most ``max_len``
        rows give a single window. Longer histories give windows of
        ``max_len`` rows whose end moves back by ``window`` rows each step;
        the first window that reaches the start of the history is the last
        one, so no empty window is ever produced.
        """
        if length <= self.max_len:
            return [(0, length)]

        bounds = []
        stop = length
        while stop > 0:
            bounds.append((max(0, stop - self.max_len), stop))
            if stop < self.max_len:
                # Partial window covering the oldest rows: nothing older is left.
                break
            stop -= self.window
        return bounds

    def _data_augmentation(self):
        """Expand every user into sliding windows.

        Returns:
            tuple: three parallel lists (categorical features, numerical
            features, answer codes) with one entry per window.
        """
        cat_feature_list = []
        num_feature_list = []
        answerCode_list = []
        for userID, get_df in tqdm(self.get_df):
            cat_feature = get_df[self.cat_cols].values
            num_feature = get_df[self.num_cols].values
            answerCode = get_df['answerCode'].values

            for start, stop in self._window_bounds(len(get_df)):
                cat_feature_list.append(cat_feature[start:stop])
                num_feature_list.append(num_feature[start:stop])
                answerCode_list.append(answerCode[start:stop])

        return cat_feature_list, num_feature_list, answerCode_list

In [6]:
def pad_sequence(seq, max_len, padding_value = 0):
    """Left-pad ``seq`` along its first axis up to ``max_len`` entries.

    Args:
        seq: Array of shape ``(seq_len, ...)``; any trailing dimensions are
            preserved.
        max_len: Target length of the first axis.
        padding_value: Value used for the prepended entries.

    Returns:
        numpy.ndarray: Array of shape ``(max_len, ...)`` with the padding in
        front of the original entries. The padding is float64, so integer
        input is promoted to float64.

    Raises:
        ValueError: If ``seq`` is longer than ``max_len``.
    """
    seq = np.asarray(seq)
    seq_len = seq.shape[0]
    if seq_len > max_len:
        raise ValueError(f'sequence length {seq_len} exceeds max_len {max_len}')

    # Build the padding from the actual shape instead of probing the number of
    # dimensions with try/except, which also swallowed unrelated errors.
    padding = np.full((max_len - seq_len,) + seq.shape[1:], padding_value, dtype = np.float64)

    return np.concatenate([padding, seq])

def train_make_batch(samples):
    """Collate dataset samples into left-padded batch tensors.

    Every sequence is padded in front up to the longest ``past_cat_feature``
    in the batch. Categorical features and past answer codes are shifted by
    +1 so that 0 can act as the padding index; numerical features are padded
    with 0 and the target answer codes with -1.

    Args:
        samples: List of dicts with the keys ``past_cat_feature``,
            ``past_num_feature``, ``past_answerCode``, ``now_cat_feature``,
            ``now_num_feature`` and ``now_answerCode``.

    Returns:
        tuple: ``(past_cat, past_num, past_answer, now_cat, now_num,
        now_answer)``; the categorical tensors and ``past_answer`` are
        ``long``, the others ``float32``.
    """
    max_len = max((sample['past_cat_feature'].shape[0] for sample in samples), default = 0)

    def collate(key, shift, padding_value, dtype):
        padded = []
        for sample in samples:
            values = sample[key] + 1 if shift else sample[key]
            padded.append(pad_sequence(values, max_len = max_len, padding_value = padding_value))
        # Stack into one ndarray first: torch.tensor() on a list of ndarrays
        # converts element by element and is very slow.
        return torch.tensor(np.array(padded), dtype = dtype)

    return (
        collate('past_cat_feature', True, 0, torch.long),
        collate('past_num_feature', False, 0, torch.float32),
        collate('past_answerCode', True, 0, torch.long),
        collate('now_cat_feature', True, 0, torch.long),
        collate('now_num_feature', False, 0, torch.float32),
        collate('now_answerCode', False, -1, torch.float32),
    )

# 모델

In [7]:
class ScaledDotProductAttention(nn.Module):
    """Masked scaled dot-product attention with dropout on the weights.

    NOTE(review): scores are divided by ``sqrt(hidden_units)`` as passed to
    the constructor. ``MultiHeadAttention`` passes the full model width while
    feeding per-head slices, so this is not the usual ``sqrt(head_dim)``.
    Left unchanged because it affects trained behaviour -- confirm intent.
    """

    def __init__(self, hidden_units, dropout_rate):
        super().__init__()
        self.dropout = nn.Dropout(dropout_rate)
        self.hidden_units = hidden_units

    def forward(self, Q, K, V, mask):
        """Attend ``Q`` over ``K`` / ``V``.

        Args:
            Q, K, V: 4-D tensors ``(batch_size, num_heads, max_len, dim)``.
            mask: Tensor broadcastable to the score shape
                ``(batch_size, num_heads, max_len, max_len)``; positions equal
                to 0 are excluded.

        Returns:
            tuple: ``(output, attn_dist)`` where ``output`` has the shape of
            ``V`` and ``attn_dist`` holds the post-dropout attention weights.
        """
        scale = math.sqrt(self.hidden_units)
        scores = (Q @ K.transpose(2, 3)) / scale
        # Masked positions get a large negative score so softmax sends them to ~0.
        scores = scores.masked_fill(mask == 0, -1e9)
        attn_dist = self.dropout(torch.softmax(scores, dim=-1))
        return attn_dist @ V, attn_dist


class MultiHeadAttention(nn.Module):
    """Multi-head self-attention followed by dropout, residual add and LayerNorm.

    ``hidden_units`` is split evenly across ``num_heads``; the ``view`` in
    ``forward`` requires ``hidden_units`` to be divisible by ``num_heads``.
    """

    def __init__(self, num_heads, hidden_units, dropout_rate):
        super().__init__()
        self.num_heads = num_heads  # number of attention heads
        self.hidden_units = hidden_units

        # Bias-free projections for query, key, value and the merged output.
        self.W_Q = nn.Linear(hidden_units, hidden_units, bias=False)
        self.W_K = nn.Linear(hidden_units, hidden_units, bias=False)
        self.W_V = nn.Linear(hidden_units, hidden_units, bias=False)
        self.W_O = nn.Linear(hidden_units, hidden_units, bias=False)

        self.attention = ScaledDotProductAttention(hidden_units, dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)
        self.layerNorm = nn.LayerNorm(hidden_units, 1e-6)

    def forward(self, enc, mask):
        """Apply self-attention to ``enc``.

        Args:
            enc: ``(batch_size, max_len, hidden_units)``.
            mask: Attention mask handed unchanged to the attention module.

        Returns:
            tuple: ``(output, attn_dist)`` with ``output`` shaped like ``enc``
            and ``attn_dist`` as returned by the attention module.
        """
        batch_size, seqlen = enc.size(0), enc.size(1)
        head_dim = self.hidden_units // self.num_heads

        def to_heads(projection):
            # (batch, len, hidden) -> (batch, heads, len, head_dim)
            projected = projection(enc).view(batch_size, seqlen, self.num_heads, head_dim)
            return projected.transpose(1, 2)

        Q, K, V = to_heads(self.W_Q), to_heads(self.W_K), to_heads(self.W_V)
        context, attn_dist = self.attention(Q, K, V, mask)

        # (batch, heads, len, head_dim) -> (batch, len, hidden): merge the heads.
        merged = context.transpose(1, 2).contiguous().view(batch_size, seqlen, -1)

        # Output projection, dropout, residual connection, layer normalisation.
        projected = self.dropout(self.W_O(merged))
        return self.layerNorm(projected + enc), attn_dist


class PositionwiseFeedForward(nn.Module):
    """Two-layer position-wise feed-forward block with residual and LayerNorm.

    Both linear layers keep the width at ``hidden_units``. Dropout is applied
    to the first layer's output before the ReLU and again to the second
    layer's output before the residual sum.
    """

    def __init__(self, hidden_units, dropout_rate):
        super().__init__()

        self.W_1 = nn.Linear(hidden_units, hidden_units)
        self.W_2 = nn.Linear(hidden_units, hidden_units)
        self.dropout = nn.Dropout(dropout_rate)
        self.layerNorm = nn.LayerNorm(hidden_units, 1e-6)

    def forward(self, x):
        """Transform ``x`` (last dimension ``hidden_units``); shape is preserved."""
        hidden = self.dropout(self.W_1(x))
        projected = self.W_2(F.relu(hidden))
        return self.layerNorm(self.dropout(projected) + x)


class SASRecBlock(nn.Module):
    """One encoder block: multi-head self-attention, then a feed-forward layer."""

    def __init__(self, num_heads, hidden_units, dropout_rate):
        super().__init__()
        self.attention = MultiHeadAttention(num_heads, hidden_units, dropout_rate)
        self.pointwise_feedforward = PositionwiseFeedForward(hidden_units, dropout_rate)

    def forward(self, input_enc, mask):
        """Run the block.

        Args:
            input_enc: ``(batch_size, max_len, hidden_units)``.
            mask: Attention mask handed to the attention sub-layer.

        Returns:
            tuple: ``(output_enc, attn_dist)`` -- the feed-forward output and
            the attention weights of the attention sub-layer.
        """
        attended, attn_dist = self.attention(input_enc, mask)
        return self.pointwise_feedforward(attended), attn_dist


class SASRec(nn.Module):
    """Two-stream attention + LSTM model predicting the answer of each "now" step.

    The "past" stream embeds the previous interactions together with their
    answer codes; the "now" stream embeds the current interactions without
    answers. Each stream runs through its own ``SASRecBlock`` stack and LSTM,
    and the concatenated outputs go through a linear layer with a sigmoid.

    Args:
        num_assessmentItemID, num_testId, num_KnowledgeTag,
        num_large_paper_number, num_hour, num_dayofweek: Vocabulary sizes.
            Every embedding has one extra row because index 0 is padding.
        num_cols: Names of the numerical feature columns (only the count is used).
        cat_cols: Names of the categorical feature columns, in the order they
            appear along the last axis of the categorical inputs.
        emb_size: Width of each categorical embedding.
        hidden_units: Model width. NOTE(review): the two halves of width
            ``hidden_units // 2`` are concatenated and added to a
            ``hidden_units``-wide embedding, so an even value is assumed.
        num_heads: Attention heads per block.
        num_layers: Number of blocks and of LSTM layers in each stream.
        dropout_rate: Dropout probability.
        device: Device the attention mask and answer codes are moved to.

    Raises:
        ValueError: If ``cat_cols`` names a column without an embedding.
    """

    # Categorical column name -> stem of its embedding attributes
    # (``past_<stem>_emb`` / ``now_<stem>_emb``).
    _CAT_EMB_STEM = {
        'assessmentItemID2idx': 'assessmentItemID',
        'testId2idx': 'testId',
        'KnowledgeTag2idx': 'KnowledgeTag',
        'large_paper_number2idx': 'large_paper_number',
        'hour': 'hour',
        'dayofweek': 'dayofweek',
    }

    def __init__(
        self, 
        num_assessmentItemID, 
        num_testId,
        num_KnowledgeTag,
        num_large_paper_number,
        num_hour,
        num_dayofweek,
        num_cols,
        cat_cols,
        emb_size,
        hidden_units,
        num_heads, 
        num_layers, 
        dropout_rate, 
        device):
        super(SASRec, self).__init__()

        # Unknown columns used to be skipped silently in forward(), which then
        # failed with a shape mismatch; report them up front instead.
        unknown = [col for col in cat_cols if col not in self._CAT_EMB_STEM]
        if unknown:
            raise ValueError(f'unsupported categorical columns: {unknown}')

        # nn.LSTM applies dropout only between stacked layers; passing a
        # non-zero rate with a single layer has no effect and emits a warning.
        lstm_dropout = dropout_rate if num_layers > 1 else 0.0

        # past
        self.past_assessmentItemID_emb = nn.Embedding(num_assessmentItemID + 1, emb_size, padding_idx = 0) # item
        self.past_testId_emb = nn.Embedding(num_testId + 1, emb_size, padding_idx = 0) # test sheet
        self.past_KnowledgeTag_emb = nn.Embedding(num_KnowledgeTag + 1, emb_size, padding_idx = 0) # knowledge tag
        self.past_large_paper_number_emb = nn.Embedding(num_large_paper_number + 1, emb_size, padding_idx = 0) # characters 1..3 of the item ID
        self.past_hour_emb = nn.Embedding(num_hour + 1, emb_size, padding_idx = 0) # hour of day
        self.past_dayofweek_emb = nn.Embedding(num_dayofweek + 1, emb_size, padding_idx = 0) # day of week
        self.past_answerCode_emb = nn.Embedding(3, hidden_units, padding_idx = 0) # 0 = padding, 1/2 = answer code + 1

        self.past_cat_emb = nn.Sequential(
            nn.Linear(len(cat_cols) * emb_size, hidden_units // 2),
            nn.LayerNorm(hidden_units // 2, eps=1e-6)
        )

        self.past_num_emb = nn.Sequential(
            nn.Linear(len(num_cols), hidden_units // 2),
            nn.LayerNorm(hidden_units // 2, eps=1e-6)
        )

        self.emb_layernorm = nn.LayerNorm(hidden_units, eps=1e-6)

        self.past_lstm = nn.LSTM(
            input_size = hidden_units,
            hidden_size = hidden_units,
            num_layers = num_layers,
            batch_first = True,
            bidirectional = False,
            dropout = lstm_dropout,
            )

        self.past_blocks = nn.ModuleList([SASRecBlock(num_heads, hidden_units, dropout_rate) for _ in range(num_layers)])

        # now
        self.now_assessmentItemID_emb = nn.Embedding(num_assessmentItemID + 1, emb_size, padding_idx = 0) # item
        self.now_testId_emb = nn.Embedding(num_testId + 1, emb_size, padding_idx = 0) # test sheet
        self.now_KnowledgeTag_emb = nn.Embedding(num_KnowledgeTag + 1, emb_size, padding_idx = 0) # knowledge tag
        self.now_large_paper_number_emb = nn.Embedding(num_large_paper_number + 1, emb_size, padding_idx = 0) # characters 1..3 of the item ID
        self.now_hour_emb = nn.Embedding(num_hour + 1, emb_size, padding_idx = 0) # hour of day
        self.now_dayofweek_emb = nn.Embedding(num_dayofweek + 1, emb_size, padding_idx = 0) # day of week

        self.now_cat_emb = nn.Sequential(
            nn.Linear(len(cat_cols) * emb_size, hidden_units // 2),
            nn.LayerNorm(hidden_units // 2, eps=1e-6)
        )

        self.now_num_emb = nn.Sequential(
            nn.Linear(len(num_cols), hidden_units // 2),
            nn.LayerNorm(hidden_units // 2, eps=1e-6)
        )

        self.now_lstm = nn.LSTM(
            input_size = hidden_units,
            hidden_size = hidden_units,
            num_layers = num_layers,
            batch_first = True,
            bidirectional = False,
            dropout = lstm_dropout,
            )

        self.now_blocks = nn.ModuleList([SASRecBlock(num_heads, hidden_units, dropout_rate) for _ in range(num_layers)])

        # predict

        self.dropout = nn.Dropout(dropout_rate)

        self.predict_layer = nn.Sequential(
            nn.Linear(hidden_units * 2, 1),
            nn.Sigmoid()
        )

        self.cat_cols = cat_cols
        self.num_cols = num_cols
        
        self.hidden_units = hidden_units
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.device = device

    def _embed_categorical(self, stream, cat_feature):
        """Embed every categorical column of one stream and concatenate them.

        Args:
            stream: ``'past'`` or ``'now'``; selects the embedding tables.
            cat_feature: ``(batch_size, max_len, len(cat_cols))`` index tensor.
        """
        embedded = []
        for idx, col in enumerate(self.cat_cols):
            table = getattr(self, f'{stream}_{self._CAT_EMB_STEM[col]}_emb')
            embedded.append(table(cat_feature[:, :, idx]))
        return torch.cat(embedded, dim = -1)

    def forward(self, past_cat_feature, past_num_feature, past_answerCode, now_cat_feature, now_num_feature):
        """Predict the probability of a correct answer at every "now" step.

        Args:
            past_cat_feature: ``(batch_size, max_len, len(cat_cols))``.
            past_num_feature: ``(batch_size, max_len, len(num_cols))``.
            past_answerCode: ``(batch_size, max_len)``; 0 marks padding.
            now_cat_feature: ``(batch_size, max_len, len(cat_cols))``.
            now_num_feature: ``(batch_size, max_len, len(num_cols))``.

        Returns:
            Tensor of shape ``(batch_size, max_len, 1)`` with sigmoid outputs.
        """
        # Past stream: categorical + numerical halves, plus the answer embedding.
        past_cat_emb = self.past_cat_emb(self._embed_categorical('past', past_cat_feature))
        past_num_emb = self.past_num_emb(past_num_feature)

        past_emb = torch.cat([past_cat_emb, past_num_emb], dim = -1)
        past_emb = past_emb + self.past_answerCode_emb(past_answerCode.to(self.device))
        past_emb = self.emb_layernorm(past_emb) # LayerNorm

        # Masking: keys must be real (non-padding) steps at or before the
        # query position. Built from a comparison instead of
        # torch.BoolTensor(...), which rejects non-CPU tensors.
        seq_len = past_answerCode.size(1)
        mask_pad = (past_answerCode > 0).unsqueeze(1).unsqueeze(1) # (batch_size, 1, 1, max_len)
        mask_time = torch.tril(torch.ones((1, 1, seq_len, seq_len), device = mask_pad.device)).bool() # (1, 1, max_len, max_len)
        mask = (mask_pad & mask_time).to(self.device) # (batch_size, 1, max_len, max_len)

        for block in self.past_blocks:
            past_emb, attn_dist = block(past_emb, mask)

        past_emb, _ = self.past_lstm(past_emb)

        # Now stream: same feature embedding, no answer information.
        now_cat_emb = self.now_cat_emb(self._embed_categorical('now', now_cat_feature))
        now_num_emb = self.now_num_emb(now_num_feature)

        now_emb = torch.cat([now_cat_emb, now_num_emb], dim = -1)

        for block in self.now_blocks:
            now_emb, attn_dist = block(now_emb, mask)

        now_emb, _ = self.now_lstm(now_emb)

        emb = torch.cat([past_emb, now_emb], dim = -1)
        
        output = self.predict_layer(self.dropout(emb))

        return output

# 학습 함수

In [8]:
from sklearn.metrics import roc_auc_score

def train(model, data_loader, criterion, optimizer):
    """Run one training epoch and return the mean batch loss.

    Only the prediction at the last sequence position of every sample is
    compared with its target. Tensors are moved to the notebook-level
    ``device``; ``past_answerCode`` is handed to the model as delivered by
    the loader.

    Args:
        model: Called as ``model(past_cat, past_num, past_answer, now_cat,
            now_num)`` and expected to return ``(batch, len, 1)``.
        data_loader: Iterable of 6-tuples as produced by ``train_make_batch``.
        criterion: Loss taking ``(prediction, target)``.
        optimizer: Optimizer over the model parameters.

    Returns:
        float: Sum of the batch losses divided by ``len(data_loader)``.
    """
    model.train()
    batch_losses = []

    for batch in data_loader:
        past_cat, past_num, past_answer, now_cat, now_num, now_answer = batch

        past_cat = past_cat.to(device)
        past_num = past_num.to(device)
        now_cat = now_cat.to(device)
        now_num = now_num.to(device)
        now_answer = now_answer.to(device)

        optimizer.zero_grad()

        prediction = model(past_cat, past_num, past_answer, now_cat, now_num).squeeze(2)
        loss = criterion(prediction[:, -1], now_answer[:, -1])

        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(data_loader)

def evaluate(model, data_loader):
    """Score the model with ROC AUC on the last position of every sequence.

    Args:
        model: Called as in ``train``; put into eval mode here.
        data_loader: Iterable of 6-tuples as produced by ``train_make_batch``.

    Returns:
        The value of ``roc_auc_score(targets, predictions)``.
    """
    model.eval()

    targets, predictions = [], []

    with torch.no_grad():
        for batch in data_loader:
            past_cat, past_num, past_answer, now_cat, now_num, now_answer = batch

            past_cat = past_cat.to(device)
            past_num = past_num.to(device)
            now_cat = now_cat.to(device)
            now_num = now_num.to(device)
            now_answer = now_answer.to(device)

            scores = model(past_cat, past_num, past_answer, now_cat, now_num).squeeze(2)

            # Only the final step of each sequence is evaluated.
            targets += now_answer[:, -1].cpu().tolist()
            predictions += scores[:, -1].cpu().tolist()

    return roc_auc_score(targets, predictions)


def predict(model, data_loader, device = None):
    """Return the model's last-position prediction for every sample.

    Args:
        model: Module called as ``model(past_cat, past_num, past_answerCode,
            now_cat, now_num)``; dimension 2 of its output is squeezed.
        data_loader: Iterable yielding 6-tuples
            ``(past_cat, past_num, past_answerCode, now_cat, now_num, now_answerCode)``.
            The last element is ignored.
        device: Device the feature tensors are moved to. When ``None``
            (default) the device of the model's first parameter is used, so
            the function does not rely on a notebook-level ``device`` global.

    Returns:
        list[float]: One prediction per sample, in loader order.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()

    pred = []

    with torch.no_grad():
        for past_cat_feature, past_num_feature, past_answerCode, now_cat_feature, now_num_feature, _ in data_loader:
            past_cat_feature = past_cat_feature.to(device)
            past_num_feature = past_num_feature.to(device)
            # past_answerCode is handed to the model as-is (no device move here).
            # NOTE(review): presumably the model moves it itself - confirm.
            now_cat_feature = now_cat_feature.to(device)
            now_num_feature = now_num_feature.to(device)

            output = model(past_cat_feature, past_num_feature, past_answerCode, now_cat_feature, now_num_feature).squeeze(2)
            pred.extend(output[:, -1].cpu().tolist())

    return pred

# 학습

In [9]:
# --- Optimisation settings ---
batch_size = 256
epochs = 20
lr = 0.001
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Model size (passed to SASRec) ---
emb_size = 64
hidden_units = 128
num_heads = 2 # candidate values: 2, 4, 8, 16, 32
num_layers = 1
dropout_rate = 0.5

# Number of DataLoader worker processes.
num_workers = 8

# --- Sequence construction (passed to CustomDataset) ---
max_len = 50
window = 10
data_augmentation = True

# --- Input / output locations ---
DATA_PATH = '/opt/ml/input/data'
MODEL_PATH = '/opt/ml/model'
SUBMISSION_PATH = '/opt/ml/submission'

# Artifact names are derived from the settings above so that changing
# `num_heads` or `data_augmentation` cannot silently overwrite the checkpoints
# and submission of a previous run.
run_tag = (
    f'Transformer-and-LSTM-Encoder-Decoder-each-Embedding-num_heads-{num_heads}'
    + ('-data-aug' if data_augmentation else '')
)
model_name = f'{run_tag}.pt'
submission_name = f'{run_tag}.csv'

In [10]:
# Create the checkpoint directory (and any missing parents); a no-op when it
# already exists, with no check-then-create race.
os.makedirs(MODEL_PATH, exist_ok = True)

In [11]:
# Create the submission directory (and any missing parents); a no-op when it
# already exists, with no check-then-create race.
os.makedirs(SUBMISSION_PATH, exist_ok = True)

In [12]:
# Load and preprocess the train/test CSVs found under DATA_PATH and build the
# 5-fold user split that the OOF loop iterates over (see MakeDataset.__init__).
make_dataset = MakeDataset(DATA_PATH = DATA_PATH)

# OOF Ensemble

In [13]:
# Out-of-fold training: one model is trained per user fold, the checkpoint with
# the best validation ROC AUC is kept for each fold, and the mean of those best
# scores is reported at the end.
# Running sum of each fold's best validation ROC AUC.
oof_roc_auc = 0

for oof in make_dataset.oof_user_set.keys():
    # NOTE(review): get_oof_data is defined outside this cell; presumably it
    # returns the frames for training users and held-out users of fold `oof`.
    train_df, valid_df = make_dataset.get_oof_data(oof)
    
    # A different but deterministic seed per fold.
    seed_everything(22 + oof)
    
    train_dataset = CustomDataset(df = train_df, max_len = max_len, window = window, data_augmentation = data_augmentation)
    train_data_loader = DataLoader(
        train_dataset, 
        batch_size = batch_size, 
        shuffle = True, 
        drop_last = False,
        collate_fn = train_make_batch,
        num_workers = num_workers)

    # Validation data is built without `window` / `data_augmentation` and is
    # fed one sample at a time, in order.
    valid_dataset = CustomDataset(df = valid_df, max_len = max_len)
    valid_data_loader = DataLoader(
        valid_dataset, 
        batch_size = 1, 
        shuffle = False, 
        drop_last = False,
        collate_fn = train_make_batch,
        num_workers = num_workers)

    # A fresh model, optimizer and loss for every fold.
    model = SASRec(
        num_assessmentItemID = make_dataset.num_assessmentItemID, 
        num_testId = make_dataset.num_testId,
        num_KnowledgeTag = make_dataset.num_KnowledgeTag,
        num_large_paper_number = make_dataset.num_large_paper_number,
        num_hour = make_dataset.num_hour,
        num_dayofweek = make_dataset.num_dayofweek,
        num_cols = train_dataset.num_cols,
        cat_cols = train_dataset.cat_cols,
        emb_size = emb_size,
        hidden_units = hidden_units,
        num_heads = num_heads,
        num_layers = num_layers,
        dropout_rate = dropout_rate,
        device = device).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr = lr)
    criterion = nn.BCELoss()

    # Disabled experiment: initialise the assessmentItemID embedding from a
    # pre-trained Word2Vec model.
    # pre_emb = Word2Vec.load(os.path.join(MODEL_PATH, 'Word2Vec_Embedding_Model_window_50.model'))

    # assessmentItemID_li = make_dataset.assessmentItemID2idx.keys()

    # with torch.no_grad():
    #     for assessmentItemID in assessmentItemID_li:
    #         idx = make_dataset.assessmentItemID2idx[assessmentItemID]
    #         model.assessmentItemID_emb.weight[idx + 1] = torch.tensor(pre_emb.wv[assessmentItemID]).to(device)

    # Statistics of the best epoch (by validation ROC AUC) seen so far in this fold.
    best_epoch = 0
    best_train_loss = 0
    best_roc_auc = 0

    for epoch in range(1, epochs + 1):
        # Single-iteration progress bar: it exists only so that every epoch
        # prints its own timed line carrying the description set below.
        tbar = tqdm(range(1))
        for _ in tbar:
            train_loss = train(model = model, data_loader = train_data_loader, criterion = criterion, optimizer = optimizer)
            roc_auc = evaluate(model = model, data_loader = valid_data_loader)
            # Overwrite this fold's checkpoint only when validation AUC improves.
            if best_roc_auc < roc_auc:
                best_epoch = epoch
                best_train_loss = train_loss
                best_roc_auc = roc_auc
                torch.save(model.state_dict(), os.path.join(MODEL_PATH, f'oof_{oof}_' + model_name))

            tbar.set_description(f'OOF-{oof}| Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| roc_auc: {roc_auc:.5f}')
    
    print(f'BEST OOF-{oof}| Epoch: {best_epoch:3d}| Train loss: {best_train_loss:.5f}| roc_auc: {best_roc_auc:.5f}')

    oof_roc_auc += best_roc_auc

# Mean of the per-fold best validation ROC AUC values.
print(f'Total roc_auc: {oof_roc_auc / len(make_dataset.oof_user_set.keys()):.5f}')

100%|██████████| 7442/7442 [00:16<00:00, 446.33it/s]
OOF-0| Epoch:   1| Train loss: 0.51428| roc_auc: 0.81783: 100%|██████████| 1/1 [02:44<00:00, 164.47s/it]
OOF-0| Epoch:   2| Train loss: 0.48639| roc_auc: 0.83350: 100%|██████████| 1/1 [02:45<00:00, 165.57s/it]
OOF-0| Epoch:   3| Train loss: 0.46524| roc_auc: 0.83709: 100%|██████████| 1/1 [02:43<00:00, 163.81s/it]
OOF-0| Epoch:   4| Train loss: 0.45043| roc_auc: 0.83694: 100%|██████████| 1/1 [02:43<00:00, 163.07s/it]
OOF-0| Epoch:   5| Train loss: 0.44030| roc_auc: 0.83533: 100%|██████████| 1/1 [02:47<00:00, 167.34s/it]
OOF-0| Epoch:   6| Train loss: 0.43078| roc_auc: 0.83323: 100%|██████████| 1/1 [02:48<00:00, 168.92s/it]
OOF-0| Epoch:   7| Train loss: 0.42315| roc_auc: 0.82941: 100%|██████████| 1/1 [02:42<00:00, 162.22s/it]
OOF-0| Epoch:   8| Train loss: 0.41616| roc_auc: 0.82981: 100%|██████████| 1/1 [02:40<00:00, 160.42s/it]
OOF-0| Epoch:   9| Train loss: 0.40766| roc_auc: 0.82895: 100%|██████████| 1/1 [02:46<00:00, 166.71s/it]
OO

# 예측

In [20]:
# Build the test loader: one sample per batch, original order preserved.
test_df = make_dataset.get_test_data()
test_dataset = CustomDataset(df = test_df, max_len = max_len)
test_data_loader = DataLoader(
    test_dataset,
    batch_size = 1,
    shuffle = False,
    drop_last = False,
    collate_fn = train_make_batch,
    num_workers = num_workers)

# One model instance is reused; each fold's weights are loaded into it in turn.
# NOTE(review): `num_cols` / `cat_cols` are read from `train_dataset`, which
# only exists once the OOF training cell has run in this kernel - confirm
# whether `test_dataset` exposes the same attributes and could be used instead.
model = SASRec(
    num_assessmentItemID = make_dataset.num_assessmentItemID,
    num_testId = make_dataset.num_testId,
    num_KnowledgeTag = make_dataset.num_KnowledgeTag,
    num_large_paper_number = make_dataset.num_large_paper_number,
    num_hour = make_dataset.num_hour,
    num_dayofweek = make_dataset.num_dayofweek,
    num_cols = train_dataset.num_cols,
    cat_cols = train_dataset.cat_cols,
    emb_size = emb_size,
    hidden_units = hidden_units,
    num_heads = num_heads,
    num_layers = num_layers,
    dropout_rate = dropout_rate,
    device = device).to(device)

# Per-fold predictions, one list of floats per fold checkpoint.
fold_preds = []

for oof in make_dataset.oof_user_set.keys():
    checkpoint_path = os.path.join(MODEL_PATH, f'oof_{oof}_' + model_name)
    # map_location lets checkpoints saved from a CUDA run be restored on a
    # CPU-only machine (and vice versa) instead of failing inside torch.load.
    model.load_state_dict(torch.load(checkpoint_path, map_location = device))
    pred = predict(model = model, data_loader = test_data_loader)
    fold_preds.append(pred)

# Ensemble: average the folds' predictions sample by sample.
pred_list = np.array(fold_preds).mean(axis = 0)

In [21]:
# Two-column submission frame: `id` is the 0-based row number, `prediction`
# is the fold-averaged score for that row.
submission = (
    pd.DataFrame(data = np.array(pred_list), columns = ['prediction'])
    .rename_axis('id')
    .reset_index()
)

submission_file = 'OOF-Ensemble-' + submission_name
submission.to_csv(os.path.join(SUBMISSION_PATH, submission_file), index = False)