In [15]:
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import KFold

from copy import deepcopy

from gensim.models import Word2Vec

import warnings

warnings.filterwarnings(action='ignore')
torch.set_printoptions(sci_mode=True)

In [16]:
import gc

gc.collect()
torch.cuda.empty_cache()

In [17]:
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)

# 데이터 전처리

In [18]:
class MakeDataset():

    def __init__(self, DATA_PATH):
        self.preporcessing(DATA_PATH)
        self.oof_user_set = self.split_data()
    
    def split_data(self):
        user_list = self.all_df['userID'].unique().tolist()
        oof_user_set = {}
        kf = KFold(n_splits = 5, random_state = 22, shuffle = True)
        for idx, (train_user, valid_user) in enumerate(kf.split(user_list)):
            oof_user_set[idx] = valid_user.tolist()
        
        return oof_user_set

    def preporcessing(self, DATA_PATH):

        dtype = {
            'userID': 'int16',
            'answerCode': 'int8',
            'KnowledgeTag': 'int16'
        }
        
        train_df = pd.read_csv(os.path.join(DATA_PATH, 'train_data.csv'), dtype=dtype, parse_dates=['Timestamp'])
        train_df = train_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)

        test_df = pd.read_csv(os.path.join(DATA_PATH, 'test_data.csv'), dtype=dtype, parse_dates=['Timestamp'])
        test_df = test_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)


        def get_large_paper_number(x):
            return x[1:4]
        
        train_df['large_paper_number'] = train_df['assessmentItemID'].apply(lambda x : get_large_paper_number(x))
        test_df['large_paper_number'] = test_df['assessmentItemID'].apply(lambda x : get_large_paper_number(x))

        # 문제 푸는데 걸린 시간
        def get_now_elapsed(df):
            
            diff = df.loc[:, ['userID','Timestamp']].groupby('userID').diff().fillna(pd.Timedelta(seconds=0))
            diff = diff.fillna(pd.Timedelta(seconds=0))
            diff = diff['Timestamp'].apply(lambda x: x.total_seconds())
            df['now_elapsed'] = diff
            df['now_elapsed'] = df['now_elapsed'].apply(lambda x : x if x < 650 and x >=0 else 0)
            df['now_elapsed'] = df['now_elapsed']

            return df

        train_df = get_now_elapsed(df = train_df)
        test_df = get_now_elapsed(df = test_df)

        all_df = pd.concat([train_df, test_df])
        all_df = all_df[all_df['answerCode'] != -1].reset_index(drop = True)

        # normalize_score
        def get_normalize_score(df, all_df):
            ret_df = []

            group_df = df.groupby('userID')
            mean_answerCode_df = all_df.groupby('testId').mean()['answerCode']
            std_answerCode_df = all_df.groupby('testId').std()['answerCode']
            for userID, get_df in group_df:
                normalize_score_df = (get_df[get_df['answerCode'] != -1].groupby('testId').mean()['answerCode'] - mean_answerCode_df) / std_answerCode_df
                get_df = get_df.copy().set_index('testId')
                get_df['normalize_score'] = normalize_score_df
                ret_df.append(get_df.reset_index(drop = False))

            ret_df = pd.concat(ret_df).reset_index(drop = True)
            
            return ret_df
        
        # train_df = get_normalize_score(df = train_df, all_df = all_df)
        # test_df = get_normalize_score(df = test_df, all_df = all_df)

        # 문항별 정답률
        train_df = train_df.set_index('assessmentItemID')
        train_df['assessmentItemID_mean_answerCode'] = all_df.groupby('assessmentItemID').mean()['answerCode']
        train_df = train_df.reset_index(drop = False)

        test_df = test_df.set_index('assessmentItemID')
        test_df['assessmentItemID_mean_answerCode'] = all_df.groupby('assessmentItemID').mean()['answerCode']
        test_df = test_df.reset_index(drop = False)

        # 문항별 정답률 표준편차
        train_df = train_df.set_index('assessmentItemID')
        train_df['assessmentItemID_std_answerCode'] = all_df.groupby('assessmentItemID').std()['answerCode']
        train_df = train_df.reset_index(drop = False)

        test_df = test_df.set_index('assessmentItemID')
        test_df['assessmentItemID_std_answerCode'] = all_df.groupby('assessmentItemID').std()['answerCode']
        test_df = test_df.reset_index(drop = False)

        # 올바르게 푼 사람들의 문항별 풀이 시간 평균
        train_df = train_df.set_index('assessmentItemID')
        train_df['assessmentItemID_mean_now_elapsed'] = all_df[all_df['answerCode'] == 1].groupby('assessmentItemID').mean()['now_elapsed']
        train_df = train_df.reset_index(drop = False)

        test_df = test_df.set_index('assessmentItemID')
        test_df['assessmentItemID_mean_now_elapsed'] = all_df[all_df['answerCode'] == 1].groupby('assessmentItemID').mean()['now_elapsed']
        test_df = test_df.reset_index(drop = False)

        # 올바르게 푼 사람들의 문항별 풀이 시간 표준 편차
        train_df = train_df.set_index('assessmentItemID')
        train_df['assessmentItemID_std_now_elapsed'] = all_df[all_df['answerCode'] == 1].groupby('assessmentItemID').std()['now_elapsed']
        train_df = train_df.reset_index(drop = False)

        test_df = test_df.set_index('assessmentItemID')
        test_df['assessmentItemID_std_now_elapsed'] = all_df[all_df['answerCode'] == 1].groupby('assessmentItemID').std()['now_elapsed']
        test_df = test_df.reset_index(drop = False)

        # train_df['now_elapsed'] = train_df['now_elapsed'] + 1
        # test_df['now_elapsed'] = test_df['now_elapsed'] + 1

        # 0을 패딩으로 줄것이기 때문에 평행 이동 시켜야 함
        # 해당 년도의 몇번째 주인지 판단
        # https://github.com/tidyverse/lubridate/issues/731
        sin_val = np.sin(2 * np.pi * np.array([i for i in range(1, 54)]) / 53)
        cos_val = np.cos(2 * np.pi * np.array([i for i in range(1, 54)]) / 53)

        train_df['week_number'] = train_df['Timestamp'].apply(lambda x:x.isocalendar()[1]) - 1
        train_df['num_week_number'] = 2 * np.pi * (train_df['week_number'] + 1) / 53
        train_df['sin_num_week_number'] = np.sin(train_df['num_week_number']) + 2 * abs(sin_val.min())
        train_df['cos_num_week_number'] = np.cos(train_df['num_week_number']) + 2 * abs(cos_val.min())

        test_df['week_number'] = test_df['Timestamp'].apply(lambda x:x.isocalendar()[1]) - 1
        test_df['num_week_number'] = 2 * np.pi * (test_df['week_number'] + 1) / 53
        test_df['sin_num_week_number'] = np.sin(test_df['num_week_number']) + 2 * abs(sin_val.min())
        test_df['cos_num_week_number'] = np.cos(test_df['num_week_number']) + 2 * abs(cos_val.min())

        # 문제 푼 시간
        sin_val = np.sin(2 * np.pi * np.array([i for i in range(1, 25)]) / 24)
        cos_val = np.cos(2 * np.pi * np.array([i for i in range(1, 25)]) / 24)

        train_df['hour'] = train_df['Timestamp'].dt.hour
        train_df['num_hour'] = 2 * np.pi * (train_df['hour'] + 1) / 24
        train_df['sin_num_hour'] = np.sin(train_df['num_hour']) + 2 * abs(sin_val.min())
        train_df['cos_num_hour'] = np.cos(train_df['num_hour']) + 2 * abs(cos_val.min())

        test_df['hour'] = test_df['Timestamp'].dt.hour
        test_df['num_hour'] = 2 * np.pi * (test_df['hour'] + 1) / 24
        test_df['sin_num_hour'] = np.sin(test_df['num_hour']) + 2 * abs(sin_val.min())
        test_df['cos_num_hour'] = np.cos(test_df['num_hour']) + 2 * abs(cos_val.min())

        # 문제 푼 요일
        sin_val = np.sin(2 * np.pi * np.array([i for i in range(1, 8)]) / 7)
        cos_val = np.cos(2 * np.pi * np.array([i for i in range(1, 8)]) / 7)

        train_df['dayofweek'] = train_df['Timestamp'].dt.dayofweek
        train_df['num_dayofweek'] = 2 * np.pi * (train_df['dayofweek'] + 1) / 7
        train_df['sin_num_dayofweek'] = np.sin(train_df['num_dayofweek']) + 2 * abs(sin_val.min())
        train_df['cos_num_dayofweek'] = np.cos(train_df['num_dayofweek']) + 2 * abs(cos_val.min())

        test_df['dayofweek'] = test_df['Timestamp'].dt.dayofweek
        test_df['num_dayofweek'] = 2 * np.pi * (test_df['dayofweek'] + 1) / 7
        test_df['sin_num_dayofweek'] = np.sin(test_df['num_dayofweek']) + 2 * abs(sin_val.min())
        test_df['cos_num_dayofweek'] = np.cos(test_df['num_dayofweek']) + 2 * abs(cos_val.min())

        # 해당 대분류 시험지를 푼 기간 (주 단위)
        def get_now_week(df):
            userID2large_paper_number2week_number2now_week = {}
            group_df = df.groupby('userID')

            for userID, g_df in group_df:
                large_paper_number2week_number = {}
                gg_df = g_df.groupby('large_paper_number')
                for large_paper_number, ggg_df in gg_df:
                    week_number2now_week = {}
                    for idx, week_number in enumerate(sorted(ggg_df['week_number'].unique())):
                        week_number2now_week[week_number] = idx
                    
                    large_paper_number2week_number[large_paper_number] = week_number2now_week

                userID2large_paper_number2week_number2now_week[userID] = large_paper_number2week_number

            def get_now_week_val(x):
                return userID2large_paper_number2week_number2now_week[x['userID']][x['large_paper_number']][x['week_number']]

            df['now_week'] = df.apply(lambda x : get_now_week_val(x), axis = 1)

            return df

        train_df = get_now_week(df = train_df)
        test_df = get_now_week(df = test_df)
    
        # index 로 변환

        def get_val2idx(val_list : list) -> dict:
            val2idx = {}
            for idx, val in enumerate(val_list):
                val2idx[val] = idx
            
            return val2idx

        assessmentItemID2idx = get_val2idx(all_df['assessmentItemID'].unique().tolist())
        testId2idx = get_val2idx(all_df['testId'].unique().tolist())
        KnowledgeTag2idx = get_val2idx(all_df['KnowledgeTag'].unique().tolist())
        large_paper_number2idx = get_val2idx(all_df['large_paper_number'].unique().tolist())

        train_df['assessmentItemID2idx'] = train_df['assessmentItemID'].apply(lambda x : assessmentItemID2idx[x])
        train_df['testId2idx'] = train_df['testId'].apply(lambda x : testId2idx[x])
        train_df['KnowledgeTag2idx'] = train_df['KnowledgeTag'].apply(lambda x : KnowledgeTag2idx[x])
        train_df['large_paper_number2idx'] = train_df['large_paper_number'].apply(lambda x : large_paper_number2idx[x])

        test_df['assessmentItemID2idx'] = test_df['assessmentItemID'].apply(lambda x : assessmentItemID2idx[x])
        test_df['testId2idx'] = test_df['testId'].apply(lambda x : testId2idx[x])
        test_df['KnowledgeTag2idx'] = test_df['KnowledgeTag'].apply(lambda x : KnowledgeTag2idx[x])
        test_df['large_paper_number2idx'] = test_df['large_paper_number'].apply(lambda x : large_paper_number2idx[x])

        self.assessmentItemID2idx = assessmentItemID2idx
        self.train_df, self.test_df = train_df, test_df
        self.all_df = pd.concat([train_df, test_df[test_df['answerCode'] != -1]]).reset_index(drop=True)
        self.num_assessmentItemID = len(assessmentItemID2idx)
        self.num_testId = len(testId2idx)
        self.num_KnowledgeTag = len(KnowledgeTag2idx)
        self.num_large_paper_number = len(large_paper_number2idx)
        self.num_hour = 24
        self.num_dayofweek = 7
        self.num_week_number = 53

    def get_oof_data(self, oof):

        val_user_list = self.oof_user_set[oof]

        train = []
        valid = []

        group_df = self.all_df.groupby('userID')

        for userID, df in group_df:
            if userID in val_user_list:
                trn_df = df.iloc[:-1, :]
                val_df = df.copy()
                train.append(trn_df)
                valid.append(val_df)
            else:
                train.append(df)

        # normalize_score
        def get_normalize_score(df, all_df, vailid = False):
            ret_df = []

            group_df = df.groupby('userID')
            mean_answerCode_df = all_df.groupby('testId').mean()['answerCode']
            std_answerCode_df = all_df.groupby('testId').std()['answerCode']
            for userID, get_df in group_df:
                if vailid:
                    normalize_score_df = (get_df.iloc[:-1, :].groupby('testId').mean()['answerCode'] - mean_answerCode_df) / std_answerCode_df
                else:
                    normalize_score_df = (get_df.groupby('testId').mean()['answerCode'] - mean_answerCode_df) / std_answerCode_df
                    
                get_df = get_df.copy().set_index('testId')
                get_df['normalize_score'] = normalize_score_df
                ret_df.append(get_df.reset_index(drop = False))

            ret_df = pd.concat(ret_df).reset_index(drop = True)
            
            return ret_df

        train = pd.concat(train).reset_index(drop = True)
        valid = pd.concat(valid).reset_index(drop = True)

        # train = get_normalize_score(df = train, all_df = train)
        # valid = get_normalize_score(df = valid, all_df = train, vailid = True)
        
        return train, valid
    
    def get_test_data(self):
        return self.test_df.copy()

In [19]:
class CustomDataset(Dataset):
    def __init__(
        self, 
        df,
        cat_cols = ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx', 'hour', 'dayofweek'],
        num_cols = ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'now_week'],
        max_len = None,
        window = None,
        data_augmentation = False,
        ):

        self.cat_cols = cat_cols
        self.num_cols = num_cols
        self.get_df = df.groupby('userID')
        self.user_list = df['userID'].unique().tolist()
        self.max_len = max_len
        self.window = window
        self.data_augmentation = data_augmentation
        if self.data_augmentation:
            self.cat_feature_list, self.num_feature_list, self.answerCode_list = self._data_augmentation()


    def __len__(self):
        if self.data_augmentation:
            return len(self.cat_feature_list)
        return len(self.user_list)

    def __getitem__(self, idx):
        if self.data_augmentation:
            cat_feature = self.cat_feature_list[idx]
            num_feature = self.num_feature_list[idx]
            answerCode = self.answerCode_list[idx]

            now_cat_feature = cat_feature[1:, :]
            now_num_feature = num_feature[1:, :]
            now_answerCode = answerCode[1:]
            
            past_cat_feature = cat_feature[:-1, :]
            past_num_feature = num_feature[:-1, :]
            past_answerCode = answerCode[:-1]
            
        else:
            user = self.user_list[idx]
            if self.max_len:
                get_df = self.get_df.get_group(user).iloc[-self.max_len:, :]
            else:
                get_df = self.get_df.get_group(user)

            now_df = get_df.iloc[1:, :]
            now_cat_feature = now_df[self.cat_cols].values
            now_num_feature = now_df[self.num_cols].values
            now_answerCode = now_df['answerCode'].values

            past_df = get_df.iloc[:-1, :]
            past_cat_feature = past_df[self.cat_cols].values
            past_num_feature = past_df[self.num_cols].values
            past_answerCode = past_df['answerCode'].values

        return {
            'past_cat_feature' : past_cat_feature, 
            'past_num_feature' : past_num_feature, 
            'past_answerCode' : past_answerCode, 
            'now_cat_feature' : now_cat_feature, 
            'now_num_feature' : now_num_feature, 
            'now_answerCode' : now_answerCode
            }
    

    def _data_augmentation(self):
        cat_feature_list = []
        num_feature_list = []
        answerCode_list = []
        for userID, get_df in tqdm(self.get_df):
            cat_feature = get_df[self.cat_cols].values[::-1]
            num_feature = get_df[self.num_cols].values[::-1]
            answerCode = get_df['answerCode'].values[::-1]

            start_idx = 0

            if len(get_df) <= self.max_len:
                cat_feature_list.append(cat_feature[::-1])
                num_feature_list.append(num_feature[::-1])
                answerCode_list.append(answerCode[::-1])
            else:
                while True:
                    if len(cat_feature[start_idx: start_idx + self.max_len, :]) < self.max_len:
                        cat_feature_list.append(cat_feature[start_idx: start_idx + self.max_len, :][::-1])
                        num_feature_list.append(num_feature[start_idx: start_idx + self.max_len, :][::-1])
                        answerCode_list.append(answerCode[start_idx: start_idx + self.max_len][::-1])
                        break
                    cat_feature_list.append(cat_feature[start_idx: start_idx + self.max_len, :][::-1])
                    num_feature_list.append(num_feature[start_idx: start_idx + self.max_len, :][::-1])
                    answerCode_list.append(answerCode[start_idx: start_idx + self.max_len][::-1])
                    start_idx += self.window
            
        return cat_feature_list, num_feature_list, answerCode_list

In [20]:
def pad_sequence(seq, max_len, padding_value = 0):
    try:
        seq_len, col = seq.shape
        padding = np.zeros((max_len - seq_len, col)) + padding_value
    except:
        seq_len = seq.shape[0]
        padding = np.zeros((max_len - seq_len, )) + padding_value

    padding_seq = np.concatenate([padding, seq])

    return padding_seq

def train_make_batch(samples):
    max_len = 0
    for sample in samples:
        seq_len, col = sample['past_cat_feature'].shape
        if max_len < seq_len:
            max_len = seq_len
    
    past_cat_feature = []
    past_num_feature = []
    past_answerCode = []
    now_cat_feature = []
    now_num_feature = []
    now_answerCode = []

    for sample in samples:
        past_cat_feature += [pad_sequence(sample['past_cat_feature'] + 1, max_len = max_len, padding_value = 0)]
        past_num_feature += [pad_sequence(sample['past_num_feature'], max_len = max_len, padding_value = 0)]
        past_answerCode += [pad_sequence(sample['past_answerCode'] + 1, max_len = max_len, padding_value = 0)]
        now_cat_feature += [pad_sequence(sample['now_cat_feature'] + 1, max_len = max_len, padding_value = 0)]
        now_num_feature += [pad_sequence(sample['now_num_feature'], max_len = max_len, padding_value = 0)]
        now_answerCode += [pad_sequence(sample['now_answerCode'], max_len = max_len, padding_value = -1)]

    return torch.tensor(past_cat_feature, dtype = torch.long), torch.tensor(past_num_feature, dtype = torch.float32), torch.tensor(past_answerCode, dtype = torch.long), torch.tensor(now_cat_feature, dtype = torch.long), torch.tensor(now_num_feature, dtype = torch.float32), torch.tensor(now_answerCode, dtype = torch.float32)

# 모델

In [21]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self, hidden_units, dropout_rate):
        super(ScaledDotProductAttention, self).__init__()
        self.hidden_units = hidden_units
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, Q, K, V, mask):
        """
        Q, K, V : (batch_size, num_heads, max_len, hidden_units)
        mask : (batch_size, 1, max_len, max_len)
        """
        attn_score = torch.matmul(Q, K.transpose(2, 3)) / math.sqrt(self.hidden_units) # (batch_size, num_heads, max_len, max_len)
        attn_score = attn_score.masked_fill(mask == 0, -1e9)  # 유사도가 0인 지점은 -infinity로 보내 softmax 결과가 0이 되도록 함
        attn_dist = self.dropout(F.softmax(attn_score, dim=-1))  # attention distribution
        output = torch.matmul(attn_dist, V)  # (batch_size, num_heads, max_len, hidden_units) / # dim of output : batchSize x num_head x seqLen x hidden_units
        return output, attn_dist


class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, hidden_units, dropout_rate):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads # head의 수
        self.hidden_units = hidden_units
        
        # query, key, value, output 생성을 위해 Linear 모델 생성
        self.W_Q = nn.Linear(hidden_units, hidden_units, bias=False)
        self.W_K = nn.Linear(hidden_units, hidden_units, bias=False)
        self.W_V = nn.Linear(hidden_units, hidden_units, bias=False)
        self.W_O = nn.Linear(hidden_units, hidden_units, bias=False)

        self.attention = ScaledDotProductAttention(hidden_units, dropout_rate)
        self.dropout = nn.Dropout(dropout_rate) # dropout rate
        self.layerNorm = nn.LayerNorm(hidden_units, 1e-6) # layer normalization

    def forward(self, enc, mask):
        """
        enc : (batch_size, max_len, hidden_units)
        mask : (batch_size, 1, max_len, max_len)
        
        """
        residual = enc # residual connection을 위해 residual 부분을 저장
        batch_size, seqlen = enc.size(0), enc.size(1)

        # Query, Key, Value를 (num_head)개의 Head로 나누어 각기 다른 Linear projection을 통과시킴
        Q = self.W_Q(enc).view(batch_size, seqlen, self.num_heads, self.hidden_units // self.num_heads) # (batch_size, max_len, num_heads, hidden_units)
        K = self.W_K(enc).view(batch_size, seqlen, self.num_heads, self.hidden_units // self.num_heads) # (batch_size, max_len, num_heads, hidden_units)
        V = self.W_V(enc).view(batch_size, seqlen, self.num_heads, self.hidden_units // self.num_heads) # (batch_size, max_len, num_heads, hidden_units)

        # Head별로 각기 다른 attention이 가능하도록 Transpose 후 각각 attention에 통과시킴
        Q, K, V = Q.transpose(1, 2), K.transpose(1, 2), V.transpose(1, 2) # (batch_size, num_heads, max_len, hidden_units)
        output, attn_dist = self.attention(Q, K, V, mask) # output : (batch_size, num_heads, max_len, hidden_units) / attn_dist : (batch_size, num_heads, max_len, max_len)

        # 다시 Transpose한 후 모든 head들의 attention 결과를 합칩니다.
        output = output.transpose(1, 2).contiguous() # (batch_size, max_len, num_heads, hidden_units) / contiguous() : 가변적 메모리 할당
        output = output.view(batch_size, seqlen, -1) # (batch_size, max_len, hidden_units * num_heads)

        # Linear Projection, Dropout, Residual sum, and Layer Normalization
        output = self.layerNorm(self.dropout(self.W_O(output)) + residual) # (batch_size, max_len, hidden_units)
        return output, attn_dist


class PositionwiseFeedForward(nn.Module):
    def __init__(self, hidden_units, dropout_rate):
        super(PositionwiseFeedForward, self).__init__()

        self.W_1 = nn.Linear(hidden_units, hidden_units)
        self.W_2 = nn.Linear(hidden_units, hidden_units)
        self.dropout = nn.Dropout(dropout_rate)
        self.layerNorm = nn.LayerNorm(hidden_units, 1e-6) # layer normalization

    def forward(self, x):
        residual = x
        output = self.W_2(F.relu(self.dropout(self.W_1(x))))
        output = self.layerNorm(self.dropout(output) + residual)
        return output


class SASRecBlock(nn.Module):
    def __init__(self, num_heads, hidden_units, dropout_rate):
        super(SASRecBlock, self).__init__()
        self.attention = MultiHeadAttention(num_heads, hidden_units, dropout_rate)
        self.pointwise_feedforward = PositionwiseFeedForward(hidden_units, dropout_rate)

    def forward(self, input_enc, mask):
        """
        input_enc : (batch_size, max_len, hidden_units)
        mask : (batch_size, 1, max_len, max_len)
        """
        output_enc, attn_dist = self.attention(input_enc, mask)
        output_enc = self.pointwise_feedforward(output_enc)
        return output_enc, attn_dist


class SASRec(nn.Module):
    def __init__(
        self, 
        num_assessmentItemID, 
        num_testId,
        num_KnowledgeTag,
        num_large_paper_number,
        num_hour,
        num_dayofweek,
        num_week_number,
        num_cols,
        cat_cols,
        emb_size,
        hidden_units,
        num_heads, 
        num_layers, 
        dropout_rate, 
        device):
        super(SASRec, self).__init__()

        self.cat_cols = cat_cols
        self.num_cols = num_cols
        
        self.hidden_units = hidden_units
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.device = device

        # past
        past_emb = {}
        for cat_col in self.cat_cols:
            if cat_col == 'assessmentItemID2idx':
                past_emb[cat_col] = nn.Embedding(num_assessmentItemID + 1, emb_size, padding_idx = 0) # 문항에 대한 정보
            elif cat_col == 'testId2idx':
                past_emb[cat_col] = nn.Embedding(num_testId + 1, emb_size, padding_idx = 0) # 시험지에 대한 정보
            elif cat_col == 'KnowledgeTag2idx':
                past_emb[cat_col] = nn.Embedding(num_KnowledgeTag + 1, emb_size, padding_idx = 0) # 지식 태그에 대한 정보
            elif cat_col == 'large_paper_number2idx':
                past_emb[cat_col] = nn.Embedding(num_large_paper_number + 1, emb_size, padding_idx = 0) # 학년에 대한 정보
            elif cat_col == 'hour':
                past_emb[cat_col] = nn.Embedding(num_hour + 1, emb_size, padding_idx = 0) # 문제 풀이 시간에 대한 정보
            elif cat_col == 'dayofweek':
                past_emb[cat_col] = nn.Embedding(num_dayofweek + 1, emb_size, padding_idx = 0) # 문제 풀이 요일에 대항 정보
            elif cat_col == 'week_number':
                past_emb[cat_col] = nn.Embedding(num_week_number + 1, emb_size, padding_idx = 0) # 문제 풀이 주에 대항 정보

        self.past_emb_dict = nn.ModuleDict(past_emb)

        self.past_answerCode_emb = nn.Embedding(3, hidden_units, padding_idx = 0) # 문제 정답 여부에 대한 정보

        self.past_cat_emb = nn.Sequential(
            nn.Linear(len(cat_cols) * emb_size, hidden_units // 2),
            nn.LayerNorm(hidden_units // 2, eps=1e-6)
        )

        self.past_num_emb = nn.Sequential(
            nn.Linear(len(num_cols), hidden_units // 2),
            nn.LayerNorm(hidden_units // 2, eps=1e-6)
        )

        self.emb_layernorm = nn.LayerNorm(hidden_units, eps=1e-6)

        self.past_lstm = nn.LSTM(
            input_size = hidden_units,
            hidden_size = hidden_units,
            num_layers = num_layers,
            batch_first = True,
            bidirectional = False,
            dropout = dropout_rate,
            )

        self.past_blocks = nn.ModuleList([SASRecBlock(num_heads, hidden_units, dropout_rate) for _ in range(num_layers)])

        # now

        now_emb = {}
        for cat_col in self.cat_cols:
            if cat_col == 'assessmentItemID2idx':
                now_emb[cat_col] = nn.Embedding(num_assessmentItemID + 1, emb_size, padding_idx = 0) # 문항에 대한 정보
            elif cat_col == 'testId2idx':
                now_emb[cat_col] = nn.Embedding(num_testId + 1, emb_size, padding_idx = 0) # 시험지에 대한 정보
            elif cat_col == 'KnowledgeTag2idx':
                now_emb[cat_col] = nn.Embedding(num_KnowledgeTag + 1, emb_size, padding_idx = 0) # 지식 태그에 대한 정보
            elif cat_col == 'large_paper_number2idx':
                now_emb[cat_col] = nn.Embedding(num_large_paper_number + 1, emb_size, padding_idx = 0) # 학년에 대한 정보
            elif cat_col == 'hour':
                now_emb[cat_col] = nn.Embedding(num_hour + 1, emb_size, padding_idx = 0) # 문제 풀이 시간에 대한 정보
            elif cat_col == 'dayofweek':
                now_emb[cat_col] = nn.Embedding(num_dayofweek + 1, emb_size, padding_idx = 0) # 문제 풀이 요일에 대항 정보
            elif cat_col == 'week_number':
                now_emb[cat_col] = nn.Embedding(num_week_number + 1, emb_size, padding_idx = 0) # 문제 풀이 주에 대항 정보

        self.now_emb_dict = nn.ModuleDict(now_emb)

        self.now_cat_emb = nn.Sequential(
            nn.Linear(len(cat_cols) * emb_size, hidden_units // 2),
            nn.LayerNorm(hidden_units // 2, eps=1e-6)
        )

        self.now_num_emb = nn.Sequential(
            nn.Linear(len(num_cols), hidden_units // 2),
            nn.LayerNorm(hidden_units // 2, eps=1e-6)
        )

        self.now_lstm = nn.LSTM(
            input_size = hidden_units,
            hidden_size = hidden_units,
            num_layers = num_layers,
            batch_first = True,
            bidirectional = False,
            dropout = dropout_rate,
            )

        self.now_blocks = nn.ModuleList([SASRecBlock(num_heads, hidden_units, dropout_rate) for _ in range(num_layers)])

        # predict

        self.dropout = nn.Dropout(dropout_rate)

        self.predict_layer = nn.Sequential(
            nn.Linear(hidden_units * 2, 1),
            nn.Sigmoid()
        )
    
    
    def forward(self, past_cat_feature, past_num_feature, past_answerCode, now_cat_feature, now_num_feature):
        """
        past_cat_feature : (batch_size, max_len, cat_cols)
        past_num_feature : (batch_size, max_len, num_cols)
        past_answerCode : (batch_size, max_len)

        now_cat_feature : (batch_size, max_len, cat_cols)
        now_num_feature : (batch_size, max_len, num_cols)
        
        """

        past_cat_emb_list = []
        for idx, cat_col in enumerate(self.cat_cols):
            past_cat_emb_list.append(self.past_emb_dict[cat_col](past_cat_feature[:, :, idx]))

        past_cat_emb = torch.concat(past_cat_emb_list, dim = -1)
        past_cat_emb = self.past_cat_emb(past_cat_emb)
        past_num_emb = self.past_num_emb(past_num_feature)

        past_emb = torch.concat([past_cat_emb, past_num_emb], dim = -1)
        past_emb += self.past_answerCode_emb(past_answerCode.to(self.device))
        past_emb = self.emb_layernorm(past_emb) # LayerNorm

        # masking 
        mask_pad = torch.BoolTensor(past_answerCode > 0).unsqueeze(1).unsqueeze(1) # (batch_size, 1, 1, max_len)
        mask_time = (1 - torch.triu(torch.ones((1, 1, past_answerCode.size(1), past_answerCode.size(1))), diagonal=1)).bool() # (batch_size, 1, max_len, max_len)
        mask = (mask_pad & mask_time).to(self.device) # (batch_size, 1, max_len, max_len)
        for block in self.past_blocks:
            past_emb, attn_dist = block(past_emb, mask)

        past_emb, _ = self.past_lstm(past_emb)

        now_cat_emb_list = []
        for idx, cat_col in enumerate(self.cat_cols):
            now_cat_emb_list.append(self.now_emb_dict[cat_col](now_cat_feature[:, :, idx]))

        now_cat_emb = torch.concat(now_cat_emb_list, dim = -1)
        now_cat_emb = self.now_cat_emb(now_cat_emb)
        now_num_emb = self.now_num_emb(now_num_feature)

        now_emb = torch.concat([now_cat_emb, now_num_emb], dim = -1)

        for block in self.now_blocks:
            now_emb, attn_dist = block(now_emb, mask)

        now_emb, _ = self.now_lstm(now_emb)

        emb = torch.concat([past_emb, now_emb], dim = -1)
        
        output = self.predict_layer(self.dropout(emb))

        return output

# 학습 함수

In [22]:
from sklearn.metrics import roc_auc_score

def train(model, data_loader, criterion, optimizer):
    model.train()
    loss_val = 0

    for past_cat_feature, past_num_feature, past_answerCode, now_cat_feature, now_num_feature, now_answerCode in data_loader:

        past_cat_feature, past_num_feature, past_answerCode = past_cat_feature.to(device), past_num_feature.to(device), past_answerCode
        now_cat_feature, now_num_feature, now_answerCode = now_cat_feature.to(device), now_num_feature.to(device), now_answerCode.to(device)

        optimizer.zero_grad()

        output = model(past_cat_feature, past_num_feature, past_answerCode, now_cat_feature, now_num_feature).squeeze(2)
        loss = criterion(output[now_answerCode != -1], now_answerCode[now_answerCode != -1])

        loss.backward()
        optimizer.step()

        loss_val += loss.item()

    loss_val /= len(data_loader)

    return loss_val

def evaluate(model, data_loader):
    model.eval()

    target = []
    pred = []

    with torch.no_grad():
        for past_cat_feature, past_num_feature, past_answerCode, now_cat_feature, now_num_feature, now_answerCode in data_loader:
            past_cat_feature, past_num_feature, past_answerCode = past_cat_feature.to(device), past_num_feature.to(device), past_answerCode
            now_cat_feature, now_num_feature, now_answerCode = now_cat_feature.to(device), now_num_feature.to(device), now_answerCode.to(device)
            
            output = model(past_cat_feature, past_num_feature, past_answerCode, now_cat_feature, now_num_feature).squeeze(2)

            target.extend(now_answerCode[:, -1].cpu().numpy().tolist())
            pred.extend(output[:, -1].cpu().numpy().tolist())

    roc_auc = roc_auc_score(target, pred)

    return roc_auc


def predict(model, data_loader):
    model.eval()

    pred = []

    with torch.no_grad():
        for past_cat_feature, past_num_feature, past_answerCode, now_cat_feature, now_num_feature, now_answerCode in data_loader:
            past_cat_feature, past_num_feature, past_answerCode = past_cat_feature.to(device), past_num_feature.to(device), past_answerCode
            now_cat_feature, now_num_feature = now_cat_feature.to(device), now_num_feature.to(device)
            
            output = model(past_cat_feature, past_num_feature, past_answerCode, now_cat_feature, now_num_feature).squeeze(2)
            pred.extend(output[:, -1].cpu().numpy().tolist())

    return pred

# 학습

In [23]:
batch_size = 32
epochs = 10
lr = 0.001
device = 'cuda' if torch.cuda.is_available() else 'cpu'

emb_size = 64
hidden_units = 128
num_heads = 2 # 2,4,8,16,32
num_layers = 1
dropout_rate = 0.5
num_workers = 8

DATA_PATH = '/opt/ml/input/data'

In [9]:
make_dataset = MakeDataset(DATA_PATH = DATA_PATH)

#  feature-sellection

In [24]:
from itertools import combinations, product

cat_cols = [['week_number'], ['hour'], ['dayofweek'],]

num_cols = [['now_week'],
            ['num_week_number'],
            ['sin_num_week_number', 'cos_num_week_number'],
            ['num_hour'],
            ['sin_num_hour', 'cos_num_hour'],
            ['num_dayofweek'],
            ['sin_num_dayofweek', 'cos_num_dayofweek'],]

In [25]:
cat_cols_combinations_list = [['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']]
for cnt in range(1, len(cat_cols) + 1):
    for i in combinations(cat_cols, cnt):
        cat_cols_combinations = ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
        for j in i:
            if isinstance(j, list): cat_cols_combinations.extend(j)
            else : cat_cols_combinations.append(j)
            
        cat_cols_combinations_list += [cat_cols_combinations]

In [26]:
num_cols_combinations_list = [['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode']]
for cnt in range(1, len(num_cols) + 1):
    for i in combinations(num_cols, cnt):
        num_cols_combinations = ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode']
        for j in i:
            if isinstance(j, list): num_cols_combinations.extend(j)
            else : num_cols_combinations.append(j)
        num_cols_combinations_list += [num_cols_combinations]

In [27]:
cat_cols_and_num_cols_combinations_list = list(product(cat_cols_combinations_list, num_cols_combinations_list))

In [28]:
oof = 0

train_df, valid_df = make_dataset.get_oof_data(oof)

for cat_cols, num_cols in cat_cols_and_num_cols_combinations_list:

    seed_everything(22 + oof)

    train_dataset = CustomDataset(df = train_df, cat_cols = cat_cols, num_cols = num_cols)
    train_data_loader = DataLoader(
        train_dataset, 
        batch_size = batch_size, 
        shuffle = True, 
        drop_last = False,
        collate_fn = train_make_batch,
        num_workers = num_workers)

    valid_dataset = CustomDataset(df = valid_df, cat_cols = cat_cols, num_cols = num_cols)
    valid_data_loader = DataLoader(
        valid_dataset, 
        batch_size = 1, 
        shuffle = False, 
        drop_last = False,
        collate_fn = train_make_batch,
        num_workers = num_workers)

    model = SASRec(
        num_assessmentItemID = make_dataset.num_assessmentItemID, 
        num_testId = make_dataset.num_testId,
        num_KnowledgeTag = make_dataset.num_KnowledgeTag,
        num_large_paper_number = make_dataset.num_large_paper_number,
        num_hour = make_dataset.num_hour,
        num_dayofweek = make_dataset.num_dayofweek,
        num_week_number = make_dataset.num_week_number,
        num_cols = train_dataset.num_cols,
        cat_cols = train_dataset.cat_cols,
        emb_size = emb_size,
        hidden_units = hidden_units,
        num_heads = num_heads,
        num_layers = num_layers,
        dropout_rate = dropout_rate,
        device = device).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr = lr)
    criterion = nn.BCELoss()

    best_epoch = 0
    best_train_loss = 0
    best_roc_auc = 0

    for epoch in range(1, epochs + 1):
        tbar = tqdm(range(1))
        for _ in tbar:
            train_loss = train(model = model, data_loader = train_data_loader, criterion = criterion, optimizer = optimizer)
            roc_auc = evaluate(model = model, data_loader = valid_data_loader)
            if best_roc_auc < roc_auc:
                best_epoch = epoch
                best_train_loss = train_loss
                best_roc_auc = roc_auc

            tbar.set_description(f'OOF-{oof}| Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| roc_auc: {roc_auc:.5f}')

    print('-' * 20)
    print('cat_cols :', cat_cols)
    print('num_cols :', num_cols)
    print(f'BEST OOF-{oof}| Epoch: {best_epoch:3d}| Train loss: {best_train_loss:.5f}| roc_auc: {best_roc_auc:.5f}')
    print('-' * 20)

OOF-0| Epoch:   1| Train loss: 0.50756| roc_auc: 0.82033: 100%|██████████| 1/1 [01:54<00:00, 114.45s/it]
OOF-0| Epoch:   2| Train loss: 0.46816| roc_auc: 0.84098: 100%|██████████| 1/1 [01:53<00:00, 113.82s/it]
OOF-0| Epoch:   3| Train loss: 0.45429| roc_auc: 0.84726: 100%|██████████| 1/1 [01:53<00:00, 113.22s/it]
OOF-0| Epoch:   4| Train loss: 0.44831| roc_auc: 0.85348: 100%|██████████| 1/1 [01:51<00:00, 111.42s/it]
OOF-0| Epoch:   5| Train loss: 0.44370| roc_auc: 0.85476: 100%|██████████| 1/1 [01:54<00:00, 114.70s/it]
OOF-0| Epoch:   6| Train loss: 0.44123| roc_auc: 0.85441: 100%|██████████| 1/1 [01:53<00:00, 113.71s/it]
OOF-0| Epoch:   7| Train loss: 0.43887| roc_auc: 0.85293: 100%|██████████| 1/1 [01:53<00:00, 113.09s/it]
OOF-0| Epoch:   8| Train loss: 0.43690| roc_auc: 0.85497: 100%|██████████| 1/1 [01:53<00:00, 113.36s/it]
OOF-0| Epoch:   9| Train loss: 0.43538| roc_auc: 0.85426: 100%|██████████| 1/1 [01:52<00:00, 112.19s/it]
OOF-0| Epoch:  10| Train loss: 0.43365| roc_auc: 0.8565

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode']
BEST OOF-0| Epoch:  10| Train loss: 0.43365| roc_auc: 0.85652
--------------------


OOF-0| Epoch:   1| Train loss: 0.50721| roc_auc: 0.82651: 100%|██████████| 1/1 [01:53<00:00, 113.11s/it]
OOF-0| Epoch:   2| Train loss: 0.46710| roc_auc: 0.84505: 100%|██████████| 1/1 [01:52<00:00, 112.31s/it]
OOF-0| Epoch:   3| Train loss: 0.45339| roc_auc: 0.85121: 100%|██████████| 1/1 [01:53<00:00, 113.01s/it]
OOF-0| Epoch:   4| Train loss: 0.44703| roc_auc: 0.85472: 100%|██████████| 1/1 [01:52<00:00, 112.67s/it]
OOF-0| Epoch:   5| Train loss: 0.44336| roc_auc: 0.85278: 100%|██████████| 1/1 [01:53<00:00, 113.98s/it]
OOF-0| Epoch:   6| Train loss: 0.44041| roc_auc: 0.85523: 100%|██████████| 1/1 [01:53<00:00, 113.26s/it]
OOF-0| Epoch:   7| Train loss: 0.43808| roc_auc: 0.85616: 100%|██████████| 1/1 [01:52<00:00, 112.80s/it]
OOF-0| Epoch:   8| Train loss: 0.43625| roc_auc: 0.85838: 100%|██████████| 1/1 [01:51<00:00, 111.86s/it]
OOF-0| Epoch:   9| Train loss: 0.43487| roc_auc: 0.85862: 100%|██████████| 1/1 [01:54<00:00, 114.01s/it]
OOF-0| Epoch:  10| Train loss: 0.43316| roc_auc: 0.8618

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'now_week']
BEST OOF-0| Epoch:  10| Train loss: 0.43316| roc_auc: 0.86180
--------------------


OOF-0| Epoch:   1| Train loss: 0.50721| roc_auc: 0.82711: 100%|██████████| 1/1 [01:52<00:00, 112.15s/it]
OOF-0| Epoch:   2| Train loss: 0.46717| roc_auc: 0.84565: 100%|██████████| 1/1 [01:53<00:00, 113.30s/it]
OOF-0| Epoch:   3| Train loss: 0.45356| roc_auc: 0.85185: 100%|██████████| 1/1 [01:52<00:00, 112.89s/it]
OOF-0| Epoch:   4| Train loss: 0.44704| roc_auc: 0.85547: 100%|██████████| 1/1 [01:53<00:00, 113.24s/it]
OOF-0| Epoch:   5| Train loss: 0.44341| roc_auc: 0.85355: 100%|██████████| 1/1 [01:52<00:00, 112.39s/it]
OOF-0| Epoch:   6| Train loss: 0.44055| roc_auc: 0.85531: 100%|██████████| 1/1 [01:52<00:00, 112.66s/it]
OOF-0| Epoch:   7| Train loss: 0.43821| roc_auc: 0.85750: 100%|██████████| 1/1 [01:53<00:00, 113.33s/it]
OOF-0| Epoch:   8| Train loss: 0.43626| roc_auc: 0.85960: 100%|██████████| 1/1 [01:52<00:00, 112.60s/it]
OOF-0| Epoch:   9| Train loss: 0.43473| roc_auc: 0.85882: 100%|██████████| 1/1 [01:53<00:00, 113.40s/it]
OOF-0| Epoch:  10| Train loss: 0.43311| roc_auc: 0.8600

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'num_week_number']
BEST OOF-0| Epoch:  10| Train loss: 0.43311| roc_auc: 0.86007
--------------------


OOF-0| Epoch:   1| Train loss: 0.51049| roc_auc: 0.82731: 100%|██████████| 1/1 [01:53<00:00, 113.69s/it]
OOF-0| Epoch:   2| Train loss: 0.46779| roc_auc: 0.84595: 100%|██████████| 1/1 [01:53<00:00, 113.21s/it]
OOF-0| Epoch:   3| Train loss: 0.45344| roc_auc: 0.85166: 100%|██████████| 1/1 [01:52<00:00, 112.65s/it]
OOF-0| Epoch:   4| Train loss: 0.44724| roc_auc: 0.85427: 100%|██████████| 1/1 [01:53<00:00, 113.26s/it]
OOF-0| Epoch:   5| Train loss: 0.44358| roc_auc: 0.85250: 100%|██████████| 1/1 [01:52<00:00, 112.40s/it]
OOF-0| Epoch:   6| Train loss: 0.44074| roc_auc: 0.85444: 100%|██████████| 1/1 [01:53<00:00, 113.17s/it]
OOF-0| Epoch:   7| Train loss: 0.43840| roc_auc: 0.85535: 100%|██████████| 1/1 [01:51<00:00, 111.66s/it]
OOF-0| Epoch:   8| Train loss: 0.43642| roc_auc: 0.85580: 100%|██████████| 1/1 [01:52<00:00, 112.60s/it]
OOF-0| Epoch:   9| Train loss: 0.43481| roc_auc: 0.85577: 100%|██████████| 1/1 [01:54<00:00, 114.12s/it]
OOF-0| Epoch:  10| Train loss: 0.43315| roc_auc: 0.8559

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'sin_num_week_number', 'cos_num_week_number']
BEST OOF-0| Epoch:  10| Train loss: 0.43315| roc_auc: 0.85590
--------------------


OOF-0| Epoch:   1| Train loss: 0.50740| roc_auc: 0.82733: 100%|██████████| 1/1 [01:53<00:00, 113.58s/it]
OOF-0| Epoch:   2| Train loss: 0.46739| roc_auc: 0.84597: 100%|██████████| 1/1 [01:53<00:00, 113.89s/it]
OOF-0| Epoch:   3| Train loss: 0.45361| roc_auc: 0.85239: 100%|██████████| 1/1 [01:53<00:00, 113.65s/it]
OOF-0| Epoch:   4| Train loss: 0.44708| roc_auc: 0.85626: 100%|██████████| 1/1 [01:53<00:00, 113.90s/it]
OOF-0| Epoch:   5| Train loss: 0.44348| roc_auc: 0.85424: 100%|██████████| 1/1 [01:53<00:00, 113.63s/it]
OOF-0| Epoch:   6| Train loss: 0.44052| roc_auc: 0.85638: 100%|██████████| 1/1 [01:53<00:00, 113.47s/it]
OOF-0| Epoch:   7| Train loss: 0.43818| roc_auc: 0.85892: 100%|██████████| 1/1 [01:53<00:00, 113.55s/it]
OOF-0| Epoch:   8| Train loss: 0.43630| roc_auc: 0.86102: 100%|██████████| 1/1 [01:52<00:00, 112.54s/it]
OOF-0| Epoch:   9| Train loss: 0.43474| roc_auc: 0.85977: 100%|██████████| 1/1 [01:53<00:00, 113.96s/it]
OOF-0| Epoch:  10| Train loss: 0.43323| roc_auc: 0.8614

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'num_hour']
BEST OOF-0| Epoch:  10| Train loss: 0.43323| roc_auc: 0.86142
--------------------


OOF-0| Epoch:   1| Train loss: 0.51044| roc_auc: 0.82618: 100%|██████████| 1/1 [01:53<00:00, 113.23s/it]
OOF-0| Epoch:   2| Train loss: 0.46776| roc_auc: 0.84604: 100%|██████████| 1/1 [01:54<00:00, 114.91s/it]
OOF-0| Epoch:   3| Train loss: 0.45338| roc_auc: 0.85160: 100%|██████████| 1/1 [01:53<00:00, 113.23s/it]
OOF-0| Epoch:   4| Train loss: 0.44717| roc_auc: 0.85459: 100%|██████████| 1/1 [01:55<00:00, 115.14s/it]
OOF-0| Epoch:   5| Train loss: 0.44351| roc_auc: 0.85294: 100%|██████████| 1/1 [01:52<00:00, 112.98s/it]
OOF-0| Epoch:   6| Train loss: 0.44065| roc_auc: 0.85542: 100%|██████████| 1/1 [01:54<00:00, 114.13s/it]
OOF-0| Epoch:   7| Train loss: 0.43825| roc_auc: 0.85601: 100%|██████████| 1/1 [01:54<00:00, 114.09s/it]
OOF-0| Epoch:   8| Train loss: 0.43660| roc_auc: 0.85690: 100%|██████████| 1/1 [01:54<00:00, 114.66s/it]
OOF-0| Epoch:   9| Train loss: 0.43492| roc_auc: 0.85728: 100%|██████████| 1/1 [01:53<00:00, 113.93s/it]
OOF-0| Epoch:  10| Train loss: 0.43322| roc_auc: 0.8563

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'sin_num_hour', 'cos_num_hour']
BEST OOF-0| Epoch:   9| Train loss: 0.43492| roc_auc: 0.85728
--------------------


OOF-0| Epoch:   1| Train loss: 0.50730| roc_auc: 0.82767: 100%|██████████| 1/1 [01:52<00:00, 112.56s/it]
OOF-0| Epoch:   2| Train loss: 0.46731| roc_auc: 0.84598: 100%|██████████| 1/1 [01:53<00:00, 113.25s/it]
OOF-0| Epoch:   3| Train loss: 0.45365| roc_auc: 0.85263: 100%|██████████| 1/1 [01:53<00:00, 113.19s/it]
OOF-0| Epoch:   4| Train loss: 0.44710| roc_auc: 0.85634: 100%|██████████| 1/1 [01:52<00:00, 112.09s/it]
OOF-0| Epoch:   5| Train loss: 0.44346| roc_auc: 0.85409: 100%|██████████| 1/1 [01:52<00:00, 112.43s/it]
OOF-0| Epoch:   6| Train loss: 0.44059| roc_auc: 0.85579: 100%|██████████| 1/1 [01:52<00:00, 112.43s/it]
OOF-0| Epoch:   7| Train loss: 0.43824| roc_auc: 0.85841: 100%|██████████| 1/1 [01:52<00:00, 112.68s/it]
OOF-0| Epoch:   8| Train loss: 0.43631| roc_auc: 0.85917: 100%|██████████| 1/1 [01:52<00:00, 112.59s/it]
OOF-0| Epoch:   9| Train loss: 0.43471| roc_auc: 0.85954: 100%|██████████| 1/1 [01:55<00:00, 115.22s/it]
OOF-0| Epoch:  10| Train loss: 0.43329| roc_auc: 0.8611

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'num_dayofweek']
BEST OOF-0| Epoch:  10| Train loss: 0.43329| roc_auc: 0.86117
--------------------


OOF-0| Epoch:   1| Train loss: 0.51053| roc_auc: 0.82704: 100%|██████████| 1/1 [01:52<00:00, 112.19s/it]
OOF-0| Epoch:   2| Train loss: 0.46769| roc_auc: 0.84575: 100%|██████████| 1/1 [01:52<00:00, 112.21s/it]
OOF-0| Epoch:   3| Train loss: 0.45333| roc_auc: 0.85095: 100%|██████████| 1/1 [01:52<00:00, 112.87s/it]
OOF-0| Epoch:   4| Train loss: 0.44712| roc_auc: 0.85464: 100%|██████████| 1/1 [01:52<00:00, 112.80s/it]
OOF-0| Epoch:   5| Train loss: 0.44417| roc_auc: 0.85255: 100%|██████████| 1/1 [01:52<00:00, 112.50s/it]
OOF-0| Epoch:   6| Train loss: 0.44113| roc_auc: 0.85510: 100%|██████████| 1/1 [01:53<00:00, 113.55s/it]
OOF-0| Epoch:   7| Train loss: 0.43853| roc_auc: 0.85406: 100%|██████████| 1/1 [01:53<00:00, 113.47s/it]
OOF-0| Epoch:   8| Train loss: 0.43677| roc_auc: 0.85571: 100%|██████████| 1/1 [01:53<00:00, 113.41s/it]
OOF-0| Epoch:   9| Train loss: 0.43522| roc_auc: 0.85622: 100%|██████████| 1/1 [01:53<00:00, 113.81s/it]
OOF-0| Epoch:  10| Train loss: 0.43355| roc_auc: 0.8557

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'sin_num_dayofweek', 'cos_num_dayofweek']
BEST OOF-0| Epoch:   9| Train loss: 0.43522| roc_auc: 0.85622
--------------------


OOF-0| Epoch:   1| Train loss: 0.50988| roc_auc: 0.82829: 100%|██████████| 1/1 [01:54<00:00, 114.83s/it]
OOF-0| Epoch:   2| Train loss: 0.46736| roc_auc: 0.84665: 100%|██████████| 1/1 [01:55<00:00, 115.43s/it]
OOF-0| Epoch:   3| Train loss: 0.45303| roc_auc: 0.85112: 100%|██████████| 1/1 [01:54<00:00, 114.32s/it]
OOF-0| Epoch:   4| Train loss: 0.44689| roc_auc: 0.85350: 100%|██████████| 1/1 [01:55<00:00, 115.76s/it]
OOF-0| Epoch:   5| Train loss: 0.44325| roc_auc: 0.85176: 100%|██████████| 1/1 [01:54<00:00, 114.52s/it]
OOF-0| Epoch:   6| Train loss: 0.44042| roc_auc: 0.85414: 100%|██████████| 1/1 [01:54<00:00, 114.30s/it]
OOF-0| Epoch:   7| Train loss: 0.43795| roc_auc: 0.85566: 100%|██████████| 1/1 [01:53<00:00, 113.83s/it]
OOF-0| Epoch:   8| Train loss: 0.43594| roc_auc: 0.85587: 100%|██████████| 1/1 [01:55<00:00, 115.10s/it]
OOF-0| Epoch:   9| Train loss: 0.43437| roc_auc: 0.85624: 100%|██████████| 1/1 [01:54<00:00, 114.12s/it]
OOF-0| Epoch:  10| Train loss: 0.43272| roc_auc: 0.8556

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'now_week', 'num_week_number']
BEST OOF-0| Epoch:   9| Train loss: 0.43437| roc_auc: 0.85624
--------------------


OOF-0| Epoch:   1| Train loss: 0.50705| roc_auc: 0.82931: 100%|██████████| 1/1 [01:54<00:00, 114.84s/it]
OOF-0| Epoch:   2| Train loss: 0.46612| roc_auc: 0.84492: 100%|██████████| 1/1 [01:54<00:00, 114.99s/it]
OOF-0| Epoch:   3| Train loss: 0.45268| roc_auc: 0.84942: 100%|██████████| 1/1 [01:54<00:00, 114.90s/it]
OOF-0| Epoch:   4| Train loss: 0.44649| roc_auc: 0.85225: 100%|██████████| 1/1 [01:54<00:00, 114.25s/it]
OOF-0| Epoch:   5| Train loss: 0.44292| roc_auc: 0.84912: 100%|██████████| 1/1 [01:55<00:00, 115.72s/it]
OOF-0| Epoch:   6| Train loss: 0.44006| roc_auc: 0.85127: 100%|██████████| 1/1 [01:55<00:00, 115.24s/it]
OOF-0| Epoch:   7| Train loss: 0.43793| roc_auc: 0.85362: 100%|██████████| 1/1 [01:54<00:00, 114.43s/it]
OOF-0| Epoch:   8| Train loss: 0.43615| roc_auc: 0.85516: 100%|██████████| 1/1 [01:55<00:00, 115.05s/it]
OOF-0| Epoch:   9| Train loss: 0.43420| roc_auc: 0.85559: 100%|██████████| 1/1 [01:53<00:00, 113.16s/it]
OOF-0| Epoch:  10| Train loss: 0.43330| roc_auc: 0.8568

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'now_week', 'sin_num_week_number', 'cos_num_week_number']
BEST OOF-0| Epoch:  10| Train loss: 0.43330| roc_auc: 0.85687
--------------------


OOF-0| Epoch:   1| Train loss: 0.50998| roc_auc: 0.82821: 100%|██████████| 1/1 [01:53<00:00, 113.71s/it]
OOF-0| Epoch:   2| Train loss: 0.46752| roc_auc: 0.84677: 100%|██████████| 1/1 [01:54<00:00, 114.20s/it]
OOF-0| Epoch:   3| Train loss: 0.45318| roc_auc: 0.85151: 100%|██████████| 1/1 [01:54<00:00, 114.11s/it]
OOF-0| Epoch:   4| Train loss: 0.44720| roc_auc: 0.85332: 100%|██████████| 1/1 [01:54<00:00, 114.33s/it]
OOF-0| Epoch:   5| Train loss: 0.44334| roc_auc: 0.85211: 100%|██████████| 1/1 [01:53<00:00, 113.69s/it]
OOF-0| Epoch:   6| Train loss: 0.44051| roc_auc: 0.85432: 100%|██████████| 1/1 [01:54<00:00, 114.79s/it]
OOF-0| Epoch:   7| Train loss: 0.43806| roc_auc: 0.85590: 100%|██████████| 1/1 [01:54<00:00, 114.96s/it]
OOF-0| Epoch:   8| Train loss: 0.43613| roc_auc: 0.85635: 100%|██████████| 1/1 [01:54<00:00, 114.82s/it]
OOF-0| Epoch:   9| Train loss: 0.43444| roc_auc: 0.85581: 100%|██████████| 1/1 [01:54<00:00, 114.62s/it]
OOF-0| Epoch:  10| Train loss: 0.43280| roc_auc: 0.8564

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'now_week', 'num_hour']
BEST OOF-0| Epoch:  10| Train loss: 0.43280| roc_auc: 0.85641
--------------------


OOF-0| Epoch:   1| Train loss: 0.50705| roc_auc: 0.82993: 100%|██████████| 1/1 [01:54<00:00, 114.99s/it]
OOF-0| Epoch:   2| Train loss: 0.46603| roc_auc: 0.84546: 100%|██████████| 1/1 [01:53<00:00, 113.83s/it]
OOF-0| Epoch:   3| Train loss: 0.45263| roc_auc: 0.84905: 100%|██████████| 1/1 [01:54<00:00, 114.87s/it]
OOF-0| Epoch:   4| Train loss: 0.44652| roc_auc: 0.85236: 100%|██████████| 1/1 [01:55<00:00, 115.25s/it]
OOF-0| Epoch:   5| Train loss: 0.44290| roc_auc: 0.84958: 100%|██████████| 1/1 [01:55<00:00, 115.41s/it]
OOF-0| Epoch:   6| Train loss: 0.44007| roc_auc: 0.85230: 100%|██████████| 1/1 [01:55<00:00, 115.17s/it]
OOF-0| Epoch:   7| Train loss: 0.43794| roc_auc: 0.85297: 100%|██████████| 1/1 [01:55<00:00, 115.22s/it]
OOF-0| Epoch:   8| Train loss: 0.43636| roc_auc: 0.85515: 100%|██████████| 1/1 [01:55<00:00, 115.16s/it]
OOF-0| Epoch:   9| Train loss: 0.43417| roc_auc: 0.85541: 100%|██████████| 1/1 [01:54<00:00, 114.05s/it]
OOF-0| Epoch:  10| Train loss: 0.43314| roc_auc: 0.8563

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'now_week', 'sin_num_hour', 'cos_num_hour']
BEST OOF-0| Epoch:  10| Train loss: 0.43314| roc_auc: 0.85638
--------------------


OOF-0| Epoch:   1| Train loss: 0.50989| roc_auc: 0.82822: 100%|██████████| 1/1 [01:52<00:00, 112.90s/it]
OOF-0| Epoch:   2| Train loss: 0.46746| roc_auc: 0.84722: 100%|██████████| 1/1 [01:53<00:00, 113.73s/it]
OOF-0| Epoch:   3| Train loss: 0.45324| roc_auc: 0.85104: 100%|██████████| 1/1 [01:53<00:00, 113.42s/it]
OOF-0| Epoch:   4| Train loss: 0.44696| roc_auc: 0.85280: 100%|██████████| 1/1 [01:53<00:00, 113.16s/it]
OOF-0| Epoch:   5| Train loss: 0.44328| roc_auc: 0.85196: 100%|██████████| 1/1 [01:53<00:00, 113.04s/it]
OOF-0| Epoch:   6| Train loss: 0.44041| roc_auc: 0.85445: 100%|██████████| 1/1 [01:52<00:00, 112.87s/it]
OOF-0| Epoch:   7| Train loss: 0.43794| roc_auc: 0.85579: 100%|██████████| 1/1 [01:53<00:00, 113.25s/it]
OOF-0| Epoch:   8| Train loss: 0.43590| roc_auc: 0.85544: 100%|██████████| 1/1 [01:55<00:00, 115.37s/it]
OOF-0| Epoch:   9| Train loss: 0.43462| roc_auc: 0.85591: 100%|██████████| 1/1 [01:52<00:00, 112.87s/it]
OOF-0| Epoch:  10| Train loss: 0.43296| roc_auc: 0.8562

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'now_week', 'num_dayofweek']
BEST OOF-0| Epoch:  10| Train loss: 0.43296| roc_auc: 0.85625
--------------------


OOF-0| Epoch:   1| Train loss: 0.50703| roc_auc: 0.82931: 100%|██████████| 1/1 [01:55<00:00, 115.05s/it]
OOF-0| Epoch:   2| Train loss: 0.46610| roc_auc: 0.84545: 100%|██████████| 1/1 [01:53<00:00, 113.61s/it]
OOF-0| Epoch:   3| Train loss: 0.45275| roc_auc: 0.84935: 100%|██████████| 1/1 [01:53<00:00, 113.72s/it]
OOF-0| Epoch:   4| Train loss: 0.44663| roc_auc: 0.85247: 100%|██████████| 1/1 [01:54<00:00, 114.36s/it]
OOF-0| Epoch:   5| Train loss: 0.44293| roc_auc: 0.84970: 100%|██████████| 1/1 [01:54<00:00, 114.16s/it]
OOF-0| Epoch:   6| Train loss: 0.44015| roc_auc: 0.85271: 100%|██████████| 1/1 [01:54<00:00, 114.45s/it]
OOF-0| Epoch:   7| Train loss: 0.43808| roc_auc: 0.85394: 100%|██████████| 1/1 [01:53<00:00, 113.44s/it]
OOF-0| Epoch:   8| Train loss: 0.43625| roc_auc: 0.85561: 100%|██████████| 1/1 [01:54<00:00, 114.48s/it]
OOF-0| Epoch:   9| Train loss: 0.43412| roc_auc: 0.85571: 100%|██████████| 1/1 [01:53<00:00, 113.95s/it]
OOF-0| Epoch:  10| Train loss: 0.43312| roc_auc: 0.8571

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'now_week', 'sin_num_dayofweek', 'cos_num_dayofweek']
BEST OOF-0| Epoch:  10| Train loss: 0.43312| roc_auc: 0.85715
--------------------


OOF-0| Epoch:   1| Train loss: 0.50727| roc_auc: 0.83067: 100%|██████████| 1/1 [01:54<00:00, 114.26s/it]
OOF-0| Epoch:   2| Train loss: 0.46606| roc_auc: 0.84680: 100%|██████████| 1/1 [01:54<00:00, 114.91s/it]
OOF-0| Epoch:   3| Train loss: 0.45274| roc_auc: 0.85030: 100%|██████████| 1/1 [01:53<00:00, 113.47s/it]
OOF-0| Epoch:   4| Train loss: 0.44686| roc_auc: 0.85368: 100%|██████████| 1/1 [01:53<00:00, 113.12s/it]
OOF-0| Epoch:   5| Train loss: 0.44312| roc_auc: 0.85108: 100%|██████████| 1/1 [01:55<00:00, 115.01s/it]
OOF-0| Epoch:   6| Train loss: 0.44027| roc_auc: 0.85420: 100%|██████████| 1/1 [01:54<00:00, 114.21s/it]
OOF-0| Epoch:   7| Train loss: 0.43805| roc_auc: 0.85558: 100%|██████████| 1/1 [01:53<00:00, 113.29s/it]
OOF-0| Epoch:   8| Train loss: 0.43643| roc_auc: 0.85660: 100%|██████████| 1/1 [01:53<00:00, 113.49s/it]
OOF-0| Epoch:   9| Train loss: 0.43464| roc_auc: 0.85658: 100%|██████████| 1/1 [01:54<00:00, 114.22s/it]
OOF-0| Epoch:  10| Train loss: 0.43345| roc_auc: 0.8581

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'num_week_number', 'sin_num_week_number', 'cos_num_week_number']
BEST OOF-0| Epoch:  10| Train loss: 0.43345| roc_auc: 0.85818
--------------------


OOF-0| Epoch:   1| Train loss: 0.51026| roc_auc: 0.82815: 100%|██████████| 1/1 [01:53<00:00, 113.53s/it]
OOF-0| Epoch:   2| Train loss: 0.46753| roc_auc: 0.84668: 100%|██████████| 1/1 [01:53<00:00, 113.47s/it]
OOF-0| Epoch:   3| Train loss: 0.45326| roc_auc: 0.85196: 100%|██████████| 1/1 [01:54<00:00, 114.29s/it]
OOF-0| Epoch:   4| Train loss: 0.44710| roc_auc: 0.85439: 100%|██████████| 1/1 [01:54<00:00, 114.66s/it]
OOF-0| Epoch:   5| Train loss: 0.44347| roc_auc: 0.85302: 100%|██████████| 1/1 [01:53<00:00, 113.33s/it]
OOF-0| Epoch:   6| Train loss: 0.44066| roc_auc: 0.85487: 100%|██████████| 1/1 [01:54<00:00, 114.02s/it]
OOF-0| Epoch:   7| Train loss: 0.43827| roc_auc: 0.85697: 100%|██████████| 1/1 [01:54<00:00, 114.19s/it]
OOF-0| Epoch:   8| Train loss: 0.43639| roc_auc: 0.85753: 100%|██████████| 1/1 [01:54<00:00, 114.31s/it]
OOF-0| Epoch:   9| Train loss: 0.43477| roc_auc: 0.85710: 100%|██████████| 1/1 [01:54<00:00, 114.16s/it]
OOF-0| Epoch:  10| Train loss: 0.43304| roc_auc: 0.8560

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'num_week_number', 'num_hour']
BEST OOF-0| Epoch:   8| Train loss: 0.43639| roc_auc: 0.85753
--------------------


OOF-0| Epoch:   1| Train loss: 0.50726| roc_auc: 0.83077: 100%|██████████| 1/1 [01:53<00:00, 113.90s/it]
OOF-0| Epoch:   2| Train loss: 0.46607| roc_auc: 0.84588: 100%|██████████| 1/1 [01:54<00:00, 114.00s/it]
OOF-0| Epoch:   3| Train loss: 0.45274| roc_auc: 0.84892: 100%|██████████| 1/1 [01:54<00:00, 114.58s/it]
OOF-0| Epoch:   4| Train loss: 0.44674| roc_auc: 0.85336: 100%|██████████| 1/1 [01:54<00:00, 114.22s/it]
OOF-0| Epoch:   5| Train loss: 0.44309| roc_auc: 0.85057: 100%|██████████| 1/1 [01:54<00:00, 114.84s/it]
OOF-0| Epoch:   6| Train loss: 0.44019| roc_auc: 0.85377: 100%|██████████| 1/1 [01:55<00:00, 115.24s/it]
OOF-0| Epoch:   7| Train loss: 0.43801| roc_auc: 0.85453: 100%|██████████| 1/1 [01:53<00:00, 113.73s/it]
OOF-0| Epoch:   8| Train loss: 0.43617| roc_auc: 0.85517: 100%|██████████| 1/1 [01:54<00:00, 114.94s/it]
OOF-0| Epoch:   9| Train loss: 0.43436| roc_auc: 0.85663: 100%|██████████| 1/1 [01:55<00:00, 115.20s/it]
OOF-0| Epoch:  10| Train loss: 0.43324| roc_auc: 0.8569

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'num_week_number', 'sin_num_hour', 'cos_num_hour']
BEST OOF-0| Epoch:  10| Train loss: 0.43324| roc_auc: 0.85694
--------------------


OOF-0| Epoch:   1| Train loss: 0.51025| roc_auc: 0.82880: 100%|██████████| 1/1 [01:55<00:00, 115.01s/it]
OOF-0| Epoch:   2| Train loss: 0.46759| roc_auc: 0.84755: 100%|██████████| 1/1 [01:58<00:00, 118.88s/it]
OOF-0| Epoch:   3| Train loss: 0.45349| roc_auc: 0.85173: 100%|██████████| 1/1 [01:57<00:00, 117.75s/it]
OOF-0| Epoch:   4| Train loss: 0.44728| roc_auc: 0.85399: 100%|██████████| 1/1 [01:58<00:00, 118.51s/it]
OOF-0| Epoch:   5| Train loss: 0.44361| roc_auc: 0.85330: 100%|██████████| 1/1 [01:57<00:00, 117.38s/it]
OOF-0| Epoch:   6| Train loss: 0.44070| roc_auc: 0.85510: 100%|██████████| 1/1 [01:58<00:00, 118.06s/it]
OOF-0| Epoch:   7| Train loss: 0.43830| roc_auc: 0.85673: 100%|██████████| 1/1 [01:58<00:00, 118.33s/it]
OOF-0| Epoch:   8| Train loss: 0.43652| roc_auc: 0.85756: 100%|██████████| 1/1 [01:57<00:00, 117.10s/it]
OOF-0| Epoch:   9| Train loss: 0.43480| roc_auc: 0.85634: 100%|██████████| 1/1 [01:57<00:00, 117.88s/it]
OOF-0| Epoch:  10| Train loss: 0.43317| roc_auc: 0.8556

--------------------
cat_cols : ['assessmentItemID2idx', 'testId2idx', 'KnowledgeTag2idx', 'large_paper_number2idx']
num_cols : ['now_elapsed', 'assessmentItemID_mean_now_elapsed', 'assessmentItemID_std_now_elapsed', 'assessmentItemID_mean_answerCode', 'assessmentItemID_std_answerCode', 'num_week_number', 'num_dayofweek']
BEST OOF-0| Epoch:   8| Train loss: 0.43652| roc_auc: 0.85756
--------------------


OOF-0| Epoch:   1| Train loss: 0.50726| roc_auc: 0.83075: 100%|██████████| 1/1 [01:57<00:00, 117.37s/it]
  0%|          | 0/1 [00:00<?, ?it/s]