In [17]:
import numpy as np
import pandas as pd

import gc
import psutil
import joblib
import random
import logging
from tqdm import tqdm

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import QuantileTransformer

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [18]:
MAX_SEQ = 100
D_MODEL = 256
N_LAYER = 2
BATCH_SIZE = 256
DROPOUT_RATE = 0.1
EPOCHS = 100

In [19]:
train_df = pd.read_csv('/opt/ml/input/data/after_fe_train_test.csv')
train_df

Unnamed: 0.1,Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,...,big_category,problem_num,mid_category,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum,kind
0,0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,,0,,...,6,1,1,0.947683,0.222749,1268,0.955022,0.207410,637,train
1,1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.000000,...,6,2,1,0.947683,0.222749,1268,0.913187,0.281603,3040,train
2,2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.000000,...,6,3,1,0.947683,0.222749,1268,0.913187,0.281603,3040,train
3,3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.000000,...,6,4,1,0.947683,0.222749,1268,0.913187,0.281603,3040,train
4,4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.000000,...,6,5,1,0.947683,0.222749,1268,0.913187,0.281603,3040,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,260109,7439,A040130001,A040000130,0,2020-10-14 23:07:23,8832,8.0,11,0.727273,...,4,1,130,0.604762,0.499738,127,0.584848,0.499618,193,test
2526696,260110,7439,A040130002,A040000130,1,2020-10-14 23:07:41,8832,8.0,12,0.666667,...,4,2,130,0.604762,0.499738,127,0.584848,0.499618,193,test
2526697,260111,7439,A040130003,A040000130,1,2020-10-14 23:08:02,8244,9.0,13,0.692308,...,4,3,130,0.604762,0.499738,127,0.725490,0.476214,111,test
2526698,260112,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244,10.0,14,0.714286,...,4,4,130,0.604762,0.499738,127,0.725490,0.476214,111,test


In [20]:
train_df['user_correct_answer'].fillna(0,inplace=True)
train_df['user_acc'].fillna(method='bfill',inplace= True)
train_df = train_df.loc[train_df['answerCode'] != -1]

In [21]:
train_df = train_df[['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag', 'user_correct_answer', 
       'user_acc', 'month', 'hour', 'big_category', 'mid_category', 'test_mean', 'tag_mean']]
train_df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,KnowledgeTag,user_correct_answer,user_acc,month,hour,big_category,mid_category,test_mean,tag_mean
0,0,A060001001,A060000001,1,7224,0.0,1.000000,3,0,6,1,0.947683,0.955022
1,0,A060001002,A060000001,1,7225,1.0,1.000000,3,0,6,1,0.947683,0.913187
2,0,A060001003,A060000001,1,7225,2.0,1.000000,3,0,6,1,0.947683,0.913187
3,0,A060001004,A060000001,1,7225,3.0,1.000000,3,0,6,1,0.947683,0.913187
4,0,A060001005,A060000001,1,7225,4.0,1.000000,3,0,6,1,0.947683,0.913187
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526694,7439,A040197006,A040000197,1,2132,7.0,0.700000,8,7,4,197,0.744792,0.720930
2526695,7439,A040130001,A040000130,0,8832,8.0,0.727273,10,23,4,130,0.604762,0.584848
2526696,7439,A040130002,A040000130,1,8832,8.0,0.666667,10,23,4,130,0.604762,0.584848
2526697,7439,A040130003,A040000130,1,8244,9.0,0.692308,10,23,4,130,0.604762,0.725490


In [22]:
assessmentItemID_to_idx = {v:k for k,v in enumerate(train_df['assessmentItemID'].unique())}
idx_to_assessmentItemID = {k:v for k,v in enumerate(train_df['assessmentItemID'].unique())}

testId_to_idx = {v:k for k,v in enumerate(train_df['testId'].unique())}
idx_to_testId = {k:v for k,v in enumerate(train_df['testId'].unique())}

train_df['assessmentItemID'] = train_df['assessmentItemID'].map(assessmentItemID_to_idx)
train_df['testId'] = train_df['testId'].map(testId_to_idx)

train_df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,KnowledgeTag,user_correct_answer,user_acc,month,hour,big_category,mid_category,test_mean,tag_mean
0,0,0,0,1,7224,0.0,1.000000,3,0,6,1,0.947683,0.955022
1,0,1,0,1,7225,1.0,1.000000,3,0,6,1,0.947683,0.913187
2,0,2,0,1,7225,2.0,1.000000,3,0,6,1,0.947683,0.913187
3,0,3,0,1,7225,3.0,1.000000,3,0,6,1,0.947683,0.913187
4,0,4,0,1,7225,4.0,1.000000,3,0,6,1,0.947683,0.913187
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526694,7439,1396,217,1,2132,7.0,0.700000,8,7,4,197,0.744792,0.720930
2526695,7439,5041,809,0,8832,8.0,0.727273,10,23,4,130,0.604762,0.584848
2526696,7439,5042,809,1,8832,8.0,0.666667,10,23,4,130,0.604762,0.584848
2526697,7439,5043,809,1,8244,9.0,0.692308,10,23,4,130,0.604762,0.725490


In [23]:
skills = train_df['assessmentItemID'].unique()
n_skill = len(skills)

In [24]:
train_group = train_df[['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag', 'user_correct_answer', 
       'user_acc', 'month', 'hour', 'big_category', 'mid_category', 'test_mean', 'tag_mean']].groupby('userID').apply(lambda r: (
            r['assessmentItemID'].values,
            r['testId'].values,
            r['answerCode'].values,
            r['KnowledgeTag'].values,
            r['user_correct_answer'].values,
            r['user_acc'].values,
            r['month'].values,
            r['hour'].values,
            r['big_category'].values,
            r['mid_category'].values,
            r['test_mean'].values,
            r['tag_mean'].values))

In [25]:
class SAINTDataset(Dataset):
    def __init__(self, group, n_skill, max_seq=MAX_SEQ):
        super(SAINTDataset, self).__init__()
        self.max_seq = max_seq
        self.n_skill = n_skill
        self.samples = {}
        
        self.user_ids = []
        for user_id in group.index:
            q, test_ID, qa, kT, uCA, uAcc, month, hour, bC, mC, testM, tagM = group[user_id]
            if len(q) < 2:
                continue
            
            if len(q) > self.max_seq:
                total_questions = len(q)
                initial = total_questions % self.max_seq
                if initial >= 2:
                    self.user_ids.append(f"{user_id}_0")
                    self.samples[f"{user_id}_0"] = (q[:initial], test_ID[:initial], qa[:initial], kT[:initial], uCA[:initial], 
                                                    uAcc[:initial], month[:initial], hour[:initial], bC[:initial], mC[:initial],
                                                    testM[:initial], tagM[:initial])
                for seq in range(total_questions // self.max_seq):
                    self.user_ids.append(f"{user_id}_{seq+1}")
                    start = initial + seq * self.max_seq
                    end = initial + (seq + 1) * self.max_seq
                    self.samples[f"{user_id}_{seq+1}"] = (q[start:end], test_ID[start:end], qa[start:end], kT[start:end], uCA[start:end], 
                                                    uAcc[start:end], month[start:end], hour[start:end], bC[start:end], mC[start:end],
                                                    testM[start:end], tagM[start:end])
            else:
                user_id = str(user_id)
                self.user_ids.append(user_id)
                self.samples[user_id] = (q, test_ID, qa, kT, uCA, uAcc, month, hour, bC, mC, testM, tagM)
    
    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        user_id = self.user_ids[index]
        q_, test_ID_, qa_, kT_, uCA_, uAcc_, month_, hour_, bC_, mC_, testM_, tagM_ = self.samples[user_id]
        seq_len = len(q_)

        ## for zero padding
        # q_ = q_+1
        # pri_exp_ = pri_exp_ + 1
        # res_ = qa_ + 1
        
        q = np.zeros(self.max_seq, dtype=int)
        test_ID = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)
        kT = np.zeros(self.max_seq, dtype=int)
        uCA = np.zeros(self.max_seq, dtype=int)
        uAcc = np.zeros(self.max_seq, dtype=int)
        month = np.zeros(self.max_seq, dtype=int)
        hour = np.zeros(self.max_seq, dtype=int)
        bC = np.zeros(self.max_seq, dtype=int)
        mC = np.zeros(self.max_seq, dtype=int)
        testM = np.zeros(self.max_seq, dtype=int)
        tagM = np.zeros(self.max_seq, dtype=int)
        
        if seq_len == self.max_seq:

            q[:] = q_
            test_ID[:] = test_ID_
            qa[:] = qa_
            kT[:] = kT_
            uCA[:] = uCA_
            uAcc[:] = uAcc_
            month[:] = month_
            hour[:] = hour_
            bC[:] = bC_
            mC[:] = mC_
            testM[:] = testM_
            tagM[:] = tagM_
            
        else:
            q[-seq_len:] = q_
            test_ID[-seq_len:] = test_ID_
            qa[-seq_len:] = qa_
            kT[-seq_len:] = kT_
            uCA[-seq_len:] = uCA_
            uAcc[-seq_len:] = uAcc_
            month[-seq_len:] = month_
            hour[-seq_len:] = hour_
            bC[-seq_len:] = bC_
            mC[-seq_len:] = mC_
            testM[-seq_len:] = testM_
            tagM[-seq_len:] = tagM_
        
        target = q[1:]
        test_ID = test_ID[1:]
        kT = kT[1:]        
        uCA = uCA[1:]        
        uAcc = uAcc[1:]        
        month = month[1:]        
        hour = hour[1:]        
        bC = bC[1:]        
        mC = mC[1:]        
        testM = testM[1:]        
        tagM = tagM[1:]        
        label = qa[1:]

        return target, test_ID, kT, uCA, uAcc, month, hour, bC, mC, testM, tagM, label

In [26]:
train_dataset = SAINTDataset(train_group, n_skill)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=8)

In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [28]:
class FFN(nn.Module):
    def __init__(self, state_size=200):
        super(FFN, self).__init__()
        self.state_size = state_size

        self.lr1 = nn.Linear(state_size, state_size)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(state_size, state_size)
        self.dropout = nn.Dropout(DROPOUT_RATE)
    
    def forward(self, x):
        x = self.lr1(x)
        x = self.relu(x)
        x = self.lr2(x)
        return self.dropout(x)

def future_mask(seq_length):
    future_mask = np.triu(np.ones((seq_length, seq_length)), k=1).astype('bool')
    return torch.from_numpy(future_mask)

def get_sinusoid_encoding_table(n_seq, d_hidn):
    def cal_angle(position, i_hidn):
        return position / np.power(10000, 2 * (i_hidn // 2) / d_hidn)
    def get_posi_angle_vec(position):
        return [cal_angle(position, i_hidn) for i_hidn in range(d_hidn)]

    sinusoid_table = np.array([get_posi_angle_vec(i_seq) for i_seq in range(n_seq)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # even index sin 
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # odd index cos

    return sinusoid_table


class SAINTModel(nn.Module):
    def __init__(self, n_skill, max_seq=MAX_SEQ, embed_dim= 128):
        super(SAINTModel, self).__init__()

        self.n_skill = n_skill
        self.embed_dim = embed_dim

        self.target_embedding = nn.Embedding(self.n_skill+1, embed_dim) ## target
        self.test_id_embedding = nn.Embedding(1538, embed_dim) ## target
        self.kT_embedding = nn.Embedding(913, embed_dim) ## target
        self.uCA_embedding = nn.Embedding(1553, embed_dim) ## target
        self.uAcc_embedding = nn.Embedding(196185, embed_dim) ## target
        self.month_embedding = nn.Embedding(13, embed_dim) ## target
        self.hour_embedding = nn.Embedding(25, embed_dim) ## target
        self.bc_embedding = nn.Embedding(10, embed_dim) ## target
        self.mC_embedding = nn.Embedding(199, embed_dim) ## target
        self.testM_embedding = nn.Embedding(2472, embed_dim) ## target
        self.tagM_embedding = nn.Embedding(1709, embed_dim) ## target
        self.pos_embedding = nn.Embedding(max_seq-1, embed_dim) ## position
        # self.pos_embedding = get_sinusoid_encoding_table(max_seq-1, self.embed_dim)
        # self.pos_embedding =  torch.FloatTensor(self.pos_embedding).to(device)

        self.transformer = nn.Transformer(nhead=8, d_model = embed_dim, num_encoder_layers= N_LAYER, num_decoder_layers= N_LAYER, dropout = DROPOUT_RATE)

        self.dropout = nn.Dropout(DROPOUT_RATE)
        self.layer_normal = nn.LayerNorm(embed_dim) 
        self.ffn = FFN(embed_dim)
        self.pred = nn.Linear(embed_dim, 1)
    
    def forward(self, question, test_ID, kT, uCA, uAcc, month, hour, bC, mC, testM, tagM):

        device = question.device  

        ## embedding layer
        question = self.target_embedding(question)
        test_ID = self.test_id_embedding(test_ID)
        kT = self.kT_embedding(kT)
        uCA = self.uCA_embedding(uCA)
        uAcc = self.uAcc_embedding(uAcc)
        month = self.month_embedding(month)
        hour = self.hour_embedding(hour)
        bC = self.bc_embedding(bC)
        mC = self.mC_embedding(mC)
        testM = self.testM_embedding(testM)
        tagM = self.tagM_embedding(tagM)
        pos_id = torch.arange(question.size(1)).unsqueeze(0).to(device)
        pos_id = self.pos_embedding(pos_id)
        
        enc = question + pos_id
        dec = pos_id + test_ID + kT + uCA + uAcc + month + hour + bC + mC + testM + tagM       

        enc = enc.permute(1, 0, 2) # x: [bs, s_len, embed] => [s_len, bs, embed]
        dec = dec.permute(1, 0, 2)
        mask = future_mask(enc.size(0)).to(device)

        att_output = self.transformer(enc, dec, src_mask=mask, tgt_mask=mask, memory_mask = mask)
        att_output = self.layer_normal(att_output)
        att_output = att_output.permute(1, 0, 2) # att_output: [s_len, bs, embed] => [bs, s_len, embed]

        x = self.ffn(att_output)
        x = self.layer_normal(x + att_output)
        x = self.pred(x)

        return x.squeeze(-1)

In [29]:
model = SAINTModel(n_skill, embed_dim= D_MODEL)

## AdamW
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
criterion = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=2e-3, steps_per_epoch=len(train_dataloader), epochs=EPOCHS
)

model.to(device)
criterion.to(device)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
print(model)

SAINTModel(
  (target_embedding): Embedding(9455, 256)
  (test_id_embedding): Embedding(1537, 256)
  (kT_embedding): Embedding(912, 256)
  (uCA_embedding): Embedding(1552, 256)
  (uAcc_embedding): Embedding(196184, 256)
  (month_embedding): Embedding(12, 256)
  (hour_embedding): Embedding(24, 256)
  (bc_embedding): Embedding(9, 256)
  (mC_embedding): Embedding(198, 256)
  (testM_embedding): Embedding(2471, 256)
  (tagM_embedding): Embedding(1708, 256)
  (pos_embedding): Embedding(99, 256)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=256, bias=True)
          (norm1)

In [None]:
def train_epoch(model, dataloader, optimizer, scheduler, criterion, device="cuda"):
    model.train()

    train_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    for item in dataloader:
        q = item[0].to(device).long()
        test_id = item[1].to(device).long()
        label = item[2].to(device).float()
        kT = item[3].to(device).long()
        uCA = item[4].to(device).long()
        uAcc = item[5].to(device).long()
        month = item[6].to(device).long()
        hour = item[7].to(device).long()
        bC = item[8].to(device).long()
        mC = item[9].to(device).long()
        testM = item[10].to(device).long()
        tagM = item[11].to(device).long()
        target_mask = (q != 0)

        optimizer.zero_grad()
        output = model(q, test_id, kT, uCA, uAcc, month, hour, bC, mC, testM, tagM)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss.append(loss.item())

        output = torch.masked_select(output, target_mask)
        label = torch.masked_select(label, target_mask)
        pred = (torch.sigmoid(output) >= 0.5).long()
        
        num_corrects += (pred == label).sum().item()
        num_total += len(label)

        labels.extend(label.view(-1).data.cpu().numpy())
        outs.extend(output.view(-1).data.cpu().numpy())

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.mean(train_loss)

    return loss, acc, auc

In [None]:
best_auc = 0
over_fit = 0
last_auc = 0

for epoch in range(EPOCHS):
    print(f'------- Epoch {epoch} ---------')
    train_loss, train_acc, train_auc = train_epoch(model, train_dataloader, optimizer, scheduler,criterion)
    print('\t Train')
    print("\tepoch - {} train_loss - {:.2f} acc - {:.3f} auc - {:.3f}".format(epoch, train_loss, train_acc, train_auc))
        
    if train_auc > best_auc:
        best_auc = train_auc
        torch.save(model.state_dict(), '2nd_sakt_model.pt')
        over_fit = 0
    else:
        over_fit += 1
            
    if over_fit >= 3: # 3번 동안 auc 향상이 없으면 eearly stop
        print("early stop epoch ", epoch)
        break

------- Epoch 0 ---------


../aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [12,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [12,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [12,0,0], thread: [66,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [12,0,0], thread: [67,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [12,0,0], thread: [68,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [12,0,0], thread: [69,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [12,0,0], t

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.