# Simple RNN Baseline

- Transformer 적용 이전, 성능의 원활한 비교를 위해 DNN baseline으로 RNN 모델을 빌드함
- 간단한 구조로 구성

In [1]:
import os, random
from tqdm import tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

plt.rcParams["font.family"] = 'NanumGothic'

In [2]:
# fix random seed
seed = 42
random.seed(seed)
np.random.seed(seed)
_ = torch.manual_seed(seed)

# prep dataset, dataloader

## label encoding

In [3]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

train_df = pd.read_csv('../../data/train_data.csv')
test_df = pd.read_csv('../../data/test_data.csv')

class MultiLabelEncoder:
    # 참고: https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn

    def __init__(self, df):
        # define encoder dict
        self.d = defaultdict(LabelEncoder)
        # Encoding the variable
        for col in df.columns:
            self.d[col] = self.d[col].fit(df[col])
    
    def encode(self, df):
        return df.apply(lambda x: self.d[x.name].transform(x))

    def decode(self, df):
    # Inverse the encoded
        return df.apply(lambda x: self.d[x.name].inverse_transform(x))

category_cols = ['assessmentItemID', 'testId', 'KnowledgeTag']
mle = MultiLabelEncoder(train_df[category_cols])

## get sequences by user

In [4]:
def get_sequence_by_user(df, features, max_length=512, train=True):
    user_ids, inputs, masks, targets = [], [], [], []

    for user_id in tqdm(df['userID'].unique()):

        # get user data with user_id
        user_data = df[df['userID'] == user_id]
        # get additional info (previous label)
        user_data = user_data.assign(previous_label=(user_data.answerCode.shift(1)+1).fillna(0).values)
        # get sequence to numpy
        sequence = user_data[features].to_numpy()
        # get target data: last answerCode
        target = user_data['answerCode'].values[-1]

        # cut or pad sequences with max_length
        if len(sequence) < max_length:
            padding = np.zeros((max_length - len(sequence), sequence.shape[1]))
            mask = np.vstack((padding, np.ones_like(sequence)))
            sequence = np.vstack((padding, sequence))
        else:
            sequence = sequence[-max_length:]
            mask = np.ones((max_length, sequence.shape[1]))
        
        user_ids.append(user_id)
        inputs.append(sequence)
        masks.append(mask)
        targets.append(target)

    return np.array(user_ids), np.array(inputs), np.array(masks), np.array(targets)

## dataset

In [5]:
def get_data(data_path, mle, category_cols, features, max_length, train=True):
    # read data
    df = pd.read_csv(data_path)
    # preprocess
    # label encoding
    df_encoded = mle.encode(df[category_cols])
    df = pd.concat([df.drop(category_cols, axis=1), df_encoded], axis=1)

    # sequence
    return get_sequence_by_user(df, features, max_length, train)

In [6]:
train_path = '../../data/train_data.csv'
test_path = '../../data/test_data.csv'
features = ['assessmentItemID','testId','KnowledgeTag', 'previous_label']
max_length = 512

train_data = get_data(train_path, mle, category_cols, features, max_length, train=True)
test_data = get_data(test_path, mle, category_cols, features, max_length, train=False)

100%|██████████| 6698/6698 [00:32<00:00, 208.28it/s]
100%|██████████| 744/744 [00:01<00:00, 408.90it/s]


## split

In [7]:
from sklearn.model_selection import train_test_split

def train_val_split(user_ids, X, masks, target, train_size=.8):
    # split index and target
    train_index, valid_index, train_y, valid_y = train_test_split(
        range(target.shape[0]), target, train_size=.8, stratify=target)
    # split X
    train_X, valid_X = X[train_index], X[valid_index]
    # split masks
    train_masks, valid_masks = masks[train_index], masks[valid_index]
    # split users
    train_users, valid_users = user_ids[train_index], user_ids[valid_index]

    return (train_users, train_X, train_masks, train_y), (valid_users, valid_X, valid_masks, valid_y)

In [8]:
train_data, valid_data = train_val_split(*train_data)

## prep dataset

In [9]:
class SimpleSequenceDKTDataset(Dataset):
    def __init__(self, user_ids, X, mask, y=None, max_length=512, train=True):
        super().__init__()
        self.train = train

        self.user_ids = user_ids
        self.X = X
        self.mask = mask
        if self.train:
            self.y = y
    
    def __len__(self):
        return len(self.user_ids)
    
    def __getitem__(self, index):
        item = {'user_id': self.user_ids[index]}
        item['X'] = self.X[index]
        item['mask'] = self.mask[index]
        if self.train:
            item['y'] = self.y[index]
        return item
    
    def get_user_ids(self):
        return self.user_ids

In [10]:
train_dataset = SimpleSequenceDKTDataset(*train_data)
valid_dataset = SimpleSequenceDKTDataset(*valid_data)
test_dataset = SimpleSequenceDKTDataset(*test_data, train=False)
len(train_dataset), len(valid_dataset), len(test_dataset)

(5358, 1340, 744)

## dataloader

In [11]:
batch_size = 128
train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size)

# Modeling

In [12]:
cat_size = len(features[:-1])
cat_emb_size = [len(mle.d[feature].classes_) for feature in features[:-1]]
num_size = 1

In [13]:
class SimpleRNNModel(nn.Module):

    def __init__(self, features, cat_size, num_size, cat_emb_size, hdim, activation_f):
        super().__init__()
        # for categorical data
        self.embedding = nn.Embedding(sum(cat_emb_size)+cat_size, hdim, dtype=torch.float32)
        self.cat_linear = nn.Linear(hdim*cat_size, hdim//2, dtype=torch.float32)
        self.cat_layernorm = nn.LayerNorm(hdim//2)
        # for continuous data
        self.cont_linear = nn.Linear(num_size, hdim//2, dtype=torch.float32)
        self.cont_layernorm = nn.LayerNorm(hdim//2)
        # rnn cell
        self.rnn_cell = nn.RNNCell(hdim, hdim, activation_f, dtype=torch.float32)
        self.last_layer = nn.Linear(hdim, 1, dtype=torch.float32)

    def init_params(self):
        # rnn
        nn.init.kaiming_uniform_(self.rnn_cell.weight_ih)
        nn.init.kaiming_uniform_(self.rnn_cell.weight_hh)
        nn.init.zeros_(self.rnn_cell.bias_ih)
        nn.init.zeros_(self.rnn_cell.bias_hh)
        # last layer
        nn.init.kaiming_uniform_(self.last_layer.weights)
        nn.init.zeros_(self.last_layer.bias)

    def forward(self, x):
        x, mask = x
        batch_size, seq_len, _ = x.size()

        # categorical data embedding
        offset = x.new_tensor(np.array([0, *np.cumsum(cat_emb_size)[:-1]])) + 1
        x_cat = (x[:,:,:-1] + offset).mul(mask[:,:,:-1])
        x_cat = self.embedding(x_cat.int()).view(batch_size, seq_len, -1)
        x_cat = self.cat_linear(x_cat)
        x_cat = self.cat_layernorm(x_cat)

        # continuous
        x_cont = x[:,:,-1].view(batch_size, seq_len, -1)
        x_cont = self.cont_linear(x_cont)
        x_cont = self.cont_layernorm(x_cont)

        # concat data
        x_concat = torch.concat([x_cat, x_cont], dim=-1)

        rnn_hidden = torch.rand(batch_size, self.rnn_cell.hidden_size, 
            dtype=torch.float32).to(device)
        
        for e in range(seq_len):
            rnn_hidden = self.rnn_cell(x_concat[:, e, :], rnn_hidden)

        output = self.last_layer(rnn_hidden)
        
        return output

In [14]:
learning_rate = 1e-1
T_max = 10
eta_min = 0

seqlen = 512
input_feature = 4
hdim = 16

In [15]:
# # at beginning of the script
# device = torch.device('cpu')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = SimpleRNNModel(features, cat_size, num_size, cat_emb_size, hdim, 'tanh').to(device)

loss_f = nn.BCEWithLogitsLoss().to(device)
adamw = torch.optim.AdamW(model.parameters(), lr=learning_rate)
lr_schedular = torch.optim.lr_scheduler.CosineAnnealingLR(adamw, T_max=T_max, eta_min=eta_min)

In [16]:
from copy import deepcopy
from sklearn.metrics import roc_auc_score

def train_step(model, loss_f, train_dataloader, adamw, scheduler, device):
    loss_sum, auc_sum = 0, 0

    for iter, data in enumerate(train_dataloader):

        X, y = data['X'].float().to(device), data['y'].view(data['y'].size(0), 1).float().to(device)
        mask = data['mask'].float().to(device)
    
        pred = model((X, mask))
        loss = loss_f(pred, y)
        
        adamw.zero_grad()
        loss.backward()
        adamw.step()

        auc_sum += roc_auc_score(y.detach().cpu().numpy(), (torch.sigmoid(pred).detach().cpu().numpy()>=0.5).astype(int))
        loss_sum += loss.item()
    
    scheduler.step()
    
    return loss_sum/len(train_dataloader), auc_sum/len(train_dataloader)

def valid_step(model, loss_f, valid_dataloader, device):
    loss_sum = 0
    auc_sum = 0
    targets, preds = [], []

    for iter, data in enumerate(valid_dataloader):
        X, y = data['X'].float().to(device), data['y'].view(data['y'].size(0), 1).float().to(device)
        mask = data['mask'].float().to(device)
        pred = model((X, mask))
        
        loss = loss_f(pred, y)

        auc_sum += roc_auc_score(y.detach().cpu().numpy(), (torch.sigmoid(pred).detach().cpu().numpy()>=0.5).astype(int))
        loss_sum += loss.item()

        targets.extend(data['y'].detach().numpy())
        preds.extend(torch.sigmoid(pred).detach().cpu().numpy())
        
    return targets, preds, loss_sum/len(valid_dataloader), auc_sum/len(valid_dataloader)

In [17]:
epochs = 30
best_auc, best_epochs = 0, 0
least_loss, patience, num = 1e+9, 10, 0
best_model = None

for e in range(epochs):

    train_loss, train_auc = train_step(model, loss_f, train_dataloader, adamw, lr_schedular, device)
    _, _, valid_loss, valid_auc = valid_step(model, loss_f, valid_dataloader, device)

    if best_auc < valid_auc:
        best_auc, best_epochs = valid_auc, e
        best_model = deepcopy(model.state_dict())

    print(f'[{e} epochs] train_loss: {train_loss:.4f}, valid_loss: {valid_loss:.4f}, train_auc: {train_auc:.4f}, valid_auc: {valid_auc:.4f}')
    
    if valid_loss < least_loss:
        least_loss, num = valid_loss, 0
    else:
        num += 1
        if num >= patience:
            print(f'early stopped at {e} epoch')
            break

[0 epochs] train_loss: 0.6767, valid_loss: 0.6087, train_auc: 0.5869, valid_auc: 0.6701
[1 epochs] train_loss: 0.5938, valid_loss: 0.6139, train_auc: 0.6912, valid_auc: 0.6702
[2 epochs] train_loss: 0.5687, valid_loss: 0.6028, train_auc: 0.7142, valid_auc: 0.6857
[3 epochs] train_loss: 0.5456, valid_loss: 0.6164, train_auc: 0.7334, valid_auc: 0.6988
[4 epochs] train_loss: 0.5277, valid_loss: 0.5914, train_auc: 0.7465, valid_auc: 0.6781
[5 epochs] train_loss: 0.5156, valid_loss: 0.6221, train_auc: 0.7532, valid_auc: 0.6684
[6 epochs] train_loss: 0.5048, valid_loss: 0.6077, train_auc: 0.7581, valid_auc: 0.6797
[7 epochs] train_loss: 0.4929, valid_loss: 0.6302, train_auc: 0.7700, valid_auc: 0.6777
[8 epochs] train_loss: 0.4759, valid_loss: 0.6179, train_auc: 0.7764, valid_auc: 0.6986
[9 epochs] train_loss: 0.4628, valid_loss: 0.6198, train_auc: 0.7847, valid_auc: 0.6872
[10 epochs] train_loss: 0.4559, valid_loss: 0.6226, train_auc: 0.7897, valid_auc: 0.6916
[11 epochs] train_loss: 0.4563,

In [18]:
model.load_state_dict(best_model)
targets, preds, valid_loss, valid_auc = valid_step(model, loss_f, valid_dataloader, device)
_, _, train_loss, train_auc = valid_step(model, loss_f, train_dataloader, device)
best_auc, best_epochs, train_auc, valid_auc

(0.6987918758704563, 3, 0.7497908939301835, 0.7006010379213227)

In [19]:
from sklearn.metrics import confusion_matrix
confusion_matrix(targets, np.array(preds) > 0.5)

array([[544, 154],
       [249, 393]])

In [20]:
# test auc
def test_step(model, loss_f, test_dataloader, device):
    user_ids, test_pred_proba = [], []

    for iter, data in enumerate(test_dataloader):
        X = data['X'].float().to(device)
        mask = data['mask'].float().to(device)
        pred = model((X, mask))

        user_ids.extend(data['user_id'].detach().numpy())
        test_pred_proba.extend(torch.sigmoid(pred).detach().cpu().numpy())
        
    return user_ids, test_pred_proba

In [21]:
user_ids, test_pred_proba = test_step(model, loss_f, test_dataloader, device)

In [22]:
submission_df = pd.read_csv('../../data/sample_submission.csv')
submission_df.prediction = np.array(test_pred_proba)

In [23]:
from datetime import datetime as dt
result_dir = '../results/'
now = dt.strftime(dt.now(), '%y%m%d-%H%M%S')
modelname = 'rnn'
savename = f'{modelname}_{now}_{valid_auc:.4f}.csv'
submission_df.to_csv(os.path.join(result_dir, savename), index=False)