# Simple LSTM Baseline

- Transformer 적용 이전, 성능의 원활한 비교를 위해 DNN baseline으로 LSTM 모델을 빌드함
- 간단한 구조로 구성

In [1]:
import os, random
from tqdm import tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

plt.rcParams["font.family"] = 'NanumGothic'

In [2]:
# fix random seed
seed = 42
random.seed(seed)
np.random.seed(seed)
_ = torch.manual_seed(seed)

# prep dataset, dataloader

## label encoding

In [3]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

train_df = pd.read_csv('../../data/train_data.csv')
test_df = pd.read_csv('../../data/test_data.csv')

class MultiLabelEncoder:
    # 참고: https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn

    def __init__(self, df):
        # define encoder dict
        self.d = defaultdict(LabelEncoder)
        # Encoding the variable
        for col in df.columns:
            self.d[col] = self.d[col].fit(df[col])
    
    def encode(self, df):
        return df.apply(lambda x: self.d[x.name].transform(x))

    def decode(self, df):
    # Inverse the encoded
        return df.apply(lambda x: self.d[x.name].inverse_transform(x))

category_cols = ['assessmentItemID', 'testId', 'KnowledgeTag']
mle = MultiLabelEncoder(train_df[category_cols])

## get sequences by user

In [4]:
def get_sequence_by_user(df, features, max_length=512, train=True):
    user_ids, inputs, masks, targets = [], [], [], []

    for user_id in tqdm(df['userID'].unique()):

        # get user data with user_id
        user_data = df[df['userID'] == user_id]
        # get additional info (previous label)
        user_data = user_data.assign(previous_label=(user_data.answerCode.shift(1)+1).fillna(0).values)
        # get sequence to numpy
        sequence = user_data[features].to_numpy()
        # get target data: last answerCode
        target = user_data['answerCode'].values[-1]

        # cut or pad sequences with max_length
        if len(sequence) < max_length:
            padding = np.zeros((max_length - len(sequence), sequence.shape[1]))
            mask = np.vstack((padding, np.ones_like(sequence)))
            sequence = np.vstack((padding, sequence))
        else:
            sequence = sequence[-max_length:]
            mask = np.ones((max_length, sequence.shape[1]))
        
        user_ids.append(user_id)
        inputs.append(sequence)
        masks.append(mask)
        targets.append(target)

    return np.array(user_ids), np.array(inputs), np.array(masks), np.array(targets)

## dataset

In [5]:
def get_data(data_path, mle, category_cols, features, max_length, train=True):
    # read data
    df = pd.read_csv(data_path)
    # preprocess
    # label encoding
    df_encoded = mle.encode(df[category_cols])
    df = pd.concat([df.drop(category_cols, axis=1), df_encoded], axis=1)

    # sequence
    return get_sequence_by_user(df, features, max_length, train)

In [6]:
train_path = '../../data/train_data.csv'
test_path = '../../data/test_data.csv'
features = ['assessmentItemID','testId','KnowledgeTag', 'previous_label']
max_length = 512

train_data = get_data(train_path, mle, category_cols, features, max_length, train=True)
test_data = get_data(test_path, mle, category_cols, features, max_length, train=False)

100%|██████████| 6698/6698 [00:32<00:00, 207.88it/s]
100%|██████████| 744/744 [00:01<00:00, 422.64it/s]


## split

In [7]:
from sklearn.model_selection import train_test_split

def train_val_split(user_ids, X, masks, target, train_size=.8):
    # split index and target
    train_index, valid_index, train_y, valid_y = train_test_split(
        range(target.shape[0]), target, train_size=.8, stratify=target)
    # split X
    train_X, valid_X = X[train_index], X[valid_index]
    # split masks
    train_masks, valid_masks = masks[train_index], masks[valid_index]
    # split users
    train_users, valid_users = user_ids[train_index], user_ids[valid_index]

    return (train_users, train_X, train_masks, train_y), (valid_users, valid_X, valid_masks, valid_y)

In [8]:
train_data, valid_data = train_val_split(*train_data)

## prep dataset

In [9]:
class SimpleSequenceDKTDataset(Dataset):
    def __init__(self, user_ids, X, mask, y=None, max_length=512, train=True):
        super().__init__()
        self.train = train

        self.user_ids = user_ids
        self.X = X
        self.mask = mask
        if self.train:
            self.y = y
    
    def __len__(self):
        return len(self.user_ids)
    
    def __getitem__(self, index):
        item = {'user_id': self.user_ids[index]}
        item['X'] = self.X[index]
        item['mask'] = self.mask[index]
        if self.train:
            item['y'] = self.y[index]
        return item
    
    def get_user_ids(self):
        return self.user_ids

In [10]:
train_dataset = SimpleSequenceDKTDataset(*train_data)
valid_dataset = SimpleSequenceDKTDataset(*valid_data)
test_dataset = SimpleSequenceDKTDataset(*test_data, train=False)
len(train_dataset), len(valid_dataset), len(test_dataset)

(5358, 1340, 744)

# Configs

In [11]:
import yaml

#read yaml file
with open('rnn_config.yaml') as file:
  config = yaml.safe_load(file)  
print(config)

{'data_dir': '.', 'data_version': 'v1', 'modelname': 'rnn', 'result_dir': '../results/', 'batch_size': 128, 'emb_dim': 16, 'hidden_dim': 16, 'activation_f': 'tanh', 'learnin_rate': 0.1, 'epochs': 100, 'patience': 20, 'T_max': 10, 'eta_min': 0}


# load dataset

In [12]:
data_dir = config['data_dir']
data_version = config['data_version']

train_dataset_v1 = torch.load(os.path.join(data_dir, f'train_dataset_{data_version}.pt'))
valid_dataset_v1 = torch.load(os.path.join(data_dir, f'valid_dataset_{data_version}.pt'))
test_dataset_v1 = torch.load(os.path.join(data_dir, f'test_dataset_{data_version}.pt'))

# dataloader

In [13]:
batch_size = config['batch_size']
train_dataloader = DataLoader(train_dataset_v1, batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset_v1, batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset_v1, batch_size)

# Modeling

In [14]:
cat_size = len(features[:-1])
cat_emb_size = [len(mle.d[feature].classes_) for feature in features[:-1]]
num_size = 1

In [15]:
class SimpleLSTMModel(nn.Module):

    def __init__(self, features, cat_size, num_size, cat_emb_size, 
        emb_dim, hidden_dim, activation_f):
        super().__init__()
        # for categorical data
        self.embedding = nn.Embedding(sum(cat_emb_size)+cat_size, emb_dim, dtype=torch.float32)
        self.cat_linear = nn.Linear(emb_dim*cat_size, hidden_dim//2, dtype=torch.float32)
        self.cat_layernorm = nn.LayerNorm(hidden_dim//2)
        # for continuous data
        self.cont_linear = nn.Linear(num_size, hidden_dim//2, dtype=torch.float32)
        self.cont_layernorm = nn.LayerNorm(hidden_dim//2)
        # lstm cell
        self.lstm_cell = nn.LSTMCell(hidden_dim, hidden_dim, activation_f, dtype=torch.float32)
        self.last_layer = nn.Linear(hidden_dim, 1, dtype=torch.float32)

    def init_params(self):
        # lstm
        nn.init.kaiming_uniform_(self.lstm_cell.weight_ih)
        nn.init.kaiming_uniform_(self.lstm_cell.weight_hh)
        nn.init.zeros_(self.lstm_cell.bias_ih)
        nn.init.zeros_(self.lstm_cell.bias_hh)
        # last layer
        nn.init.kaiming_uniform_(self.last_layer.weights)
        nn.init.zeros_(self.last_layer.bias)

    def forward(self, x):
        x, mask = x
        batch_size, seq_len, _ = x.size()

        # categorical data embedding
        offset = x.new_tensor(np.array([0, *np.cumsum(cat_emb_size)[:-1]])) + 1
        x_cat = (x[:,:,:-1] + offset).mul(mask[:,:,:-1])
        x_cat = self.embedding(x_cat.int()).view(batch_size, seq_len, -1)
        x_cat = self.cat_linear(x_cat)
        x_cat = self.cat_layernorm(x_cat)

        # continuous
        x_cont = x[:,:,-1].view(batch_size, seq_len, -1)
        x_cont = self.cont_linear(x_cont)
        x_cont = self.cont_layernorm(x_cont)

        # concat data
        x_concat = torch.concat([x_cat, x_cont], dim=-1)

        # Initial hidden state and cell state
        lstm_h0 = torch.rand(batch_size, self.lstm_cell.hidden_size).to(device)
        lstm_c0 = torch.rand(batch_size, self.lstm_cell.hidden_size).to(device)
        
        for e in range(seq_len):
            lstm_h0, lstm_c0 = self.lstm_cell(x_concat[:, e, :], (lstm_h0, lstm_c0))

        output = self.last_layer(lstm_h0)
        
        return output

In [16]:
learning_rate = config['learnin_rate']
T_max = config['T_max']
eta_min = config['eta_min']

seqlen = 512 #DATA
input_feature = 4 #DATA
emb_dim = config['emb_dim']
hidden_dim = config['hidden_dim']

In [17]:
# # at beginning of the script
# device = torch.device('cpu')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = SimpleLSTMModel(features, cat_size, num_size, cat_emb_size, 
    emb_dim, hidden_dim, 'tanh').to(device)

loss_f = nn.BCEWithLogitsLoss().to(device)
adamw = torch.optim.AdamW(model.parameters(), lr=learning_rate)
lr_schedular = torch.optim.lr_scheduler.CosineAnnealingLR(adamw, T_max=T_max, eta_min=eta_min)

In [18]:
from copy import deepcopy
from sklearn.metrics import roc_auc_score

def train_step(model, loss_f, train_dataloader, adamw, scheduler, device):
    loss_sum, auc_sum = 0, 0

    for iter, data in enumerate(train_dataloader):

        X, y = data['X'].float().to(device), data['y'].view(data['y'].size(0), 1).float().to(device)
        mask = data['mask'].float().to(device)
    
        pred = model((X, mask))
        loss = loss_f(pred, y)
        
        adamw.zero_grad()
        loss.backward()
        adamw.step()

        auc_sum += roc_auc_score(y.detach().cpu().numpy(), (torch.sigmoid(pred).detach().cpu().numpy()>=0.5).astype(int))
        loss_sum += loss.item()
    
    scheduler.step()
    
    return loss_sum/len(train_dataloader), auc_sum/len(train_dataloader)

def valid_step(model, loss_f, valid_dataloader, device):
    loss_sum = 0
    auc_sum = 0
    targets, preds = [], []

    for iter, data in enumerate(valid_dataloader):
        X, y = data['X'].float().to(device), data['y'].view(data['y'].size(0), 1).float().to(device)
        mask = data['mask'].float().to(device)
        pred = model((X, mask))
        
        loss = loss_f(pred, y)

        auc_sum += roc_auc_score(y.detach().cpu().numpy(), (torch.sigmoid(pred).detach().cpu().numpy()>=0.5).astype(int))
        loss_sum += loss.item()

        targets.extend(data['y'].detach().numpy())
        preds.extend(torch.sigmoid(pred).detach().cpu().numpy())
        
    return targets, preds, loss_sum/len(valid_dataloader), auc_sum/len(valid_dataloader)

In [19]:
epochs = config['epochs']
best_auc, best_epochs = 0, 0
least_loss, patience, num = 1e+9, config['patience'], 0
best_model = None

for e in range(epochs):

    train_loss, train_auc = train_step(model, loss_f, train_dataloader, adamw, lr_schedular, device)
    _, _, valid_loss, valid_auc = valid_step(model, loss_f, valid_dataloader, device)

    if best_auc < valid_auc:
        best_auc, best_epochs = valid_auc, e
        best_model = deepcopy(model.state_dict())

    print(f'[{e} epochs] train_loss: {train_loss:.4f}, valid_loss: {valid_loss:.4f}, train_auc: {train_auc:.4f}, valid_auc: {valid_auc:.4f}')
    
    if valid_loss < least_loss:
        least_loss, num = valid_loss, 0
    else:
        num += 1
        if num >= patience:
            print(f'early stopped at {e} epoch')
            break

[0 epochs] train_loss: 0.6585, valid_loss: 0.5864, train_auc: 0.5919, valid_auc: 0.6857
[1 epochs] train_loss: 0.5779, valid_loss: 0.5815, train_auc: 0.7057, valid_auc: 0.7105
[2 epochs] train_loss: 0.5333, valid_loss: 0.6041, train_auc: 0.7422, valid_auc: 0.6721
[3 epochs] train_loss: 0.5141, valid_loss: 0.7061, train_auc: 0.7481, valid_auc: 0.6662
[4 epochs] train_loss: 0.4902, valid_loss: 0.6335, train_auc: 0.7687, valid_auc: 0.6939
[5 epochs] train_loss: 0.4481, valid_loss: 0.6416, train_auc: 0.7910, valid_auc: 0.7025
[6 epochs] train_loss: 0.4146, valid_loss: 0.6660, train_auc: 0.8097, valid_auc: 0.6901
[7 epochs] train_loss: 0.3903, valid_loss: 0.6856, train_auc: 0.8218, valid_auc: 0.6901
[8 epochs] train_loss: 0.3723, valid_loss: 0.6908, train_auc: 0.8334, valid_auc: 0.6888
[9 epochs] train_loss: 0.3617, valid_loss: 0.7086, train_auc: 0.8380, valid_auc: 0.6890
[10 epochs] train_loss: 0.3583, valid_loss: 0.6969, train_auc: 0.8402, valid_auc: 0.6871
[11 epochs] train_loss: 0.3590,

In [20]:
model.load_state_dict(best_model)
targets, preds, valid_loss, valid_auc = valid_step(model, loss_f, valid_dataloader, device)
_, _, train_loss, train_auc = valid_step(model, loss_f, train_dataloader, device)
train_auc, valid_auc

(0.7536527477769102, 0.7087290938911681)

In [21]:
from sklearn.metrics import confusion_matrix
confusion_matrix(targets, np.array(preds) > 0.5)

array([[477, 221],
       [170, 472]])

In [22]:
# test auc
def test_step(model, loss_f, test_dataloader, device):
    user_ids, test_pred_proba = [], []

    for iter, data in enumerate(test_dataloader):
        X = data['X'].float().to(device)
        mask = data['mask'].float().to(device)
        pred = model((X, mask))

        user_ids.extend(data['user_id'].detach().numpy())
        test_pred_proba.extend(torch.sigmoid(pred).detach().cpu().numpy())
        
    return user_ids, test_pred_proba

In [23]:
user_ids, test_pred_proba = test_step(model, loss_f, test_dataloader, device)

In [24]:
submission_df = pd.read_csv('../../data/sample_submission.csv')
submission_df.prediction = np.array(test_pred_proba)

In [25]:
from datetime import datetime as dt
result_dir = config['result_dir']
now = dt.strftime(dt.now(), '%y%m%d-%H%M%S')
modelname = config['modelname']
savename = f'{modelname}_{now}_{valid_auc:.4f}.csv'
submission_df.to_csv(os.path.join(result_dir, savename), index=False)