# Transformer seq2seq Model

In [1]:
%load_ext autoreload
%autoreload 2

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim

print("Torch Version:", torch.__version__)

import torchtext
# from torchtext.legacy.datasets import Multi30k
# from torchtext.legacy.data import Field, BucketIterator
from torchtext.data import Field, BucketIterator

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# import spacy
import numpy as np
import unicodedata
import re
import numpy as np
import os
import io

import random
import math
import time
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from transformer_pyt import Encoder, Decoder, Seq2Seq, Seq2SeqMulti
from transformer_pyt import train, evaluate
from utils import get_session_data, build_vocab_from_seqs, data_process_meta, epoch_time

Torch Version: 1.5.0


In [3]:
# from torchtext.data.utils import get_tokenizer
from torchtext.data import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [4]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [5]:
data_dir = "/recsys_data/RecSys/h_and_m_personalized_fashion_recommendation"
file_name = "hnm_7w_sessionized.txt" # "hnm_big.txt"
model_name = file_name.split(".")[0] + ".pt"
seq_file_name = "seq_" + file_name
test_seq_file = "seq_test_" + file_name
colsep = "\t"

inp_seq_len, tgt_seq_len = 12, 12
BATCH_SIZE = 256
num_examples = None
file_path = os.path.join(data_dir, seq_file_name)
test_file_path = os.path.join(data_dir, test_seq_file)

tokenizer = get_tokenizer("basic_english")
# en_tokenizer = get_tokenizer(language='en')

tokens = tokenizer('0924243001 0924243002 0923758001 0918522001 0909370001 0866731001 0751471001 0915529003 0915529005 0448509014 0762846027 0714790020')
tokens

['0924243001',
 '0924243002',
 '0923758001',
 '0918522001',
 '0909370001',
 '0866731001',
 '0751471001',
 '0915529003',
 '0915529005',
 '0448509014',
 '0762846027',
 '0714790020']

Get all the sequence information

In [6]:
inp_file = os.path.join(data_dir, file_name)

all_seqs, prod_dict = get_session_data(inp_file, inp_seq_len=inp_seq_len, tgt_seq_len=tgt_seq_len)
print(all_seqs.keys())

1332519it [00:07, 167655.99it/s]


Read 144202 user interactions
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 'prod'])


In [7]:
src_vocab = build_vocab_from_seqs(all_seqs['prod'], tokenizer)
all_data = data_process_meta(all_seqs, tokenizer, src_vocab)
train_data, val_data = train_test_split(all_data, test_size=0.2)
# test_data = data_process(test_file_path, tokenizer, src_vocab, test_flag=True)
len(all_data), len(train_data), len(val_data)#, len(test_data)

(278172, 222537, 55635)

In [8]:
src_vocab['<unk>'], src_vocab['<pad>'], src_vocab['<bos>'], src_vocab['<eos>']

(0, 1, 2, 3)

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [10]:
train_data[0]

(tensor([[ 454, 5189, 2871,  788, 2764],
         [   4,   11,   11,    4,    9],
         [   1,    1,    1,    1,    2],
         [   2,    2,    7,    2,    2],
         [   2,   20,   12,   20,    2],
         [  16,   16,   16,   16,   16],
         [   2,    2,    2,    2,    2],
         [   2,    2,    2,    2,    2],
         [  14,   14,   14,   14,   14],
         [   5,    5,    5,    5,    5]]),
 tensor([ 2105,  2105, 15553, 15553,   437,   128,  2955,   128,  1210,  1210,
           437,  2955]))

In [11]:
PAD_IDX = src_vocab['<pad>']
BOS_IDX = src_vocab['<bos>']
EOS_IDX = src_vocab['<eos>']

def generate_batch(data_batch):
    inp_batch, meta_batch, tgt_batch = [], [], []
    for (de_item, en_item) in data_batch:
        n = de_item.shape[0]
        before = torch.unsqueeze(torch.tensor([BOS_IDX] * n), 1)
        after = torch.unsqueeze(torch.tensor([EOS_IDX] * n), 1)
        total = torch.cat([before, de_item, after], dim=1)
        total = total.permute(1, 0)
        inp_batch.append(total)
        tgt_batch.append(torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0))
    
#     print(torch.cat(inp_batch, dim=0).shape)
    inp_batch = pad_sequence(inp_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return inp_batch, tgt_batch

train_iterator = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
valid_iterator = DataLoader(val_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)

In [12]:
# input dimension of each product attribute
prod_dict_dims = [max(prod_dict[k].values()) for k in range(len(prod_dict))]

Create multiple encoders - one for each product attribute

In [16]:
HID_DIM = 128
HID_DIM2 = 32
ENC_LAYERS = 2
DEC_LAYERS = 2
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 128
DEC_PF_DIM = 128
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1
OUTPUT_DIM = len(src_vocab)

encs = torch.nn.ModuleList()
prod_enc = Encoder(input_dim=len(src_vocab), 
                   hid_dim=HID_DIM, 
                   n_layers=ENC_LAYERS, 
                   n_heads=ENC_HEADS, 
                   pf_dim=ENC_PF_DIM, 
                   dropout=ENC_DROPOUT, 
                   device=device)
encs.append(prod_enc)
total_dim = HID_DIM
for pdim in prod_dict_dims:
    enc_p = Encoder(input_dim=pdim, 
                   hid_dim=HID_DIM2, 
                   n_layers=ENC_LAYERS, 
                   n_heads=ENC_HEADS, 
                   pf_dim=ENC_PF_DIM, 
                   dropout=ENC_DROPOUT, 
                   device=device)
    encs.append(enc_p)
    total_dim += HID_DIM2

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

In [17]:
SRC_PAD_IDX = 0
TRG_PAD_IDX = 0

model = Seq2SeqMulti(encs, dec, total_dim, HID_DIM, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [18]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 13,317,894 trainable parameters


In [19]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [20]:
LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [21]:
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

## Model Training

In [None]:
import math

N_EPOCHS = 20
CLIP = 1

best_valid_loss = float('inf')
best_valid_map = 0
patience, max_patience = 0, 5

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP, device)
    valid_loss, valid_map = evaluate(model, valid_iterator, criterion, device)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_map > best_valid_map:
        best_valid_map = valid_map
        torch.save(model.state_dict(), 'reco-model.pt')
    else:
        patience += 1

    if patience == max_patience:
        print("Maximum patience reached ... exiting!")
        break

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s | patience {patience}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f} | Val. MAP: {valid_map:7.3f}')

51it [00:16,  3.04it/s]

# Only Product Sequence

In [7]:
src_vocab = build_vocab_from_file(file_path, tokenizer)
all_data = data_process(file_path, tokenizer, src_vocab)
train_data, val_data = train_test_split(all_data, test_size=0.2)
test_data = data_process(test_file_path, tokenizer, src_vocab, test_flag=True)
len(all_data), len(train_data), len(val_data), len(test_data)

(71460, 57168, 14292, 48709)

In [23]:
INPUT_DIM = len(src_vocab)
OUTPUT_DIM = len(src_vocab)
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

In [10]:
PAD_IDX = src_vocab['<pad>']
BOS_IDX = src_vocab['<bos>']
EOS_IDX = src_vocab['<eos>']

def generate_batch(data_batch):
    de_batch, en_batch = [], []
    for (de_item, en_item) in data_batch:
        de_batch.append(torch.cat([torch.tensor([BOS_IDX]), de_item, torch.tensor([EOS_IDX])], dim=0))
        en_batch.append(torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0))
    de_batch = pad_sequence(de_batch, padding_value=PAD_IDX)
    en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
    return de_batch, en_batch

train_iterator = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
valid_iterator = DataLoader(val_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
test_iterator = DataLoader(test_data, batch_size=BATCH_SIZE,
                       shuffle=False, collate_fn=generate_batch)

In [24]:
SRC_PAD_IDX = 0
TRG_PAD_IDX = 0

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [34]:
def predict(sentence, src_vocab, trg_vocab, model, device, max_len = tgt_seq_len):
    
    model.eval()

    tokens = tokenizer(sentence)
    tokens = [src_vocab['<bos>']] + tokens + [src_vocab['<eos>']]
    src_indexes = [src_vocab.stoi[token] for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor)
    
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    trg_indexes = [trg_vocab.stoi['<bos>']]
    for i in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        trg_mask = model.make_trg_mask(trg_tensor)
        
        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        
        pred_token = output.argmax(2)[:,-1].item()
        trg_indexes.append(pred_token)
        if pred_token == trg_vocab.stoi['<eos>']:
            break
    
    trg_tokens = [trg_vocab.itos[i] for i in trg_indexes]
    
    return trg_tokens[1:], attention

In [37]:
pred, _ = predict('13112 16042 3871 35', src_vocab, src_vocab, model, device)
pred

['1566', '<eos>']