# Transformer seq2seq Model

In [1]:
%load_ext autoreload
%autoreload 2

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim

print("Torch Version:", torch.__version__)

import torchtext
# from torchtext.legacy.datasets import Multi30k
# from torchtext.legacy.data import Field, BucketIterator
from torchtext.data import Field, BucketIterator

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# import spacy
import numpy as np
import unicodedata
import re
import numpy as np
import os
import io

import random
import math
import time
# import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from transformer_pyt import Encoder, Decoder, Seq2Seq

Torch Version: 1.5.0


In [3]:
# from torchtext.data.utils import get_tokenizer
from torchtext.data import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [4]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


In [5]:
data_dir = "/recsys_data/RecSys/h_and_m_personalized_fashion_recommendation"
file_name = "hnm_3w_sessionized.txt" # "hnm_big.txt"
seq_file_name = "seq_" + file_name
test_seq_file = "seq_test_" + file_name
colsep = "\t"

inp_seq_len, tgt_seq_len = 12, 12
BATCH_SIZE = 256
num_examples = None
file_path = os.path.join(data_dir, seq_file_name)
test_file_path = os.path.join(data_dir, test_seq_file)

tokenizer = get_tokenizer("basic_english")
# en_tokenizer = get_tokenizer(language='en')

tokens = tokenizer('0924243001 0924243002 0923758001 0918522001 0909370001 0866731001 0751471001 0915529003 0915529005 0448509014 0762846027 0714790020')
tokens

['0924243001',
 '0924243002',
 '0923758001',
 '0918522001',
 '0909370001',
 '0866731001',
 '0751471001',
 '0915529003',
 '0915529005',
 '0448509014',
 '0762846027',
 '0714790020']

In [6]:
def build_vocab(filepath, tokenizer, index=-1):
    counter = Counter()
    with io.open(filepath, encoding="utf8") as f:
        for string_ in f:
            if index == -1:
                counter.update(tokenizer(string_.strip().split('\t')[0]))
                counter.update(tokenizer(string_.strip().split('\t')[1]))
            else:
                counter.update(tokenizer(string_.strip().split('\t')[index]))
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

def data_process(filepath, tokenizer, vocab, test_flag=False):
    raw_iter = iter(io.open(filepath, encoding="utf8"))
    data = []
    for raw in raw_iter:
        if test_flag:
            src = raw.strip()
            src_tensor = torch.tensor([vocab[token] for token in tokenizer(src)], dtype=torch.long)
            data.append(src_tensor)
        else:
            src, tgt = raw.strip().split('\t')
            src_tensor = torch.tensor([vocab[token] for token in tokenizer(src)], dtype=torch.long)
            tgt_tensor = torch.tensor([vocab[token] for token in tokenizer(tgt)], dtype=torch.long)
            data.append((src_tensor, tgt_tensor))
    return data

In [7]:
src_vocab = build_vocab(file_path, tokenizer)
all_data = data_process(file_path, tokenizer, src_vocab)
train_data, val_data = train_test_split(all_data, test_size=0.2)
test_data = data_process(test_file_path, tokenizer, src_vocab, test_flag=True)
len(all_data), len(train_data), len(val_data), len(test_data)

(71460, 57168, 14292, 48709)

In [8]:
src_vocab['<unk>'], src_vocab['<pad>'], src_vocab['<bos>'], src_vocab['<eos>']

(0, 1, 2, 3)

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [10]:
PAD_IDX = src_vocab['<pad>']
BOS_IDX = src_vocab['<bos>']
EOS_IDX = src_vocab['<eos>']

def generate_batch(data_batch):
    de_batch, en_batch = [], []
    for (de_item, en_item) in data_batch:
        de_batch.append(torch.cat([torch.tensor([BOS_IDX]), de_item, torch.tensor([EOS_IDX])], dim=0))
        en_batch.append(torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0))
    de_batch = pad_sequence(de_batch, padding_value=PAD_IDX)
    en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
    return de_batch, en_batch

train_iterator = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
valid_iterator = DataLoader(val_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
test_iterator = DataLoader(test_data, batch_size=BATCH_SIZE,
                       shuffle=False, collate_fn=generate_batch)

In [11]:
INPUT_DIM = len(src_vocab)
OUTPUT_DIM = len(src_vocab)
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

In [12]:
SRC_PAD_IDX = 0
TRG_PAD_IDX = 0

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [13]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 20,510,680 trainable parameters


In [14]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [15]:
LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [16]:
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [58]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for _, (src, trg) in tqdm(enumerate(iterator)):
  
        src, trg = src.to(device), trg.to(device)
        src = src.permute(1, 0)
        trg = trg.permute(1, 0)
#         print(src.shape, trg.shape)
        
        optimizer.zero_grad()
        
        output, _ = model(src, trg[:,:-1])
                
        #output = [batch size, trg len - 1, output dim]
        #trg = [batch size, trg len]
            
        output_dim = output.shape[-1]
            
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)
                
        #output = [batch size * trg len - 1, output dim]
        #trg = [batch size * trg len - 1]
            
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    all_maps = []    
    with torch.no_grad():
    
        for _, (src, trg) in tqdm(enumerate(iterator)):

            src, trg = src.to(device), trg.to(device)
            src = src.permute(1, 0)
            trg = trg.permute(1, 0)

            output, _ = model(src, trg[:,:-1])

            # Mean Average Precision Calculation
            prediction = torch.argmax(output, axis=-1)
            mapr = map_batch(trg[:,1:], prediction)
            all_maps.append(mapr)
                
            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)
            
            #output = [batch size * trg len - 1, output dim]
            #trg = [batch size * trg len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator), np.mean(all_maps)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def rel(true, pred):
    return 1 if true == pred else 0


def precision_k(actual, predicted, k) -> float:
    actual_set = set(actual[:k])
    predicted_set = set(predicted[:k])
    precision_k_value = len(actual_set & predicted_set) / k

    return precision_k_value


def mAP_k(actual, predicted) -> float:
    # actual = row['valid_true'].split() # prediction_string --> prediction list
    # predicted = row['valid_pred'].split() # prediction_string --> prediction list

    M = min(len(actual), len(predicted))
    K = min(M, 12)

    if M == 0:
        return 0
    else:
        score = 0
        for k in range(1, K + 1):
            precision_k_value = precision_k(actual, predicted, k)

            score += precision_k_value * rel(actual[k - 1], predicted[k - 1])
        return score


def map_batch(label, prediction):
    """
    label: (batch, 12)
    prediction: (batch, 12)
    """
    pred = prediction.cpu().numpy()
    label = label.cpu().numpy()
    maps = []
    for ii in range(prediction.shape[0]):
        l_ii = [x for x in label[ii,:] if x not in [0, 1, 2, 3]]
        p_ii = [x for x in pred[ii,:] if x not in [0, 1, 2, 3]]
        if len(p_ii) > 0:
            maps.append(mAP_k(l_ii, p_ii))
        else:
            maps.append(0)
    return np.mean(maps)


def evaluate_map(model, iterator):
    
    model.eval()
    all_maps = []
    with torch.no_grad():
    
        for _, (src, trg) in tqdm(enumerate(iterator)):

            src, trg = src.to(device), trg.to(device)
            src = src.permute(1, 0)
            trg = trg.permute(1, 0)

            logits, _ = model(src, trg[:,:-1])
            
            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]
            
            trg = trg[:,1:]
            prediction = torch.argmax(logits, axis=-1)
            mapr = map_batch(trg, prediction)
            all_maps.append(mapr)
        
    return np.mean(all_maps)

In [60]:
import math

N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss, valid_map = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut6-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f} | Val. MAP: {valid_map:7.3f}')

224it [00:54,  4.12it/s]
56it [00:04, 11.64it/s]
0it [00:00, ?it/s]

Epoch: 01 | Time: 0m 59s
	Train Loss: 0.862 | Train PPL:   2.367
	 Val. Loss: 2.257 |  Val. PPL:   9.551 | Val. MAP:   0.053


224it [00:54,  4.11it/s]
56it [00:04, 11.51it/s]
0it [00:00, ?it/s]

Epoch: 02 | Time: 0m 59s
	Train Loss: 0.801 | Train PPL:   2.227
	 Val. Loss: 2.322 |  Val. PPL:  10.195 | Val. MAP:   0.049


224it [00:54,  4.09it/s]
56it [00:04, 11.57it/s]
0it [00:00, ?it/s]

Epoch: 03 | Time: 0m 59s
	Train Loss: 0.746 | Train PPL:   2.109
	 Val. Loss: 2.380 |  Val. PPL:  10.808 | Val. MAP:   0.043


224it [00:54,  4.10it/s]
56it [00:04, 11.37it/s]
0it [00:00, ?it/s]

Epoch: 04 | Time: 0m 59s
	Train Loss: 0.695 | Train PPL:   2.004
	 Val. Loss: 2.438 |  Val. PPL:  11.453 | Val. MAP:   0.042


224it [00:54,  4.10it/s]
56it [00:04, 11.52it/s]
0it [00:00, ?it/s]

Epoch: 05 | Time: 0m 59s
	Train Loss: 0.655 | Train PPL:   1.925
	 Val. Loss: 2.496 |  Val. PPL:  12.135 | Val. MAP:   0.042


224it [00:54,  4.10it/s]
56it [00:04, 11.57it/s]
0it [00:00, ?it/s]

Epoch: 06 | Time: 0m 59s
	Train Loss: 0.616 | Train PPL:   1.852
	 Val. Loss: 2.551 |  Val. PPL:  12.820 | Val. MAP:   0.042


80it [00:19,  4.05it/s]


KeyboardInterrupt: 

In [34]:
def predict(sentence, src_vocab, trg_vocab, model, device, max_len = tgt_seq_len):
    
    model.eval()

    tokens = tokenizer(sentence)
    tokens = [src_vocab['<bos>']] + tokens + [src_vocab['<eos>']]
    src_indexes = [src_vocab.stoi[token] for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor)
    
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    trg_indexes = [trg_vocab.stoi['<bos>']]
    for i in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        trg_mask = model.make_trg_mask(trg_tensor)
        
        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        
        pred_token = output.argmax(2)[:,-1].item()
        trg_indexes.append(pred_token)
        if pred_token == trg_vocab.stoi['<eos>']:
            break
    
    trg_tokens = [trg_vocab.itos[i] for i in trg_indexes]
    
    return trg_tokens[1:], attention

In [37]:
pred, _ = predict('13112 16042 3871 35', src_vocab, src_vocab, model, device)
pred

['1566', '<eos>']

In [55]:
evaluate_map(model, valid_iterator)

56it [00:04, 12.70it/s]


0.05807317614020755

## Create Sequences from the original data

In [63]:
colsep = "\t"
import pandas as pd
from collections import defaultdict


def get_ids(elems):
    ids = []
    for ii, e in enumerate(elems):
        if e not in prod_dict[ii]:
            prod_dict[ii][e] = len(prod_dict[ii]) + 1
        ids.append(prod_dict[ii][e])
    return ids

def break_sessions(seqs):    
    sids = sorted(list(set([x[-1] for x in seqs])))
    temp = [[] for _ in range(len(sids))]
    for seq in seqs:
        temp[seq[-1]].append(seq[:-1])
    return temp

inp_file = os.path.join(data_dir, file_name)
sample = pd.read_csv(inp_file, sep=colsep, nrows=5)
ncol = sample.shape[1]

num_prod_dim = ncol - 3  # other than u, i, t
if num_prod_dim > 0:
    prod_dict = [{} for _ in range(num_prod_dim)]

User = defaultdict(list)
with open(os.path.join(data_dir, file_name), 'r') as fr:
    for line in tqdm(fr):
        if ncol == 3:
            u, i, _ = line.rstrip().split(colsep)
        elif ncol >= 4:
            elems = line.rstrip().split(colsep)
            u, i, t = elems[0], elems[1], elems[-1]
            pdims = elems[2:-1]
            pids = get_ids(pdims)
        u = int(u)
        i = int(i)
        t = int(t)
        if ncol >= 4:
            User[u].append([i] + pids + [t])
        else:
            User[u].append(i)
print(f"Read {len(User)} user interactions")

364695it [00:02, 146284.23it/s]

Read 48709 user interactions





NameError: name 'sys' is not defined

In [66]:
all_seqs = {k: [] for k in range(num_prod_dim)}
all_seqs['prod'] = []  # one more dictionary for the products

for u in User:
    seqs = break_sessions(User[u])
    for ii in range(1, len(seqs)):
        inp, tgt = seqs[ii-1], seqs[ii]
        if len(inp) > inp_seq_len:
            inp = inp[-inp_seq_len:] # taking the last 12
        if len(tgt) > tgt_seq_len:
            tgt = tgt[:tgt_seq_len]  # taking the first 12
        inp_p = [str(ii[0]) for ii in inp]  # only the product-id
        tgt_p = [str(ii[0]) for ii in tgt]  # always only the product-id
        all_seqs['prod'].append((inp_p, tgt_p))
        for jj in range(num_prod_dim):
            inp_jj = [kk[jj+1] for kk in inp]
            tgt_jj = [kk[jj+1] for kk in tgt]
            all_seqs[jj].append((inp_jj, tgt_jj))


In [96]:
all_seqs['prod'][0]

(['1186'], ['13112', '16042', '3871', '35'])

In [98]:
[all_seqs[kk][0] for kk in range(num_prod_dim)]

[([1], [2, 2, 3, 4]),
 ([1], [1, 1, 2, 1]),
 ([1], [2, 3, 3, 4]),
 ([1], [2, 3, 4, 5]),
 ([1], [2, 2, 3, 3]),
 ([1], [1, 1, 1, 1]),
 ([1], [1, 1, 1, 1]),
 ([1], [1, 1, 1, 1]),
 ([1], [2, 2, 3, 3])]

In [105]:
def build_vocab(prod_seqs, tokenizer, index=-1):
    counter = Counter()
    for seq in prod_seqs:
        src, tgt = ' '.join(seq[0]), ' '.join(seq[1])
        if index == -1:
            counter.update(tokenizer(src))
            counter.update(tokenizer(tgt))
        else:
            counter.update(tokenizer(seq[index]))
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

def data_process(all_seqs, tokenizer, vocab, test_flag=False):
    """
    all_seqs is a dict with keys: 'prod', 0, 1, ..., num_prod_dim-1
    """
    data = []
    num_examples = len(all_seqs['prod'])
    for ii in range(num_examples):
        items = all_seqs['prod'][ii]
        meta = [all_seqs[kk][ii][0] for kk in range(num_prod_dim)] # rest of the attributes for input
        if test_flag:
            src = items[0]
            src_tensor = torch.tensor([vocab[token] for token in tokenizer(src)], dtype=torch.long)
            meta_tensor = torch.tensor(meta)
            data.append(src_tensor)
        else:
            src, tgt = items[0], items[1]
            src_tensor = torch.tensor([vocab[token] for token in src], dtype=torch.long)
            tgt_tensor = torch.tensor([vocab[token] for token in tgt], dtype=torch.long)
            meta_tensor = torch.tensor(meta)
            if src_tensor.dim() == 1:
                src_tensor = torch.unsqueeze(src_tensor, 0)
            src_tensor = torch.cat([src_tensor, meta_tensor])
            data.append((src_tensor, tgt_tensor))
    return data

In [106]:
src_vocab = build_vocab(all_seqs['prod'], tokenizer)
all_data = data_process(all_seqs, tokenizer, src_vocab)
train_data, val_data = train_test_split(all_data, test_size=0.2)
# test_data = data_process(test_file_path, tokenizer, src_vocab, test_flag=True)
len(all_data), len(train_data), len(val_data)#, len(test_data)

(71460, 57168, 14292)

71460