In [1]:
%matplotlib inline

In [2]:
# picking the most free GPU resource as cuda device
import subprocess
import sys
import os

import torch
import pandas as pd

if sys.version_info[0] < 3:
    from StringIO import StringIO
else:
    from io import StringIO

torch.cuda.empty_cache()


def get_free_gpu():
    gpu_stats = subprocess.check_output(
        ["nvidia-smi", "--format=csv", "--query-gpu=memory.used,memory.free"]
    )
    gpu_df = pd.read_csv(
        StringIO(gpu_stats.decode("utf-8")),
        names=["memory.used", "memory.free"],
        skiprows=1,
    )
    print("GPU usage:\n{}".format(gpu_df))
    gpu_df["memory.free"] = gpu_df["memory.free"].map(
        lambda x: int(x.rstrip(" MiB"))
    )
    idx = gpu_df["memory.free"].idxmax()
    print(
        "Returning GPU{} with {} free MiB".format(
            idx, gpu_df.iloc[idx]["memory.free"]
        )
    )
    return idx


cmd = "export CUDA_VISIBLE_DEVICES=1,2,3,4,5,6"
os.popen(cmd)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"using {device}")
if torch.cuda.is_available():
    free_gpu_id = get_free_gpu()
    print(f"using GPU id: {free_gpu_id}")
    torch.cuda.set_device(free_gpu_id)

using cuda
GPU usage:
  memory.used memory.free
0       2 MiB   24266 MiB
1       2 MiB   24266 MiB
2       2 MiB   24266 MiB
3   15209 MiB    9059 MiB
4   15209 MiB    9059 MiB
5    4033 MiB   20235 MiB
Returning GPU0 with 24266 free MiB
using GPU id: 0



Language Translation with nn.Transformer and torchtext
======================================================

This tutorial shows, how to train a translation model from scratch using
Transformer. We will be using `Multi30k <http://www.statmt.org/wmt16/multimodal-task.html#task1>`__ 
dataset to train a German to English translation model.



Data Sourcing and Processing
----------------------------

`torchtext library <https://pytorch.org/text/stable/>`__ has utilities for creating datasets that can be easily
iterated through for the purposes of creating a language translation
model. In this example, we show how to use torchtext's inbuilt datasets, 
tokenize a raw text sentence, build vocabulary, and numericalize tokens into tensor. We will use
`Multi30k dataset from torchtext library <https://pytorch.org/text/stable/datasets.html#multi30k>`__
that yields a pair of source-target raw sentences. 





In [3]:
from torchtext.data.datasets_utils import _RawTextIterableDataset


def get_multiwoz(split=("train", "val", "test")):
    files = {
        "train": "../data/train.history_belief",
        "val": "../data/val.history_belief",
        "test": "../data/test.history_belief",
    }
    datas = []
    for name in split:
        with open(files[name]) as fp:
            raw_text = fp.read()
            raw_text = raw_text.replace(
                "<|endofcontext|>", " <|endofcontext|>"
            )
            raw_text = raw_text.replace("<|endoftext|>", "")
            texts = raw_text.split("\n")

            data = []

            for text in texts:
                if not text.split():
                    continue
                split_index = text.find("<|belief|>")
                input_text = text[: split_index - 1]
                belief = text[split_index:]
                data.append((input_text, belief))
            datas.append(
                _RawTextIterableDataset("MULTIWOZ", len(data), iter(data))
            )

    return (data for data in datas)

In [4]:
train_set, val_set, test_set = get_multiwoz()

In [5]:
from random import sample


def split_iterable_dataset(dataset, ratio=0.5):
    n = int(len(dataset) * ratio)
    samples = sample(list(dataset), n)
    return (
        _RawTextIterableDataset("MULTIWOZ", len(samples), iter(samples)),
        samples,
    )

In [6]:
downsize_train_iter, downsize_train_data = split_iterable_dataset(
    train_set, 1
)
downsize_val_iter, downsize_val_data = split_iterable_dataset(val_set, 0.01)
test_data = list(test_set)

In [7]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List

# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable) -> List[str]:
    for data_sample in data_iter:
        yield data_sample[0].split() + data_sample[1].split()


# Define special symbols and indices
UNK_IDX, PAD_IDX = 0, 1

INPUT_SOS = "<|context|>"
INPUT_EOS = "<|endofcontext|>"
OUTPUT_SOS = "<|belief|>"
OUTPUT_EOS = "<|endofbelief|>"
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ["<unk>", "<pad>"]

vocab = build_vocab_from_iterator(
    yield_tokens(downsize_train_iter),
    min_freq=1,
    specials=special_symbols,
    special_first=True,
)
vocab.set_default_index(UNK_IDX)
OUTPUT_EOS_IDX = vocab([OUTPUT_EOS])[0]
OUTPUT_SOS_IDX = vocab([OUTPUT_SOS])[0]

Seq2Seq Network using Transformer
---------------------------------

Transformer is a Seq2Seq model introduced in `“Attention is all you
need” <https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf>`__
paper for solving machine translation tasks. 
Below, we will create a Seq2Seq network that uses Transformer. The network
consists of three parts. First part is the embedding layer. This layer converts tensor of input indices
into corresponding tensor of input embeddings. These embedding are further augmented with positional
encodings to provide position information of input tokens to the model. The second part is the 
actual `Transformer <https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html>`__ model. 
Finally, the output of Transformer model is passed through linear layer
that give un-normalized probabilities for each token in the target language. 




In [8]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(
            -torch.arange(0, emb_size, 2) * math.log(10000) / emb_size
        )
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer("pos_embedding", pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(
            token_embedding + self.pos_embedding[: token_embedding.size(0), :]
        )


# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)


# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(
        self,
        num_encoder_layers: int,
        num_decoder_layers: int,
        emb_size: int,
        nhead: int,
        src_vocab_size: int,
        tgt_vocab_size: int,
        dim_feedforward: int = 512,
        dropout: float = 0.1,
    ):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout
        )

    def forward(
        self,
        src: Tensor,
        trg: Tensor,
        src_mask: Tensor,
        tgt_mask: Tensor,
        src_padding_mask: Tensor,
        tgt_padding_mask: Tensor,
        memory_key_padding_mask: Tensor,
    ):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(
            src_emb,
            tgt_emb,
            src_mask,
            tgt_mask,
            None,
            src_padding_mask,
            tgt_padding_mask,
            memory_key_padding_mask,
        )
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(
            self.positional_encoding(self.src_tok_emb(src)), src_mask
        )

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(
            self.positional_encoding(self.tgt_tok_emb(tgt)), memory, tgt_mask
        )

During training, we need a subsequent word mask that will prevent model to look into
the future words when making predictions. We will also need masks to hide
source and target padding tokens. Below, let's define a function that will take care of both. 




In [9]:
def generate_square_subsequent_mask(sz, device=torch.device("cpu")):
    mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(
        0, 1
    )
    mask = (
        mask.float()
        .masked_fill(mask == 0, float("-inf"))
        .masked_fill(mask == 1, float(0.0))
    )
    return mask


def create_mask(src, tgt, device=torch.device("cpu")):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len, device=device)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(
        torch.bool
    )

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [10]:
def belief_to_state_list(belief):
    if OUTPUT_EOS in belief:
        first_idx_eos = belief.index(OUTPUT_EOS)
        belief = belief[:first_idx_eos]
    belief_list = [
        token for token in belief if token not in [OUTPUT_SOS, OUTPUT_EOS]
    ]
    belief_list = [slot.split() for slot in " ".join(belief_list).split(",")]
    return belief_list


def belief_to_state_dict(belief):
    belief_list = belief_to_state_list(belief)
    state_dict = {}
    for state in belief_list:
        if len(state) < 3:
            continue
        domain = state[0]
        slot = state[1]
        sub_slot = None
        rest = state[2:]
        if slot == "book":
            sub_slot = state[2]
            rest = state[3:]
        value = " ".join(rest)
        d = state_dict.get(domain, {})
        if sub_slot:
            ss = d.get(slot, {})
            ss.update({sub_slot: value})
            d.update({slot: ss})
        else:
            d.update({slot: value})
        state_dict.update({domain: d})
    return state_dict


def match_slot(true, pred):
    pred_state = belief_to_state_dict(pred)
    true_list = belief_to_state_list(true)
    slot_matches = []
    for i, state in enumerate(true_list):
        slot_matches.append(False)
        if len(state) < 3:
            continue
        domain = state[0]
        if domain not in pred_state.keys():
            continue

        slot = state[1]
        if slot not in pred_state[domain].keys():
            continue

        if slot != "book":
            true_value = " ".join(state[2:])
            pred_value = pred_state[domain][slot]
        else:
            sub_slot = state[2]
            if sub_slot not in pred_state[domain][slot]:
                continue
            true_value = " ".join(state[3:])
            pred_value = pred_state[domain][slot][sub_slot]

        if true_value != pred_value:
            continue
        slot_matches[i] = True

    all_match = sum(slot_matches) == len(true_list)

    return all_match, slot_matches


def get_accuracy(results):
    total_states = len(results)
    total_slots = sum([len(result[1]) for result in results])
    total_correct_states = sum([result[0] for result in results])
    total_correct_slots = sum([sum(result[1]) for result in results])
    return {
        "joint_accuracy": total_correct_states / total_states,
        "slot_accuracy": total_correct_slots / total_slots,
    }

Let's now define the parameters of our model and instantiate the same. Below, we also 
define our loss function which is the cross-entropy loss and the optmizer used for training.




In [11]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab)
TGT_VOCAB_SIZE = len(vocab)
EMB_SIZE = 128
NHEAD = 4
FFN_HID_DIM = 512
BATCH_SIZE = 64
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(
    NUM_ENCODER_LAYERS,
    NUM_DECODER_LAYERS,
    EMB_SIZE,
    NHEAD,
    SRC_VOCAB_SIZE,
    TGT_VOCAB_SIZE,
    FFN_HID_DIM,
)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(device)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(
    transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9
)

Collation
---------

As seen in the ``Data Sourcing and Processing`` section, our data iterator yields a pair of raw strings. 
We need to convert these string pairs into the batched tensors that can be processed by our ``Seq2Seq`` network 
defined previously. Below we define our collate function that convert batch of raw strings into batch tensors that
can be fed directly into our model.   




In [12]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return torch.tensor(txt_input)

    return func


# src and tgt language text transforms to convert raw strings into tensors indices
text_transform_func = sequential_transforms(
    lambda s: s.split(), vocab,  # Tokenization  # Numericalization
)


# function to collate data samples into batch tesors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform_func(src_sample.rstrip("\n")))
        tgt_batch.append(text_transform_func(tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

Let's define training and evaluation loop that will be called for each 
epoch.




In [13]:
from tqdm import tqdm
from torch.utils.data import DataLoader


def train_epoch(model, optimizer):
    model.train()
    losses = 0
    # (train_iter,) = get_multiwoz(split=("train",))
    train_dataloader = DataLoader(
        downsize_train_data, batch_size=BATCH_SIZE, collate_fn=collate_fn
    )

    for src, tgt in train_dataloader:
        src = src.to(device)
        tgt = tgt.to(device)
        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(
            src, tgt_input, device=device
        )

        logits = model(
            src,
            tgt_input,
            src_mask,
            tgt_mask,
            src_padding_mask,
            tgt_padding_mask,
            src_padding_mask,
        )

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(
            logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)
        )
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(train_dataloader)


def evaluate(model):
    model.eval()
    losses = 0
    val_model = model.to(torch.device("cpu"))
    # (val_iter,) = get_multiwoz(split=("val",))
    val_dataloader = DataLoader(
        downsize_val_data, batch_size=BATCH_SIZE, collate_fn=collate_fn
    )

    for src, tgt in val_dataloader:
        src = src.to(torch.device("cpu"))
        tgt = tgt.to(torch.device("cpu"))

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(
            src, tgt_input, device=torch.device("cpu")
        )

        logits = val_model(
            src,
            tgt_input,
            src_mask,
            tgt_mask,
            src_padding_mask,
            tgt_padding_mask,
            src_padding_mask,
        )

        tgt_out = tgt[1:, :]
        loss = loss_fn(
            logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)
        )
        losses += loss.item()

    return losses / len(val_dataloader)


In [14]:
torch.cuda.empty_cache()

Now we have all the ingredients to train our model. Let's do it!




In [15]:
from timeit import default_timer as timer

NUM_EPOCHS = 60
min_val_loss = float("inf")
train_losses = []
val_losses = []
epoches = []
for epoch in range(1, NUM_EPOCHS + 1):
    torch.cuda.empty_cache()
    start_time = timer()
    transformer = transformer.to(device)
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    torch.cuda.empty_cache()
    transformer = transformer.to(torch.device("cuda"))
    val_loss = evaluate(transformer)
    if val_loss < min_val_loss:
        min_val_loss = val_loss
        torch.save(
            {
                "epoch": epoch,
                "model_state_dict": transformer.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "loss": train_loss,
            },
            "./transformer_checkpoint",
        )
    print(
        (
            f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "
            f"Epoch time = {(end_time - start_time):.3f}s"
        )
    )
    epoches.append(epoch)
    train_losses.append(train_loss)
    val_losses.append(val_loss)



Epoch: 1, Train loss: 3.451, Val loss: 0.896, Epoch time = 74.265s
Epoch: 2, Train loss: 0.700, Val loss: 0.506, Epoch time = 77.155s
Epoch: 3, Train loss: 0.529, Val loss: 0.432, Epoch time = 77.132s
Epoch: 4, Train loss: 0.451, Val loss: 0.380, Epoch time = 78.372s
Epoch: 5, Train loss: 0.389, Val loss: 0.329, Epoch time = 77.983s
Epoch: 6, Train loss: 0.322, Val loss: 0.257, Epoch time = 77.302s
Epoch: 7, Train loss: 0.265, Val loss: 0.223, Epoch time = 78.123s
Epoch: 8, Train loss: 0.227, Val loss: 0.190, Epoch time = 77.783s
Epoch: 9, Train loss: 0.189, Val loss: 0.166, Epoch time = 77.991s
Epoch: 10, Train loss: 0.159, Val loss: 0.154, Epoch time = 76.898s
Epoch: 11, Train loss: 0.137, Val loss: 0.155, Epoch time = 76.953s
Epoch: 12, Train loss: 0.122, Val loss: 0.149, Epoch time = 77.256s
Epoch: 13, Train loss: 0.110, Val loss: 0.148, Epoch time = 78.062s
Epoch: 14, Train loss: 0.100, Val loss: 0.148, Epoch time = 77.718s
Epoch: 15, Train loss: 0.092, Val loss: 0.154, Epoch time

KeyboardInterrupt: 

In [16]:
def test(model):
    model.eval()
    losses = 0
    test_model = model.to(torch.device("cpu"))
    test_dataloader = DataLoader(
        test_data, batch_size=BATCH_SIZE, collate_fn=collate_fn
    )
    results = []
    for src, tgt in tqdm(test_dataloader):
        src = src.to(torch.device("cpu"))
        
        num_tokens = src.shape[0]
        
        src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).to(torch.device('cpu'))
        memory = model.encode(src, src_mask)
        
        tgt = tgt.to(torch.device("cpu"))

        start_symbol = OUTPUT_SOS_IDX
        eos_bools = torch.zeros(src.size(1), dtype=torch.bool)
        top_indices = torch.ones(1, src.size(1)).fill_(start_symbol).type(torch.long).to(torch.device('cpu'))
        max_len = 128
        for i in range(max_len - 1):
            tgt_mask = (
                generate_square_subsequent_mask(top_indices.size(0)).type(torch.bool)
            ).to(torch.device('cpu'))
            out = model.decode(top_indices, memory, tgt_mask)
            out = out.transpose(0, 1)
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.view(1, next_word.size(0))
            eos_bools = (next_word == OUTPUT_EOS_IDX).logical_or(eos_bools) 
            top_indices = torch.cat(
                [top_indices, next_word], dim=0
            )
            if torch.sum(eos_bools).item() == eos_bools.size(1):
                break
        # keep most likely tokens
        
        for idx in range(top_indices.shape[1]):
            pred = vocab.lookup_tokens(list(top_indices[:,idx].cpu().numpy()))
            true = vocab.lookup_tokens(list(tgt[:,idx].cpu().numpy()))
            
            result = match_slot(true, pred)
            results.append(result)
            
    return results

In [17]:
checkpoint = torch.load("./transformer_checkpoint")
transformer.load_state_dict(checkpoint["model_state_dict"])

<All keys matched successfully>

In [18]:
transformer = transformer.to(torch.device('cpu'))
results = test(transformer)

100%|██████████| 116/116 [07:03<00:00,  3.65s/it]


In [19]:

get_accuracy(results)

{'joint_accuracy': 0.04395008138903961, 'slot_accuracy': 0.6080988652261775}

In [21]:

# function to generate output sequence using greedy algorithm
def greedy_decode(
    model, src, src_mask, max_len, start_symbol, device=torch.device("cpu")
):
    src = src.to(device)
    src_mask = src_mask.to(device)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
    for i in range(max_len - 1):
        memory = memory.to(device)
        tgt_mask = (
            generate_square_subsequent_mask(ys.size(0)).type(torch.bool)
        ).to(device)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat(
            [ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0
        )
        if next_word == OUTPUT_EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform_func(src_sentence).view(-1, 1)
    # print(src)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model, src, src_mask, max_len=100, start_symbol=OUTPUT_SOS_IDX
    ).flatten()
    return vocab.lookup_tokens(list(tgt_tokens.cpu().numpy()))

In [22]:
transformer = transformer.to(torch.device("cpu"))
torch.cuda.empty_cache()
print(
    translate(
        transformer,
        "<|context|> <|user|> i need to take a train out of cambridge , i will be leaving town on wednesday . <|system|> there are 5 trains out of cambridge on wednesday . do you have a departure time in mind ? <|user|> i would like to go to peterborough and leave after 12:45 , i have to attend a meeting beforehand . <|system|> tr1879 leaves at 13:06 on wednesday . will that work for you ? <|user|> what is the price of the fair and could you tell me what is the arrival time into peterborough ?<|endofcontext|>",
    )
)

['<|belief|>', 'train', 'leaveat', '12:45', ',', 'train', 'destination', 'peterborough', ',', 'train', 'day', 'wednesday', ',', 'train', 'arriveby', 'not', 'mentioned', ',', 'train', 'departure', 'cambridge', '<|endofbelief|>']


In [48]:
input_sample, output_sample = test_data[17]
pred = translate(
        transformer,
        input_sample,
    )
print(input_sample)
print(pred)
match_slot(output_sample.split(), pred)

 <|context|> <|user|> i want to find a moderate -ly priced restaurant . <|system|> i have many options available for you ! is there a certain area or cuisine that interests you ? <|user|> yes i would like the restaurant to be located in the center of the attractions . <|system|> there are 21 restaurant -s available in the centre of town . how about a specific type of cuisine ? <|user|> i need to know the food type and postcode and it should also have mutliple sports <|system|> i am sorry i do not understand what you just said . please repeat in a way that makes sense . <|user|> get me the food type and the post code <|endofcontext|>
['<|belief|>', 'restaurant', 'food', 'not', 'mentioned', ',', 'restaurant', 'pricerange', 'moderate', ',', 'restaurant', 'name', 'not', 'mentioned', ',', 'restaurant', 'area', 'centre', '<|endofbelief|>']


(True, [True, True, True, True])

In [None]:
results = []
for x, y in tqdm(test_data):
    pred = translate(transformer, x)
    result = match_slot(y.split(), pred)
    results.append(result)

In [40]:
get_accuracy(results)

{'joint_accuracy': 0.0, 'slot_accuracy': 0.021744886992009223}

In [None]:
checkpoint

References
----------

1. Attention is all you need paper.
   https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
2. The annotated transformer. https://nlp.seas.harvard.edu/2018/04/03/attention.html#positional-encoding

