In [9]:
from nltk.translate.bleu_score import sentence_bleu

In [10]:
from sentencepiece import SentencePieceProcessor

In [11]:
# Load the SentencePiece model stored under the multi30k data directory.
sp_model = SentencePieceProcessor(
	model_file='/home/chen/Desktop/ML/Transformer/data/multi30k/m_en_de.model',
)

In [12]:
import torch
from torch import Tensor

In [13]:
import abc

class Tokenizer(abc.ABC):
	'''
	Abstract base class for tokenizers that carry per-instance default options.

	Subclasses implement `_tokenize` and `_detokenize`. The public `tokenize` and
	`detokenize` methods forward the option dicts stored at construction time.
	'''

	def __init__(self, tokenize_args: dict = None, detokenize_args: dict = None):
		'''
		Args:
			`tokenize_args`: options handed to `_tokenize` on every call; `None` means no options.
			`detokenize_args`: options handed to `_detokenize` on every call; `None` means no options.
		'''
		# A literal `{}` default is created once and shared by every instance built
		# without arguments, so an empty dict is created per instance instead.
		self.tokenize_args = {} if tokenize_args is None else tokenize_args
		self.detokenize_args = {} if detokenize_args is None else detokenize_args

	def tokenize(self, sentence: str) -> list:
		'''Tokenize `sentence` using the options stored on this instance.'''
		return self._tokenize(sentence, self.tokenize_args)

	def detokenize(self, tokens: list) -> str:
		'''Turn `tokens` back into a string using the options stored on this instance.'''
		return self._detokenize(tokens, self.detokenize_args)

	@abc.abstractmethod
	def _tokenize(self, sentence: str, tokenize_args: dict) -> list:
		'''Backend-specific tokenization; must be provided by subclasses.'''

	@abc.abstractmethod
	def _detokenize(self, tokens: list, detokenize_args: dict) -> str:
		'''Backend-specific detokenization; must be provided by subclasses.'''

In [14]:
class SentencePieceTokenizer(Tokenizer):
	'''
	`Tokenizer` implementation that delegates to a SentencePiece model loaded from disk.
	'''

	def __init__(self, sp_model_path: str, tokenize_args: dict = None, detokenize_args: dict = None):
		'''
		Args:
			`sp_model_path`: path passed to `SentencePieceProcessor(model_file=...)`.
			`tokenize_args`: extra keyword arguments for `encode`; `None` means none.
			`detokenize_args`: extra keyword arguments for `decode`; `None` means none.
		'''
		self.sp_model = SentencePieceProcessor(model_file=sp_model_path)
		# Swap `None` for a fresh dict here: instances built with defaults then never
		# share one dict object, and the stored options can always be unpacked with `**`.
		super().__init__(
			{} if tokenize_args is None else tokenize_args,
			{} if detokenize_args is None else detokenize_args,
		)

	def _tokenize(self, sentence: str, tokenize_args: dict) -> list:
		'''Encode `sentence` with the SentencePiece model, forwarding `tokenize_args` as keywords.'''
		return self.sp_model.encode(sentence, **tokenize_args)

	def _detokenize(self, tokens: list, detokenize_args: dict) -> str:
		'''Decode `tokens` with the SentencePiece model, forwarding `detokenize_args` as keywords.'''
		return self.sp_model.decode(tokens, **detokenize_args)

In [15]:
# Special-token ids: unknown, begin-of-sequence, end-of-sequence, padding.
UNK_IDX, BOS_IDX, EOS_IDX, PAD_IDX = 0, 1, 2, 3

In [16]:
def get_bleu_score(pred: Tensor, reference: Tensor, tokenizer: Tokenizer, verbose: bool = True) -> float:
	'''
	Calculate the BLEU-4 score of a single prediction/reference pair with NLTK.

	Both id sequences are stripped of <unk> and <pad> ids, detokenized to text and
	tokenized again with `tokenizer` before scoring, so the comparison is made on
	the tokenizer's own segmentation of the decoded text.

	Args:
		`pred`: Tensor<Long>[T_pred] predicted token ids.
		`reference`: Tensor<Long>[T_ref] reference token ids (length may differ from `pred`).
		`tokenizer`: Tokenizer used for the detokenize / re-tokenize round trip.
		`verbose`: when True (the default, matching the previous behaviour) the
			intermediate ids and strings are printed; pass False to silence them,
			e.g. when scoring a whole dataset in a loop.

	Returns:
		`score`: float BLEU-4 score in [0, 1].
	'''
	def _log(*parts) -> None:
		# Single switch for all intermediate debug output.
		if verbose:
			print(*parts)

	# Drop every <unk> and <pad> id; other ids (including <bos>/<eos>) are kept.
	pred_ids = pred[(pred != UNK_IDX) & (pred != PAD_IDX)]
	reference_ids = reference[(reference != UNK_IDX) & (reference != PAD_IDX)]
	_log('cleaned predictio:', pred_ids)
	_log('cleaned reference:', reference_ids)

	# Detokenize the id sequences back to plain text.
	pred_text = tokenizer.detokenize(pred_ids.tolist())
	reference_text = tokenizer.detokenize(reference_ids.tolist())
	_log('detokenized predictio:', pred_text)
	_log('detokenized reference:', reference_text)

	# Re-tokenize the text so both sides are segmented by the same tokenizer.
	pred_tokens = tokenizer.tokenize(pred_text)
	reference_tokens = tokenizer.tokenize(reference_text)
	_log('re-tokenized predictio:', pred_tokens)
	_log('re-tokenized reference:', reference_tokens)

	# `sentence_bleu` takes a list of references and one hypothesis.
	return sentence_bleu([reference_tokens], pred_tokens)


In [17]:
# Two separate but identical id sequences (1..9) for a quick sanity check.
pred = torch.arange(1, 10)
reference = pred.clone()

In [18]:
# Tokenizer that passes sampling options to `encode`; its output may differ between runs.
sample_tokenizer = SentencePieceTokenizer(
	'/home/chen/Desktop/ML/Transformer/data/multi30k/m_en_de.model',
	tokenize_args={'enable_sampling': True, 'alpha': 0.1},
)
# Tokenizer on the same model file with no extra options.
reference_tokenizer = SentencePieceTokenizer(
	'/home/chen/Desktop/ML/Transformer/data/multi30k/m_en_de.model',
)

In [19]:
# Example sentence used to compare the two tokenizers.
sentence = (
	'Sitting casually in a public place, a girl reads holding the book open '
	'with her hand on which is a butterfly ring.'
)

In [20]:
# Encode the same sentence with the sampling tokenizer and the default one.
sample_tokens = sample_tokenizer.tokenize(sentence)
reference_tokens = reference_tokenizer.tokenize(sentence)

# Show both id sequences so the segmentations can be compared.
print(f'sample_tokens: {sample_tokens}')
print(f'reference_tokens: {reference_tokens}')

sample_tokens: [14482, 168, 8261, 18, 5, 2281, 3555, 14930, 5, 31, 207, 2498, 14900, 14910, 3655, 54, 1574, 1510, 14900, 14916, 86, 32, 7, 32, 10, 14911, 62, 2427, 65, 5, 9247, 144, 14903, 14902, 14913, 14917]
reference_tokens: [14482, 168, 8261, 18, 5, 2281, 3555, 14930, 5, 221, 2498, 339, 54, 1574, 1510, 87, 241, 543, 62, 2427, 65, 5, 9247, 4291, 14917]


In [21]:
# Convert the id lists to tensors for `get_bleu_score`.
# `torch.as_tensor` builds the same tensor from a list as `torch.tensor`, but
# returns an existing tensor unchanged, so re-running this cell is a no-op
# instead of copying an already converted tensor and emitting a warning.
sample_tokens = torch.as_tensor(sample_tokens)
reference_tokens = torch.as_tensor(reference_tokens)

In [22]:
# Score the sampled segmentation against the default segmentation of the same sentence.
get_bleu_score(
	pred=sample_tokens,
	reference=reference_tokens,
	tokenizer=reference_tokenizer,
)

cleaned predictio: tensor([14482,   168,  8261,    18,     5,  2281,  3555, 14930,     5,    31,
          207,  2498, 14900, 14910,  3655,    54,  1574,  1510, 14900, 14916,
           86,    32,     7,    32,    10, 14911,    62,  2427,    65,     5,
         9247,   144, 14903, 14902, 14913, 14917])
cleaned reference: tensor([14482,   168,  8261,    18,     5,  2281,  3555, 14930,     5,   221,
         2498,   339,    54,  1574,  1510,    87,   241,   543,    62,  2427,
           65,     5,  9247,  4291, 14917])
detokenized predictio: Sitting casually in a public place, a girl reads holding the book open with her hand on which is a butterfly ring.
detokenized reference: Sitting casually in a public place, a girl reads holding the book open with her hand on which is a butterfly ring.
re-tokenized predictio: [14482, 168, 8261, 18, 5, 2281, 3555, 14930, 5, 221, 2498, 339, 54, 1574, 1510, 87, 241, 543, 62, 2427, 65, 5, 9247, 4291, 14917]
re-tokenized reference: [14482, 168, 8261, 18, 

1.0

In [1]:
from models.transformer import Transformer

In [2]:
# Restore the model from a saved checkpoint, then call `.cuda()` on it.
# NOTE(review): presumably this needs a CUDA device to be available — confirm before running elsewhere.
model = Transformer.load_from_checkpoint(
	'/home/chen/Desktop/ML/Transformer/experiments/de-en-v1-multi30k/de-en-v1-sp-nb_6-multi30k/checkpoints/model-epoch=35-step=4000-val_loss=3.03.ckpt'
).cuda()

In [1]:
from datasets.translate import TranslationDataset, TranslationDatasetConfig

In [7]:
# Dataset over the multi30k validation files; source and target use the same SentencePiece model file.
config = TranslationDatasetConfig(
	max_seq_len=128,
	# source side (val.de)
	src_file='/home/chen/Desktop/ML/Transformer/data/multi30k/val.de',
	src_sp_model_file='/home/chen/Desktop/ML/Transformer/data/multi30k/m_en_de.model',
	# target side (val.en)
	tgt_file='/home/chen/Desktop/ML/Transformer/data/multi30k/val.en',
	tgt_sp_model_file='/home/chen/Desktop/ML/Transformer/data/multi30k/m_en_de.model',
)

dataset = TranslationDataset(config)

Reading input files...
                                                 src  \
0  Eine Gruppe von Männern lädt Baumwolle auf ein...   
1  Ein Mann schläft in einem grünen Raum auf eine...   
2  Ein Junge mit Kopfhörern sitzt auf den Schulte...   
3  Zwei Männer bauen eine blaue Eisfischerhütte a...   
4  Ein Mann mit beginnender Glatze, der eine rote...   
5  Eine Frau in einem rotem Mantel, die eine verm...   
6  Ein brauner Hund rennt dem schwarzen Hund hint...   
7  Ein kleiner Junge mit einem Giants-Trikot schw...   
8  Ein Mann telefoniert in einem unaufgeräumten Büro   
9  Eine lächelnde Frau mit einem pfirsichfarbenen...   

                                                 tgt  
0     A group of men are loading cotton onto a truck  
1         A man sleeping in a green room on a couch.  
2  A boy wearing headphones sits on a woman's sho...  
3  Two men setting up a blue ice fishing hut on a...  
4  A balding man wearing a red life jacket is sit...  
5  A lady in a red coat, holdi

In [11]:
import torch

In [17]:
# Batches of 8 in dataset order, loaded in the main process and collated by the dataset's own collate function.
dl = torch.utils.data.DataLoader(
	dataset,
	collate_fn=dataset.get_collate_function(),
	batch_size=8,
	shuffle=False,
	num_workers=0,
)

In [18]:
# Peek at the first collated batch only; `break` stops the loop after one iteration.
for batch in dl:
	print(batch)
	break

TransformerInputBatch(x_src=tensor([[    1,    26,     4, 14901,   320,    70, 14908, 14902,   129, 14902,
         11687, 14900, 14909, 14926, 14911, 14906,  1142, 14916,  3581,    63,
            12,     6,  4122,     2,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3],
        [    1,    27,    73,  1822,    18,    42,   688,   172,   703,     5,
         14912, 14920,    12, 14901, 14914,  2659, 14917,     2,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3],
        [    1,    27,   284,    61,  5751,   278,    63,   265,  4052,    82,
           104, 14917,     2,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,

In [7]:
# Display the dataset's special-token ids in (BOS, EOS, PAD, UNK) order.
(dataset.BOS_IDX, dataset.EOS_IDX, dataset.PAD_IDX, dataset.UNK_IDX)

(1, 2, 3, 0)

In [8]:
# Inspect the first ten samples: each is unpacked into three values whose
# shape and contents are printed.
# NOTE(review): the saved output of this cell is
# "TypeError: tuple indices must be integers or slices, not str". Nothing in
# this loop indexes a tuple with a string, so the failure presumably originates
# inside `dataset.__getitem__` — confirm against the dataset implementation.
for i in range(10):
	sample = dataset[i]
	x_src, x_tgt, y_tgt = sample
	print('x_src:', x_src.shape, x_src)
	print('x_tgt:', x_tgt.shape, x_tgt)
	print('y_tgt:', y_tgt.shape, y_tgt)
	

TypeError: tuple indices must be integers or slices, not str

In [19]:
from itertools import product

In [22]:
# Candidate values for the two swept settings.
# NOTE(review): 3 is absent from N_BLOCKS — confirm that is intentional.
N_BLOCKS = [1, 2, 4, 5, 6, 7, 8]
VOCAB_SIZE = [1000, 2000]

# Every (N_BLOCKS, VOCAB_SIZE) pair, with N_BLOCKS varying slowest.
[*product(N_BLOCKS, VOCAB_SIZE)]

[(1, 1000),
 (1, 2000),
 (2, 1000),
 (2, 2000),
 (4, 1000),
 (4, 2000),
 (5, 1000),
 (5, 2000),
 (6, 1000),
 (6, 2000),
 (7, 1000),
 (7, 2000),
 (8, 1000),
 (8, 2000)]