In [1]:
import torch 

In [3]:
# Toy batch holding a single ten-token sequence (ids 1..10); the prediction is
# an independent copy of the target, i.e. a perfect match.
target_seqs = torch.arange(1, 11).unsqueeze(0)
predicted_seq = target_seqs.clone()

In [None]:
def bleu(target_seqs, predicted_seq, max_n=4, pad_idx=None):
    ''' Compute the sentence-level BLEU score between target_seqs and predicted_seq.

    Each row of `predicted_seq` is scored against the row of `target_seqs` at
    the same batch position (one reference per prediction). The score is the
    geometric mean of the clipped n-gram precisions for n = 1..max_n, multiplied
    by the brevity penalty min(1, exp(1 - len(target) / len(predicted))).
    No smoothing is applied: if any n-gram order has no match (including the
    case where the prediction is shorter than `max_n` tokens) the score is 0.

    Inputs:
        `target_seqs`: Tensor<Int>[B, T] target sequences tensor.
        `predicted_seq`: Tensor<Int>[B, T'] predicted sequence tensor.
        `max_n`: largest n-gram order taken into account (default 4).
        `pad_idx`: optional token id to drop from both sequences before
            scoring; with the default `None` every position counts as a token.

    Outputs:
        Tensor<Float>[B] BLEU score in [0, 1] for every pair in the batch.

    Raises:
        ValueError: if `max_n` is smaller than 1.
    '''
    # Function-scope imports keep this cell runnable on its own.
    import math
    from collections import Counter

    if max_n < 1:
        raise ValueError("max_n must be at least 1")

    def ngram_counts(tokens, n):
        # Multiset of all contiguous n-grams; empty when len(tokens) < n.
        return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

    scores = []
    for target, predicted in zip(target_seqs.tolist(), predicted_seq.tolist()):
        if pad_idx is not None:
            target = [tok for tok in target if tok != pad_idx]
            predicted = [tok for tok in predicted if tok != pad_idx]

        # An empty side has no n-grams at all, so nothing can match.
        if not target or not predicted:
            scores.append(0.0)
            continue

        # Penalise predictions that are shorter than their reference.
        brevity_penalty = min(1.0, math.exp(1.0 - len(target) / len(predicted)))

        log_precision_sum = 0.0
        for n in range(1, max_n + 1):
            predicted_counts = ngram_counts(predicted, n)
            total = sum(predicted_counts.values())
            # Counter intersection clips every n-gram count by its count in
            # the reference, which is BLEU's "modified" precision.
            matched = sum((predicted_counts & ngram_counts(target, n)).values())
            if matched == 0:
                # log(0) is undefined; the geometric mean collapses to 0.
                log_precision_sum = None
                break
            log_precision_sum += math.log(matched / total)

        if log_precision_sum is None:
            scores.append(0.0)
        else:
            scores.append(brevity_penalty * math.exp(log_precision_sum / max_n))

    return torch.tensor(scores, dtype=torch.float)

In [5]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from torch.nn.utils.rnn import pad_sequence
from typing import Iterable, List
from dataclasses import dataclass

In [6]:
# Vocabulary indices reserved for the special symbols
UNK_IDX = 0
BOS_IDX = 1
EOS_IDX = 2
PAD_IDX = 3

# Compose several single-argument callables into one left-to-right pipeline
def sequential_transforms(*transforms):
    """Return a callable that feeds its input through every transform in order.

    The output of each transform becomes the input of the next one; with no
    transforms the returned callable hands its argument back unchanged.
    """
    def apply_all(txt_input):
        result = txt_input
        for step in transforms:
            result = step(result)
        return result

    return apply_all

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    """Wrap a list of token ids with BOS/EOS and return it as a 1-D tensor.

    Args:
        token_ids: vocabulary indices of one tokenized sentence; may be empty.

    Returns:
        1-D `torch.long` tensor `[BOS_IDX, *token_ids, EOS_IDX]`.
    """
    # The dtype is pinned on every piece: without it `torch.tensor([])` is
    # float32, so an empty sentence would no longer produce an integer tensor.
    return torch.cat((torch.tensor([BOS_IDX], dtype=torch.long),
                      torch.tensor(token_ids, dtype=torch.long),
                      torch.tensor([EOS_IDX], dtype=torch.long)))

# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> Iterable[List[str]]:
    """Lazily tokenize every line of `data_iter` with the tokenizer for `language`.

    Args:
        data_iter: iterable of raw text lines (e.g. an open text file).
        language: key into the module-level `token_transform` mapping.

    Yields:
        The token list produced by `token_transform[language]` for each
        lower-cased line.
    """
    # NOTE(review): lines are lower-cased but not stripped, so a trailing
    # newline read from a file is handed to the tokenizer as-is — confirm
    # that this is intended.
    for line in data_iter:
        yield token_transform[language](line.lower())

In [12]:
# Language codes of the translation pair
SRC_LANG = 'en'
TGT_LANG = 'fr'

# One spaCy-backed tokenizer per language
token_transform = {
    SRC_LANG: get_tokenizer('spacy', language='en_core_web_sm'),
    TGT_LANG: get_tokenizer('spacy', language='fr_core_news_sm'),
}

# The order of the special symbols must match UNK_IDX/BOS_IDX/EOS_IDX/PAD_IDX,
# because they are inserted at the front of the vocabulary in this order.
special_symbols = ['<unk>', '<bos>', '<eos>', '<pad>']

# Build one torchtext Vocab per language from its side of the training corpus
vocab_transform = {}
for ln in (SRC_LANG, TGT_LANG):
    with open(f'data/un/undoc.2000.fr-en.{ln}', encoding="utf8") as f:
        vocab_transform[ln] = build_vocab_from_iterator(
            yield_tokens(f, ln),
            min_freq=10,
            specials=special_symbols,
            special_first=True,
        )

# For every language: make UNK_IDX the index returned for tokens missing from
# the vocabulary (without a default index such a lookup raises RuntimeError),
# then chain raw string -> tokens -> indices -> BOS/EOS-wrapped tensor.
text_transform = {}
for ln in (SRC_LANG, TGT_LANG):
    vocab_transform[ln].set_default_index(UNK_IDX)
    text_transform[ln] = sequential_transforms(
        token_transform[ln],  # Tokenization
        vocab_transform[ln],  # Numericalization
        tensor_transform,     # Add BOS/EOS and create tensor
    )

# Report the vocabulary sizes
print(f"Vocab size for {SRC_LANG}: {len(vocab_transform[SRC_LANG])}")
print(f"Vocab size for {TGT_LANG}: {len(vocab_transform[TGT_LANG])}")

Vocab size for en: 118961
Vocab size for fr: 133485


In [15]:
# Spot check: index assigned to the token 'reckon' in the English vocabulary.
vocab_transform['en']['reckon']

64216

In [10]:
from pickle import dump

In [18]:
# Persist the French vocabulary to disk. The context manager closes the file
# handle (flushing the pickle) even if dumping raises.
with open('tgt_vocab.pkl', 'wb') as vocab_file:
    dump(vocab_transform['fr'], vocab_file)

In [1]:
import spacy

In [2]:
# Load spaCy's 'en_core_web_sm' pipeline; `nlp` is used below to tokenize sample text.
nlp = spacy.load('en_core_web_sm')

In [6]:
# Inspect the type of a single token produced by the pipeline.
# NOTE(review): the original cell content was lost (only a stray '.' remained);
# this expression is reconstructed from the recorded output and the sample
# sentence used further down — confirm it matches what was intended.
type(list(nlp('hi who are you?'))[0])

spacy.tokens.token.Token

In [7]:
from pickle import load

In [8]:
# Restore the pickled vocabulary; the context manager closes the file handle.
# NOTE(review): unpickling can execute arbitrary code — only load a
# 'tgt_vocab.pkl' that was produced by a trusted run of this notebook.
with open('tgt_vocab.pkl', 'rb') as vocab_file:
    vocab = load(vocab_file)

In [17]:
# Index of the first token of the sample sentence. Indexing the vocab directly
# avoids building the complete string-to-index dict that get_stoi() returns
# for a single lookup, and honours the vocab's default index (if one is set)
# for unknown tokens instead of raising KeyError.
# NOTE(review): `nlp` is the English pipeline while 'tgt_vocab.pkl' was dumped
# from the 'fr' vocabulary — confirm that mixing the two here is intended.
vocab[str(nlp('hi who are you?')[0])]

67044