# 1	Subword tokenization

## 1.1	Tokenize English and French phrases
First, download the zip file from https://gattonweb.uky.edu/faculty/lium/gai/en2fr.zip; it contains about 47,000 English-to-French translations that I collected from various sources. Unzip it and place en2fr.csv in a folder named files/ inside the notebook's working directory (the code below reads the relative path files/en2fr.csv). We'll load the data and take a look as follows:

In [1]:
import pandas as pd

# Load the English-French sentence pairs; the path is relative to the working directory.
df=pd.read_csv("files/en2fr.csv")
num_examples=len(df)
print(f"there are {num_examples} examples in the training data")
# Show one aligned pair: the "en" and "fr" columns of the same row hold a sentence and its translation.
print(df.iloc[30856]["en"])
print(df.iloc[30856]["fr"])

there are 47173 examples in the training data
How are you?
Comment êtes-vous?


In [None]:
from transformers import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") # pretrained XLM tokenizer, identified by its Hugging Face model name

# Keep the two example tokenizations; later cells convert them to indices and back.
tokenized_en=tokenizer.tokenize("I don't speak French.")
print(tokenized_en)
tokenized_fr=tokenizer.tokenize("Je ne parle pas français.")
print(tokenized_fr)
print(tokenizer.tokenize("How are you?"))
print(tokenizer.tokenize("Comment êtes-vous?"))
# In the printed output, '</w>' marks the end of a word. A piece without it (e.g. 'fr' in 'fr', 'ench</w>')
# is continued by the next piece, so both belong to the same word. The pieces are also lowercased
# and accents are removed ('français' -> 'franc', 'ais</w>').

  from .autonotebook import tqdm as notebook_tqdm


['i</w>', 'don</w>', "'t</w>", 'speak</w>', 'fr', 'ench</w>', '.</w>']
['je</w>', 'ne</w>', 'parle</w>', 'pas</w>', 'franc', 'ais</w>', '.</w>']
['how</w>', 'are</w>', 'you</w>', '?</w>']
['comment</w>', 'et', 'es-vous</w>', '?</w>']


In [3]:
# build dictionaries
from collections import Counter

en = df["en"].tolist()

# Tokenize every English sentence and wrap it in begin/end-of-sentence markers.
en_tokens = [["BOS", *tokenizer.tokenize(x), "EOS"] for x in en]

# Indices 0 and 1 are reserved for padding and for unknown tokens.
PAD = 0
UNK = 1

# Count how often each token occurs across the whole English corpus.
word_count = Counter(tok for sent in en_tokens for tok in sent)

# Keep at most the 50,000 most frequent tokens, as (token, count) pairs in descending order.
frequency = word_count.most_common(50000)
total_en_words = len(frequency) + 2

# Token -> index: real tokens start at 2 because 0 and 1 are taken by PAD and UNK.
en_word_dict = {tok: rank for rank, (tok, _) in enumerate(frequency, start=2)}
en_word_dict["PAD"] = PAD
en_word_dict["UNK"] = UNK

# Index -> token, the inverse mapping used to turn indices back into text.
en_idx_dict = {idx: tok for tok, idx in en_word_dict.items()}

print(len(frequency))

11053


In [4]:
UNK=1  # same value that was assigned when the English dictionary was built
print(en_tokens[0])  # first tokenized sentence, including the BOS/EOS markers
print(frequency[0:5])  # the five most frequent (token, count) pairs
# "a" without the "</w>" suffix is a word-internal piece, not the article "a</w>", so it has its own index.
print(en_word_dict.get("a", UNK))
print(en_word_dict.get("axcd", UNK))  # a token that is not in the dictionary falls back to UNK
print(en_idx_dict.get(10,"UNK"))  # index -> token lookup
print(en_idx_dict.get(-1,"UNK"))  # an index that is not in the dictionary falls back to the string "UNK"
print(['a']+['b']+['c'])  # list concatenation, the same operation used to add BOS/EOS around a sentence


['BOS', 'two</w>', 'young</w>', ',</w>', 'white</w>', 'males</w>', 'are</w>', 'outside</w>', 'near</w>', 'many</w>', 'bus', 'hes</w>', '.</w>', 'EOS']
[('a</w>', 52124), ('BOS', 47173), ('EOS', 47173), ('.</w>', 43237), ('in</w>', 16358)]
3030
1
of</w>
UNK
['a', 'b', 'c']


In [5]:
# Convert the example English tokens to vocabulary indices.
enidx=[en_word_dict.get(i,UNK) for i in tokenized_en]  # a token missing from the dictionary gets the UNK index (1)
print(enidx)

[15, 100, 38, 377, 476, 574, 5]


In [6]:
# Map the indices back to tokens; an index missing from the dictionary becomes the string "UNK".
entokens=[en_idx_dict.get(i,"UNK") for i in enidx]   
print(entokens)
# Join the pieces directly so that word-internal pieces merge into one word.
en_phrase="".join(entokens)
en_phrase=en_phrase.replace("</w>"," ") # replaces the end-of-word marker with a space
for x in '''?:;.,'("-!&)%''': 
    en_phrase=en_phrase.replace(f" {x}",f"{x}")  # removes the space in front of each punctuation mark
# The result stays lowercase and keeps one trailing space left over from the final "</w>".
print(en_phrase)

['i</w>', 'don</w>', "'t</w>", 'speak</w>', 'fr', 'ench</w>', '.</w>']
i don't speak french. 


In [8]:
# Build the French vocabulary with the same recipe that was used for English.
fr = df["fr"].tolist()

# Tokenize every French sentence and wrap it in begin/end-of-sentence markers.
fr_tokens = [["BOS", *tokenizer.tokenize(x), "EOS"] for x in fr]

# Token frequencies over the whole French corpus (this rebinds word_count and frequency).
word_count = Counter(tok for sent in fr_tokens for tok in sent)
frequency = word_count.most_common(50000)
total_fr_words = len(frequency) + 2

# Token -> index, starting at 2; PAD and UNK reuse the indices chosen for English.
fr_word_dict = {tok: rank for rank, (tok, _) in enumerate(frequency, start=2)}
fr_word_dict["PAD"] = PAD
fr_word_dict["UNK"] = UNK

# Index -> token, the inverse mapping.
fr_idx_dict = {idx: tok for tok, idx in fr_word_dict.items()}

In [9]:
# Convert the example French tokens to vocabulary indices; unknown tokens get the UNK index.
fridx=[fr_word_dict.get(i,UNK) for i in tokenized_fr]   
print(fridx)

[28, 40, 231, 32, 726, 370, 4]


In [10]:
# Map the indices back to tokens; an index missing from the dictionary becomes the string "UNK".
frtokens=[fr_idx_dict.get(i,"UNK") for i in fridx]   
print(frtokens)
# Join the pieces directly so that word-internal pieces merge into one word.
fr_phrase="".join(frtokens)
fr_phrase=fr_phrase.replace("</w>"," ")  # end-of-word marker -> space
for x in '''?:;.,'("-!&)%''':
    fr_phrase=fr_phrase.replace(f" {x}",f"{x}")  # drop the space in front of each punctuation mark
print(fr_phrase)

['je</w>', 'ne</w>', 'parle</w>', 'pas</w>', 'franc', 'ais</w>', '.</w>']
je ne parle pas francais. 


In [11]:
import pickle

# Persist the four lookup tables as a single tuple; the translation cells later unpack them in this same order.
with open("files/dict.p","wb") as fb:
    pickle.dump((en_word_dict,en_idx_dict,fr_word_dict,fr_idx_dict),fb)

## 1.2. Sequence Padding and Batch Creation


In [12]:
# Map every token to its vocabulary index; tokens missing from the dictionary fall back to UNK.
# Example: en_tokens = [["I", "love", "cats"], ["Hello"]] -> out_en_ids = [[12, 45, 87], [34]]
out_en_ids=[[en_word_dict.get(w,UNK) for w in s] for s in en_tokens]
out_fr_ids=[[fr_word_dict.get(w,UNK) for w in s] for s in fr_tokens]
# Sentence positions ordered by the length of the English sentence, shortest first.
# range(len(out_en_ids)) supplies the positions 0, 1, 2, ...; the key looks up each sentence's length.
# For the example above: sorted_ids = [1, 0].
sorted_ids=sorted(range(len(out_en_ids)), key=lambda x:len(out_en_ids[x])) 
# Apply the same permutation to both languages so that every English/French pair stays aligned.
out_en_ids=[out_en_ids[x] for x in sorted_ids]
out_fr_ids=[out_fr_ids[x] for x in sorted_ids]

In [13]:
# en_tokens keeps the original CSV order, while out_en_ids is now ordered by sentence length.
print(len(en_tokens))
print(en_tokens[0])
print(len(out_en_ids))
print(out_en_ids[0])  # shortest English sentence
print(out_en_ids[47172])  # last position, i.e. the longest English sentence


47173
['BOS', 'two</w>', 'young</w>', ',</w>', 'white</w>', 'males</w>', 'are</w>', 'outside</w>', 'near</w>', 'many</w>', 'bus', 'hes</w>', '.</w>', 'EOS']
47173
[3, 5168, 361, 4]
[3, 2, 262, 185, 514, 45, 2061, 1055, 12, 896, 16, 7, 1102, 1094, 311, 12, 2, 31, 30, 12, 1504, 1847, 14, 2, 742, 3430, 867, 16, 7, 96, 11, 1094, 2, 680, 3761, 3762, 12, 293, 30, 16, 7, 96, 306, 17, 107, 5440, 5, 4]


In [14]:
# Sorting only reorders the sentence pairs, so both counts are expected to match.
print("total number of sentences pairs:",len(en_tokens))
print("total number of sentenced pairs (ordered)",len(out_en_ids))

total number of sentences pairs: 47173
total number of sentenced pairs (ordered) 47173


In [15]:
import numpy as np

batch_size=128 # number of sentence pairs per batch
idx_list=np.arange(0,len(en_tokens),batch_size) # start position of every batch
np.random.shuffle(idx_list) # randomize the batch order; no seed is set, so the order differs between runs

batch_indexs=[] # each element is an array of up to 128 consecutive positions in the length-sorted out_en_ids / out_fr_ids
for idx in idx_list: # iterate over each batch start position
    batch_indexs.append(np.arange(idx, min(len(en_tokens), idx+batch_size))) # min() clips the last batch at the end of the data

print(f"number of batches: {len(batch_indexs)}")
print(batch_indexs[0]) # first batch of sentence positions
print(batch_indexs[1]) # second batch
print(np.max(idx_list)) # largest batch start position
print(min(len(en_tokens), 47104+batch_size)) # exclusive end of the batch that starts at 47104

number of batches: 369
[39552 39553 39554 39555 39556 39557 39558 39559 39560 39561 39562 39563
 39564 39565 39566 39567 39568 39569 39570 39571 39572 39573 39574 39575
 39576 39577 39578 39579 39580 39581 39582 39583 39584 39585 39586 39587
 39588 39589 39590 39591 39592 39593 39594 39595 39596 39597 39598 39599
 39600 39601 39602 39603 39604 39605 39606 39607 39608 39609 39610 39611
 39612 39613 39614 39615 39616 39617 39618 39619 39620 39621 39622 39623
 39624 39625 39626 39627 39628 39629 39630 39631 39632 39633 39634 39635
 39636 39637 39638 39639 39640 39641 39642 39643 39644 39645 39646 39647
 39648 39649 39650 39651 39652 39653 39654 39655 39656 39657 39658 39659
 39660 39661 39662 39663 39664 39665 39666 39667 39668 39669 39670 39671
 39672 39673 39674 39675 39676 39677 39678 39679]
[30208 30209 30210 30211 30212 30213 30214 30215 30216 30217 30218 30219
 30220 30221 30222 30223 30224 30225 30226 30227 30228 30229 30230 30231
 30232 30233 30234 30235 30236 30237 30238 30239 30

In [16]:
def seq_padding(X, padding=0):
    """Right-pad a batch of token-index sequences to a common length.

    Args:
        X: A sequence of sequences (lists or 1-D arrays) of integer token
            indices. The inner sequences may have different lengths,
            including zero.
        padding: Value written into the positions after the end of each
            sequence. It also determines the dtype of the result, so the
            default integer 0 yields an integer array.

    Returns:
        A 2-D NumPy array of shape (len(X), length of the longest sequence).
        Row i holds X[i] followed by `padding`. An empty batch returns an
        array of shape (0, 0) instead of raising.
    """
    # default=0 keeps an empty batch from raising inside max().
    max_len = max((len(x) for x in X), default=0)
    # Pre-filling with the padding value fixes the dtype up front, so an empty
    # sequence can no longer promote the whole batch to float64.
    padded_seq = np.full((len(X), max_len), padding)
    for row, x in enumerate(X):
        if len(x):
            padded_seq[row, :len(x)] = x
    return padded_seq

In [17]:
# Take the sentence positions of the first (shuffled) batch and collect their French index sequences.
sentneces_indices_in_one_batch= batch_indexs[0]
batch_tokens_indices = [out_fr_ids[sentenc_index] for sentenc_index in sentneces_indices_in_one_batch]
print(batch_tokens_indices[0:3])

# The French lengths within a batch vary because the sort key was the English length.
L = [len(x) for x in batch_tokens_indices]
print(f"max: {np.max(L)}, min: {np.min(L)}")

# After padding, every row has the length of the longest sequence in the batch.
padded_batch = seq_padding(batch_tokens_indices)
print(padded_batch[0:3])
print(padded_batch.shape)


[[2, 7, 23, 24, 5, 213, 9, 188, 254, 94, 5, 262, 1051, 8, 149, 970, 4, 3], [2, 5, 17, 24, 5, 395, 251, 1063, 19, 1356, 14, 7, 60, 526, 241, 487, 8, 13, 157, 4, 3], [2, 1415, 7, 598, 8, 25, 43, 56, 21, 2758, 12, 2977, 10, 6, 5, 6299, 8, 916, 4, 3]]
max: 31, min: 14
[[   2    7   23   24    5  213    9  188  254   94    5  262 1051    8
   149  970    4    3    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   2    5   17   24    5  395  251 1063   19 1356   14    7   60  526
   241  487    8   13  157    4    3    0    0    0    0    0    0    0
     0    0    0]
 [   2 1415    7  598    8   25   43   56   21 2758   12 2977   10    6
     5 6299    8  916    4    3    0    0    0    0    0    0    0    0
     0    0    0]]
(128, 31)


In [18]:
from util import Batch

class BatchLoader():
    """Iterator that yields one padded ``Batch`` per entry of ``batch_indexs``.

    Every entry of the global ``batch_indexs`` is an array of up to 128
    consecutive positions into the global ``out_en_ids`` / ``out_fr_ids`` lists.
    Those lists are ordered by English sentence length, so the sentences in
    one batch have similar lengths and need little padding. For each batch
    the English and French index sequences are padded with 0 to a common
    length (separately per language) and wrapped in a ``Batch``.
    """
    def __init__(self):
        # Number of batches handed out so far (1-based position of the current batch).
        self.idx=0
    def __iter__(self):
        # The loader is its own iterator, so it can be used directly in a for loop.
        return self
    def __next__(self):
        self.idx += 1
        # Stop once every entry of batch_indexs has been served.
        if self.idx > len(batch_indexs):
            raise StopIteration
        sentence_positions = batch_indexs[self.idx-1]
        # One list of token indices per sentence, padded to shape (pairs in batch, longest sentence).
        padded_en = seq_padding([out_en_ids[pos] for pos in sentence_positions])
        padded_fr = seq_padding([out_fr_ids[pos] for pos in sentence_positions])
        return Batch(padded_en, padded_fr)

In [19]:
# Pull a single batch out of a fresh loader and decode its first sentence pair back to text.
loader = BatchLoader()
first_batch = next(loader)   

# English (source) side: indices -> tokens -> readable phrase.
en_sentence_numbered = first_batch.src[0]
print(en_sentence_numbered)
entokens=[en_idx_dict.get(i.item(),"UNK") for i in en_sentence_numbered]   
print(entokens)
en_phrase="".join(entokens)
en_phrase=en_phrase.replace("</w>"," ") 
for x in '''?:;.,'("-!&)%''': 
    en_phrase=en_phrase.replace(f" {x}",f"{x}")  
print(en_phrase)


# French (target) side: same decoding; padding positions show up as "PAD" tokens.
fr_sentence_numbered = first_batch.trg[0]
print(fr_sentence_numbered)
frtokens=[fr_idx_dict.get(i.item(),"UNK") for i in fr_sentence_numbered]   
print(frtokens)
fr_phrase="".join(frtokens)
fr_phrase=fr_phrase.replace("</w>"," ")
for x in '''?:;.,'("-!&)%''':
    fr_phrase=fr_phrase.replace(f" {x}",f"{x}")  
print(fr_phrase)


# Shapes of the tensors that the Batch object exposes to the model.
print("-" * 100)
print("Source sequence shape (batch_size, seq_len):", first_batch.src.shape)
print("First source sequence tokens:", first_batch.src[0])
print("================> Source mask shape (batch_size, 1, seq_len):", first_batch.src_mask.shape)
print("First target input sequence (decoder input):", first_batch.trg[0])
print("Target input shape (batch_size, seq_len):", first_batch.trg.shape)  # last token dropped
print("First target output sequence (ground truth):", first_batch.trg_y[0])
print("Target output shape (batch_size, seq_len):", first_batch.trg_y.shape)  # first token dropped
print("================> Target mask shape (batch_size, seq_len, seq_len):", first_batch.trg_mask.shape)


tensor([   3,    2,   19,   14,    2,  841,  339,  175,  314,    2,  744,  737,
         257,   10,   82, 1047,    5,    4])
['BOS', 'a</w>', 'woman</w>', 'with</w>', 'a</w>', 'hand', 'bag</w>', 'walks</w>', 'past</w>', 'a</w>', 'chin', 'ese</w>', 'store</w>', 'of</w>', 'some</w>', 'sort</w>', '.</w>', 'EOS']
BOSa woman with a handbag walks past a chinese store of some sort. EOS
tensor([   2,    7,   23,   24,    5,  213,    9,  188,  254,   94,    5,  262,
        1051,    8,  149,  970,    4,    3,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0])
['BOS', 'une</w>', 'femme</w>', 'avec</w>', 'un</w>', 'sac</w>', 'a</w>', 'main</w>', 'passe</w>', 'par</w>', 'un</w>', 'magasin</w>', 'chinois</w>', 'de</w>', 'quelque</w>', 'sorte</w>', '.</w>', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
BOSune femme avec un sac a main passe par un magasin chinois de quelque sorte. EOSPADPADPADPADPADPADPADPADPADPADPADPAD
------

# 2	Word embedding and positional encoding
## 2.1. Word Embedding


In [20]:
# Vocabulary sizes, including the two reserved entries "PAD" and "UNK" added to each dictionary.
src_vocab = len(en_word_dict)
tgt_vocab = len(fr_word_dict)
print(f"there are {src_vocab} distinct English tokens")
print(f"there are {tgt_vocab} distinct French tokens")

there are 11055 distinct English tokens
there are 11239 distinct French tokens


## 2.2. Positional Encoding
To model the order of elements in the input and output sequences, we'll first create positional encodings of the sequences as follows:

In [21]:
from util import PositionalEncoding
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

pe = PositionalEncoding(256, 0.1)  # NOTE(review): arguments are presumably (d_model, dropout) - confirm in util
x = torch.zeros(128, 25, 256).to(DEVICE) # all-zero stand-in for a batch of word embeddings
y = pe.forward(x) # NOTE(review): presumably adds the positional encoding to x - confirm in util.PositionalEncoding
print(f"the shape of positional encoding is {y.shape}")
#print(y)

the shape of positional encoding is torch.Size([128, 25, 256])


# 3	Train the Transformer for English-to-French translation

## 3.1 Loss Function and the Optimizer



In [22]:
from util import create_model

# Build the Transformer for the two vocabularies.
# NOTE(review): the keyword names suggest N stacked layers, model width d_model, feed-forward width d_ff,
# h attention heads and the dropout rate - confirm against util.create_model.
model = create_model(src_vocab, tgt_vocab, N=6, d_model=256, d_ff=1024, h=8, dropout=0.1)

We create the optimizer for training as follows:

In [23]:
from util import NoamOpt

# NoamOpt wraps an Adam optimizer that is created with lr=0.
# NOTE(review): the wrapper presumably sets the learning rate itself on every step, and the positional
# arguments (256, 1, 2000) look like model size, scale factor and warm-up steps - confirm in util.NoamOpt.
optimizer = NoamOpt(256, 1, 2000, torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

We then define the loss function as follows:

In [24]:
from util import (LabelSmoothing, SimpleLossCompute)

# padding_idx=0 matches the PAD index used in both vocabularies.
# NOTE(review): presumably padded positions are excluded from the smoothed loss - confirm in util.LabelSmoothing.
criterion = LabelSmoothing(tgt_vocab, padding_idx=0, smoothing=0.1)
# Bundles the model's generator, the criterion and the optimizer into one callable used by the training loop.
loss_func = SimpleLossCompute(model.generator, criterion, optimizer)

## 3.2 The training loop
We'll train the model for 100 epochs. We'll calculate the loss and the number of tokens from each batch. After each epoch, we calculate the average loss in the epoch as the ratio between the total loss and the total number of tokens:

In [None]:
# train for 100 epochs
for epoch in range(100):
    model.train()
    # Running totals for this epoch: summed loss and number of target tokens.
    tloss=0
    tokens=0
    # A fresh BatchLoader() restarts from the first entry of batch_indexs, so every epoch
    # visits the batches in the same order that was fixed when batch_indexs was shuffled.
    for batch in BatchLoader():
        out = model(batch.src, batch.trg, batch.src_mask, batch.trg_mask)
        # NOTE(review): loss_func was built with the optimizer, so it presumably runs the backward pass
        # and the optimizer step itself - confirm in util.SimpleLossCompute.
        loss = loss_func(out, batch.trg_y, batch.ntokens)
        tloss += loss
        tokens += batch.ntokens
    print(f"Epoch {epoch}, average loss: {tloss/tokens}")
# The weights are written once, after the final epoch; an interrupted run saves nothing.
torch.save(model.state_dict(),"files/en2fr.pth")   

The above training process takes a couple of hours if you are using a GPU, and may take considerably longer if you are training on a CPU. Once the training is done, the model weights are saved as *files/en2fr.pth* on your computer. 

# 4. Translate English to French with the Trained Model


In [None]:
from util import subsequent_mask


def translate(eng, max_len=100):
    """Translate an English sentence into French with greedy decoding.

    Relies on notebook globals: ``tokenizer``, ``model``, ``DEVICE``, ``UNK``,
    ``subsequent_mask`` and the four vocabulary dictionaries. The caller is
    expected to have put ``model`` in eval mode.

    Args:
        eng: English sentence as a plain string.
        max_len: Maximum number of decoding steps before giving up on
            seeing the EOS token. Defaults to 100, the previous fixed limit.

    Returns:
        The French translation as a string. It is also printed.
    """
    # Tokenize the English sentence and add the beginning/end markers.
    tokenized_en = ["BOS"] + tokenizer.tokenize(eng) + ["EOS"]
    # Convert tokens to indices; tokens not in the vocabulary become UNK.
    enidx = [en_word_dict.get(tok, UNK) for tok in tokenized_en]
    src = torch.tensor(enidx).long().to(DEVICE).unsqueeze(0)
    # Mask that hides padding positions from the encoder.
    src_mask = (src != en_word_dict["PAD"]).unsqueeze(-2)

    translation = []
    # Inference only: skip autograd bookkeeping so no graph is kept per decoding step.
    with torch.no_grad():
        # Encode the English sentence once; every decoding step reuses this memory.
        memory = model.encode(src, src_mask)
        # The decoder input starts with just the French BOS token.
        ys = torch.full((1, 1), fr_word_dict["BOS"], dtype=src.dtype, device=src.device)
        for _ in range(max_len):
            out = model.decode(memory, src_mask, ys,
                               subsequent_mask(ys.size(1)).type_as(src))
            # Scores for the next token, taken from the last decoded position.
            prob = model.generator(out[:, -1])
            # Greedy choice: the highest-scoring token.
            next_word = prob.argmax(dim=1)
            # Append it to the growing decoder input.
            ys = torch.cat([ys, next_word.view(1, 1)], dim=1)
            sym = fr_idx_dict[next_word.item()]
            if sym == 'EOS':
                break
            translation.append(sym)

    # Convert tokens to a sentence: merge the pieces, turn end-of-word markers
    # into spaces and drop the space in front of punctuation.
    trans = "".join(translation)
    trans = trans.replace("</w>", " ")
    for x in '''?:;.,'("-!&)%''':
        trans = trans.replace(f" {x}", f"{x}")
    print(trans)
    return trans

Let's try the `translate()` function on the English phrase "Today is a beautiful day!", like so:

In [None]:
# Reload the vocabularies saved earlier, in the same order in which they were dumped.
# pickle.load can execute arbitrary code, so only open pickle files that you created yourself.
with open("files/dict.p","rb") as fb:
    en_word_dict,en_idx_dict,\
    fr_word_dict,fr_idx_dict=pickle.load(fb)
# map_location lets weights that were saved on one device load onto the current DEVICE.
trained_weights=torch.load("files/en2fr.pth",map_location=DEVICE)
model.load_state_dict(trained_weights)
model.eval()  # switch the model to evaluation mode before translating
eng = "Today is a beautiful day!"
translated_fr = translate(eng)

aujourd'hui est une belle journee! 


In [None]:
# A longer, descriptive sentence.
eng = "A little boy in jeans climbs a small tree while another child looks on."
translated_fr = translate(eng)

un petit garcon en jeans grimpe un petit arbre tandis qu'un autre enfant regarde. 


In [None]:
# Contracted form ("don't"); the next example uses the expanded form of the same sentence.
eng = "I don't speak French."
translated_fr = translate(eng)

je ne parle pas francais. 


Now let's try the sentence "I do not speak French."

In [None]:
# Expanded form ("do not") of the previous sentence.
eng = "I do not speak French."
translated_fr = translate(eng)

je ne parle pas francais. 
