In [3]:
'''
rnn project
sequence to sequence translation task 
using custom tokenization and vectorization
'''

'\nrnn project\nsequence to sequence translation task \nusing custom tokenization and vectorization\n'

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [5]:
# !wget -P "../data" http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
# !unzip ../data/spa-eng.zip -d ../data/

In [6]:
'''
.read() method:
appropriate when you want to read the entire contents of a file into a string.

If the file is very large and you only need to process it line by line or in chunks, you might prefer to use 
.readline() or iterate over the file object directly. This approach is more memory-efficient.

as the data is not so big .read() is used in this project
'''

'\n.read() method:\nappropriate when you want to read the entire contents of a file into a string.\n\nIf the file is very large and you only need to process it line by line or in chunks, you might prefer to use \n.readline() or iterate over the file object directly. This approach is more memory-efficient.\n\nas the data is not so big .read() is used in this project\n'

In [7]:
with open ('../data/spa-eng/spa.txt', 'r') as f:
        lines = f.read().split('\n')[:-1] # '\n' : split the data line by line

print(lines[0])
print(len(lines[0]))
print(len(lines))

Go.	Ve.
7
118964


In [8]:
data = []
source_data = []
target_data = []

for line in lines:
    source, target = line.split('\t') # '\t' : split the data by the space
    source_data.append(source)
    target_data.append(target)
    data.append((source, target))

print(source_data[0])
print(target_data[0])
print(len(data))

Go.
Ve.
118964


In [9]:
import random
random.shuffle(data)

num_val_samples = int(len(data)*0.15)
num_train_samples = len(data) - 2*num_val_samples

train_pairs = data[:num_train_samples]
val_pairs = data[num_train_samples: num_train_samples + num_val_samples]
test_pairs = data[num_train_samples + num_val_samples: ]

In [10]:
import string
from collections import Counter
from tqdm import tqdm

strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

In [11]:
class TextVectorizer():
    def __init__(self, sequence_length, vocab_size, target = False):

        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.target = target
        self.vocab_counter = Counter()
        self.stoi = {'[pad]':0, '[start]':1, '[end]':2, '[unkown]':3}
        self.itos = {0:'[pad]', 1:'[start]', 2:'[end]', 3:'[unkown]'}

    def standardize(self, text):
        text = text.lower()
        return "".join(char for char in text
                        if char not in strip_chars)

    def tokenize(self, text):
        text = self.standardize(text)
        return text.split()
    
    def adapt(self, dataset):
        
        for text in tqdm(dataset):
            tokens = self.tokenize(text)
            for token in tokens:
                self.vocab_counter[token] += 1

        for token, _ in self.vocab_counter.most_common(self.vocab_size):
            index = len(self.stoi)
            self.stoi[token] = index
            self.itos[index] = token

    def encode(self, text):
        text = self.standardize(text)
        tokens = self.tokenize(text)

        if self.target:
            result = ([self.stoi['[start]']]+ [self.stoi.get(token, 3) for token in tokens]
                    + [self.stoi['[end]']])
        else:
            result = [self.stoi.get(token, 3) for token in tokens]
        
        if len(result) <= self.sequence_length:
            pad_size = self.sequence_length - len(result)
            result += [self.stoi.get('[pad]')] * (pad_size)
        else:
            #truncate!
            result = result[:self.sequence_length]    
        return result
        
    def decode(self, int_sequence):
        
        return " ".join(self.itos.get(i , '[unknown]') for i in int_sequence)

In [12]:
sequence_length = 20
vocab_size = 15000

source_vectorizer = TextVectorizer(sequence_length, vocab_size)
target_vectorizer = TextVectorizer(sequence_length +1, vocab_size, target=True)

In [13]:
source_vectorizer.adapt(source_data)
target_vectorizer.adapt(target_data)

100%|██████████| 118964/118964 [00:00<00:00, 277019.84it/s]
100%|██████████| 118964/118964 [00:00<00:00, 267470.15it/s]


In [14]:
encoded_ = source_vectorizer.encode('If you want to sound')
source_vectorizer.decode(encoded_)

'if you want to sound [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad]'

In [15]:
eng, spa = data[700]
print(eng)
print(spa)

What time do you get up every day?
¿A qué hora te levantas todos los días?


In [16]:
print(source_vectorizer.decode(source_vectorizer.encode(eng)))
print(target_vectorizer.decode(target_vectorizer.encode(spa)))

what time do you get up every day [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad]
[start] a qué hora te levantas todos los días [end] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad]


In [17]:
print(len(source_vectorizer.decode(source_vectorizer.encode(eng))))
print(len(target_vectorizer.decode(source_vectorizer.encode(eng))))

105
115


In [18]:
class EngSpaDataset(Dataset):
    def __init__(self, data, source_vectorizer, target_vectorizer):
        super().__init__()

        self.data = data
        self.source_vectorizer = source_vectorizer
        self.target_vectorizer = target_vectorizer

    def __len__(self):
        return len(self.data)
    

    def __getitem__(self, index):
        eng, spa = self.data[index]
        eng = self.source_vectorizer.encode(eng)
        spa = self.target_vectorizer.encode(spa)
        return ({
            'english': torch.tensor(eng).long(),
            'spanish': torch.tensor(spa[:-1]).long()
            }, torch.tensor(spa[1:]).long())

In [19]:
train_dataset = EngSpaDataset(train_pairs, source_vectorizer, target_vectorizer)
val_dataset = EngSpaDataset(val_pairs, source_vectorizer, target_vectorizer)
test_dataset = EngSpaDataset(test_pairs, source_vectorizer, target_vectorizer)

In [20]:
train_dataset[0][0]['spanish'].size()

torch.Size([20])

In [21]:
'''
why the collate_fn ->

- permute() method : to change the order of dimensions of a tensor.
- handle variable length (however, it is handled by TextVectorizer already)
- much more organized data

(also possible direct indexing without zero-initialized tensors)
zero-initialized tensors: 
- prepare for padding
- control data storage
'''

'\nwhy the collate_fn ->\n\n- permute() method : to change the order of dimensions of a tensor.\n- handle variable length (however, it is handled by TextVectorizer already)\n- much more organized data\n\n(also possible direct indexing without zero-initialized tensors)\nzero-initialized tensors: \n- prepare for padding\n- control data storage\n'

In [22]:
def permute_batch_seq_collate(data: torch.Tensor): # data-> is a batch of the data
  batch_size = len(data)  
  source_input = torch.zeros(batch_size, data[0][0]["english"].size(0))
  target_input = torch.zeros(batch_size, data[0][0]["spanish"].size(0))
  target_output = torch.zeros(batch_size, data[0][1].size(0))
  for idx, (inputs, output) in enumerate(data):
    source_input[idx] = inputs["english"]
    target_input[idx] = inputs["spanish"]
    target_output[idx] = output

  return (source_input.permute(1, 0).long(), target_input.permute(1, 0).long(),
          target_output.permute(1, 0).long())

In [23]:
batch_size=64

train_dl = DataLoader(train_dataset, batch_size, shuffle=True, collate_fn= permute_batch_seq_collate)
val_dl = DataLoader(val_dataset, batch_size, collate_fn= permute_batch_seq_collate)
test_dl = DataLoader(test_dataset, batch_size, collate_fn= permute_batch_seq_collate)


In [24]:
source_, target_input_ , target_output_ = next(iter(train_dl))
print(source_.size())
print(target_input_.size())
print(target_output_.size())

torch.Size([20, 64])
torch.Size([20, 64])
torch.Size([20, 64])


In [25]:
x = torch.randint(0, vocab_size, size = (20, 64)) # torch.Size([20, 64]) integers between 0 to vocab size

In [26]:
class Encoder(nn.Module):
    def __init__(self, source_dim: int, embedding_dim: int, hidden_dim: int, 
                 padding_index:int =0, num_rnn_layers: int= 1, dropout= 0.2):
        super().__init__()

        self.dropout = nn.Dropout(dropout)
        self.embedding_layer = nn.Embedding(source_dim, embedding_dim, 
                                            padding_idx=padding_index)
        self.lstm_layer = nn.LSTM(embedding_dim, hidden_dim, 
                                  num_layers= num_rnn_layers)

    def forward(self, x):
        x = self.embedding_layer(x)
        x = self.dropout(x)
        output, (cell_state, hidden_state) = self.lstm_layer(x)
        return hidden_state, cell_state
    
        # output size: [seq_len, batch_size, hidden_dim] which we don't need in this model
        # cell_state , hidden_state: [1, batch_size, hidden_dim] 
        # cell_ state and hidde_state passed to Decoder(input cell and input hidden)

In [27]:
x = torch.randint(0, len(source_vectorizer.stoi), size = (20, 64))
encoder_ = Encoder(len(source_vectorizer.stoi), 256, 512)
print(encoder_(x)[0].size())
print(encoder_(x)[1].size())

torch.Size([1, 64, 512])
torch.Size([1, 64, 512])


In [28]:
class Decoder(nn.Module):
    def __init__(self, target_dim:int, embedding_dim: int, hidden_dim:int, 
                 padding_index: int= 0, num_rnn_layers:int =1,  dropout= 0.2):
        super().__init__()

        self.embedding_layer = nn.Embedding(target_dim, embedding_dim,  
                                            padding_idx=padding_index)
        self.lstm_layer = nn.LSTM(embedding_dim, hidden_dim, 
                                  num_layers=num_rnn_layers)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_dim, target_dim)

    def forward(self, x, hidden_state, cell_state):
        x = self.embedding_layer(x)
        x = self.dropout(x)
        outputs , (cell, hidden) = self.lstm_layer(x, (hidden_state, cell_state)) 
        predictions = self.classifier(outputs)      # outputs: [seq_len, batch_size, hidden_dim]
        return predictions                          # predictions: [seq_len, batch_size, target_dim]

In [29]:
y = torch.randint(0, len(target_vectorizer.stoi), size=(20,64))
decoder_ = Decoder(len(target_vectorizer.stoi), 256, 512)
decoder_(y, encoder_(x)[0], encoder_(x)[1]).size()

torch.Size([20, 64, 15004])

In [30]:
class LSTMNet(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target):
        h , c = self.encoder(source)
        outputs = self.decoder(target, h, c)
        return outputs

In [31]:
model_ = LSTMNet(encoder_, decoder_)
model_(x, y).size()
# len(target_vectorizer.stoi) = 15004

torch.Size([20, 64, 15004])

In [32]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

source_vocab_size = len(source_vectorizer.stoi) # 13636
target_vocab_size = len(target_vectorizer.stoi) # 15004
encoder_embedding = 128
decoder_embedding = 128
hidden_dim = 256
padding_index = target_vectorizer.stoi['[pad]'] # 0

learning_rate = 0.001
num_epochs = 20

encoder = Encoder(
    source_vocab_size,
    encoder_embedding,
    hidden_dim
    ).to(device)
decoder = Decoder(
    target_vocab_size,
    decoder_embedding,
    hidden_dim
    ).to(device)

model = LSTMNet(encoder, decoder)

criterion = nn.CrossEntropyLoss(ignore_index= padding_index)
optimizer = torch.optim.NAdam(model.parameters(), lr= learning_rate)



In [33]:
'''
each token from target_out (true class index) compared vs the corresponding row in predictions.
the row in predictions gives a probability distribution across all classes (target_dim)
[(seq_len * batch_size), target_dim] vs [seq_len * batch_size]

Each row in predictions corresponds to one token in the sequence and contains target_dim values, 
each representing the model's predicted score (or probability if softmaxed) for each possible class

Internally, CrossEntropyLoss applies the softmax function to convert scores in predictions to probabilities 
and then uses the negative log likelihood of the probability assigned to the correct class as the loss.
'''

"\neach token from target_out (true class index) compared vs the corresponding row in predictions.\nthe row in predictions gives a probability distribution across all classes (target_dim)\n[(seq_len * batch_size), target_dim] vs [seq_len * batch_size]\n\nEach row in predictions corresponds to one token in the sequence and contains target_dim values, \neach representing the model's predicted score (or probability if softmaxed) for each possible class\n\nInternally, CrossEntropyLoss applies the softmax function to convert scores in predictions to probabilities \nand then uses the negative log likelihood of the probability assigned to the correct class as the loss.\n"

In [34]:
# def accuracy(predictions, true_target):
#     predicted_tokens = predictions.view(-1, predictions.shape[-1]).argmax(dim=1)
#     true_target_flat = true_target.reshape(-1)
#     correct_tokens = (predicted_tokens == true_target_flat).sum().item()
#     return correct_tokens / true_target_flat.size(0)

In [62]:
def accuracy(predictions, true_tokens):  # [seq_len , batch_size, vocab_size] , [seq_len , batch_size]
    predicted_tokens = predictions.view(-1, predictions.shape[-1]).argmax(dim=1) # [seq_len*batch_size] torch.Size([1280])
    true_tokens_flat = true_tokens.reshape(-1)                      # [seq_len*batch_size] torch.Size([1280])
    correct = 0 
    for idx, token in enumerate(predicted_tokens):  # len(predicted_tokens) = 1280

        if token == true_tokens_flat[idx]:
            correct += 1
        if token == 0:
            break
    return correct / (idx+1) 

In [36]:
x_ = torch.randint(0, 15004, size= (20, 64, 15004))
x_.view(-1, x_.shape[-1]).argmax()

tensor(25831)

In [None]:
for epoch in range(num_epochs):

    train_loss = 0
    train_acc = 0
    val_loss = 0
    val_acc = 0

    model.train()

    for source, target_in, target_out in tqdm(train_dl):
        
        source = source.to(device)
        target_in = target_in.to(device)
        target_out = target_out.to(device)                 # [seq_len, batch_size]           
        predictions = model(source, target_in)             # [seq_len, batch_size, target_dim]
        # target_out.reshape(-1) -> [seq_len * batch_size]
        # predictions.view(-1, predictions.shape[-1]) -> [(seq_len * batch_size), target_dim] 
        loss = criterion(predictions.reshape(-1, predictions.shape[-1]), target_out.reshape(-1))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        with torch.no_grad():
            train_acc += accuracy(predictions, target_out)

    model.eval()
    with torch.no_grad():

        for source, target_in, target_out in val_dl:
            source = source.to(device)
            target_in = target_in.to(device)
            target_out = target_out.to(device)           
            predictions = model(source, target_in)

            loss = criterion(predictions.reshape(-1, predictions.shape[-1]), target_out.reshape(-1))
            val_loss += loss
            val_acc += accuracy(predictions, target_out)

    print (f'Epoch {epoch+1}/{num_epochs} |\
            Train Loss {train_loss:.2f}, Train Accuracy {(train_acc / len(train_dl)):.2f} |\
            Validation Loss {val_loss:.2f}, Validation Accuracy {(val_acc / len(val_dl)):.2f}')

In [72]:
source, target_in, target_out = next(iter(val_dl))
predicted_tokens = model(source.to(device), target_in.to(device))
predicted_vector = predicted_tokens[:, 18].argmax(1) # predicted vector for the 18th batch
target_vector  = target_out[:, 18] # original target vector for the 18th batch
print(predicted_vector)
print(target_vector)

tensor([ 27, 272, 102,  39,  51,  13, 167,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2], device='mps:0')
tensor([  27, 2821,    7,  177,  329,   13, 7074,    2,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])


In [68]:
translate = ''
for i in predicted_vector:
    text = target_vectorizer.itos[i.item()]
    translate += ' '+text
    if text == '[end]':
        break
translate

' qué pasó de tiene haber un auto [end]'

In [73]:
translate = ''
for i in target_vector:
    text = target_vectorizer.itos[i.item()]
    translate += ' '+text
    if text == '[end]':
        break
translate

' qué alimentos no debería tomar un diabético [end]'