In [1]:
import string
def simple_tokenizer(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.lower().split()
    return tokens

# Example usage:
text_example = "This is a, simple example sentence."
tokens_example = simple_tokenizer(text_example)
print(tokens_example)


['this', 'is', 'a', 'simple', 'example', 'sentence']


In [8]:
import random
import torch
import string
from torch import nn
from torch import optim
from matplotlib import pyplot as plt
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
import torchtext
from collections import defaultdict
import torch.nn.functional as F

device = 'cuda' if torch.cuda.is_available() else 'cpu'

allowed_chars = string.ascii_lowercase + ' '

class Translator:
    def __init__(self, file_en, file_es,max_length):
        self.sentences_en = self.load_sentences(file_en)
        self.sentences_es = self.load_sentences(file_es)

        self.tokenizer_es = self.tokenizer
        self.tokenizer_en = self.tokenizer

        self.vocab_en = self.build_vocab(self.sentences_en)
        self.vocab_es = self.build_vocab(self.sentences_es)

        self.archivo_ingles = file_en
        self.archivo_espanol = file_es
        self.max_length = max_length

    def build_vocab(self, sentences):
        words = set()
        for sentence in sentences:
            words.update(self.tokenizer(sentence))
        vocab = {word: idx for idx, word in enumerate(words)}
        return vocab

    def load_sentences(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            return [line.strip() for line in file]

    def sample(self):
        index = random.randint(0, len(self.sentences_en) - 1)
        ## Input Size [batch_size, max_source_sequence_length, 300]
        # Output Size [batch_size, max_target_sequence_length, 512]
        return self.string_to_tensor(self.sentences_en[index],self.vocab_en), self.string_to_tensor(self.sentences_es[index],self.vocab_es)

    def batch_to_tensor(self, n):
        seq_in = []
        seq_out = []
        inputs, outputs = self.batch(n)
        for input, output in zip(inputs, outputs):
            seq_in.append(self.string_to_tensor(input, self.vocab_en))
            seq_out.append(self.string_to_tensor(output, self.vocab_es))
        return pad_sequence(seq_in, batch_first=True), pad_sequence(seq_out, batch_first=True)
        
    def batch(self, n):
        inputs = []
        outputs = []
        for _ in range(n):
            input, output = self.sample()
            inputs.append(input)
            outputs.append(output)
        # Convert the lists of tensors to a single tensor along the batch dimension
        #input_tensor = torch.stack(inputs, dim=0)
        #output_tensor = torch.stack(outputs, dim=0)
        return inputs, outputs

    def string_to_tensor(self, s, vocab):
        indices = [vocab.get(token, len(vocab) - 1) for token in self.tokenizer(s)] + [vocab.get('<pad>', len(vocab) - 1)]
        # Ensure the tensor has a fixed length by padding or truncating
        indices = indices[:self.max_length] + [0] * max(0, self.max_length - len(indices))
        res = torch.tensor(indices)
        return res

    def tokens_to_tensor(self, tokens, vocab):
        return torch.stack([torch.tensor(vocab.get(token, len(vocab)-1)) for token in tokens])

    def tensor_to_string(self, tensor, vocab, is_tensor=False): #############################################
        if is_tensor:
            indices = tensor.view(-1).tolist()
        else:
            indices = tensor
        return ''.join([list(vocab.keys())[int(idx)] for idx in indices])
  
    def tokens_to_indices(self, tokens, vocab):
        return [vocab.stoi[token] for token in tokens] + [vocab.stoi['<pad>']]

    def get_output_lengths(self, n):
        _, outputs = self.batch(n)
        return [len(seq) for seq in outputs]

    def __len__(self):
        return len(self.sentences_en)

    def __getitem__(self, idx):
        item = self.sentences_en[idx], self.sentences_es[idx]
        tokens_ingles = self.tokenizer_en(item[0])
        tokens_espanol = self.tokenizer_es(item[1])

        tokens_ingles = tokens_ingles + ['<eos>']
        tokens_espanol = ['<sos>'] + tokens_espanol + ['<eos>']

        if not tokens_ingles or not tokens_espanol:
            return torch.zeros(1, len(self.vocab_en)), torch.zeros(1, len(self.vocab_es))
    
        tensor_ingles = self.string_to_tensor(item[0], self.vocab_en)
        tensor_espanol = self.string_to_tensor(item[1], self.vocab_es)

        indices_ingles = [self.vocab_en.get(token, len(self.vocab_en)-1) for token in tokens_ingles] + [self.vocab_en.get('<pad>', len(self.vocab_en)-1)]
        indices_espanol = [self.vocab_es.get(token, len(self.vocab_es)-1) for token in tokens_espanol] + [self.vocab_es.get('<pad>', len(self.vocab_es)-1)]

        return tensor_ingles, tensor_espanol

    def tokenizer(self, text):
        # Remove punctuation
        if isinstance(text, torch.Tensor):
            # Convert tensor to string
            text = self.tensor_to_string(text, self.vocab_en)
        text = text.translate(str.maketrans('', '', string.punctuation))
        tokens = text.lower().split()
        return tokens
        
    def add_sos_eos_unk_pad(self, vocabulary):
        words = vocabulary.itos
        vocab = vocabulary.stoi
        embedding_matrix = vocabulary.vectors

        # Tokens especiales
        sos_token = '<sos>'
        eos_token = '<eos>'
        pad_token = '<pad>'
        unk_token = '<unk>'

        # Inicializamos los vectores para los tokens especiales, por ejemplo, con ceros
        sos_vector = torch.full((1, embedding_matrix.shape[1]), 1.)
        eos_vector = torch.full((1, embedding_matrix.shape[1]), 2.)
        pad_vector = torch.zeros((1, embedding_matrix.shape[1]))
        unk_vector = torch.full((1, embedding_matrix.shape[1]), 3.)

        # Añade los vectores al final de la matriz de embeddings
        embedding_matrix = torch.cat((embedding_matrix, sos_vector, eos_vector, unk_vector, pad_vector), 0)

        # Añade los tokens especiales al vocabulario
        vocab[sos_token] = len(vocab)
        vocab[eos_token] = len(vocab)
        vocab[pad_token] = len(vocab)
        vocab[unk_token] = len(vocab)

        words.append(sos_token)
        words.append(eos_token)
        words.append(pad_token)
        words.append(unk_token)

        vocabulary.itos = words
        vocabulary.stoi = vocab
        vocabulary.vectors = embedding_matrix

        default_stoi = defaultdict(lambda : len(vocabulary)-1, vocabulary.stoi)
        vocabulary.stoi = default_stoi
        return vocabulary

In [10]:
from torchtext.data.utils import get_tokenizer
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
import torchtext
import torch
from collections import defaultdict
import torch.nn.functional as F

def collate_fn(batch):
    ingles_batch, espanol_batch= zip(*batch)
    ingles_batch = pad_sequence(ingles_batch, batch_first=True, padding_value=0)
    espanol_batch = pad_sequence(espanol_batch, batch_first=True, padding_value=0)
    
    return ingles_batch, espanol_batch 

class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        print(input_dim)
        self.rnn = nn.LSTM(input_dim, hidden_dim, batch_first=True)

    def forward(self, x, hidden):
        output, (hidden, cell) = self.rnn(x, hidden)
        return output, (hidden, cell)

class LuongAttention(nn.Module):
    def __init__(self):
        super(LuongAttention, self).__init__()
        self.encoder_hidden_states = None

    def assign_encoder_hidden_states(self, hidden_states):
        self.encoder_hidden_states = hidden_states

    def calculate_score(self, decoder_hidden_states):
        print('-')
        print(decoder_hidden_states.shape)
        print(self.encoder_hidden_states.shape)
        self.encoder_hidden_states = self.encoder_hidden_states.unsqueeze(1).repeat(1, decoder_hidden_states.shape[1], 1)
        print(self.encoder_hidden_states.transpose(1, 2).shape)
        return torch.bmm(decoder_hidden_states, self.encoder_hidden_states.transpose(1, 2))

        #decoder_hidden_states has shape (batch_size, seq_len, hidden_size) and 
        #self.encoder_hidden_states has shape (batch_size, hidden_size, seq_len)
        
    def source_context(self, decoder_hidden_states):
        energy = self.calculate_score(decoder_hidden_states)
        attention_weights = F.softmax(energy, dim=-1)
        context_vector = torch.bmm(attention_weights, self.encoder_hidden_states)

        return context_vector #attention_weights, context_vector

    def forward(self, decoder_hidden_state):
        return self.source_context(decoder_hidden_state)


class Decoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.rnn = nn.LSTM(input_dim, hidden_dim, batch_first=True)

    def forward(self, x, hidden, cell):
        output, (hidden, cell) = self.rnn(x, (hidden, cell))
        return output, (hidden, cell)

class Seq2Seq(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.encoder = Encoder(input_size, hidden_size)
        self.attention = LuongAttention()
        self.decoder = Decoder(hidden_size, hidden_size)
        self.reduce_dimension = nn.Linear(hidden_size, output_size)
        self.output = nn.Linear(output_size, output_size)
        self.output_size = output_size
    
    def forward(self, input, hidden=None):
        batch_size = input.size(0)
        encoder_hidden = torch.zeros(1, self.encoder.rnn.hidden_size)
        encoder_cell = torch.zeros(1, self.encoder.rnn.hidden_size)

        output_enc, (hn_enc, cn_enc) = self.encoder(input, (encoder_hidden, encoder_cell))
        self.attention.assign_encoder_hidden_states(output_enc)
        
        latent_tensor = output_enc.unsqueeze(1).repeat(1, input.size(1), 1)
        
        hn_enc = hn_enc.unsqueeze(1).repeat(1, batch_size, 1)
        cn_enc = cn_enc.unsqueeze(1).repeat(1, batch_size, 1)
        out_dec, (_, _) = self.decoder(latent_tensor, hn_enc, cn_enc)
        print(5)
        attention_output = self.attention(out_dec)

        reduced_output = self.reduce_dimension(out_dec)
        attention_output = attention_output[:, :, :reduced_output.size(2)]
        return self.output(reduced_output * attention_output)

archivo_ingles = 'mock.en'
archivo_espanol = 'mock.es'
max_length = 10  # Define your desired sequence length
translator = Translator('mock.en', 'mock.es',max_length)

# Parámetros
input_dim = 300
output_dim = translator.get_output_lengths(n=1)[0]
print(output_dim)
hidden_size = 512
num_layers = 2
learning_rate = 0.001
num_epochs = 100
batch_size = 8
num_workers = 0
shuffle = True

from torch.utils.data import DataLoader

def train(model, optimizer, loss_fn, n_epochs, batch_size, train_loader):
    for epoch in range(n_epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            x, y= batch  
            x, y = x.to(device), y.to(device)
            print(x.shape)
            print(y.shape)
            print('---')
            # Determine the maximum sequence length
            max_length = max(x.size(1), y.size(1))

            padded_tensor1 = F.pad(x, (0, 0,max_length - x.size(1), 0))
            padded_tensor2 = F.pad(y, (0, 0,max_length - y.size(1), 0))

            ## Input Size [batch_size, max_source_sequence_length, 300]
            # Output Size [batch_size, max_target_sequence_length, 512]
            print(padded_tensor1.shape)
            y_pred = model(padded_tensor1.float())
            
            print('qesto')
            print(y_pred.shape)
            print(y.shape)
            loss = loss_fn(y_pred, padded_tensor2)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        if epoch % 10 == 0:
            print("Epoch: {}, Loss: {}".format(epoch, total_loss))
        history.append(total_loss)

print('###### CARGAMENTO #####')

train_dataset = Translator('mock.en', 'mock.es',max_length)
eval_dataset = Translator('mockeval.en', 'mockeval.es',max_length)

padded_inputs, padded_outputs = train_dataset.batch_to_tensor(5)
print(padded_inputs.shape)
print(padded_outputs.shape)

train_loader = DataLoader(train_dataset, batch_size, collate_fn=collate_fn, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size, collate_fn=collate_fn, shuffle=False)
print('###### INICIALIZACION MODELO #####')
model = Seq2Seq(input_dim, hidden_size, output_dim)
history = []
optimizer = optim.Adam(model.parameters(), learning_rate)
loss_fn = nn.MSELoss()
print('###### ENTRENAMIENTO #####')
train(model, optimizer, loss_fn, num_epochs, batch_size, train_loader)
plt.plot(history, label='loss')

10
###### CARGAMENTO #####
torch.Size([5, 10])
torch.Size([5, 10])
###### INICIALIZACION MODELO #####
300
###### ENTRENAMIENTO #####
torch.Size([8, 10])
torch.Size([8, 10])
---
torch.Size([8, 10])


RuntimeError: input.size(-1) must be equal to input_size. Expected 300, got 10

In [None]:
def evaluate(model, n, eval_loader):
    model.eval()
    with torch.no_grad():
        for batch in eval_loader:
            x, y= batch  # The collate_fn returns four elements, but we only need x and y for evaluation
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            for i in range(min(n, len(x))):
            #    print("Predicted:", translation.tensor_to_string(y_pred[i],translation.vocab_es))
            #    print("Actual:", translation.tensor_to_string(y[i],translation.vocab_es))
            #    print()
                #print("Predicted:", y_pred[i])
                #print("Actual:", translator.tensor_to_string(y[i],translator.vocab_es))
                predicted_word = translator.tensor_to_string(y_pred, translation.vocab_es, is_tensor=True)
                print("Predicted word:", predicted_word)
evaluate(model, 5, eval_loader)


In [None]:
hn_enc = torch.rand(1, 512)
print(hn_enc.shape)
#hn_enc = torch.rand(batch_size, seq_len, hidden_size)
latent_tensor = hn_enc.unsqueeze(1).repeat(1, input_dim, 1)
hn_enc = hn_enc.unsqueeze(1).repeat(1, batch_size, 1)
print(hn_enc.shape)