In [7]:
import pandas as pd 
dataset = pd.read_csv("data.csv") 
print(dataset.head())
en_sentences = dataset["EN"].tolist()
fr_sentences = dataset["FR"].tolist()

                                   EN                                  FR
0           i m at a loss for words .               j en perds mes mots .
1           i m at a loss for words .              les mots me manquent .
2           i m at a loss for words .         je ne trouve pas les mots .
3  you re in better shape than i am .  tu es en meilleure forme que moi .
4                 you are in my way .              tu es sur mon chemin .


In [8]:
class Vocab:
    def __init__(self):
        self.max_len = 30
        self.token_to_ids = {"<UNK>": 0}
        self.id_to_token = {0: "<UNK>"}
        self.special_tokens = ["<pad>", "<sos>", "<eos>"]
        for token in self.special_tokens:
            self.add_token(token)

    def add_token(self, token):
        if token not in self.token_to_ids:
            id = len(self.token_to_ids)
            self.token_to_ids[token] = id
            self.id_to_token[id] = token 
    
    def sentence_to_id(self, sentence, maxlen):
        attention_id = []
        id_list = []
        words = ["<sos>"] + sentence.split() + ["<eos>"]

        for word in words:
            if word in self.token_to_ids.keys():
                id_list.append(self.token_to_ids[word])
                attention_id.append(1)
            else:
                id_list.append(self.token_to_ids["<UNK>"])
                attention_id.append(1)

        if len(id_list) > maxlen:
            id_list = id_list[:maxlen]
        while len(id_list) < maxlen:
            id_list.append(self.token_to_ids["<pad>"])
            attention_id.append(0)
        
        return id_list, attention_id

    def id_to_sentence(self, ids):
        words = []
        for id in ids:
            token = self.id_to_token.get(id,"<UNK>")
            if token == "<eos>":
                break
            if token in ["<sos>", "<pad>"]:
                continue 
            words.append(token)
        return " ".join(words)

    def build_Vocab(self, sentences):
    
        for sentence in sentences:
            tokens = sentence.lower().split()
            for token in tokens:
                self.add_token(token)

In [9]:
en_vocab = Vocab()
fr_vocab = Vocab()

en_vocab.build_Vocab(en_sentences)
fr_vocab.build_Vocab(fr_sentences)

import torch
from torch.utils.data import DataLoader

class EN_FR_Dataset(torch.utils.data.Dataset):
    def __init__(self, en_sentences, fr_sentences, en_vocab, fr_vocab):
        self.en_sentences = en_sentences
        self.fr_sentences = fr_sentences
        self.en_vocab = en_vocab
        self.fr_vocab = fr_vocab

    def __len__(self):
        return len(self.en_sentences)

    def __getitem__(self, index):
        en_sentences = self.en_sentences[index]
        fr_sentences = self.fr_sentences[index]

        en_ids, en_attention = self.en_vocab.sentence_to_id(en_sentences, 30)
        fr_ids, fr_attention = self.fr_vocab.sentence_to_id(fr_sentences, 30)

        return torch.tensor(en_ids), torch.tensor(en_attention, dtype=torch.bool), torch.tensor(fr_ids), torch.tensor(fr_attention, dtype=torch.bool)

dataset = EN_FR_Dataset(en_sentences, fr_sentences, en_vocab, fr_vocab)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

for batch in dataloader:
    en_ids, en_attention, fr_ids, fr_attention = batch
    print(en_ids.shape)


torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([3

In [None]:
import torch.nn as nn 

class Encoder(nn.Module):
    def __init__(self, vocab_size, input_size, hidden_dim, bidirectional):
        super().__init__()
        self.vocab_size = vocab_size
        self.input_size = input_size
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional

        self.input_embed = nn.Embedding(vocab_size,input_size)   
        self.gru = nn.GRU(input_size, hidden_dim, batch_first=True, bidirectional=bidirectional)


    def forward(self, batch):
        embedded_vector = self.input_embed(batch)
        encoder_output, last_hidden = self.gru(embedded_vector)
        print(encoder_output.shape)
        print(last_hidden.shape)

        return encoder_output, last_hidden #encoder outputs enthält die gesamte information der hidden states pro wort 
                                            #last_hidden verwenden wir für die initialisierung des decoders (ist der letzte hidden state vom encoder satz der die ganze info enthält)



In [None]:
import torch
class BahdanauAttention(nn.Module):

    def __init__(self, w1, w2, v, attention_dim, encoder_output, last_hidden):
        super().__init__()
        self.w1 = nn.Linear(encoder_output, attention_dim)
        self.w2 = nn.Linear(last_hidden, attention_dim)
        self.v = nn.Linear(attention_dim, 1)
        self.tanh = nn.Tanh()

    def forward(self, decoder_hidden, encoder_outputs):
        h = self.w1(encoder_outputs)
        s = self.w2(decoder_hidden).unsqueeze(1)

        score = self.tanh(h + s)        
        score_full = self.v(score) 

        attention_weights = torch.softmax(score_full, dim=1)
        context_vector = torch.sum(attention_weights * encoder_outputs, dim=1)
        
        return context_vector, attention_weights.squeeze(-1)

In [None]:
class Decoder: 
    def __init__(self, target_vocab_size, embedding_dim, hidden_dim, bidirectional):
        super().__init__()
        self.embedding = nn.Embedding()
        self.gru = nn.GRU(embedding_dim + hidden_dim, hidden_dim, batch_first=True, bidirectional=bidirectional)
        self.attention = BahdanauAttention()
        self.fc = nn.Linear(hidden_dim + hidden_dim, target_vocab_size)

    def forward(self, input_token, last_hidden, encoder_outputs):
        token_embed = self.embedding(input_token)
        decoder_output = self.gru(token_embed)

        raw_logit = self.attention(encoder_outputs, decoder_output) 
        torch.softmax(raw_logit)

