In [None]:
!python3 -m spacy download de_core_news_sm
!pip install pandarallel

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [4]:
dataset = pd.read_csv('/kaggle/input/machine-translation-dataset-de-en/translation_train.csv')
dataset_test = pd.read_csv('/kaggle/input/machine-translation-dataset-de-en/translation_test.csv')
dataset = dataset[dataset.columns[::-1]]
dataset

Unnamed: 0,german,english
0,Zwei junge weiße Männer sind im Freien in der ...,"Two young, White males are outside near many b..."
1,Mehrere Männer mit Schutzhelmen bedienen ein A...,Several men in hard hats are operating a giant...
2,Ein kleines Mädchen klettert in ein Spielhaus ...,A little girl climbing into a wooden playhouse.
3,Ein Mann in einem blauen Hemd steht auf einer ...,A man in a blue shirt is standing on a ladder ...
4,Zwei Männer stehen am Herd und bereiten Essen zu.,Two men are at the stove preparing food.
...,...,...
28995,Eine Frau schreibt hinter einer verschnörkelte...,A woman behind a scrolled wall is writing
28996,Ein Bergsteiger übt an einer Kletterwand.,A rock climber practices on a rock climbing wall.
28997,Zwei Bauarbeiter arbeiten auf einer Straße vor...,Two male construction workers are working on a...
28998,Ein älterer Mann sitzt mit einem Jungen mit ei...,An elderly man sits outside a storefront accom...


In [5]:
import spacy
import nltk

def preprocessing(data,
                  preprocess_column = 'En',
                  n_workers = 32,
                  lang_stopwords = None,
                  corpus = None,
                  lower = False):

    import pandarallel
    from pandarallel import pandarallel
    import re


    pandarallel.initialize(progress_bar=False, nb_workers=n_workers)

    data = data.copy()
    data[preprocess_column] = data[preprocess_column]

    pandarallel.initialize(nb_workers=n_workers)


    if lower:
        data[preprocess_column] = data[preprocess_column].\
                                         parallel_apply(lambda text: text.lower())

    if corpus is not None:
        nlp = spacy.load(corpus)
        data[preprocess_column] = data[preprocess_column].\
                                         parallel_apply(lambda text: " ".join([token.text for token in nlp.tokenizer(text)]))
    if lang_stopwords is not None:
        sw = nltk.corpus.stopwords.words(lang_stopwords)
        data[preprocess_column] = data[preprocess_column].\
                                         parallel_apply(lambda text: " ".join([word for word in text.split() if word not in sw]))

    return data


en_corpus = 'en_core_web_sm'
de_corpus = 'de_core_news_sm'

preprocessed_df = preprocessing(dataset,
                                preprocess_column = 'english',
                                corpus = en_corpus,
                                lower=True,
                                n_workers = 16)

preprocessed_df = preprocessing(preprocessed_df,
                                preprocess_column = 'german',
                                corpus = de_corpus,
                                lower=True,
                                n_workers = 16)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [6]:
preprocessed_df

Unnamed: 0,german,english
0,zwei junge weiße männer sind im freien in der ...,"two young , white males are outside near many ..."
1,mehrere männer mit schutzhelmen bedienen ein a...,several men in hard hats are operating a giant...
2,ein kleines mädchen klettert in ein spielhaus ...,a little girl climbing into a wooden playhouse .
3,ein mann in einem blauen hemd steht auf einer ...,a man in a blue shirt is standing on a ladder ...
4,zwei männer stehen am herd und bereiten essen ...,two men are at the stove preparing food .
...,...,...
28995,eine frau schreibt hinter einer verschnörkelte...,a woman behind a scrolled wall is writing
28996,ein bergsteiger übt an einer kletterwand .,a rock climber practices on a rock climbing wa...
28997,zwei bauarbeiter arbeiten auf einer straße vor...,two male construction workers are working on a...
28998,ein älterer mann sitzt mit einem jungen mit ei...,an elderly man sits outside a storefront accom...


In [7]:
def max_len(dataframe, col1, col2):
    max_length = preprocessed_df[col1].str.len().max()
    max_text = preprocessed_df.loc[preprocessed_df[col1].str.len() == max_length, col1].iloc[0]

    max_len_eng = len(max_text.split())

    max_length = preprocessed_df[col2].str.len().max()
    max_text = preprocessed_df.loc[preprocessed_df[col2].str.len() == max_length, col2].iloc[0]

    max_len_de = len(max_text.split())

    return max(max_len_eng, max_len_de)

max_len = max_len(preprocessed_df, 'english', 'german')
max_len

43

In [9]:
import random

seed = 1234
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# torch.backends.cudnn.deterministic = True

In [40]:
from torch.utils.data import Dataset,DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset,DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

class Tokenizer(Dataset):

    def __init__(self, data, max_len=32, get_mask = False, min_freq = 1):
        super().__init__()

        self.df = data.reset_index(drop=True)
        self.min_freq = min_freq
        self.max_len = max_len + 4
        self.get_mask = get_mask
        self.src_vocab, self.index2word_src = self.build_vocab(self.df.columns[0])
        self.trg_vocab, self.index2word_trg = self.build_vocab(self.df.columns[1])

    def get_special_tokens(self):
        return {'<unk>': 0, '<pad>':1, '<sos>': 2, '<eos>': 3}

    def get_vocab_sizes(self):
        return len(self.src_vocab), len(self.trg_vocab)

    def build_vocab(self, col):
        special = list(self.get_special_tokens())
        freq = {}

        for text in self.df[col]:
            for word in text.split():
                if word in freq:
                    freq[word] +=1
                else:
                    freq[word] = 1

        filtered_words = [word for word, count in freq.items() if count >= self.min_freq]

        vocab = special + filtered_words
        w2idx = {word: idx for idx, word in enumerate(vocab)}
        idx2w = {idx: word for idx, word in enumerate(vocab)}

        return w2idx, idx2w

    def __getitem__(self, ind):


        src_text = self.df[self.df.columns[0]][ind].split()
        trg_text = self.df[self.df.columns[1]][ind].split()

        special = self.get_special_tokens()

        tokenized_src = [special['<sos>']] + [self.src_vocab.get(word, special['<unk>']) for word in src_text] + [special['<eos>']]
        tokenized_trg = [special['<sos>']] + [self.trg_vocab.get(word, special['<unk>']) for word in trg_text] + [special['<eos>']]

        tokenized_src = tokenized_src + [self.src_vocab['<pad>']]*(self.max_len - len(tokenized_src))
        tokenized_trg = tokenized_trg + [self.trg_vocab['<pad>']]*(self.max_len - len(tokenized_trg))


        if self.get_mask:
            mask_src = [1]*len(tokenized_src) + [0]*(self.max_len - len(tokenized_src))
            mask_trg = [1]*len(tokenized_trg) + [0]*(self.max_len - len(tokenized_trg))

            return {'src': torch.tensor(tokenized_src),
                    'trg': torch.tensor(tokenized_trg),
                    'mask_src': torch.tensor(mask_src),
                    'mask_trg': torch.tensor(mask_trg)}


        return {'src': torch.tensor(tokenized_src),
                'trg': torch.tensor(tokenized_trg)}


    def __len__(self):
        return len(self.df)

batch_size = 128
get_mask = False

tokenized = Tokenizer(preprocessed_df.reset_index(drop=True),
                      max_len=max_len,
                      get_mask=get_mask,
                      min_freq=1)

train_data , valid_data = train_test_split(tokenized,
                                           test_size = 0.2)

train_dataloader = DataLoader(train_data,
                              batch_size=batch_size,
                              shuffle=True)

valid_dataloader = DataLoader(valid_data,
                              batch_size=batch_size,
                              shuffle=False)

In [41]:
import tqdm
import random
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers,dropout):
        super().__init__()

        self.embedding = nn.Embedding(input_dim,
                                      embedding_dim)

        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           dropout=dropout,
                           batch_first=True)

        self.dropout = nn.Dropout(dropout)


    def forward(self, src):
        output = self.embedding(src)
        output = self.dropout(output)
        outputs, (hidden, cell) = self.rnn(output)
        return hidden, cell

In [42]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()

        self.embedding = nn.Embedding(output_dim,
                                      embedding_dim,
                                      )

        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           dropout=dropout,
                           batch_first=True)

        self.dropout = nn.Dropout(dropout)

        self.fc_out = nn.Linear(hidden_dim, output_dim)


    def forward(self, trg, hidden, cell):
        trg = trg.unsqueeze(1)
        output = self.embedding(trg)
        output = self.dropout(output)

        output, (hidden, cell) = self.rnn(output, (hidden, cell))
        prediction = self.fc_out(output.squeeze(1))

        return prediction, hidden, cell

In [43]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, teacher_forcing_ratio):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.tf = teacher_forcing_ratio

    def forward(self, src, trg):

        trg_len = trg.shape[1]
        batch_size = trg.shape[0]
        output_dim = self.decoder.fc_out.out_features

        outputs = torch.zeros(batch_size, trg_len, output_dim).to(self.device)
        hidden, cell = self.encoder(src)
        x = trg[:, 0]

        for t in range(1, trg_len):

            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[:, t, :] = output

            teacher_force = torch.rand(1) < self.tf
            x = trg[:, t] if teacher_force else output.argmax(1)

        return outputs

In [44]:
input_dim, output_dim =  tokenized.get_vocab_sizes()
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 512
n_layers = 2
teacher_forcing_ratio = 0.5
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    n_layers,
    encoder_dropout,
).to(device)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    n_layers,
    decoder_dropout,
).to(device)

model = Seq2Seq(encoder, decoder, device, teacher_forcing_ratio)
optimizer = optim.Adam(model.parameters())
pad_index = tokenized.get_special_tokens()['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [45]:
def train_fn(
    model, data_loader, optimizer, criterion, clip, device
):
    model.train()
    epoch_loss = 0
    for batch in data_loader:
        src = batch["src"].to(device)
        trg = batch["trg"].to(device)


        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]

        output = output[:, 1:, :].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(data_loader)

def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch in data_loader:

            src = batch["src"].to(device)
            trg = batch["trg"].to(device)

            output = model(src, trg)
            output_dim = output.shape[-1]

            output = output[:, 1:, :].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(data_loader)

n_epochs = 10
clip = 1.0

best_valid_loss = float("inf")
for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_dataloader,
        optimizer,
        criterion,
        clip,
        device,
    )
    valid_loss = evaluate_fn(
        model,
        valid_dataloader,
        criterion,
        device,
    )
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut1-model.pt")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

 10%|█         | 1/10 [01:23<12:31, 83.48s/it]

	Train Loss:   5.220 | Train PPL: 184.842
	Valid Loss:   4.797 | Valid PPL: 121.090


 20%|██        | 2/10 [02:47<11:08, 83.62s/it]

	Train Loss:   4.647 | Train PPL: 104.303
	Valid Loss:   4.514 | Valid PPL:  91.328


 30%|███       | 3/10 [04:10<09:45, 83.59s/it]

	Train Loss:   4.371 | Train PPL:  79.136
	Valid Loss:   4.278 | Valid PPL:  72.099


 40%|████      | 4/10 [05:34<08:21, 83.65s/it]

	Train Loss:   4.130 | Train PPL:  62.185
	Valid Loss:   4.112 | Valid PPL:  61.093


 50%|█████     | 5/10 [06:58<06:58, 83.69s/it]

	Train Loss:   3.975 | Train PPL:  53.243
	Valid Loss:   4.002 | Valid PPL:  54.711


 60%|██████    | 6/10 [08:21<05:34, 83.69s/it]

	Train Loss:   3.875 | Train PPL:  48.179
	Valid Loss:   3.864 | Valid PPL:  47.665


 70%|███████   | 7/10 [09:45<04:11, 83.79s/it]

	Train Loss:   3.729 | Train PPL:  41.649
	Valid Loss:   3.799 | Valid PPL:  44.670


 80%|████████  | 8/10 [11:09<02:47, 83.84s/it]

	Train Loss:   3.610 | Train PPL:  36.961
	Valid Loss:   3.706 | Valid PPL:  40.677


 90%|█████████ | 9/10 [12:33<01:23, 83.90s/it]

	Train Loss:   3.493 | Train PPL:  32.894
	Valid Loss:   3.654 | Valid PPL:  38.628


100%|██████████| 10/10 [13:57<00:00, 83.80s/it]

	Train Loss:   3.404 | Train PPL:  30.094
	Valid Loss:   3.634 | Valid PPL:  37.849





In [46]:
model.load_state_dict(torch.load("tut1-model.pt"))

test_loss = evaluate_fn(model, valid_dataloader, criterion, device)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

| Test Loss: 3.595 | Test PPL:  36.402 |


In [49]:
def translate_sentence(
    sentence,
    model,
    nlp_src,
    nlp_trg,
    vocab_src,
    vocab_trg,
    lower,
    sos_token='<sos>',
    eos_token='<eos>',
    device=device,
    max_output_length=20
):
    model.eval()

    with torch.no_grad():
        if isinstance(sentence, str):
            tokens = [token.text for token in nlp_src.tokenizer(sentence)]
        else:
            tokens = [token for token in sentence]
        if lower:
            tokens = [token.lower() for token in tokens]
        tokens = [sos_token] + tokens + [eos_token]

        ids = [vocab_src.get(token, vocab_src["<unk>"]) for token in tokens]
        tensor = torch.LongTensor(ids).unsqueeze(-1).T.to(device) #.to(device)
        hidden, cell = model.encoder(tensor)
        inputs = [vocab_trg[sos_token]]
        for _ in range(max_output_length):
            inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
            output, hidden, cell = model.decoder(inputs_tensor, hidden, cell)
            predicted_token = output.argmax(-1).item()
            inputs.append(predicted_token)
            if predicted_token == vocab_trg[eos_token]:
                break
        
    return inputs, np.array(list(vocab_trg))[inputs]

num_s = 0
sentence = dataset_test["german"][num_s]
expected_translation = dataset_test["english"][num_s]

nlp_src = spacy.load("de_core_news_sm")
nlp_trg = spacy.load("en_core_web_sm")

vocab_src = tokenized.src_vocab
vocab_trg = tokenized.trg_vocab
lower = True

print(sentence)
print(expected_translation)


translate_sentence(
    sentence,
    model,
    nlp_src,
    nlp_trg,
    vocab_src,
    vocab_trg,
    lower,
    sos_token='<sos>',
    eos_token='<eos>',
    device=device,
    max_output_length=20,
)

Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.
A man in an orange hat starring at something.


([2, 21, 31, 85, 21, 7, 33, 14, 3],
 array(['<sos>', 'a', 'man', 'wearing', 'a', 'white', 'shirt', '.',
        '<eos>'], dtype='<U16'))