<a href="https://colab.research.google.com/github/alyson-mei/ml_math_hw_1/blob/main/lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!python -m spacy download ru_core_news_sm

Collecting ru-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.7.0/ru_core_news_sm-3.7.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
Collecting pymorphy3>=1.0.0 (from ru-core-news-sm==3.7.0)
  Downloading pymorphy3-2.0.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.2/53.2 kB[0m [31m996.3 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy3>=1.0.0->ru-core-news-sm==3.7.0)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru-core-news-sm==3.7.0)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m


In [None]:
import os
# Location of the tab-separated EN-RU parallel corpus when a local checkout is present.
path_do_data = '../../datasets/Machine_translation_EN_RU/data.txt'
if not os.path.exists(path_do_data):
    print("Dataset not found locally. Downloading from github.")
    # Fetch the corpus into the working directory; `-nc` (no-clobber) keeps an
    # already-downloaded ./data.txt instead of fetching it again.
    !wget https://raw.githubusercontent.com/neychev/made_nlp_course/master/datasets/Machine_translation_EN_RU/data.txt -nc
    # From here on, read the freshly downloaded copy.
    path_do_data = './data.txt'

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchdata.datapipes as dp
from torch.utils.tensorboard import SummaryWriter
import torchtext
import torchtext.transforms as T
from torchtext.vocab import build_vocab_from_iterator
import spacy
import tqdm
import matplotlib.pyplot as plt
import random

## Preprocessing

### Setup

In [None]:
# spaCy pipelines for the two languages; only their `.tokenizer` is used by
# enTokenize / ruTokenize below.
en = spacy.load("en_core_web_sm")
ru = spacy.load("ru_core_news_sm")

In [None]:
# Build the raw datapipe in three stages:
# 1. wrap the single file path in an iterable of filenames,
data_pipe = dp.iter.IterableWrapper([path_do_data])
# 2. open every file in that iterable in binary read mode ('rb'),
data_pipe = dp.iter.FileOpener(data_pipe, mode='rb')
# 3. parse each file as tab-delimited rows, skipping no header lines and yielding
#    every row as a tuple (downstream code unpacks it as an (english, russian) pair).
data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)

In [None]:
# Example: materialise the raw (english, russian) rows and peek at the first five.
#
# This cell is intentionally NOT skipped with `%%script false`: `total_len` is read
# again further down when the pipeline is split into train/valid/test, so it has to
# exist after a fresh Restart & Run All. (A cell magic only takes effect when it is
# the very first line of the cell; placed after a comment it raises a UsageError.)
dp_list = list(data_pipe)
total_len = len(dp_list)
dp_list[:5]

In [None]:
def enTokenize(text):
    """
    Split an English string into its token strings.

    Only the tokenizer component of the English spaCy pipeline is run; the
    surface text of every token is collected, in order, into a list.
    """
    tokens = []
    for token in en.tokenizer(text):
        tokens.append(token.text)
    return tokens

def ruTokenize(text):
    """
    Split a Russian string into its token strings.

    Only the tokenizer component of the Russian spaCy pipeline is run; the
    surface text of every token is collected, in order, into a list.
    """
    tokens = []
    for token in ru.tokenizer(text):
        tokens.append(token.text)
    return tokens

In [None]:
%%script false --no-raise-error
# example (skipped on Run All; the cell magic above must stay on the first line
# of the cell to take effect - delete it to execute this cell)

print(enTokenize(dp_list[0][0]))
print(ruTokenize(dp_list[0][1]))

### Building the vocabulary


In [None]:
def getTokens(data_iter, place):
    """
    Lazily yield one token list per sentence pair in `data_iter`.

    Every item of `data_iter` must unpack into an (english, russian) pair.
    `place = 0` selects the English (source) side and tokenizes it with
    `enTokenize`; any other value selects the Russian (target) side and
    tokenizes it with `ruTokenize`.
    """
    use_source = place == 0
    for pair in data_iter:
        english, russian = pair
        yield enTokenize(english) if use_source else ruTokenize(russian)

In [None]:
# Special tokens are inserted first, so their indices are fixed by this order:
# <pad>=0, <sos>=1, <eos>=2, <unk>=3.
special_tokens = ['<pad>', '<sos>', '<eos>', '<unk>']

# English (source) vocabulary: tokens seen fewer than twice are left out.
source_vocab = build_vocab_from_iterator(
    getTokens(data_pipe, 0),
    min_freq=2,
    specials=special_tokens,
    special_first=True,
)
# Out-of-vocabulary lookups resolve to <unk> instead of raising.
source_vocab.set_default_index(source_vocab['<unk>'])

# Russian (target) vocabulary, built the same way from the other column.
target_vocab = build_vocab_from_iterator(
    getTokens(data_pipe, 1),
    min_freq=2,
    specials=special_tokens,
    special_first=True,
)
target_vocab.set_default_index(target_vocab['<unk>'])

In [None]:
%%script false --no-raise-error
# example (skipped on Run All; the cell magic above must stay on the first line
# of the cell to take effect - delete it to execute this cell)

print(source_vocab.get_itos()[:15])
print(target_vocab.get_itos()[:15])
print("Matarese" in source_vocab)

### Numericalize sentences using vocabulary


In [None]:
def getTransform(vocab):
    """
    Build the numericalisation transform for one vocabulary.

    The returned transform is applied to a sequence of tokens and
      1. maps every token to its index in `vocab`,
      2. prepends the index of `<sos>`,
      3. appends the index of `<eos>`.

    The `<sos>` / `<eos>` indices are looked up in `vocab` rather than hard-coded,
    so the transform stays correct if the order of the special tokens changes.
    For the vocabularies built in this notebook they resolve to 1 and 2.
    """
    sos_index = vocab['<sos>']
    eos_index = vocab['<eos>']
    text_transform = T.Sequential(
        # tokens -> vocabulary indices
        T.VocabTransform(vocab=vocab),
        # <sos> at the beginning of every sentence
        T.AddToken(sos_index, begin=True),
        # <eos> at the end of every sentence
        T.AddToken(eos_index, begin=False),
    )
    return text_transform

In [None]:
%%script false --no-raise-error
# example (skipped on Run All; the cell magic above must stay on the first line
# of the cell to take effect - delete it to execute this cell)

some_sentence = list(data_pipe)[798][0]
print("Some sentence = ", end = "")
print(some_sentence)
transformed_sentence = getTransform(source_vocab)(enTokenize(some_sentence))
print("Transformed sentence = ", end = "")
print(transformed_sentence)
index_to_string = source_vocab.get_itos()
for index in transformed_sentence:
    print(index_to_string[index], end = " ")

In [None]:
# Build each transform pipeline once. Constructing them inside applyTransform would
# rebuild both pipelines for every single sentence pair flowing through the datapipe.
source_transform = getTransform(source_vocab)
target_transform = getTransform(target_vocab)

def applyTransform(sequence_pair):
    """
    Tokenize and numericalise one (english, russian) sentence pair.

    Element 0 is tokenized as English and encoded with the source vocabulary,
    element 1 is tokenized as Russian and encoded with the target vocabulary.
    Returns a 2-tuple of index sequences, each wrapped in <sos> ... <eos>.
    """
    return (
        source_transform(enTokenize(sequence_pair[0])),
        target_transform(ruTokenize(sequence_pair[1])),
    )

## Apply the function to each element in the iterator
data_pipe = data_pipe.map(applyTransform)


In [None]:
%%script false --no-raise-error
# example (skipped on Run All; the cell magic above must stay on the first line
# of the cell to take effect - delete it to execute this cell)

for sample in data_pipe:
    print(sample)
    break

### Make batches (bucket_batch)


In [None]:
def sortBucket(bucket):
    """
    Return the pairs of `bucket` as a new list ordered by length.

    Pairs are ordered by the length of their source sequence first and by the
    length of their target sequence second; the input is left untouched.
    """
    def pair_lengths(pair):
        return len(pair[0]), len(pair[1])

    ordered = list(bucket)
    ordered.sort(key=pair_lengths)
    return ordered

# Group the numericalised pairs into batches of 64. `sortBucket` is handed to the
# datapipe as `sort_key`, so sequences of similar length end up next to each other
# before batching, which keeps the amount of padding per batch small.
# NOTE(review): with `use_in_batch_shuffle = True` the order inside a batch is
# presumably shuffled again after sorting - confirm against the torchdata docs.
data_pipe = data_pipe.bucketbatch(
    batch_size = 64,
    bucket_num = 1,
    use_in_batch_shuffle = True,
    sort_key = sortBucket
)

In [None]:
%%script false --no-raise-error
# example (skipped on Run All; the cell magic above must stay on the first line
# of the cell to take effect - delete it to execute this cell)

for sample in data_pipe:
    print(sample[:4])
    break
print(len(list(data_pipe)))

In [None]:
def separateSourceTarget(sequence_pairs):
    """
    Transpose a batch of pairs into a pair of batches.

    input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`
    output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`
    """
    source_batch, target_batch = zip(*sequence_pairs)
    return (source_batch, target_batch)

## Apply the function to each element in the iterator: every batch changes from a
## list of (source, target) pairs into a (sources, targets) pair of tuples.
data_pipe = data_pipe.map(separateSourceTarget)

In [None]:
%%script false --no-raise-error
# example (skipped on Run All; the cell magic above must stay on the first line
# of the cell to take effect - delete it to execute this cell)

for sample in data_pipe:
    print(len(sample))
    print(sample[0])
    print(sample[1])
    break

### Padding

In [None]:
def applyPadding(pair_of_sequences):
    """
    Turn a (sources, targets) pair of index sequences into a pair of padded tensors.

    `T.ToTensor(0)` builds a transform that converts a list of sequences into one
    `torch.tensor` and pads the shorter sequences; the `0` passed to the constructor
    is the index of the `<pad>` token in the vocabulary.
    """
    to_padded_tensor = T.ToTensor(0)
    source_sequences = list(pair_of_sequences[0])
    target_sequences = list(pair_of_sequences[1])
    return (to_padded_tensor(source_sequences), to_padded_tensor(target_sequences))

## Pad and tensorise every batch flowing through the pipeline.
data_pipe = data_pipe.map(applyPadding)

In [None]:
%%script false --no-raise-error
# example (skipped on Run All; the cell magic above must stay on the first line
# of the cell to take effect - delete it to execute this cell)

for sample in data_pipe:
    print(len(sample))
    print(sample[0].shape)
    print(sample[0])
    print(sample[1].shape)
    print(sample[1])
    break

In [None]:
# At this point the pipeline yields padded *batches*, so the length handed to
# `random_split` has to be the number of batches, not the number of sentence pairs.
# Counting by iteration costs one extra pass over the data, but it removes the
# dependency on a variable that only exists if an optional example cell was run.
num_batches = sum(1 for _ in data_pipe)

# 80/10/10 split of the batches; the fixed seed keeps the assignment reproducible.
train, valid, test = data_pipe.random_split(
    total_length=num_batches,
    weights={"train": 0.8, "valid": 0.1, "test": 0.1},
    seed=0,
)

In [None]:
%%script false --no-raise-error
# example (skipped on Run All; the cell magic above must stay on the first line
# of the cell to take effect - delete it to execute this cell)

for sample in train:
    print(sample)
    break

In [None]:
# Every element of the datapipes is already a complete, padded batch, so automatic
# batching is switched off with `batch_size=None`. With the default `batch_size=1`
# the DataLoader would collate each batch again and add an extra leading dimension
# of size 1 to both tensors.
# NOTE(review): with `num_workers > 0` and no `sharding_filter` in the pipeline each
# worker may replay the full stream (duplicated batches) - confirm and shard if so.
train_loader = DataLoader(
    dataset=train,
    batch_size=None,
    num_workers=2)

valid_loader = DataLoader(
    dataset=valid,
    batch_size=None,
    num_workers=2)

test_loader = DataLoader(
    dataset=test,
    batch_size=None,
    num_workers=2)

In [None]:
# Sanity check: show the source half of the first training batch, then stop.
for batch in train_loader:
    print(batch[0])
    break

## Seq2Seq model

### Model

In [None]:
class Encoder(nn.Module):
    """
    LSTM encoder: embeds a batch of source token indices and returns the final
    hidden and cell states of every LSTM layer.

    Args:
        input_size: number of rows of the embedding table (source vocabulary size).
        embedding_size: dimensionality of the token embeddings.
        hidden_size: dimensionality of the LSTM hidden/cell state.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, used on the embeddings and passed to the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

    def forward(self, source):
        """
        Encode `source` of shape (seq_length, batch_size).

        Returns:
            (hidden, cell), each of shape (num_layers, batch_size, hidden_size).
        """
        # embedding: (seq_length, batch_size, embedding_size)
        embedding = self.dropout(self.embedding(source))

        # The per-step outputs are not needed; only the final states are handed on.
        _, (hidden, cell) = self.rnn(embedding)

        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """
    LSTM decoder that predicts one target token per call.

    Args:
        input_size: number of rows of the embedding table (target vocabulary size).
        embedding_size: dimensionality of the token embeddings.
        hidden_size: dimensionality of the LSTM hidden/cell state.
        output_size: number of output classes of the final linear layer.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, used on the embeddings and passed to the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size)

    # `forward` has to be a method of the class: defined at module level the
    # Decoder has no forward pass and calling it raises NotImplementedError.
    def forward(self, input, hidden, cell):
        """
        Run a single decoding step.

        Args:
            input: token indices of shape (batch_size,).
            hidden: (num_layers, batch_size, hidden_size).
            cell: (num_layers, batch_size, hidden_size).

        Returns:
            predictions of shape (batch_size, output_size) plus the updated
            hidden and cell states.
        """
        # The LSTM expects a sequence dimension: (batch_size,) -> (1, batch_size)
        input = input.unsqueeze(0)

        # embedding: (1, batch_size, embedding_size)
        embedding = self.dropout(self.embedding(input))

        # output: (1, batch_size, hidden_size)
        output, (hidden, cell) = self.rnn(embedding, (hidden, cell))

        # Drop the length-1 sequence dimension before projecting to the vocabulary:
        # predictions: (batch_size, output_size)
        predictions = self.fc(output.squeeze(0))

        return predictions, hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    """
    Encoder-decoder wrapper that decodes the target sequence step by step,
    optionally feeding the ground-truth token back in (teacher forcing).
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ration = 0.5):
        """
        Args:
            source: source token indices, (source_len, batch_size).
            target: target token indices, (target_len, batch_size); row 0 is
                used as the first decoder input (the start tokens).
            teacher_force_ration: probability, drawn once per time step, of
                feeding the ground-truth token instead of the model's own guess.

        Returns:
            Tensor of shape (target_len, batch_size, decoder.output_size).
            Position 0 is never written and stays all zeros.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = self.decoder.output_size

        # Allocate on the device of the input instead of relying on a global
        # `device` variable that is only defined in a later notebook cell.
        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        hidden, cell = self.encoder(source)

        # Grab start token
        input = target[0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output

            # Most likely token for every sequence in the batch
            best_guess = output.argmax(1)

            use_teacher = random.random() < teacher_force_ration
            input = target[t] if use_teacher else best_guess

        return outputs

### Training

In [None]:
# Reproducibility: teacher forcing draws from `random`, while weight
# initialisation and dropout draw from torch's generator.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

# Training parameters
num_epochs = 20
learning_rate = 1e-3

# Model hyperparameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size_encoder = len(source_vocab)   # encoder embeds source (English) tokens
input_size_decoder = len(target_vocab)   # decoder embeds target (Russian) tokens
output_size = len(target_vocab)          # decoder predicts target (Russian) tokens
encoder_embedding_size = 256
decoder_embedding_size = 256
hidden_size = 1024
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard: scalar summaries are written below runs/Loss_plot;
# `step` is the running global step used when logging.
writer = SummaryWriter('runs/Loss_plot')
step = 0

In [None]:
# The encoder works on the source language: source vocabulary size, encoder
# embedding size and the encoder's own dropout rate.
encoder_net = Encoder(input_size_encoder,
                      encoder_embedding_size,
                      hidden_size,
                      num_layers,
                      enc_dropout).to(device)

# The decoder embeds and predicts *target* tokens, so its embedding table must be
# sized by the target vocabulary. Sizing it by the source vocabulary makes the
# embedding lookup fail for any target index beyond the source vocabulary size.
decoder_net = Decoder(input_size_decoder,
                      decoder_embedding_size,
                      hidden_size,
                      output_size,
                      num_layers,
                      dec_dropout).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)

In [None]:
def init_weights(m):
    """
    Re-initialise every parameter reachable from module `m` in place with values
    drawn uniformly from [-0.08, 0.08].
    """
    low, high = -0.08, 0.08
    for param in m.parameters():
        nn.init.uniform_(param.data, low, high)

# nn.Module.apply calls init_weights on every submodule as well as on the model
# itself and returns the model, whose repr is displayed as this cell's output.
model.apply(init_weights)

In [None]:
# The loss is computed over *target* tokens, so the padding index to ignore is the
# one of the target vocabulary. (Both vocabularies list '<pad>' first, so the value
# is the same today, but reading it from the source vocabulary would silently
# break if the two vocabularies were ever built differently.)
pad_idx = target_vocab["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
def _as_seq_first(batch_tensor):
    """
    Bring a padded batch into the (seq_len, batch_size) layout the model expects.

    The padding step produces (batch_size, seq_len) tensors. If the DataLoader
    collated the already-batched tensor once more it arrives as
    (1, batch_size, seq_len); that extra leading dimension is removed first.
    """
    if batch_tensor.dim() == 3:
        batch_tensor = batch_tensor.squeeze(0)
    return batch_tensor.t().contiguous()


# One pass over the training data.
# NOTE(review): `num_epochs` is defined with the hyperparameters but is not used
# here - wrap this loop in `for epoch in range(num_epochs)` for a full training run.
model.train()
for batch in train_loader:
    # `source` rather than `input`, to avoid shadowing the builtin in the notebook namespace
    source = _as_seq_first(batch[0]).to(device)
    target = _as_seq_first(batch[1]).to(device)

    optimizer.zero_grad()

    output = model(source, target)
    # output: (trg_len, batch_size, output_dim)
    output_dim = output.shape[-1]

    # Position 0 corresponds to <sos> and is never predicted, so it is dropped on
    # both sides before flattening to (N, output_dim) logits vs (N,) class indices.
    loss = criterion(output[1:].reshape(-1, output_dim), target[1:].reshape(-1))
    loss.backward()

    # Clip the gradient norm to 1 before the optimizer step.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()

    # Log a plain float so the summary writer does not hold on to the graph tensor.
    writer.add_scalar('Training Loss', loss.item(), global_step=step)
    step += 1