<center><img src="./images/logo_kbtu.png" width=300 style="display: inline-block;"></center> 

### Week 9, RNN practice

<br />
<br />
November 5, 2022

Source: https://github.com/andriygav/MachineLearningSeminars/tree/master/sem15

### Import packages

In [1]:
from copy import deepcopy

import matplotlib.pyplot as plt
from matplotlib.image import imread
from mpl_toolkits import mplot3d
from matplotlib import gridspec
from PIL import Image
import io
from urllib.request import urlopen
from lime import lime_image
from skimage.segmentation import mark_boundaries

from tqdm.notebook import tqdm
import numpy as np
import requests
import torch

from sklearn.metrics import classification_report
from torch.utils.tensorboard import SummaryWriter

from torchvision import datasets, transforms

In [2]:
import warnings
# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation and numerical warnings — consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

### Set the code execution device (cpu/cuda)

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression so the notebook displays the selected device.
device

device(type='cpu')

### Recurrent neural network (seq2seq architecture, encoder-decoder)

<center><img src="./images/enc_dec_linear_out-min.png" width=600></center> 

-----
Source: https://lena-voita.github.io/nlp_course/seq2seq_and_attention.html

#### Useful code for model training

In [6]:
def _pad_rows(rows, pad_value):
    """Right-pad lists of indices to a common width and return an int64 tensor."""
    width = max(len(row) for row in rows)
    padded = np.full((len(rows), width), pad_value, dtype=np.int64)
    for row_number, row in enumerate(rows):
        padded[row_number, :len(row)] = row
    return torch.from_numpy(padded)


def batch_generator(dataset, char2idx, batch_size=64, shuffle=True):
    """Yield padded ``(x, y)`` index batches for next-sentence prediction.

    Sentence ``i`` of ``dataset`` is the source and sentence ``i + 1`` is the
    target, so a dataset of ``n`` sentences produces ``n - 1`` pairs.

    Args:
        dataset: sequence of strings, one sentence per element.
        char2idx: mapping from character / special token to integer index.
            Must contain ``'<PAD>'``. ``'<UNK>'``, ``'<START>'`` and
            ``'<FINISH>'`` are used when present and fall back to index 0.
        batch_size: maximum number of pairs per batch; the last batch may be
            smaller.
        shuffle: if True, pairs are visited in a random order drawn from
            NumPy's global random state.

    Yields:
        Tuple ``(x, y)`` of int64 tensors. ``x`` holds the source sentences,
        right-padded with ``<PAD>`` to the longest source in the batch. ``y``
        holds the targets wrapped as ``<START> ... <FINISH>`` and right-padded
        with ``<PAD>`` to the longest wrapped target in the batch.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    sources, targets = dataset[:-1], dataset[1:]

    pad = char2idx['<PAD>']
    # Characters missing from the vocabulary map to <UNK>, not to the padding
    # index, so that they are not silently masked out of the loss.
    unk = char2idx.get('<UNK>', 0)
    start = char2idx.get('<START>', 0)
    finish = char2idx.get('<FINISH>', 0)

    n_samples = len(sources)
    order = np.arange(n_samples, dtype=np.int64)
    if shuffle:
        np.random.shuffle(order)

    # Slicing past the end naturally shortens the final batch.
    for begin in range(0, n_samples, batch_size):
        batch_ids = order[begin:begin + batch_size]

        x_rows = [
            [char2idx.get(char, unk) for char in sources[i]]
            for i in batch_ids]
        y_rows = [
            [start] + [char2idx.get(char, unk) for char in targets[i]] + [finish]
            for i in batch_ids]

        yield _pad_rows(x_rows, pad), _pad_rows(y_rows, pad)

In [7]:
def train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        model: pair ``(encoder, decoder)``. Both must expose a ``device``
            attribute; the decoder must also expose ``num_layers``.
        batch_of_x: source batch passed to the encoder.
        batch_of_y: target batch; fed to the decoder as input and, shifted by
            one position, used as the prediction target.
        optimizer: optimizer whose ``step()`` is called after backprop.
        loss_function: callable ``(predictions, targets) -> scalar tensor``.

    Returns:
        The batch loss as a Python float.
    """
    encoder, decoder = model
    for module in (encoder, decoder):
        module.train()
        module.zero_grad()

    targets = batch_of_y.to(decoder.device)
    _, hidden, cell = encoder(batch_of_x.to(encoder.device))

    # Only the last `decoder.num_layers` encoder states seed the decoder.
    n_layers = decoder.num_layers
    logits = decoder(
        targets,
        h=hidden.to(decoder.device)[:, -n_layers:, :],
        c=cell.to(decoder.device)[:, -n_layers:, :],
    )

    # The output at position t is scored against the token at position t + 1:
    # drop the last prediction and the first target token. The transpose moves
    # the class axis to dim 1 before the loss is applied.
    predictions = logits[:, :-1, :].transpose(1, 2)
    loss = loss_function(predictions, targets[:, 1:])

    loss.backward()
    optimizer.step()

    return loss.cpu().item()

In [8]:
def train_epoch(train_generator, model, loss_function, optimizer):
    """Train for one pass over ``train_generator`` and return the mean loss.

    Args:
        train_generator: iterable of ``(batch_of_x, batch_of_y)`` pairs that
            also provides ``set_postfix`` (e.g. a tqdm-wrapped generator).
        model: forwarded unchanged to ``train_on_batch``.
        loss_function: forwarded unchanged to ``train_on_batch``.
        optimizer: forwarded unchanged to ``train_on_batch``.

    Returns:
        The per-sample average of the batch losses (each batch loss is weighted
        by its batch size), or ``nan`` when the generator yields no batches.
    """
    weighted_loss_sum = 0.0
    n_seen = 0
    for batch_of_x, batch_of_y in train_generator:
        batch_loss = train_on_batch(
            model, batch_of_x, batch_of_y, optimizer, loss_function)
        train_generator.set_postfix({'train batch loss': batch_loss})

        n_in_batch = len(batch_of_x)
        weighted_loss_sum += batch_loss * n_in_batch
        n_seen += n_in_batch

    # An empty generator would otherwise divide by zero; nan marks
    # "nothing was trained" without pretending the loss is 0.
    if n_seen == 0:
        return float('nan')

    return weighted_loss_sum / n_seen

In [9]:
def trainer(count_of_epoch, 
            batch_size,
            model,
            dataset,
            char2idx,
            loss_function,
            optimizer,):
    """Train ``model`` for ``count_of_epoch`` epochs with tqdm progress bars.

    Args:
        count_of_epoch: number of passes over ``dataset``.
        batch_size: batch size forwarded to ``batch_generator``.
        model: forwarded unchanged to ``train_epoch``.
        dataset: sequence of sentences forwarded to ``batch_generator``.
        char2idx: vocabulary mapping forwarded to ``batch_generator``.
        loss_function: forwarded unchanged to ``train_epoch``.
        optimizer: forwarded unchanged to ``train_epoch``.

    Returns:
        None. The epoch loss is shown in the outer progress bar's postfix.
    """
    # batch_generator pairs sentence i with sentence i + 1, so it yields
    # len(dataset) - 1 samples; size the inner progress bar from that count.
    n_pairs = max(len(dataset) - 1, 0)
    number_of_batch = (n_pairs + batch_size - 1) // batch_size

    iterations = tqdm(range(count_of_epoch))

    for _ in iterations:
        generator = tqdm(
            batch_generator(dataset, char2idx, batch_size), 
            leave=False, total=number_of_batch)
        
        epoch_loss = train_epoch(
            train_generator = generator, model = model, 
            loss_function = loss_function, 
            optimizer = optimizer)

        iterations.set_postfix({'train epoch loss': epoch_loss})

### Neural network model

In [10]:
class Encoder(torch.nn.Module):
    """Embedding layer followed by a multi-layer LSTM.

    Args:
        vocab_dim: number of rows in the embedding table.
        emb_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM is bidirectional.
    """

    @property
    def device(self):
        """Device of the module's first parameter."""
        return next(self.parameters()).device
        
    def __init__(self,
                 vocab_dim,
                 emb_dim = 10, 
                 hidden_dim = 10,
                 num_layers = 3,
                 bidirectional = False):
        super(Encoder, self).__init__()
        
        self.num_direction = int(bidirectional + 1)
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        # Stored for parity with Decoder, which exposes the same attribute.
        self.num_layers = num_layers

        # Learnable embedding matrix.
        self.embedding = torch.nn.Embedding(vocab_dim, emb_dim)

        self.encoder = torch.nn.LSTM(
            emb_dim, hidden_dim, num_layers, bidirectional=bidirectional)
        
    def forward(self, input):
        """Encode a batch of index sequences.

        Args:
            input: integer tensor of shape ``(batch_size, seq_len)``.

        Returns:
            Tuple ``(d, h, c)``: ``d`` is the LSTM output in time-major layout
            ``(seq_len, batch_size, num_direction * hidden_dim)``; ``h`` and
            ``c`` are the final hidden and cell states transposed to
            batch-first layout
            ``(batch_size, num_layers * num_direction, hidden_dim)``.
        """
        embedded = self.embedding(input) # shape (batch_size, seq_len, emb_dim)
        embedded = torch.transpose(embedded, 0, 1) # shape (seq_len, batch_size, emb_dim)
        d, (h, c) = self.encoder(embedded)
        return d, torch.transpose(h, 0, 1), torch.transpose(c, 0, 1)


In [11]:
class Decoder(torch.nn.Module):
    """Embedding + multi-layer LSTM + linear projection to output logits.

    Args:
        vocab_dim: number of rows in the embedding table.
        output_dim: size of the logit vector produced per time step.
        emb_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM is bidirectional.
        start_idx: index of the ``<START>`` token used to begin free-running
            generation. When None (default), ``char2idx['<START>']`` is looked
            up from the notebook-level ``char2idx`` mapping at call time.
    """

    @property
    def device(self):
        """Device of the module's first parameter."""
        return next(self.parameters()).device

    def __init__(self,
                 vocab_dim,
                 output_dim,
                 emb_dim = 10, 
                 hidden_dim = 10,
                 num_layers = 3,
                 bidirectional = False,
                 start_idx = None):
        super(Decoder, self).__init__()
        
        self.num_direction = int(bidirectional + 1)
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.start_idx = start_idx

        self.embedding = torch.nn.Embedding(vocab_dim, self.emb_dim)

        self.decoder = torch.nn.LSTM(
            emb_dim, hidden_dim, num_layers, bidirectional = bidirectional)

        self.linear = torch.nn.Linear(
            self.num_direction*hidden_dim, output_dim)

    def _random_state(self, batch_size):
        """Draw a random batch-first LSTM state for ``batch_size`` sequences."""
        # An LSTM state has one slice per (layer, direction) pair, each of
        # width hidden_dim. For a unidirectional decoder this is the same
        # shape as before.
        return torch.randn(
            (batch_size, self.num_layers*self.num_direction, self.hidden_dim)).to(
                self.device)

    def _start_index(self):
        """Return the ``<START>`` index, preferring the constructor argument."""
        if self.start_idx is not None:
            return self.start_idx
        # Backward-compatible fallback on the notebook-level vocabulary.
        return char2idx['<START>']

    def forward(self, real=None, h=None, c=None, max_len=50):
        """Compute logits with teacher forcing or by greedy generation.

        Args:
            real: optional integer tensor ``(batch_size, seq_len)``. When
                given, it is fed to the LSTM as input (teacher forcing).
                When None, the decoder starts from ``<START>`` and feeds back
                its own arg-max prediction for ``max_len`` steps.
            h: optional initial hidden state in batch-first layout
                ``(batch_size, num_layers * num_direction, hidden_dim)``;
                drawn from a standard normal when None.
            c: optional initial cell state, same layout and default as ``h``.
            max_len: number of generated steps; used only when ``real`` is
                None.

        Returns:
            Logits of shape ``(batch_size, seq_len, output_dim)`` with teacher
            forcing, or ``(batch_size, max_len, output_dim)`` when generating.
        """
        # Batch size comes from `real` if given, else `c`, else `h`, else 1.
        batch_size = 1
        for source in (h, c, real):
            if source is not None:
                batch_size = source.shape[0]

        if h is None:
            h = self._random_state(batch_size)
        if c is None:
            c = self._random_state(batch_size)

        # The LSTM expects states as (layers * directions, batch, hidden).
        h = torch.transpose(h, 0, 1).contiguous()
        c = torch.transpose(c, 0, 1).contiguous()

        if real is not None:
            input = torch.transpose(self.embedding(real), 0, 1)
            d, _ = self.decoder(input, (h, c))
            answers = self.linear(d)
        else:
            start_tokens = torch.full(
                (1, batch_size), self._start_index(),
                dtype=torch.long, device=self.device)
            input = self.embedding(start_tokens)

            answers = torch.zeros(
                (max_len, batch_size, self.output_dim)).to(
                    self.device)

            for i in range(max_len):
                d, (h, c) = self.decoder(input, (h, c))
                answers[i, :, :] = self.linear(d)[0]
                # Greedy decoding: the next input is the arg-max token.
                input = self.embedding(
                    torch.argmax(answers[i:i+1, :, :], dim=-1))

        return torch.transpose(answers, 0, 1)

### Dataset

In [12]:
# Read the whole corpus; the context manager closes the file handle right away.
with open('input.txt', 'r', encoding='utf-8') as text_file:  # should be simple plain text file
    text = text_file.read()

In [13]:
# Special tokens take the first four indices; corpus characters follow.
char2idx = {'<PAD>': 0, '<UNK>': 1, '<START>': 2, '<FINISH>': 3}
# Sort the characters: iteration order of a set of strings can differ between
# interpreter runs (string hash randomisation), which would assign different
# indices on every kernel restart.
for item in sorted(set(text)):
    char2idx[item] = len(char2idx)
# Inverse mapping, index -> token.
idx2char = {index: token for token, index in char2idx.items()}

In [14]:
# Keep only stripped lines whose length is strictly between 20 and 300 characters.
stripped_lines = (line.strip() for line in text.split('\n'))
dataset = [line for line in stripped_lines if 20 < len(line) < 300]
len(dataset)

3758

### Model initialization

In [15]:
# Encoder and decoder share the vocabulary size and layer dimensions.
vocab_size = len(char2idx)

encoder = Encoder(
    vocab_dim=vocab_size, num_layers=2, emb_dim=100, hidden_dim=100)
encoder.to(device)

decoder = Decoder(
    vocab_dim=vocab_size, output_dim=vocab_size,
    num_layers=2, emb_dim=100, hidden_dim=100)
decoder.to(device)

# A single optimizer updates the parameters of both networks.
trainable_parameters = [*encoder.parameters(), *decoder.parameters()]
optimizer = torch.optim.Adam(trainable_parameters, lr=1e-3)

# Padding positions are excluded from the loss.
loss_function = torch.nn.CrossEntropyLoss(ignore_index=char2idx['<PAD>'])

### Model quality before training

In [16]:
# Greedily decode 10 sequences from random initial states and print each one
# up to (not including) the first <FINISH> token.
finish_idx = char2idx['<FINISH>']
state_shape = (1, decoder.num_layers, decoder.num_direction*decoder.hidden_dim)

for _ in range(10):
    h0 = 0.1*torch.randn(state_shape).to(decoder.device)
    c0 = torch.randn(state_shape).to(decoder.device)

    # Inference only: skip building the autograd graph over 100 unrolled steps.
    with torch.no_grad():
        logits = decoder(max_len=100, h=h0, c=c0)

    indexes = torch.argmax(logits, dim=-1).cpu().numpy()[0]

    list_of_char = []
    for idx in indexes:
        if idx == finish_idx:
            break
        list_of_char.append(idx2char[idx])
    print(''.join(list_of_char))

дввввдвдввдв<UNK><UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>в<UNK>
ЗЗЗЗЗЗвФФOOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOO
ФФФФOOGввФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФO
VёёььььФФOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOOь
ццццGвФФФOOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOO
eФФOOьФOФOGФOФOOGввФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФO
Oдддвдввдвyдввyдвдввyyддввдвyyдвдввyyддввдвyyдвдввyyддввдвyyдвдввyyддввдвyyдвдввyyддввдвyyдвдввyyддв
CддввФФOOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOOьФФOOGФФOOGввФФOOOёФOOьФ
вввв<UNK><UN

### Model training

In [17]:
# Train the encoder and decoder together for 10 epochs with batches of 64,
# reusing the optimizer and loss function created in the initialization cell.
trainer(count_of_epoch=10,
        batch_size=64,
        model=(encoder, decoder),
        dataset=dataset, 
        char2idx=char2idx,
        loss_function=loss_function,
        optimizer=optimizer)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

### Model quality after training

In [18]:
# Greedily decode 10 sequences from random initial states and print each one
# up to (not including) the first <FINISH> token.
finish_idx = char2idx['<FINISH>']
state_shape = (1, decoder.num_layers, decoder.num_direction*decoder.hidden_dim)

for _ in range(10):
    h0 = 0.1*torch.randn(state_shape).to(decoder.device)
    c0 = torch.randn(state_shape).to(decoder.device)

    # Inference only: skip building the autograd graph over 100 unrolled steps.
    with torch.no_grad():
        logits = decoder(max_len=100, h=h0, c=c0)

    indexes = torch.argmax(logits, dim=-1).cpu().numpy()[0]

    list_of_char = []
    for idx in indexes:
        if idx == finish_idx:
            break
        list_of_char.append(idx2char[idx])
    print(''.join(list_of_char))

ютрить в проворил в проворил в проворил в проворил в проворил в проворил в проворил в проворил в про
ря в проворил в проворил в проворил в проворил в проворил в проворил в проворил в проворил в провори
ят в проворил в проворил в проворил в проворил в проворил в проворил в проворил в проворил в провори
дднне не подоворил в проворил в проворил в проворил в проворил в проворил в проворил в проворил в пр
вде не проворил в проворил в проворил в проворил в проворил в проворил в проворил в проворил в прово
лран в проворил в проворил в проворил в проворил в проворил в проворил в проворил в проворил в прово
– не не подоворил в проворил в проворил в проворил в проворил в проворил в проворил в проворил в про
lr провил в проворил в проворил в проворил в проворил в проворил в проворил в проворил в проворил в 
– не проворил в проворил в проворил в проворил в проворил в проворил в проворил в проворил в провори
и не проворил в проворил в проворил в проворил в проворил в проворил в проворил в проворил 