
# RNN, LSTM, GRU



In [None]:
!pip install torchdata torchtext==0.13.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import re
from collections import Counter
import matplotlib.pyplot as plt
from IPython.display import clear_output
import seaborn as sns
import numpy as np
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import tqdm.notebook as tqdm

from torchtext.datasets import IMDB

import random
from random import sample

In [None]:
# Seed every random number generator the notebook relies on (DataLoader
# shuffling, weight initialisation, dropout) so that "Restart & Run All"
# gives repeatable results as far as the backend allows.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# IMDB
The IMDB dataset is a benchmark for binary sentiment classification that contains substantially more data than earlier benchmark datasets: 25,000 highly polar movie reviews for training and 25,000 for testing.

In [None]:
# Request both official splits in one call; the tuple order of `split`
# determines the order of the returned datasets.
train_data, test_data = IMDB(split=('train', 'test'))

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Tokenizer shared by the vocabulary builder and by the dataset wrapper below.
# NOTE(review): the notebook used `tokenizer` without ever defining it, which
# fails on a fresh kernel. torchtext's "basic_english" tokenizer is assumed here
# - confirm it matches the one used for the recorded outputs.
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_iter):
    """Yield the token list of every example in ``data_iter``.

    Args:
        data_iter: iterable of ``(label, text)`` pairs; the label is ignored.

    Yields:
        The result of ``tokenizer(text)`` for each example, in order.
    """
    for _, text in data_iter:
        yield tokenizer(text)


# Build the vocabulary from the training split only. Tokens seen fewer than 20
# times are left out and are looked up as "<unk>" through the default index.
vocab = build_vocab_from_iterator(yield_tokens(train_data), min_freq=20, specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

In [None]:
# Index assigned to the padding token.
vocab["<pad>"]

1

In [None]:
# Index assigned to the unknown-token placeholder (also set as the vocabulary's default index).
vocab["<unk>"]

0

In [None]:
# Vocabulary size; later used as the number of rows of the embedding table.
len(vocab)

13352

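Wrap the raw data in a map-style `Dataset` that tokenizes each review, truncates it to `max_length` tokens, and converts the tokens and the label to integer ids.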

In [None]:
max_length = 256


class TextDatasetWrapper(torch.utils.data.Dataset):
    """Map-style dataset over ``(label, text)`` pairs.

    The whole of ``base_data`` is read into memory once, so the wrapper
    supports random access and ``len()``. Tokenization, truncation to
    ``max_length`` tokens and id lookup happen lazily in ``__getitem__`` using
    the module-level ``tokenizer`` and ``vocab``.

    Attributes:
        labels: raw labels, exactly as produced by ``base_data``.
        texts: raw review strings.
    """

    # Raw label -> class id (1 = positive, 0 = negative). The string form is
    # what torchtext 0.13 yields for IMDB; newer torchtext releases yield the
    # integers 1 (negative) / 2 (positive) instead, so both are accepted.
    LABEL_TO_ID = {'pos': 1, 'neg': 0, 2: 1, 1: 0}

    def __init__(self, base_data):
        """Read every ``(label, text)`` pair of ``base_data``.

        Raises:
            ValueError: if a label is not one of the known encodings. Failing
                here is deliberate: silently mapping unknown labels to class 0
                would produce a dataset in which every example is negative.
        """
        labels = []
        texts = []
        for label, text in base_data:
            if label not in self.LABEL_TO_ID:
                raise ValueError(
                    f"unexpected label {label!r}; expected one of {list(self.LABEL_TO_ID)}"
                )
            labels.append(label)
            texts.append(text)

        self.labels = labels
        self.texts = texts

    def __getitem__(self, idx):
        """Return example ``idx`` as ``{'ids': list[int], 'length': int, 'label': int}``."""
        label = self.labels[idx]
        text = self.texts[idx]
        # Truncate before the lookup so that 'length' always equals len(ids).
        tokens = tokenizer(text)[:max_length]
        length = len(tokens)

        ids = [vocab[token] for token in tokens]

        return {
          'ids': ids,
          'length': length,
          'label': self.LABEL_TO_ID[label]
        }

    def __len__(self):
        """Number of examples held by the wrapper."""
        return len(self.texts)

In [None]:
# Replace the raw splits with their in-memory, indexable wrappers.
# NOTE: this rebinds `train_data` / `test_data`, so the cell must be run exactly
# once after the loading cell.
train_data, test_data = TextDatasetWrapper(train_data), TextDatasetWrapper(test_data)

In [None]:
# Peek at the first encoded training example: a dict with 'ids', 'length' and 'label'.
next(iter(train_data))

{'ids': [13,
  1568,
  13,
  246,
  0,
  43,
  64,
  398,
  1135,
  92,
  7,
  37,
  2,
  7126,
  15,
  3363,
  11,
  60,
  11,
  17,
  94,
  629,
  12,
  6921,
  3,
  13,
  87,
  553,
  15,
  38,
  94,
  11,
  17,
  0,
  40,
  1225,
  3,
  16,
  3,
  9263,
  51,
  11,
  131,
  780,
  8,
  2480,
  14,
  682,
  4,
  1575,
  118,
  6,
  342,
  7,
  114,
  1160,
  3052,
  13,
  72,
  75,
  8,
  74,
  14,
  19,
  537,
  3,
  2,
  121,
  10,
  5959,
  194,
  6,
  191,
  3862,
  474,
  1424,
  766,
  4314,
  42,
  489,
  8,
  834,
  287,
  61,
  58,
  50,
  127,
  3,
  12,
  826,
  61,
  489,
  8,
  1132,
  47,
  11859,
  8,
  257,
  56,
  441,
  7,
  669,
  28,
  54,
  2,
  863,
  0,
  209,
  50,
  781,
  1001,
  1304,
  147,
  18,
  2,
  2675,
  337,
  5,
  1510,
  1304,
  12,
  2,
  2359,
  1592,
  3,
  12,
  203,
  2182,
  7271,
  5,
  1919,
  0,
  7,
  0,
  50,
  73,
  4656,
  28,
  2381,
  4,
  61,
  52,
  402,
  20,
  47,
  474,
  1692,
  4,
  8135,
  4,
  5,
  999,
  347,
  3,
  54,


In [None]:
# Id of the padding token: used to right-pad batches in collate() and passed to the model as its padding index.
pad_index = vocab["<pad>"]

def collate(batch):
    """Merge a list of dataset items into one padded batch.

    Args:
        batch: list of dicts with the keys 'ids' (list of int), 'length' (int)
            and 'label' (int).

    Returns:
        dict with
            'ids':    int64 tensor [batch size, longest sequence in the batch],
                      right-padded with the module-level ``pad_index``;
            'length': int64 tensor [batch size];
            'label':  int64 tensor [batch size].
    """
    # The dtype is given explicitly: torch.tensor([]) is float32, so a single
    # empty review would otherwise change the dtype of the padded batch.
    batch_ids = [torch.tensor(item['ids'], dtype=torch.long) for item in batch]
    batch_ids = nn.utils.rnn.pad_sequence(batch_ids, padding_value=pad_index, batch_first=True)
    batch_length = torch.tensor([item['length'] for item in batch], dtype=torch.long)
    batch_label = torch.tensor([item['label'] for item in batch], dtype=torch.long)
    return {'ids': batch_ids,
            'length': batch_length,
            'label': batch_label}

In [None]:
batch_size = 64

# Only the training loader reshuffles each epoch; the test loader keeps the
# dataset order. Both pad their batches through collate().
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate)
test_dataloader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate)

In [None]:
# Peek at one collated batch: padded 'ids', per-example 'length' and 'label' tensors.
next(iter(train_dataloader))

{'ids': tensor([[  14,   10,   35,  ...,    1,    1,    1],
         [  13,  220,    2,  ...,    1,    1,    1],
         [  58,   13,   34,  ...,   52,    8, 1421],
         ...,
         [  13,   33,  252,  ..., 8198,   11,  723],
         [  11,    9,   16,  ..., 1359,  301,   19],
         [  13,  299,   14,  ...,    1,    1,    1]]),
 'length': tensor([176, 223, 256, 256, 177,  97, 246, 102, 256, 138, 256, 139, 256, 255,
         168, 247, 243,  82, 150, 256, 256, 256, 256, 157, 256, 186,  73, 108,
          70, 256, 256,  86, 256, 143, 245,  72, 145, 256, 124, 256, 256, 239,
         256, 229, 239, 256, 138, 179, 131,  97, 256, 199, 256, 143, 219, 256,
         128, 135, 168, 256, 256, 256, 256,  44]),
 'label': tensor([0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
         0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
         1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0])}

# LSTM

Long Short Term Memory networks were designed to mitigate the two main problems of RNNs:
* Long-term dependency problem
* Vanishing gradients

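The core idea behind the LSTM is to use gates — sigmoid-activated vectors with values between 0 and 1 — that control how much information from previous steps is forgotten, how much new information is written to the cell state, and how much of the cell state is exposed as output.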

$$\begin{aligned}\left(\begin{array}{l}i \\ f \\ o \\ g\end{array}\right) &=\left(\begin{array}{l}\operatorname{sigm} \\ \operatorname{sigm} \\ \operatorname{sigm} \\ \tanh \end{array}\right) W^{l}\left(\begin{array}{l}h_{t}^{l-1} \\ h_{t-1}^{l}\end{array}\right) \\ c_{t}^{l} &=f \odot c_{t-1}^{l}+i \odot g \\ h_{t}^{l} &=o \odot \tanh \left(c_{t}^{l}\right) \end{aligned}$$

![LSTM Architecture](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-chain.png)


* Image is taken from [blog post](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)

![](https://www.researchgate.net/profile/Robert-Gao/publication/325564535/figure/fig2/AS:690279778615307@1541586932678/Diagram-of-a-two-layer-bi-directional-LSTM-network.png)

In [None]:
class LSTM(nn.Module):
    """(Bi)LSTM sequence classifier with max-pooling over time.

    Pipeline: embedding -> dropout -> packed LSTM -> element-wise maximum over
    the valid time steps of the last layer's outputs -> linear layer.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output logits.
        n_layers: number of stacked LSTM layers.
        bidirectional: if True, run the LSTM in both directions.
        dropout_rate: dropout probability applied to the embeddings and
            between stacked LSTM layers.
        pad_index: id of the padding token (passed to ``nn.Embedding`` as
            ``padding_idx``).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional,
                 dropout_rate, pad_index):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is given for a single layer, so pass 0 in that case.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, bidirectional=bidirectional,
                            dropout=dropout_rate if n_layers > 1 else 0.0, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, ids, length):
        """Compute class logits for a padded batch.

        Args:
            ids: token ids, [batch size, seq len].
            length: number of real (non-padding) tokens per row, [batch size].

        Returns:
            Logits of shape [batch size, output dim].
        """
        embedded = self.dropout(self.embedding(ids))
        # embedded = [batch size, seq len, embedding dim]

        # pack_padded_sequence requires the lengths on the CPU and every length
        # to be >= 1. An empty sequence is therefore treated as a single
        # padding token instead of crashing the whole batch.
        length = torch.as_tensor(length, dtype=torch.int64).cpu().clamp(min=1)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, length, batch_first=True,
                                                            enforce_sorted=False)
        # The final hidden/cell states are not needed: the classifier pools
        # over the per-step outputs instead.
        packed_output, _ = self.lstm(packed_embedded)

        # Padded positions are filled with -inf so they can never win the max.
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True,
                                                     padding_value=float("-Inf"))
        # output = [batch size, longest length in batch, hidden dim * n directions]

        output_max = output.max(dim=1).values
        # output_max = [batch size, hidden dim * n directions]

        return self.fc(output_max)

In [None]:
# Model hyper-parameters.
vocab_size = len(vocab)
embedding_dim = hidden_dim = 300
output_dim = 2          # one logit per class
n_layers = 2
bidirectional = True
dropout_rate = 0.5

model = LSTM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    bidirectional=bidirectional,
    dropout_rate=dropout_rate,
    pad_index=pad_index,
)

In [None]:
# Draw one batch from the training loader to use as a smoke-test input for the model.
t = next(iter(train_dataloader))
t

{'ids': tensor([[   2,    0,   10,  ...,   16,  205,    3],
         [  54,   58,  854,  ...,    1,    1,    1],
         [  64, 1692, 6816,  ...,    1,    1,    1],
         ...,
         [  95,   58, 3328,  ...,    1,    1,    1],
         [  13,  619,  333,  ...,  188,    8,   33],
         [ 416,    4,   30,  ...,    1,    1,    1]]),
 'length': tensor([256, 167, 206,  58, 139, 176,  65, 226, 256, 146, 212, 233,  92, 256,
         192, 114, 157, 164,  37, 256, 161, 171, 256, 150, 197, 233,  98, 117,
         152, 256, 256,  62, 256, 190, 256, 177, 256, 130, 124, 256, 256, 159,
         208, 167, 188, 224, 207, 166, 142, 182, 207, 256, 256, 223, 108, 256,
         256, 133, 143, 256, 256, 150, 256, 224]),
 'label': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

In [None]:
# Smoke test: one forward pass should produce [batch size, output_dim] logits.
model(t['ids'], t['length']).shape

torch.Size([64, 2])

In [None]:
lr = 5e-4

# Adam over all model parameters; cross-entropy on the raw class logits.
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=lr)

In [None]:
def get_accuracy(prediction, label):
    """Fraction of rows whose highest-scoring class equals the label.

    Args:
        prediction: 2-D tensor of class scores, one row per example.
        label: tensor of class indices, one per example.

    Returns:
        A scalar tensor: number of correct predictions divided by the number
        of rows of ``prediction``.
    """
    n_examples, _ = prediction.shape
    hits = (prediction.argmax(dim=-1) == label).sum()
    return hits / n_examples

In [None]:
def train(dataloader, model, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    The model is switched to training mode and updated once per batch.

    Args:
        dataloader: iterable of batches, each a dict with 'ids', 'length' and 'label'.
        model: module called as ``model(ids, length)``.
        criterion: loss called as ``criterion(prediction, label)``.
        optimizer: optimizer that updates the model's parameters.
        device: device the ids and labels are moved to.

    Returns:
        ``(losses, accuracies)``: two lists of Python floats, one entry per batch.
    """
    model.train()
    batch_losses, batch_accs = [], []

    progress = tqdm.tqdm(dataloader, desc='training...')
    for batch in progress:
        # Ids and labels go to the target device; lengths are passed on as they are.
        token_ids, targets = batch['ids'].to(device), batch['label'].to(device)
        logits = model(token_ids, batch['length'])
        loss = criterion(logits, targets)
        batch_acc = get_accuracy(logits, targets)

        # Standard update: clear old gradients, back-propagate, take a step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        batch_accs.append(batch_acc.item())

    return batch_losses, batch_accs

In [None]:
def evaluate(dataloader, model, criterion, device):
    """Measure loss and accuracy over ``dataloader`` without updating the model.

    The model is switched to evaluation mode and gradients are disabled.

    Args:
        dataloader: iterable of batches, each a dict with 'ids', 'length' and 'label'.
        model: module called as ``model(ids, length)``.
        criterion: loss called as ``criterion(prediction, label)``.
        device: device the ids and labels are moved to.

    Returns:
        ``(losses, accuracies)``: two lists of Python floats, one entry per batch.
    """
    model.eval()
    batch_losses, batch_accs = [], []

    with torch.no_grad():
        progress = tqdm.tqdm(dataloader, desc='evaluating...')
        for batch in progress:
            # Ids and labels go to the target device; lengths are passed on as they are.
            token_ids, targets = batch['ids'].to(device), batch['label'].to(device)
            logits = model(token_ids, batch['length'])
            batch_losses.append(criterion(logits, targets).item())
            batch_accs.append(get_accuracy(logits, targets).item())

    return batch_losses, batch_accs

In [None]:
# Move the model to the selected device; train()/evaluate() move each batch's ids and labels to the same device.
model = model.to(device)

In [None]:
n_epochs = 10
best_valid_loss = float('inf')

# Per-batch histories, concatenated over all epochs.
train_losses, train_accs = [], []
valid_losses, valid_accs = [], []

for epoch in range(n_epochs):

    # NOTE: the test split doubles as the validation set here.
    train_loss, train_acc = train(train_dataloader, model, criterion, optimizer, device)
    valid_loss, valid_acc = evaluate(test_dataloader, model, criterion, device)

    train_losses += train_loss
    train_accs += train_acc
    valid_losses += valid_loss
    valid_accs += valid_acc

    # Epoch summaries are unweighted means of the per-batch values.
    epoch_train_loss, epoch_train_acc = np.mean(train_loss), np.mean(train_acc)
    epoch_valid_loss, epoch_valid_acc = np.mean(valid_loss), np.mean(valid_acc)

    # Remember the lowest validation loss seen so far.
    best_valid_loss = min(best_valid_loss, epoch_valid_loss)

    print(f'epoch: {epoch+1}')
    print(f'train_loss: {epoch_train_loss:.3f}, train_acc: {epoch_train_acc:.3f}')
    print(f'valid_loss: {epoch_valid_loss:.3f}, valid_acc: {epoch_valid_acc:.3f}')

training...:   0%|          | 0/391 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/391 [00:00<?, ?it/s]

epoch: 1
train_loss: 0.557, train_acc: 0.700
valid_loss: 0.416, valid_acc: 0.807


training...:   0%|          | 0/391 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/391 [00:00<?, ?it/s]

epoch: 2
train_loss: 0.401, train_acc: 0.816
valid_loss: 0.428, valid_acc: 0.817


training...:   0%|          | 0/391 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/391 [00:00<?, ?it/s]

epoch: 3
train_loss: 0.334, train_acc: 0.855
valid_loss: 0.323, valid_acc: 0.867


training...:   0%|          | 0/391 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/391 [00:00<?, ?it/s]

epoch: 4
train_loss: 0.293, train_acc: 0.875
valid_loss: 0.306, valid_acc: 0.877


training...:   0%|          | 0/391 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/391 [00:00<?, ?it/s]

epoch: 5
train_loss: 0.260, train_acc: 0.891
valid_loss: 0.297, valid_acc: 0.881


training...:   0%|          | 0/391 [00:00<?, ?it/s]

KeyboardInterrupt: ignored