In [6]:
from prettytable import PrettyTable

def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad: continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params+=params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
  

In [7]:
import warnings
warnings.simplefilter("ignore", UserWarning)

In [None]:
!pip install torchdata
import torch
from torchtext.datasets import AG_NEWS

In [9]:
train_iter = AG_NEWS(split='train')
test_iter = AG_NEWS(split='test')
batch = [next(iter(train_iter))]
y, x = batch[0]
x

"Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."

In [10]:
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import re

tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(split='train')

def yield_tokens(data_iter):
    for _, text in data_iter:
        text = re.sub(r"[\\]", " ", text)
        text = re.sub(r"[--]", " ", text)
        text = re.sub(r"[=]", " ", text)
        text = re.sub(r"[/]", " ", text)
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

print(f"Words in vocabulary: {len(vocab)}")

Words in vocabulary: 65948


In [11]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def text_pipeline(text):
    text = re.sub(r"[\\]", " ", text)
    text = re.sub(r"[--]", " ", text)
    text = re.sub(r"[=]", " ", text)
    text = re.sub(r"[/]", " ", text)
    return tokenizer(text)

label_pipeline = lambda x: int(x) - 1


def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
         label_list.append(label_pipeline(_label))
         processed_text = torch.tensor(vocab(text_pipeline(_text)), dtype=torch.int64)
         text_list.append(processed_text)
         
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = pad_sequence(text_list, batch_first=True, padding_value=vocab["<pad>"])
    return label_list.to(device), text_list.to(device)

train_iter = AG_NEWS(split='train')
dataloader = DataLoader(train_iter, batch_size=1, shuffle=False, collate_fn=collate_batch)
collate_batch(batch)[1].shape, len(text_pipeline(x))

(torch.Size([1, 31]), 31)

In [13]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 2000
    start_time = time.time()

    for idx, (label, text) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            print(f"Loss {loss.item()}")
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [14]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset


train_iter, test_iter = AG_NEWS()
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

BATCH_SIZE = 64 # batch size for training
train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

In [25]:
y, X = next(iter(train_dataloader))
X.shape

torch.Size([64, 104])


#### A) Baseline architecture

No tricks you know about bag of words.



In [12]:
import torch.nn as nn
class BagOfEmb(nn.Module):
    def __init__(self, n_tokens, embedding_dim):
        super(BagOfEmb, self).__init__()
        self.embedding_layer = nn.Embedding(n_tokens, embedding_dim, padding_idx=vocab["<pad>"])
        self.linear = nn.Linear(embedding_dim, 4)
    def forward(self, batch):
        out = self.embedding_layer(batch)
        out = torch.mean(out, dim=1)
        out = self.linear(out)
        return out

In [103]:
# Models
vocab_size = len(vocab)
emsize = 25
model = BagOfEmb(vocab_size, emsize).to(device)

# Hyperparameters
EPOCHS = 5 # epoch
LR = 5  # learning rate


criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

-----------------------------------------------------------
| end of epoch   1 | time:  8.60s | valid accuracy    0.824 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time:  8.65s | valid accuracy    0.863 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time:  8.61s | valid accuracy    0.873 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time:  8.65s | valid accuracy    0.870 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time:  8.86s | valid accuracy    0.883 
-----------------------------------------------------------


In [111]:
count_parameters(model)

+------------------------+------------+
|        Modules         | Parameters |
+------------------------+------------+
| embedding_layer.weight |  1648700   |
|     linear.weight      |    100     |
|      linear.bias       |     4      |
+------------------------+------------+
Total Trainable Params: 1648804


1648804


#### A) CNN architecture

All the tricks you know about dense and convolutional neural networks apply here as well.
* Dropout. Nuff said.
* Batch Norm. This time it's `nn.BatchNorm*`/`L.BatchNormalization`
* Parallel convolution layers. The idea is that you apply several nn.Conv1d to the same embeddings and concatenate output channels.
* More layers, more neurons, ya know...

![picture](https://lena-voita.github.io/resources/lectures/text_clf/neural/cnn/several_kernel_sizes-min.png)

In [28]:
class ParalellConv(nn.Module):
      def __init__(self, embedding_dim, n_filters, kernel_size):
        super(ParalellConv, self).__init__()
        # Conv - RelU - Droput - MaxOverTime
        self.layers = nn.Sequential(
            nn.Conv1d(embedding_dim, n_filters, kernel_size), 
            nn.ReLU(),
            nn.Dropout(p=0.1)
        )
      def forward(self, x):
          return self.layers(x)


class CNN_TEXT(nn.Module):
    def __init__(self, n_tokens, embedding_dim):
        super(CNN_TEXT, self).__init__()
        self.embedding_layer = nn.Embedding(n_tokens, embedding_dim)
        self.conv2 = ParalellConv(embedding_dim, 16, kernel_size=2)
        self.conv4 = ParalellConv(embedding_dim, 8, kernel_size=3)
        self.conv6 = ParalellConv(embedding_dim, 4, kernel_size=4)

        self.bn = nn.BatchNorm1d(28)
        self.linear = nn.Linear(28, 4)

    def forward(self, batch):
        emb_out = self.embedding_layer(batch)
        emb_perm = torch.permute(emb_out, (0, 2, 1))

        out2 = self.conv2(emb_perm).max(-1)[0]
        out4 = self.conv4(emb_perm).max(-1)[0]
        out6 = self.conv6(emb_perm).max(-1)[0]
        out = torch.cat([out2, out4, out6], axis=1)
        out = self.bn(out)
        out = self.linear(out)

        return out

In [124]:
# Models
vocab_size = len(vocab)
emsize = 25
model = CNN_TEXT(vocab_size, emsize).to(device)

# Hyperparameters
EPOCHS = 5 # epoch
LR = 3  # learning rate


criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

-----------------------------------------------------------
| end of epoch   1 | time: 12.56s | valid accuracy    0.709 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time: 12.62s | valid accuracy    0.775 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time: 12.49s | valid accuracy    0.790 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time: 12.51s | valid accuracy    0.812 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time: 12.51s | valid accuracy    0.823 
-----------------------------------------------------------


In [125]:
count_parameters(model)

+------------------------+------------+
|        Modules         | Parameters |
+------------------------+------------+
| embedding_layer.weight |  1648700   |
| conv2.layers.0.weight  |    800     |
|  conv2.layers.0.bias   |     16     |
| conv4.layers.0.weight  |    600     |
|  conv4.layers.0.bias   |     8      |
| conv6.layers.0.weight  |    400     |
|  conv6.layers.0.bias   |     4      |
|       bn.weight        |     28     |
|        bn.bias         |     28     |
|     linear.weight      |    112     |
|      linear.bias       |     4      |
+------------------------+------------+
Total Trainable Params: 1650700


1650700

#### B) Play with pooling

There's more than one way to perform pooling:
* Max over time (independently for each feature)
* Average over time (excluding PAD)
* Softmax-pooling:
$$ out_{i, t} = \sum_t {h_{i,t} \cdot {{e ^ {h_{i, t}}} \over \sum_\tau e ^ {h_{j, \tau}} } }$$

* Attentive pooling
$$ out_{i, t} = \sum_t {h_{i,t} \cdot Attn(h_t)}$$

, where $$ Attn(h_t) = {{e ^ {NN_{attn}(h_t)}} \over \sum_\tau e ^ {NN_{attn}(h_\tau)}}  $$
and $NN_{attn}$ is a dense layer.

The optimal score is usually achieved by concatenating several different poolings, including several attentive pooling with different $NN_{attn}$ (aka multi-headed attention).

The catch is that keras layers do not inlude those toys. You will have to [write your own keras layer](https://keras.io/layers/writing-your-own-keras-layers/). Or use pure tensorflow, it might even be easier :)

In [2]:
### SKIP DIFFERENT POOLING 

#### C) Fun with words

It's not always a good idea to train embeddings from scratch. Here's a few tricks:

* Use a pre-trained embeddings from `gensim.downloader.load`. See last lecture.
* Start with pre-trained embeddings, then fine-tune them with gradient descent. You may or may not download pre-trained embeddings from [here](http://nlp.stanford.edu/data/glove.6B.zip) and follow this [manual](https://keras.io/examples/nlp/pretrained_word_embeddings/) to initialize your Pytorch embedding layer with downloaded weights.

Initialize embedding matrix with glove embeddings, 
for out of vocabulary use random initialization

In [16]:
import gensim.downloader as api
import numpy as np


glove = api.load("glove-twitter-25")



In [18]:
embeddings_matrix = np.zeros((len(vocab), 25))
embeddings_dim = glove["will"].shape

for word, index in vocab.get_stoi().items():
    if word in glove:
        embeddings_matrix[index] = glove.get_vector(word)
    else:
        embeddings_matrix[index] = np.random.uniform(-0.5, 0.5, embeddings_dim)

embeddings_matrix = torch.tensor(embeddings_matrix, device=device, dtype=torch.float32)
print(f"Sanity check n_words, word_shape {embeddings_matrix.shape}")

Sanity check n_words, word_shape torch.Size([65948, 25])


In [78]:
class CNN_TEXT(nn.Module):
    def __init__(self, embeddings_matrix, embedding_dim, freeze=False):
        super(CNN_TEXT, self).__init__()
        self.embedding_layer = nn.Embedding.from_pretrained(embeddings_matrix, freeze=freeze).to(device)
        self.conv2 = ParalellConv(embedding_dim, 16, kernel_size=2)
        self.conv4 = ParalellConv(embedding_dim, 8, kernel_size=3)
        self.conv6 = ParalellConv(embedding_dim, 4, kernel_size=4)

        self.bn = nn.BatchNorm1d(28)
        self.linear = nn.Linear(28, 4)

    def forward(self, batch):
        emb_out = self.embedding_layer(batch)
        emb_perm = torch.permute(emb_out, (0, 2, 1))

        out2 = self.conv2(emb_perm).max(-1)[0]
        out4 = self.conv4(emb_perm).max(-1)[0]
        out6 = self.conv6(emb_perm).max(-1)[0]
        out = torch.cat([out2, out4, out6], axis=1)
        out = self.bn(out)
        out = self.linear(out)

        return out

In [81]:
# Models
emsize = 25
model = CNN_TEXT(embeddings_matrix, emsize).to(device)

# Hyperparameters
EPOCHS = 5 # epoch
LR = 3  # learning rate


criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

-----------------------------------------------------------
| end of epoch   1 | time: 25.36s | valid accuracy    0.879 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time: 24.79s | valid accuracy    0.892 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time: 25.20s | valid accuracy    0.901 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time: 24.82s | valid accuracy    0.903 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time: 24.11s | valid accuracy    0.900 
-----------------------------------------------------------



#### D) Going recurrent

We've already learned that recurrent networks can do cool stuff in sequence modelling. Turns out, they're not useless for classification as well. With some tricks of course..

* Like convolutional layers, LSTM should be pooled into a fixed-size vector with some of the poolings.
* Since you know all the text in advance, use bidirectional RNN
  * Run one LSTM from left to right
  * Run another in parallel from right to left 
  * Concatenate their output sequences along unit axis (dim=-1)

![picture](https://lena-voita.github.io/resources/lectures/text_clf/neural/rnn/rnn_final_state-min.png)



In [70]:
class LSTM(nn.Module):
    def __init__(self, freeze=False):
        super(LSTM, self).__init__()
        self.embedding_layer = nn.Embedding.from_pretrained(embeddings_matrix, freeze=freeze).to(device)
        self.lstm = nn.LSTM(input_size=25,
                    hidden_size=50,
                    num_layers=1,
                    batch_first=True,
                    bidirectional=False)
        self.drop = nn.Dropout(p=0.5)
        self.linear = nn.Linear(50, 4)
        
    def forward(self, batch):
        emb_out = self.embedding_layer(batch)
        lstm_out, (last_hidden, _) = self.lstm(emb_out)
        out = self.drop(torch.squeeze(last_hidden, 0))
        out = self.linear(out)
        return out 

In [72]:
# Models
emsize = 25
model = LSTM().to(device)

# Hyperparameters
EPOCHS = 5 # epoch
LR = 3  # learning rate


criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

-----------------------------------------------------------
| end of epoch   1 | time: 10.68s | valid accuracy    0.245 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time: 10.07s | valid accuracy    0.578 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time: 10.16s | valid accuracy    0.861 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time: 10.31s | valid accuracy    0.859 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time: 10.13s | valid accuracy    0.891 
-----------------------------------------------------------


![picture](https://lena-voita.github.io/resources/lectures/text_clf/neural/rnn/bidirectional-min.png)

In [98]:
class LSTM_Bi(nn.Module):
    def __init__(self, freeze=False):
        super(LSTM_Bi, self).__init__()
        self.embedding_layer = nn.Embedding.from_pretrained(embeddings_matrix, freeze=freeze).to(device)
        self.lstm = nn.LSTM(input_size=25,
                    hidden_size=50,
                    num_layers=1,
                    batch_first=True,
                    bidirectional=True)
        self.drop = nn.Dropout(p=0.5)
        self.linear = nn.Linear(2 * 50, 4)
        
    def forward(self, batch):
        emb_out = self.embedding_layer(batch)
        lstm_out, (last_hidden, _) = self.lstm(emb_out)
        hidden_concat = torch.cat([last_hidden[0], last_hidden[1]], dim=1)
        out = self.drop(hidden_concat)
        out = self.linear(out)
        return  out

In [99]:
# Models
emsize = 25
model = LSTM_Bi().to(device)

# Hyperparameters
EPOCHS = 5 # epoch
LR = 3  # learning rate


criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

-----------------------------------------------------------
| end of epoch   1 | time: 11.80s | valid accuracy    0.893 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time: 11.97s | valid accuracy    0.898 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time: 11.60s | valid accuracy    0.904 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time: 11.42s | valid accuracy    0.908 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time: 11.05s | valid accuracy    0.911 
-----------------------------------------------------------


In [100]:
# sum_lasthidden = output[:, -1, :hidden_size] + output[:, -1, hidden_size:]
# https://stackoverflow.com/questions/50856936/taking-the-last-state-from-bilstm-bigru-in-pytorch

class LSTM_Bi(nn.Module):
    def __init__(self, freeze=False):
        super(LSTM_Bi, self).__init__()
        self.embedding_layer = nn.Embedding.from_pretrained(embeddings_matrix, freeze=freeze).to(device)
        self.lstm = nn.LSTM(input_size=25,
                    hidden_size=50,
                    num_layers=1,
                    batch_first=True,
                    bidirectional=True)
        self.drop = nn.Dropout(p=0.5)
        self.linear = nn.Linear(2 * 50, 4)
        
    def forward(self, batch):
        emb_out = self.embedding_layer(batch)
        output, (last_hidden, _) = self.lstm(emb_out)
        hidden_size = 50
        hidden_concat = torch.cat([output[:, -1, :hidden_size], output[:, -1, hidden_size:]], dim=1)
        out = self.drop(hidden_concat)
        out = self.linear(out)
        return  out

In [101]:
# Models
emsize = 25
model = LSTM_Bi().to(device)

# Hyperparameters
EPOCHS = 5 # epoch
LR = 3  # learning rate


criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

-----------------------------------------------------------
| end of epoch   1 | time: 11.84s | valid accuracy    0.254 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time: 11.67s | valid accuracy    0.887 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time: 11.67s | valid accuracy    0.892 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time: 11.57s | valid accuracy    0.895 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time: 11.46s | valid accuracy    0.906 
-----------------------------------------------------------
