In [None]:
!pip install -q catalyst nltk torchtext captum gdown

# Seminar

Hi! Today we start the NLP section of our course, beginning with embeddings and Recurrent Neural Networks.

In [None]:
from catalyst.utils import set_global_seed, get_device

set_global_seed(42)
device = get_device()

## Text preprocessing

Text preprocessing is the most important part of NLP. In comparison, an image is usually just reshaped and normalized in a preprocessing pipeline. Text is different. A text consists of words (or tokens), each of which has a different probability of being written. Words are arrays of characters, and different arrays can correspond to the same word (e.g. "it" and "It", or "Имя" and "Имени", are the same word in different forms). That's why texts should be normalized and tokenized.

In [None]:
example = "Hello! My name is <unk> and i'm <unk>."

In [None]:
lower = example.lower()
print(lower)

In [None]:
from nltk.tokenize import WordPunctTokenizer


tokenizer = WordPunctTokenizer()
tokens = tokenizer.tokenize(lower)
print(tokens)

Another part of preprocessing is filtering. Every token has to be informative. Punctuation carries little information, so it should be removed. Pronouns, prepositions, articles (and other short words) should be removed too. Usually, they do not help to solve the task.

In [None]:
from string import punctuation

# Keep only tokens that are at least three characters long and are not made up
# purely of punctuation characters.  `punctuation` is a string, so a plain
# `token not in punctuation` is a substring test and lets multi-character
# punctuation runs such as "..." or "!?!" through.
filtered = [
    token for token in tokens
    if len(token) >= 3 and not all(char in punctuation for char in token)
]
print(filtered)

The last part of preprocessing is adding special tokens. They mark the beginning (`SOS`) or end (`EOS`) of a text/sentence, out-of-vocabulary words (`UNK`), and padding for batching (`PAD`). A neural network can have other special tokens. For BERT, some tokens should be masked: these tokens are replaced with the `MASK` token.

In [None]:
# Special tokens that are added to texts during preprocessing.
SOS = "<SOS>" # start of sequence; also written <SOT>/<BOT>/<BOS>
EOS = "<EOS>" # end of sequence (same naming variations apply)
PAD = "<PAD>" # padding used to equalize lengths inside a batch
UNK = "<UNK>" # stands in for out-of-vocabulary words

# Used only by some models
MASK = "<MASK>" # Masked Language Models

## Text Classification

The most popular task in NLP is text classification. Before 2012, this task was solved by pairing the TF-IDF method with some classification model. But now we have embedding vectors, which map tokens into a continuous, high-dimensional real space. Read more about embeddings: [NLP course for you](https://lena-voita.github.io/nlp_course/word_embeddings.html).

For text classification we will use mean of embeddings for each text as a feature vector. Let's code this!

In [None]:
import torch

from torchtext.legacy.datasets import YelpReviewPolarity

train_dataset, test_dataset = YelpReviewPolarity()
vocab = train_dataset.get_vocab()
vocab.load_vectors("glove.6B.300d")

Each text has a different length, so we build a batch by adding padding tokens at the end of each text. An efficient way to do this is bucketing. However, it's not so easy to implement, so we will use simple padding.

In [None]:
from typing import Tuple

from torch.utils.data import DataLoader


PAD_ID = vocab.stoi["<pad>"]


def collate_fn(
    batch: Tuple[torch.Tensor, torch.Tensor],
    pad_id=None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Pad a batch of ``(label, token_ids)`` pairs into dense tensors.

    Args:
        batch: Sequence of ``(label, token_ids)`` pairs, where ``token_ids``
            is a 1-D tensor of token indices. Must not be empty.
        pad_id: Index written into the positions after the end of each text.
            When omitted, the module-level ``PAD_ID`` is used.

    Returns:
        ``(texts, labels)``: ``texts`` is a long tensor of shape
        ``(len(batch), longest_text)`` and ``labels`` is a float tensor of
        shape ``(len(batch),)``.
    """
    if pad_id is None:
        pad_id = PAD_ID
    max_len = max(txt.size(0) for _, txt in batch)
    # Allocate the padded matrix once, directly as integers. Going through a
    # float32 buffer and casting afterwards rounds token ids above 2**24.
    texts = torch.full((len(batch), max_len), pad_id, dtype=torch.long)
    labels = torch.zeros(len(batch))
    for idx, (label, txt) in enumerate(batch):
        texts[idx, : txt.size(0)] = txt
        labels[idx] = label
    return texts, labels


batch_size = 256

# Both splits share the batch size and the padding collate function; only the
# training split is shuffled and drops its last, incomplete batch.
loaders = {
    split: DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=is_train,
        drop_last=is_train,
        collate_fn=collate_fn,
    )
    for split, dataset, is_train in (
        ("train", train_dataset, True),
        ("valid", test_dataset, False),
    )
}

Our algorithm is this:
- Get embeddings for each word
- Get mean vector for text
- Classify text by mean vector

Let's code this.

In [None]:
import torch.nn as nn
from catalyst.contrib.nn import Lambda

class EmbeddingModel(nn.Module):
    """Binary text classifier built on the mean of token embeddings.

    Token ids are embedded, normalized per embedding channel, averaged over
    the sequence axis (padding positions are included in the mean) and passed
    through a small MLP that produces one raw logit per text.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Dimensionality of each token embedding.
        hidden_size: Width of the hidden layer of the classifier head.
        dropout_p: Dropout probability used in both dropout layers.
    """

    def __init__(
        self,
        vocab_size: int,
        embedding_size: int = 300,
        hidden_size: int = 150,
        dropout_p: float = 0.2,
    ):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.out = nn.Sequential(
            # (batch, seq, emb) -> (batch, emb, seq) for BatchNorm1d.  This
            # has to be a transpose: ``reshape`` only reinterprets the memory
            # layout and would mix values of different tokens and embedding
            # dimensions inside every "channel".
            Lambda(lambda x: x.transpose(1, 2)),
            nn.BatchNorm1d(embedding_size),
            # NOTE(review): Dropout2d receives a 3-D tensor here; recent
            # PyTorch releases warn about that and provide nn.Dropout1d --
            # confirm against the installed version before switching.
            nn.Dropout2d(dropout_p),
            # Average over the sequence axis -> (batch, emb).
            Lambda(lambda x: x.mean(2)),
            nn.Linear(embedding_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.Dropout(dropout_p),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Return one logit per text as a flat tensor.

        Args:
            input_ids: Token indices; the layers above treat dimension 0 as
                the batch and dimension 1 as the sequence.
        """
        embedded = self.embedding(input_ids)
        return self.out(embedded).reshape(-1)

In the next section, we will interpret the model's predictions. This works best with a binary classifier that has a single output, so we will train our model that way by changing the criterion to `BCEWithLogitsLoss`.

Create model, optimizer and criterion!

In [None]:
from catalyst.contrib.nn import RAdam


model = EmbeddingModel(len(vocab), dropout_p=0.2)
optimizer = RAdam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

Embeddings are difficult to train. We will use pretrained ones.

In [None]:
model.embedding.weight.data.copy_(vocab.vectors)

In [None]:
from catalyst.dl import SupervisedRunner, MultilabelAccuracyCallback
        
runner = SupervisedRunner()

In [None]:
from datetime import datetime
from pathlib import Path


logdir = Path("emb_logs") / datetime.now().strftime("%Y%m%d-%H%M%S")

`AccuracyCallback` doesn't work well with a binary classifier, so we replace it with `MultilabelAccuracyCallback`.

In [None]:
# The model emits raw logits (the criterion is BCEWithLogitsLoss), and the
# accuracy callback compares its input against `threshold` as-is, so the
# probability cut-off 0.5 corresponds to a logit threshold of 0.0.
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    callbacks=[
        MultilabelAccuracyCallback(
            input_key="logits", target_key="targets", threshold=0.0
        ),
    ],
    loaders=loaders,
    verbose=True,
    num_epochs=1,
    logdir=logdir,
    valid_loader="valid",
    valid_metric="accuracy",
    # Accuracy is higher-is-better; the runner minimizes the validation
    # metric unless told otherwise, which would keep the worst checkpoint.
    minimize_valid_metric=False,
)

### Model Interpretability

Interpreting a model's predictions is one of an ML engineer's tasks. To understand a neural network's predictions, we have a great tool from PyTorch: [captum](https://github.com/pytorch/captum). It includes several algorithms, and we will use one of them (LayerIntegratedGradients, [arxiv](https://arxiv.org/pdf/1805.05492.pdf)) to understand which words influence the prediction.

In [None]:
from torchtext.data.utils import get_tokenizer


tokenize = get_tokenizer("basic_english")

In [None]:
from typing import List
from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization


token_reference = TokenReferenceBase(reference_token_idx=PAD_ID)
lig = LayerIntegratedGradients(model, model.embedding)

In [None]:
# accumulate a couple of samples in this list for visualization purposes
vis_data_records_ig = []

def interpret_sentence(
    model: nn.Module, sentence: str, min_len: int = 7, label: int = 0
):
    """Predict the sentiment of ``sentence`` and record token attributions.

    The sentence is tokenized, right-padded with ``"<pad>"`` up to ``min_len``
    tokens, scored by ``model`` and attributed with the module-level ``lig``
    object. The result is appended to ``vis_data_records_ig``.

    Args:
        model: Classifier returning one logit per input sequence.
        sentence: Raw text to interpret.
        min_len: Minimum number of tokens; shorter inputs are padded.
        label: Ground-truth class (1 is shown as "pos", anything else "neg").

    NOTE(review): attributions come from the global ``lig``, which was built
    around a specific model instance; ``model`` here is only used for the
    prediction. Confirm both refer to the same network.
    """
    model.eval()
    text = list(tokenize(sentence))
    if len(text) < min_len:
        text += ["<pad>"] * (min_len - len(text))
    indexed = [vocab.stoi[t] for t in text]

    model.zero_grad()

    # input_indices dim: [1, sequence_length]
    input_indices = torch.tensor(indexed, device=device).unsqueeze(0)

    # The baseline must have exactly as many positions as the (possibly
    # padded) input. Using `min_len` here breaks for sentences that are
    # longer than `min_len` tokens.
    seq_length = len(indexed)

    # predict
    pred = torch.sigmoid(model(input_indices)).item()
    pred_label = "pos" if pred > 0.5 else "neg"

    # generate reference indices for each sample
    reference_indices = token_reference.generate_reference(
        seq_length, device=device
    ).unsqueeze(0)

    # compute attributions and approximation delta using layer integrated gradients
    attributions_ig, delta = lig.attribute(
        input_indices,
        reference_indices,
        n_steps=5000,
        return_convergence_delta=True,
    )

    print(f"pred: {pred_label}({pred:.2}), delta: {abs(delta)}")

    add_attributions_to_visualizer(
        attributions_ig,
        text,
        pred,
        pred_label,
        label,
        delta,
        vis_data_records_ig,
    )

def add_attributions_to_visualizer(
    attributions: torch.Tensor,
    text: List[str],
    pred: float,
    pred_ind: str,
    label: int,
    delta: float,
    vis_data_records: List[visualization.VisualizationDataRecord],
):
    """Turn raw attributions into a visualization record and store it.

    Args:
        attributions: Attribution tensor; it is summed over dimension 2 and
            dimension 0 is squeezed, leaving one score per token.
        text: Tokens of the interpreted sentence, in input order.
        pred: Predicted probability of the positive class.
        pred_ind: Name of the predicted class.
        label: Ground-truth class (1 is shown as "pos", anything else "neg").
        delta: Convergence delta reported by the attribution call.
        vis_data_records: List that the new record is appended to.
    """
    attributions = attributions.sum(dim=2).squeeze(0)
    # Scale to unit L2 norm; skip the division when every attribution is
    # zero, which would otherwise fill the record with NaN.
    norm = torch.norm(attributions)
    if norm > 0:
        attributions = attributions / norm
    attributions = attributions.cpu().detach().numpy()

    # storing couple samples in an array for visualization purposes
    vis_data_records.append(
        visualization.VisualizationDataRecord(
            attributions,
            pred,
            pred_ind,
            label,
            "pos" if label == 1 else "neg",
            attributions.sum(),
            text,
            delta,
        )
    )

We have a few sentences for testing.

In [None]:
interpret_sentence(model, "It was a fantastic performance!", label=1)
interpret_sentence(model, "Best film ever", label=1)
interpret_sentence(model, "It was a horrible movie", label=0)
interpret_sentence(model, "It is a disgusting movie!", label=0)

In [None]:
print("Visualize attributions based on Integrated Gradients")
visualization.visualize_text(vis_data_records_ig)

## Part-of-Speech

Let's move on from text classification to token classification. Tokens can carry information such as a year, a name, a location, etc. Or we can try to analyze the syntax of a sentence by predicting the part of speech of each token. Let's solve the part-of-speech prediction problem with a recurrent neural network!

In [None]:
from torchtext.legacy.datasets import CoNLL2000Chunking
from torchtext.legacy import data

In [None]:
TEXT = data.Field(lower = True)
TAGS = data.Field(unk_token = None)
fields = (("text", TEXT), ("tags", TAGS))

In [None]:
train_dataset, val_dataset, test_dataset  = CoNLL2000Chunking.splits(fields)

In [None]:
TEXT.build_vocab(train_dataset,
                 vectors = "glove.6B.300d",
                 unk_init = torch.Tensor.normal_)
TAGS.build_vocab(train_dataset)

In [None]:
def collate_batch(batch, text_pad_id=None, tags_pad_id=None):
    """Pad a batch of tagged examples into two dense long tensors.

    Args:
        batch: Non-empty sequence of examples exposing ``text`` and ``tags``
            token lists.
        text_pad_id: Fill value for positions after the end of a text.
            Defaults to the module-level ``PAD_ID``.
        tags_pad_id: Fill value for positions after the end of a tag
            sequence. Defaults to the module-level ``PAD_ID``.

    Returns:
        ``(texts, token_types)``, both long tensors of shape
        ``(len(batch), longest_text)``.

    NOTE(review): ``PAD_ID`` is looked up in a different vocabulary than
    ``TEXT.vocab`` / ``TAGS.vocab``. Confirm it matches their padding
    indices, or pass the right ids explicitly and keep the loss's
    ``ignore_index`` in sync.
    """
    if text_pad_id is None:
        text_pad_id = PAD_ID
    if tags_pad_id is None:
        tags_pad_id = PAD_ID
    max_len = max(len(example.text) for example in batch)
    shape = (len(batch), max_len)
    # Allocate both padded matrices once, directly as integers, instead of
    # concatenating per-sample float rows and casting afterwards.
    texts = torch.full(shape, text_pad_id, dtype=torch.long)
    token_types = torch.full(shape, tags_pad_id, dtype=torch.long)
    for row, example in enumerate(batch):
        txt = torch.tensor(TEXT.vocab.lookup_indices(example.text))
        tt = torch.tensor(TAGS.vocab.lookup_indices(example.tags))
        texts[row, : txt.size(0)] = txt
        token_types[row, : tt.size(0)] = tt
    return texts, token_types

In [None]:
batch_size = 256

loaders = {
    "train": DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        collate_fn=collate_batch,
    ),
    # Validate on the held-out validation split, so the test split stays
    # untouched by model selection.
    "valid": DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        collate_fn=collate_batch,
    ),
}

Our POS model now consists of three components:
- Embeddings layer
- (multi or single layer) RNN
- Classifier for each token

RNN has three main architectures: simple RNN, LSTM and GRU. Choose one of them to solve our task.

In [None]:
class POSModel(nn.Module):
    """Per-token tagger: embedding lookup, two-layer LSTM, linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Dimensionality of each token embedding.
        hidden_size: Size of the LSTM hidden state.
        num_classes: Number of logits produced for every token.
        dropout_p: Dropout probability passed to the LSTM.
    """

    def __init__(
        self,
        vocab_size: int,
        embedding_size: int = 300,
        hidden_size: int = 150,
        num_classes: int = 2,
        dropout_p: float = 0.1,
    ):
        super().__init__()

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_size
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout_p,
        )
        self.clf = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Return logits for every position of ``input_ids``."""
        token_vectors = self.embedding(input_ids)
        # Only the per-step outputs are needed; the final state is discarded.
        hidden_states, _final_state = self.rnn(token_vectors)
        logits = self.clf(hidden_states)
        return logits

Create the model, optimizer and criterion. We want to predict the POS for each token. But some tokens, like `PAD`, have no POS property (we don't know their POS). That's why we will ignore them.

In [None]:
# One output class per entry of the tag vocabulary.
model = POSModel(len(TEXT.vocab), dropout_p=0.1, num_classes=len(TAGS.vocab))
optimizer = RAdam(model.parameters(), lr=1e-2)
# NOTE(review): PAD_ID was looked up in the review-classification vocabulary
# earlier in the notebook; confirm it is also the padding index of TAGS.vocab,
# otherwise a real tag class is excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID) # Ignore PAD token

In [None]:
model.embedding.weight.data.copy_(TEXT.vocab.vectors)

In [None]:
SupervisedRunner.handle_batch

In [None]:
from typing import Dict


class POSRunner(SupervisedRunner):
    """Runner that flattens per-token logits and tags for loss and metrics."""

    def handle_batch(self, batch: Dict[str, torch.Tensor]) -> None:
        """Run the model on ``batch["features"]`` and store flattened results.

        ``self.batch`` is replaced by a dict holding the input ids, the
        targets flattened to one dimension, and the logits reshaped to
        ``(-1, size of logits dimension 2)``.
        """
        tokens, tags = batch["features"], batch["targets"]
        per_token_logits = self.model(tokens)
        num_classes = per_token_logits.size(2)

        self.batch = {
            "input_ids": tokens,
            "targets": tags.reshape(-1),
            "logits": per_token_logits.reshape(-1, num_classes),
        }

In [None]:
logdir = Path("pos_logs") / datetime.now().strftime("%Y%m%d-%H%M%S")

In [None]:
from catalyst.dl import AccuracyCallback


# Train with the POS-specific runner: it flattens the per-token logits and
# tags, which the loss and the accuracy callback need. The plain
# SupervisedRunner created for the classification model does not do that.
runner = POSRunner()
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    callbacks=[
        AccuracyCallback(input_key="logits", target_key="targets"),
    ],
    loaders=loaders,
    verbose=True,
    num_epochs=10,
    logdir=logdir,
)