In [None]:
!pip -q install catalyst==20.12 transformers datasets nlpaug

# Seminar
Hi! Today we will build a simple pipeline for a sentiment analysis task. Our target dataset is IMDB, which contains movie reviews. We will try to solve the task with a transformer model.

In [None]:
from catalyst.utils import set_global_seed, get_device


# Fix the global random seed before any data loading or model creation.
set_global_seed(42)
# Device chosen by catalyst's helper; later cells pass it to the runner and
# move the embedding model and its inputs onto it.
device = get_device()

To work with the dataset we use [datasets](https://github.com/huggingface/datasets) by 🤗 `huggingface`. It can also work with custom datasets, but here the ready-made "IMDB" dataset will be downloaded:

In [None]:
from datasets import load_dataset


# Load the IMDB dataset; later cells read its "train" and "test" splits.
imdb_dataset = load_dataset("imdb")

Look at the dataset methods and features:

In [None]:
imdb_dataset

In [None]:
imdb_dataset["train"][0]

In [None]:
# Text of the first training review, used as the sample input for the
# tokenizer demonstrations in the following cells.
test = imdb_dataset["train"][0]["text"]

To tokenize texts, we will use a pretrained WordPiece tokenizer (the subword tokenizer used by BERT).

In [None]:
from transformers import BertTokenizer


# Tokenizer loaded from the same checkpoint id that the classifier is
# loaded from later in the notebook.
tokenizer = BertTokenizer.from_pretrained("google/bert_uncased_L-6_H-256_A-4")

Examples of text tokenization, encoding, etc:

In [None]:
print(tokenizer.tokenize(test))

In [None]:
print(tokenizer.encode(test))

Tokenizer has additional functions to create attention masks, get offsets mapping or token types to train transformer models.

In [None]:
print(tokenizer.encode_plus(test))

In [None]:
print(tokenizer.encode_plus(test, max_length=64, truncation=True, padding="max_length"))

Tokenizer can change return type.

In [None]:
print(tokenizer.encode_plus(test, max_length=64, truncation=True, padding="max_length", return_tensors="pt"))

Using what we have learned about the tokenizer, create the train and validation dataloaders.

In [None]:
import torch

from catalyst.utils import get_loader


def text_data_transforms(row):
    """Convert one dataset row into model inputs plus its target label.

    The tokenization step is an exercise placeholder and must be filled in
    before this function can run. The returned dict holds the tokenizer
    fields (first element of each value) and the row's label under the
    key "targets".
    """
    # TODO(exercise): replace the placeholder with a tokenizer call. The next
    # line takes `v[0]` of every value, so each value needs a leading
    # dimension (e.g. the `return_tensors="pt"` output demonstrated earlier).
    tokens = ... # Get tokens
    # Keep only the first element of every field returned by the tokenizer.
    tokens = {k: v[0] for k, v in tokens.items()}
    # Store the label under "targets" alongside the token fields.
    tokens.update({"targets": row["label"]})
    return tokens
    

# Training loader: rows are passed through unchanged by `open_fn`, tokenized by
# `text_data_transforms`, reshuffled every epoch, and the incomplete final
# batch is dropped.
train_dataloader = get_loader(
    imdb_dataset["train"],
    open_fn=lambda x: x,
    dict_transform=text_data_transforms,
    batch_size=256,
    num_workers=4,
    shuffle=True,
    drop_last=True,
)

# Validation loader: evaluate on every test review in a fixed order.
# Shuffling brings nothing at evaluation time, and drop_last=True would
# silently exclude the final partial batch from the reported metrics.
valid_dataloader = get_loader(
    imdb_dataset["test"],
    open_fn=lambda x: x,
    dict_transform=text_data_transforms,
    batch_size=256,
    num_workers=4,
    shuffle=False,
    drop_last=False,
)

In [None]:
# Loader mapping handed to `runner.train`: IMDB train split for fitting,
# IMDB test split for validation.
loaders = dict(train=train_dataloader, valid=valid_dataloader)

Load a BERT model for sequence classification. We need a model smaller than `bert-base-uncased`. The list of all available models: [model names](https://huggingface.co/models).

In [None]:
from transformers import BertForSequenceClassification


# Sequence classifier initialised from the same checkpoint id as the tokenizer.
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-6_H-256_A-4")

Usual train code:

In [None]:
from catalyst.contrib.nn import RAdam
from torch.nn import CrossEntropyLoss


# RAdam over all model parameters, with cross-entropy as the training loss.
optimizer = RAdam(model.parameters(), lr=2e-4)
criterion = CrossEntropyLoss()

In [None]:
from datetime import datetime
from pathlib import Path


# Per-run log directory: logs/<YYYYmmdd-HHMMSS>, stamped at cell execution time.
logdir = Path("logs", datetime.now().strftime("%Y%m%d-%H%M%S"))

In [None]:
from catalyst.dl import SupervisedRunner


class BertRunner(SupervisedRunner):
    """Supervised runner that feeds selected batch fields to the model as keyword arguments."""

    def _handle_batch(self, batch):
        """Run the model on one batch, storing the batch as input and the result as output."""
        # Pick only the fields named in `input_key` out of the batch dict.
        model_kwargs = {name: batch[name] for name in self.input_key}
        self.input = batch
        self.output = self.model(return_dict=True, **model_kwargs)


# `input_key` lists the batch fields that BertRunner forwards to the model.
runner = BertRunner(input_key=["input_ids", "attention_mask"], device=device)

In [None]:
from catalyst.dl import AccuracyCallback


# Baseline run: 3 epochs on the IMDB loaders with a two-class accuracy callback.
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    loaders=loaders,
    logdir=logdir,
    num_epochs=3,
    verbose=True,
    callbacks=[AccuracyCallback(num_classes=2)],
)

## Text Augmentation

To improve our sentiment analyser, we need more data. One way to get new samples is data augmentation methods. For text we can change characters, words or sentences. Our tool for text augmentation will be [nlpaug](https://github.com/makcedward/nlpaug) library. 

Our example is a little simple sentence:

In [None]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

# Short sample sentence reused by all augmentation demos below.
text = 'The quick brown fox jumps over the lazy dog .'
print(text)

Let's try to change characters at random. The probability of swapping two characters depends on their distance on a QWERTY keyboard.

In [None]:
# Character-level augmentation with the keyboard augmenter.
aug = nac.KeyboardAug()
augmented_text = aug.augment(text)
# Show the source sentence and its augmented version, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Another way to change a sentence is to replace a word with its synonym:

In [None]:
# Word-level augmentation: substitute words using the synonym augmenter.
aug = naw.SynonymAug(action="substitute")
augmented_text = aug.augment(text)
# Show the source sentence and its augmented version, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

More accurate way to swap words can be done by pretrained Language Model. We can work with BERT-like models. 

In [None]:
# Word substitution driven by a pretrained language model (bert-base-uncased).
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
augmented_text = aug.augment(text)
# Show the source sentence and its augmented version, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

In our seminar, we try to work with smaller version of BERT:

In [None]:
# Rebind `aug` to a contextual augmenter backed by a much smaller BERT
# checkpoint; this is the augmenter available to the transform defined below.
aug = naw.ContextualWordEmbsAug(model_path='google/bert_uncased_L-2_H-128_A-2', action="substitute")

Create a new `dict_transform` function. It needs to take the text and augment it.

In [None]:
def aug_text_data_transforms(row):
    """Augment one row's text, tokenize it, and attach the target label.

    Both the augmentation and the tokenization steps are exercise
    placeholders and must be filled in before this function can run.
    """
    # Because this augmentation is pretty slow,
    # we need to truncate the working text.
    # It's better to generate examples offline
    # and then fit a model on the bigger dataset.
    # TODO(exercise): produce the augmented text from the row here.
    sentence = ... # get augmented text
    # TODO(exercise): tokenize `sentence`; the next line takes `v[0]` of every
    # value, so each value needs a leading dimension.
    tokens = ... # get tokens
    tokens = {k: v[0] for k, v in tokens.items()}
    # Store the label under "targets" alongside the token fields.
    tokens.update({"targets": row["label"]})
    return tokens


# Training loader over the IMDB train split that applies the augmenting
# transform to every row as it is loaded.
# NOTE(review): num_workers=32 is far above the 4 used by the other loaders —
# presumably to offset the slow augmentation; confirm it suits the machine.
aug_train_dataloader = get_loader(
    imdb_dataset["train"],
    open_fn=lambda x: x,
    dict_transform=aug_text_data_transforms,
    batch_size=256,
    num_workers=32,
    shuffle=True,
    drop_last=True,
)

In [None]:
# Augmented data for training; validation reuses the IMDB validation loader.
aug_loaders = dict(train=aug_train_dataloader, valid=valid_dataloader)

In [None]:
# Reload the pretrained checkpoint and create a new optimizer and loss, so the
# augmented run does not continue from the previously trained weights.
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-6_H-256_A-4")
optimizer = RAdam(model.parameters(), lr=2e-4)
criterion = CrossEntropyLoss()

In [None]:
# New timestamped log directory and a new runner for the augmented-data run.
logdir=Path("logs") / datetime.now().strftime("%Y%m%d-%H%M%S")
runner = BertRunner(input_key=["input_ids", "attention_mask"], device=device)
# Train for 3 epochs on the augmented train loader, validating on IMDB test.
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    loaders=aug_loaders,
    logdir=logdir,
    num_epochs=3,
    verbose=True,
    callbacks=[AccuracyCallback(num_classes=2)],
)

**Note** 

The best method of text data augmentation is Back Translation. But it requires trained translation models from one language to another. `nlpaug` uses huge transformer models (~10 GB) for this, which is why I don't use them in the seminar.

## Domain adaptation

Instead of adding new examples by Data Augmentation, we can add new example from similar task. The Sentiment Analysis task has several datasets, like SST-2, YELP, AMAZON-Review. To increase the model performance, try to add some samples from SST-2 dataset:

In [None]:
# Load only the first 10% of the SST-2 (GLUE) training split.
sst_dataset = load_dataset("glue", "sst2", split="train[:10%]")

In [None]:
sst_dataset[0]

Prepare text and train a model:

In [None]:
def sst_text_data_transforms(row):
    """Convert one SST-2 row into model inputs plus its target label.

    The tokenization step is an exercise placeholder and must be filled in
    before this function can run.
    """
    # TODO(exercise): tokenize the row's text here. Inspect `sst_dataset[0]`
    # for the name of the text field — it may differ from IMDB's "text".
    # The next line takes `v[0]` of every value, so each value needs a
    # leading dimension.
    tokens = ... # get tokens
    tokens = {k: v[0] for k, v in tokens.items()}
    # Store the label under "targets" alongside the token fields.
    tokens.update({"targets": row["label"]})
    return tokens

# Training loader over the SST-2 subset, tokenized by the SST-2 transform.
sst_train_dataloader = get_loader(
    sst_dataset,
    open_fn=lambda x: x,
    dict_transform=sst_text_data_transforms,
    batch_size=256,
    num_workers=4,
    shuffle=True,
    drop_last=True,
)


# Only a "train" loader is provided for this stage; there is no separate
# validation loader.
sst_loaders = {
    "train": sst_train_dataloader,
}

In [None]:
# Reload the pretrained checkpoint with a new optimizer and loss for the
# SST-2 -> IMDB experiment.
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-6_H-256_A-4")
optimizer = RAdam(model.parameters(), lr=2e-4)
criterion = CrossEntropyLoss()

In [None]:
# New timestamped log directory and a new runner for the SST-2 stage.
logdir=Path("logs") / datetime.now().strftime("%Y%m%d-%H%M%S")
runner = BertRunner(input_key=["input_ids", "attention_mask"], device=device)
# First stage: train for 3 epochs on the SST-2 loader only.
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    loaders=sst_loaders,
    logdir=logdir,
    num_epochs=3,
    verbose=True,
    callbacks=[AccuracyCallback(num_classes=2)],
)

Retrain our model on target dataset (IMDB):

In [None]:
# Second stage: continue training the same model and optimizer on the target
# IMDB loaders. Log into a dedicated sub-directory so this stage does not
# write into the same directory as the preceding SST-2 run, which also used
# `logdir`.
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    loaders=loaders,
    logdir=logdir / "imdb",
    num_epochs=3,
    verbose=True,
    callbacks=[AccuracyCallback(num_classes=2)],
)

Training the model on randomly chosen samples has not led to better performance. So we need to find useful examples. To do this, we will use a pretrained BERT model to get a vector representation of each sample. We will compare vectors from the source dataset (SST-2) with vectors from the target dataset (IMDB) using cosine similarity. Finally, we will select the source examples that have the highest similarity.

In [None]:
# Load the full SST-2 training split, replacing the 10% subset loaded earlier.
sst_dataset = load_dataset("glue", "sst2", split="train")

In [None]:
from transformers import BertModel


# Bare `BertModel` used to embed texts, moved to `device`. Note that this
# rebinds the name `model`, which previously held the classifier.
model = BertModel.from_pretrained("google/bert_uncased_L-2_H-256_A-4").to(device)

In [None]:
# numpy is imported here because this cell is the first to use `np`; the
# notebook's own `import numpy as np` only appears in a later cell, so a
# fresh Restart & Run All would otherwise fail with NameError.
import numpy as np
from tqdm.notebook import tqdm


imdb_vectors = []

# Embed every IMDB training review, one row at a time, without tracking gradients.
with torch.no_grad():
    for row in tqdm(imdb_dataset["train"]):
        row = text_data_transforms(row)
        # unsqueeze(0) adds a leading dimension of size 1 to each field;
        # [0][0, 0] then takes the first model output, first item, first
        # position (presumably the [CLS] token's hidden state — confirm).
        vector = model(
            input_ids=row["input_ids"].unsqueeze(0).to(device),
            attention_mask=row["attention_mask"].unsqueeze(0).to(device),
        )[0][0, 0].cpu().numpy()
        imdb_vectors.append(vector)

# Stack the per-review vectors into a single array.
imdb_vectors = np.array(imdb_vectors)

In [None]:
import numpy as np


# TODO(exercise): replace the placeholder with the normalized `imdb_vectors`.
# Presumably each row should be scaled to unit length so that dot products
# with a unit-length query vector give cosine similarity — confirm.
imdb_vectors_norm = ... # normalize vectors

In [None]:
# One similarity score per SST-2 row is meant to be collected here.
sst_scores = []

with torch.no_grad():
    for row in tqdm(sst_dataset):
        row = sst_text_data_transforms(row)
        # Same embedding recipe as for the IMDB vectors: first model output,
        # first item, first position.
        vector = model(
            input_ids=row["input_ids"].unsqueeze(0).to(device), 
            attention_mask=row["attention_mask"].unsqueeze(0).to(device)
        )[0][0, 0].cpu().numpy()
        # TODO(exercise): until the placeholder below is replaced, nothing is
        # appended and `sst_scores` stays empty.
        ... # calculate cosine metric and append it to sst_scores

Look at the scores distribution.

In [None]:
import matplotlib.pyplot as plt


# Histogram of the similarity scores collected for the SST-2 rows.
plt.hist(sst_scores)

Our values are located in the interval from `0.3` to `0.8`. Choose threshold value to filter SST-2 samples:

In [None]:
# TODO(exercise): replace the placeholder with a numeric threshold; the
# comparison below cannot work while `thr` is still the Ellipsis object.
thr = ... # set threshold

# Positions of the SST-2 rows whose score is strictly above the threshold.
indeces = [i for i, value in enumerate(sst_scores) if value > thr]

Repeat the model training procedure.

In [None]:
# Training loader restricted to the SST-2 rows selected by the threshold filter.
sst_train_dataloader = get_loader(
    sst_dataset.select(indeces),
    open_fn=lambda x: x,
    dict_transform=sst_text_data_transforms,
    batch_size=256,
    num_workers=4,
    shuffle=True,
    drop_last=True,
)


# Only a "train" loader is provided for this stage; there is no separate
# validation loader.
sst_loaders = {
    "train": sst_train_dataloader,
}

In [None]:
# Rebind `model` (which held the embedding encoder) to a newly loaded
# classifier, with a new optimizer and loss for the filtered-SST-2 experiment.
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-6_H-256_A-4")
optimizer = RAdam(model.parameters(), lr=2e-4)
criterion = CrossEntropyLoss()

In [None]:
# New timestamped log directory and a new runner for the filtered SST-2 stage.
logdir=Path("logs") / datetime.now().strftime("%Y%m%d-%H%M%S")
runner = BertRunner(input_key=["input_ids", "attention_mask"], device=device)
# First stage: train for 3 epochs on the selected SST-2 rows only.
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    loaders=sst_loaders,
    logdir=logdir,
    num_epochs=3,
    verbose=True,
    callbacks=[AccuracyCallback(num_classes=2)],
)

In [None]:
# Second stage: continue training the same model and optimizer on the target
# IMDB loaders. Log into a dedicated sub-directory so this stage does not
# write into the same directory as the preceding SST-2 run, which also used
# `logdir`.
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    loaders=loaders,
    logdir=logdir / "imdb",
    num_epochs=3,
    verbose=True,
    callbacks=[AccuracyCallback(num_classes=2)],
)