# 6. Классификация текстов при помощи сверточных сетей

__Автор__: Никита Владимирович Блохин (NVBlokhin@fa.ru)

Финансовый университет, 2020 г.

In [748]:
import re
import typing as t
from collections import defaultdict
from functools import lru_cache
from pathlib import Path

import nltk
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from nltk.corpus import stopwords, wordnet
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader, Subset, random_split


In [539]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [540]:
DATA_DIR = Path("data/")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {DEVICE.upper()} device")

Using CPU device


In [760]:
def on_cuda(device: str) -> bool:
    return device == "cuda"


def common_train(
        model: nn.Module,
        loss_fn: nn.Module,
        optimizer: optim.Optimizer,
        train_dataloader: DataLoader,
        epochs: int,
        verbose: int = 100,
        test_dataloader: DataLoader = None,
        device: str = "cpu",
) -> t.List[float]:
    train_losses = []
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}\n" + "-" * 32)
        train_loss = train_loop(
            train_dataloader,
            model,
            loss_fn,
            optimizer,
            verbose=verbose,
            device=device,
        )
        train_losses.append(train_loss.item())
        if test_dataloader:
            test_loop(test_dataloader, model, loss_fn, device=device)
        torch.cuda.empty_cache()
    return train_losses


def train_loop(
        dataloader: DataLoader,
        model: nn.Module,
        loss_fn: nn.Module,
        optimizer: optim.Optimizer,
        verbose: int = 100,
        device: str = "cpu",
) -> torch.Tensor:
    model.train()

    size = len(dataloader.dataset)  # noqa
    num_batches = len(dataloader)
    avg_loss = 0

    for batch, (x, y) in enumerate(dataloader):
        x, y = x.to(device), y.to(device)

        pred = model(x)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        avg_loss += loss
        if batch % verbose == 0:
            print(f"loss: {loss:>7f}  [{batch * len(x):>5d}/{size:>5d}]")

        del x, y, pred, loss
        torch.cuda.empty_cache()

    return avg_loss / num_batches


def test_loop(
        dataloader: DataLoader,
        model: nn.Module,
        loss_fn: nn.Module,
        transform: t.Callable = None,
        device: str = "cpu",
) -> t.Tuple[torch.Tensor, torch.Tensor]:
    model.eval()

    size = len(dataloader.dataset)  # noqa
    num_batches = len(dataloader)
    avg_loss, correct = 0, 0

    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        pred = model(x)
        avg_loss += loss_fn(pred, y)
        correct += (pred.argmax(1) == y).type(torch.float).sum().item()  # noqa

        del x, y, pred
        torch.cuda.empty_cache()

    avg_loss /= num_batches
    accuracy = correct / size
    print(f"Test Error: \n Accuracy: {accuracy:>4f}, Avg loss: {avg_loss:>8f} \n")

    return avg_loss, accuracy


def train_test_split(dataset: t.Union[Dataset, t.Sized], train_part: float) -> t.Tuple[Subset, Subset]:
    train_size = round(train_part * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, lengths=(train_size, test_size))
    return train_dataset, test_dataset

## 1. Представление и предобработка текстовых данных в виде последовательностей

1.1 Представьте первое предложение из строки `text` как последовательность из индексов слов, входящих в это предложение

In [542]:
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

In [543]:
text = text.lower()

alphabet = list(set(nltk.word_tokenize(text.replace(".", ""))))
word2index = {w: i for i, w in enumerate(alphabet)}

first_sentence = nltk.sent_tokenize(text)[0].replace(".", "")
[word2index[w] for w in nltk.word_tokenize(first_sentence)]

[2, 7, 14, 19, 17, 22, 20, 24]

1.2 Представьте первое предложение из строки `text` как последовательность векторов, соответствующих индексам слов. Для представления индекса в виде вектора используйте унитарное кодирование. В результате должен получиться двумерный тензор размера `количество слов в предложении` x `количество уникальных слов`

In [544]:
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

In [545]:
text = text.lower()

alphabet = list(set(nltk.word_tokenize(text.replace(".", ""))))
word2index = {w: i for i, w in enumerate(alphabet)}

first_sentence = nltk.sent_tokenize(text)[0].replace(".", "")
words = nltk.word_tokenize(first_sentence)

vectors = torch.zeros(len(words), len(alphabet))
indices = [(i, word2index[w]) for i, w in enumerate(words)]
vectors[list(zip(*indices))] = 1
vectors

tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 1.]])

1.3 Решите задачу 1.2, используя модуль `nn.Embedding`

In [567]:
torch.manual_seed(0)

# попадание в размерность - уже успех
embeds = nn.Embedding(num_embeddings=len(alphabet), embedding_dim=len(alphabet))
indices = torch.tensor([word2index[w] for w in nltk.word_tokenize(first_sentence)])
embeds(indices)

tensor([[ 1.0554,  0.1778, -0.2303, -0.3918,  0.5433, -0.3952, -0.4462,  0.7440,
          1.5210,  3.4105, -1.5312, -1.2341,  1.8197, -0.5515, -0.5692,  0.9200,
          1.1108,  1.2899, -1.4782,  2.5672, -0.4731,  0.3356, -1.6293, -0.5497,
         -0.4798],
        [-1.3962, -0.0661, -0.3584, -1.5616, -0.3546,  1.0811,  0.1315,  1.5735,
          0.7814, -1.0787, -0.7209,  1.4708,  0.2756,  0.6668, -0.9944, -1.1894,
         -1.1959, -0.5596,  0.5335,  0.4069,  0.3946,  0.1715,  0.8760, -0.2871,
          1.0216],
        [ 1.9595, -1.1038,  0.5411,  1.5390,  1.0860,  1.2464,  0.1151,  1.6193,
          0.4637,  1.3007,  0.8732,  0.0651,  0.7732, -0.9701, -0.8877, -0.3183,
         -0.3344,  0.4543,  0.4990,  0.8780,  0.3894,  1.4625,  0.4795, -0.5334,
         -0.0347],
        [ 0.0358,  0.2160, -0.9161,  1.5599, -3.1537, -0.5611, -0.4303, -0.3332,
         -1.5464, -0.0147,  1.2251,  1.5936, -1.6315, -0.0569,  0.6297,  0.2712,
         -0.6860, -1.0918,  1.6797, -0.8808,  0.5800

## 2. Классификация фамилий по национальности (ConvNet)

Датасет: https://disk.yandex.ru/d/owHew8hzPc7X9Q?w=1

2.1 Считать файл `surnames/surnames.csv`.

In [547]:
surnames_df = pd.read_csv(DATA_DIR / "surnames.csv")
surnames_df.head()

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


2.2 Закодировать национальности числами, начиная с 0.

In [548]:
surnames_labeler = LabelEncoder()
surnames_df["target"] = surnames_labeler.fit_transform(surnames_df["nationality"])
print(f"classes: {len(surnames_labeler.classes_)}")
surnames_df.head()

classes: 18


Unnamed: 0,surname,nationality,target
0,Woodford,English,4
1,Coté,French,5
2,Kore,English,4
3,Koury,Arabic,0
4,Lebzak,Russian,14


2.4 Реализовать класс `Vocab` (токен = __символ__)
  * добавьте в словарь специальный токен `<PAD>` с индексом 0
  * при создании словаря сохраните длину самой длинной последовательности из набора данных в виде атрибута `max_seq_len`


In [615]:
class Vocab:
    pad = "<PAD>"

    def __init__(self, series: pd.Series):
        uniques = set()
        max_len = 0
        for w in map(str.lower, series):
            uniques.update(w)
            max_len = max(len(w), max_len)

        self.alphabet = [self.pad, *uniques]
        self.max_len = max_len
        self.ch2i = {ch: i for i, ch in enumerate(self.alphabet)}

    def encode(self, word: str) -> torch.Tensor:
        indices = [self.ch2i[ch] for ch in word]
        indices += [self.ch2i[self.pad]] * (self.max_len - len(indices))
        return torch.tensor(indices, dtype=torch.long)

    def decode(self, indices: torch.Tensor) -> str:
        pad_indices = torch.nonzero(indices == self.ch2i[self.pad], as_tuple=True)[0]
        if len(pad_indices):
            indices = indices[:pad_indices[0]]
        return "".join(self.alphabet[i] for i in indices)


In [616]:
vocab = Vocab(surnames_df["surname"])
indices = vocab.encode("kovalev")
print(indices, vocab.decode(indices))

tensor([17, 46, 54,  7, 50, 19, 54,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]) kovalev


2.5 Реализовать класс `SurnamesDataset`
  * метод `__getitem__` возвращает пару: <последовательность индексов токенов (см. 1.1 ), номер класса>
  * длина каждой такой последовательности должна быть одинаковой и равной `vocab.max_seq_len`. Чтобы добиться этого, дополните последовательность справа индексом токена `<PAD>` до нужной длины


In [583]:
class SurnamesDataset(Dataset):

    def __init__(self, df: pd.DataFrame, vocab: Vocab, transform: t.Callable = None):
        self.surnames = df["surname"].tolist()

        if transform:
            size = transform(self.surnames[0]).size()
            self.data = torch.vstack([transform(w) for w in self.surnames]).view(len(self.surnames), *size)
        else:
            self.data = self.surnames
        self.targets = torch.tensor(df["target"], dtype=torch.long)

        self.vocab = vocab
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

In [587]:
def to_indices(word: str) -> torch.Tensor:
    return vocab.encode(word.lower())


def one_hot(word: str) -> torch.Tensor:
    vectors = torch.zeros(vocab.max_len, len(vocab.alphabet))
    indices = [(i, vocab.ch2i[ch]) for i, ch in enumerate(word.lower())]
    vectors[list(zip(*indices))] = 1
    return vectors


surnames_indices_dataset = SurnamesDataset(surnames_df, vocab, transform=to_indices)
surnames_one_hot_dataset = SurnamesDataset(surnames_df, vocab, transform=one_hot)
surnames_indices_dataset[0], surnames_one_hot_dataset[0]

((tensor([ 4, 46, 46, 43, 48, 46, 51, 43,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
  tensor(4)),
 (tensor([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0.],
          [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
           0., 0.],
          [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
           0., 0.],
          [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0

2.3 Разбить датасет на обучающую и тестовую выборку

In [553]:
torch.manual_seed(0)

train_indices_dataset, test_indices_dataset = train_test_split(surnames_indices_dataset, train_part=0.8)
train_one_hot_dataset, test_one_hot_dataset = train_test_split(surnames_one_hot_dataset, train_part=0.8)
print(len(train_indices_dataset), len(test_indices_dataset))

8784 2196


2.6. Обучить классификатор.

  * Для преобразования последовательности индексов в последовательность векторов используйте `nn.Embedding`. Рассмотрите два варианта:
    - когда токен представляется в виде унитарного вектора и модуль `nn.Embedding` не обучается
    - когда токен представляется в виде вектора небольшой размерности (меньше, чем размер словаря) и модуль `nn.Embedding` обучается

  * Используйте одномерные свертки и пулинг (`nn.Conv1d`, `nn.MaxPool1d`)
    - обратите внимание, что `nn.Conv1d` ожидает на вход трехмерный тензор размерности `(batch, embedding_dim, seq_len)`


In [639]:
class SurnamesClassifier(nn.Module):

    def __init__(
            self,
            vocab: Vocab, out_features: int,
            embedding_dim: int = 128,
            use_embedding: bool = True,
            debug: bool = False,
    ):
        super(SurnamesClassifier, self).__init__()
        self.use_embedding = use_embedding
        self.debug = debug

        self.embedding_dim = embedding_dim

        last_conv_out_channels = 64
        adaptive_avg_pool = 8

        # как же этой модели все это... безразлично
        self.embedding = nn.Embedding(num_embeddings=len(vocab.alphabet), embedding_dim=embedding_dim)
        self.features = nn.Sequential(
            nn.Conv1d(in_channels=embedding_dim, out_channels=64, kernel_size=3),
            nn.BatchNorm1d(num_features=64),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
            nn.Conv1d(in_channels=64, out_channels=last_conv_out_channels, kernel_size=3),
            nn.BatchNorm1d(num_features=last_conv_out_channels),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
        )

        self.avgpool = nn.AdaptiveAvgPool1d(adaptive_avg_pool)
        self.classifier = nn.Sequential(
            nn.Linear(last_conv_out_channels * adaptive_avg_pool, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, out_features),
        )

        if self.debug:
            self.forward = self._debug_forward
        else:
            self.forward = self._forward

    def _forward(self, x: torch.Tensor):
        if self.use_embedding:
            x = self.embedding(x)
        else:
            x = F.pad(x, (0, self.embedding_dim - x.size(2), 0, 0), value=0)

        x = x.reshape(x.size(0), x.size(2), x.size(1))
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return torch.log_softmax(x, dim=1)

    def _debug_forward(self, x: torch.Tensor):
        print("x: ", x.size())
        if self.use_embedding:
            x = self.embedding(x)
            print("embedding: ", x.size())
        else:
            x = F.pad(x, (0, self.embedding_dim - x.size(2), 0, 0), value=0)
            print("pad: ", x.size())

        x = x.reshape(x.size(0), x.size(2), x.size(1))
        print("reshape: ", x.size())
        x = self.features(x)
        print("features: ", x.size())
        x = self.avgpool(x)
        print("avgpool: ", x.size())
        x = torch.flatten(x, 1)
        print("flatten: ", x.size())
        x = self.classifier(x)
        print("classifier: ", x.size())
        return torch.log_softmax(x, dim=1)


In [555]:
torch.manual_seed(0)

common_net = SurnamesClassifier(vocab, len(surnames_labeler.classes_)).to(DEVICE)
loss_fn = nn.NLLLoss()
optimizer = optim.Adam(common_net.parameters(), lr=0.001)

In [556]:
common_net.use_embedding = False
common_train(
    epochs=10,
    model=common_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=DataLoader(train_one_hot_dataset, batch_size=8, shuffle=True),
    test_dataloader=DataLoader(test_one_hot_dataset, batch_size=512),
    verbose=500,
    device=DEVICE,
);

Epoch 1
--------------------------------
loss: 3.059291  [    0/ 8784]
loss: 2.116892  [ 4000/ 8784]
loss: 1.438677  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.556011, Avg loss: 1.581033 

Epoch 2
--------------------------------
loss: 1.255365  [    0/ 8784]
loss: 1.509195  [ 4000/ 8784]
loss: 1.438071  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.588798, Avg loss: 1.398945 

Epoch 3
--------------------------------
loss: 1.540711  [    0/ 8784]
loss: 1.001740  [ 4000/ 8784]
loss: 2.063588  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.602004, Avg loss: 1.368793 

Epoch 4
--------------------------------
loss: 0.989875  [    0/ 8784]
loss: 1.222487  [ 4000/ 8784]
loss: 0.826472  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.615209, Avg loss: 1.331192 

Epoch 5
--------------------------------
loss: 1.626660  [    0/ 8784]
loss: 0.596323  [ 4000/ 8784]
loss: 1.152216  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.628415, Avg loss: 1.319504 

Epoch 6
--------------------------------
loss: 0.927089  [    0/ 8784]

In [557]:
torch.manual_seed(0)

embeddings_net = SurnamesClassifier(vocab, len(surnames_labeler.classes_)).to(DEVICE)
loss_fn = nn.NLLLoss()
optimizer = optim.Adam(embeddings_net.parameters(), lr=0.001)

In [558]:
embeddings_net.use_embedding = True
common_train(
    epochs=15,
    model=embeddings_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=DataLoader(train_indices_dataset, batch_size=8, shuffle=True),
    test_dataloader=DataLoader(test_indices_dataset, batch_size=512),
    verbose=500,
    device=DEVICE,
);

Epoch 1
--------------------------------
loss: 2.848648  [    0/ 8784]
loss: 1.548867  [ 4000/ 8784]
loss: 1.213963  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.563297, Avg loss: 1.592500 

Epoch 2
--------------------------------
loss: 1.657608  [    0/ 8784]
loss: 1.978383  [ 4000/ 8784]
loss: 1.008881  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.610200, Avg loss: 1.386550 

Epoch 3
--------------------------------
loss: 2.417265  [    0/ 8784]
loss: 1.959956  [ 4000/ 8784]
loss: 0.906961  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.643898, Avg loss: 1.269374 

Epoch 4
--------------------------------
loss: 0.755406  [    0/ 8784]
loss: 0.984007  [ 4000/ 8784]
loss: 1.120149  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.666667, Avg loss: 1.172726 

Epoch 5
--------------------------------
loss: 0.749230  [    0/ 8784]
loss: 1.461187  [ 4000/ 8784]
loss: 0.636841  [ 8000/ 8784]
Test Error: 
 Accuracy: 0.695355, Avg loss: 1.096197 

Epoch 6
--------------------------------
loss: 0.772892  [    0/ 8784]

2.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: прогнать несколько фамилий студентов группы через модели и проверить результат. Для каждой фамилии выводить 3 наиболее вероятных предсказания.

In [562]:
def inference(
        surname: str,
        target: str,
        model: nn.Module,
        vocab: Vocab,
        labeler: LabelEncoder,
        k: int = 3,
        device: str = "cpu",
):
    x = vocab.encode(surname.lower())
    x = x.to(device)

    pred = model(x.unsqueeze(0))
    pred_proba, pred_label_indices = F.softmax(pred, 1).topk(k, dim=1)
    pred_labels = labeler.inverse_transform(pred_label_indices.squeeze().cpu())

    predicts = ", ".join(
        [f"{label} ({prob:.2f})" for (label, prob) in zip(pred_labels, pred_proba.squeeze())]
    )
    print(f"Surname : {surname}")
    print(f"True    : {target}")
    print(f"Predicts: {predicts}\n")

In [563]:
students = [
    "Alexandrova",
    "Baranov",
    "Brusova",
    "Volkova",
    "Kovalev",
    "Kostyuchenko",
    "Kuzin",
    "Likhachev",
    "Telitsyn",
    "Ustimova",
    "Khamikoeva",
]
for surname in students:
    inference(
        surname=surname,
        target="Russian",
        model=embeddings_net,
        vocab=vocab,
        labeler=surnames_labeler,
        device=DEVICE,
    )

Surname : Alexandrova
True    : Russian
Predicts: Russian (1.00), Greek (0.00), Czech (0.00)

Surname : Baranov
True    : Russian
Predicts: Russian (1.00), Czech (0.00), English (0.00)

Surname : Brusova
True    : Russian
Predicts: Czech (0.56), Spanish (0.17), Italian (0.08)

Surname : Volkova
True    : Russian
Predicts: Russian (0.98), Czech (0.02), Polish (0.00)

Surname : Kovalev
True    : Russian
Predicts: Russian (0.99), Czech (0.01), Polish (0.00)

Surname : Kostyuchenko
True    : Russian
Predicts: Russian (1.00), English (0.00), German (0.00)

Surname : Kuzin
True    : Russian
Predicts: Russian (1.00), Czech (0.00), Irish (0.00)

Surname : Likhachev
True    : Russian
Predicts: Russian (1.00), Czech (0.00), English (0.00)

Surname : Telitsyn
True    : Russian
Predicts: English (0.96), German (0.01), Irish (0.01)

Surname : Ustimova
True    : Russian
Predicts: Japanese (0.61), Russian (0.39), Czech (0.00)

Surname : Khamikoeva
True    : Russian
Predicts: Russian (1.00), Czech (0.

## 3. Классификация обзоров на фильмы (ConvNet)

Датасет: https://disk.yandex.ru/d/tdinpb0nN_Dsrg

2.1 Создайте набор данных на основе файлов polarity/positive_reviews.csv (положительные отзывы) и polarity/negative_reviews.csv (отрицательные отзывы). Разбейте на обучающую и тестовую выборку.
  * токен = __слово__
  * данные для обучения в датасете представляются в виде последовательности индексов токенов
  * словарь создается на основе _только_ обучающей выборки. Для корректной обработки ситуаций, когда в тестовой выборке встретится токен, который не хранится в словаре, добавьте в словарь специальный токен `<UNK>`
  * добавьте предобработку текста

In [835]:
def get_pos(word: str) -> str:
    tag = nltk.pos_tag([word])[0][1]
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


STOPWORDS = set(stopwords.words("english"))


def preprocess_review(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^a-z]", repl=" ", string=text, flags=re.MULTILINE)

    lemmatizer = nltk.WordNetLemmatizer()
    words = []
    for word in nltk.word_tokenize(text):
        if word not in STOPWORDS:
            lemma = lemmatizer.lemmatize(word, pos=get_pos(word))
            if lemma not in STOPWORDS and len(lemma) > 1:
                words.append(lemma)

    return " ".join(words)

In [836]:
class ReviewsDataset(Dataset):

    def __init__(self, positive_path: Path, negative_path: Path, seed: int = None):
        self.positive_path = positive_path
        self.negative_path = negative_path
        self.positive_reviews = self.read_reviews(positive_path, preprocess_review)
        self.negative_reviews = self.read_reviews(negative_path, preprocess_review)

        data = self.positive_reviews + self.negative_reviews
        targets = torch.cat([torch.ones(len(self.positive_reviews)), torch.zeros(len(self.negative_reviews))])

        if seed is not None:
            torch.manual_seed(seed)
        indices = torch.randperm(len(data))

        self.data = [data[i] for i in indices]
        self.targets = targets[indices].to(torch.long)

    @staticmethod
    def read_reviews(path: Path, process: t.Callable[[str], str]) -> list[str]:
        reviews = []
        with open(path) as f:
            for review in f.readlines():
                review = process(review)
                if review:
                    reviews.append(review)
        return reviews

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index], self.targets[index]


In [837]:
reviews_path = DATA_DIR / "polarity"
reviews_dataset = ReviewsDataset(
    reviews_path / "positive_reviews.txt",
    reviews_path / "negative_reviews.txt",
    seed=0,
)
len(reviews_dataset), reviews_dataset[0]

(10661,
 ('ludicrous director carl franklin add enough flourish freak make entertain',
  tensor(0)))

In [838]:
torch.manual_seed(0)

train_reviews, test_reviews = train_test_split(reviews_dataset, train_part=0.9)
len(train_reviews), len(test_reviews)

(9595, 1066)

In [839]:
class ReviewsVocab:
    pad = "<PAD>"
    unknown = "<UNK>"

    def __init__(self, reviews: t.List[str]):
        uniques = set()
        max_len = 0
        for review in reviews:
            words = nltk.word_tokenize(review)
            uniques.update(words)
            max_len = max(len(words), max_len)

        self.alphabet = [self.pad, self.unknown, *uniques]
        self.max_len = max_len

        w2i = {w: i for i, w in enumerate(self.alphabet)}
        self.w2i = defaultdict(lambda: 1, w2i)

    def __len__(self):
        return len(self.alphabet)

    @lru_cache(maxsize=1024)
    def encode(self, review: str) -> torch.Tensor:
        indices = [self.w2i[w] for w in nltk.word_tokenize(review)]
        indices += [self.w2i[self.pad]] * (self.max_len - len(indices))
        return torch.tensor(indices, dtype=torch.long)

    def decode(self, indices: torch.Tensor) -> str:
        pad_indices = torch.nonzero(indices == self.w2i[self.pad], as_tuple=True)[0]
        if len(pad_indices):
            indices = indices[:pad_indices[0]]
        return " ".join(self.alphabet[i] for i in indices)


In [840]:
vocab = ReviewsVocab([review for review, _ in train_reviews])
print(f"alphabet: {len(vocab)}", f"longest: {vocab.max_len}")
encoded = vocab.encode("this is a neutral review")
encoded, vocab.decode(encoded)

alphabet: 14097 longest: 38


(tensor([    1,     1,     1, 13041,  4541,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0]),
 '<UNK> <UNK> <UNK> neutral review')

2.2. Обучите классификатор.

  * Для преобразования последовательности индексов в последовательность векторов используйте `nn.Embedding`
    - подберите адекватную размерность вектора эмбеддинга:
    - модуль `nn.Embedding` обучается

  * Используйте одномерные свертки и пулинг (`nn.Conv1d`, `nn.MaxPool1d`)
    - обратите внимание, что `nn.Conv1d` ожидает на вход трехмерный тензор размерности `(batch, embedding_dim, seq_len)`


In [846]:
class ReviewsClassifier(nn.Module):
    LAST_CONV_OUT_CHANNELS = 64
    ADAPTIVE_AVG_POOL = 8

    def __init__(self, num_embeddings: int, embedding_dim: int):
        super(ReviewsClassifier, self).__init__()

        # как же этой модели все это... безразлично
        self.embedding = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
        self.features = nn.Sequential(
            nn.Conv1d(in_channels=embedding_dim, out_channels=64, kernel_size=3),
            nn.BatchNorm1d(num_features=64),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
            nn.Conv1d(in_channels=64, out_channels=self.LAST_CONV_OUT_CHANNELS, kernel_size=3),
            nn.BatchNorm1d(num_features=self.LAST_CONV_OUT_CHANNELS),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
        )

        self.avgpool = nn.AdaptiveAvgPool1d(self.ADAPTIVE_AVG_POOL)
        self.classifier = nn.Sequential(
            nn.Linear(self.LAST_CONV_OUT_CHANNELS * self.ADAPTIVE_AVG_POOL, 256),
            nn.ReLU(),
            # nn.Dropout(),
            nn.Linear(256, 2),
        )

    def forward(self, x: torch.Tensor):
        x = self.embedding(x)
        x = x.reshape(x.size(0), x.size(2), x.size(1))
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x


def collate_fn(batch: t.List[t.Tuple[str, torch.Tensor]]) -> t.Tuple[torch.Tensor, torch.Tensor]:
    xs, ys = [], []
    for x, y in batch:
        xs.append(vocab.encode(x))
        ys.append(y)
    return torch.vstack(xs), torch.hstack(ys)

In [847]:
torch.manual_seed(0)

net = ReviewsClassifier(num_embeddings=len(vocab), embedding_dim=128).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.000914092001)  # а почему бы и нет?

In [848]:
common_train(
    epochs=20,
    model=net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    # 21 > Embedding, Conv1d, BatchNorm1d, ReLU, MaxPool1d, AdaptiveAvgPool1d, Dropout
    train_dataloader=DataLoader(train_reviews, batch_size=21, collate_fn=collate_fn, shuffle=True),
    test_dataloader=DataLoader(test_reviews, batch_size=512, collate_fn=collate_fn),
    verbose=150,
    device=DEVICE,
);

Epoch 1
--------------------------------
loss: 0.697946  [    0/ 9595]
loss: 0.690526  [ 3150/ 9595]
loss: 0.689911  [ 6300/ 9595]
loss: 0.692454  [ 9450/ 9595]
Test Error: 
 Accuracy: 0.497186, Avg loss: 0.694098 

Epoch 2
--------------------------------
loss: 0.687758  [    0/ 9595]
loss: 0.704012  [ 3150/ 9595]
loss: 0.657614  [ 6300/ 9595]
loss: 0.665169  [ 9450/ 9595]
Test Error: 
 Accuracy: 0.518762, Avg loss: 0.693731 

Epoch 3
--------------------------------
loss: 0.644710  [    0/ 9595]
loss: 0.599036  [ 3150/ 9595]
loss: 0.512018  [ 6300/ 9595]
loss: 0.718273  [ 9450/ 9595]
Test Error: 
 Accuracy: 0.640713, Avg loss: 0.646798 

Epoch 4
--------------------------------
loss: 0.485913  [    0/ 9595]
loss: 0.683398  [ 3150/ 9595]
loss: 0.597125  [ 6300/ 9595]
loss: 0.521362  [ 9450/ 9595]
Test Error: 
 Accuracy: 0.671670, Avg loss: 0.686190 

Epoch 5
--------------------------------
loss: 0.300442  [    0/ 9595]
loss: 0.563343  [ 3150/ 9595]
loss: 0.306593  [ 6300/ 9595]
loss:

2.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)
* Целевое значение accuracy на валидации - 70+%