In [1]:
import re
import typing as t
from collections import defaultdict
import csv
from pathlib import Path
from tqdm import tqdm

import nltk
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import stopwords, wordnet
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader, Subset, random_split

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
DATA_DIR = Path("data/")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {DEVICE.upper()} device")

Using CUDA device


In [4]:
def on_cuda(device: str) -> bool:
    return device == "cuda"


def common_train(
        model: nn.Module,
        loss_fn: nn.Module,
        optimizer: optim.Optimizer,
        train_dataloader: DataLoader,
        epochs: int,
        test_dataloader: DataLoader = None,
        lr_scheduler=None,
        verbose: int = 100,
        device: str = "cpu",
) -> t.List[float]:
    train_losses = []
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}\n" + "-" * 32)
        train_loss = train_loop(
            train_dataloader,
            model,
            loss_fn,
            optimizer,
            verbose=verbose,
            device=device,
        )
        train_losses.append(train_loss.item())
        if test_dataloader:
            loss, acc = test_loop(test_dataloader, model, loss_fn, device=device)
            if lr_scheduler:
                lr_scheduler.step(loss)
        torch.cuda.empty_cache()
    return train_losses


def train_loop(
        dataloader: DataLoader,
        model: nn.Module,
        loss_fn: nn.Module,
        optimizer: optim.Optimizer,
        verbose: int = 100,
        device: str = "cpu",
) -> torch.Tensor:
    model.train()

    size = len(dataloader.dataset)  # noqa
    num_batches = len(dataloader)
    avg_loss = 0

    for batch, (x, y) in enumerate(dataloader):
        x, y = x.to(device), y.to(device)

        pred = model(x)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()

        avg_loss += loss
        if batch % verbose == 0:
            print(f"loss: {loss:>7f}  [{batch * len(x):>5d}/{size:>5d}]")

        del x, y, pred, loss
        torch.cuda.empty_cache()

    return avg_loss / num_batches


@torch.no_grad()
def test_loop(
        dataloader: DataLoader,
        model: nn.Module,
        loss_fn: nn.Module,
        device: str = "cpu",
) -> t.Tuple[torch.Tensor, torch.Tensor]:
    model.eval()

    size = len(dataloader.dataset)  # noqa
    num_batches = len(dataloader)
    avg_loss, correct = 0, 0

    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        pred = model(x)
        avg_loss += loss_fn(pred, y)
        correct += (pred.argmax(1) == y).type(torch.float).sum().item()  # noqa

        del x, y, pred
        torch.cuda.empty_cache()

    avg_loss /= num_batches
    accuracy = correct / size
    print(f"Test Error: \n Accuracy: {accuracy:>4f}, Avg loss: {avg_loss:>8f} \n")

    return avg_loss, accuracy


def train_test_split(dataset: t.Union[Dataset, t.Sized], train_part: float) -> t.Tuple[Subset, Subset]:
    train_size = round(train_part * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, lengths=(train_size, test_size))
    return train_dataset, test_dataset


@torch.no_grad()
def get_y_test_y_pred(
        model: nn.Module,
        test_dataloader: DataLoader,
        device: str = "cpu",
) -> t.Tuple[torch.Tensor, torch.Tensor]:
    model.eval()

    y_test = []
    y_pred = []
    for x, y in test_dataloader:
        x, y = x.to(device), y.to(device)
        pred = model(x).argmax(1)
        y_test.append(y)
        y_pred.append(pred)

        del x
        torch.cuda.empty_cache()

    return torch.hstack(y_test).detach().cpu(), torch.hstack(y_pred).detach().cpu()

## 1. Классификация фамилий (RNN)

Датасет: https://disk.yandex.ru/d/frNchuaBQVLxyA?w=1

1.1 Используя класс `nn.RNNCell` (абстракцию для отдельного временного шага RNN), реализуйте простейшую рекуррентную сеть Элмана в виде класса `RNN`. Используя созданный класс `RNN`, решите задачу классификации фамилий. 


In [92]:
class RNN(nn.Module):

    def __init__(self, input_size: int, hidden_size: int):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.rnn_cell = nn.RNNCell(input_size, hidden_size)

    def forward(self, inputs: torch.Tensor, hx: torch.Tensor = None):
        batch_size, sequence_size, _ = inputs.size()
        inputs = inputs.permute(1, 0, 2)  # для nn.RNNCell batch_size должен быть на 2-ой месте

        if hx is None:
            # так же скрытое состояние инициализируется в nn.RNN
            hx = torch.zeros(batch_size, self.hidden_size, dtype=inputs.dtype, device=inputs.device)
        else:
            # 1-ая размерность равная 1 для совместимости с nn.RNN
            hx = hx.squeeze(0)  # избавляемся от 1-ой размерности равной 1

        hidden = []
        for i in range(sequence_size):
            hx = self.rnn_cell(inputs[i], hx)
            hidden.append(hx)

        hidden = torch.stack(hidden)
        hx = hidden[-1].unsqueeze(0)
        return hidden.permute(1, 0, 2), hx

Проверка реализации RNN:

In [93]:
torch.manual_seed(0)

input_size, hidden_size = 4, 5
inputs = torch.randn(2, 3, input_size)
hx = torch.randn(1, 2, hidden_size)

torch.manual_seed(0)
my_rnn = RNN(input_size=input_size, hidden_size=hidden_size)

torch.manual_seed(0)
true_rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, batch_first=True)

In [94]:
my_rnn(inputs, hx)

torch.Size([2, 5])
torch.Size([1, 2, 5])


(tensor([[[ 0.6515,  0.5430,  0.4023,  0.6325, -0.6068],
          [ 0.9149, -0.1088,  0.6385, -0.7387,  0.7532],
          [-0.6936,  0.5123, -0.2784, -0.5693, -0.0055]],
 
         [[ 0.1954,  0.6152,  0.2958, -0.8005,  0.8074],
          [-0.4577,  0.7566,  0.2972, -0.8834,  0.1265],
          [ 0.7166,  0.1516,  0.8047, -0.2007,  0.8192]]],
        grad_fn=<PermuteBackward0>),
 tensor([[[-0.6936,  0.5123, -0.2784, -0.5693, -0.0055],
          [ 0.7166,  0.1516,  0.8047, -0.2007,  0.8192]]],
        grad_fn=<UnsqueezeBackward0>))

In [8]:
true_rnn(inputs, hx)

(tensor([[[ 0.6515,  0.5430,  0.4023,  0.6325, -0.6068],
          [ 0.9149, -0.1088,  0.6385, -0.7387,  0.7532],
          [-0.6936,  0.5123, -0.2784, -0.5693, -0.0055]],
 
         [[ 0.1954,  0.6152,  0.2958, -0.8005,  0.8074],
          [-0.4577,  0.7566,  0.2972, -0.8834,  0.1265],
          [ 0.7166,  0.1516,  0.8047, -0.2007,  0.8192]]],
        grad_fn=<TransposeBackward1>),
 tensor([[[-0.6936,  0.5123, -0.2784, -0.5693, -0.0055],
          [ 0.7166,  0.1516,  0.8047, -0.2007,  0.8192]]],
        grad_fn=<StackBackward0>))

100% совпадение

In [9]:
class SurnamesRNNClassifier(nn.Module):

    def __init__(
            self,
            num_embeddings: int,
            embedding_dim: int,
            rnn_hidden_size: int,
            vector_size: int,
            num_classes: int,
    ):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim, padding_idx=0)
        self.rnn = RNN(input_size=embedding_dim, hidden_size=rnn_hidden_size)
        self.classifier = nn.Sequential(
            nn.Linear(rnn_hidden_size * vector_size, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.embedding(x)
        x, hx = self.rnn(x)
        x = torch.flatten(x, 1)
        return self.classifier(x)

In [10]:
class SurnamesVocab:
    pad = "<PAD>"

    def __init__(self, surnames: t.List[str]):
        uniques = set()
        max_len = 0
        for w in map(str.lower, surnames):
            uniques.update(w)
            max_len = max(len(w), max_len)

        self.alphabet = [self.pad, *uniques]
        self.max_len = max_len
        self.ch2i = {ch: i for i, ch in enumerate(self.alphabet)}

    def __len__(self):
        return len(self.alphabet)

    def encode(self, word: str) -> torch.Tensor:
        indices = [self.ch2i[ch] for ch in word]
        indices += [self.ch2i[self.pad]] * (self.max_len - len(indices))
        return torch.tensor(indices, dtype=torch.long)

    def decode(self, indices: torch.Tensor) -> str:
        pad_indices = torch.nonzero(indices == self.ch2i[self.pad], as_tuple=True)[0]
        if len(pad_indices):
            indices = indices[:pad_indices[0]]
        return "".join(self.alphabet[i] for i in indices)


class SurnamesDataset(Dataset):
    df: pd.DataFrame
    surnames: t.List[str]
    vocab: SurnamesVocab
    labeler: LabelEncoder
    data: torch.Tensor
    targets: torch.Tensor

    def __init__(self, path: Path):
        self.df = pd.read_csv(path)

        self.surnames = self.df["surname"].tolist()
        self.vocab = SurnamesVocab(self.surnames)
        size = self.vocab.encode(self.surnames[0].lower()).size()
        data = torch.vstack([self.vocab.encode(w.lower()) for w in self.surnames])
        self.data = data.view(len(self.surnames), *size)

        self.labeler = LabelEncoder()
        targets = self.labeler.fit_transform(self.df["nationality"])
        self.targets = torch.tensor(targets, dtype=torch.long)

    def __len__(self):
        return self.data.size(0)

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

    def encode(self, word: str) -> torch.Tensor:
        return self.vocab.encode(word)

    def decode(self, indices: torch.Tensor) -> str:
        return self.vocab.decode(indices)

In [11]:
surnames_dataset = SurnamesDataset(DATA_DIR / "surnames.csv")
len(surnames_dataset)

10980

In [12]:
torch.manual_seed(0)

train_surnames_dataset, test_surnames_dataset = train_test_split(surnames_dataset, train_part=0.8)
print(len(train_surnames_dataset), len(test_surnames_dataset))

8784 2196


### Handmade RNN

In [13]:
torch.manual_seed(0)

handmade_rnn_net = SurnamesRNNClassifier(
    num_embeddings=len(surnames_dataset.vocab),
    embedding_dim=128,
    rnn_hidden_size=64,
    vector_size=surnames_dataset.vocab.max_len,
    num_classes=len(surnames_dataset.labeler.classes_),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(handmade_rnn_net.parameters(), lr=0.0015)

train_dataloader = DataLoader(train_surnames_dataset, batch_size=128, shuffle=True, drop_last=True)
test_dataloader = DataLoader(test_surnames_dataset, batch_size=512)

In [14]:
%%time

_ = common_train(
    epochs=20,
    model=handmade_rnn_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    verbose=50,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 2.878279  [    0/ 8784]
loss: 1.349192  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.635246, Avg loss: 1.264551 

Epoch 2
--------------------------------
loss: 1.322870  [    0/ 8784]
loss: 1.030583  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.700364, Avg loss: 1.033390 

Epoch 3
--------------------------------
loss: 0.802031  [    0/ 8784]
loss: 1.007586  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.724044, Avg loss: 0.946684 

Epoch 4
--------------------------------
loss: 0.785045  [    0/ 8784]
loss: 0.951692  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.737705, Avg loss: 0.891038 

Epoch 5
--------------------------------
loss: 0.708065  [    0/ 8784]
loss: 0.741949  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.741348, Avg loss: 0.868706 

Epoch 6
--------------------------------
loss: 0.566563  [    0/ 8784]
loss: 0.636736  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.751821, Avg loss: 0.831084 

Epoch 7
--------------------------------
loss: 0.800768  [    0/

In [15]:
y_test, y_pred = get_y_test_y_pred(handmade_rnn_net, test_dataloader, DEVICE)

print(metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=surnames_dataset.labeler.classes_,
    zero_division=True,
))

              precision    recall  f1-score   support

      Arabic       0.96      1.00      0.98       340
     Chinese       0.76      0.66      0.70        38
       Czech       0.64      0.31      0.42        96
       Dutch       0.76      0.43      0.55        51
     English       0.73      0.86      0.79       573
      French       0.19      0.13      0.15        39
      German       0.53      0.56      0.55       121
       Greek       0.72      0.68      0.70        34
       Irish       0.61      0.38      0.47        37
     Italian       0.67      0.76      0.71       128
    Japanese       0.85      0.83      0.84       156
      Korean       0.38      0.30      0.33        10
      Polish       0.58      0.54      0.56        26
  Portuguese       0.00      0.00      0.00         9
     Russian       0.85      0.85      0.85       458
    Scottish       0.00      0.00      0.00        17
     Spanish       0.37      0.38      0.38        50
  Vietnamese       0.25    

1.2 Замените модуль `RNN` из 1.1 на модули `nn.RNN`, `nn.LSTM` и `nn.GRU` (не забудьте указать аргумент `batch_first=True`). Сравните результаты работы.

In [16]:
class SurnamesAutobotRNNClassifier(nn.Module):

    def __init__(
            self,
            rnn_cls: t.Union[t.Type[nn.RNN], t.Type[nn.LSTM], t.Type[nn.GRU]],
            num_embeddings: int,
            embedding_dim: int,
            rnn_hidden_size: int,
            vector_size: int,
            num_classes: int,
    ):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim, padding_idx=0)
        self.hx, self.cx = None, None
        self.rnn = rnn_cls(input_size=embedding_dim, hidden_size=rnn_hidden_size)
        self.classifier = nn.Sequential(
            nn.Linear(rnn_hidden_size * vector_size, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.embedding(x)

        if isinstance(self.rnn, (nn.RNN, nn.GRU)):
            x, hx = self.rnn(x, self.hx)
            self.hx = hx.detach()
        else:
            if self.hx is not None and self.cx is not None:
                hx_cx = (self.hx, self.cx)
            else:
                hx_cx = None
            x, (hx, cx) = self.rnn(x, hx_cx)
            self.cx = cx.detach()
            self.hx = hx.detach()

        x = torch.flatten(x, 1)
        return self.classifier(x)

### nn.RNN

In [17]:
torch.manual_seed(0)

rnn_net = SurnamesAutobotRNNClassifier(
    rnn_cls=nn.RNN,
    num_embeddings=len(surnames_dataset.vocab),
    embedding_dim=128,
    rnn_hidden_size=64,
    vector_size=surnames_dataset.vocab.max_len,
    num_classes=len(surnames_dataset.labeler.classes_),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn_net.parameters(), lr=0.0015)

train_dataloader = DataLoader(train_surnames_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_surnames_dataset, batch_size=512)

In [18]:
%%time

_ = common_train(
    epochs=20,
    model=rnn_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    verbose=50,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 2.872500  [    0/ 8784]
loss: 1.533388  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.584699, Avg loss: 1.441185 

Epoch 2
--------------------------------
loss: 1.513549  [    0/ 8784]
loss: 1.125278  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.644809, Avg loss: 1.203117 

Epoch 3
--------------------------------
loss: 0.963235  [    0/ 8784]
loss: 1.213355  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.676230, Avg loss: 1.101182 

Epoch 4
--------------------------------
loss: 0.992673  [    0/ 8784]
loss: 1.117367  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.701275, Avg loss: 1.032854 

Epoch 5
--------------------------------
loss: 0.899363  [    0/ 8784]
loss: 0.877292  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.702641, Avg loss: 1.012262 

Epoch 6
--------------------------------
loss: 0.748350  [    0/ 8784]
loss: 0.720974  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.725410, Avg loss: 0.950959 

Epoch 7
--------------------------------
loss: 0.795079  [    0/

In [19]:
y_test, y_pred = get_y_test_y_pred(rnn_net, test_dataloader, DEVICE)

print(metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=surnames_dataset.labeler.classes_,
    zero_division=True,
))

              precision    recall  f1-score   support

      Arabic       0.96      1.00      0.98       340
     Chinese       0.70      0.68      0.69        38
       Czech       0.50      0.22      0.30        96
       Dutch       0.67      0.35      0.46        51
     English       0.67      0.87      0.76       573
      French       0.21      0.10      0.14        39
      German       0.60      0.42      0.50       121
       Greek       0.67      0.47      0.55        34
       Irish       0.69      0.30      0.42        37
     Italian       0.61      0.70      0.65       128
    Japanese       0.83      0.84      0.84       156
      Korean       0.21      0.30      0.25        10
      Polish       0.67      0.46      0.55        26
  Portuguese       0.25      0.11      0.15         9
     Russian       0.86      0.85      0.86       458
    Scottish       0.00      0.00      0.00        17
     Spanish       0.47      0.38      0.42        50
  Vietnamese       0.60    

### nn.LSTM

In [20]:
torch.manual_seed(0)

lstm_net = SurnamesAutobotRNNClassifier(
    rnn_cls=nn.LSTM,
    num_embeddings=len(surnames_dataset.vocab),
    embedding_dim=128,
    rnn_hidden_size=64,
    vector_size=surnames_dataset.vocab.max_len,
    num_classes=len(surnames_dataset.labeler.classes_),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_net.parameters(), lr=0.0015)

train_dataloader = DataLoader(train_surnames_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_surnames_dataset, batch_size=512)

In [21]:
%%time

_ = common_train(
    epochs=20,
    model=lstm_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    verbose=50,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 2.889345  [    0/ 8784]
loss: 1.869502  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.548270, Avg loss: 1.580504 

Epoch 2
--------------------------------
loss: 1.611331  [    0/ 8784]
loss: 1.340435  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.615665, Avg loss: 1.319704 

Epoch 3
--------------------------------
loss: 1.414919  [    0/ 8784]
loss: 1.113623  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.661658, Avg loss: 1.163483 

Epoch 4
--------------------------------
loss: 1.102015  [    0/ 8784]
loss: 0.984063  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.692623, Avg loss: 1.071542 

Epoch 5
--------------------------------
loss: 0.862351  [    0/ 8784]
loss: 0.802106  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.698087, Avg loss: 1.012793 

Epoch 6
--------------------------------
loss: 0.773141  [    0/ 8784]
loss: 0.899641  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.708561, Avg loss: 0.983065 

Epoch 7
--------------------------------
loss: 0.737374  [    0/

In [22]:
y_test, y_pred = get_y_test_y_pred(lstm_net, test_dataloader, DEVICE)

print(metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=surnames_dataset.labeler.classes_,
    zero_division=True,
))

              precision    recall  f1-score   support

      Arabic       0.98      1.00      0.99       340
     Chinese       0.66      0.76      0.71        38
       Czech       0.46      0.19      0.27        96
       Dutch       0.95      0.39      0.56        51
     English       0.67      0.88      0.76       573
      French       0.11      0.05      0.07        39
      German       0.57      0.45      0.50       121
       Greek       0.58      0.53      0.55        34
       Irish       0.73      0.30      0.42        37
     Italian       0.70      0.68      0.69       128
    Japanese       0.88      0.85      0.87       156
      Korean       0.10      0.10      0.10        10
      Polish       0.56      0.38      0.45        26
  Portuguese       0.00      0.00      0.00         9
     Russian       0.80      0.88      0.84       458
    Scottish       0.00      0.00      0.00        17
     Spanish       0.56      0.30      0.39        50
  Vietnamese       1.00    

### nn.GRU

In [23]:
torch.manual_seed(0)

gru_net = SurnamesAutobotRNNClassifier(
    rnn_cls=nn.GRU,
    num_embeddings=len(surnames_dataset.vocab),
    embedding_dim=128,
    rnn_hidden_size=64,
    vector_size=surnames_dataset.vocab.max_len,
    num_classes=len(surnames_dataset.labeler.classes_),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(gru_net.parameters(), lr=0.0015)

train_dataloader = DataLoader(train_surnames_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_surnames_dataset, batch_size=512)

In [24]:
%%time

_ = common_train(
    epochs=20,
    model=gru_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    verbose=50,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 2.877352  [    0/ 8784]
loss: 1.645322  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.557377, Avg loss: 1.544307 

Epoch 2
--------------------------------
loss: 1.330766  [    0/ 8784]
loss: 1.426452  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.638889, Avg loss: 1.266454 

Epoch 3
--------------------------------
loss: 1.129196  [    0/ 8784]
loss: 1.019272  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.677140, Avg loss: 1.118819 

Epoch 4
--------------------------------
loss: 0.941416  [    0/ 8784]
loss: 1.017978  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.704463, Avg loss: 1.054566 

Epoch 5
--------------------------------
loss: 1.103538  [    0/ 8784]
loss: 0.977748  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.705373, Avg loss: 1.010324 

Epoch 6
--------------------------------
loss: 0.826999  [    0/ 8784]
loss: 0.795982  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.722222, Avg loss: 0.977933 

Epoch 7
--------------------------------
loss: 0.745100  [    0/

In [25]:
y_test, y_pred = get_y_test_y_pred(gru_net, test_dataloader, DEVICE)

print(metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=surnames_dataset.labeler.classes_,
    zero_division=True,
))

              precision    recall  f1-score   support

      Arabic       0.98      1.00      0.99       340
     Chinese       0.81      0.66      0.72        38
       Czech       0.53      0.28      0.37        96
       Dutch       0.68      0.37      0.48        51
     English       0.66      0.87      0.75       573
      French       0.09      0.05      0.07        39
      German       0.56      0.37      0.45       121
       Greek       0.65      0.50      0.57        34
       Irish       0.69      0.30      0.42        37
     Italian       0.64      0.70      0.67       128
    Japanese       0.83      0.86      0.84       156
      Korean       0.27      0.40      0.32        10
      Polish       0.79      0.42      0.55        26
  Portuguese       0.00      0.00      0.00         9
     Russian       0.84      0.84      0.84       458
    Scottish       0.00      0.00      0.00        17
     Spanish       0.44      0.30      0.36        50
  Vietnamese       0.20    

1.3 Загрузите предобученные эмбеддинги (https://disk.yandex.ru/d/BHuT2tEXr_yBOQ?w=1) в модуль `nn.Embedding` и обучите модели из 1.2.

In [26]:
class SurnamesDecepticonRNNClassifier(nn.Module):

    def __init__(
            self,
            embedding: nn.Embedding,
            rnn_cls: t.Union[t.Type[nn.RNN], t.Type[nn.LSTM], t.Type[nn.GRU]],
            rnn_hidden_size: int,
            vector_size: int,
            num_classes: int,
    ):
        super().__init__()
        self.embedding = embedding
        self.hx, self.cx = None, None
        self.rnn = rnn_cls(input_size=self.embedding.embedding_dim, hidden_size=rnn_hidden_size)
        self.classifier = nn.Sequential(
            nn.Linear(rnn_hidden_size * vector_size, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, num_classes),
        )

    def reset_rnn_state(self):
        self.hx, self.cx = None, None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.embedding(x)

        if isinstance(self.rnn, (nn.RNN, nn.GRU)):
            x, hx = self.rnn(x, self.hx)
            self.hx = hx.detach()
        else:
            if self.hx is not None and self.cx is not None:
                hx_cx = (self.hx, self.cx)
            else:
                hx_cx = None
            x, (hx, cx) = self.rnn(x, hx_cx)
            self.hx = hx.detach()
            self.cx = cx.detach()

        x = torch.flatten(x, 1)
        return self.classifier(x)

In [27]:
torch.manual_seed(0)

embedding_weights = pd.read_csv(
    DATA_DIR / "glove.6B/glove.6B.50d.txt",
    sep=" ",
    quoting=csv.QUOTE_NONE,
    index_col=0,
    header=None,
)

weights = torch.ones(len(surnames_dataset.vocab), embedding_weights.shape[1], dtype=torch.float32)
torch.nn.init.normal_(weights)

for i, ch in enumerate(surnames_dataset.vocab.alphabet):
    try:
        weights[i] = torch.from_numpy(embedding_weights.loc[ch].to_numpy())
    except KeyError:
        pass

embedding = nn.Embedding.from_pretrained(weights, padding_idx=0)
embedding

Embedding(56, 50, padding_idx=0)

### nn.RNN

In [28]:
torch.manual_seed(0)

rnn_net = SurnamesDecepticonRNNClassifier(
    embedding=embedding,
    rnn_cls=nn.RNN,
    rnn_hidden_size=64,
    vector_size=surnames_dataset.vocab.max_len,
    num_classes=len(surnames_dataset.labeler.classes_),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn_net.parameters(), lr=0.0015)

train_dataloader = DataLoader(train_surnames_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_surnames_dataset, batch_size=512)

In [29]:
%%time

_ = common_train(
    epochs=20,
    model=rnn_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    verbose=50,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 2.962828  [    0/ 8784]
loss: 2.089234  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.457650, Avg loss: 1.828808 

Epoch 2
--------------------------------
loss: 2.027457  [    0/ 8784]
loss: 1.705763  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.514572, Avg loss: 1.608777 

Epoch 3
--------------------------------
loss: 1.491407  [    0/ 8784]
loss: 1.658731  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.583333, Avg loss: 1.445631 

Epoch 4
--------------------------------
loss: 1.733590  [    0/ 8784]
loss: 1.394378  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.624772, Avg loss: 1.338745 

Epoch 5
--------------------------------
loss: 1.283463  [    0/ 8784]
loss: 1.377465  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.649362, Avg loss: 1.237860 

Epoch 6
--------------------------------
loss: 1.059728  [    0/ 8784]
loss: 1.211117  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.666667, Avg loss: 1.172205 

Epoch 7
--------------------------------
loss: 0.949637  [    0/

In [30]:
y_test, y_pred = get_y_test_y_pred(rnn_net, test_dataloader, DEVICE)

print(metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=surnames_dataset.labeler.classes_,
    zero_division=True,
))

              precision    recall  f1-score   support

      Arabic       0.93      1.00      0.96       340
     Chinese       0.54      0.84      0.66        38
       Czech       0.62      0.16      0.25        96
       Dutch       0.67      0.27      0.39        51
     English       0.66      0.90      0.76       573
      French       0.07      0.03      0.04        39
      German       0.66      0.32      0.43       121
       Greek       0.58      0.56      0.57        34
       Irish       0.78      0.19      0.30        37
     Italian       0.59      0.84      0.69       128
    Japanese       0.81      0.81      0.81       156
      Korean       0.40      0.20      0.27        10
      Polish       0.57      0.31      0.40        26
  Portuguese       1.00      0.11      0.20         9
     Russian       0.87      0.84      0.85       458
    Scottish       0.00      0.00      0.00        17
     Spanish       0.58      0.28      0.38        50
  Vietnamese       0.00    

### nn.LSTM

In [31]:
torch.manual_seed(0)

lstm_net = SurnamesDecepticonRNNClassifier(
    embedding=embedding,
    rnn_cls=nn.LSTM,
    rnn_hidden_size=64,
    vector_size=surnames_dataset.vocab.max_len,
    num_classes=len(surnames_dataset.labeler.classes_),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_net.parameters(), lr=0.0015)

train_dataloader = DataLoader(train_surnames_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_surnames_dataset, batch_size=512)

In [32]:
%%time

_ = common_train(
    epochs=20,
    model=lstm_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    verbose=50,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 2.906416  [    0/ 8784]
loss: 2.187701  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.401184, Avg loss: 1.993795 

Epoch 2
--------------------------------
loss: 1.975016  [    0/ 8784]
loss: 1.826383  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.509563, Avg loss: 1.667807 

Epoch 3
--------------------------------
loss: 1.756627  [    0/ 8784]
loss: 1.681357  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.571038, Avg loss: 1.499368 

Epoch 4
--------------------------------
loss: 1.506961  [    0/ 8784]
loss: 1.350298  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.604281, Avg loss: 1.382028 

Epoch 5
--------------------------------
loss: 1.546560  [    0/ 8784]
loss: 1.390815  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.642987, Avg loss: 1.267897 

Epoch 6
--------------------------------
loss: 1.314082  [    0/ 8784]
loss: 1.232135  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.657559, Avg loss: 1.195764 

Epoch 7
--------------------------------
loss: 1.177783  [    0/

In [33]:
y_test, y_pred = get_y_test_y_pred(lstm_net, test_dataloader, DEVICE)

print(metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=surnames_dataset.labeler.classes_,
    zero_division=True,
))

              precision    recall  f1-score   support

      Arabic       0.96      1.00      0.98       340
     Chinese       0.67      0.82      0.74        38
       Czech       0.45      0.21      0.29        96
       Dutch       0.74      0.33      0.46        51
     English       0.65      0.91      0.76       573
      French       0.20      0.03      0.05        39
      German       0.53      0.36      0.43       121
       Greek       0.58      0.44      0.50        34
       Irish       0.86      0.16      0.27        37
     Italian       0.66      0.74      0.70       128
    Japanese       0.76      0.89      0.82       156
      Korean       0.29      0.20      0.24        10
      Polish       0.55      0.23      0.32        26
  Portuguese       1.00      0.11      0.20         9
     Russian       0.86      0.83      0.84       458
    Scottish       0.00      0.00      0.00        17
     Spanish       0.59      0.20      0.30        50
  Vietnamese       0.00    

### nn.GRU

In [34]:
torch.manual_seed(0)

gru_net = SurnamesDecepticonRNNClassifier(
    embedding=embedding,
    rnn_cls=nn.GRU,
    rnn_hidden_size=64,
    vector_size=surnames_dataset.vocab.max_len,
    num_classes=len(surnames_dataset.labeler.classes_),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(gru_net.parameters(), lr=0.0015)

train_dataloader = DataLoader(train_surnames_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_surnames_dataset, batch_size=512)

In [35]:
%%time

_ = common_train(
    epochs=20,
    model=gru_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    verbose=50,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 2.878209  [    0/ 8784]
loss: 1.986637  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.458561, Avg loss: 1.834729 

Epoch 2
--------------------------------
loss: 1.671564  [    0/ 8784]
loss: 1.591380  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.542350, Avg loss: 1.581688 

Epoch 3
--------------------------------
loss: 1.571778  [    0/ 8784]
loss: 1.334489  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.597905, Avg loss: 1.412697 

Epoch 4
--------------------------------
loss: 1.261335  [    0/ 8784]
loss: 1.502182  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.624317, Avg loss: 1.304546 

Epoch 5
--------------------------------
loss: 1.172264  [    0/ 8784]
loss: 1.117940  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.643443, Avg loss: 1.241262 

Epoch 6
--------------------------------
loss: 1.310606  [    0/ 8784]
loss: 1.070718  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.663024, Avg loss: 1.178665 

Epoch 7
--------------------------------
loss: 1.219958  [    0/

In [36]:
y_test, y_pred = get_y_test_y_pred(gru_net, test_dataloader, DEVICE)

print(metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=surnames_dataset.labeler.classes_,
    zero_division=True,
))

              precision    recall  f1-score   support

      Arabic       0.93      1.00      0.96       340
     Chinese       0.71      0.76      0.73        38
       Czech       0.51      0.23      0.32        96
       Dutch       0.71      0.29      0.42        51
     English       0.65      0.90      0.76       573
      French       0.10      0.03      0.04        39
      German       0.56      0.40      0.47       121
       Greek       0.62      0.44      0.52        34
       Irish       0.75      0.32      0.45        37
     Italian       0.67      0.70      0.68       128
    Japanese       0.84      0.82      0.83       156
      Korean       0.25      0.20      0.22        10
      Polish       0.40      0.15      0.22        26
  Portuguese       1.00      0.11      0.20         9
     Russian       0.83      0.84      0.84       458
    Scottish       1.00      0.00      0.00        17
     Spanish       0.44      0.28      0.34        50
  Vietnamese       0.50    

## 2. Классификация новостей на основе заголовка

Датасет: https://disk.yandex.ru/d/FN-EgWGIpyjLxQ?w=1

Эмбеддинги: https://nlp.stanford.edu/projects/glove/ (находите ссылку на архив
glove.6B.zip, в нем несколько файлов с эмбеддингами слов, выбираете один из файлов в
архиве)

2.1 Загрузите набор данных train.csv. Выполните предобработку столбца Title

2.2 На основе этих данных создайте датасет NewsDataset . Не забудьте добавить
специальные токены `<PAD>` для дополнения последовательностей до нужной длины и
`<UNK>` для корректной обработке ранее не встречавшихся токенов. В данной задаче
рассматривайте отдельные слова как токены. Разбейте датасет на обучающее и
валидационное множество.

In [37]:
PATTERN = re.compile(r"[^a-z]", flags=re.MULTILINE)
STOPWORDS = set(stopwords.words("english"))


def simple_preprocess_news_title(title: str) -> str:
    return title.lower()


def complex_preprocess_news_title(
        title: str,
        lemmatizer_or_stemmer: t.Callable[[str], str] = None,
        min_word_len: int = 0,
) -> str:
    title = simple_preprocess_news_title(title)
    title = PATTERN.sub(" ", title)

    words = []
    for word in nltk.word_tokenize(title):
        if word not in STOPWORDS and len(word) >= min_word_len:
            if not lemmatizer_or_stemmer:
                words.append(word)
                continue
            word = lemmatizer_or_stemmer(word)
            if word not in STOPWORDS and len(word) >= min_word_len:
                words.append(word)

    return " ".join(words)

In [38]:
def get_pos(word: str) -> str:
    tag = nltk.pos_tag([word])[0][1]
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


_wordnet_lemmatizer = nltk.WordNetLemmatizer()


def wordnet_lemmatizer(token: str) -> str:
    return _wordnet_lemmatizer.lemmatize(token, pos=get_pos(token))


_snowball_stemmer = nltk.SnowballStemmer(language="english")


def snowball_stemmer(token: str) -> str:
    return _snowball_stemmer.stem(token)

In [39]:
class NewsVocab:
    pad = "<PAD>"
    unknown = "<UNK>"

    def __init__(self, news_titles: t.List[str], max_len: int = 0):
        uniques = set()
        for title in news_titles:
            words = nltk.word_tokenize(title)
            uniques.update(words)
            max_len = max(len(words), max_len)

        self.alphabet = [self.pad, self.unknown, *uniques]
        self.max_len = max_len

        w2i = {w: i for i, w in enumerate(self.alphabet)}
        unknown_idx = w2i[self.unknown]
        self.w2i = defaultdict(lambda: unknown_idx, w2i)

    def __len__(self):
        return len(self.alphabet)

    def encode(self, review: str) -> torch.Tensor:
        indices = [self.w2i[w] for w in nltk.word_tokenize(review)]
        indices += [self.w2i[self.pad]] * (self.max_len - len(indices))
        return torch.tensor(indices, dtype=torch.long)

    def decode(self, indices: torch.Tensor) -> str:
        pad_indices = torch.nonzero(indices == self.w2i[self.pad], as_tuple=True)[0]  # noqa
        if len(pad_indices):
            indices = indices[:pad_indices[0]]
        return " ".join(self.alphabet[i] for i in indices)


class NewsDataset(Dataset):
    df: pd.DataFrame
    titles: t.List[str]
    classes: t.List[int]
    vocab: NewsVocab
    data: torch.Tensor
    targets: torch.Tensor

    def __init__(self, path: Path, preprocess: t.Callable[[str], str], title_max_len: int = 0):
        self.df = pd.read_csv(path)

        self.titles = self.df["Title"].apply(preprocess).tolist()
        self.vocab = NewsVocab(self.titles, max_len=title_max_len)

        self.data = torch.vstack([self.vocab.encode(w.lower()) for w in self.titles])
        self.targets = torch.tensor(self.df["Class Index"], dtype=torch.long) - 1
        self.classes = self.targets.unique().tolist()

    def __len__(self):
        return self.data.size(0)

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

    def encode(self, title: str) -> torch.Tensor:
        return self.vocab.encode(title)

    def decode(self, indices: torch.Tensor) -> str:
        return self.vocab.decode(indices)

In [59]:
def preprocess_news_title(title: str) -> str:
    return complex_preprocess_news_title(title, lemmatizer_or_stemmer=snowball_stemmer, min_word_len=3)


train_news_dataset = NewsDataset(
    DATA_DIR / "news/train.csv",
    preprocess_news_title,
)
test_news_dataset = NewsDataset(
    DATA_DIR / "news/test.csv",
    preprocess_news_title,
    title_max_len=train_news_dataset.vocab.max_len,
)
len(train_news_dataset), len(test_news_dataset)

(120000, 7600)

In [67]:
train_news_dataloader = DataLoader(train_news_dataset, batch_size=256, shuffle=True)
test_news_dataloader = DataLoader(test_news_dataset, batch_size=512)

2.3 Создайте модель для классификации, используя слой nn.Embedding и слой nn.RNN.
эмбеддинги инициализируйте случайным образом не забудьте указать аргумент padding_idx для nn.Embedding

In [71]:
class NewsClassifier(nn.Module):

    def __init__(
            self,
            embedding: nn.Embedding,
            rnn_cls: t.Union[t.Type[nn.RNN], t.Type[nn.LSTM], t.Type[nn.GRU]],
            rnn_hidden_size: int,
            vector_size: int,
            num_classes: int,
    ):
        super().__init__()
        self.embedding = embedding
        self.hx, self.cx = None, None
        self.rnn = rnn_cls(input_size=self.embedding.embedding_dim, hidden_size=rnn_hidden_size)
        self.classifier = nn.Sequential(
            nn.Linear(rnn_hidden_size * vector_size, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, num_classes),
        )

    def reset_rnn_state(self):
        self.hx, self.cx = None, None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.embedding(x)

        if isinstance(self.rnn, (nn.RNN, nn.GRU)):
            x, hx = self.rnn(x, self.hx)
            self.hx = hx.detach()
        else:
            if self.hx is not None and self.cx is not None:
                hx_cx = (self.hx, self.cx)
            else:
                hx_cx = None
            x, (hx, cx) = self.rnn(x, hx_cx)
            self.hx = hx.detach()
            self.cx = cx.detach()

        x = torch.flatten(x, 1)
        return self.classifier(x)

In [43]:
torch.manual_seed(0)

news_rnn_net = NewsClassifier(
    embedding=nn.Embedding(num_embeddings=len(train_news_dataset.vocab), embedding_dim=64, padding_idx=0),
    rnn_cls=nn.RNN,
    rnn_hidden_size=64,
    vector_size=train_news_dataset.vocab.max_len,
    num_classes=len(train_news_dataset.classes),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(news_rnn_net.parameters(), lr=0.001)

In [44]:
%%time

_ = common_train(
    epochs=5,
    model=news_rnn_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_news_dataloader,
    test_dataloader=test_news_dataloader,
    verbose=500,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 1.358896  [    0/120000]


RuntimeError: CUDA out of memory. Tried to allocate 24.00 MiB (GPU 0; 2.00 GiB total capacity; 1.21 GiB already allocated; 0 bytes free; 1.23 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

2.4 Переобучите модель, заменив слой nn.RNN на nn.LSTM и nn.GRU . Сравните качество
на тестовой выборке. Результаты сведите в таблицу (модель/метрика качества на тестовом множестве).

In [None]:
torch.manual_seed(0)

news_lstm_net = NewsClassifier(
    embedding=nn.Embedding(num_embeddings=len(train_news_dataset.vocab), embedding_dim=64, padding_idx=0),
    rnn_cls=nn.LSTM,
    rnn_hidden_size=64,
    vector_size=train_news_dataset.vocab.max_len,
    num_classes=len(train_news_dataset.classes),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(news_lstm_net.parameters(), lr=0.001)

In [None]:
%%time

_ = common_train(
    epochs=5,
    model=news_lstm_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_news_dataloader,
    test_dataloader=test_news_dataloader,
    verbose=500,
    device=DEVICE,
)

In [None]:
torch.manual_seed(0)

news_gru_net = NewsClassifier(
    embedding=nn.Embedding(num_embeddings=len(train_news_dataset.vocab), embedding_dim=64, padding_idx=0),
    rnn_cls=nn.GRU,
    rnn_hidden_size=64,
    vector_size=train_news_dataset.vocab.max_len,
    num_classes=len(train_news_dataset.classes),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(news_gru_net.parameters(), lr=0.001)

In [None]:
%%time

_ = common_train(
    epochs=5,
    model=news_gru_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_news_dataloader,
    test_dataloader=test_news_dataloader,
    verbose=500,
    device=DEVICE,
)

2.5 Выполните пункты 2.3 и 2.4, используя предобученные эмбеддинги Glove.
Прокомментируйте результат.
Эмбеддинги из скачанного файла загрузите в виде двумерного тензора pretrained_embeddings.
Обратите внимание, что номер строки в этом тензоре должен соответствовать
токену (слову), имеющему такой индекс в вашем словаре.
для слов, которых нет в файле с эмбеддингами, инициализуйте эмбеддинг
случайным образом

In [60]:
torch.manual_seed(0)

embedding_weights = pd.read_csv(
    DATA_DIR / "glove.6B/glove.6B.100d.txt",
    sep=" ",
    quoting=csv.QUOTE_NONE,
    index_col=0,
    header=None,
)

weights = torch.empty(len(train_news_dataset.vocab), embedding_weights.shape[1], dtype=torch.float32)
torch.nn.init.normal_(weights)

for i, w in tqdm(enumerate(train_news_dataset.vocab.alphabet), total=len(train_news_dataset.vocab.alphabet)):
    try:
        weights[i] = torch.from_numpy(embedding_weights.loc[w].to_numpy())
    except KeyError:
        pass

embedding = nn.Embedding.from_pretrained(weights, padding_idx=0)
embedding

  0%|          | 19/21963 [00:00<05:05, 71.72it/s]

1


 37%|███▋      | 8024/21963 [01:10<01:56, 119.66it/s]

2200


 91%|█████████ | 20012/21963 [03:05<00:22, 85.81it/s] 

5604


100%|██████████| 21963/21963 [03:25<00:00, 107.13it/s]


Embedding(21963, 100, padding_idx=0)

In [75]:
torch.manual_seed(0)

news_rnn_pretrained_net = NewsClassifier(
    embedding=embedding,
    rnn_cls=nn.RNN,
    rnn_hidden_size=64,
    vector_size=train_news_dataset.vocab.max_len,
    num_classes=len(train_news_dataset.classes),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(news_rnn_pretrained_net.parameters(), lr=0.001)

In [76]:
%%time

_ = common_train(
    epochs=5,
    model=news_rnn_pretrained_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_news_dataloader,
    test_dataloader=test_news_dataloader,
    verbose=500,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 1.456971  [    0/120000]
loss: 1.387290  [64000/120000]
Test Error: 
 Accuracy: 0.262895, Avg loss: 1.385705 

Epoch 2
--------------------------------
loss: 1.369181  [    0/120000]
loss: 1.341796  [64000/120000]
Test Error: 
 Accuracy: 0.271711, Avg loss: 1.384669 

Epoch 3
--------------------------------
loss: 1.349925  [    0/120000]
loss: 1.319169  [64000/120000]
Test Error: 
 Accuracy: 0.266316, Avg loss: 1.386917 

Epoch 4
--------------------------------
loss: 1.322543  [    0/120000]
loss: 1.282620  [64000/120000]



KeyboardInterrupt



In [69]:
torch.manual_seed(0)

news_lstm_pretrained_net = NewsClassifier(
    embedding=embedding,
    rnn_cls=nn.LSTM,
    rnn_hidden_size=64,
    vector_size=train_news_dataset.vocab.max_len,
    num_classes=len(train_news_dataset.classes),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(news_lstm_pretrained_net.parameters(), lr=0.001)

In [None]:
%%time

_ = common_train(
    epochs=5,
    model=news_lstm_pretrained_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_news_dataloader,
    test_dataloader=test_news_dataloader,
    verbose=500,
    device=DEVICE,
)

In [None]:
torch.manual_seed(0)

news_gru_pretrained_net = NewsClassifier(
    embedding=embedding,
    rnn_cls=nn.GRU,
    rnn_hidden_size=64,
    vector_size=train_news_dataset.vocab.max_len,
    num_classes=len(train_news_dataset.classes),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(news_gru_pretrained_net.parameters(), lr=0.001)

In [None]:
%%time

_ = common_train(
    epochs=5,
    model=news_gru_pretrained_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_news_dataloader,
    test_dataloader=test_news_dataloader,
    verbose=500,
    device=DEVICE,
)

Результаты сведите в таблицу (модель/метрика качества на тестовом множестве).

In [None]:
def make_pivot_table(
        models: t.List[t.Tuple[str, NewsClassifier]],
        test_dataloader: DataLoader,
        device: str = "cpu",
) -> pd.DataFrame:
    general_report = {}
    for model in models:
        report = {}
        y_test, y_pred = get_y_test_y_pred(model, test_dataloader, device)
        ms = metrics.classification_report(y_test, y_pred, zero_division=True, output_dict=True)
        report["accuracy"] = ms["accuracy"]
        report["precision (w avg)"] = ms["weighted avg"]["precision"]
        report["recall (w avg)"] = ms["weighted avg"]["recall"]
        report["f1-score (w avg)"] = ms["weighted avg"]["f1-score"]
        general_report[model.rnn.__class__.__name__] = report
    return pd.DataFrame(general_report)

In [88]:
report = make_pivot_table(
    [
        ("RNN", news_rnn_net),
        ("LSTM", news_lstm_net),
        ("GRU", news_gru_net),
        ("RNN (pretrained)", news_rnn_pretrained_net),
        ("LSTM (pretrained)", news_lstm_pretrained_net),
        ("GRU (pretrained)", news_gru_pretrained_net),
    ],
    test_news_dataloader,
    DEVICE,
)
report

NameError: name 'make_pivot_table' is not defined

In [None]:
report.plot.bar();