In [1]:
import re
import typing as t
from collections import defaultdict
from functools import lru_cache
from pathlib import Path

import nltk
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from nltk.corpus import stopwords, wordnet
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader, Subset, random_split

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
DATA_DIR = Path("data/")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {DEVICE.upper()} device")

Using CUDA device


In [4]:
def on_cuda(device: str) -> bool:
    return device == "cuda"


def common_train(
        model: nn.Module,
        loss_fn: nn.Module,
        optimizer: optim.Optimizer,
        train_dataloader: DataLoader,
        epochs: int,
        test_dataloader: DataLoader = None,
        lr_scheduler=None,
        verbose: int = 100,
        device: str = "cpu",
) -> t.List[float]:
    train_losses = []
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}\n" + "-" * 32)
        train_loss = train_loop(
            train_dataloader,
            model,
            loss_fn,
            optimizer,
            verbose=verbose,
            device=device,
        )
        train_losses.append(train_loss.item())
        if test_dataloader:
            loss, acc = test_loop(test_dataloader, model, loss_fn, device=device)
            if lr_scheduler:
                lr_scheduler.step(loss)
        torch.cuda.empty_cache()
    return train_losses


def train_loop(
        dataloader: DataLoader,
        model: nn.Module,
        loss_fn: nn.Module,
        optimizer: optim.Optimizer,
        verbose: int = 100,
        device: str = "cpu",
) -> torch.Tensor:
    model.train()

    size = len(dataloader.dataset)  # noqa
    num_batches = len(dataloader)
    avg_loss = 0

    for batch, (x, y) in enumerate(dataloader):
        x, y = x.to(device), y.to(device)

        pred = model(x)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()

        avg_loss += loss
        if batch % verbose == 0:
            print(f"loss: {loss:>7f}  [{batch * len(x):>5d}/{size:>5d}]")

        del x, y, pred, loss
        torch.cuda.empty_cache()

    return avg_loss / num_batches


@torch.no_grad()
def test_loop(
        dataloader: DataLoader,
        model: nn.Module,
        loss_fn: nn.Module,
        device: str = "cpu",
) -> t.Tuple[torch.Tensor, torch.Tensor]:
    model.eval()

    size = len(dataloader.dataset)  # noqa
    num_batches = len(dataloader)
    avg_loss, correct = 0, 0

    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        pred = model(x)
        avg_loss += loss_fn(pred, y)
        correct += (pred.argmax(1) == y).type(torch.float).sum().item()  # noqa

        del x, y, pred
        torch.cuda.empty_cache()

    avg_loss /= num_batches
    accuracy = correct / size
    print(f"Test Error: \n Accuracy: {accuracy:>4f}, Avg loss: {avg_loss:>8f} \n")

    return avg_loss, accuracy


def train_test_split(dataset: t.Union[Dataset, t.Sized], train_part: float) -> t.Tuple[Subset, Subset]:
    train_size = round(train_part * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, lengths=(train_size, test_size))
    return train_dataset, test_dataset


@torch.no_grad()
def get_y_test_y_pred(
        model: nn.Module,
        test_dataloader: DataLoader,
        device: str = "cpu",
) -> t.Tuple[torch.Tensor, torch.Tensor]:
    model.eval()

    y_test = []
    y_pred = []
    for x, y in test_dataloader:
        x, y = x.to(device), y.to(device)
        pred = model(x).argmax(1)
        y_test.append(y)
        y_pred.append(pred)

        del x
        torch.cuda.empty_cache()

    return torch.hstack(y_test).detach().cpu(), torch.hstack(y_pred).detach().cpu()

## 1. Классификация фамилий (RNN)

Датасет: https://disk.yandex.ru/d/frNchuaBQVLxyA?w=1

1.1 Используя класс `nn.RNNCell` (абстракцию для отдельного временного шага RNN), реализуйте простейшую рекуррентную сеть Элмана в виде класса `RNN`. Используя созданный класс `RNN`, решите задачу классификации фамилий. 


In [5]:
class RNN(nn.Module):

    def __init__(self, input_size: int, hidden_size: int):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.rnn_cell = nn.RNNCell(input_size, hidden_size)

    def forward(self, inputs: torch.Tensor, hx: torch.Tensor = None):
        batch_size, sequence_size, _ = inputs.size()
        inputs = inputs.permute(1, 0, 2)

        if hx is None:
            hx = torch.zeros(batch_size, self.hidden_size, dtype=inputs.dtype, device=inputs.device)
        else:
            hx = hx.squeeze(0)

        hidden = []
        for i in range(sequence_size):
            hx = self.rnn_cell(inputs[i], hx)
            hidden.append(hx)

        hidden = torch.stack(hidden)
        hx = hidden[-1].unsqueeze(0)
        return hidden.permute(1, 0, 2), hx

Проверка реализации RNN:

In [6]:
torch.manual_seed(0)

input_size, hidden_size = 4, 5
inputs = torch.randn(2, 3, input_size)
hx = torch.randn(1, 2, hidden_size)

torch.manual_seed(0)
my_rnn = RNN(input_size=input_size, hidden_size=hidden_size)

torch.manual_seed(0)
true_rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, batch_first=True)

In [7]:
my_rnn(inputs, hx)

(tensor([[[ 0.6515,  0.5430,  0.4023,  0.6325, -0.6068],
          [ 0.9149, -0.1088,  0.6385, -0.7387,  0.7532],
          [-0.6936,  0.5123, -0.2784, -0.5693, -0.0055]],
 
         [[ 0.1954,  0.6152,  0.2958, -0.8005,  0.8074],
          [-0.4577,  0.7566,  0.2972, -0.8834,  0.1265],
          [ 0.7166,  0.1516,  0.8047, -0.2007,  0.8192]]],
        grad_fn=<PermuteBackward>),
 tensor([[[-0.6936,  0.5123, -0.2784, -0.5693, -0.0055],
          [ 0.7166,  0.1516,  0.8047, -0.2007,  0.8192]]],
        grad_fn=<UnsqueezeBackward0>))

In [8]:
true_rnn(inputs, hx)

(tensor([[[ 0.6515,  0.5430,  0.4023,  0.6325, -0.6068],
          [ 0.9149, -0.1088,  0.6385, -0.7387,  0.7532],
          [-0.6936,  0.5123, -0.2784, -0.5693, -0.0055]],
 
         [[ 0.1954,  0.6152,  0.2958, -0.8005,  0.8074],
          [-0.4577,  0.7566,  0.2972, -0.8834,  0.1265],
          [ 0.7166,  0.1516,  0.8047, -0.2007,  0.8192]]],
        grad_fn=<TransposeBackward1>),
 tensor([[[-0.6936,  0.5123, -0.2784, -0.5693, -0.0055],
          [ 0.7166,  0.1516,  0.8047, -0.2007,  0.8192]]],
        grad_fn=<StackBackward>))

100% совпадение

In [9]:
class SurnamesRNNClassifier(nn.Module):

    def __init__(
            self,
            num_embeddings: int,
            embedding_dim: int,
            rnn_hidden_size: int,
            vector_size: int,
            num_classes: int,
    ):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim, padding_idx=0)
        self.hx = None
        self.rnn = RNN(input_size=embedding_dim, hidden_size=rnn_hidden_size)
        self.classifier = nn.Sequential(
            nn.Linear(rnn_hidden_size * vector_size, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.embedding(x)
        x, hx = self.rnn(x, self.hx)
        x = torch.flatten(x, 1)
        return self.classifier(x)

In [10]:
class SurnamesVocab:
    pad = "<PAD>"

    def __init__(self, surnames: t.List[str]):
        uniques = set()
        max_len = 0
        for w in map(str.lower, surnames):
            uniques.update(w)
            max_len = max(len(w), max_len)

        self.alphabet = [self.pad, *uniques]
        self.max_len = max_len
        self.ch2i = {ch: i for i, ch in enumerate(self.alphabet)}

    def __len__(self):
        return len(self.alphabet)

    def encode(self, word: str) -> torch.Tensor:
        indices = [self.ch2i[ch] for ch in word]
        indices += [self.ch2i[self.pad]] * (self.max_len - len(indices))
        return torch.tensor(indices, dtype=torch.long)

    def decode(self, indices: torch.Tensor) -> str:
        pad_indices = torch.nonzero(indices == self.ch2i[self.pad], as_tuple=True)[0]
        if len(pad_indices):
            indices = indices[:pad_indices[0]]
        return "".join(self.alphabet[i] for i in indices)


class SurnamesDataset(Dataset):
    df: pd.DataFrame
    surnames: t.List[str]
    vocab: SurnamesVocab
    labeler: LabelEncoder
    data: torch.Tensor
    targets: torch.Tensor

    def __init__(self, path: Path):
        self.df = pd.read_csv(path)

        self.surnames = self.df["surname"].tolist()
        self.vocab = SurnamesVocab(self.surnames)
        size = self.vocab.encode(self.surnames[0].lower()).size()
        data = torch.vstack([self.vocab.encode(w.lower()) for w in self.surnames])
        self.data = data.view(len(self.surnames), *size)

        self.labeler = LabelEncoder()
        targets = self.labeler.fit_transform(self.df["nationality"])
        self.targets = torch.tensor(targets, dtype=torch.long)

    def __len__(self):
        return self.data.size(0)

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

    def encode(self, word: str) -> torch.Tensor:
        return self.vocab.encode(word)

    def decode(self, indices: torch.Tensor) -> str:
        return self.vocab.decode(indices)

In [11]:
surnames_dataset = SurnamesDataset(DATA_DIR / "surnames.csv")
len(surnames_dataset)

10980

In [12]:
torch.manual_seed(0)

train_surnames_dataset, test_surnames_dataset = train_test_split(surnames_dataset, train_part=0.8)
print(len(train_surnames_dataset), len(test_surnames_dataset))

8784 2196


### Handmade RNN

In [13]:
torch.manual_seed(0)

handmade_rnn_net = SurnamesRNNClassifier(
    num_embeddings=len(surnames_dataset.vocab),
    embedding_dim=100,
    rnn_hidden_size=64,
    vector_size=surnames_dataset.vocab.max_len,
    num_classes=len(surnames_dataset.labeler.classes_),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(handmade_rnn_net.parameters(), lr=0.0015)

train_dataloader = DataLoader(train_surnames_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_surnames_dataset, batch_size=512)

In [14]:
%%time

_ = common_train(
    epochs=15,
    model=handmade_rnn_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    verbose=50,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 2.894605  [    0/ 8784]
loss: 1.477110  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.625683, Avg loss: 1.310740 

Epoch 2
--------------------------------
loss: 1.462449  [    0/ 8784]
loss: 1.213015  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.698087, Avg loss: 1.074561 

Epoch 3
--------------------------------
loss: 1.054994  [    0/ 8784]
loss: 0.907282  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.723588, Avg loss: 0.966488 

Epoch 4
--------------------------------
loss: 0.925219  [    0/ 8784]
loss: 0.604936  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.739071, Avg loss: 0.911016 

Epoch 5
--------------------------------
loss: 0.821252  [    0/ 8784]
loss: 0.572490  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.744080, Avg loss: 0.870962 

Epoch 6
--------------------------------
loss: 0.643748  [    0/ 8784]
loss: 0.752343  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.751821, Avg loss: 0.865607 

Epoch 7
--------------------------------
loss: 0.456838  [    0/

In [15]:
y_test, y_pred = get_y_test_y_pred(handmade_rnn_net, test_dataloader, DEVICE)

print(metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=surnames_dataset.labeler.classes_,
    zero_division=True,
))

              precision    recall  f1-score   support

      Arabic       0.97      1.00      0.99       340
     Chinese       0.75      0.71      0.73        38
       Czech       0.64      0.28      0.39        96
       Dutch       0.79      0.43      0.56        51
     English       0.68      0.90      0.78       573
      French       0.27      0.10      0.15        39
      German       0.59      0.42      0.49       121
       Greek       0.86      0.56      0.68        34
       Irish       0.75      0.32      0.45        37
     Italian       0.72      0.77      0.74       128
    Japanese       0.83      0.85      0.84       156
      Korean       0.44      0.40      0.42        10
      Polish       0.61      0.54      0.57        26
  Portuguese       0.00      0.00      0.00         9
     Russian       0.86      0.87      0.86       458
    Scottish       1.00      0.00      0.00        17
     Spanish       0.57      0.54      0.56        50
  Vietnamese       0.00    

1.2 Замените модуль `RNN` из 1.1 на модули `nn.RNN`, `nn.LSTM` и `nn.GRU` (не забудьте указать аргумент `batch_first=True`). Сравните результаты работы.

In [16]:
class SurnamesAutobotRNNClassifier(nn.Module):

    def __init__(
            self,
            rnn_cls: t.Union[t.Type[nn.RNN], t.Type[nn.LSTM], t.Type[nn.GRU]],
            num_embeddings: int,
            embedding_dim: int,
            rnn_hidden_size: int,
            vector_size: int,
            num_classes: int,
    ):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim, padding_idx=0)
        self.hx = None
        self.rnn = rnn_cls(input_size=embedding_dim, hidden_size=rnn_hidden_size)
        self.classifier = nn.Sequential(
            nn.Linear(rnn_hidden_size * vector_size, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.embedding(x)
        x, hx = self.rnn(x, self.hx)
        x = torch.flatten(x, 1)
        return self.classifier(x)

### nn.RNN

In [17]:
torch.manual_seed(0)

rnn_net = SurnamesAutobotRNNClassifier(
    rnn_cls=nn.RNN,
    num_embeddings=len(surnames_dataset.vocab),
    embedding_dim=100,
    rnn_hidden_size=64,
    vector_size=surnames_dataset.vocab.max_len,
    num_classes=len(surnames_dataset.labeler.classes_),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn_net.parameters(), lr=0.0015)

train_dataloader = DataLoader(train_surnames_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_surnames_dataset, batch_size=512)

In [18]:
%%time

_ = common_train(
    epochs=15,
    model=rnn_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    verbose=50,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 2.901579  [    0/ 8784]
loss: 1.583601  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.577413, Avg loss: 1.460604 

Epoch 2
--------------------------------
loss: 1.594686  [    0/ 8784]
loss: 1.441510  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.637978, Avg loss: 1.242304 

Epoch 3
--------------------------------
loss: 1.201505  [    0/ 8784]
loss: 1.026210  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.678506, Avg loss: 1.120839 

Epoch 4
--------------------------------
loss: 1.138262  [    0/ 8784]
loss: 0.840799  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.704007, Avg loss: 1.015837 

Epoch 5
--------------------------------
loss: 1.024570  [    0/ 8784]
loss: 0.755569  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.710838, Avg loss: 1.000529 

Epoch 6
--------------------------------
loss: 0.771986  [    0/ 8784]
loss: 0.869066  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.722678, Avg loss: 0.957680 

Epoch 7
--------------------------------
loss: 0.713295  [    0/

In [19]:
y_test, y_pred = get_y_test_y_pred(rnn_net, test_dataloader, DEVICE)

print(metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=surnames_dataset.labeler.classes_,
    zero_division=True,
))

              precision    recall  f1-score   support

      Arabic       0.97      1.00      0.99       340
     Chinese       0.68      0.71      0.69        38
       Czech       0.59      0.25      0.35        96
       Dutch       0.88      0.41      0.56        51
     English       0.65      0.90      0.76       573
      French       0.23      0.08      0.12        39
      German       0.61      0.41      0.49       121
       Greek       0.69      0.59      0.63        34
       Irish       0.70      0.19      0.30        37
     Italian       0.62      0.68      0.65       128
    Japanese       0.86      0.85      0.86       156
      Korean       0.10      0.10      0.10        10
      Polish       0.59      0.50      0.54        26
  Portuguese       0.33      0.11      0.17         9
     Russian       0.86      0.84      0.85       458
    Scottish       0.00      0.00      0.00        17
     Spanish       0.51      0.38      0.44        50
  Vietnamese       1.00    

### nn.LSTM

In [20]:
torch.manual_seed(0)

lstm_net = SurnamesAutobotRNNClassifier(
    rnn_cls=nn.LSTM,
    num_embeddings=len(surnames_dataset.vocab),
    embedding_dim=100,
    rnn_hidden_size=64,
    vector_size=surnames_dataset.vocab.max_len,
    num_classes=len(surnames_dataset.labeler.classes_),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_net.parameters(), lr=0.0015)

train_dataloader = DataLoader(train_surnames_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_surnames_dataset, batch_size=512)

In [21]:
%%time

_ = common_train(
    epochs=15,
    model=lstm_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    verbose=50,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 2.908684  [    0/ 8784]
loss: 1.817289  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.537341, Avg loss: 1.629798 

Epoch 2
--------------------------------
loss: 1.460007  [    0/ 8784]
loss: 1.196895  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.601548, Avg loss: 1.364094 

Epoch 3
--------------------------------
loss: 1.310088  [    0/ 8784]
loss: 1.463891  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.650273, Avg loss: 1.209218 

Epoch 4
--------------------------------
loss: 1.372692  [    0/ 8784]
loss: 1.153307  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.682605, Avg loss: 1.096049 

Epoch 5
--------------------------------
loss: 0.933651  [    0/ 8784]
loss: 0.957198  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.705373, Avg loss: 1.024245 

Epoch 6
--------------------------------
loss: 0.919880  [    0/ 8784]
loss: 0.989278  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.708561, Avg loss: 0.981927 

Epoch 7
--------------------------------
loss: 0.620268  [    0/

In [22]:
y_test, y_pred = get_y_test_y_pred(lstm_net, test_dataloader, DEVICE)

print(metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=surnames_dataset.labeler.classes_,
    zero_division=True,
))

              precision    recall  f1-score   support

      Arabic       0.93      1.00      0.97       340
     Chinese       0.67      0.74      0.70        38
       Czech       0.51      0.20      0.29        96
       Dutch       0.69      0.35      0.47        51
     English       0.65      0.89      0.76       573
      French       0.10      0.03      0.04        39
      German       0.67      0.38      0.48       121
       Greek       0.71      0.50      0.59        34
       Irish       0.92      0.30      0.45        37
     Italian       0.67      0.73      0.70       128
    Japanese       0.87      0.82      0.84       156
      Korean       0.22      0.20      0.21        10
      Polish       0.62      0.38      0.48        26
  Portuguese       1.00      0.00      0.00         9
     Russian       0.82      0.86      0.84       458
    Scottish       0.00      0.00      0.00        17
     Spanish       0.49      0.38      0.43        50
  Vietnamese       0.50    

### nn.GRU

In [23]:
torch.manual_seed(0)

gru_net = SurnamesAutobotRNNClassifier(
    rnn_cls=nn.GRU,
    num_embeddings=len(surnames_dataset.vocab),
    embedding_dim=100,
    rnn_hidden_size=64,
    vector_size=surnames_dataset.vocab.max_len,
    num_classes=len(surnames_dataset.labeler.classes_),
).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(gru_net.parameters(), lr=0.0015)

train_dataloader = DataLoader(train_surnames_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_surnames_dataset, batch_size=512)

In [24]:
%%time

_ = common_train(
    epochs=15,
    model=gru_net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    verbose=50,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 2.899589  [    0/ 8784]
loss: 1.802641  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.544171, Avg loss: 1.556026 

Epoch 2
--------------------------------
loss: 1.625537  [    0/ 8784]
loss: 1.396525  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.618397, Avg loss: 1.300508 

Epoch 3
--------------------------------
loss: 1.280757  [    0/ 8784]
loss: 1.290818  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.666211, Avg loss: 1.151116 

Epoch 4
--------------------------------
loss: 1.177101  [    0/ 8784]
loss: 1.215153  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.680328, Avg loss: 1.065233 

Epoch 5
--------------------------------
loss: 0.980553  [    0/ 8784]
loss: 0.903879  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.700820, Avg loss: 1.014038 

Epoch 6
--------------------------------
loss: 0.875414  [    0/ 8784]
loss: 0.799713  [ 6400/ 8784]
Test Error: 
 Accuracy: 0.717668, Avg loss: 0.994450 

Epoch 7
--------------------------------
loss: 0.949876  [    0/

In [25]:
y_test, y_pred = get_y_test_y_pred(gru_net, test_dataloader, DEVICE)

print(metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=surnames_dataset.labeler.classes_,
    zero_division=True,
))

              precision    recall  f1-score   support

      Arabic       0.95      1.00      0.97       340
     Chinese       0.72      0.68      0.70        38
       Czech       0.44      0.28      0.34        96
       Dutch       0.84      0.41      0.55        51
     English       0.67      0.86      0.75       573
      French       0.17      0.05      0.08        39
      German       0.52      0.46      0.49       121
       Greek       0.72      0.53      0.61        34
       Irish       0.86      0.32      0.47        37
     Italian       0.65      0.70      0.68       128
    Japanese       0.86      0.86      0.86       156
      Korean       0.27      0.30      0.29        10
      Polish       0.73      0.31      0.43        26
  Portuguese       0.00      0.00      0.00         9
     Russian       0.85      0.86      0.85       458
    Scottish       0.00      0.00      0.00        17
     Spanish       0.54      0.42      0.47        50
  Vietnamese       0.50    

1.3 Загрузите предобученные эмбеддинги (https://disk.yandex.ru/d/BHuT2tEXr_yBOQ?w=1) в модуль `nn.Embedding` и обучите модели из 1.2.

## 2. Классификация новостей на основе заголовка

Датасет: https://disk.yandex.ru/d/FN-EgWGIpyjLxQ?w=1
<br>Эмбеддинги: https://nlp.stanford.edu/projects/glove/ (находите ссылку на архив
glove.6B.zip, в нем несколько файлов с эмбеддингами слов, выбираете один из файлов в
архиве)
<br><br>2.1 Загрузите набор данных train.csv. Выполните предобработку столбца Title
<br><br>2.2 На основе этих данных создайте датасет NewsDataset . Не забудьте добавить
специальные токены <PAD> для дополнения последовательностей до нужной длины и
<UNK> для корректной обработке ранее не встречавшихся токенов. В данной задаче
рассматривайте отдельные слова как токены. Разбейте датасет на обучающее и
валидационное множество.
<br><br>2.3 Создайте модель для классификации, используя слой nn.Embedding и слой nn.RNN .
эмбеддинги инициализируйте случайным образом
не забудьте указать аргумент padding_idx для nn.Embedding
<br><br>2.4 Переобучите модель, заменив слой nn.RNN на nn.LSTM и nn.GRU . Сравните качество
на тестовой выборке. Результаты сведите в таблицу (модель/метрика качества на
тестовом множестве).
<br><br>2.5 Выполните пункты 2.3 и 2.4, используя предобученные эмбеддинги Glove.
Прокомментируйте результат.
Эмбеддинги из скачанного файла загрузите в виде двумерного тензора
pretrained_embeddings .
Обратите внимание, что номер строки в этом тензоре должен соответствовать
токену (слову), имеющему такой индекс в вашем словаре.
для слов, которых нет в файле с эмбеддингами, инициализуйте эмбеддинг
случайным образом