In [None]:
!pip install torchdata torchtext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchdata
  Downloading torchdata-0.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.2 MB/s 
Collecting urllib3>=1.25
  Downloading urllib3-1.26.12-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 51.9 MB/s 
Collecting portalocker>=2.0.0
  Downloading portalocker-2.6.0-py2.py3-none-any.whl (15 kB)
Collecting urllib3>=1.25
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 50.8 MB/s 
Installing collected packages: urllib3, portalocker, torchdata
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
Successfully installed portalocker-2.6.0 torchdata-0.4.1 urllib3-1.25.11


In [None]:
import torch
from torch import nn
from torchtext.datasets import AG_NEWS

Для начала сделаем на торче что-нибудь нлп-шное.


Начнём со стандартного туториала по классификации текстов с помощью torchtext.

## Описание датасета

*AG is a collection of more than 1 million news articles. News articles have been gathered from more than 2000 news sources by ComeToMyHead in more than 1 year of activity. ComeToMyHead is an academic news search engine which has been running since July, 2004. The dataset is provided by the academic community for research purposes in data mining (clustering, classification, etc), information retrieval (ranking, search, etc), xml, data compression, data streaming, and any other non-commercial activity. For more information, please refer to the link http://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html .*

*The AG's news topic classification dataset is constructed by Xiang Zhang (xiang.zhang@nyu.edu) from the dataset above. It is used as a text classification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks for Text Classification. Advances in Neural Information Processing Systems 28 (NIPS 2015).*

*The AG's news topic classification dataset is constructed by choosing 4 largest classes from the original corpus. Each class contains 30,000 training samples and 1,900 testing samples. The total number of training samples is 120,000 and testing 7,600.*

[AG News subset в каталоге TensorFlow Datasets](https://www.tensorflow.org/datasets/catalog/ag_news_subset)

in `torchtext`:

    train: 120000
    test: 7600

https://pytorch.org/text/stable/datasets.html#ag-news


    1 : World
    2 : Sports
    3 : Business
    4 : Sci/Tec


In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# torchtext's built-in 'basic_english' tokenizer: a callable that splits a string into tokens.
tokenizer = get_tokenizer('basic_english')
# Iterator over the (label, text) pairs of the AG_NEWS training split.
train_iter = AG_NEWS(split='train')

def yield_tokens(data_iter):
    """Lazily yield the token list of every text in ``data_iter``.

    ``data_iter`` must produce ``(label, text)`` pairs. The label is ignored
    and the text is split with the module-level ``tokenizer``.
    """
    yield from (tokenizer(text) for _label, text in data_iter)

# Build the vocabulary from the tokens of the training split; "<unk>" is registered as a special token.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
# Tokens that are missing from the vocabulary are mapped to the index of "<unk>".
vocab.set_default_index(vocab["<unk>"])

А что за данные-то?

In [None]:
# Re-create the iterator: the previous one was already consumed while building the vocabulary.
train_iter = AG_NEWS(split='train')

# Peek at the first (label, text) example only.
item = next(iter(train_iter), None)
if item is not None:
  print(item)

(3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")


Что делает словарь?

In [None]:
# Look up the indices of several tokens, including an out-of-vocabulary string and the literal "<unk>".
vocab(["so", "call", "me", "maybe", "sdfsgsdgsdf", "<unk>"])

[303, 683, 2082, 3063, 0, 0]

In [None]:
def text_pipeline(x):
    """Convert a raw text string into a list of vocabulary indices.

    The text is split with the module-level ``tokenizer`` and the resulting
    tokens are looked up in the module-level ``vocab``.
    """
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a 1-based AG_NEWS label into a 0-based class index."""
    return int(x) - 1

Теперь будем иметь дело с чем-то уже нам знакомым

In [None]:
from torch.utils.data import DataLoader
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

Даталоадер будет зачитывать тексты, а мы будем преобразовывать их в вид, пригодный для обработки нейронными сетями. Для этого нам придётся задать специальную функцию `collate_fn`.

    Collate = collect and combine (texts, information, or sets of figures) in proper order


In [None]:
def collate_batch(batch):
  """Turn a list of ``(label, text)`` samples into tensors for the model.

  Uses the module-level ``label_pipeline``, ``text_pipeline`` and ``device``.

  Returns a tuple ``(labels, tokens, offsets)``, all moved to ``device``:
    labels  - int64 tensor with one class index per sample;
    tokens  - int64 tensor with the token indices of all texts concatenated;
    offsets - start position of every text inside ``tokens``.
  """
  labels, token_tensors, lengths = [], [], []

  for raw_label, raw_text in batch:
    # Map the label to a class index.
    labels.append(label_pipeline(raw_label))
    # Map the tokens to vocabulary indices.
    token_ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
    token_tensors.append(token_ids)
    # Remember the text length; the offsets are derived from it below.
    lengths.append(token_ids.size(0))

  label_tensor = torch.tensor(labels, dtype=torch.int64)
  # The first text starts at 0; each next one starts where the previous ended,
  # so the last length is not needed.
  offsets = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)

  # Concatenate the per-text tensors along dimension 0.
  flat_tokens = torch.cat(token_tensors)
  return label_tensor.to(device), flat_tokens.to(device), offsets.to(device)

# Fresh training iterator for the demo loader below.
train_iter = AG_NEWS(split='train')
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

# Materialise every batch just to count how many there are.
len(list(dataloader))

15000

## Модель

Для каждого токена будем брать эмбеддинг и усреднять эмбеддинги вдоль всего текста. На основе получившегося вектора будем делать предсказание.

Для таких подходов есть специальный класс, который называется [EmbeddingBag](https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html#torch.nn.EmbeddingBag).

    Computes sums or means of ‘bags’ of embeddings, without instantiating the intermediate embeddings.

In [None]:
class TextClassificationModel(nn.Module):
    """Bag-of-embeddings text classifier.

    The token embeddings of each text are averaged by an ``nn.EmbeddingBag``
    and the averaged vector is passed through a single linear layer that
    produces one score per class.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        """Create the layers and initialise their weights.

        Args:
            vocab_size: number of rows in the embedding table.
            embed_dim: size of every embedding vector.
            num_class: number of output classes.
        """
        super().__init__()
        # The averaging layer: one mean embedding per bag of token indices.
        self.embedding = nn.EmbeddingBag(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            mode="mean",
            sparse=True,
        )
        self.fc = nn.Linear(in_features=embed_dim, out_features=num_class)
        self.init_weights()

    def init_weights(self):
        """Fill the weights uniformly from [-0.5, 0.5] and zero the bias."""
        bound = 0.5
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.uniform_(self.fc.weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """Return class scores for every bag described by ``text`` and ``offsets``."""
        bag_means = self.embedding(text, offsets)
        logits = self.fc(bag_means)
        return logits

## Собственно, обучение

In [None]:
# Re-create the iterator so that it starts from the first example again.
train_iter = AG_NEWS(split='train')

# Number of classes = number of distinct labels in the training split.
num_class = len({label for label, _text in train_iter})

# Vocabulary size.
vocab_size = len(vocab)

# Embedding dimensionality.
emsize = 64

# Build the model and move it to the selected device.
model = TextClassificationModel(
    vocab_size=vocab_size,
    embed_dim=emsize,
    num_class=num_class,
).to(device)

In [None]:
import time

def train(dataloader, criterion):
  """Train the global ``model`` for one pass over ``dataloader``.

  Relies on module-level names: ``model``, ``optimizer`` and, for the
  progress log only, ``epoch`` (set by the surrounding epoch loop).

  Args:
    dataloader: iterable of ``(label, text, offsets)`` batches; ``len()`` is
      called on it when a progress line is printed.
    criterion: callable ``criterion(predictions, labels)`` returning the loss
      tensor that is back-propagated.

  Returns:
    None. The running accuracy is printed every ``log_interval`` batches.
  """
  # Switch the model to training mode.
  model.train()

  # Accuracy counters for the current logging window.
  total_acc, total_count = 0, 0

  # How often (in batches) to print a progress line.
  log_interval = 500

  for idx, (label, text, offsets) in enumerate(dataloader):

    optimizer.zero_grad()
    predicted_label = model(text, offsets)
    loss = criterion(predicted_label, label)
    loss.backward()

    # Optional gradient clipping (disabled):
    # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
    optimizer.step()

    total_acc += (predicted_label.argmax(1) == label).sum().item()
    total_count += label.size(0)

    if idx % log_interval == 0 and idx > 0:
      print('| epoch {:3d} | {:5d}/{:5d} batches '
            '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                        total_acc/total_count))
      # Start a new logging window.
      total_acc, total_count = 0, 0

def evaluate(dataloader, criterion):
  """Return the accuracy of the global ``model`` on ``dataloader``.

  Args:
    dataloader: iterable of ``(label, text, offsets)`` batches.
    criterion: not used for the accuracy; kept so that existing
      ``evaluate(loader, criterion)`` calls keep working.

  Returns:
    Fraction of correctly classified samples.

  Raises:
    ZeroDivisionError: if ``dataloader`` yields no samples.
  """
  # Switch the model to evaluation mode.
  model.eval()
  total_acc, total_count = 0, 0

  # No gradients are needed for evaluation.
  with torch.no_grad():
    for label, text, offsets in dataloader:
      predicted_label = model(text, offsets)
      total_acc += (predicted_label.argmax(1) == label).sum().item()
      total_count += label.size(0)

  return total_acc/total_count

## Пора.

In [None]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset


EPOCHS = 10  # number of training epochs
LR = 0.5  # initial learning rate
BATCH_SIZE = 64  # batch size for training

# Log-softmax is applied inside the loss, so the model outputs raw scores.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)

# Multiplies the LR of every parameter group by gamma on each scheduler step.
# step_size is a count of scheduler steps and is documented as an int.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
# Best validation accuracy seen so far (None until the first epoch finishes).
total_accu = None

# Load both the train and the test split (these are iterators).
train_iter, test_iter = AG_NEWS()

# Convert the iterators to map-style datasets so they have a length and can be split.
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# 95% of the training data is used for training, the rest goes to validation.
num_train = int(len(train_dataset) * 0.95)

# Seed the split so that the validation subset is the same on every run.
SPLIT_SEED = 42
split_train_, split_valid_ = random_split(
    train_dataset,
    [num_train, len(train_dataset) - num_train],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)

# Accuracy does not depend on the batch order, so evaluation loaders are not shuffled.
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)

test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=False, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):

    epoch_start_time = time.time()
    train(train_dataloader, criterion)
    accu_val = evaluate(valid_dataloader, criterion)

    # Remember the best validation accuracy; decay the learning rate
    # whenever the current epoch is worse than the best one so far.
    if total_accu is None or not (total_accu > accu_val):
        total_accu = accu_val
    else:
        scheduler.step()

    separator = '-' * 59
    print(separator)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print(separator)

| epoch   1 |   500/ 1782 batches | accuracy    0.405
| epoch   1 |  1000/ 1782 batches | accuracy    0.597
| epoch   1 |  1500/ 1782 batches | accuracy    0.714
-----------------------------------------------------------
| end of epoch   1 | time: 14.08s | valid accuracy    0.775 
-----------------------------------------------------------
| epoch   2 |   500/ 1782 batches | accuracy    0.801
| epoch   2 |  1000/ 1782 batches | accuracy    0.823
| epoch   2 |  1500/ 1782 batches | accuracy    0.841
-----------------------------------------------------------
| end of epoch   2 | time: 13.48s | valid accuracy    0.848 
-----------------------------------------------------------
| epoch   3 |   500/ 1782 batches | accuracy    0.860
| epoch   3 |  1000/ 1782 batches | accuracy    0.865
| epoch   3 |  1500/ 1782 batches | accuracy    0.870
-----------------------------------------------------------
| end of epoch   3 | time: 13.36s | valid accuracy    0.868 
-------------------------------

In [None]:
# Final check: accuracy of the trained model on the test split.
print('Checking the results of test dataset.')
accu_test = evaluate(test_dataloader, criterion)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.
test accuracy    0.899


### А как это использовать? 

In [None]:
# Human-readable names of the 1-based AG_NEWS labels.
ag_news_label = {
    1: "World",
    2: "Sports",
    3: "Business",
    4: "Sci/Tec",
}

def predict(text, text_pipeline):
    """Classify a single raw text with the global ``model``.

    Args:
        text: raw input string.
        text_pipeline: callable that maps a string to a list of vocabulary
            indices.

    Returns:
        The class name looked up in the module-level ``ag_news_label``.
    """
    with torch.no_grad():
        # Build the inputs on whatever device the model currently lives on.
        model_device = next(model.parameters()).device
        # Explicit int64: without it an empty token list would become a
        # float32 tensor, which is not a valid index tensor.
        token_ids = torch.tensor(text_pipeline(text), dtype=torch.int64,
                                 device=model_device)
        # A single bag that starts at position 0.
        offsets = torch.tensor([0], dtype=torch.int64, device=model_device)
        output = model(token_ids, offsets)
        # Model classes are 0-based while the keys of ag_news_label are 1-based.
        return ag_news_label[output.argmax(1).item() + 1]
        
# Move the model to the CPU for the inference examples that follow.
model = model.to("cpu")

In [None]:
# Try the trained classifier on an arbitrary sentence.
predict('The release of the Squid game series.', text_pipeline=text_pipeline)

'Sports'

# Задания

1. Попробуйте другие варианты EmbeddingBag.
2. Рассмотрите другие варианты schedulers. Как они влияют на скорость обучения?
3. Скачайте [GoogleNews-vectors-negative300.bin.gz](https://code.google.com/archive/p/word2vec/). Используя слой [Embedding](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) и [советы](https://stackoverflow.com/a/63074440/1616037), проинициализируйте им матрицу векторных представлений. Попробуйте решить эту задачу. Лучше ли стали результаты?
