<a href="https://colab.research.google.com/github/YaroslavFYPM/RNN-vs-CNN/blob/main/RNNvsCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Сравнение архитектур RNN и CNN в задачах классификации текста.


In [None]:
# Install the `datasets` package, which is imported below to load the corpus.
!pip install datasets

In [None]:
import random
import numpy as np

import nltk
import gensim.downloader as api

import torch
import torch.nn as nn
import datasets

In [None]:
# Seed every random number generator used in this notebook with one shared
# value so that repeated runs draw the same random numbers.
SEED = 0xDEAD
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.random.manual_seed,
    torch.cuda.random.manual_seed_all,
):
    _seed_rng(SEED)

Загрузим датасет новостей `AgNews`. В нём тексты разделены на 4 темы: `World`, `Sports`, `Business`, `Sci/Tech`. Посмотрим на структуру датасета и на примеры текстов:

In [None]:
# Load the AG News corpus by name; the result is indexed by split name below.
dataset = datasets.load_dataset("ag_news")
# Display the training split.
dataset["train"]

In [None]:
# Show the first training record and the Python type it is returned as.
print(dataset["train"][0])
print(type(dataset["train"][0]))


В `dataset` находятся `train` и `test` части датасета.

In [None]:
# Display the top-level dataset object.
dataset

In [None]:
# Display the `info` attribute of the training split.
dataset["train"].info

Чтобы превращать текст из набора слов в набор векторов, мы будем использовать предобученные эмбеддинги. Посмотрим на их список и выберем один из них.

In [None]:
# Print the name of every model the gensim downloader offers, one per line.
print(*api.info()["models"], sep="\n")

In [None]:
# Load the "glove-twitter-50" vectors through the gensim downloader.
# NOTE: despite the variable name, the model requested here is GloVe.
word2vec = api.load("glove-twitter-50")

Токенизируем наш текст с помощью NLTK.

In [None]:
# Longest token sequence kept per text; anything beyond it is cut off.
MAX_LENGTH = 128

tokenizer = nltk.WordPunctTokenizer()


def _tokenize_item(item):
    """Return the first MAX_LENGTH tokens of ``item["text"]`` under "tokenized"."""
    tokens = tokenizer.tokenize(item["text"])
    return {"tokenized": tokens[:MAX_LENGTH]}


# Add a "tokenized" column to every record of every split.
dataset = dataset.map(_tokenize_item)

Создадим маппинг из токенов в индексы.

In [None]:
# Vocabulary words ordered by embedding row. gensim >= 4 exposes this list as
# `index_to_key`; older releases call it `index2word`, which newer releases no
# longer provide. Try the new name first and fall back to the old one.
_vocab_words = getattr(word2vec, "index_to_key", None)
if _vocab_words is None:
    _vocab_words = word2vec.index2word

# token -> row index in the embedding matrix
word2idx = {word: idx for idx, word in enumerate(_vocab_words)}

Переведем токены в индексы

In [None]:
def encode(word):
    """Map a token to its index in ``word2idx``.

    Lookup order: the token exactly as written, then its lowercase form, then
    the "unk" entry. The lowercase retry lets capitalized tokens (the
    tokenizer keeps the original casing) match a lowercase vocabulary entry
    instead of collapsing to "unk".

    Args:
        word: Token string.

    Returns:
        Integer index of the matched vocabulary entry.

    Raises:
        KeyError: If no candidate matches and the vocabulary has no "unk".
    """
    for candidate in (word, word.lower()):
        idx = word2idx.get(candidate)
        if idx is not None:
            return idx
    return word2idx["unk"]

In [None]:
# Add a "features" column: the vocabulary index of every token, in order.
dataset = dataset.map(
    lambda item: {"features": list(map(encode, item["tokenized"]))}
)

In [None]:
# Inspect the first training record now that "tokenized" and "features" exist.
dataset["train"][0]

In [None]:
# Drop the raw and tokenized text columns; batching below reads only
# "features" and "label".
dataset = dataset.remove_columns(["text", "tokenized"])

Переведем в тензоры

In [None]:
# Switch the dataset's output format to torch.
dataset.set_format(type='torch')

In [None]:
# Inspect the first training record again after the format change.
dataset["train"][0]

Хотим склеить объекты разной длины в батчи. Для этого давайте напишем `collate_fn`.

In [None]:
def collate_fn(batch):
    """Merge variable-length examples into one right-padded batch.

    Args:
        batch: Sequence of rows. Each row has "features", a 1-D sequence of
            token indices (tensor or list), and "label", a scalar class id.

    Returns:
        Dict with:
            "features": long tensor of shape ``(len(batch), max_len)``, where
                shorter rows are padded on the right with 0.
            "labels": long tensor of shape ``(len(batch),)``.
    """
    # default=0 keeps an empty batch from raising inside max().
    max_len = max((len(row["features"]) for row in batch), default=0)
    # Allocate the padded matrix as zeros up front, so padding positions never
    # need a separate (float) zeros tensor concatenated onto long indices.
    input_embeds = torch.zeros((len(batch), max_len), dtype=torch.long)
    labels = torch.empty(len(batch), dtype=torch.long)
    for idx, row in enumerate(batch):
        # as_tensor accepts both tensors and plain lists, and forces the
        # integer dtype even for an empty sequence.
        features = torch.as_tensor(row["features"], dtype=torch.long)
        input_embeds[idx, : features.numel()] = features
        labels[idx] = row["label"]
    return {"features": input_embeds, "labels": labels}

In [None]:
# Show the (split name, split) pairs and the dataset summary before the
# loaders are built from them.
print(dataset.items())
print(dataset)

In [None]:
from torch.utils.data import DataLoader

# One DataLoader per split, batched with collate_fn; only the "train" split
# is shuffled.
loaders = {}
for split_name, split_data in dataset.items():
    loaders[split_name] = DataLoader(
        split_data,
        batch_size=32,
        shuffle=split_name == "train",
        collate_fn=collate_fn,
    )

In [None]:
# Display the split-name -> DataLoader mapping.
loaders

## CNN



In [None]:
class CNNModel(nn.Module):
    """Text classifier: embedding, three strided Conv1d stages, max pool, linear.

    Args:
        embed_size: Width of the token embeddings (input channels of the
            first convolution).
        hidden_size: Number of channels produced by every convolution.
        num_classes: Number of output logits.
    """

    def __init__(self, embed_size, hidden_size, num_classes=4):
        super().__init__()
        self.embeddings = nn.Embedding(len(word2idx), embedding_dim=embed_size)

        # Three Conv1d -> BatchNorm1d -> ReLU stages; only the first stage
        # has a different input width (embed_size instead of hidden_size).
        stages = []
        for in_channels in (embed_size, hidden_size, hidden_size):
            stages += [
                nn.Conv1d(in_channels, hidden_size, kernel_size=3, padding=1, stride=2),
                nn.BatchNorm1d(hidden_size),
                nn.ReLU(),
            ]
        # Pool over the remaining length and flatten to (batch, hidden_size).
        self.cnn = nn.Sequential(*stages, nn.AdaptiveMaxPool1d(1), nn.Flatten())
        self.cl = nn.Sequential(nn.Linear(hidden_size, num_classes))

    def forward(self, x):
        """Return class logits for a batch of token-index sequences."""
        embedded = self.embeddings(x)  # (batch_size, seq_len, embed_dim)
        # Conv1d expects channels before length: (batch, embed_dim, seq_len).
        channels_first = embedded.permute(0, 2, 1)
        return self.cl(self.cnn(channels_first))

In [None]:
# Check whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

In [None]:

# Use the GPU when one is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# CNN classifier: embedding width taken from the loaded vectors, 50 channels.
model = CNNModel(word2vec.vector_size, 50).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 10

Подготовим функцию для обучения модели:

In [None]:

from tqdm.notebook import tqdm, trange


def _model_device(model):
    """Return the device of the model's first parameter (CPU if it has none)."""
    first_param = next(iter(model.parameters()), None)
    return torch.device("cpu") if first_param is None else first_param.device


def _evaluate(model, criterion, loader, device):
    """Return ``(mean per-batch loss, accuracy)`` of ``model`` over ``loader``."""
    total_loss = 0.0
    correct = 0
    num_objs = 0
    num_batches = 0
    model.eval()
    with torch.no_grad():
        for batch in loader:
            input_embeds = batch["features"].to(device)
            labels = batch["labels"].to(device)
            prediction = model(input_embeds)
            # .item() keeps the running sums as plain Python numbers.
            total_loss += criterion(prediction, labels).item()
            correct += (labels == prediction.argmax(-1)).sum().item()
            num_objs += len(labels)
            num_batches += 1
    if num_batches == 0 or num_objs == 0:
        # Nothing to average over; avoid a ZeroDivisionError.
        return float("nan"), float("nan")
    return total_loss / num_batches, correct / num_objs


def training(model, criterion, optimizer, num_epochs, loaders, max_grad_norm=2):
    """Train ``model`` and print test/train loss and accuracy after each epoch.

    Args:
        model: Module called as ``model(batch["features"])``.
        criterion: Loss called as ``criterion(prediction, labels)``.
        optimizer: Optimizer over the model's parameters.
        num_epochs: Number of passes over ``loaders["train"]``.
        loaders: Mapping with "train" and "test" iterables of batches; each
            batch is a dict with "features" and "labels" tensors.
        max_grad_norm: Gradient-norm clipping threshold, or None to disable
            clipping.

    Returns:
        None. Metrics are printed, one "Test" and one "Train" line per epoch.
    """
    # Move batches to wherever the model lives instead of relying on a global.
    device = _model_device(model)
    for _ in trange(num_epochs, leave=False):
        model.train()
        for batch in tqdm(loaders["train"], leave=False):
            optimizer.zero_grad()
            input_embeds = batch["features"].to(device)
            labels = batch["labels"].to(device)
            prediction = model(input_embeds)
            loss = criterion(prediction, labels)
            loss.backward()
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

        # Each split is evaluated with its own accumulators, so the train
        # metrics are not mixed with the test loss and batch count.
        test_loss, test_acc = _evaluate(model, criterion, loaders["test"], device)
        print(f"Test valid Loss: {test_loss}, test accuracy: {test_acc}")

        train_loss, train_acc = _evaluate(model, criterion, loaders["train"], device)
        print(f"Train valid Loss: {train_loss},train accuracy: {train_acc}")

In [None]:
# Display which device was selected.
device

In [None]:
# Train the CNN and time the call; max_grad_norm is not passed, so training()
# uses its default.
%time training(model, criterion, optimizer, num_epochs, loaders)

## RNN




In [None]:
class RNN_fixed_len(torch.nn.Module):
    """Two-layer Elman RNN classifier over token-index sequences.

    Args:
        embedding_dim: Width of the token embeddings.
        hidden_dim: Hidden size of every RNN layer.
        num_classes: Number of output logits. Defaults to 4 (the number of
            AG News topics) instead of the previously hard-coded 6, which
            left two logits that no label ever selects.
    """

    def __init__(self, embedding_dim, hidden_dim, num_classes=4):
        super().__init__()
        self.embeddings = nn.Embedding(len(word2idx), embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True, num_layers=2)
        self.linear = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        NOTE(review): the final hidden state is read after the last time
        step, so for right-padded batches it also covers the padding
        positions. Confirm this is intended.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        # Only the final hidden states are needed; ht[-1] is the top layer's.
        _, ht = self.rnn(x)
        return self.linear(ht[-1])

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Rebind model, criterion and optimizer for the RNN experiment.
model = RNN_fixed_len(word2vec.vector_size, 50).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

num_epochs = 10
# Gradient-norm clipping threshold handed to training() below.
max_grad_norm = 1.0

In [None]:
# Train the RNN with the configured clipping threshold and time the call.
%time training(model, criterion, optimizer, num_epochs, loaders, max_grad_norm)

## LSTM

In [None]:
class LSTM_fixed_len(torch.nn.Module):
    """Four-layer LSTM classifier over token-index sequences.

    Args:
        embedding_dim: Width of the token embeddings.
        hidden_dim: Hidden size of every LSTM layer.
        num_classes: Number of output logits. Defaults to 4 (the number of
            AG News topics) instead of the previously hard-coded 6, which
            left two logits that no label ever selects.
    """

    def __init__(self, embedding_dim, hidden_dim, num_classes=4):
        super().__init__()
        self.embeddings = nn.Embedding(len(word2idx), embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=4)
        self.linear = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        NOTE(review): the final hidden state is read after the last time
        step, so for right-padded batches it also covers the padding
        positions. Confirm this is intended.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        # Only the final hidden state is used; ht[-1] is the top layer's.
        _, (ht, _) = self.lstm(x)
        return self.linear(ht[-1])

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Rebind model, criterion and optimizer for the LSTM experiment.
model = LSTM_fixed_len(word2vec.vector_size, 50).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

num_epochs = 10
# Gradient-norm clipping threshold configured for this experiment.
max_grad_norm = 1.0

In [None]:
%time training(model, criterion, optimizer, num_epochs, loaders)

## QRNN

In [None]:
# Install pynvrtc and the QRNN implementation directly from its GitHub repo.
!pip install pynvrtc git+https://github.com/salesforce/pytorch-qrnn

In [None]:
import cupy

In [None]:
import pynvrtc

In [None]:
import torchqrnn

In [None]:
from torchqrnn import QRNN

In [None]:
class Model(nn.Module):
    """QRNN-based classifier over token-index sequences.

    Args:
        embed_size: Width of the token embeddings.
        hidden_size: Hidden size of the QRNN layers.
        num_classes: Number of output logits.
        parallel: When true, wrap the QRNN in ``nn.DataParallel``.
    """

    def __init__(self, embed_size, hidden_size, num_classes=6, parallel=True):
        super().__init__()

        self.embed = nn.Embedding(len(word2idx), embed_size)
        self.rnn = QRNN(embed_size, hidden_size, num_layers=4)
        # Note: we tell DataParallel to split on the second dimension as RNNs
        # are batch second by default in PyTorch.
        if parallel:
            self.rnn = nn.DataParallel(self.rnn, dim=1)
        self.cls = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits computed from the last layer's final hidden state."""
        x = self.embed(x)
        # (batch, seq_len, embed) -> (seq_len, batch, embed) for the QRNN.
        x = x.permute(1, 0, 2)
        # The recurrent module, not the linear head, yields (output, hidden).
        # presumably hidden is (num_layers, batch, hidden_size) — TODO confirm
        # against the installed torchqrnn version.
        _, ht = self.rnn(x)
        return self.cls(ht[-1])


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# AG News has four target classes, so request four logits explicitly instead
# of relying on Model's default head size.
model = Model(word2vec.vector_size, 50, num_classes=4).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

num_epochs = 4
# Gradient-norm clipping threshold handed to training() below.
max_grad_norm = 2.0

In [None]:
# Train the QRNN model with the configured clipping threshold and time the call.
%time training(model, criterion, optimizer, num_epochs, loaders, max_grad_norm)

К сожалению, проверить архитектуру QRNN не удалось — предположительно из-за устаревшей библиотеки, предлагаемой авторами статьи: James Bradbury, Stephen Merity, Caiming Xiong, Richard Socher. «Quasi-Recurrent Neural Networks», arXiv:1611.01576v2 [cs.NE], 21 Nov 2016.