In [2]:
import re
import nltk

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/noble6/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


## 1. Классификация фамилий (RNN)

Датасет: https://disk.yandex.ru/d/frNchuaBQVLxyA?w=1

1.1 Используя класс `nn.RNNCell` (абстракцию для отдельного временного шага RNN), реализуйте простейшую рекуррентную сеть Элмана в виде класса `RNN`. Используя созданный класс `RNN`, решите задачу классификации фамилий. 


In [3]:
class Vocab:
  def __init__(self, data):
    tokens = set()
    max_seq_len = 0
    for item in data:
        max_seq_len = max(max_seq_len, len(item))
        tokens.update(item)

    self.idx_to_token = {0: '<PAD>'}
    self.token_to_idx = {'<PAD>': 0}
    for idx, token in enumerate(tokens, start=1):
        self.idx_to_token[idx] = token
        self.token_to_idx[token] = idx
    self.vocab_len = len(self.idx_to_token)
    self.max_seq_len = max_seq_len

class SurnamesDataset(Dataset):
  def __init__(self, X, y, vocab: Vocab):
    self.X = X
    self.y = y
    self.vocab = vocab

  def vectorize(self, surname):
    surname_t = torch.zeros(self.vocab.max_seq_len, dtype=torch.int64)
    for i, token in enumerate(surname):
        if i >= self.vocab.max_seq_len:
            break
        surname_t[i] = self.vocab.token_to_idx.get(token, 0)
    return surname_t

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    surname = self.X.iloc[idx]
    label = self.y.iloc[idx]
    surname_t = self.vectorize(surname)
    return surname_t, label


surnames = pd.read_csv("data/surnames.csv")
surnames['nationality'], _ = pd.factorize(surnames['nationality'])

X = surnames['surname'].str.lower()
y = surnames['nationality']
n_classes = y.nunique()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

vocab = Vocab(X)

train_dataset = SurnamesDataset(X_train, y_train, vocab)
test_dataset = SurnamesDataset(X_test, y_test, vocab)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train_model(model, train_dataloader, test_dataloader, criterion, optimizer, num_epochs):
    # model.to(device)
    train_losses, test_losses = [], []
    
    for epoch in range(num_epochs):
        model.train()
        train_loss, test_loss = 0, 0
        for inputs, labels in train_dataloader:
            x = inputs#.to(device)
            y = labels#.to(device)
            
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            train_loss += loss.item()
    
        # Валидация на val_loader
        model.eval()
        with torch.no_grad():
            for inputs, labels in test_dataloader:
                inputs = inputs#.to(device)
                labels = labels#.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                test_loss += loss.item()
        
        train_losses.append(train_loss/len(train_dataloader))
        test_losses.append(test_loss/len(test_dataloader))

        print(f'Epoch {epoch+1}, Train Loss: {train_losses[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}')

def evaluate_model(model, dataloader):
    # model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for surnames, labels in dataloader:
            x = surnames#.to(device)
            y = labels#.to(device)

            logits = model(x)
            _, predicted = torch.max(logits, 1)
            correct += (predicted == y).sum().item()
            total += y.size(0)

    accuracy = correct / total
    print(f'Test Accuracy: {accuracy:.5f}')

def predict(model, dataset, surname):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # model.to(device)

    model.eval()
    with torch.no_grad():
        vectorized = dataset.vectorize(surname)
        tensor = vectorized.unsqueeze(0).to(device)

        logits = model(tensor)
        
        probs = torch.softmax(logits, dim=1).squeeze()
        # print(f"probs: {probs} size: {probs.shape}")
        top3_probs, top3_indices = torch.topk(probs, k=3)
        print(top3_probs, top3_indices)

        top3_nationalities = _[top3_indices.detach().cpu().numpy()]
        print(f'{surname}: {top3_nationalities[0]} ({top3_probs[0]:.4f}), {top3_nationalities[1]} ({top3_probs[1]:.4f}), {top3_nationalities[2]} ({top3_probs[2]:.4f})')

In [5]:
rnn = nn.RNNCell(10, 20)
input = torch.randn(6, 3, 10)
print(input)
print(input.shape)
hx = torch.randn(3, 20)
print(hx, hx.shape)
output = []
for i in range(1):
    hx = rnn(input[i], hx)
    print(hx, hx.shape)
    output.append(hx)
    
# output

tensor([[[ 0.4934, -0.6052, -0.1468,  0.5293, -0.6073, -0.1523,  0.5068,
           0.8247, -0.9923,  0.9903],
         [-0.5761, -0.9204,  0.6897, -0.1006,  0.8048,  0.7096,  0.0751,
          -1.5318,  2.4260,  0.4106],
         [ 2.3055,  2.0440,  0.7741,  0.0557, -1.0849, -1.0047,  0.6631,
           0.0242, -0.8623,  0.7762]],

        [[-0.2582,  0.7932,  1.8183,  0.4992,  1.0997,  0.2566, -1.2865,
           1.2173,  0.5199, -1.9446],
         [-0.9500,  2.8146, -0.9235, -1.3016, -0.0598,  1.5100, -0.0168,
          -0.7834, -0.5279, -2.1207],
         [ 0.8666,  0.8730, -0.3235, -1.5643,  0.0539, -0.3354, -1.7087,
           0.7488, -0.8043,  0.7545]],

        [[-1.1502,  0.9670,  0.3426,  0.9749, -0.7836, -1.0436, -0.8756,
           0.3985,  2.1161, -0.0698],
         [-0.7284, -0.7192,  0.8987,  0.1179,  0.5274, -0.0929, -0.0300,
          -0.3365, -0.9884, -1.1052],
         [-0.7143,  1.2507,  0.2919, -0.6275, -1.9242,  0.5636,  0.8448,
           0.5390, -0.8321,  0.4529

In [53]:
class RNN(nn.Module):
  def __init__(self, input_size, embedding_dim, hidden_size, output_size):
    super(RNN, self).__init__()
    self.hidden_size = hidden_size
    self.embedding = nn.Embedding(input_size, embedding_dim)
    self.rnn_cell = nn.RNNCell(embedding_dim, hidden_size)
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    '''
    x.shape = (batch_size, seq_len) - тензор входных данных
    h.shape = (batch_size, hidden_size) - тензор со скрытым состоянием RNN
    '''
    batch_size, seq_len = x.shape

    h = torch.zeros(batch_size, self.hidden_size)

    hidden_states = []
    for t in range(seq_len):
      # получаем эмбеддинг текущего символа
      x_t = self.embedding(x[:, t])
      # обновляем скрытое состояние
      h = self.rnn_cell(x_t, h)
      hidden_states.append(h)

    # конкатенируем скрытые состояния и применяем полносвязный слой
    hidden_states = torch.stack(hidden_states, dim=1)
    output = self.fc(hidden_states[:, -1, :])

    return output

In [49]:
train_dataset.vectorize("tugolukov")

tensor([36, 15, 47, 29, 42, 15, 40, 29, 46,  0,  0,  0,  0,  0,  0,  0,  0])

In [54]:
model = RNN(
            input_size=vocab.vocab_len,
            embedding_dim=256,
            hidden_size=1024,
            output_size=len(set(y_train)))

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [39]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Количество обучаемых параметров: {total_params}") 

Количество обучаемых параметров: 1345554


In [12]:
model.forward(train_dataset.vectorize("tugolukov").unsqueeze(0))

tensor([[ 0.1120,  0.3691,  0.0479, -0.1511,  0.0336,  0.2284,  0.2708, -0.2426,
         -0.1235, -0.4661,  0.1584, -0.0738,  0.2521, -0.1785, -0.0767,  0.1084,
         -0.1571,  0.0140]], grad_fn=<AddmmBackward0>)

In [23]:
train_model(model, train_dataloader, test_dataloader, criterion, optimizer, 3)

Epoch 1, Train Loss: 1.3938, Test Loss: 1.5178
Epoch 2, Train Loss: 1.3587, Test Loss: 1.4363
Epoch 3, Train Loss: 1.3911, Test Loss: 1.4044


In [22]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.61111


In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train_model(model, train_dataloader, test_dataloader, criterion, optimizer, num_epochs):
    model.to(device)
    train_losses, test_losses = [], []
    
    for epoch in range(num_epochs):
        model.train()
        train_loss, test_loss = 0, 0
        for inputs, labels in train_dataloader:
            x = inputs.to(device)
            y = labels.to(device)
            
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            train_loss += loss.item()
    
        # Валидация на val_loader
        model.eval()
        with torch.no_grad():
            for inputs, labels in test_dataloader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                test_loss += loss.item()
        
        train_losses.append(train_loss/len(train_dataloader))
        test_losses.append(test_loss/len(test_dataloader))

        print(f'Epoch {epoch+1}, Train Loss: {train_losses[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}')

def evaluate_model(model, dataloader):
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for surnames, labels in dataloader:
            x = surnames.to(device)
            y = labels.to(device)

            logits = model(x)
            _, predicted = torch.max(logits, 1)
            correct += (predicted == y).sum().item()
            total += y.size(0)

    accuracy = correct / total
    print(f'Test Accuracy: {accuracy:.5f}')

def predict(model, dataset, surname):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    model.eval()
    with torch.no_grad():
        vectorized = dataset.vectorize(surname)
        tensor = vectorized.unsqueeze(0).to(device)

        logits = model(tensor)
        
        probs = torch.softmax(logits, dim=1).squeeze()
        # print(f"probs: {probs} size: {probs.shape}")
        top3_probs, top3_indices = torch.topk(probs, k=3)
        print(top3_probs, top3_indices)

        top3_nationalities = _[top3_indices.detach().cpu().numpy()]
        print(f'{surname}: {top3_nationalities[0]} ({top3_probs[0]:.4f}), {top3_nationalities[1]} ({top3_probs[1]:.4f}), {top3_nationalities[2]} ({top3_probs[2]:.4f})')

1.2 Замените модуль `RNN` из 1.1 на модули `nn.RNN`, `nn.LSTM` и `nn.GRU` (не забудьте указать аргумент `batch_first=True`). Сравните результаты работы.

In [13]:
class RNN(nn.Module):
  def __init__(self, input_size, embedding_dim, hidden_size, output_size):
    super(RNN, self).__init__()
    self.hidden_size = hidden_size
    self.embedding = nn.Embedding(input_size, embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
    self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
    self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    '''
    x.shape = (batch_size, seq_len) - тензор входных данных
    h.shape = (batch_size, hidden_size) - тензор со скрытым состоянием RNN
    '''
    batch_size, seq_len = x.shape

    # h = torch.zeros(1, batch_size, self.hidden_size)  # RNN, GRU
    h = (torch.zeros(1, batch_size, self.hidden_size).cuda(), torch.zeros(1, batch_size, self.hidden_size).cuda())  # LSTM

    # получаем эмбеддинг всех символов
    x = self.embedding(x)

    # обновляем скрытое состояние
    # output, h = self.rnn(x, h)
    output, (h, c) = self.lstm(x, h)
    # output, h = self.gru(x, h)

    # применяем полносвязный слой
    output = self.fc(output[:, -1, :])
    # return output, h[-1, :, :]
    return output

In [14]:
model = RNN(
            input_size=vocab.vocab_len,
            embedding_dim=64,
            hidden_size=256,
            output_size=len(set(y_train)))

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [15]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Количество обучаемых параметров: {total_params}") 

Количество обучаемых параметров: 667666


In [17]:
train_model(model, train_dataloader, test_dataloader, criterion, optimizer, 10)

Epoch 1, Train Loss: 1.0889, Test Loss: 1.0190
Epoch 2, Train Loss: 0.9281, Test Loss: 0.9280
Epoch 3, Train Loss: 0.8188, Test Loss: 0.8794
Epoch 4, Train Loss: 0.7351, Test Loss: 0.8299
Epoch 5, Train Loss: 0.6669, Test Loss: 0.8034
Epoch 6, Train Loss: 0.5938, Test Loss: 0.8089
Epoch 7, Train Loss: 0.5361, Test Loss: 0.8006
Epoch 8, Train Loss: 0.4764, Test Loss: 0.7958
Epoch 9, Train Loss: 0.4211, Test Loss: 0.8045
Epoch 10, Train Loss: 0.3900, Test Loss: 0.8503


#### Результаты для nn.RNN

In [59]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.69444


#### Результаты для nn.LSTM

In [18]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.77550


#### Результаты для nn.GRU

In [72]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.77004


1.3 Загрузите предобученные эмбеддинги (https://disk.yandex.ru/d/BHuT2tEXr_yBOQ?w=1) в модуль `nn.Embedding` и обучите модели из 1.2.

In [26]:
embedding_path = "data/embeddings/glove.6B.50d.txt"

embeddings = {}
with open(embedding_path, "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = torch.tensor([float(val) for val in values[1:]])
        embeddings[word] = vector

input_size = len(embeddings)
embedding_dim = len(next(iter(embeddings.values())))
embedding_matrix = torch.zeros(input_size, embedding_dim)
for i, word in enumerate(embeddings):
    embedding_matrix[i] = embeddings[word]
embedding_layer = nn.Embedding(input_size, embedding_dim)
embedding_layer.weight.data.copy_(embedding_matrix)

tensor([[ 0.4180,  0.2497, -0.4124,  ..., -0.1841, -0.1151, -0.7858],
        [ 0.0134,  0.2368, -0.1690,  ..., -0.5666,  0.0447,  0.3039],
        [ 0.1516,  0.3018, -0.1676,  ..., -0.3565,  0.0164,  0.1022],
        ...,
        [-0.5118,  0.0587,  1.0913,  ..., -0.2500, -1.1250,  1.5863],
        [-0.7590, -0.4743,  0.4737,  ...,  0.7895, -0.0141,  0.6448],
        [ 0.0726, -0.5139,  0.4728,  ..., -0.1891, -0.5902,  0.5556]])

In [36]:
next(iter(embeddings))

'the'

In [27]:
class RNN(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, output_size, pretrained_embeddings):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, h=None):
        '''
        x.shape = (batch_size, seq_len) - тензор входных данных
        h.shape = (batch_size, hidden_size) - тензор со скрытым состоянием RNN
        '''
        batch_size, seq_len = x.shape
        
        h = torch.zeros(1, batch_size, self.hidden_size, device=device)  # RNN, GRU
        # h = (torch.zeros(1, batch_size, self.hidden_size, device=device), torch.zeros(1, batch_size, self.hidden_size, device=device))  # LSTM

        x = self.embedding(x)

        # обновляем скрытое состояние
        # output, h = self.rnn(x, h)
        # output, (h, c) = self.lstm(x, h)
        output, h = self.gru(x, h)

        output = self.fc(output[:, -1, :])
        # return output, h[-1, :, :]
        return output

In [28]:
input_size = vocab.vocab_len
embedding_dim = len(next(iter(embeddings.values())))
hidden_size = 128
output_size = len(set(y_train))
model = RNN(input_size, embedding_dim, hidden_size, output_size, pretrained_embeddings=embedding_matrix)
# model = RNN(
#             input_size=vocab.vocab_len,
#             embedding_dim=64,
#             hidden_size=256,
#             pretrained_embeddings=embeddings,
#             output_size=len(set(y_train)))

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

In [29]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Количество обучаемых параметров: {total_params}") 

Количество обучаемых параметров: 186642


In [68]:
train_model(model, train_dataloader, test_dataloader, criterion, optimizer, 10)

Epoch 1, Train Loss: 1.9063, Test Loss: 1.5279
Epoch 2, Train Loss: 1.3118, Test Loss: 1.1732
Epoch 3, Train Loss: 1.0341, Test Loss: 0.9784
Epoch 4, Train Loss: 0.8786, Test Loss: 0.8971
Epoch 5, Train Loss: 0.7800, Test Loss: 0.8477
Epoch 6, Train Loss: 0.6878, Test Loss: 0.8331
Epoch 7, Train Loss: 0.6284, Test Loss: 0.7871
Epoch 8, Train Loss: 0.5603, Test Loss: 0.7996
Epoch 9, Train Loss: 0.5104, Test Loss: 0.8299
Epoch 10, Train Loss: 0.4557, Test Loss: 0.8565


#### Результаты для nn.RNN

In [31]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.69672


#### Результаты для nn.LSTM

In [26]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.76138


#### Результаты для nn.GRU

In [61]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.77641


## 2. Классификация обзоров на фильмы (RNN)

Датасет: https://disk.yandex.ru/d/tdinpb0nN_Dsrg

2.1 Создайте набор данных на основе файлов polarity/positive_reviews.csv (положительные отзывы) и polarity/negative_reviews.csv (отрицательные отзывы). Разбейте на обучающую и тестовую выборку.
  * токен = __слово__
  * данные для обучения в датасете представляются в виде последовательности индексов токенов
  * словарь создается на основе _только_ обучающей выборки. Для корректной обработки ситуаций, когда в тестовой выборке встретится токен, который не хранится в словаре, добавьте в словарь специальный токен `<UNK>`
  * добавьте предобработку текста

2.2. Обучите классификатор.
  
  * Для преобразования последовательности индексов в последовательность векторов используйте `nn.Embedding` 
    - подберите адекватную размерность вектора эмбеддинга: 
    - модуль `nn.Embedding` обучается

  * Используйте рекуррентные слои (`nn.RNN`, `nn.LSTM`, `nn.GRU`)


2.3 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)
* Целевое значение accuracy на валидации - 70+%

In [3]:
with open("data/polarity/positive_reviews.txt") as f:
    positive_reviews = sent_tokenize(f.read())
    
with open("data/polarity/negative_reviews.txt") as f:
    negative_reviews = sent_tokenize(f.read())

In [4]:
len(positive_reviews), len(negative_reviews)

(6042, 5835)

In [5]:
reviews_df = pd.DataFrame()

reviews_df["text"] = positive_reviews + negative_reviews
reviews_df["category"] = [1 for i in range(len(positive_reviews))] + [0 for i in range(len(negative_reviews))]

reviews_df = reviews_df
reviews_df

Unnamed: 0,text,category
0,"simplistic , silly and tedious .",1
1,"it's so laddish and juvenile , only teenage bo...",1
2,exploitative and largely devoid of the depth o...,1
3,[garbus] discards the potential for pathologic...,1
4,a visually flashy but narratively opaque and e...,1
...,...,...
11872,may prove to be [tsai's] masterpiece .,0
11873,mazel tov to a film about a family's joyous li...,0
11874,standing in the shadows of motown is the best ...,0
11875,it's nice to see piscopo again after all these...,0


In [6]:
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    text = text.lower()
    text = ''.join([' ' if not char.isalpha() and char not in ['.', ',', '!', '?'] else char for char in text])
    
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    preprocessed_text = ' '.join(lemmatized_tokens)
    
    return preprocessed_text

reviews_df["text"] = reviews_df["text"].apply(lambda x: preprocess_text(x))
reviews_df

Unnamed: 0,text,category
0,"simplistic , silly and tedious .",1
1,"it s so laddish and juvenile , only teenage bo...",1
2,exploitative and largely devoid of the depth o...,1
3,garbus discard the potential for pathological ...,1
4,a visually flashy but narratively opaque and e...,1
...,...,...
11872,may prove to be tsai s masterpiece .,0
11873,mazel tov to a film about a family s joyous li...,0
11874,standing in the shadow of motown is the best k...,0
11875,it s nice to see piscopo again after all these...,0


In [7]:
from sklearn.model_selection import train_test_split

X = reviews_df['text'].str.lower()
y = reviews_df['category']
n_classes = y.nunique()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [8]:
class Vocab:
  def __init__(self, data):
    self.idx_to_token = {}
    self.token_to_idx = {}
    self.vocab_len = 0
    self.max_seq_len = 0
    
    for item in data:
      self.max_seq_len = max(self.max_seq_len, len(item))
    
    # Добавляем токен для неизвестных слов
    self.idx_to_token = {0: '<UNK>'}
    self.token_to_idx = {'<UNK>': 0}
    self.vocab_len += 1

    all_words = [word for sentence in data for word in word_tokenize(sentence)]

    for word in all_words:
        if word not in self.token_to_idx:
            self.idx_to_token[self.vocab_len] = word
            self.token_to_idx[word] = self.vocab_len
            self.vocab_len += 1

    # self.data = []
    # for sentence in data:
    #     tokens = [self.token_to_idx.get(word, self.token_to_idx['<UNK>']) for word in word_tokenize(sentence)]
    #     self.data.append(tokens)
        
vocab = Vocab(X)
vocab.vocab_len, vocab.max_seq_len

(16454, 504)

In [9]:
class ReviewDataset(Dataset):
  def __init__(self, X, y, vocab: Vocab):
    self.X = X
    self.y = y
    self.vocab = vocab

  def vectorize(self, review):
    '''Генерирует представление отзыва review при помощи бинарного кодирования (см. 1.2)'''
    vec = torch.zeros(self.vocab.max_seq_len, dtype=torch.int64)
    

    for i, word in enumerate(word_tokenize(review)):

      if i >= self.vocab.max_seq_len:
        break

      vec[i] = self.vocab.token_to_idx.get(word, 0)
  
    return vec
    
  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    vec = self.vectorize(self.X[idx])
    label = self.y[idx]
    return vec, label

In [10]:
train_dataset = ReviewDataset(X_train, y_train, vocab)
test_dataset = ReviewDataset(X_test, y_test, vocab)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [11]:
next(iter(test_dataloader))

[tensor([[   32, 15922,  2396,  ...,     0,     0,     0],
         [   23,   224,   445,  ...,     0,     0,     0],
         [   23,   125,   101,  ...,     0,     0,     0],
         ...,
         [  568,    61,    23,  ...,     0,     0,     0],
         [  492,   230,  3648,  ...,     0,     0,     0],
         [   74, 13037,   191,  ...,     0,     0,     0]]),
 tensor([0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
         0, 0, 1, 1, 1, 1, 1, 0])]

In [44]:
class RNN(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, output_size, num_layers=1, dropout=0.2):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=dropout)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=dropout)
        self.gru = nn.GRU(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size*2, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        '''
        x.shape = (batch_size, seq_len) - тензор входных данных
        h.shape = (batch_size, hidden_size) - тензор со скрытым состоянием RNN
        '''
        batch_size, seq_len = x.shape

        h = torch.zeros(2*self.num_layers, batch_size, self.hidden_size).cuda()  # RNN, GRU
        # h = (torch.zeros(2*num_layers, batch_size, self.hidden_size, device=device), torch.zeros(2*num_layers, batch_size, self.hidden_size, device=device))  # LSTM

        x = self.embedding(x)
        x = self.dropout(x)

        # output, h = self.rnn(x, h)
        # output, (h, c) = self.lstm(x, h)
        output, h = self.gru(x, h)
        output = self.dropout(output)

        output = self.fc(output[:, -1, :])
        output = self.softmax(output)
        # return output, h[-1, :, :]
        return output

In [45]:
vocab.max_seq_len, vocab.vocab_len

(504, 16454)

In [46]:
model = RNN(
            input_size=vocab.vocab_len,
            embedding_dim=128,
            hidden_size=128,
            output_size=2)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [47]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Количество обучаемых параметров: {total_params}") 

Количество обучаемых параметров: 2635010


In [50]:
train_model(model, train_dataloader, test_dataloader, criterion, optimizer, 5)

Epoch 1, Train Loss: 0.5778, Test Loss: 0.5996
Epoch 2, Train Loss: 0.4761, Test Loss: 0.5934
Epoch 3, Train Loss: 0.3757, Test Loss: 0.6360
Epoch 4, Train Loss: 0.2995, Test Loss: 0.6612
Epoch 5, Train Loss: 0.2405, Test Loss: 0.7440


In [51]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.72348


In [52]:
rating_labels = ["Positive", "Negative"]

def predict(model, dataset, review):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    model.eval()
    with torch.no_grad():
        vectorized = dataset.vectorize(review)
        tensor = vectorized.unsqueeze(0).to(device)
        logits = model(tensor)
        probs = torch.softmax(logits, dim=1).squeeze()
        print(probs)
        print(f'{rating_labels[probs.argmax()]} ({probs.max():.4f}), {rating_labels[probs.argmin()]} ({probs.min():.4f}) \n{review}')
        

In [56]:
predict(model, train_dataset, "This restaurant is simply amazing! The food is delicious and the service is outstanding.")

tensor([0.9869, 0.0131], device='cuda:0')
Positive (0.9869), Negative (0.0131) 
This restaurant is simply amazing! The food is delicious and the service is outstanding.


In [54]:
predict(model, train_dataset, "The menu at this restaurant is very limited and the food is nothing special. I wouldn't go back.")

tensor([0.0128, 0.9872], device='cuda:0')
Negative (0.9872), Positive (0.0128) 
The menu at this restaurant is very limited and the food is nothing special. I wouldn't go back.


In [55]:
predict(model, train_dataset, "I had the best dining experience in this restaurant. The ambiance is perfect and the staff is very friendly.")

tensor([0.9852, 0.0148], device='cuda:0')
Positive (0.9852), Negative (0.0148) 
I had the best dining experience in this restaurant. The ambiance is perfect and the staff is very friendly.
