In [None]:
import pandas as pd
import re
import torch
from nltk.tokenize import word_tokenize
from torchtext.vocab import build_vocab_from_iterator
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import torchmetrics as M
from sklearn.metrics import confusion_matrix, classification_report

Создайте датасет на основе файла IMDB Dataset.csv (лежит в 03_embeddings/data). Разделите набор данных на обучающее и тестовое множество. Проведите предобработку данных. Минимальная предобработка данных: удаление html-тегов и знаков препинания. Разбейте документы на слова, закодируйте слова индексами, приведите каждый документ к одинаковому количеству токенов и преобразуйте в тензоры.

Реализовав нейронную сеть при помощи библиотеки PyTorch, решите задачу классификации текстов. Отобразите confusion matrix и classification report, рассчитанные на основе обучающего и тестового множества.

Вариант 1.

Решите задачу задачу классификации, используя слой nn.Embedding. Для получения эмбеддингов документов выполните следующие действия:
1. Объедините эмбеддинги слов в документе в один длинный вектор размерности (n_tokens x embedding_dim)
2. Пропустите этот вектор через полносвязный слой, понизив размерность снова до embedding_dim


In [None]:
data = pd.read_csv("IMDB Dataset.csv")

tags = re.compile(r'<.*?>')
punct = re.compile(r'[^\w\s]')

def preprocess(text):
    text = re.sub(tags, '', text.lower())
    text = re.sub(punct, '', text)
    tokens = word_tokenize(text)
    return tokens

data["review"] = [preprocess(review) for review in data["review"]]
data.head()

Unnamed: 0,review,sentiment
0,"[one, of, the, other, reviewers, has, mentione...",positive
1,"[a, wonderful, little, production, the, filmin...",positive
2,"[i, thought, this, was, a, wonderful, way, to,...",positive
3,"[basically, theres, a, family, where, a, littl...",negative
4,"[petter, matteis, love, in, the, time, of, mon...",positive


In [None]:
data_train, data_test = train_test_split(data, test_size=0.25, random_state=7)

In [None]:
vocab = build_vocab_from_iterator(data_train["review"], specials=['<pad>', '<unk>'])
vocab.set_default_index(1)

In [None]:
def text_to_indices(doc):
    return [vocab[word] for word in doc]

data_train['emb'] = data_train['review'].apply(lambda x: text_to_indices(x))
data_test['emb'] = data_test['review'].apply(lambda x: text_to_indices(x))

max_length = max(data_train['emb'].apply(len).max(), data_test['emb'].apply(len).max())

data_train['emb'] = data_train['emb'].apply(lambda x: x + [vocab['<pad>']] * (max_length - len(x)))
data_test['emb'] = data_test['emb'].apply(lambda x: x + [vocab['<pad>']] * (max_length - len(x)))

data_train['sentiment'] = data_train['sentiment'].map({'positive': 1, 'negative': 0})
data_test['sentiment'] = data_test['sentiment'].map({'positive': 1, 'negative': 0})

In [None]:
data_train.head()

Unnamed: 0,review,sentiment,emb
17552,"[lovely, music, beautiful, photography, some, ...",0,"[1282, 206, 305, 1291, 46, 5, 131, 23, 2950, 4..."
20467,"[raising, victor, vargas, is, one, of, those, ...",0,"[5404, 2615, 9735, 7, 28, 5, 141, 703, 235, 93..."
49715,"[my, abiding, love, of, italian, actress, luci...",1,"[59, 19401, 110, 5, 1064, 522, 48661, 49834, 3..."
31896,"[so, the, wwe, has, done, it, they, have, pour...",1,"[38, 2, 5616, 43, 224, 9, 34, 25, 12397, 129, ..."
11953,"[this, movie, might, not, put, the, catholic, ...",1,"[10, 17, 222, 21, 263, 2, 3278, 1596, 8, 2, 11..."


In [None]:
class Net(nn.Module):
  def __init__(self, num_embeddings):
    super().__init__()
    self.emb = nn.Embedding(
        num_embeddings=num_embeddings,
        embedding_dim=100,
    )
    self.fc = nn.Linear(in_features=100*max_length, out_features=2)

  def forward(self, X):

    e = self.emb(X)
    e = e.flatten(1)
    out = self.fc(e)
    return out

In [None]:
X_train = torch.LongTensor(data_train['emb'].tolist())
y_train = torch.LongTensor(data_train['sentiment'].tolist())

X_test = torch.LongTensor(data_test['emb'].tolist())
y_test = torch.LongTensor(data_test['sentiment'].tolist())

train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=100)
test_loader = DataLoader(test_dataset, batch_size=100)

n_epochs = 5
lr = 0.001

model = Net(num_embeddings=len(vocab))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
for epoch in range(n_epochs):

  model.train()
  acc_m = M.Accuracy(task="binary")
  for X, y in train_loader:
    y_pred = model(X)
    loss = criterion(y_pred, y)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    acc_m.update(y_pred.argmax(dim=1), y)

  model.eval()
  test_acc_m = M.Accuracy(task="binary")
  with torch.no_grad():
      for X, y in test_loader:
          y_test_pred = model(X)
          test_acc_m.update(y_test_pred.argmax(dim=1), y)

  train_acc = acc_m.compute()
  test_acc = test_acc_m.compute()
  print(f"{epoch=} {train_acc.item()=} {test_acc.item()=}")

epoch=0 train_acc.item()=0.5261066555976868 test_acc.item()=0.539680004119873
epoch=1 train_acc.item()=0.6425066590309143 test_acc.item()=0.7072799801826477
epoch=2 train_acc.item()=0.7543466687202454 test_acc.item()=0.7293599843978882
epoch=3 train_acc.item()=0.8054133057594299 test_acc.item()=0.675599992275238
epoch=4 train_acc.item()=0.8527466654777527 test_acc.item()=0.8022400140762329


In [None]:
model.eval()
test_predictions = []
test_labels = []
train_predictions = []
train_labels = []

with torch.no_grad():

    for X_train_batch, y_train_batch in train_loader:
        y_train_pred = model(X_train_batch)
        train_predictions.extend(y_train_pred.argmax(dim=1).tolist())
        train_labels.extend(y_train_batch.tolist())

    for X_test_batch, y_test_batch in test_loader:
        y_test_pred = model(X_test_batch)
        test_predictions.extend(y_test_pred.argmax(dim=1).tolist())
        test_labels.extend(y_test_batch.tolist())

In [None]:
# train
class_report = classification_report(train_labels, train_predictions)
print(class_report)

              precision    recall  f1-score   support

           0       0.96      0.97      0.96     18735
           1       0.97      0.96      0.96     18765

    accuracy                           0.96     37500
   macro avg       0.96      0.96      0.96     37500
weighted avg       0.96      0.96      0.96     37500



In [None]:
# test
class_report = classification_report(test_labels, test_predictions)
print(class_report)

              precision    recall  f1-score   support

           0       0.78      0.84      0.81      6265
           1       0.83      0.76      0.79      6235

    accuracy                           0.80     12500
   macro avg       0.80      0.80      0.80     12500
weighted avg       0.80      0.80      0.80     12500



In [None]:
# train
conf_matrix = confusion_matrix(train_labels, train_predictions)
print(conf_matrix)

[[18175   560]
 [  771 17994]]


In [None]:
# test
conf_matrix = confusion_matrix(test_labels, test_predictions)
print(conf_matrix)

[[5276  989]
 [1483 4752]]
