In [1]:
import re
import typing as t
from collections import defaultdict
from functools import lru_cache
from pathlib import Path

import nltk
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from nltk.corpus import stopwords, wordnet
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader, Subset, random_split
import os



In [2]:
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
# NLTK теги частей речи отличаются от WORDNET тегов
def get_pos(word: str) -> str:
    tag = nltk.pos_tag([word])[0][1]
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


STOPWORDS = set(stopwords.words("english"))


def preprocess_review(text: str) -> str:
    text = text.lower()
    # удаляем все символы кроме букв латинского алфавита
    text = re.sub(r"[^a-z]", repl=" ", string=text, flags=re.MULTILINE)

    lemmatizer = nltk.WordNetLemmatizer()
    words = []
    for word in nltk.word_tokenize(text):
        if word not in STOPWORDS:  # удаляем стоп-слова до лемматизации - так можно чуть-чуть сэкономить
            lemma = lemmatizer.lemmatize(word, pos=get_pos(word))
            # удаляем стоп-слова, наивное предположение - не брать леммы короче 3-х символов дало значительный прирост точности
            if lemma not in STOPWORDS and len(lemma) > 2:
                words.append(lemma)

    return " ".join(words)

In [5]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {DEVICE.upper()} device")

Using CUDA device


In [6]:
class ReviewsDataset(Dataset):

    def __init__(self, positive_path: Path, negative_path: Path, seed: int = None):
        self.positive_path = positive_path
        self.negative_path = negative_path
        self.positive_reviews = self.read_reviews(positive_path, preprocess_review)
        self.negative_reviews = self.read_reviews(negative_path, preprocess_review)

        data = self.positive_reviews + self.negative_reviews
        targets = torch.cat([torch.ones(len(self.positive_reviews)), torch.zeros(len(self.negative_reviews))])

        if seed is not None:
            torch.manual_seed(seed)
        indices = torch.randperm(len(data))

        self.data = [data[i] for i in indices]
        self.targets = targets[indices].to(torch.long)

    @staticmethod
    def read_reviews(path: Path, process: t.Callable[[str], str]) -> list[str]:
        reviews = []
        with open(path) as f:
            for review in f.readlines():
                review = process(review)
                if review:
                    reviews.append(review)
        return reviews

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index], self.targets[index]

In [7]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr

In [8]:
reviews_dataset = ReviewsDataset(
     "/kaggle/input/positive-reviews/positive_reviews.txt",
     "/kaggle/input/negative-reviews/negative_reviews.txt",
    seed=0,
)
len(reviews_dataset), reviews_dataset[0]

(151809, ('would wish install bad enemy', tensor(0)))

In [9]:
torch.manual_seed(0)
def train_test_split(dataset: t.Union[Dataset, t.Sized], train_part: float) -> t.Tuple[Subset, Subset]:
    train_size = round(train_part * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, lengths=(train_size, test_size))
    return train_dataset, test_dataset
train_reviews, test_reviews = train_test_split(reviews_dataset, train_part=0.8)
len(train_reviews), len(test_reviews)

(121447, 30362)

In [10]:
class ReviewsVocab:
    pad = "<PAD>"
    unknown = "<UNK>"

    def __init__(self, reviews: t.List[str]):
        uniques = set()
        max_len = 0
        for review in reviews:
            words = nltk.word_tokenize(review)
            uniques.update(words)
            max_len = max(len(words), max_len)

        self.alphabet = [self.pad, self.unknown, *uniques]
        self.max_len = max_len

        w2i = {w: i for i, w in enumerate(self.alphabet)}
        # если ключ отсутствует, будет возвращена 1 - индекс служебного символа
        self.w2i = defaultdict(lambda: 1, w2i)

    def __len__(self):
        return len(self.alphabet)

    @lru_cache(maxsize=8192)  # сомнительная эффективность? Ну да
    def encode(self, review: str) -> torch.Tensor:
        indices = [self.w2i[w] for w in nltk.word_tokenize(review)]
        indices += [self.w2i[self.pad]] * (self.max_len - len(indices))
        return torch.tensor(indices, dtype=torch.long)

    def decode(self, indices: torch.Tensor) -> str:
        pad_indices = torch.nonzero(indices == self.w2i[self.pad], as_tuple=True)[0]  # noqa
        if len(pad_indices):
            indices = indices[:pad_indices[0]]
        return " ".join(self.alphabet[i] for i in indices)

In [11]:
vocab = ReviewsVocab([review for review, _ in train_reviews])
print(f"alphabet: {len(vocab)}", f"longest: {vocab.max_len}")
encoded = vocab.encode("this is a neutral review")
encoded, vocab.decode(encoded)

alphabet: 59319 longest: 788


(tensor([    1,     1,     1, 22118, 53439,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [12]:
class ReviewsClassifier(nn.Module):
    LAST_CONV_OUT_CHANNELS = 64
    ADAPTIVE_AVG_POOL = 8

    def __init__(self, num_embeddings: int, embedding_dim: int):
        super(ReviewsClassifier, self).__init__()

        # Как же этой модели все это... безразлично
        self.embedding = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
        self.features = nn.Sequential(
            nn.Conv1d(in_channels=embedding_dim, out_channels=self.LAST_CONV_OUT_CHANNELS, kernel_size=2),
            nn.BatchNorm1d(num_features=self.LAST_CONV_OUT_CHANNELS),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
        )
        # Единственный полезный (и понятный зачем) слой. Зачем? - Позволяет не думать о размерностях
        self.avgpool = nn.AdaptiveAvgPool1d(self.ADAPTIVE_AVG_POOL)
        self.classifier = nn.Sequential(
            nn.Linear(self.LAST_CONV_OUT_CHANNELS * self.ADAPTIVE_AVG_POOL, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, 2),
        )

    def forward(self, x: torch.Tensor):
        x = self.embedding(x)
        x = x.reshape(x.size(0), x.size(2), x.size(1))
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x


def collate(batch: t.List[t.Tuple[str, torch.Tensor]]) -> t.Tuple[torch.Tensor, torch.Tensor]:
    xs, ys = [], []
    for x, y in batch:
        xs.append(vocab.encode(x))
        ys.append(y)
    return torch.vstack(xs), torch.hstack(ys)

In [13]:
torch.manual_seed(0)

net = ReviewsClassifier(num_embeddings=len(vocab), embedding_dim=128).to(DEVICE)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.000914092001)  # а почему нет?
# будет изменять lr = lr * factor, если на протяжении patience эпох ошибка не менялась более чем на threshold
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    mode="min",
    patience=5,
    factor=0.333333,
    min_lr=0.000001,
    threshold=0.001,
    verbose=True,
)

# Однако batch_size - очень важно. При 8 и 64 застреваем в плохом оптимуме, 32 тоже не очень...
# 22 > Embedding, Conv1d, BatchNorm1d, ReLU, MaxPool1d, AdaptiveAvgPool1d, Dropout, lr_scheduler, optimizer
train_dataloader = DataLoader(train_reviews, batch_size=22, collate_fn=collate, shuffle=True)
test_dataloader = DataLoader(test_reviews, batch_size=512, collate_fn=collate)

In [14]:
def common_train(
        model: nn.Module,
        loss_fn: nn.Module,
        optimizer: optim.Optimizer,
        train_dataloader: DataLoader,
        epochs: int,
        test_dataloader: DataLoader = None,
        lr_scheduler=None,
        verbose: int = 100,
        device: str = "cpu",
) -> t.List[float]:
    train_losses = []
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}\n" + "-" * 32)
        train_loss = train_loop(
            train_dataloader,
            model,
            loss_fn,
            optimizer,
            verbose=verbose,
            device=device,
        )
        train_losses.append(train_loss.item())
        if test_dataloader:
            loss, acc = test_loop(test_dataloader, model, loss_fn, device=device)
            if lr_scheduler:
                lr_scheduler.step(loss)
        torch.cuda.empty_cache()
    return train_losses
def train_loop(
        dataloader: DataLoader,
        model: nn.Module,
        loss_fn: nn.Module,
        optimizer: optim.Optimizer,
        verbose: int = 100,
        device: str = "cpu",
) -> torch.Tensor:
    model.train()

    size = len(dataloader.dataset)  # noqa
    num_batches = len(dataloader)
    avg_loss = 0

    for batch, (x, y) in enumerate(dataloader):
        x, y = x.to(device), y.to(device)

        pred = model(x)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        avg_loss += loss
        if batch % verbose == 0:
            print(f"loss: {loss:>7f}  [{batch * len(x):>5d}/{size:>5d}]")

        del x, y, pred, loss
        torch.cuda.empty_cache()

    return avg_loss / num_batches
@torch.no_grad()
def test_loop(
        dataloader: DataLoader,
        model: nn.Module,
        loss_fn: nn.Module,
        device: str = "cpu",
) -> t.Tuple[torch.Tensor, torch.Tensor]:
    model.eval()

    size = len(dataloader.dataset)  # noqa
    num_batches = len(dataloader)
    avg_loss, correct = 0, 0

    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        pred = model(x)
        avg_loss += loss_fn(pred, y)
        correct += (pred.argmax(1) == y).type(torch.float).sum().item()  # noqa

        del x, y, pred
        torch.cuda.empty_cache()

    avg_loss /= num_batches
    accuracy = correct / size
    print(f"Test Error: \n Accuracy: {accuracy:>4f}, Avg loss: {avg_loss:>8f} \n")

    return avg_loss, accuracy


In [15]:
%%time

_ = common_train(
    epochs=20,
    model=net,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    lr_scheduler=lr_scheduler,
    verbose=150,
    device=DEVICE,
)

Epoch 1
--------------------------------
loss: 0.688263  [    0/121447]
loss: 0.669989  [ 3300/121447]
loss: 0.602634  [ 6600/121447]
loss: 0.726695  [ 9900/121447]
loss: 0.707313  [13200/121447]
loss: 0.672009  [16500/121447]
loss: 0.665081  [19800/121447]
loss: 0.678214  [23100/121447]
loss: 0.636483  [26400/121447]
loss: 0.685687  [29700/121447]
loss: 0.671561  [33000/121447]
loss: 0.632208  [36300/121447]
loss: 0.532886  [39600/121447]
loss: 0.513744  [42900/121447]
loss: 0.456883  [46200/121447]
loss: 0.459239  [49500/121447]
loss: 0.326138  [52800/121447]
loss: 0.545915  [56100/121447]
loss: 0.301607  [59400/121447]
loss: 0.475084  [62700/121447]
loss: 0.411620  [66000/121447]
loss: 0.483227  [69300/121447]
loss: 0.479000  [72600/121447]
loss: 0.429365  [75900/121447]
loss: 0.537460  [79200/121447]
loss: 0.423303  [82500/121447]
loss: 0.474779  [85800/121447]
loss: 0.736515  [89100/121447]
loss: 0.439815  [92400/121447]
loss: 0.422479  [95700/121447]
loss: 0.326826  [99000/121447

In [16]:
@torch.no_grad()
def get_y_test_y_pred(
        model: nn.Module,
        test_dataloader: DataLoader,
        device: str = "cpu",
) -> t.Tuple[torch.Tensor, torch.Tensor]:
    model.eval()

    y_test = []
    y_pred = []
    for x, y in test_dataloader:
        x, y = x.to(device), y.to(device)
        pred = model(x).argmax(1)
        y_test.append(y)
        y_pred.append(pred)

        del x
        torch.cuda.empty_cache()

    return torch.hstack(y_test).detach().cpu(), torch.hstack(y_pred).detach().cpu()

In [17]:
y_test, y_pred = get_y_test_y_pred(net, test_dataloader, DEVICE)

print(metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=["negative", "positive"],
))

              precision    recall  f1-score   support

    negative       0.74      0.82      0.78     11815
    positive       0.88      0.81      0.84     18547

    accuracy                           0.82     30362
   macro avg       0.81      0.82      0.81     30362
weighted avg       0.82      0.82      0.82     30362



In [18]:
def inference(
        review: str,
        target: str,
        model: nn.Module,
        vocab: ReviewsVocab,
        target_names: list[str],
        device: str = "cpu",
):
    x = vocab.encode(preprocess_review(review))
    x = x.to(device)

    pred = model(x.unsqueeze(0))
    pred_proba, pred_label_idx = F.softmax(pred, 1).max(dim=1)
    pred_label = target_names[pred_label_idx.cpu()]

    print(f"Review : {review}")
    print(f"True   : {target}")
    print(f"Predict: {pred_label} ({pred_proba.item():.2f})\n")


In [19]:
reviews = [
    ("Not good movie", "negative"),
    ("boring", "negative"),
    ("best movie ever", "positive"),
]
for review, target in reviews:
    inference(
        review=review,
        target=target,
        model=net,
        vocab=vocab,
        target_names=["negative", "positive"],
        device=DEVICE,
    )
# не такой тупой оказывается, но это хуже чем log классификация не знаю почему

Review : Not good movie
True   : negative
Predict: positive (0.96)

Review : boring
True   : negative
Predict: negative (0.96)

Review : best movie ever
True   : positive
Predict: positive (0.94)

