In [1]:
import re

import nltk
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 1. Представление и предобработка текстовых данных 

1.1 Операции по предобработке:
* токенизация
* стемминг / лемматизация
* удаление стоп-слов
* удаление пунктуации
* приведение к нижнему регистру
* любые другие операции над текстом

In [3]:
from nltk.stem.snowball import SnowballStemmer

In [4]:
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

Реализовать функцию `preprocess_text(text: str)`, которая:
* приводит строку к нижнему регистру
* заменяет все символы, кроме a-z, A-Z и знаков .,!? на пробел


In [5]:
pat = re.compile(r"[^a-z.!?]", flags=re.MULTILINE)
stop_words = set(stopwords.words("english"))
punctuation = set(".!?")
stemmer = SnowballStemmer("english")


def preprocess_text(text: str) -> str:
    # нижний регистр и только разделители предложений
    text = pat.sub(" ", text.lower())

    sentences = []
    for sent in nltk.sent_tokenize(text):
        words = []
        for word in nltk.word_tokenize(sent):
            # не стоп слово и не разделитель предложение
            if word not in stop_words and word not in punctuation:
                stemma = stemmer.stem(word)
                # отбросить слишком короткие стеммы - положительно сказывается на точности модели
                if stemma not in stop_words and len(stemma) > 2:
                    words.append(stemma)
        sentences.append(" ".join(words))

    # что бы сохранить возможность разбивать на предложения достаточно "."
    text = ". ".join(sentences)
    # на всякий случай
    text = re.sub(r"[!?]", ".", text, flags=re.MULTILINE)
    return text

In [6]:
preprocess_text(text)

'select prefer run instal command. stabl repres current test support version pytorch. note libtorch avail'

1.2 Представление текстовых данных при помощи бинарного кодирования


Представить первое предложение из `text` в виде тензора `sentence_t`: `sentence_t[i] == 1`, если __слово__ с индексом `i` присуствует в предложении.

In [7]:
text_ = preprocess_text(text)

vocab = set(nltk.word_tokenize(text_))
vocab.remove(".")
w2i = {w: i for i, w in enumerate(vocab)}

sent = nltk.sent_tokenize(text_)[0]
sent = sent.replace(".", "")
indices = [w2i[w] for w in nltk.word_tokenize(sent)]
vector = torch.zeros(len(vocab))
vector[indices] = 1
vector

tensor([0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 1.])

## 2. Классификация фамилий по национальности

Датасет: https://disk.yandex.ru/d/owHew8hzPc7X9Q?w=1

2.1 Считать файл `surnames/surnames.csv`. 

2.2 Закодировать национальности числами, начиная с 0.

2.3 Разбить датасет на обучающую и тестовую выборку

2.4 Реализовать класс `Vocab` (токен = __символ__)

2.5 Реализовать класс `SurnamesDataset`

2.6. Обучить классификатор.

2.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: прогнать несколько фамилий студентов группы через модели и проверить результат. Для каждой фамилии выводить 3 наиболее вероятных предсказания.

In [8]:
df = pd.read_csv("data/surnames.csv")
df.head()

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [9]:
enc = LabelEncoder()
df["target"] = enc.fit_transform(df["nationality"])
print(f"classes: {len(enc.classes_)}")
df.head()

classes: 18


Unnamed: 0,surname,nationality,target
0,Woodford,English,4
1,Coté,French,5
2,Kore,English,4
3,Koury,Arabic,0
4,Lebzak,Russian,14


In [10]:
class Vocab:
    def __init__(self, data):
        self.idx_to_token = {i: t for i, t in enumerate({ch for w in data for ch in w.lower()})}
        self.token_to_idx = {t: i for i, t in self.idx_to_token.items()}
        self.vocab_len = len(self.idx_to_token)

In [11]:
vocab = Vocab(df["surname"])
vocab.vocab_len

55

In [12]:
class SurnamesDataset(Dataset):
    def __init__(self, X, y, vocab: Vocab):
        self.X = X
        self.y = y
        self.vocab = vocab

    def vectorize(self, surname):
        """Генерирует представление фамилии surname в при помощи бинарного кодирования (см. 1.2)"""
        v = torch.zeros(self.vocab.vocab_len)
        v[[self.vocab.token_to_idx[ch] for ch in surname.lower()]] = 1
        return v

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.vectorize(self.X[idx]), self.y[idx]

In [13]:
dataset = SurnamesDataset(
    X=df["surname"].tolist(),
    y=torch.tensor(df["target"], dtype=torch.long),
    vocab=vocab,
)
len(dataset)

10980

In [14]:
train_size = round(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, lengths=(train_size, test_size))
len(train_dataset), len(test_dataset)

(8784, 2196)

In [15]:
class SurnamesClassifier(nn.Module):

    def __init__(self, in_features, n_classes):
        super(SurnamesClassifier, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(in_features, 300),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(300, n_classes),
        )

    def forward(self, inputs):
        return self.classifier(inputs)

In [16]:
model = SurnamesClassifier(vocab.vocab_len, vocab.vocab_len)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_dataloader = DataLoader(train_dataset, batch_size=6, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=512)

In [17]:
for epoch in range(15):
    print(f"Epoch {epoch + 1}\n" + "-" * 32)

    model.train()
    size = len(train_dataloader.dataset)
    num_batches = len(train_dataloader)
    avg_loss = 0

    for batch, (x, y) in enumerate(train_dataloader):
        pred = model(x)
        loss = criterion(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        avg_loss += loss
        if batch % 400 == 0:
            print(f"[{batch * len(x):>5d}/{size:>5d}] loss={loss:6f}")

    model.eval()
    size = len(test_dataloader.dataset)
    num_batches = len(test_dataloader)
    avg_loss, correct = 0, 0

    for x, y in test_dataloader:
        pred = model(x)
        avg_loss += criterion(pred, y)
        correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    avg_loss /= num_batches
    accuracy = correct / size
    print(f"Validate: accuracy={accuracy:5f}, avg_loss={avg_loss:5f}\n")

Epoch 1
--------------------------------
[    0/ 8784] loss=4.019810
[ 2400/ 8784] loss=2.142408
[ 4800/ 8784] loss=2.530181
[ 7200/ 8784] loss=1.795364
Validate: accuracy=0.525046, avg_loss=1.554981

Epoch 2
--------------------------------
[    0/ 8784] loss=1.302734
[ 2400/ 8784] loss=1.052091
[ 4800/ 8784] loss=1.559641
[ 7200/ 8784] loss=0.535729
Validate: accuracy=0.553734, avg_loss=1.451229

Epoch 3
--------------------------------
[    0/ 8784] loss=1.500295
[ 2400/ 8784] loss=1.125522
[ 4800/ 8784] loss=1.585158
[ 7200/ 8784] loss=1.483449
Validate: accuracy=0.571949, avg_loss=1.381282

Epoch 4
--------------------------------
[    0/ 8784] loss=2.311538
[ 2400/ 8784] loss=1.647652
[ 4800/ 8784] loss=1.132809
[ 7200/ 8784] loss=1.630430
Validate: accuracy=0.583333, avg_loss=1.338217

Epoch 5
--------------------------------
[    0/ 8784] loss=1.176329
[ 2400/ 8784] loss=2.259466
[ 4800/ 8784] loss=0.986127
[ 7200/ 8784] loss=1.145083
Validate: accuracy=0.588798, avg_loss=1.320

In [18]:
model.eval()

y_test, y_pred = [], []
for x, y in test_dataloader:
    pred = model(x).argmax(1)
    y_test.append(y)
    y_pred.append(pred)

y_test, y_pred = torch.hstack(y_test), torch.hstack(y_pred)
print(classification_report(y_test, y_pred, target_names=enc.classes_, zero_division=True))

              precision    recall  f1-score   support

      Arabic       0.80      1.00      0.89       325
     Chinese       0.43      0.44      0.43        45
       Czech       0.60      0.10      0.17        88
       Dutch       0.00      0.00      0.00        39
     English       0.52      0.82      0.63       581
      French       0.25      0.02      0.04        50
      German       0.61      0.22      0.33       122
       Greek       0.48      0.24      0.32        42
       Irish       0.67      0.29      0.41        41
     Italian       0.45      0.31      0.37       126
    Japanese       0.62      0.54      0.58       159
      Korean       0.50      0.17      0.25        12
      Polish       0.82      0.41      0.55        22
  Portuguese       1.00      0.00      0.00        13
     Russian       0.75      0.76      0.76       448
    Scottish       1.00      0.00      0.00        19
     Spanish       0.43      0.20      0.28        44
  Vietnamese       1.00    

In [19]:
students = [
    "Konovalova",
    "Tikhonov",
    "Bilyukin",
    "Titov",
    "Ratoshnyuk",
    "Voronina",
    "Petrov",
    "Kamenchuk",
    "Katamadze",
]
for surname in students:
    x = dataset.vectorize(surname)
    pred = model(x.unsqueeze(0))  # батч из одного элемента
    pred_proba, pred_label_indices = torch.softmax(pred, dim=1).topk(3, dim=1)
    pred_labels = enc.inverse_transform(pred_label_indices.squeeze())

    predicts = ", ".join([
        f"{label} ({prob:.2f})"
        for (label, prob) in zip(pred_labels, pred_proba.squeeze())
    ])
    print(f"{surname:<10}   -   {predicts}")

Konovalova   -   Russian (0.81), Czech (0.15), English (0.03)
Tikhonov     -   Russian (0.98), Czech (0.01), English (0.01)
Bilyukin     -   Russian (0.96), English (0.03), Czech (0.01)
Titov        -   Russian (0.58), Italian (0.15), English (0.15)
Ratoshnyuk   -   Russian (0.94), Greek (0.03), Japanese (0.01)
Voronina     -   Italian (0.47), Russian (0.26), English (0.11)
Petrov       -   Russian (0.44), English (0.33), French (0.07)
Kamenchuk    -   Russian (0.44), Czech (0.18), German (0.10)
Katamadze    -   Russian (0.35), Polish (0.23), Czech (0.19)


## 3. Классификация обзоров ресторанов

Датасет: https://disk.yandex.ru/d/nY1o70JtAuYa8g

3.1 Считать файл `yelp/raw_train.csv`. Оставить от исходного датасета 10% строчек.

3.2 Воспользоваться функцией `preprocess_text` из 1.1 для обработки текста отзыва. Закодировать рейтинг числами, начиная с 0.

3.3 Разбить датасет на обучающую и тестовую выборку

3.4 Реализовать класс `Vocab` (токен = слово)

3.5 Реализовать класс `ReviewDataset`

3.6 Обучить классификатор

3.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)


In [20]:
df = pd.read_csv("data/yelp/raw_train.csv", header=None)
print(len(df))
df = df.sample(frac=0.1, ignore_index=True, random_state=0)
print(len(df))
df.head()

560000
56000


Unnamed: 0,0,1
0,2,"Call me crazy, but I really enjoyed this place..."
1,1,"HORRIBLE, HORRIBLE, HORRIBLE ONLINE CUSTOMER S..."
2,2,The staff were extremely helpful in answering ...
3,2,The steak tartare is fantastic! I'd come back...
4,2,Everything I could want for $3.


In [21]:
enc = LabelEncoder()

df[0] = enc.fit_transform(df[0])
df[1] = df[1].apply(lambda t: preprocess_text(t).replace(".", ""))
print(f"classes: {len(enc.classes_)}")
df.head()

classes: 2


Unnamed: 0,0,1
0,1,call crazi realli enjoy place get ton food dec...
1,0,horribl horribl horribl onlin custom servic or...
2,1,staff extrem help answer question took time ma...
3,1,steak tartar fantast come back vega dine great...
4,1,everyth could want


In [22]:
class Vocab:
    def __init__(self, data):
        unique = {w for text in data for w in nltk.word_tokenize(text)}
        if "." in unique:
            unique.remove(".")
        self.idx_to_token = list(unique)

        self.token_to_idx = {t: i for i, t in enumerate(self.idx_to_token)}

        self.vocab_len = len(self.idx_to_token)

In [23]:
vocab = Vocab(df[1])
vocab.vocab_len

50659

In [24]:
class ReviewDataset(Dataset):
    def __init__(self, X, y, vocab: Vocab):
        self.X = X
        self.y = y
        self.vocab = vocab

    def vectorize(self, review):
        """Генерирует представление отзыва review при помощи бинарного кодирования (см. 1.2)"""
        vec = torch.zeros(self.vocab.vocab_len)
        indices = [self.vocab.token_to_idx[w] for w in nltk.word_tokenize(review)]
        vec[indices] = 1
        return vec

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.vectorize(self.X[idx]), self.y[idx]

In [25]:
dataset = ReviewDataset(
    X=df[1].tolist(),
    y=torch.tensor(df[0], dtype=torch.long),
    vocab=vocab,
)
len(dataset)

56000

In [26]:
train_size = round(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, lengths=(train_size, test_size))
len(train_dataset), len(test_dataset)

(44800, 11200)

In [27]:
class ReviewsClassifier(nn.Module):

    def __init__(self, in_features, n_classes):
        super(ReviewsClassifier, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(in_features, 200),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(200, n_classes),
        )

    def forward(self, inputs):
        return self.classifier(inputs)

In [28]:
model = ReviewsClassifier(vocab.vocab_len, vocab.vocab_len)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=1024)

In [29]:
for epoch in range(5):
    print(f"Epoch {epoch + 1}\n" + "-" * 32)

    model.train()
    size = len(train_dataloader.dataset)
    num_batches = len(train_dataloader)
    avg_loss = 0

    for batch, (x, y) in enumerate(train_dataloader):
        pred = model(x)
        loss = criterion(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        avg_loss += loss
        if batch % 200 == 0:
            print(f"[{batch * len(x):>5d}/{size:>5d}] loss={loss:6f}")

    model.eval()
    with torch.no_grad():
        size = len(test_dataloader.dataset)
        num_batches = len(test_dataloader)
        avg_loss, correct = 0, 0

        for x, y in test_dataloader:
            pred = model(x)
            avg_loss += criterion(pred, y)
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

        avg_loss /= num_batches
        accuracy = correct / size
        print(f"Validate: accuracy={accuracy:5f}, avg_loss={avg_loss:5f}\n")

Epoch 1
--------------------------------
[    0/44800] loss=10.858390
[12800/44800] loss=0.655565
[25600/44800] loss=0.301907
[38400/44800] loss=0.200185
Validate: accuracy=0.903036, avg_loss=0.283251

Epoch 2
--------------------------------
[    0/44800] loss=0.310345
[12800/44800] loss=0.296770
[25600/44800] loss=0.203643
[38400/44800] loss=0.188113
Validate: accuracy=0.909554, avg_loss=0.241475

Epoch 3
--------------------------------
[    0/44800] loss=0.094054
[12800/44800] loss=0.248649
[25600/44800] loss=0.089277
[38400/44800] loss=0.071432
Validate: accuracy=0.911339, avg_loss=0.236651

Epoch 4
--------------------------------
[    0/44800] loss=0.106096
[12800/44800] loss=0.118433
[25600/44800] loss=0.151436
[38400/44800] loss=0.115770
Validate: accuracy=0.907679, avg_loss=0.241892

Epoch 5
--------------------------------
[    0/44800] loss=0.071021
[12800/44800] loss=0.141998
[25600/44800] loss=0.080506
[38400/44800] loss=0.093734
Validate: accuracy=0.906518, avg_loss=0.25

In [30]:
model.eval()

y_test, y_pred = [], []
for x, y in test_dataloader:
    pred = model(x).argmax(1)
    y_test.append(y)
    y_pred.append(pred)

y_test, y_pred = torch.hstack(y_test), torch.hstack(y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.91      0.91      5622
           1       0.91      0.90      0.91      5578

    accuracy                           0.91     11200
   macro avg       0.91      0.91      0.91     11200
weighted avg       0.91      0.91      0.91     11200



In [31]:
reviews = [
    ("This is the most terrible place I've ever been to. Untidy and sloppy staff. Low-quality products.", 1),
    ("A wonderful place. I will recommend it to my relatives and friends.", 2),
]

translate = {1: "bad", 2: "good"}

for review, target in reviews:
    x = dataset.vectorize(preprocess_text(review).replace(".", ""))

    pred = model(x.unsqueeze(0))  # батч из одного элемента
    pred_proba, pred_label_idx = torch.softmax(pred, 1).max(dim=1)
    pred_label = enc.inverse_transform([pred_label_idx.item()])

    print(
        f"{translate[target]:^4} | {translate[pred_label.item()]:^4} ({pred_proba.item():.2f}) | {review:<60}\n")

bad  | bad  (0.99) | This is the most terrible place I've ever been to. Untidy and sloppy staff. Low-quality products.

good | good (0.97) | A wonderful place. I will recommend it to my relatives and friends.

