In [1]:
import re
import nltk

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zonkz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 1. Представление и предобработка текстовых данных 

1.1 Операции по предобработке:
* токенизация
* стемминг / лемматизация
* удаление стоп-слов
* удаление пунктуации
* приведение к нижнему регистру
* любые другие операции над текстом

In [3]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer

In [4]:
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

Реализовать функцию `preprocess_text(text: str)`, которая:
* приводит строку к нижнему регистру
* заменяет все символы, кроме a-z, A-Z и знаков .,!? на пробел


In [5]:
def preprocess_text(text: str) -> str:
    # Приводим строку к нижнему регистру
    text = text.lower()
    
    # Заменяем все символы, кроме a-z, A-Z и знаков .,!? на пробел
    text = ''.join([' ' if not char.isalpha() and char not in ['.', ',', '!', '?'] else char for char in text])
    
    return text

text = preprocess_text(text)
text

'select your preferences and run the install command. stable represents the most currently tested and supported version of pytorch. note that libtorch is only available for c  '

In [6]:
preprocess_text("Начинается %новое % **приключение** совсем скоро &&SAP&&")

'начинается  новое     приключение   совсем скоро   sap  '

1.2 Представление текстовых данных при помощи бинарного кодирования


Представить первое предложение из `text` в виде тензора `sentence_t`: `sentence_t[i] == 1`, если __слово__ с индексом `i` присуствует в предложении.

In [7]:
first_sentence = sent_tokenize(text)[0]

# Создаем словарь слов и присваиваем каждому уникальный индекс
dictionary = {word: i for i, word in enumerate(set(text.split()))}
print(dictionary)

sentence_t = torch.zeros(len(dictionary))

print(first_sentence)

for word in first_sentence.split():
    print(word)
    if word in dictionary:
        print(dictionary[word])
        sentence_t[dictionary[word]] = 1

print(sentence_t)

{'available': 0, 'stable': 1, 'c': 2, 'your': 3, 'pytorch.': 4, 'install': 5, 'is': 6, 'version': 7, 'most': 8, 'preferences': 9, 'run': 10, 'represents': 11, 'of': 12, 'select': 13, 'the': 14, 'tested': 15, 'for': 16, 'supported': 17, 'note': 18, 'that': 19, 'only': 20, 'command.': 21, 'and': 22, 'currently': 23, 'libtorch': 24}
select your preferences and run the install command.
select
13
your
3
preferences
9
and
22
run
10
the
14
install
5
command.
21
tensor([0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0.,
        0., 0., 0., 1., 1., 0., 0.])


## 2. Классификация фамилий по национальности

Датасет: https://disk.yandex.ru/d/owHew8hzPc7X9Q?w=1

2.1 Считать файл `surnames/surnames.csv`. 

2.2 Закодировать национальности числами, начиная с 0.

2.3 Разбить датасет на обучающую и тестовую выборку

2.4 Реализовать класс `Vocab` (токен = __символ__)

2.5 Реализовать класс `SurnamesDataset`

2.6. Обучить классификатор.

2.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: прогнать несколько фамилий студентов группы через модели и проверить результат. Для каждой фамилии выводить 3 наиболее вероятных предсказания.

In [8]:
surnames = pd.read_csv("data/surnames/surnames.csv")
surnames['nationality'], _ = pd.factorize(surnames['nationality'])
print(surnames)

        surname  nationality
0      Woodford            0
1          Coté            1
2          Kore            0
3         Koury            2
4        Lebzak            3
...         ...          ...
10975  Quraishi            2
10976   Innalls            0
10977      Król           12
10978    Purvis            0
10979  Messerli            9

[10980 rows x 2 columns]


In [9]:
# X_train, X_test, y_train, y_test = train_test_split(surnames['surname'].to_numpy(), surnames['nationality'].to_numpy(), test_size=0.2)
X = surnames['surname'].str.lower()
y = surnames['nationality']
n_classes = y.nunique()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [10]:
# Создаем словарь слов и присваиваем каждому уникальный индекс
dictionary = {word: i for i, word in enumerate(set(text.split()))}
print(dictionary)

sentence_t = torch.zeros(len(dictionary))

for word in first_sentence:
    if word in dictionary:
        sentence_t[dictionary[word]] = 1

print(sentence_t)

# class Vocab:
#     def __init__(self, data):
#         self.idx_to_token = {}
#         self.token_to_idx = {}
#         self.vocab_len = 0

#         # Получаем список всех слов в данных
#         all_words = [word for sentence in data["surname"].str.lower() for word in word_tokenize(sentence)]
#         # print(all_words)

#         # Строим словарь
#         for word in all_words:
#             if word not in self.token_to_idx:
#                 self.idx_to_token[self.vocab_len] = word
#                 self.token_to_idx[word] = self.vocab_len
#                 self.vocab_len += 1

class Vocab:
  def __init__(self, data):
    tokens = set()
    for item in data:
      tokens.update(item)

    self.idx_to_token = dict(enumerate(tokens))
    self.token_to_idx = {token: idx for idx, token in self.idx_to_token.items()}
    self.vocab_len = len(self.idx_to_token)
    
vocab = Vocab(surnames)

{'available': 0, 'stable': 1, 'c': 2, 'your': 3, 'pytorch.': 4, 'install': 5, 'is': 6, 'version': 7, 'most': 8, 'preferences': 9, 'run': 10, 'represents': 11, 'of': 12, 'select': 13, 'the': 14, 'tested': 15, 'for': 16, 'supported': 17, 'note': 18, 'that': 19, 'only': 20, 'command.': 21, 'and': 22, 'currently': 23, 'libtorch': 24}
tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.])


In [11]:
print(surnames["surname"].str.lower())

0        woodford
1            coté
2            kore
3           koury
4          lebzak
           ...   
10975    quraishi
10976     innalls
10977        król
10978      purvis
10979    messerli
Name: surname, Length: 10980, dtype: object


In [12]:
vocab = Vocab(surnames["surname"].str.lower())

print(vocab.vocab_len)
print(len(surnames))

55
10980


In [13]:
class SurnamesDataset(Dataset):
  def __init__(self, X, y, vocab: Vocab):
    self.X = X
    # print(len(self.X))
    self.y = y
    self.vocab = vocab
    # print(self.vocab.vocab_len)

  def vectorize(self, surname):
    '''Генерирует представление фамилии surname в при помощи бинарного кодирования (см. 1.2)'''
    # Создаем вектор длины словаря, заполненный нулями
    
    vec = torch.zeros(self.vocab.vocab_len)
    
    # Проходим по каждому слову в фамилии
    # print(f"tokenized surname: {word_tokenize(surname)}")
    for word in word_tokenize(surname):
      # print(word)
      # Если слово есть в словаре, устанавливаем соответствующий бит в векторе
      if word in self.vocab.token_to_idx:
          vec[self.vocab.token_to_idx[word]] = 1
  
    return vec
  
  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    # Получаем векторизованное представление фамилии
    vec = self.vectorize(self.X[idx])

    # Получаем метку класса для фамилии
    label = self.y[idx]
    # print(f"vec, label: {vec, label}")

    return vec, label


In [14]:
class SurnamesDataset(Dataset):
  def __init__(self, X, y, vocab: Vocab):
    self.X = X
    self.y = y
    self.vocab = vocab

  def vectorize(self, surname):
    '''Генерирует представление фамилии surname в при помощи бинарного кодирования (см. 1.2)'''
    surname_t = torch.zeros(self.vocab.vocab_len)
    for token in surname:
      surname_t[self.vocab.token_to_idx[token]] = 1
    return surname_t
    
  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    return self.vectorize(self.X.iloc[idx]), self.y.iloc[idx]

In [15]:
vocab = Vocab(surnames["surname"].str.lower())

train_dataset = SurnamesDataset(X_train, y_train, vocab)
test_dataset = SurnamesDataset(X_test, y_test, vocab)

train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=True)

In [131]:
# class SurnameClassifier(nn.Module):
#     def __init__(self, vocab_size, embedding_dim, output_dim):
#         super().__init__()

#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
#         self.flatten = nn.Flatten()
#         # self.fc = nn.Sequential(
#         #     nn.Linear(embedding_dim * vocab_size, 64),
#         #     nn.BatchNorm1d(64),
#         #     nn.ReLU(),
#         #     nn.Linear(64, output_dim)
#         # )
#         self.fc = nn.Linear(embedding_dim * vocab_size, output_dim)

#     def forward(self, x):
#         embedded = self.embedding(x)
#         # print(embedded, embedded.shape)
#         flattened = self.flatten(embedded)
#         # print(flattened, flattened.shape)
#         logits = self.fc(flattened)
#         return logits
    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train_model(model, train_dataloader, test_dataloader, criterion, optimizer, num_epochs):
    model.to(device)
    train_losses, test_losses = [], []
    

    
    for epoch in range(num_epochs):
        model.train()
        # running_loss = 0.0
        train_loss, test_loss = 0, 0
        for inputs, labels in train_dataloader:
            x = inputs.to(device)
            y = labels.to(device)
            # print(x.dtype)

            
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            train_loss += loss.item()

        # epoch_loss = running_loss / len(train_dataloader.dataset)
        # print(f'Epoch {epoch+1}, Loss: {epoch_loss:.5f}')
    
        # Валидация на val_loader
        model.eval()
        with torch.no_grad():
            for inputs, labels in test_dataloader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                test_loss += loss.item()
        
        train_losses.append(train_loss/len(train_dataloader))
        test_losses.append(test_loss/len(test_dataloader))

        print(f'Epoch {epoch+1}, Train Loss: {train_losses[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}')
    # epoch_loss = 0
    # for X_batch, y_batch in train_dataloader:
    #     predictions = model(X_batch.cuda())
    #     loss = criterion(predictions, y_batch.cuda())
    #     loss.backward()
    #     optimizer.step()
    #     optimizer.zero_grad()
    #     epoch_loss += loss.item()
    
    # with torch.no_grad():
    #     val_loss, val_acc = 0, 0
    #     for X_batch, y_batch in test_dataloader:
    #         predictions = model(X_batch.cuda())
    #         loss = criterion(predictions, y_batch.cuda()).item()
    #         acc = accuracy_score(y_batch, predictions.argmax(dim=1).cpu().detach()).item()
    #         val_loss += loss
    #         val_acc += acc
    #     if num_epochs % 5 == 0:
    #         print(f'#{num_epochs} Training loss: {epoch_loss / len(train_dataloader)} val_loss: {val_loss / len(test_dataloader)}')
        

def evaluate_model(model, dataloader):
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for surnames, labels in dataloader:
            # print(batch["surname"])
            x = surnames.to(device)
            # print(x)
            y = labels.to(device)
            # print(x, y)
            
            # x = torch.LongTensor(x)

            logits = model(x)
            _, predicted = torch.max(logits, 1)
            correct += (predicted == y).sum().item()
            total += y.size(0)

    accuracy = correct / total
    print(f'Test Accuracy: {accuracy:.5f}')

def predict(model, dataset, surname):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    model.eval()
    with torch.no_grad():
        # vectorized, y = next(iter(dataloader))
        vectorized = dataset.vectorize(surname)
        print(vectorized)
        tensor = vectorized.unsqueeze(0).to(device)
        # vectorized = torch.LongTensor(vectorized)
        # print(tensor) 
        logits = model(tensor)
        
        probs = torch.softmax(logits, dim=1).squeeze()
        # print(f"probs: {probs} size: {probs.shape}")
        top3_probs, top3_indices = torch.topk(probs, k=3)
        print(top3_probs, top3_indices)
        # print(f"top 3 probs: {top3_probs}")
        # print(top3_indices)
        # print(top3_indices.detach().cpu().numpy())
        # top3_nationalities = [vocab.idx_to_token[idx.item()] for idx in top3_indices]
        top3_nationalities = _[top3_indices.detach().cpu().numpy()]
        print(f'{surname}: {top3_nationalities[0]} ({top3_probs[0]:.4f}), {top3_nationalities[1]} ({top3_probs[1]:.4f}), {top3_nationalities[2]} ({top3_probs[2]:.4f})')
        

In [17]:
_

Index(['English', 'French', 'Arabic', 'Russian', 'Japanese', 'Chinese',
       'Italian', 'Czech', 'Irish', 'German', 'Greek', 'Spanish', 'Polish',
       'Dutch', 'Vietnamese', 'Korean', 'Portuguese', 'Scottish'],
      dtype='object')

In [43]:
print(vocab.vocab_len)

55


In [19]:
# model = SurnameClassifier(vocab_size=vocab.vocab_len,
#                            embedding_dim=32,
#                            output_dim=len(set(y_train)))
# model = SurnameClassifier(vocab_size=vocab.vocab_len,
#                            output_dim=len(set(y_train)))
model = nn.Sequential(nn.Linear(vocab.vocab_len, 300),
        nn.ReLU(),
        nn.Linear(300, len(set(y_train))))

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

evaluate_model(model, test_dataloader)

Test Accuracy: 0.03051


In [20]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Количество обучаемых параметров: {total_params}") 

Количество обучаемых параметров: 22218


In [23]:
train_model(model, train_dataloader, test_dataloader, criterion, optimizer, 15)

Epoch 1, Train Loss: 1.2121, Test Loss: 1.2772
Epoch 2, Train Loss: 1.1959, Test Loss: 1.2701
Epoch 3, Train Loss: 1.1833, Test Loss: 1.2440
Epoch 4, Train Loss: 1.1775, Test Loss: 1.2553
Epoch 5, Train Loss: 1.1638, Test Loss: 1.2567
Epoch 6, Train Loss: 1.1531, Test Loss: 1.2438
Epoch 7, Train Loss: 1.1424, Test Loss: 1.2323
Epoch 8, Train Loss: 1.1342, Test Loss: 1.2283
Epoch 9, Train Loss: 1.1247, Test Loss: 1.2146
Epoch 10, Train Loss: 1.1169, Test Loss: 1.2114
Epoch 11, Train Loss: 1.1093, Test Loss: 1.2377
Epoch 12, Train Loss: 1.0973, Test Loss: 1.1956
Epoch 13, Train Loss: 1.0910, Test Loss: 1.1952
Epoch 14, Train Loss: 1.0847, Test Loss: 1.1963
Epoch 15, Train Loss: 1.0758, Test Loss: 1.2046


In [24]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.63889


In [28]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=True)
X_batch, y_batch = next(iter(test_loader))
predictions = model(X_batch).argmax(dim=1).cpu().detach()
print(classification_report(y_batch, predictions))

              precision    recall  f1-score   support

           0       0.57      0.74      0.64       567
           1       0.25      0.03      0.05        36
           2       0.78      1.00      0.88       346
           3       0.74      0.78      0.76       482
           4       0.63      0.62      0.62       161
           5       0.45      0.61      0.52        36
           6       0.42      0.39      0.40       108
           7       0.40      0.07      0.12        81
           8       0.64      0.17      0.27        41
           9       0.45      0.32      0.38       118
          10       0.67      0.31      0.43        32
          11       0.58      0.26      0.36        57
          12       0.58      0.44      0.50        25
          13       0.45      0.10      0.17        49
          14       0.00      0.00      0.00        15
          15       0.33      0.20      0.25        15
          16       0.00      0.00      0.00        14
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
predict(model, train_dataset, "kalashnikov")

tensor([0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0.,
        0.])
tensor([0.9951, 0.0026, 0.0018]) tensor([ 3,  7, 10])
kalashnikov: Russian (0.9951), Czech (0.0026), Greek (0.0018)


In [38]:
predict(model, train_dataset, "tugolukov")

tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0.])
tensor([0.9744, 0.0122, 0.0067]) tensor([3, 0, 7])
tugolukov: Russian (0.9744), English (0.0122), Czech (0.0067)


In [41]:
predict(model, train_dataset, "popov")

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
        0.])
tensor([0.5897, 0.1765, 0.0841]) tensor([3, 0, 7])
popov: Russian (0.5897), English (0.1765), Czech (0.0841)


## 3. Классификация обзоров ресторанов

Датасет: https://disk.yandex.ru/d/nY1o70JtAuYa8g

3.1 Считать файл `yelp/raw_train.csv`. Оставить от исходного датасета 10% строчек.

3.2 Воспользоваться функцией `preprocess_text` из 1.1 для обработки текста отзыва. Закодировать рейтинг числами, начиная с 0.

3.3 Разбить датасет на обучающую и тестовую выборку

3.4 Реализовать класс `Vocab` (токен = слово)

3.5 Реализовать класс `ReviewDataset`

3.6 Обучить классификатор

3.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)


In [61]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zonkz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\zonkz\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zonkz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [71]:
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    text = text.lower()
    text = ''.join([' ' if not char.isalpha() and char not in ['.', ',', '!', '?', "'"] else char for char in text])
    
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    preprocessed_text = ' '.join(lemmatized_tokens)
    
    return preprocessed_text

In [72]:
raw_train = pd.read_csv("data/yelp/raw_train.csv", names=["rating", "review"])

raw_train

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...
...,...,...
559995,2,Ryan was as good as everyone on yelp has claim...
559996,2,Professional \nFriendly\nOn time AND affordabl...
559997,1,Phone calls always go to voicemail and message...
559998,1,Looks like all of the good reviews have gone t...


In [77]:
raw_train[21691:21693]

Unnamed: 0,rating,review
21691,1,Horrible horrible horrible! Worst nail place E...
21692,1,I went in her for the first time today for a g...


In [74]:
# выбор 10% случайных строк
raw_train_10 = raw_train.sample(frac=0.005)
# # сохранение выборки в новый файл
# raw_train_10.to_csv('data/raw_train_10.csv', index=False)
raw_train_10["review"] = raw_train_10["review"].apply(lambda x: preprocess_text(x))

In [75]:
raw_train_10

Unnamed: 0,rating,review
147948,2,"amazing food , service we hope you listened to..."
512429,2,it 's cold in here ! and colder in the bathroom !
383534,1,"to start off , the best thing about this place..."
221774,2,shopaholic galore ! ! ! my new favorite store ...
540491,2,"been here a few time , but my last visit wa th..."
...,...,...
268138,1,"the first time i went to sauce , i had a sandw..."
538340,1,so my experience at the sl la vega resort . n ...
34748,1,owner would not provide confirmation number so...
545897,2,the food here is good . the pho is good and we...


In [78]:
raw_train_10['rating'], rating_labels = pd.factorize(raw_train_10['rating'])
raw_train_10

Unnamed: 0,rating,review
147948,0,"amazing food , service we hope you listened to..."
512429,0,it 's cold in here ! and colder in the bathroom !
383534,1,"to start off , the best thing about this place..."
221774,0,shopaholic galore ! ! ! my new favorite store ...
540491,0,"been here a few time , but my last visit wa th..."
...,...,...
268138,1,"the first time i went to sauce , i had a sandw..."
538340,1,so my experience at the sl la vega resort . n ...
34748,1,owner would not provide confirmation number so...
545897,0,the food here is good . the pho is good and we...


In [79]:
rating_labels

Int64Index([2, 1], dtype='int64')

In [80]:
class Vocab:
  def __init__(self, data):
    self.idx_to_token = {}
    self.token_to_idx = {}
    self.vocab_len = 0

    # Получаем список всех слов в данных
    all_words = [word for sentence in data["review"] for word in word_tokenize(sentence)]
    # print(all_words)

    # Строим словарь
    for word in all_words:
        if word not in self.token_to_idx:
            self.idx_to_token[self.vocab_len] = word
            self.token_to_idx[word] = self.vocab_len
            self.vocab_len += 1

vocab = Vocab(raw_train_10)
vocab.vocab_len

15079

In [83]:
vocab.idx_to_token

{0: 'amazing',
 1: 'food',
 2: ',',
 3: 'service',
 4: 'we',
 5: 'hope',
 6: 'you',
 7: 'listened',
 8: 'to',
 9: 'u',
 10: 'blue',
 11: 'and',
 12: 'are',
 13: 'checking',
 14: 'this',
 15: 'out',
 16: '!',
 17: 'drink',
 18: '.',
 19: 'our',
 20: 'party',
 21: 'of',
 22: 'four',
 23: 'arrived',
 24: 'for',
 25: 'reservation',
 26: 'went',
 27: 'the',
 28: 'bar',
 29: 'until',
 30: 'were',
 31: 'seated',
 32: 'if',
 33: 'like',
 34: 'a',
 35: 'lemon',
 36: 'drop',
 37: 'try',
 38: 'lemonhead',
 39: 'martini',
 40: 'serving',
 41: 'plenty',
 42: 'on',
 43: "'tini",
 44: "'s",
 45: 'did',
 46: 'kobe',
 47: 'beef',
 48: 'surf',
 49: 'turf',
 50: 'tasting',
 51: 'menu',
 52: 'option',
 53: 'since',
 54: 'had',
 55: 'prima',
 56: 'card',
 57: 'it',
 58: 'lowered',
 59: 'price',
 60: 'person',
 61: 'love',
 62: 'that',
 63: 'all',
 64: 'still',
 65: 'full',
 66: 'from',
 67: 'last',
 68: 'night',
 69: 'portion',
 70: 'amount',
 71: 'wa',
 72: 'pretty',
 73: 'i',
 74: 'am',
 75: 'picky',
 76

In [123]:
class ReviewDataset(Dataset):
  def __init__(self, X, y, vocab: Vocab):
    self.X = X
    self.y = y
    self.vocab = vocab

  def vectorize(self, review):
    '''Генерирует представление отзыва review при помощи бинарного кодирования (см. 1.2)'''
    # vec = torch.zeros(self.vocab.vocab_len, dtype=torch.long)
    vec = torch.zeros(self.vocab.vocab_len)
    
    # Проходим по каждому слову в фамилии
    # print(f"tokenized surname: {word_tokenize(surname)}")
    for word in word_tokenize(review):
      # print(word)
      # Если слово есть в словаре, устанавливаем соответствующий бит в векторе
      if word in self.vocab.token_to_idx:
          vec[self.vocab.token_to_idx[word]] = 1
  
    return vec
    
  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    vec = self.vectorize(self.X[idx])
    label = self.y[idx]
    return vec, label
  

In [124]:
X_train, X_test, y_train, y_test = train_test_split(raw_train_10['review'].to_numpy(), raw_train_10['rating'].to_numpy(), test_size=0.2)

In [125]:
# vocab = Vocab(surnames)

train_dataset = ReviewDataset(X_train, y_train, vocab)
test_dataset = ReviewDataset(X_test, y_test, vocab)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [126]:
model = nn.Sequential(
    nn.Linear(vocab.vocab_len, 1024),
    nn.ReLU(),
    nn.Linear(1024, 2),
    nn.LogSoftmax(dim=1),
)
    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [127]:
vocab.vocab_len

15079

In [128]:
len(set(y_train))

2

In [129]:
# vocab_size = len(vocab.token_to_idx)
# embedding_dim = 8
# hidden_dim = 16
# output_dim = 2
# n_layers = 2
# bidirectional = True
# dropout = 0.5

# # Создание экземпляра модели
# model = ReviewClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout)
# model = ReviewClassifier(vocab_size=vocab.vocab_len,
#                            embedding_dim=4,
#                            output_dim=len(set(y_train)))

# criterion = nn.CrossEntropyLoss()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# evaluate_model(model, test_dataloader)
model.to(device)

Sequential(
  (0): Linear(in_features=15079, out_features=1024, bias=True)
  (1): ReLU()
  (2): Linear(in_features=1024, out_features=2, bias=True)
  (3): LogSoftmax(dim=1)
)

In [130]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Количество обучаемых параметров: {total_params}")

Количество обучаемых параметров: 15443970


In [132]:
train_model(model, train_dataloader, test_dataloader, criterion, optimizer, 5)

Epoch 1, Train Loss: 0.4042, Test Loss: 0.3598
Epoch 2, Train Loss: 0.0718, Test Loss: 0.5117
Epoch 3, Train Loss: 0.0162, Test Loss: 0.6241
Epoch 4, Train Loss: 0.0036, Test Loss: 0.7178
Epoch 5, Train Loss: 0.0015, Test Loss: 0.7421


In [143]:
rating_labels = ["Positive", "Negative"]

def predict(model, dataset, review):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    model.eval()
    with torch.no_grad():
        vectorized = dataset.vectorize(review)
        tensor = vectorized.unsqueeze(0).to(device)
        logits = model(tensor)
        probs = torch.softmax(logits, dim=1).squeeze()
        print(probs)
        print(f'{rating_labels[probs.argmax()]} ({probs.max():.4f}), {rating_labels[probs.argmin()]} ({probs.min():.4f}) \n{review}')
        

In [134]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.84107


In [135]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=True)
X_batch, y_batch = next(iter(test_loader))
predictions = model(X_batch).argmax(dim=1).cpu().detach()
print(classification_report(y_batch, predictions))

              precision    recall  f1-score   support

           0       0.87      0.82      0.85       297
           1       0.81      0.86      0.84       263

    accuracy                           0.84       560
   macro avg       0.84      0.84      0.84       560
weighted avg       0.84      0.84      0.84       560



In [144]:
predict(model, train_dataset, "I had a terrible experience at this restaurant. The staff was rude and the food was overpriced for the quality.")

tensor([2.3120e-05, 9.9998e-01])
Negative (1.0000), Positive (0.0000) 
I had a terrible experience at this restaurant. The staff was rude and the food was overpriced for the quality.


In [137]:
predict(model, train_dataset, "This restaurant is simply amazing! The food is delicious and the service is outstanding.")

tensor([9.9990e-01, 9.8094e-05])
Positive (0.9999), Negative (0.0001) 
This restaurant is simply amazing! The food is delicious and the service is outstanding.


In [138]:
predict(model, train_dataset, "The menu at this restaurant is very limited and the food is nothing special. I wouldn't go back.")

tensor([0.0057, 0.9943])
Negative (0.9943), Positive (0.0057) 
The menu at this restaurant is very limited and the food is nothing special. I wouldn't go back.


In [139]:
predict(model, train_dataset, "I can't say enough good things about this restaurant. It's the perfect place for a romantic dinner or a night out with friends.")

tensor([0.9809, 0.0191])
Positive (0.9809), Negative (0.0191) 
I can't say enough good things about this restaurant. It's the perfect place for a romantic dinner or a night out with friends.


In [140]:
predict(model, train_dataset, "I had the best dining experience in this restaurant. The ambiance is perfect and the staff is very friendly.")

tensor([9.9954e-01, 4.5701e-04])
Positive (0.9995), Negative (0.0005) 
I had the best dining experience in this restaurant. The ambiance is perfect and the staff is very friendly.
