In [257]:
import re
import nltk

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.vocab import build_vocab_from_iterator


import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from sklearn.preprocessing import LabelEncoder
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords


from tqdm.auto import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kirillanpilov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kirillanpilov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [20]:
# cd drive/MyDrive/datasets

## Функция обучения

In [21]:
def train(model, optimizer, criterion, n_epochs, train_loader, test_loader):

  loss_train = []
  accuracy_train = []

  for epoch in range(n_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f"Training epoch {epoch + 1}/{n_epochs}"):
        inputs = batch["input"]
        labels = batch["label"]
        output = model(inputs)

        loss = criterion(output, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()

    correct = 0
    total = 0

    with torch.no_grad():
      for batch in tqdm(test_loader, desc=f"Testing epoch {epoch + 1}/{n_epochs}"):
        inputs = batch["input"]
        labels = batch["label"]
        output = model(inputs)
        _, predicted = torch.max(output.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    test_accuracy = correct / total
    accuracy_train.append(test_accuracy)
    print('Epoch [{}/{}], Loss: {:.4f}, Test Accuracy: {:.2f}%'.format(epoch + 1, n_epochs, loss.item(), test_accuracy * 100))
    loss_train.append(loss.item())


## 1. Представление и предобработка текстовых данных в виде последовательностей

__Index-Based Encoding__

1.1 Представьте первое предложение из строки `text` как последовательность из индексов слов, входящих в это предложение

In [22]:
def sen2token(text):
    text =  sent_tokenize(text.lower())[0]
    vocab = {}
    tokenizer = 0
    for word in set(word_tokenize(text)):
        vocab[word] = tokenizer
        tokenizer +=1
    sen = []
    for word in word_tokenize(text):
        sen.append(vocab[word])
        
    return sen, word_tokenize(text)
        
        
        

In [23]:
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

In [24]:
sen2token(text)

([8, 5, 6, 3, 7, 1, 0, 2, 4],
 ['select',
  'your',
  'preferences',
  'and',
  'run',
  'the',
  'install',
  'command',
  '.'])

__One-hot encoding__

1.2 Представьте первое предложение из строки `text` как последовательность векторов, соответствующих индексам слов. Для представления индекса в виде вектора используйте унитарное кодирование. В результате должен получиться двумерный тензор размера `количество слов в предложении` x `количество уникальных слов`

In [25]:
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

In [26]:
words = word_tokenize(sent_tokenize(text.lower())[0])
word_to_index = {word: idx for idx, word in enumerate(set(words))}
num_unique_words = len(word_to_index)

x = np.zeros((len(words) , len(word_to_index)))
for t, word in enumerate(words):
    x[t, word_to_index[word]] = 1
x

array([[0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.]])

1.3 Решите задачу 1.2, используя модуль `nn.Embedding`

In [27]:
words = word_tokenize(sent_tokenize(text.lower())[0])

word_to_index = {word: idx for idx, word in enumerate(set(words))}

num_unique_words = len(word_to_index)

indices = [word_to_index[word] for word in words]

embedding = nn.Embedding(num_embeddings=num_unique_words, embedding_dim=num_unique_words)

indices_tensor = torch.tensor(indices, dtype=torch.long)

embedded_vectors = embedding(indices_tensor)

print("Оригинальные индексы:", indices)
print("Векторы после унитарного кодирования:")
print(embedded_vectors)


Оригинальные индексы: [8, 5, 6, 3, 7, 1, 0, 2, 4]
Векторы после унитарного кодирования:
tensor([[ 0.8850,  0.7557,  0.8160,  0.6931, -1.1680,  0.7967,  0.4191, -0.4596,
         -1.8553],
        [ 0.1390, -0.3619,  1.1786,  1.2452, -0.2284,  0.1632,  0.9162, -0.1648,
          1.4999],
        [ 0.3629,  0.1482,  0.5693, -0.9089,  0.3316, -1.0567,  0.9797,  0.4425,
         -1.5213],
        [ 0.6687,  0.3736,  2.6836, -0.2059,  0.0832,  0.1522, -0.1385,  0.1272,
          0.6342],
        [ 0.7558,  0.1986,  0.0895,  1.2932, -0.5184, -1.2864, -1.3605,  0.1701,
          0.2454],
        [ 0.9229,  0.7865,  0.0108, -0.5037,  0.2341, -1.9598,  0.6919,  0.3401,
         -1.4779],
        [ 0.6965,  0.7170, -0.1202, -0.0740, -0.4741,  1.2247, -0.6664, -0.6920,
         -0.3381],
        [ 0.3810,  2.6529,  0.5440,  0.4022, -0.7139, -0.0984, -0.2782, -0.2886,
          1.1290],
        [ 0.0547, -0.3211,  1.0114, -0.3402,  1.3920, -1.3113, -0.8876, -2.5127,
         -0.9569]], grad_fn=<Em

## 2. Классификация фамилий по национальности (ConvNet)

Датасет: https://disk.yandex.ru/d/owHew8hzPc7X9Q?w=1

### 2.1 Считать файл `surnames/surnames.csv`. 

In [28]:
surnames_df = pd.read_csv('surnames.csv')

In [29]:
surnames_df.head()

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [30]:
surnames_df['surname'] = surnames_df['surname'].str.lower()

In [31]:
surnames_df.head()

Unnamed: 0,surname,nationality
0,woodford,English
1,coté,French
2,kore,English
3,koury,Arabic
4,lebzak,Russian


In [32]:
surnames_df['nationality'].value_counts()

nationality
English       2972
Russian       2373
Arabic        1603
Japanese       775
Italian        600
German         576
Czech          414
Spanish        258
Dutch          236
French         229
Chinese        220
Irish          183
Greek          156
Polish         120
Korean          77
Scottish        75
Vietnamese      58
Portuguese      55
Name: count, dtype: int64

### 2.2 Закодировать национальности числами, начиная с 0.

In [33]:
le = LabelEncoder()
le.fit(surnames_df['nationality'])
surnames_df['nationality'] = le.transform(surnames_df['nationality'])
surnames_df.head()

Unnamed: 0,surname,nationality
0,woodford,4
1,coté,5
2,kore,4
3,koury,0
4,lebzak,14


In [34]:
print(le.classes_)

['Arabic' 'Chinese' 'Czech' 'Dutch' 'English' 'French' 'German' 'Greek'
 'Irish' 'Italian' 'Japanese' 'Korean' 'Polish' 'Portuguese' 'Russian'
 'Scottish' 'Spanish' 'Vietnamese']


### 2.3 Разбить датасет на обучающую и тестовую выборку


In [35]:
X_train, X_test, y_train, y_test = train_test_split(surnames_df['surname'], surnames_df['nationality'] , test_size=0.25, random_state=42)

### 2.4 Реализовать класс `Vocab` (токен = __символ__)
  * добавьте в словарь специальный токен `<PAD>` с индексом 0
  * при создании словаря сохраните длину самой длинной последовательности из набора данных в виде атрибута `max_seq_len`

In [36]:
class Vocab:
    def __init__(self, data: pd.Series):
        self.data = data

        self.vocab = build_vocab_from_iterator(data, specials=["<pad>", "<unk>"], min_freq = 5)

        self.token_to_idx = self.vocab.get_stoi()
        self.idx_to_token = {token: symbol for symbol, token  in self.token_to_idx.items()}
        self.vocab_len = len(self.token_to_idx)

        self.max_seq_len = self.data.str.len().max()
        self.pad_id = self.token_to_idx['<pad>']
        self.unk_id = self.token_to_idx['<unk>']        





### 2.5 Реализовать класс `SurnamesDataset`
  * метод `__getitem__` возвращает пару: <последовательность индексов токенов (см. 1.1 ), номер класса> 
  * длина каждой такой последовательности должна быть одинаковой и равной `vocab.max_seq_len`. Чтобы добиться этого, дополните последовательность справа индексом токена `<PAD>` до нужной длины

class Dataset for Index-Based encoding

In [46]:
class SurnamesDatasetIndex(Dataset):
    def __init__(self, X, y, vocab: Vocab):
        self.X = X
        self.y = y
        self.vocab = vocab

    def vectorize(self, surname):
        surname_tensor = torch.zeros(self.vocab.max_seq_len)

        for i, symbol in enumerate(surname):
            if symbol in self.vocab.token_to_idx:
                surname_tensor[i] = self.vocab.token_to_idx[symbol]
            else:
                surname_tensor[i] = self.vocab.unk_id
                
        for i in range(len(surname), self.vocab.max_seq_len):
            surname_tensor[i] = self.vocab.pad_id

        return surname_tensor

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return {"input" : self.vectorize(self.X[idx]) , "label" : self.y[idx]}

class Dataset for One-Hot encoding

In [62]:
class SurnamesDatasetOneHot(Dataset):
    def __init__(self, X, y, vocab: Vocab):
        self.X = X
        self.y = y
        self.vocab = vocab

    def vectorize(self, surname):
        surname_tensor = torch.zeros(self.vocab.max_seq_len , len(self.vocab.token_to_idx))
        
        for i, symbol in enumerate(surname): 
            if symbol in self.vocab.token_to_idx:
                surname_tensor[i, self.vocab.token_to_idx[symbol]] = 1
            else:
                surname_tensor[i, self.vocab.unk_id] = 1

        return surname_tensor

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return {"input" : self.vectorize(self.X[idx]) , "label" : self.y[idx]}

### 2.6. Обучить классификатор.
  
  * Для преобразования последовательности индексов в последовательность векторов используйте `nn.Embedding`. Рассмотрите два варианта: 
    - когда токен представляется в виде унитарного вектора и модуль `nn.Embedding` не обучается
    - когда токен представляется в виде вектора небольшой размерности (меньше, чем размер словаря) и модуль `nn.Embedding` обучается

  * Используйте одномерные свертки и пулинг (`nn.Conv1d`, `nn.MaxPool1d`)
    - обратите внимание, что `nn.Conv1d` ожидает на вход трехмерный тензор размерности `(batch, embedding_dim, seq_len)`

Vocab

In [63]:
vocab = Vocab(surnames_df['surname'])
vocab.token_to_idx

{'ä': 36,
 'ß': 37,
 'ü': 34,
 'ó': 33,
 'í': 31,
 'x': 28,
 'j': 25,
 'z': 24,
 'c': 19,
 'p': 22,
 'l': 9,
 'v': 16,
 'é': 30,
 'q': 26,
 'n': 6,
 't': 11,
 'y': 20,
 'g': 18,
 'ú': 40,
 '-': 38,
 'à': 35,
 'm': 13,
 'k': 12,
 'ö': 29,
 'f': 21,
 's': 8,
 'o': 4,
 'h': 10,
 'á': 32,
 'b': 17,
 'u': 14,
 'd': 15,
 'e': 3,
 '<unk>': 1,
 'r': 7,
 '<pad>': 0,
 "'": 27,
 'w': 23,
 'i': 5,
 'ñ': 39,
 'a': 2}

In [64]:
surnames_dataset_train_index = SurnamesDatasetIndex(X_train.values, y_train.values, vocab)
surnames_dataset_test_index = SurnamesDatasetIndex(X_test.values, y_test.values, vocab)

print("Index encoding" , surnames_dataset_train_index[5])

surnames_dataset_train_onehot = SurnamesDatasetOneHot(X_train.values, y_train.values, vocab)
surnames_dataset_test_onehot = SurnamesDatasetOneHot(X_test.values, y_test.values, vocab)

print("One-Hot encoding" , surnames_dataset_train_onehot[5])


surnames_dataloader_train_index = DataLoader(surnames_dataset_train_index, batch_size = 64, shuffle= True)
surnames_dataloader_test_index = DataLoader(surnames_dataset_test_index, shuffle= True)

surnames_dataloader_train_onehot = DataLoader(surnames_dataset_train_onehot, batch_size = 64, shuffle= True)
surnames_dataloader_test_onehot = DataLoader(surnames_dataset_test_onehot, shuffle= True)

Index encoding {'input': tensor([5., 8., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'label': 0}
One-Hot encoding {'input': tensor([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         

In [157]:
class SurnameClassifierCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, use_fixed_embedding=False):
        super(SurnameClassifierCNN, self).__init__()

        self.use_fixed_embedding = use_fixed_embedding

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        
        if self.use_fixed_embedding:
            self.embedding.weight.requires_grad = False  
            self.conv1d = nn.Conv1d(in_channels=embedding_dim, out_channels=100, kernel_size=3)

        else:
            self.embedding.weight.requires_grad = True
            self.conv1d = nn.Conv1d(in_channels=2050, out_channels=100, kernel_size=3)

    
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.fc = nn.Linear(700, output_dim)

    def forward(self, text):
        text = text.long()
        embedded = self.embedding(text)

        if self.use_fixed_embedding == False:
            embedded = embedded.view(embedded.shape[0],embedded.shape[1],-1)


        embedded = embedded.permute(0, 2, 1)
        conv_output = self.conv1d(embedded)
        pooled = self.pool(conv_output)
        pooled = pooled.view(pooled.size(0), -1)
        output = self.fc(pooled)
        return output

In [151]:
vocab_size = surnames_dataloader_train_index.dataset.vocab.vocab_len  # Размер словаря
embedding_dim = 50  
output_dim = len(le.classes_)
n_epochs = 10

model1 = SurnameClassifierCNN(vocab_size, embedding_dim, output_dim, use_fixed_embedding=True)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model1.parameters())

In [144]:
train(model1, optimizer, criterion, n_epochs, surnames_dataloader_train_index, surnames_dataloader_test_index)

Training epoch 1/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 1/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [1/10], Loss: 1.2507, Test Accuracy: 61.53%


Training epoch 2/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 2/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [2/10], Loss: 1.2501, Test Accuracy: 68.20%


Training epoch 3/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 3/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [3/10], Loss: 0.8472, Test Accuracy: 70.20%


Training epoch 4/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 4/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [4/10], Loss: 0.9046, Test Accuracy: 72.28%


Training epoch 5/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 5/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [5/10], Loss: 0.7957, Test Accuracy: 73.77%


Training epoch 6/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 6/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [6/10], Loss: 0.8255, Test Accuracy: 73.73%


Training epoch 7/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 7/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [7/10], Loss: 0.7700, Test Accuracy: 74.24%


Training epoch 8/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 8/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [8/10], Loss: 0.5425, Test Accuracy: 74.72%


Training epoch 9/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 9/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [9/10], Loss: 0.4076, Test Accuracy: 74.68%


Training epoch 10/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 10/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [10/10], Loss: 0.6366, Test Accuracy: 74.61%


In [158]:
vocab_size = surnames_dataloader_train_onehot.dataset.vocab.vocab_len  # Размер словаря
embedding_dim = 50  
output_dim = len(le.classes_)
n_epochs = 10

model2 = SurnameClassifierCNN(vocab_size, embedding_dim, output_dim, use_fixed_embedding=False)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model2.parameters())

In [159]:
train(model2, optimizer, criterion, n_epochs, surnames_dataloader_train_onehot, surnames_dataloader_test_onehot)

Training epoch 1/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 1/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [1/10], Loss: 1.0962, Test Accuracy: 68.89%


Training epoch 2/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 2/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [2/10], Loss: 0.6580, Test Accuracy: 73.44%


Training epoch 3/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 3/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [3/10], Loss: 0.5410, Test Accuracy: 75.01%


Training epoch 4/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 4/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [4/10], Loss: 0.4221, Test Accuracy: 75.88%


Training epoch 5/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 5/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [5/10], Loss: 0.4736, Test Accuracy: 75.99%


Training epoch 6/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 6/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [6/10], Loss: 0.3475, Test Accuracy: 75.66%


Training epoch 7/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 7/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [7/10], Loss: 0.5451, Test Accuracy: 75.81%


Training epoch 8/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 8/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [8/10], Loss: 0.4891, Test Accuracy: 75.08%


Training epoch 9/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 9/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [9/10], Loss: 0.2207, Test Accuracy: 75.12%


Training epoch 10/10:   0%|          | 0/129 [00:00<?, ?it/s]

Testing epoch 10/10:   0%|          | 0/2745 [00:00<?, ?it/s]

Epoch [10/10], Loss: 0.3876, Test Accuracy: 74.57%


### 2.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: прогнать несколько фамилий студентов группы через модели и проверить результат. Для каждой фамилии выводить 3 наиболее вероятных предсказания.

In [161]:
model2.eval()

test_surnames = ['anpilov', 'kuznetsova', 'aws', 'kyserbaev']

for surname in test_surnames:
    input_data = surnames_dataset_train_onehot.vectorize(surname).unsqueeze(0)
    output = model2(input_data)
    _, predicted = torch.topk(output, 3)  # Получаем три наиболее вероятных предсказания
    predicted_labels = [le.classes_[idx] for idx in predicted.squeeze().tolist()]
    
    print(f"Input: {surname}, Predicted labels: {predicted_labels}")


Input: anpilov, Predicted labels: ['Russian', 'English', 'Czech']
Input: kuznetsova, Predicted labels: ['Russian', 'Czech', 'Dutch']
Input: aws, Predicted labels: ['German', 'Chinese', 'Arabic']
Input: kyserbaev, Predicted labels: ['Russian', 'Czech', 'English']


## 3. Классификация обзоров на фильмы (ConvNet)

Датасет: https://disk.yandex.ru/d/tdinpb0nN_Dsrg

### 3.1 Создайте набор данных на основе файлов polarity/positive_reviews.csv (положительные отзывы) и polarity/negative_reviews.csv (отрицательные отзывы). 

* Разбейте на обучающую и тестовую выборку.
* токен = слово
* данные для обучения в датасете представляются в виде последовательности индексов токенов
* словарь создается на основе только обучающей выборки. Для корректной обработки ситуаций, когда в тестовой выборке встретится токен, который не хранится в словаре, добавьте в словарь специальный токен <UNK>
* добавьте предобработку текста

почему то в файлах противоположные отзывы

In [186]:
df1 = pd.read_csv('positive_reviews.txt', delimiter='\t',  names =  ['review', 'rating'] )
df1['rating'] = 0
df2 = pd.read_csv('negative_reviews.txt', delimiter='\t', names =  ['review', "rating"] )
df2['rating'] = 1


df_reviews = pd.concat([df1, df2], axis=0, ignore_index=True)

In [187]:
df_reviews

Unnamed: 0,review,rating
0,"simplistic , silly and tedious .",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
...,...,...
10657,both exuberantly romantic and serenely melanch...,1
10658,mazel tov to a film about a family's joyous li...,1
10659,standing in the shadows of motown is the best ...,1
10660,it's nice to see piscopo again after all these...,1


In [188]:
def preprocess_review(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z ]', '', text)
    text = re.sub(r'\s', ' ', text) 
    text = re.sub(r'\s{2,}', ' ', text)

    return text

In [189]:
df_reviews.iloc[500]

review     . . . a weak and ineffective ghost story with...
rating                                                    0
Name: 500, dtype: object

In [190]:
df_reviews['review'] = df_reviews['review'].map(lambda x : preprocess_review(x))
df_reviews['review'] = df_reviews['review'].map(lambda x : word_tokenize(x))

In [191]:
df_reviews['review'].to_list()

[['simplistic', 'silly', 'and', 'tedious'],
 ['its',
  'so',
  'laddish',
  'and',
  'juvenile',
  'only',
  'teenage',
  'boys',
  'could',
  'possibly',
  'find',
  'it',
  'funny'],
 ['exploitative',
  'and',
  'largely',
  'devoid',
  'of',
  'the',
  'depth',
  'or',
  'sophistication',
  'that',
  'would',
  'make',
  'watching',
  'such',
  'a',
  'graphic',
  'treatment',
  'of',
  'the',
  'crimes',
  'bearable'],
 ['garbus',
  'discards',
  'the',
  'potential',
  'for',
  'pathological',
  'study',
  'exhuming',
  'instead',
  'the',
  'skewed',
  'melodrama',
  'of',
  'the',
  'circumstantial',
  'situation'],
 ['a',
  'visually',
  'flashy',
  'but',
  'narratively',
  'opaque',
  'and',
  'emotionally',
  'vapid',
  'exercise',
  'in',
  'style',
  'and',
  'mystification'],
 ['the',
  'story',
  'is',
  'also',
  'as',
  'unoriginal',
  'as',
  'they',
  'come',
  'already',
  'having',
  'been',
  'recycled',
  'more',
  'times',
  'than',
  'id',
  'care',
  'to',
  '

In [198]:
def remove_stopwords(words):
    stop_words = set(stopwords.words('english')) 
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return filtered_words

In [199]:
df_reviews['review'] = df_reviews['review'].map(lambda x : remove_stopwords(x))

In [200]:
X_train, X_test, y_train, y_test = train_test_split(df_reviews['review'], df_reviews['rating'] , shuffle=True, test_size=0.25, random_state=42)

### класс `Vocab` (токен = __слово__)

In [225]:
class Vocab:
    def __init__(self, data: pd.Series):
        self.data =  data
        self.all_words = [word for sublist in df_reviews['review'].values for word in sublist]


        self.token_to_idx = self.create_vocab( self.all_words)
        self.idx_to_token = {token: symbol for symbol, token  in self.token_to_idx.items()}
        self.vocab_len = len(self.token_to_idx)

        self.max_seq_len = max(data.apply(len))
        self.pad_id = self.token_to_idx['<pad>']
        self.unk_id = self.token_to_idx['<unk>']

    def create_vocab(self, data):
        vocab = {"<pad>" : 0 , "<unk>" : 1}
        tokenizer = 2
        for word in set(data):
            if word not in vocab:
                vocab[word] = tokenizer
                tokenizer += 1
        
        return vocab





### класс `ReviewDataset`

In [232]:
class ReviewDataset(Dataset):
    def __init__(self, X, y, vocab: Vocab):
        self.X = X
        self.y = y
        self.vocab = vocab
        
    def vectorize(self, review):
        review_tensor = torch.zeros(self.vocab.max_seq_len)

        for i, word in enumerate(review):
            if word in self.vocab.token_to_idx:
                review_tensor[i] = self.vocab.token_to_idx[word]
            else:
                review_tensor[i] = self.vocab.unk_id
                
        for i in range(len(review), self.vocab.max_seq_len):
            review_tensor[i] = self.vocab.pad_id

        return review_tensor

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return {"input" : self.vectorize(self.X[idx]) , "label" : self.y[idx]}

In [233]:
df_reviews['review']

0                             [simplistic, silly, tedious]
1        [laddish, juvenile, teenage, boys, could, poss...
2        [exploitative, largely, devoid, depth, sophist...
3        [garbus, discards, potential, pathological, st...
4        [visually, flashy, narratively, opaque, emotio...
                               ...                        
10657    [exuberantly, romantic, serenely, melancholy, ...
10658    [mazel, tov, film, familys, joyous, life, acti...
10659    [standing, shadows, motown, best, kind, docume...
10660    [nice, see, piscopo, years, chaykin, headly, p...
10661    [provides, porthole, noble, trembling, incoher...
Name: review, Length: 10662, dtype: object

In [234]:
vocab = Vocab(df_reviews['review'])
vocab.token_to_idx

{'<pad>': 0,
 '<unk>': 1,
 'solondz': 2,
 'salaries': 3,
 'die': 4,
 'blatantly': 5,
 'afflicts': 6,
 'kaufmans': 7,
 'thus': 8,
 'offputting': 9,
 'stepmom': 10,
 'tinny': 11,
 'lucks': 12,
 'sand': 13,
 'observacin': 14,
 'untrained': 15,
 'waiting': 16,
 'swim': 17,
 'aspire': 18,
 'vistas': 19,
 'clonegag': 20,
 'resolute': 21,
 'idiomas': 22,
 'sadism': 23,
 'clericks': 24,
 'practiced': 25,
 'connections': 26,
 'flakiness': 27,
 'sprung': 28,
 'coburn': 29,
 'moviegoer': 30,
 'themed': 31,
 'shoeloving': 32,
 'specialinterest': 33,
 'willis': 34,
 'telegraphed': 35,
 'cutesy': 36,
 'sports': 37,
 'tiros': 38,
 'camouflaged': 39,
 'labor': 40,
 'relocation': 41,
 'reworked': 42,
 'hussein': 43,
 'wherever': 44,
 'customers': 45,
 'dubious': 46,
 'bathing': 47,
 'liberated': 48,
 'verse': 49,
 'geographical': 50,
 'holeridden': 51,
 'sleaze': 52,
 'damon': 53,
 'muito': 54,
 'maud': 55,
 'roundrobin': 56,
 'services': 57,
 'heroines': 58,
 'justine': 59,
 'bruckheimeresque': 60,
 '

In [235]:
X_train.values[1]

['film', 'suffers', 'many', 'excesses']

In [236]:
review_dataset_train = ReviewDataset(X_train.values, y_train.values, vocab)
review_dataset_test= ReviewDataset(X_test.values, y_test.values, vocab)

In [237]:
review_dataloader_train = DataLoader(review_dataset_train, batch_size = 64, shuffle= True)
review_dataloader_test = DataLoader(review_dataset_test, shuffle= True)

In [238]:
review_dataset_train[10]

{'input': tensor([ 6765., 16564.,  4092., 16876., 17112.,  5790.,  2197.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.]),
 'label': 1}

### 3.2. Обучите классификатор.
  
  * Для преобразования последовательности индексов в последовательность векторов используйте `nn.Embedding` 
    - подберите адекватную размерность вектора эмбеддинга: 
    - модуль `nn.Embedding` обучается

  * Используйте одномерные свертки и пулинг (`nn.Conv1d`, `nn.MaxPool1d`)
    - обратите внимание, что `nn.Conv1d` ожидает на вход трехмерный тензор размерности `(batch, embedding_dim, seq_len)`

In [261]:
class ReviewClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes):
        super(ReviewClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv1d = nn.Conv1d(in_channels=embedding_dim, out_channels=64, kernel_size=3)
        self.pool = nn.MaxPool1d(kernel_size= 3)
        self.fc1 = nn.Linear(768, 128)
        self.fc2 = nn.Linear(128, 16)
        self.fc3 = nn.Linear(16, num_classes)

        self.tang = nn.Tanh()
        self.dropout = nn.Dropout(p = 0.25)



    def forward(self, x):
        x = x.long()
        embedded = self.embedding(x)
        conv_out = self.conv1d(embedded.permute(0, 2, 1))
        pooled = self.pool(conv_out)
        pooled_flat = pooled.view(x.size(0), -1)
        output = self.dropout(self.fc1(pooled_flat))
        output = self.dropout(self.fc2(output))
        output = F.softmax(self.fc3(output), dim=1) 
        return output


        return output


In [262]:
vocab_size = review_dataloader_train.dataset.vocab.vocab_len  
embedding_dim = 500  
num_classes = 2
n_epochs = 15


model = ReviewClassifier(vocab_size, embedding_dim, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [263]:
train(model, optimizer, criterion, n_epochs, review_dataloader_train, review_dataloader_test)

Training epoch 1/15:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 1/15:   0%|          | 0/2666 [00:00<?, ?it/s]

Epoch [1/15], Loss: 0.7243, Test Accuracy: 55.51%


Training epoch 2/15:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 2/15:   0%|          | 0/2666 [00:00<?, ?it/s]

Epoch [2/15], Loss: 0.6221, Test Accuracy: 66.65%


Training epoch 3/15:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 3/15:   0%|          | 0/2666 [00:00<?, ?it/s]

Epoch [3/15], Loss: 0.4168, Test Accuracy: 64.67%


Training epoch 4/15:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 4/15:   0%|          | 0/2666 [00:00<?, ?it/s]

Epoch [4/15], Loss: 0.3279, Test Accuracy: 66.80%


Training epoch 5/15:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 5/15:   0%|          | 0/2666 [00:00<?, ?it/s]

Epoch [5/15], Loss: 0.3798, Test Accuracy: 65.87%


Training epoch 6/15:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 6/15:   0%|          | 0/2666 [00:00<?, ?it/s]

Epoch [6/15], Loss: 0.3350, Test Accuracy: 65.57%


Training epoch 7/15:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 7/15:   0%|          | 0/2666 [00:00<?, ?it/s]

Epoch [7/15], Loss: 0.3735, Test Accuracy: 64.70%


Training epoch 8/15:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 8/15:   0%|          | 0/2666 [00:00<?, ?it/s]

Epoch [8/15], Loss: 0.3474, Test Accuracy: 66.43%


Training epoch 9/15:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 9/15:   0%|          | 0/2666 [00:00<?, ?it/s]

Epoch [9/15], Loss: 0.3349, Test Accuracy: 66.24%


Training epoch 10/15:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 10/15:   0%|          | 0/2666 [00:00<?, ?it/s]

Epoch [10/15], Loss: 0.3134, Test Accuracy: 67.07%


Training epoch 11/15:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 11/15:   0%|          | 0/2666 [00:00<?, ?it/s]

Epoch [11/15], Loss: 0.3964, Test Accuracy: 65.79%


Training epoch 12/15:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 12/15:   0%|          | 0/2666 [00:00<?, ?it/s]

Epoch [12/15], Loss: 0.3525, Test Accuracy: 66.43%


Training epoch 13/15:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 13/15:   0%|          | 0/2666 [00:00<?, ?it/s]

Epoch [13/15], Loss: 0.3291, Test Accuracy: 66.13%


Training epoch 14/15:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 14/15:   0%|          | 0/2666 [00:00<?, ?it/s]

Epoch [14/15], Loss: 0.3469, Test Accuracy: 66.99%


Training epoch 15/15:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 15/15:   0%|          | 0/2666 [00:00<?, ?it/s]

Epoch [15/15], Loss: 0.3133, Test Accuracy: 66.39%


### 3.3 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)
* Целевое значение accuracy на валидации - 70+%

In [269]:
model.eval()

test_review = ['love love love love ', 'i hate hate hate you and your product']

for review in test_review:
    input_data = review_dataset_train.vectorize(review).unsqueeze(0)
    output = model(input_data)
    _, predicted = torch.max(output.data, 1)
    print(f"Input: {review}, Predicted labels: {predicted}")


Input: love love love love , Predicted labels: tensor([0])
Input: i hate hate hate you and your product, Predicted labels: tensor([0])
