### 7. Рекурентные сети для обработки последовательностей
Попробуйте обучить нейронную сеть GRU/LSTM для предсказания сентимента сообщений с твитера на примере https://www.kaggle.com/datasets/arkhoshghalb/twitter-sentiment-analysis-hatred-speech

Опишите, какой результат вы получили? Что помогло вам улучшить ее точность?

In [1]:
!pip install stop-words

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install textblob

Defaulting to user installation because normal site-packages is not writeable


In [3]:
!pip install imblearn

Defaulting to user installation because normal site-packages is not writeable


In [4]:
import torch
import re
import pandas as pd
import numpy as np
import nltk

#from google.colab import drive

import torch.nn as nn
import torch.nn.functional as F
from collections import Counter

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from torch.utils.data import DataLoader, Dataset
from string import punctuation
from textblob import TextBlob, Word
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from itertools import islice
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer

nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Инна\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Инна\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Инна\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Инна\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Загрузим данные и посмотрим на них

In [5]:
df_train = pd.read_csv('train (7).csv')
df_test = pd.read_csv('test (7).csv')
df_val = pd.read_csv('val (7).csv')

In [6]:
df_train.head()

Unnamed: 0,id,text,class
0,0,@alisachachka не уезжаааааааай. :(❤ я тоже не ...,0
1,1,RT @GalyginVadim: Ребята и девчата!\nВсе в кин...,1
2,2,RT @ARTEM_KLYUSHIN: Кто ненавидит пробки ретви...,0
3,3,RT @epupybobv: Хочется котлету по-киевски. Зап...,1
4,4,@KarineKurganova @Yess__Boss босапопа есбоса н...,1


In [7]:
df_test.head()

Unnamed: 0,id,text
0,204150,Тектоника и рельеф-самое ужасное в мире мучение(
1,204151,"Ходили запускать шар желаний, но у нас не полу..."
2,204152,"Хочу лето только ради того, что бы направить н..."
3,204153,RT @RonyLiss: @colf_ne блин((\nа я шипперила Ф...
4,204154,"RT @anna_romt: @ZADROT_PO_IGRAM блин,каждое во..."


In [8]:
df_val.head()

Unnamed: 0,id,text,class
0,181467,RT @TukvaSociopat: Максимальный репост! ))) #є...,1
1,181468,чтоб у меня з.п. ежегодно индексировали на инд...,0
2,181469,@chilyandlime нехуя мне не хорошо !!! :((((,0
3,181470,"@inafish нее , когда ногами ахахах когда?ахаха...",0
4,181471,"Хочу сделать как лучше, а получаю как всегда. :(",0


Зададим ряд гиперпараметров, которые будут использоваться в дальнейшем процессе обучения.

In [9]:
max_words = 1500
max_len = 15
num_classes = 1
batch_size = 512

Предобработка

In [10]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df_train['text'] = df_train['text'].apply(preprocess_text)
df_val['text'] = df_val['text'].apply(preprocess_text)
df_test['text'] = df_test['text'].apply(preprocess_text)

In [11]:
text_corpus_train = df_train['text'].values
text_corpus_valid = df_val['text'].values
text_corpus_test = df_test['text'].values

counts = Counter()
for sequence in text_corpus_train:
    counts.update(sequence.split())

print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))
    
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

num_words before: 258107
num_words after: 66106


In [12]:
from functools import lru_cache

class TwitterDataset(torch.utils.data.Dataset):
    
    def __init__(self, txts, labels, w2index, used_length):
        self._txts = txts
        self._labels = labels
        self._length = used_length
        self._w2index = w2index
        
    def __len__(self):
        return len(self._txts)
    
    @lru_cache(50000)
    def encode_sentence(self, txt):
        encoded = np.zeros(self._length, dtype=int)
        enc1 = np.array([self._w2index.get(word, self._w2index["UNK"]) for word in txt.split()])
        length = min(self._length, len(enc1))
        encoded[:length] = enc1[:length]
        return encoded, length
    
    def __getitem__(self, index):
        encoded, length = self.encode_sentence(self._txts[index])
        return torch.from_numpy(encoded.astype(np.int32)), self._labels[index], length

In [13]:
max([len(i.split()) for i in text_corpus_train])

27

In [14]:
y_train = df_train['class'].values
y_val = df_val['class'].values

train_dataset = TwitterDataset(text_corpus_train, y_train, vocab2index, 27)
valid_dataset = TwitterDataset(text_corpus_valid, y_val, vocab2index, 27)

train_loader = torch.utils.data.DataLoader(train_dataset,
                          batch_size=128,
                          shuffle=True,
                          num_workers=3)
valid_loader = torch.utils.data.DataLoader(valid_dataset,
                          batch_size=128,
                          shuffle=False,
                          num_workers=1)

Инициализируем устройство, на котором будем обучать модель

In [15]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

Инициализируем и обучим сеть GRU на данных 

In [18]:
from tqdm import tqdm_notebook

In [19]:
class GRUFixedLen(nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=2, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(lstm_out)
    
gru_init = GRUFixedLen(len(vocab2index), 30, 20)
optimizer = torch.optim.Adam(gru_init.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [None]:
for epoch in tqdm_notebook(range(10)):  
    gru_init.train()
    for i, data in enumerate(train_loader, 0):
        inputs, labels, lengths = data[0], data[1], data[2]
        inputs = inputs.long()
        labels = labels.long().view(-1, 1)
        
        optimizer.zero_grad()

        outputs = gru_init(inputs, lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
    gru_init.eval()
    loss_accumed = 0
    for X, y, lengths in valid_loader:
        X = X.long()
        y = y.long().view(-1, 1)
        output = gru_init(X, lengths)
        loss = criterion(output, y)
        loss_accumed += loss
    print("Epoch {} valid_loss {}".format(epoch, loss_accumed))

print('Finished!')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm_notebook(range(10)):


  0%|          | 0/10 [00:00<?, ?it/s]

Посмотрим на результаты

In [None]:
plt.figure(figsize = (16,5))
plt.title('Loss history')
plt.grid(True)
plt.ylabel('Train loss')
plt.xlabel('Step')
plt.plot(loss_accumed);

Инициализируем и обучим сеть LSTM на данных

In [None]:
class LSTMFixedLen(nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(lstm_out)
    
lstm_init = LSTMFixedLen(len(vocab2index), 30, 20)
optimizer = torch.optim.Adam(lstm_init.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [None]:
for epoch in tqdm_notebook(range(10)):  
    gru_init.train()
    for i, data in enumerate(train_loader, 0):
        inputs, labels, lengths = data[0], data[1], data[2]
        inputs = inputs.long()
        labels = labels.long().view(-1, 1)
        
        optimizer.zero_grad()

        outputs = gru_init(inputs, lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
    gru_init.eval()
    loss_accumed = 0
    for X, y, lengths in valid_loader:
        X = X.long()
        y = y.long().view(-1, 1)
        output = gru_init(X, lengths)
        loss = criterion(output, y)
        loss_accumed += loss
    print("Epoch {} valid_loss {}".format(epoch, loss_accumed))

print('Finished!')

Посмотрим на результаты

In [None]:
plt.figure(figsize = (16,5))
plt.title('Loss history')
plt.grid(True)
plt.ylabel('Train loss')
plt.xlabel('Step')
plt.plot(loss_accumed);

In [None]:
# LSTM с переменным размером входа:
class LSTM_variable_input(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.3)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)
        
    def forward(self, x, s):
        x = self.embeddings(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, s, batch_first=True, enforce_sorted=False)
        out_pack, (ht, ct) = self.lstm(x_pack)
        out = self.linear(ht[-1])
        return out

In [None]:
for epoch in tqdm_notebook(range(10)):  
    gru_init.train()
    for i, data in enumerate(train_loader, 0):
        inputs, labels, lengths = data[0], data[1], data[2]
        inputs = inputs.long()
        labels = labels.long().view(-1, 1)
        
        optimizer.zero_grad()

        outputs = gru_init(inputs, lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
    gru_init.eval()
    loss_accumed = 0
    for X, y, lengths in valid_loader:
        X = X.long()
        y = y.long().view(-1, 1)
        output = gru_init(X, lengths)
        loss = criterion(output, y)
        loss_accumed += loss
    print("Epoch {} valid_loss {}".format(epoch, loss_accumed))

print('Finished!')

Посмотрим на результаты

In [None]:
plt.figure(figsize = (16,5))
plt.title('Loss history')
plt.grid(True)
plt.ylabel('Train loss')
plt.xlabel('Step')
plt.plot(loss_accumed);