In [1]:
import pandas as pd
import numpy as np
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

# RNN for sequences

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from collections import Counter

In [6]:
df_train = pd.read_csv("text/train.csv")


In [7]:
df_train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [8]:
df_train['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [9]:
# test is without labels, so we'll split train to train and validation dataframes on label (label 1 is less than 10%)
X_train, X_test = train_test_split(df_train,test_size=0.3, stratify=df_train[['label']])

In [10]:
print(len(X_train), len(X_test), len(df_train))

22373 9589 31962


In [11]:
def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in puncts)
    txt = txt.lower()
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if (word not in sw) & (word.isalnum())]
    # exclude all except letters and numbers
   
    return " ".join(txt)

In [12]:
morpher = MorphAnalyzer()
sw = set(get_stop_words("en"))
puncts = set(punctuation)

In [13]:
from tqdm import tqdm 
tqdm.pandas()

X_train['tweet'] = X_train['tweet'].progress_apply(preprocess_text)
X_test['tweet'] = X_test['tweet'].progress_apply(preprocess_text)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22373/22373 [00:06<00:00, 3665.83it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9589/9589 [00:02<00:00, 3813.42it/s]


In [14]:
df_train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [15]:
train_corpus = " ".join(X_train['tweet'])
train_corpus = train_corpus.lower()

In [16]:
text_corpus_train = X_train['tweet'].values
text_corpus_train

array(['euro area industrial production data head noh danske bank blog silver gold forex',
       'user yum yum moms attractions16 conferencing nomnom',
       'user finallllly leaving work streaming user user soon get home',
       ...,
       'exhausted staying lemans24 adrenaline really pumping ahead finish europeangp f1 lm24',
       'rosematter comes 14 minutes im taking science final instead getting lipstick',
       'weekend planed son'], dtype=object)

In [17]:
text_corpus_test = X_test['tweet'].values

In [18]:
counts = Counter()
for sequence in text_corpus_train:
    counts.update(sequence.split())

print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))
    
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

num_words before: 31872
num_words after: 11662


In [19]:
from functools import lru_cache

class TwitterDataset(torch.utils.data.Dataset):
    
    def __init__(self, txts, labels, w2index, used_length):
        self._txts = txts
        self._labels = labels
        self._length = used_length
        self._w2index = w2index
        
    def __len__(self):
        return len(self._txts)
    
    @lru_cache(50000)
    def encode_sentence(self, txt):
        encoded = np.zeros(self._length, dtype=int)
        enc1 = np.array([self._w2index.get(word, self._w2index["UNK"]) for word in txt.split()])
        length = min(self._length, len(enc1))
        encoded[:length] = enc1[:length]
        return encoded
    
    def __getitem__(self, index):
        encoded = self.encode_sentence(self._txts[index])
        return torch.from_numpy(encoded.astype(np.int32)), self._labels[index]

In [20]:
used_length = max([len(i.split()) for i in text_corpus_train])

In [21]:
y_train = X_train['label'].values
y_test = X_test['label'].values

train_dataset = TwitterDataset(text_corpus_train, y_train, vocab2index, used_length)
test_dataset = TwitterDataset(text_corpus_test, y_test, vocab2index, used_length)

train_loader = torch.utils.data.DataLoader(train_dataset,
                          batch_size=128,
                          shuffle=True,
                          num_workers=2)
test_loader = torch.utils.data.DataLoader(test_dataset,
                          batch_size=128,
                          shuffle=False,
                          num_workers=1)

In [22]:
print(len(train_loader))

175


In [23]:
class LSTMFixedLen(nn.Module) :
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=128, use_last=True):
        super().__init__()
        self.use_last = use_last
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, ht = self.lstm(x)
       
        if self.use_last:
            last_tensor = lstm_out[:,-1,:]
        else:
            # use mean
            last_tensor = torch.mean(lstm_out[:,:], dim=1)
    
        out = self.linear(last_tensor)
        # print(out.shape)
        return torch.sigmoid(out)
    


In [24]:
lstm_init = LSTMFixedLen(len(vocab2index), 128, 20, use_last=False)
optimizer = torch.optim.Adam(lstm_init.parameters(), lr=0.001)
criterion = nn.BCELoss()

In [None]:
lstm_init.train()
th = 0.5
epochs = 10


for epoch in range(epochs):  
    lstm_init.train()
    running_items, running_right = 0.0, 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data[0], data[1]

        # обнуляем градиент
        optimizer.zero_grad()
        outputs = lstm_init(inputs)
        
        loss = criterion(outputs, labels.float().view(-1, 1))
        loss.backward()
        optimizer.step()

        # подсчет ошибки на обучении
        loss = loss.item()
        running_items += len(labels)
        # подсчет метрики на обучении
        pred_labels = torch.squeeze((outputs > th).int())
        running_right += (labels == pred_labels).sum()
        
    # выводим статистику о процессе обучения
    lstm_init.eval()
    
    print(f'Epoch [{epoch + 1}/{epochs}]. ' \
            f'Step [{i + 1}/{len(train_loader)}]. ' \
            f'Loss: {loss:.3f}. ' \
            f'Acc: {running_right / running_items:.3f} \n')
    running_loss, running_items, running_right = 0.0, 0.0, 0.0
   
print('Training is finished!')

In [None]:
test_running_right, test_running_total, test_loss = 0.0, 0.0, 0.0
for j, data in enumerate(test_loader):
    test_labels = data[1]
    test_outputs = lstm_init(data[0])
        
        # подсчет ошибки на тесте
    test_loss = criterion(test_outputs, test_labels.float().view(-1, 1))
        # подсчет метрики на тесте
    test_running_total += len(data[1])
    pred_test_labels = torch.squeeze((test_outputs > th).int())
    test_running_right += (test_labels == pred_test_labels).sum()
    
 
print(f'Test loss: {test_loss:.3f}. Test acc: {test_running_right / test_running_total:.3f}')

In [None]:
class GRUFixedLen(nn.Module) :
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=128, use_last=True):
        super().__init__()
        self.use_last = use_last
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=2, batch_first=True, )
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        gru_out, ht = self.gru(x)
       
        if self.use_last:
            last_tensor = gru_out[:,-1,:]
        else:
            # use mean
            last_tensor = torch.mean(gru_out[:,:], dim=1)
    
        out = self.linear(last_tensor)
        return torch.sigmoid(out)

In [None]:
gru_init = GRUFixedLen(len(vocab2index), 128, 20, use_last=False)
optimizer = torch.optim.Adam(gru_init.parameters(), lr=0.001)
criterion = nn.BCELoss()

In [None]:
for epoch in range(epochs): 
    gru_init.train() 
    running_items, running_right = 0.0, 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data[0], data[1]

        # обнуляем градиент
        optimizer.zero_grad()
        outputs = gru_init(inputs)
        
        loss = criterion(outputs, labels.float().view(-1, 1))
        loss.backward()
        optimizer.step()

        # подсчет ошибки на обучении
        loss = loss.item()
        running_items += len(labels)
        # подсчет метрики на обучении
        pred_labels = torch.squeeze((outputs > th).int())
        running_right += (labels == pred_labels).sum()
        
    # выводим статистику о процессе обучения
    gru_init.eval()
    
    print(f'Epoch [{epoch + 1}/{epochs}]. ' \
          f'Step [{i + 1}/{len(train_loader)}]. ' \
          f'Loss: {loss:.3f}. ' \
          f'Acc: {running_right / running_items:.3f}', end='\n ')

    
print('Training is finished!')

In [None]:
test_running_right, test_running_total, test_loss = 0.0, 0.0, 0.0
for j, data in enumerate(test_loader):
    test_labels = data[1]
    test_outputs = gru_init(data[0])
        
        # подсчет ошибки на тесте
    test_loss = criterion(test_outputs, test_labels.float().view(-1, 1))
        # подсчет метрики на тесте
    test_running_total += len(data[1])
    pred_test_labels = torch.squeeze((test_outputs > th).int())
    test_running_right += (test_labels == pred_test_labels).sum()
    
 
print(f'Test loss: {test_loss:.3f}. Test acc: {test_running_right / test_running_total:.3f}')

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence

In [None]:
# Можно строить lstm с переменным размером входа:
class LSTM_variable_input(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.3)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)
        
    def forward(self, x, s):
        x = self.embeddings(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, s, batch_first=True, enforce_sorted=False)
        out_pack, (ht, ct) = self.lstm(x_pack)
        out = self.linear(ht[-1])
        return out