In [1]:
import numpy as np
import pandas as pd
import spacy
import csv
import json
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.distributions as tdist
from vocab import Vocab
from IPython.display import clear_output
import matplotlib.pyplot as plt
import time
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# Later cells move the model and every batch to this device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
def read_dataset(path='twitter_prep_data.json'):
    """Load the preprocessed tweet corpus into a DataFrame.

    Args:
        path: Location of the JSON file. Defaults to the file name that was
            previously hard-coded, so existing ``read_dataset()`` calls
            behave as before.

    Returns:
        pandas.DataFrame built from the parsed JSON with
        ``DataFrame.from_dict``.
    """
    # Decode explicitly as UTF-8: the default text encoding is platform
    # dependent, and the notebook's own example inputs are Cyrillic.
    with open(path, encoding='utf-8') as f:
        data = json.load(f)
    return pd.DataFrame.from_dict(data)

In [None]:
def create_vocab(dataset):
    """Build a ``Vocab`` from the token sequences stored in ``dataset['text']``.

    Every element of every row is counted, and the resulting ``Counter`` is
    handed to ``Vocab`` with ``min_freq=10``.
    """
    sentences = tqdm(dataset['text'].values)
    # Count tokens as they stream past instead of first gathering one big list.
    token_counts = Counter(token for sentence in sentences for token in sentence)
    return Vocab(token_counts, min_freq=10)

In [None]:
class TwitterDataset(Dataset):
    """Torch dataset over a frame that has ``text`` and ``label`` columns.

    Each ``text`` entry is converted to an index list with
    ``vocab.sent2idx`` when a sample is fetched. Batching, length-sorting and
    padding are handled by :meth:`collate_fn`.
    """

    def __init__(self, data, vocab):
        self.data = data
        self.vocab = vocab
        # Keep the raw column arrays so samples are looked up by position.
        self.text = data['text'].values
        self.label = data['label'].values

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``{'label': ..., 'text': [...]}`` for the row at ``index``."""
        return {
            'label': self.label[index],
            'text': self.vocab.sent2idx(self.text[index]),
        }

    def collate_fn(self, dicts):
        """Turn a list of samples into a padded batch, longest sample first.

        Returns:
            ``(text_padded, labels, lengths)`` where ``text_padded`` is a
            LongTensor right-padded with 0, ``labels`` is a FloatTensor in
            the same (sorted) order, and ``lengths`` is a tuple of the
            unpadded lengths in descending order.
        """
        pad_token = 0

        # Longest-first ordering; the sort is stable, so equally long samples
        # keep the order they had in the incoming batch.
        ordered = sorted(
            ((len(d['text']), d['text'], d['label']) for d in dicts),
            key=lambda triple: triple[0],
            reverse=True,
        )
        sorted_len_text, sorted_text, sorted_labels = list(zip(*ordered))
        longest = sorted_len_text[0]

        text_padded = torch.LongTensor([
            tokens + [pad_token] * (longest - n_tokens)
            for n_tokens, tokens in zip(sorted_len_text, sorted_text)
        ])
        labels = torch.FloatTensor(sorted_labels)

        return text_padded, labels, sorted_len_text

In [None]:
def create_train_dataset(dataset, test_size=0.33, random_state=42):
    """Split ``dataset`` and wrap both parts in ``TwitterDataset`` objects.

    Args:
        dataset: Frame with ``text`` and ``label`` columns.
        test_size: Fraction of rows held out for evaluation (default keeps
            the previously hard-coded 0.33).
        random_state: Seed passed to ``train_test_split`` (default keeps the
            previously hard-coded 42).

    Returns:
        ``(train_dataset, test_dataset)``. Both share one ``Vocab``, which is
        reachable as ``train_dataset.vocab``.
    """
    X_train, X_test = train_test_split(
        dataset, test_size=test_size, random_state=random_state
    )
    # Fit the vocabulary on the training rows only. Counting tokens from the
    # held-out rows as well would let evaluation data shape the model's input
    # space and understate the out-of-vocabulary rate seen at test time.
    vocab = create_vocab(X_train)
    train_dataset = TwitterDataset(X_train, vocab)
    test_dataset = TwitterDataset(X_test, vocab)
    return train_dataset, test_dataset

In [None]:
def create_dataloaders(train_dataset, test_dataset, batch_size=32):
    """Wrap the two datasets in ``DataLoader`` objects.

    Args:
        train_dataset: Dataset exposing a ``collate_fn`` method; iterated in
            shuffled order.
        test_dataset: Dataset exposing a ``collate_fn`` method; iterated in
            its stored order.
        batch_size: Samples per batch for both loaders (default keeps the
            previously hard-coded 32).

    Returns:
        ``(train_dataloader, val_dataloader)``.
    """
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                                  shuffle=True, collate_fn=train_dataset.collate_fn)
    val_dataloader = DataLoader(test_dataset, batch_size=batch_size,
                                shuffle=False, collate_fn=test_dataset.collate_fn)
    # The loaders used to be built and then dropped: without this return the
    # function evaluated to None and callers had nothing to iterate over.
    return train_dataloader, val_dataloader

In [None]:
def load_embeddings(path='./fast_text.vec'):
    """Read word vectors from a text file with one ``word v1 v2 ...`` entry per line.

    Args:
        path: Location of the vector file. Defaults to the path that was
            previously hard-coded, so ``load_embeddings()`` behaves as before.

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array.
    """
    embeddings = {}
    # Explicit UTF-8 so non-ASCII words decode the same way on every platform.
    with open(path, encoding='utf-8') as f:
        # Iterate the handle lazily instead of materialising every line.
        for line_no, line in enumerate(f):
            fields = line.split()
            if not fields:
                # A blank line used to raise IndexError on ``split()[0]``.
                continue
            if line_no == 0 and len(fields) == 2 and all(x.isdigit() for x in fields):
                # A first line of two integers is the "<count> <dimension>"
                # header that .vec files commonly start with, not a word.
                continue
            embeddings[fields[0]] = np.array(fields[1:], dtype=np.float32)
    return embeddings

In [None]:
def create_pretrained_embeddings(vocab, d, embedding_dim=100):
    """Build the initial embedding matrix for the vocabulary.

    Args:
        vocab: Object whose ``_token2idx`` is iterated to obtain the words.
        d: Mapping from word to its pretrained vector; a ``KeyError`` on
            lookup marks the word as missing.
        embedding_dim: Length of every vector. Defaults to 100, the value
            that was previously hard-coded in two places.

    Returns:
        ``torch.FloatTensor`` of shape ``(len(vocab._token2idx), embedding_dim)``.
        Words missing from ``d`` get a row drawn from a normal distribution
        with scale 0.6.
    """
    vocab_size = len(vocab._token2idx)
    weights_matrix = np.zeros((vocab_size, embedding_dim))
    # NOTE(review): rows are filled in the iteration order of
    # vocab._token2idx, which assumes that order matches the indices
    # produced by vocab.sent2idx. Confirm against the Vocab implementation.
    for row, word in enumerate(vocab._token2idx):
        try:
            weights_matrix[row] = d[word]
        except KeyError:
            weights_matrix[row] = np.random.normal(scale=0.6, size=(embedding_dim,))
    return torch.FloatTensor(weights_matrix)

In [None]:
class RNN(nn.Module):
    """LSTM classifier: embedding -> (bi)LSTM -> dropout -> linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of outputs of the final linear layer.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability, used between LSTM layers and on the
            embedded input and final hidden state.
        pad_idx: Index passed to ``nn.Embedding`` as ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim,
                 hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):

        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           # Inter-layer dropout needs at least two layers;
                           # passing it to a single layer only warns.
                           dropout=dropout if n_layers > 1 else 0.0)

        # The classifier sees one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Score a padded batch.

        Args:
            text: LongTensor of token indices, batch first.
            text_lengths: Unpadded length of every row, in descending order
                (required by ``pack_padded_sequence``).

        Returns:
            Tensor of shape ``(batch, output_dim)`` holding raw logits.

        Raises:
            ValueError: If the batch is empty or any length is not positive.
        """
        # The old ``text_lengths == 0`` guard compared a whole sequence with
        # 0 and never fired; validate the individual lengths instead.
        lengths = [int(n) for n in text_lengths]
        if not lengths or min(lengths) <= 0:
            raise ValueError(
                'text_lengths must be non-empty and contain only positive lengths'
            )

        # The dropout layer was constructed but never applied before.
        embedded = self.dropout(self.embedding(text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True
        )
        packed_output, (hidden, cell) = self.rnn(packed_embedded)

        if self.rnn.bidirectional:
            # Last layer: forward state is hidden[-2], backward is hidden[-1].
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]

        # No squeeze(0) here: it removed the batch axis for a batch of one,
        # which made the callers' ``.squeeze(1)`` fail.
        return self.fc(self.dropout(hidden))

In [None]:
# Build the data pipeline here so that `vocab` exists on a fresh kernel:
# no other cell assigns it, and INPUT_DIM below depends on it.
dataset = read_dataset()
train_dataset, test_dataset = create_train_dataset(dataset)
vocab = train_dataset.vocab  # both splits share this Vocab instance

INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = 0  # same value TwitterDataset.collate_fn pads with

model = RNN(INPUT_DIM,
            EMBEDDING_DIM,
            HIDDEN_DIM,
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL,
            DROPOUT,
            PAD_IDX).to(device)  # use the named constant instead of a bare 0

In [None]:
# `pretrained_embeddings` was never assigned in the notebook; build it from
# the helper functions before copying it into the embedding layer.
pretrained_embeddings = create_pretrained_embeddings(vocab, load_embeddings())
# Trailing semicolon keeps the cell from echoing the whole weight tensor.
model.embedding.weight.data.copy_(pretrained_embeddings);

In [None]:
# The loss takes raw logits, so the model output is fed to it without a sigmoid.
criterion = nn.BCEWithLogitsLoss()
# Adam over every parameter of the model.
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def binary_accuracy(preds, y):
    """Fraction of logits in ``preds`` whose rounded sigmoid equals ``y``.

    Returns a 0-d tensor: the number of matches divided by ``len`` of the
    comparison result.
    """
    predicted_labels = preds.sigmoid().round()
    hits = predicted_labels.eq(y).float()
    return hits.sum() / len(hits)

In [None]:
def train(model, iterator, optimizer, criterion, epoch=1, plot_every=50):
    """Run one training pass over ``iterator``.

    Args:
        model: Module called as ``model(text, lengths)``.
        iterator: Yields ``(text, label, len_text)`` batches and supports ``len``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(predictions, labels)``.
        epoch: Epoch number shown in the live loss plot. It used to be
            hard-coded to 1, so the title was wrong for later epochs; the
            default keeps existing calls unchanged.
        plot_every: Redraw the loss plot every this many batches (default
            keeps the previously hard-coded 50).

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches.
    """
    train_losses = []
    epoch_loss = 0
    epoch_acc = 0
    model.train()

    for batch_idx, (text, label, len_text) in enumerate(iterator):
        text = text.to(device)
        label = label.to(device).float()

        # Lengths stay as produced by the collate function; only the tensors
        # are moved to the device.
        predictions = model(text, len_text).squeeze(1)
        loss = criterion(predictions, label)
        acc = binary_accuracy(predictions, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss
        epoch_acc += acc.item()
        train_losses.append(batch_loss)

        if batch_idx % plot_every == 0:
            plot(epoch, batch_idx, train_losses)
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating any weights.

    The model is switched to eval mode and gradients are disabled. Returns
    ``(mean_loss, mean_accuracy)`` averaged over the batches.
    """
    model.eval()
    total_loss, total_acc = 0, 0

    with torch.no_grad():
        for text, label, len_text in iterator:
            target = label.to(device).float()
            logits = model(text.to(device), len_text).squeeze(1)
            total_loss += criterion(logits, target).item()
            total_acc += binary_accuracy(logits, target).item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [None]:
def plot(epoch, step, train_losses):
    """Replace the cell output with a fresh plot of ``train_losses``.

    The title carries the given ``epoch`` and ``step``.
    """
    clear_output()
    plt.plot(train_losses)
    plt.title(f'Epochs {epoch}, step {step}')
    plt.show()

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into ``(minutes, seconds)``.

    Both parts are truncated to integers.
    """
    seconds_total = end_time - start_time
    whole_minutes = int(seconds_total / 60)
    leftover_seconds = int(seconds_total - whole_minutes * 60)
    return whole_minutes, leftover_seconds

In [None]:
N_EPOCHS = 1
best_valid_loss = float('inf')

# NOTE(review): `train_dataloader` and `val_dataloader` are not assigned in
# any visible cell; they must exist before this loop runs.
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, val_dataloader, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss beats the best seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')

    # One print call; the three report lines are unchanged.
    print(
        f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s',
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%',
        sep='\n',
    )

In [None]:
from nltk import (sent_tokenize as splitter, wordpunct_tokenize as tokenizer)

def predict_sentiment(model, sentence):
    """Return the model's sigmoid score for a raw text string.

    Args:
        model: Module called as ``model(tensor, lengths)`` that returns a
            single logit for a batch of one.
        sentence: Raw text. It is split into sentences and tokenised with
            the nltk helpers imported in this cell, then indexed with the
            notebook-level ``vocab``.

    Returns:
        float: ``sigmoid`` of the model output.

    Raises:
        ValueError: If the text yields no tokens.
    """
    model.eval()
    # Use the tokens of every sentence; previously only the first sentence
    # was scored and the rest of the text was silently ignored.
    tokens = [token for part in splitter(sentence) for token in tokenizer(part)]
    if not tokens:
        raise ValueError('cannot score text that contains no tokens')

    indexed = vocab.sent2idx(tokens)
    tensor = torch.LongTensor([indexed]).to(device)
    # Real token count of the single row. The old code measured the outer
    # batch list, so the length was always 1 and only the first token was read.
    length = [len(indexed)]
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length))
    return prediction.item()

In [None]:
# Smoke test on an upbeat phrase (the Russian text reads "today is a great day:))").
predict_sentiment(model, "сегодня отличный день:))")

In [None]:
# Smoke test on a gloomy phrase (the Russian text reads "What a terrible day:((((").
predict_sentiment(model, "Какой ужасный день:((((")

In [None]:
def sentiment_predictor(text):
    """Score ``text`` with the notebook-level ``model`` via ``predict_sentiment``."""
    score = predict_sentiment(model, text)
    return score

In [None]:
# NOTE(review): `SentimentPredictor` is not defined or imported in any visible
# cell; confirm where it is meant to come from before running this.
s = SentimentPredictor()

In [None]:
# Score a phrase through the `s` object; the bare `score` on the last line
# makes the notebook display the value.
score = s.predict_sentiment('сегодня отличный день:))')
score