## Assignment 6

Develop RNN model in pytorch to solve the following problem:  

1. Detect sarcasm Data from https://www.kaggle.com/sherinclaudia/sarcastic-comments-on-reddit  

Your quality metric = accuracy  
Randomly select 20% of your data for test set. You can use it only for final perfomance estimation.  
Remember, you can use GPU resourses in kaggle kernels.

In [1]:
import re
from time import time

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
import torch as tt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from torchtext.data import Field, LabelField, TabularDataset, Iterator
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
STOPWORDS = stopwords.words('english')

Загрузка данных

In [3]:
df = pd.read_csv('train-balanced-sarcasm.csv')

In [4]:
df = df[df.comment.notnull()]

In [5]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=17)

In [5]:
df_train.shape, df_test.shape

((808618, 10), (202155, 10))

Медиана длин комментариевв словах

In [6]:
np.median([len(x.split()) for x in df_train.comment.values])

9.0

In [6]:
df_train.to_csv('sarcasm-train.csv', index=False)
df_test.to_csv('sarcasm-test.csv', index=False)

Подготовка данных

In [3]:
def tokenizer(x):
    tokens = []
    
    for w in x:
        token = re.sub('^[\W]*|[\W]*$', '', w)
        
        if token:
            tokens.append(token)
            
    return tokens

In [4]:
TEXT = Field(
    sequential=True,
    fix_length=10,
    pad_first=True,
    preprocessing=tokenizer,
    batch_first=True, 
    eos_token='<eos>',
    lower=True,
    stop_words=STOPWORDS
)

TRAIN_LABEL = LabelField(dtype=tt.int64, use_vocab=False, preprocessing=lambda x: int(x))
TEST_LABEL = LabelField(dtype=tt.int64, use_vocab=False, preprocessing=lambda x: int(x))


train_dataset = TabularDataset(
    'sarcasm-train.csv',
     format='csv', 
     fields=[('label', TRAIN_LABEL), ('comment', TEXT)], 
     skip_header=True
)


test_dataset = TabularDataset(
    'sarcasm-test.csv',
     format='csv', 
     fields=[('label', TEST_LABEL), ('comment', TEXT)], 
     skip_header=True
)

In [5]:
TEXT.build_vocab(train_dataset, min_freq=1, vectors='glove.6B.300d')

In [6]:
VECTORS = TEXT.vocab.vectors

In [7]:
TRAIN_LABEL.build_vocab(train_dataset)
TEST_LABEL.build_vocab(test_dataset)

NN функционал.  
Использованная модель состоит из рекуррентного слоя, за которым следуют три полносвязных слоя.  
При построении модели использовались и доучивались GloVe эмбеддинги.

In [8]:
def get_iterator(
    dataset,
    batch_size,
    train=True, 
    shuffle=True,
    repeat=False
):
    dataset_iter = Iterator(
        dataset,
        batch_size=batch_size,
        train=train,
        shuffle=shuffle,
        repeat=repeat,
        sort=False
    )
    
    return dataset_iter

In [9]:
class NN(tt.nn.Module):
        
    def __init__(
        self,
        vocab_size,
        seq_len,
        embedding_size,
        hidden_size,
        vectors=None,
        freeze_embeddings=True,
    ):
        super(NN, self).__init__()
        
        self.vocab_size = vocab_size
        self.seq_len = seq_len
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.vectors = vectors
        self.freeze_embeddings = freeze_embeddings
        
        if self.vectors is not None:
            self.embeddings = tt.nn.Embedding.from_pretrained(self.vectors, freeze=self.freeze_embeddings)
            
        else:
            self.embeddings = tt.nn.Embedding(self.vocab_size, self.embedding_size)
            self.embeddings.requires_grad = True
                    
        self.rnn = tt.nn.LSTM(
            input_size=self.embedding_size,
            hidden_size=self.hidden_size,
            num_layers=1,
            batch_first=True
        )
        
        self.dense1 = tt.nn.Linear(self.hidden_size * 2, 512)
        self.dense2 = tt.nn.Linear(512, 256)
        self.dense3 = tt.nn.Linear(256, 128)
        self.output_layer = tt.nn.Linear(128, 2)
        
        self.dropout = tt.nn.Dropout(0.15)
        
    def init_hidden(self, batch_size):
        return (tt.zeros(1, batch_size, self.hidden_size, requires_grad=True).cuda(),
                tt.zeros(1, batch_size, self.hidden_size, requires_grad=True).cuda())
        
    def forward(self, x):
        x = self.embeddings(x)
        
        x, _hidden = self.rnn(x)
        hidden, cell = _hidden
        hidden = hidden.transpose(0,1)
        cell = cell.transpose(0,1)
        hidden = hidden.contiguous().view(hidden.size(0),-1)
        cell = cell.contiguous().view(cell.size(0),-1)
        x = tt.cat([hidden, cell], dim=1).squeeze(1)
        
        x = self.dense1(x)
        x = tt.nn.functional.relu(x)
        x = self.dropout(x)
        
        x = self.dense2(x)
        x = tt.nn.functional.relu(x)
        x = self.dropout(x)
        
        x = self.dense3(x)
        x = tt.nn.functional.relu(x)
        x = self.dropout(x)
        
        x = self.output_layer(x)
        
        return x

In [21]:
def train(
    epochs,
    model, 
    optimizer,
    train_iterator,
    test_iterator=None,
    scheduler=None,
    patience=5,
    save_path='tt_model'
):    
    train_losses = []
    test_accs = []
    max_test_acc = 0
    n_no_improv_epochs = 0
    
    criterion = tt.nn.CrossEntropyLoss()
    
    for epoch in range(epochs):
        c_train_losses = []
        c_test_losses = []
        
        st_time = time()
        
        for batch in tqdm_notebook(train_iterator):
            optimizer.zero_grad()
            
            pred = model.forward(batch.comment.cuda())
            train_loss = criterion(pred, batch.label.cuda())
            c_train_losses.append(train_loss.item())
            
            train_loss.backward()
            
            optimizer.step()
        
        c_train_loss = np.mean(c_train_losses)
        train_losses.append(c_train_loss)
        
        test_acc = eval_accuracy(model, test_iterator)
        test_accs.append(test_acc)
        
        if test_acc > max_test_acc:
            max_test_acc = test_acc
            n_no_improv_epochs = 0
            tt.save(model.state_dict(), save_path)
            
        elif n_no_improv_epochs < patience:
            n_no_improv_epochs += 1
            
        else:
            print(f'Early stopping at epoch {epoch+1}\nBest test accuracy: {max_test_acc:.4f}')
            break
            
        if scheduler is not None:
            scheduler.step()
        
        c_time = time() - st_time
        
        print(f'epoch: {epoch+1} \t train_loss: {c_train_loss:.4f} \t test_acc: {test_acc:.4f} \t time: {c_time:.2f} s.')
    
    return train_losses, test_accs

In [11]:
def eval_accuracy(model, test_iter):
    y_true = []
    y_pred = []
    
    with tt.no_grad():
        for test_batch in test_iter:
            pred = model.forward(test_batch.comment.cuda()).cpu()
            pred = tt.nn.functional.softmax(pred, dim=1)
            pred = pred.numpy()
            pred = pred[:,1]
            pred = np.where(pred > 0.5, 1, 0)
            y_pred.extend(list(pred))
            y_true.extend(list(test_batch.label.numpy()))
    
    return accuracy_score(np.array(y_true), np.array(y_pred))

In [12]:
VOCAB_SIZE = len(TEXT.vocab.itos) 
BATCH_SIZE = 256

In [13]:
train_iter = get_iterator(train_dataset, BATCH_SIZE)

In [14]:
test_iter = get_iterator(test_dataset, BATCH_SIZE)

In [26]:
model = NN(
    vocab_size=VOCAB_SIZE,
    seq_len=10,
    embedding_size=300,
    hidden_size=512,
    vectors=VECTORS,
    freeze_embeddings=False
)

In [27]:
model = model.cuda()

In [28]:
optimizer = tt.optim.Adam(model.parameters())

In [29]:
scheduler = tt.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.2)

In [30]:
train(10, model, optimizer, train_iter, test_iterator=test_iter, scheduler=scheduler, patience=3)

HBox(children=(IntProgress(value=0, max=3159), HTML(value='')))

epoch: 1 	 train_loss: 0.5855 	 test_acc: 0.7047 	 time: 165.10 s.


HBox(children=(IntProgress(value=0, max=3159), HTML(value='')))

epoch: 2 	 train_loss: 0.5064 	 test_acc: 0.6945 	 time: 162.21 s.


HBox(children=(IntProgress(value=0, max=3159), HTML(value='')))

epoch: 3 	 train_loss: 0.4095 	 test_acc: 0.6872 	 time: 162.20 s.


HBox(children=(IntProgress(value=0, max=3159), HTML(value='')))

epoch: 4 	 train_loss: 0.3738 	 test_acc: 0.6827 	 time: 162.05 s.


HBox(children=(IntProgress(value=0, max=3159), HTML(value='')))

Early stopping at epoch 5
Best test accuracy: 0.7047


([0.5855079675898139,
  0.5064315397779411,
  0.40946778430520603,
  0.37377089785601225,
  0.36532113330580834],
 [0.7046968909994806,
  0.6945462640053425,
  0.6872300957186317,
  0.6827434394400337,
  0.6821993025153966])

Загрузим лучшую модель

In [31]:
model.load_state_dict(tt.load('tt_model'))

In [32]:
eval_accuracy(model, test_iter)

0.7051371472385051