In [1]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import nltk
import gensim
#import spacy
from tqdm import tqdm_notebook

from sklearn import metrics

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Field, LabelField, BucketIterator, ReversibleField, TabularDataset

from sklearn.metrics import accuracy_score

SEED = 42
np.random.seed(SEED)

In [2]:
df = pd.read_csv('train-balanced-sarcasm.csv')

In [3]:
df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [4]:
df[['label', 'comment']].to_csv('data_set.csv', index=False)

In [5]:
pd.read_csv('data_set.csv')[40:55]

Unnamed: 0,label,comment
40,0,Funny how the media chose to never bring it up...
41,0,cant go wrong with yahudi and DAP ....Soros as...
42,0,1571049
43,0,Roger is such a good owner
44,1,wow it is totally unreasonable to assume that ...
45,1,Ho ho ho... But Melania said that there is no ...
46,0,"TBH, that giant dent was probably made by the ..."
47,0,How I'm considered an asshole because I let pe...
48,0,Try talking to the department head
49,0,run run pass punt


In [6]:
from nltk.tokenize import wordpunct_tokenize
from string import punctuation

punct = punctuation+'«»—…“”*№–'

def tokenizer(text):
    return [word.strip(punct) for word in text.lower().split()]

In [7]:
classes={
    '0':0,
    '1':1,
}

TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english'))
LABEL = LabelField(dtype=tt.int64, use_vocab=True, preprocessing=lambda x: classes[x])


dataset = TabularDataset('dataset.csv', format='csv',
                         fields=[('label', LABEL), ('comment', TEXT)], 
                         skip_header=True)

In [31]:
TEXT.build_vocab(dataset, min_freq=10, vectors="glove.6B.100d")
len(TEXT.vocab.itos)

28427

In [10]:
TEXT.vocab.itos[:10]

['<unk>',
 '<pad>',
 '<eos>',
 'like',
 'yeah',
 '',
 'people',
 'would',
 'get',
 'one']

In [11]:
LABEL.build_vocab(dataset)

In [12]:
train, test = dataset.split(0.8, stratified=True)
train, valid = train.split(0.8, stratified=True)

In [13]:
np.unique([x.label for x in train.examples], return_counts=True)

(array([0, 1]), array([323464, 323464]))

In [14]:
np.unique([x.label for x in valid.examples], return_counts=True)

(array([0, 1]), array([80866, 80866]))

In [15]:
np.unique([x.label for x in test.examples], return_counts=True)

(array([0, 1]), array([101083, 101083]))

In [16]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        
        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True,
                          )
        
        self.fc = nn.Linear(hidden_size * 2 *2, 3)
        
    def forward(self, batch):
        
        x, x_lengths = batch.comment
        
        x = self.embedding(x)

        if x_lengths is not None:
            x_lengths = x_lengths.view(-1).tolist()
            x = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True)
            
        _, (hidden, cell) = self.rnn(x)
        
        hidden = hidden.transpose(0,1)
        cell = cell.transpose(0,1)
        hidden = hidden.contiguous().view(hidden.size(0),-1)
        cell = cell.contiguous().view(cell.size(0),-1)
        x = tt.cat([hidden, cell], dim=1).squeeze(1)
        x = self.fc(x)
        return x

In [17]:
# tt.cuda.empty_cache()

batch_size = 32

model = MyModel(len(TEXT.vocab.itos),
                embed_size=100,
                hidden_size=128,
               )

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.comment),
    sort_within_batch=True,
)

optimizer = optim.Adam(model.parameters())
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
criterion = nn.CrossEntropyLoss()

In [19]:
def _train_epoch(model, iterator, optimizer, criterion, curr_epoch):

    model.train()

    running_loss = 0

    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        pred = model(batch)
        loss = criterion(pred, batch.label)
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    with tt.no_grad():
        for batch in iterator:
            pred = model(batch)
            loss = criterion(pred, batch.label)
            epoch_loss += loss.data.item()

    return epoch_loss / n_batches


def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100,
          scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, criterion, epoch)
        valid_loss = _test_epoch(model, valid_iterator, criterion)

        valid_loss = valid_loss
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)

In [20]:
%%time
nn_train(model, train_iterator, valid_iterator, criterion, optimizer, scheduler=scheduler, 
        n_epochs=20, early_stopping=2)


validation loss 0.57980



validation loss 0.57425



validation loss 0.58646



validation loss 0.61641
Early stopping! best epoch: 1 val 0.57425
CPU times: user 2h 38min 19s, sys: 5min 6s, total: 2h 43min 26s
Wall time: 3h 21min 39s


In [37]:
def count_accuracy(model, test_iterator, criterion):
    model.eval()
    epoch_acc = 0

    n_batches = len(test_iterator)
    with tt.no_grad():
        for batch in test_iterator:
            pred = model(batch)
            pred = tt.softmax(pred, dim=-1)
            pred = pred.detach().numpy()
            acc = accuracy_score(batch.label, pred.argmax(axis=1))
            epoch_acc += acc.item()

    return epoch_acc / n_batches

In [28]:
acc = count_accuracy(model, test_iterator, criterion)

In [29]:
acc

0.6775896428674207

In [43]:
pretrained_embeddings = TEXT.vocab.vectors

In [33]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.3474, -0.4717, -0.6240,  ...,  0.1387,  0.0312, -0.0892],
        [-0.3048, -0.8415,  0.4758,  ..., -0.5625, -0.3513, -0.0643],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [34]:
%%time
nn_train(model, train_iterator, valid_iterator, criterion, optimizer, scheduler=scheduler, 
        n_epochs=20, early_stopping=2)

validation loss 0.56855


validation loss 0.57052


validation loss 0.59144
Early stopping! best epoch: 0 val 0.56855
CPU times: user 1h 49min 13s, sys: 3min 56s, total: 1h 53min 10s
Wall time: 2h 9min 35s


In [38]:
acc = count_accuracy(model, test_iterator, criterion)

In [39]:
acc

0.6968276065498288