Detection of sarcasm in sentences - 
In this notebook, we have tried 3 methods for detecting sarcasm in sentences - 
1. Basic Linear Regression
2. Bidirectional LSTM
3. Attention-Like Classifier

We have tried these methods on "News Headlines Dataset For Sarcasm Detection" in Kaggle. We have reached 87% accuracy in the third method.

Embedding 'fasttext-wiki-news-subwords-300' currently gives bad results. Check why.


In [272]:
## Imports ##
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import re
import multiprocessing as mp
num_procs =  mp.cpu_count()
from multiprocessing import Pool


import torch 
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.nn import functional as F
from torch.optim.lr_scheduler import StepLR

In [238]:
import gensim.downloader as api

model_type = "glove-twitter-50" #'fasttext-wiki-news-subwords-300' # 
word_embedding_model = api.load(model_type)



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [239]:
# data_url = "https://raw.githubusercontent.com/asafkar/sarcarsm_detection/master/Sarcasm_Headlines_Dataset_v2.json"

data_url = os.path.join(os.getcwd(),"Sarcasm_Headlines_Dataset_v2.json")

# Read the Data, remove "article_link", and display properties
df = pd.read_json(open(data_url, "r", encoding="utf8"), lines=True)
df = df[['headline','is_sarcastic']]
df['headline'] = df['headline'].apply(lambda x: x.lower())
df['headline'] = df['headline'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
df.describe()


Unnamed: 0,is_sarcastic
count,28619.0
mean,0.476397
std,0.499451
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [240]:
# go over the data. 
# data_padded is the data padded, in order to receive fixed length sentences

def word2vec(word):
    try:
        return word_embedding_model[word]
    except:
        return word_embedding_model['unk']

def sentence2vecs(sentence):
    return [word2vec(x) for x in sentence.split()]
    
    
def second_longest_sentence():
    longest_sentence = 0
    second_longest = 0
    for sents in df['headline'].values:
        if len(sents.split()) > longest_sentence:
            second_longest = longest_sentence
            longest_sentence = len(sents.split())
    return second_longest


# word2vec - For each sample (sentence), turn each word into a word-embedding


corpus_size = df['headline'].size
word2vec_model_size = len(word_embedding_model['unk'])
model_sentence_length = second_longest_sentence()
data_padded = np.zeros((corpus_size, model_sentence_length , word2vec_model_size))
data = []
labels = np.zeros(corpus_size, dtype=np.int)

for ii in range(df.shape[0]):
    sents = df['headline'].iloc[ii]
    labels[ii]= df['is_sarcastic'].iloc[ii]
    
    vectors = sentence2vecs(sents)  # size = word2vec_model_size*sentence_length
    data.append(vectors)
    for jj, vector in enumerate(vectors):
        if jj == model_sentence_length:
            break
        data_padded[ii, jj,:] = np.asarray(vector)

        

In [241]:
# Split to Train and Test (For both padded and unpadded)
training_samples  = corpus_size*9//10
validation_samples = corpus_size - training_samples


indices = np.arange(corpus_size)
np.random.shuffle(indices)
data_padded = data_padded[indices]
labels = labels[indices]
X_train = data_padded[:training_samples]
y_train = labels[:training_samples]
X_val = data_padded[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]



*First Attempt - Linear Regression*

In [242]:

from sklearn.linear_model import LogisticRegression
X_train_flat = X_train.reshape(-1, model_sentence_length*word2vec_model_size)
X_val_flat = X_val.reshape(-1, model_sentence_length*word2vec_model_size)
log_reg = LogisticRegression()
log_reg.fit(X_train_flat, y_train)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(log_reg.score(X_train_flat, y_train)))
print('Accuracy of Logistic regression classifier  on test set: {:.2f}'
     .format(log_reg.score(X_val_flat, y_val)))



Accuracy of Logistic regression classifier on training set: 0.80
Accuracy of Logistic regression classifier  on test set: 0.78


*Second Attempt - Bidirectional LSTM*

In [243]:
# CPU or GPU :
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
sequence_length = model_sentence_length
input_size =      word2vec_model_size
hidden_size = 128
num_layers = 2
num_classes = 2
batch_size = 64
num_epochs = 5
learning_rate = 0.005
lstm_dropout = 0.1


# Data loader
train_dataset = torch.utils.data.TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).long())
train_loader =  torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
test_dataset = torch.utils.data.TensorDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).long())
test_loader =  torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
              
              

In [244]:

# Bidirectional LSTM (many to one)
class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, lstm_dropout):
        super(BiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)  # 2 for bidirection
    
    def forward(self, x):
        # Set initial states
        h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device) # 2 for bidirection 
        c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
        
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size*2)
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

model = BiLSTM(input_size, hidden_size, num_layers, num_classes, lstm_dropout).to(device)


# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    


In [245]:
# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (sents, labels) in enumerate(train_loader):
#         print(sents.shape)
        sents = sents.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(sents)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

# Test the train accuracy
with torch.no_grad():
    correct = 0
    total = 0
    for sents, labels in train_loader:
        sents = sents.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(sents)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Train Accuracy of the model on the 10000 test sents: {} %'.format(100 * correct / total)) 
            

# Test the model
with torch.no_grad():
    correct = 0
    total = 0
    for sents, labels in test_loader:
        sents = sents.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(sents)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Test Accuracy of the model on the 10000 test sents: {} %'.format(100 * correct / total)) 

# Save the model checkpoint
# torch.save(model.state_dict(), 'model.ckpt')



Epoch [1/5], Step [100/403], Loss: 0.7089
Epoch [1/5], Step [200/403], Loss: 0.6193
Epoch [1/5], Step [300/403], Loss: 0.5530
Epoch [1/5], Step [400/403], Loss: 0.4670
Epoch [2/5], Step [100/403], Loss: 0.5968
Epoch [2/5], Step [200/403], Loss: 0.4371
Epoch [2/5], Step [300/403], Loss: 0.4401
Epoch [2/5], Step [400/403], Loss: 0.2820
Epoch [3/5], Step [100/403], Loss: 0.4800
Epoch [3/5], Step [200/403], Loss: 0.4245
Epoch [3/5], Step [300/403], Loss: 0.3525
Epoch [3/5], Step [400/403], Loss: 0.2356
Epoch [4/5], Step [100/403], Loss: 0.4033
Epoch [4/5], Step [200/403], Loss: 0.3918
Epoch [4/5], Step [300/403], Loss: 0.2500
Epoch [4/5], Step [400/403], Loss: 0.1784
Epoch [5/5], Step [100/403], Loss: 0.3350
Epoch [5/5], Step [200/403], Loss: 0.2731
Epoch [5/5], Step [300/403], Loss: 0.1698
Epoch [5/5], Step [400/403], Loss: 0.0867
Train Accuracy of the model on the 10000 test sents: 93.47361882206779 %
Test Accuracy of the model on the 10000 test sents: 86.16352201257861 %


*Third Attempt - Attention*

In [269]:
# CPU or GPU :
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
sequence_length = model_sentence_length
input_size =      word2vec_model_size  # embedding size
hidden_size = 128
num_layers = 2
num_classes = 2
batch_size = 64
num_epochs = 8
learning_rate = 0.005
lstm_dropout = 0.1  # dropout between 2 lstm layers
alpha_dropout = 0.3 # dropout between attention output and fc



Ideas are from the following paper - *Attention-Based Bidirectional Long Short-Term Memory Networks for
Relation Classification*
https://www.aclweb.org/anthology/P16-2034

In [289]:
# Attention Model 
class AttentionModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, lstm_dropout, sequence_length, alpha_dropout):
        super(AttentionModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True, dropout=lstm_dropout)
        
        self.alpha_net = nn.Linear(hidden_size*2*sequence_length, sequence_length)
        self.dropout = nn.Dropout(alpha_dropout)
        self.fc = nn.Linear(hidden_size*2, num_classes)  # 2 for bidirection
        
    def attention_layer(self, input_from_lstm):
        # H = {h1,h2...,ht} = input_from_lstm
        # alpha = softmax(w*M)
        M = nn.Tanh()(input_from_lstm).permute(1,0,2) # torch.Size([64, 29, 256]) - batch, sequence_length, hidden*2
        
        wM_flat = self.alpha_net(M.reshape(-1, hidden_size*2*sequence_length))
        wM = wM_flat.reshape(-1,sequence_length)
        alpha_weights = F.softmax(wM, 1).unsqueeze(2) # includes w (weights) - sized hidden*2 - unsqueeze to add axis for bmm
        
        # r = H*alpha
        #  bmm shape should be (b×n×m) and (b×m×p) then only it will give (b×n×p) as the output shape
        r = torch.bmm(input_from_lstm.permute(1,2,0), alpha_weights).squeeze()
        return r        
    
    def forward(self, x):
        # Set initial states
        h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device) # 2 for bidirection 
        c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
        
        # lstm_out: tensor of shape (batch_size, seq_length, hidden_size*2) - 
        # tensor containing the output features (h_t) from the last layer of the LSTM, for each t
        
        # last_hidden_state : (num_layers*2, batch_size, hidden_size) -
        #  tensor containing the hidden state for t = seq_len ; used for backprop...
        lstm_out, (last_hidden_state, last_cell_state) = self.lstm(x, (h0, c0)) 
        
        attention_out = self.attention_layer(lstm_out.permute(1,0,2))
        h_star = nn.Tanh()(attention_out)
        out = self.fc(self.dropout((h_star)))
        return out

model = AttentionModel(input_size, hidden_size, num_layers, num_classes, lstm_dropout, 
                       sequence_length, alpha_dropout).to(device)


# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0001)
    



In [290]:
# Train the model
total_step = len(train_loader)
scheduler = StepLR(optimizer, step_size=3, gamma=0.1)
for epoch in range(num_epochs):
    for i, (sents, labels) in enumerate(train_loader):
        sents = sents.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(sents)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
#         scheduler.zero_grad()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
            
    scheduler.step()

# Test the train accuracy
with torch.no_grad():
    correct = 0
    total = 0
    for sents, labels in train_loader:
        sents = sents.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(sents)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Train Accuracy of the model on the 10000 test sents: {} %'.format(100 * correct / total)) 
            

# Test the model
with torch.no_grad():
    correct = 0
    total = 0
    for sents, labels in test_loader:
        sents = sents.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(sents)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Test Accuracy of the model on the 10000 test sents: {} %'.format(100 * correct / total)) 



Epoch [1/8], Step [100/403], Loss: 0.7031
Epoch [1/8], Step [200/403], Loss: 0.4718
Epoch [1/8], Step [300/403], Loss: 0.4175
Epoch [1/8], Step [400/403], Loss: 0.3866
Epoch [2/8], Step [100/403], Loss: 0.4835
Epoch [2/8], Step [200/403], Loss: 0.4381
Epoch [2/8], Step [300/403], Loss: 0.3637
Epoch [2/8], Step [400/403], Loss: 0.3159
Epoch [3/8], Step [100/403], Loss: 0.3989
Epoch [3/8], Step [200/403], Loss: 0.3980
Epoch [3/8], Step [300/403], Loss: 0.2889
Epoch [3/8], Step [400/403], Loss: 0.2881
Epoch [4/8], Step [100/403], Loss: 0.3634
Epoch [4/8], Step [200/403], Loss: 0.2546
Epoch [4/8], Step [300/403], Loss: 0.1972
Epoch [4/8], Step [400/403], Loss: 0.2184
Epoch [5/8], Step [100/403], Loss: 0.3648
Epoch [5/8], Step [200/403], Loss: 0.2365
Epoch [5/8], Step [300/403], Loss: 0.1854
Epoch [5/8], Step [400/403], Loss: 0.2148
Epoch [6/8], Step [100/403], Loss: 0.3447
Epoch [6/8], Step [200/403], Loss: 0.2361
Epoch [6/8], Step [300/403], Loss: 0.1755
Epoch [6/8], Step [400/403], Loss: