In [1]:
import json
import pandas as pd
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk import word_tokenize
from torch.nn.utils.rnn import pad_sequence
import torch
from torch.nn import Embedding
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import os
import numpy as np
from torch.utils.data import Dataset, DataLoader

In [2]:
def parse_data(file):
    for l in open(file, 'r'):
        yield json.loads(l)
        
data = pd.DataFrame(parse_data('C:\\Users\\david\\sarcasm-detection\\data\\Sarcasm_Headlines_Dataset_v2.json'))

In [3]:
data.columns = ['Sarcastic', 'Headline', 'Link']
data.head(5)

Unnamed: 0,Sarcastic,Headline,Link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [4]:
stop_words_removed = []
for i in data['Headline']:
    i = re.sub(r"[^a-zA-Z\s]", "", i)
    main_words = ' '.join([j for j in i.split() if j.lower() not in ENGLISH_STOP_WORDS])
    stop_words_removed.append(main_words)
    
tokenized_corpus = [word_tokenize(i) for i in stop_words_removed]
vocab = set([word for sentence in tokenized_corpus for word in sentence])
word_to_idx = {word: i for i, word in enumerate(vocab)}


In [5]:
ttsplit = 0.2 #Personal preference
max_length = 25 #Number previously derived from tuning

tokenized_corpus_num = []
for headline in tokenized_corpus:
    idx_seq = []
    for word in headline:
        idx_seq.append(word_to_idx[word])
    tokenized_corpus_num.append(idx_seq)

tensor_corpus = [torch.tensor(seq, dtype = torch.long) for seq in tokenized_corpus_num]
padded_sequences = pad_sequence(tensor_corpus, batch_first=True,padding_value=len(vocab))
padded_sequences = padded_sequences[:, :max_length]

sarcasm = torch.tensor(data['Sarcastic'].values, dtype = torch.long)

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, sarcasm, test_size = ttsplit,
                                                    stratify = sarcasm, shuffle = True)



In [6]:
embeddings_index = {}
embedding_dim = 100

GloVe = open(os.path.join(os.getcwd(), 'glove.twitter.27B.100d.txt'), encoding = "utf-8")

for entry in GloVe:
    values = entry.split()
    word = values[0]
    coeffs = np.asarray(values[1:], dtype = 'float32')
    embeddings_index[word] = coeffs
GloVe.close()

In [7]:
embedding_matrix = np.zeros((len(word_to_idx) + 1, embedding_dim))
c = 0
for word, i in word_to_idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        c+=1
        embedding_matrix[i] = embedding_vector


In [8]:
vocab_size = len(word_to_idx) + 1
class SarcasmLSTM(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim = 64, dropout = 0.2, rdropout = 0.25):
        super().__init__()
        
        self.embedding_weights = torch.tensor(embedding_matrix, dtype = torch.float32)
        self.embedding_layer = Embedding.from_pretrained(self.embedding_weights, freeze = True, padding_idx=len(vocab))

    
        self.lstm = nn.LSTM(input_size = 100, hidden_size = hidden_dim,
                            batch_first=True, dropout=rdropout,
                            bidirectional = True)
        
        self.fc = nn.Linear(hidden_dim*2, 1)
        
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.embedding_layer(x)
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        x = self.fc(x)
        x = self.sigmoid(x)
        return x
        
class SarcasmDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
    
train_dataset = SarcasmDataset(X_train, y_train)
test_dataset = SarcasmDataset(X_test, y_test)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle = False)

In [19]:
model2 = SarcasmLSTM(embedding_matrix = embedding_matrix)
device = torch.device("cpu")

criterion = nn.BCELoss()
optimizer = optim.Adam(model2.parameters(), lr = 0.001)
num_epochs = 30

for epoch in range(num_epochs):
    model2.train()
    total_loss = 0
    correct = 0
    total = 0
    
    for batch in train_loader:
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device).float().unsqueeze(1)
        
        optimizer.zero_grad()
        outputs = model2(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        predictions = (outputs > 0.5).float()
        correct += (predictions == labels).sum().item()
        total += labels.size(0)
            
    avg_loss = total_loss / len(train_loader)
    accuracy = correct / total
    
    print(f"Epoch [{epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy}]")
    
torch.save(model2.state_dict, "sarcasm_lstm_GloVe2.pth")
print('Model Saved!')
        

Epoch [1/30, Loss: 0.5552, Accuracy: 0.7110286088665647]
Epoch [2/30, Loss: 0.4622, Accuracy: 0.7829220353789037]
Epoch [3/30, Loss: 0.4202, Accuracy: 0.8059838392662153]
Epoch [4/30, Loss: 0.3791, Accuracy: 0.828958287835772]
Epoch [5/30, Loss: 0.3474, Accuracy: 0.845555798209216]
Epoch [6/30, Loss: 0.3165, Accuracy: 0.8628084734658222]
Epoch [7/30, Loss: 0.2877, Accuracy: 0.8773531338720244]
Epoch [8/30, Loss: 0.2619, Accuracy: 0.8909368857829221]
Epoch [9/30, Loss: 0.2367, Accuracy: 0.9056562568246342]
Epoch [10/30, Loss: 0.2094, Accuracy: 0.9190216204411443]
Epoch [11/30, Loss: 0.1881, Accuracy: 0.9257916575671544]
Epoch [12/30, Loss: 0.1700, Accuracy: 0.936580039309893]
Epoch [13/30, Loss: 0.1459, Accuracy: 0.9464075125573269]
Epoch [14/30, Loss: 0.1328, Accuracy: 0.9534396156366018]
Epoch [15/30, Loss: 0.1193, Accuracy: 0.9592050666084297]
Epoch [16/30, Loss: 0.1061, Accuracy: 0.9647521292858703]
Epoch [17/30, Loss: 0.0949, Accuracy: 0.9695566717623936]
Epoch [18/30, Loss: 0.0905