In [1]:
# Dataset: https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection
import pandas as pd
from transformers import BertTokenizer
import torch.nn as nn
import torch
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
import math

df = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines=True)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_input = tokenizer(df['headline'].tolist(), return_tensors='pt',padding=True)

X = encoded_input['input_ids']
y = torch.tensor(df['is_sarcastic'].values).float()

#Keeping only the first 10k samples to cut down training time
#X=X[:10000,:]
#y=y[:10000]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
torch.cuda.empty_cache()

cuda


In [2]:
class TransformerBlock(nn.Module):
    def __init__(self,embed_dim, num_heads, dropout, expansion_ratio):
        super(TransformerBlock, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, expansion_ratio*embed_dim),
            nn.ReLU(),
            nn.Linear(expansion_ratio*embed_dim,embed_dim)
        )
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, value, key, query):
        attention, _ = self.attention(value, key, query)
        x=self.dropout(self.norm1(attention+query))
        forward = self.feed_forward(x)
        out=self.dropout(self.norm2(forward+x))
        return out

class Encoder(nn.Module):
    #the vocab size is one more than the max value in the X matrix.
    def __init__(self,vocab_size=30109,embed_dim=128,num_layers=1,num_heads=4,device="cpu",expansion_ratio=4,dropout=0.1,max_length=193):
        super(Encoder,self).__init__()
        
        self.device = device
        self.word_embedding = nn.Embedding(vocab_size,embed_dim)
        self.position_embedding = nn.Embedding(max_length,embed_dim)
        self.layers = nn.ModuleList(
            [
                TransformerBlock(embed_dim,num_heads,dropout,expansion_ratio) for _ in range(num_layers)
            ]
        )
        
        self.dropout = nn.Dropout(dropout)
        self.classifier1 = nn.Linear(embed_dim,embed_dim)
        self.classifier2 = nn.Linear(embed_dim,1)
        self.relu = nn.ReLU()
    
    def forward(self,x):
        N, seq_length = x.shape
        positions = torch.arange(0,seq_length).expand(N, seq_length).to(self.device)
        out = self.dropout(self.word_embedding(x) + self.position_embedding(positions))
        
        for layer in self.layers:
            #print(out.shape)
            out = layer(out,out,out)
        
        #Get the first output for classification
        #Pooled output from hugging face is: Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function.
        #Pooled output from hugging face will be different from out[:,0,:], which is the output from the CLS token.
        out = self.relu(self.classifier1(out[:,0,:]))
        out = self.classifier2(out)
        
        return out

torch.cuda.empty_cache()
net = Encoder(device=device)
net.to(device)

Encoder(
  (word_embedding): Embedding(30109, 128)
  (position_embedding): Embedding(193, 128)
  (layers): ModuleList(
    (0): TransformerBlock(
      (attention): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (feed_forward): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): ReLU()
        (2): Linear(in_features=512, out_features=128, bias=True)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier1): Linear(in_features=128, out_features=128, bias=True)
  (classifier2): Linear(in_features=128, out_features=1, bias=True)
  (relu): ReLU()
)

In [3]:
#only required if debugging for CUDA errors to get an accurate traceback
#import os
#os.environ["CUDA_LAUNCH_BLOCKING"] = str(1)

batch_size = 32
num_train_samples = X_train.shape[0]
num_val_samples = X_test.shape[0]

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(net.parameters(),lr=1e-5)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5)

In [4]:
#variables that need to be reset for the first training
val_loss_hist=[]
loss_hist=[]
epoch = 0
min_val_loss = math.inf

In [5]:
print("Training Started")

patience = 0

for _ in range(100):
    
    epoch += 1
        
    net.train()
    epoch_loss = 0
    
    permutation = torch.randperm(X_train.size()[0])
    
    for i in range(0,X_train.size()[0], batch_size):
        
        indices = permutation[i:i+batch_size]
        
        features=X_train[indices].to(device)
        labels=y_train[indices].reshape(-1,1).to(device)
        
        output = net.forward(features)
        loss = criterion(output, labels)

        optimizer.zero_grad() 
        loss.backward()
        optimizer.step()
        
        epoch_loss+=loss.item()
        
    epoch_loss = epoch_loss / num_train_samples * num_val_samples
    loss_hist.append(epoch_loss)
    
    #print("Eval")
    net.eval()
    epoch_val_loss = 0
    
    permutation = torch.randperm(X_test.size()[0])
    
    for i in range(0,X_test.size()[0], batch_size):
        
        indices = permutation[i:i+batch_size]
        
        features=X_test[indices].to(device)
        labels = y_test[indices].reshape(-1,1).to(device)
        
        output = net.forward(features)
        loss = criterion(output, labels)        

        epoch_val_loss+=loss.item()
    
    val_loss_hist.append(epoch_val_loss)
    
    scheduler.step(epoch_val_loss)
    
    #if epoch % 5 == 0:
    print("Epoch: " + str(epoch) + " Train Loss: " + format(epoch_loss, ".4f") + ". Val Loss: " + format(epoch_val_loss, ".4f") + " LR: " + str(optimizer.param_groups[0]['lr']))
            
    if epoch_val_loss < min_val_loss:
        min_val_loss = epoch_val_loss
        torch.save(net.state_dict(), "torchmodel/weights_best.pth")
        print('\033[93m'+"Model Saved"+'\033[0m')
        patience = 0
        
    else:
        patience += 1
    
    if (patience == 10):
        break
        
print("Training Ended")

Training Started
Epoch: 1 Train Loss: 124.1192. Val Loss: 123.8753 LR: 1e-05
[93mModel Saved[0m
Epoch: 2 Train Loss: 124.0192. Val Loss: 123.8767 LR: 1e-05
Epoch: 3 Train Loss: 124.0183. Val Loss: 123.8744 LR: 1e-05
[93mModel Saved[0m
Epoch: 4 Train Loss: 123.9228. Val Loss: 123.9224 LR: 1e-05
Epoch: 5 Train Loss: 123.9748. Val Loss: 123.8779 LR: 1e-05
Epoch: 6 Train Loss: 124.0237. Val Loss: 123.8771 LR: 1e-05
Epoch: 7 Train Loss: 124.0117. Val Loss: 123.8727 LR: 1.0000000000000002e-06
[93mModel Saved[0m
Epoch: 8 Train Loss: 123.9211. Val Loss: 123.8745 LR: 1.0000000000000002e-06
Epoch: 9 Train Loss: 123.9601. Val Loss: 123.8778 LR: 1.0000000000000002e-06
Epoch: 10 Train Loss: 123.9187. Val Loss: 123.8783 LR: 1.0000000000000002e-06
Epoch: 11 Train Loss: 123.8877. Val Loss: 123.8774 LR: 1.0000000000000002e-06
Epoch: 12 Train Loss: 124.0129. Val Loss: 123.8770 LR: 1.0000000000000002e-06
Epoch: 13 Train Loss: 124.0220. Val Loss: 123.8778 LR: 1.0000000000000002e-07
Epoch: 14 Train L

In [6]:
from sklearn.metrics import accuracy_score

#Inference
torch.cuda.empty_cache()
net = Encoder(device=device)
net.load_state_dict(torch.load("torchmodel/weights_best.pth"))
net.to(device)
net.eval()

#checking on the train set to see if it gives all the same output
X=X_test[:64,:]
y=y_test[:64]

features=X.to(device)

output = net.forward(features).detach()

sigmoid = torch.nn.Sigmoid()
output = sigmoid(output)

print(output)

output = (output>0.5).float().to("cpu")
labels = y.reshape(-1,1)

accuracy_score(labels, output)

tensor([[0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0.4761],
        [0

0.484375

In [21]:
#Testing Scripts
import pandas as pd
from transformers import BertTokenizer
import torch.nn as nn
import torch

df = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines=True)
df

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...


In [14]:
res = df['headline'].str.split().str.len().max()
print("The maximum length in words are : " +  str(res)) 

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_input = tokenizer(df['headline'].tolist(), return_tensors='pt',padding=True)

The maximum length in words are : 151


In [22]:
print(encoded_input.keys())
print(encoded_input['input_ids'].shape)
print(encoded_input['input_ids'][0])
print(torch.max(encoded_input['input_ids']))

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
torch.Size([28619, 193])
tensor([  101,  4228, 14045, 20744,  6529,  4895,  3726,  4014, 12677, 16150,
         4710,  5119,  1997,  2606,  3279,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,

In [25]:
print(torch.any(torch.isnan(X_train)))
print(torch.any(torch.isnan(X_test)))
print(torch.any(torch.isnan(y_train)))
print(torch.any(torch.isnan(y_test)))

print(X_train[0])
print(X_test[0])
#print(len(y_train))
#print(len(y_test))
#print(torch.sum(y_train))
#print(torch.sum(y_test))
print(y_test)

tensor(False)
tensor(False)
tensor(False)
tensor(False)
tensor([  101, 11734,  3237, 20057,  2039,  5896,  2008,  4269,  2007,  2996,
         8154, 14059,  2046,  7411,  2915,  1997,  5025,  3137,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
        