In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [29]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import plotly.express as px

import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import re,string,unicodedata
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,f1_score
from sklearn.model_selection import train_test_split
from string import punctuation
from nltk import pos_tag
from nltk.corpus import wordnet


import keras
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras 
from keras import backend as K
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from keras.layers import LSTM,Dense,Bidirectional,Input
from keras.models import Model
import torch
import transformers

In [30]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # to utilize GPU in colab

In [31]:
df=pd.read_json("/content/drive/MyDrive/Btech Project/Sarcasm_Headlines_Dataset_v2.json", lines=True)# Using version 2 of dataset
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [32]:
df.drop('article_link', axis=1, inplace=True)

## 2. Basic Data Cleaning

In [33]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
stop = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)

In [35]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)
# Removing URL's
def remove_between_square_brackets(text):
    return re.sub(r'http\S+', '', text)
#Removing the stopwords from text
def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop and i.strip().lower().isalpha():
            final_text.append(i.strip().lower())
    return " ".join(final_text)
#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_stopwords(text)
    return text
#Apply function on review column
df['headline']=df['headline'].apply(denoise_text)

  soup = BeautifulSoup(text, "html.parser")


In [36]:
def get_corpus(text):
    words = []
    for i in text:
        for j in i.split():
            words.append(j.strip())
    return words  
corpus = get_corpus(df.headline)
corpus[:5]

['thirtysomething', 'scientists', 'unveil', 'doomsday', 'clock']

In [37]:
from collections import Counter
counter = Counter(corpus)
most_common = counter.most_common(10)
most_common = dict(most_common)
most_common

{'new': 1637,
 'man': 1351,
 'trump': 1284,
 'one': 527,
 'area': 494,
 'woman': 483,
 'says': 482,
 'donald': 472,
 'day': 435,
 'like': 428}

## 4. Model Building

In [38]:
X = df['headline']
y = df['is_sarcastic']

In [39]:
train_text,val_text,train_labels,val_labels = train_test_split(X,y,random_state = 0 , stratify = y)

In [40]:
from transformers import AutoTokenizer,AutoModel
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
bert = AutoModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [41]:
MAX_LENGHT = 500

# Tokenize and encode sequences in the train set
tokens_train1 = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length = MAX_LENGHT,
    pad_to_max_length=True,
    truncation=True
)

train_seq1 = torch.tensor(tokens_train1['input_ids']).to(device)
train_mask1 = torch.tensor(tokens_train1['attention_mask']).to(device)
del tokens_train1

tokens_val1 = tokenizer.batch_encode_plus(
    val_text.tolist(),
    max_length = MAX_LENGHT,
    pad_to_max_length=True,
    truncation=True
)

val_seq1 = torch.tensor(tokens_val1['input_ids']).to(device)
val_mask1 = torch.tensor(tokens_val1['attention_mask']).to(device)
del tokens_val1



In [42]:
train_y = torch.tensor(train_labels.tolist()).to(device)
val_y = torch.tensor(val_labels.tolist()).to(device)

In [43]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 32                                               #define a batch size

In [44]:
for param in bert.parameters():
    param.requires_grad = False    # false here means gradient need not be computed

We shall be using the pre-trained BERT model which has been extensively trained on huge text corpuses of Wikipedia.
Now after getting the pre-trained model, we can fine-tune the model for our own specific task.

The key thing to notice is here is that for our final prediction we would only be using the output generated by the first token ([CLS]), after which we use a sigmoid activation for final prediction.

In [45]:
train_data = TensorDataset(train_seq1, train_mask1, train_y)    # wrap tensors
train_sampler = RandomSampler(train_data)                     # sampler for sampling the data during training
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
                                                              # dataLoader for train set
val_data = TensorDataset(val_seq1, val_mask1, val_y)            # wrap tensors

val_sampler = SequentialSampler(val_data)                     # sampler for sampling the data during training
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

In [46]:
import torch
import torch.nn as nn

In [47]:
class bert_arch(nn.Module):
    def __init__(self, bert):  
      super(bert_arch, self).__init__()
      self.bert = bert
      self.dropout = nn.Dropout(0.5)
      self.fc1 = nn.Linear(768,2)             # dense layer 1
      self.softmax = nn.LogSoftmax(dim=1)       # softmax activation function
    def forward(self, sent_id, mask):           # define the forward pass  
      cls_hs = self.bert(sent_id, attention_mask=mask)[0][:,0]

      x = self.fc1(cls_hs)
      x = self.softmax(x)                       # apply softmax activation
      return x


In [48]:
model = bert_arch(bert)
model.to(device)
# Defining the hyperparameters (optimizer, weights of the classes and the epochs)
# Define the optimizer
from transformers import AdamW
optimizer = AdamW(model.parameters(),
                  lr = 2e-5)          # learning rate
epochs = 10
crossEntropyloss = nn.CrossEntropyLoss()



In [49]:
# Defining training and evaluation functions
def train(optimizer):
  model.train()
  total_loss, total_accuracy = 0, 0
  for step,batch in enumerate(train_dataloader):                # iterate over batches
    if step % 50 == 0 and not step == 0:                        # progress update after every 50 batches.
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
    batch = [r for r in batch]                                  # push the batch to gpu
    sent_id1, mask1,labels = batch 
    model.zero_grad()                                           # clear previously calculated gradients
    preds = model(sent_id1, mask1)                                # get model predictions for current batch
    # loss = cross_entropy(preds, labels)                         # compute loss between actual & predicted values
    loss = crossEntropyloss(preds, labels)
    total_loss = total_loss + loss.item()                       # add on to the total loss
    loss.backward()                                             # backward pass to calculate the gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)     # clip gradients to 1.0. It helps in preventing exploding gradient problem
    optimizer.step()
                                              # update parameters
    # preds=preds.detach().cpu().numpy()                          # model predictions are stored on GPU. So, push it to CPU

  avg_loss = total_loss / len(train_dataloader)                 # compute training loss of the epoch  
                                                            # reshape predictions in form of (# samples, # classes)
  return avg_loss                                 # returns the loss and predictions

def evaluate():  
  print("\nEvaluating...")  
  model.eval()                                    # Deactivate dropout layers
  total_loss, total_accuracy = 0, 0  
  for step,batch in enumerate(val_dataloader):    # Iterate over batches  
    if step % 50 == 0 and not step == 0:          # Progress update every 50 batches.     
                                                  # Calculate elapsed time in minutes.
                                                  # Elapsed = format_time(time.time() - t0)
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))
                                                  # Report progress
    batch = [t for t in batch]                    # Push the batch to GPU
    sent_id1, mask1,labels = batch
    with torch.no_grad():                         # Deactivate autograd
      preds = model(sent_id1, mask1)                # Model predictions
      loss = crossEntropyloss(preds,labels)          # Compute the validation loss between actual and predicted values
      total_loss = total_loss + loss.item()
      # preds = preds.detach().cpu().numpy()
                                                  # update parameters
  avg_loss = total_loss / len(val_dataloader)         # compute the validation loss of the epoch
  
  return avg_loss

In [50]:
train_losses=[]
valid_losses=[]
best_valid_loss = float('inf')
model_location='/content/drive/MyDrive/Btech Project/BERT MODELS'

In [51]:
import torch
torch.cuda.empty_cache()

In [52]:
for epoch in range(epochs):     
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))     
    train_loss = train(optimizer)                       # train model
    valid_loss = evaluate()                    # evaluate model
    if valid_loss < best_valid_loss:              # save the best model
        best_valid_loss = valid_loss
        print('Saving best model at epoch:', epoch)
        torch.save(model.state_dict(), model_location+'bert_best.pt')
    if epoch % 10 == 0:
      print('Saving checkpoint:', epoch)
      torch.save(model.state_dict(),model_location+'bert' + str(epoch) + '_.pt')
    train_losses.append(train_loss)               # append training and validation loss
    valid_losses.append(valid_loss)
    
    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')



 Epoch 1 / 10
  Batch    50  of    671.
  Batch   100  of    671.
  Batch   150  of    671.
  Batch   200  of    671.
  Batch   250  of    671.
  Batch   300  of    671.
  Batch   350  of    671.
  Batch   400  of    671.
  Batch   450  of    671.
  Batch   500  of    671.
  Batch   550  of    671.
  Batch   600  of    671.
  Batch   650  of    671.

Evaluating...
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   150  of    224.
  Batch   200  of    224.
Saving best model at epoch: 0
Saving checkpoint: 0

Training Loss: 0.697
Validation Loss: 0.691

 Epoch 2 / 10
  Batch    50  of    671.
  Batch   100  of    671.
  Batch   150  of    671.
  Batch   200  of    671.
  Batch   250  of    671.
  Batch   300  of    671.
  Batch   350  of    671.
  Batch   400  of    671.
  Batch   450  of    671.
  Batch   500  of    671.
  Batch   550  of    671.
  Batch   600  of    671.
  Batch   650  of    671.

Evaluating...
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   15

In [53]:
import tensorflow as tf

In [56]:
model=bert_arch(bert)
model.load_state_dict(torch.load(r'/content/drive/MyDrive/Btech Project/BERT MODELS/bert_best.pt'))

<All keys matched successfully>

In [57]:
model.to(device)

bert_arch(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [58]:
def evaluate_after():  
  print("\nEvaluating...")  
  model.eval()                                    # Deactivate dropout layers
  # total_loss, total_accuracy = 0, 0  
  preds_all = np.array([])
  labels_all = np.array([])
  for step,batch in enumerate(val_dataloader):    # Iterate over batches  
    if step % 50 == 0 and not step == 0:          # Progress update every 50 batches.     
                                                  # Calculate elapsed time in minutes.
                                                  # Elapsed = format_time(time.time() - t0)
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))
                                                  # Report progress
    batch = [t for t in batch]                    # Push the batch to GPU
    sent_id1, mask1, labels = batch
    with torch.no_grad():                         # Deactivate autograd
      preds = model(sent_id1, mask1)                # Model predictions
      preds = preds.detach().cpu().numpy()
      preds = np.argmax(preds, axis = 1)
      preds_all = np.concatenate((preds_all, preds))
      labels_all = np.concatenate((labels_all, labels.detach().cpu().numpy()))
      # print(preds_all)
      # print(labels_all)
      # break
      # loss = cross_entropy (preds,labels)          # Compute the validation loss between actual and predicted values
      # total_loss = total_loss + loss.item()
      # preds = preds.detach().cpu().numpy()
  #avg_loss = total_loss / len(val_dataloader)         # compute the validation loss of the epoch
  # return avg_los
  print(len(preds_all))
  print(len(labels_all))
  return preds_all, labels_all

preds, labels = evaluate_after()
from sklearn.metrics import classification_report
print(classification_report(labels, preds))


Evaluating...
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   150  of    224.
  Batch   200  of    224.
7155
7155
              precision    recall  f1-score   support

         0.0       0.54      0.92      0.68      3746
         1.0       0.63      0.14      0.24      3409

    accuracy                           0.55      7155
   macro avg       0.59      0.53      0.46      7155
weighted avg       0.59      0.55      0.47      7155



In [59]:
model=bert_arch(bert)
model.load_state_dict(torch.load(r'/content/drive/MyDrive/Btech Project/BERT MODELS/bert10_.pt'))

<All keys matched successfully>

In [60]:
model.to(device)

bert_arch(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [61]:
def evaluate_after():  
  print("\nEvaluating...")  
  model.eval()                                    # Deactivate dropout layers
  # total_loss, total_accuracy = 0, 0  
  preds_all = np.array([])
  labels_all = np.array([])
  for step,batch in enumerate(val_dataloader):    # Iterate over batches  
    if step % 50 == 0 and not step == 0:          # Progress update every 50 batches.     
                                                  # Calculate elapsed time in minutes.
                                                  # Elapsed = format_time(time.time() - t0)
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))
                                                  # Report progress
    batch = [t for t in batch]                    # Push the batch to GPU
    sent_id1, mask1, labels = batch
    with torch.no_grad():                         # Deactivate autograd
      preds = model(sent_id1, mask1)                # Model predictions
      preds = preds.detach().cpu().numpy()
      preds = np.argmax(preds, axis = 1)
      preds_all = np.concatenate((preds_all, preds))
      labels_all = np.concatenate((labels_all, labels.detach().cpu().numpy()))
      # print(preds_all)
      # print(labels_all)
      # break
      # loss = cross_entropy (preds,labels)          # Compute the validation loss between actual and predicted values
      # total_loss = total_loss + loss.item()
      # preds = preds.detach().cpu().numpy()
  #avg_loss = total_loss / len(val_dataloader)         # compute the validation loss of the epoch
  # return avg_los
  print(len(preds_all))
  print(len(labels_all))
  return preds_all, labels_all

preds, labels = evaluate_after()
from sklearn.metrics import classification_report
print(classification_report(labels, preds)) ## This produces the best results for BERT


Evaluating...
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   150  of    224.
  Batch   200  of    224.
7155
7155
              precision    recall  f1-score   support

         0.0       0.60      0.74      0.66      3746
         1.0       0.61      0.45      0.52      3409

    accuracy                           0.60      7155
   macro avg       0.61      0.60      0.59      7155
weighted avg       0.61      0.60      0.59      7155



In [62]:
model=bert_arch(bert)
model.load_state_dict(torch.load(r'/content/drive/MyDrive/Btech Project/BERT MODELS/bert0_.pt'))

<All keys matched successfully>

In [63]:
model.to(device)

bert_arch(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [64]:
def evaluate_after():  
  print("\nEvaluating...")  
  model.eval()                                    # Deactivate dropout layers
  # total_loss, total_accuracy = 0, 0  
  preds_all = np.array([])
  labels_all = np.array([])
  for step,batch in enumerate(val_dataloader):    # Iterate over batches  
    if step % 50 == 0 and not step == 0:          # Progress update every 50 batches.     
                                                  # Calculate elapsed time in minutes.
                                                  # Elapsed = format_time(time.time() - t0)
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))
                                                  # Report progress
    batch = [t for t in batch]                    # Push the batch to GPU
    sent_id1, mask1, labels = batch
    with torch.no_grad():                         # Deactivate autograd
      preds = model(sent_id1, mask1)                # Model predictions
      preds = preds.detach().cpu().numpy()
      preds = np.argmax(preds, axis = 1)
      preds_all = np.concatenate((preds_all, preds))
      labels_all = np.concatenate((labels_all, labels.detach().cpu().numpy()))
      # print(preds_all)
      # print(labels_all)
      # break
      # loss = cross_entropy (preds,labels)          # Compute the validation loss between actual and predicted values
      # total_loss = total_loss + loss.item()
      # preds = preds.detach().cpu().numpy()
  #avg_loss = total_loss / len(val_dataloader)         # compute the validation loss of the epoch
  # return avg_los
  print(len(preds_all))
  print(len(labels_all))
  return preds_all, labels_all

preds, labels = evaluate_after()
from sklearn.metrics import classification_report
print(classification_report(labels, preds))


Evaluating...
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   150  of    224.
  Batch   200  of    224.
7155
7155
              precision    recall  f1-score   support

         0.0       0.53      0.95      0.68      3746
         1.0       0.60      0.09      0.15      3409

    accuracy                           0.54      7155
   macro avg       0.57      0.52      0.42      7155
weighted avg       0.56      0.54      0.43      7155

