In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
   ---------------------------------------- 0.0/10.5 MB ? eta -:--:--
   - -------------------------------------- 0.3/10.5 MB ? eta -:--:--
   --- ------------------------------------ 0.8/10.5 MB 2.6 MB/s eta 0:00:04
   ----- ---------------------------------- 1.3/10.5 MB 3.2 MB/s eta 0:00:03
   ------- -------------------------------- 1.8/10.5 MB 3.1 MB/s eta 0:00:03
   ----------- ---------------------------- 2.9/10.5 MB 3.1 MB/s eta 0:00:03
   ------------- -

In [2]:
import torch
import torch.nn as nn
from torch.optim import Adam
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Pick the compute device: 'cuda' when a usable GPU is reported, 'cpu' otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


In [3]:
# Load the headlines file; lines=True makes pandas parse it as one JSON record per line.
df = pd.read_json(
    'dataset/Sarcasm_Headlines_Dataset.json',
    lines=True,
)

In [4]:
# Size of the loaded frame as (rows, columns).
df.shape

(26709, 3)

In [5]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [6]:
# Preview the first rows of the frame after the missing-value cleanup.
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [8]:
# Keep a single copy of rows that are identical across all columns.
df = df.drop_duplicates()

In [9]:
# The article URL is not used from here on; only the headline and its label are kept.
df = df.drop(columns=['article_link'])

In [10]:
# Preview again to confirm that `article_link` is no longer part of the frame.
df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [12]:
# Look at the headline column as a NumPy array; the value is only displayed, not stored.
np.array(df['headline'])

array(["former versace store clerk sues over secret 'black code' for minority shoppers",
       "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
       "mom starting to fear son's web series closest thing she will have to grandchild",
       ..., 'reparations and obama',
       'israeli ban targeting boycott supporters raises alarm abroad',
       'gourmet gifts for the foodie 2014'], dtype=object)

In [13]:
# Extract the headline texts (X) and the `is_sarcastic` labels (y) as NumPy arrays.
X, y = (np.array(df[column]) for column in ('headline', 'is_sarcastic'))

In [14]:
# 70 / 15 / 15 split: hold out 30% of the samples, then cut that hold-out in half into
# validation and test parts.
# random_state is fixed so the partition (and therefore every reported metric) is the same
# after a kernel restart; without it each run trains and evaluates on different samples.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=42)

In [17]:
# Sizes of the feature splits, in the order train, test, validation.
X_train.shape,X_test.shape,X_val.shape

((18695,), (4007,), (4006,))

In [19]:
# Sizes of the label splits, in the order train, test, validation.
y_train.shape,y_test.shape,y_val.shape

((18695,), (4007,), (4006,))

In [20]:
# Load the tokenizer and the encoder from the same pretrained checkpoint so that the
# token ids produced by one match the vocabulary expected by the other.
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bert_model = AutoModel.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [23]:
class dataset(Dataset):
    """Map-style dataset pairing tokenized texts with float labels.

    Every text is tokenized once, at construction time, with the notebook-level
    ``tokenizer``; the encodings and the labels are moved to the notebook-level
    ``device`` right away.
    """

    def __init__(self, X, Y):
        """Tokenize each entry of ``X`` and store ``Y`` as a float32 tensor.

        Args:
            X: iterable of texts; each one is passed to ``tokenizer`` on its own.
            Y: label values accepted by ``torch.tensor``.
        """
        encodings = []
        for text in X:
            # Every text is truncated or padded to exactly 100 tokens.
            # NOTE(review): with return_tensors='pt' on a single text the tensors
            # presumably carry a leading axis of size 1 (the loops squeeze dim 1
            # after batching) - confirm.
            encoded = tokenizer(
                text,
                max_length=100,
                truncation=True,
                padding='max_length',
                return_tensors='pt',
            )
            encodings.append(encoded.to(device))
        self.X = encodings
        self.Y = torch.tensor(Y, dtype=torch.float32).to(device)

    def __len__(self):
        """Return the number of stored encodings."""
        return len(self.X)

    def __getitem__(self, indx):
        """Return the ``(encoding, label)`` pair stored at position ``indx``."""
        encoding = self.X[indx]
        label = self.Y[indx]
        return encoding, label

# Build one tokenized dataset per split, in the order train, validation, test.
training_data, validation_data, testing_data = (
    dataset(texts, labels)
    for texts, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [24]:
# Training hyper-parameters.
BATCH_SIZE, EPOCHS = 32, 10  # samples per batch, passes over the training data
LR = 1e-4                    # learning rate handed to the optimizer

In [25]:
# All three loaders use the same batch size and shuffle their samples.
loader_options = dict(batch_size=BATCH_SIZE, shuffle=True)
train_dataloader = DataLoader(training_data, **loader_options)
validation_dataloader = DataLoader(validation_data, **loader_options)
testing_dataloader = DataLoader(testing_data, **loader_options)

In [26]:
class MyModel(nn.Module):
    """Binary classification head stacked on top of a supplied encoder.

    The position-0 hidden state returned by ``bert`` is passed through
    Linear(768 -> 384), dropout (p=0.25), Linear(384 -> 1) and a sigmoid.

    NOTE(review): there is no activation between the two linear layers, so apart
    from dropout the head is a composition of two affine maps.
    """

    def __init__(self, bert):
        """Store the encoder and create the classification layers.

        Args:
            bert: module called as ``bert(input_ids, attention_mask, return_dict=False)``;
                the first item of its result is indexed with ``[:, 0]`` and must
                have 768 features.
        """
        super().__init__()

        self.bert = bert
        self.dropout = nn.Dropout(p=0.25)
        self.linear1 = nn.Linear(in_features=768, out_features=384)
        self.linear2 = nn.Linear(in_features=384, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid score per sample, with a trailing axis of size 1.

        Args:
            input_ids: token ids forwarded to the encoder.
            attention_mask: mask forwarded to the encoder as its second positional argument.
        """
        encoder_outputs = self.bert(input_ids, attention_mask, return_dict=False)
        # First element of the returned tuple, restricted to sequence position 0.
        first_token_state = encoder_outputs[0][:, 0]
        hidden = self.dropout(self.linear1(first_token_state))
        return self.sigmoid(self.linear2(hidden))

In [27]:
# Freeze the pretrained encoder: with gradients switched off its weights keep their
# pretrained values and only the layers added by MyModel are trained.
bert_model.requires_grad_(False)
model = MyModel(bert_model).to(device)

In [28]:
# Display the module tree of the assembled model.
model

MyModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [29]:
# Adam over the model's parameters, minimizing binary cross-entropy computed on the
# sigmoid probabilities the model outputs.
optimizer = Adam(params=model.parameters(), lr=LR)
criterion = nn.BCELoss()

In [None]:
# Per-epoch history of loss and accuracy, kept for plotting after training.
total_loss_train_plot = []
total_acc_train_plot = []
total_loss_validation_plot = []
total_acc_validation_plot = []

for epoch in range(EPOCHS):
    total_acc_train = 0
    total_loss_train = 0
    total_acc_val = 0
    total_loss_val = 0

    # ---- training pass ----
    # train() re-enables dropout, which the eval() call of the previous epoch's
    # validation pass switched off.
    model.train()
    for inputs, labels in train_dataloader:
        # .to() on a tensor returns the moved tensor instead of modifying it in place,
        # so the result has to be kept.
        inputs = inputs.to(device)
        labels = labels.to(device)

        # squeeze(1) removes a size-1 axis at dim 1 of the batched encodings; the final
        # squeeze(1) turns the (batch, 1) model output into (batch,) to match the labels.
        prediction = model(inputs['input_ids'].squeeze(1), inputs['attention_mask'].squeeze(1)).squeeze(1)
        batch_loss = criterion(prediction, labels)
        total_loss_train += batch_loss.item()

        # Rounding the probability gives the predicted class; count the hits.
        total_acc_train += (prediction.round() == labels).sum().item()

        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # ---- validation pass ----
    # eval() turns dropout off so the validation scores are deterministic.
    model.eval()
    with torch.no_grad():
        for inputs, labels in validation_dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            prediction = model(inputs['input_ids'].squeeze(1), inputs['attention_mask'].squeeze(1)).squeeze(1)
            batch_loss = criterion(prediction, labels)
            total_loss_val += batch_loss.item()

            total_acc_val += (prediction.round() == labels).sum().item()

    # The summed batch losses are scaled down by a fixed 1000 to keep the plotted values small.
    # NOTE(review): this is not a per-batch mean, so train and validation loss are only
    # comparable if both loaders yield the same number of batches.
    train_loss = round(total_loss_train / 1000, 4)
    val_loss = round(total_loss_val / 1000, 4)
    # Accuracy in percent: correct predictions over the number of samples in the split.
    train_acc = round(total_acc_train / len(training_data) * 100, 4)
    val_acc = round(total_acc_val / len(validation_data) * 100, 4)

    total_loss_train_plot.append(train_loss)
    total_loss_validation_plot.append(val_loss)
    total_acc_train_plot.append(train_acc)
    total_acc_validation_plot.append(val_acc)

    print(f"""
             Epoch No. {epoch+1}
             Train Loss: {train_loss}
             Train Accuracy: {train_acc}
             Validation Loss: {val_loss}
             Validation Accuracy: {val_acc}
    """)

In [None]:
# Final evaluation on the held-out test split.
# eval() turns dropout off so the reported score is deterministic.
model.eval()
with torch.no_grad():
    total_loss_test = 0
    total_acc_test = 0

    # Iterate the *test* loader: the accuracy below is divided by the size of the test set.
    for inputs, labels in testing_dataloader:
        # .to() on a tensor returns the moved tensor, so the result has to be kept.
        inputs = inputs.to(device)
        labels = labels.to(device)

        prediction = model(inputs['input_ids'].squeeze(1), inputs['attention_mask'].squeeze(1)).squeeze(1)
        batch_loss = criterion(prediction, labels)
        total_loss_test += batch_loss.item()

        # Rounding the probability gives the predicted class; count the hits.
        total_acc_test += (prediction.round() == labels).sum().item()

    print(f'Accuracy Score on testing data: {round(total_acc_test/len(testing_data)*100,4)}')