In [None]:
!pip install transformers

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import datetime
import numpy as np
import pandas
import random
import time
import torch

In [None]:
# Select the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print("Running on '%s'." % device)

## Mount Google Drive (optional, Colab only)

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')
#%cd /content/drive/MyDrive/sarcasm-detection/notebooks
#!ls

# Get Data

In [None]:
# Load the pre-processed sarcasm dataset (path is relative to the working directory).
df = pandas.read_csv("../data/processed_sarcasm-dataset.csv", index_col=False)

# Fix the column dtypes up front.
# NOTE(review): casting to 'str' may turn missing values into the literal text
# 'nan' -- confirm the processed CSV has no empty comments.
df = df.astype({'label': 'int32', 'comment': 'str', 'parent_comment': 'str'})

# Shuffle the rows with a fixed seed so a fresh "Restart & Run All" reproduces the
# same row order; an unseeded shuffle would make every later step non-reproducible.
df = df.sample(frac=1, random_state=2023)
df = df.reset_index(drop=True)

In [None]:
# Show the loaded, shuffled DataFrame as a quick sanity check.
display(df)

# Get Tokenizer

In [None]:
# Running count of rows tokenized so far; encode_text() increments it to report progress.
counter = 0

In [None]:
# Cased BERT WordPiece tokenizer matching the "bert-base-cased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")


def encode_text(parent_comment, comment):
  """Tokenize a (parent_comment, comment) pair as one BERT sentence-pair input.

  The pair is laid out as ``[CLS] parent_comment [SEP] comment [SEP]``, then
  truncated and padded to exactly 384 tokens.

  Side effect: increments the module-level ``counter`` and rewrites a one-line
  progress message on stdout.

  Args:
    parent_comment: Text of the comment being replied to (first segment).
    comment: Text of the reply to classify (second segment).

  Returns:
    The tokenizer output as PyTorch tensors (``return_tensors='pt'``); later
    cells read its ``'input_ids'`` and ``'attention_mask'`` entries and
    concatenate them along dim 0.
  """
  global counter
  counter += 1
  print("\rProcessed rows: %d" % counter, end='')

  # Call the tokenizer directly: `encode_plus` is deprecated in favour of
  # `__call__`, which accepts the same arguments.
  return tokenizer(
      parent_comment, comment,
      add_special_tokens=True,     # insert '[CLS]' and '[SEP]'
      padding='max_length',        # pad every row up to max_length
      truncation=True,             # longest-first truncation of the pair
      max_length=384,              # fixed input length
      return_attention_mask=True,  # mask distinguishing real tokens from padding
      return_tensors='pt',         # PyTorch tensors
  )

In [None]:
def _encode_row(row):
  """Tokenize one DataFrame row as a (parent_comment, comment) pair."""
  return encode_text(row['parent_comment'], row['comment'])


# Row-wise apply: one tokenizer output per row, kept in the DataFrame's row order.
# The individual fields (input_ids, attention_mask) are extracted after the split
# instead of being stored back as DataFrame columns.
encoding_out = df.apply(_encode_row, axis=1)

# Train-Test Split

In [None]:
# Hold out 20% of the encoded rows (and their labels) for validation.
# The fixed random_state makes the split deterministic for a given row order.
x_train, x_val, y_train, y_val = train_test_split(
    encoding_out,
    df['label'].to_list(),
    test_size=0.2,
    random_state=2023,
)

In [None]:
def _stack_field(encodings, field):
    """Concatenate the `field` tensor of every encoding in `encodings` along dim 0."""
    return torch.cat([encoding[field] for encoding in encodings], dim=0)


# Keep the full dataset in host memory. The training and validation loops move each
# batch to `device` themselves, so copying every row to the GPU up front would only
# take memory away from the model.
train_inputs = _stack_field(x_train, 'input_ids')
validation_inputs = _stack_field(x_val, 'input_ids')

train_masks = _stack_field(x_train, 'attention_mask')
validation_masks = _stack_field(x_val, 'attention_mask')

train_labels = torch.tensor(y_train)
validation_labels = torch.tensor(y_val)

# Generate batches

In [None]:
# Number of examples per mini-batch, used by both the training and validation DataLoaders.
batch_size = 16

In [None]:
def _build_loader(inputs, masks, labels, sampler_cls):
    """Bundle three aligned tensors into a dataset, a sampler and a DataLoader.

    Returns the (dataset, sampler, dataloader) triple; the loader yields
    (input_ids, attention_mask, labels) batches of `batch_size` rows.
    """
    dataset = TensorDataset(inputs, masks, labels)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataset, sampler, loader


# Training batches are drawn in random order.
train_data, train_sampler, train_dataloader = _build_loader(
    train_inputs, train_masks, train_labels, RandomSampler
)

# Validation batches are read in their stored order.
validation_data, validation_sampler, validation_dataloader = _build_loader(
    validation_inputs, validation_masks, validation_labels, SequentialSampler
)

# Train Config

In [None]:
# Load the pre-trained "bert-base-cased" checkpoint configured for 2-way
# sequence classification.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased", # Use the 12-layer BERT model
    num_labels = 2, # sarcastic or not
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the device selected earlier. Unlike model.cuda(), this also
# works on machines without a GPU, where `device` is 'cpu'.
model.to(device)

In [None]:
# Optimizer hyper-parameters: the step size and the numerical-stability term.
learning_rate, epsilon = 3e-5, 1e-8

In [None]:
# AdamW over every model parameter; `eps` keeps the update's denominator away from zero.
optimizer = AdamW(
    params=model.parameters(),
    lr=learning_rate,
    eps=epsilon,
)

# Train Model

In [None]:
# Number of passes over the training data, and an initially empty list of loss values.
epochs, loss_values = 4, []

In [None]:
# Fine-tune for exactly `epochs` passes over the training set, measuring accuracy on
# the validation set after every pass. Epochs are numbered 1..epochs.
for epoch in range(1, epochs + 1):

    print("")
    print('======== Epoch %d / %d ========' % (epoch, epochs))

    # Sum of the per-batch training losses for this epoch.
    total_loss = 0

    # Put the model into training mode
    model.train()

    # For each batch of training data
    for batch in train_dataloader:

        # Unpack the batch and make sure every tensor lives on the compute device
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # Clear any previously calculated gradients before performing a backward pass
        model.zero_grad()

        # Forward pass. Passing `labels` makes the model compute the loss as well.
        # NOTE(review): with token_type_ids=None the parent/comment segments are not
        # marked by segment ids -- confirm this is intended for sentence-pair input.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # When labels are supplied, the first element of the output is the loss
        loss = outputs[0]

        # Accumulate the training loss over all of the batches
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients
        loss.backward()

        # Clip the norm of the gradients to 1.0 (helps prevent the "exploding gradients" problem)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient
        optimizer.step()

    # Average loss per batch for this epoch, recorded for later inspection.
    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    print("Average training loss: %.2f" % (avg_train_loss))

    # ========================================
    #               Validation
    # ========================================

    print("\nRunning Validation...")

    # Put the model in evaluation mode
    model.eval()

    # Count correct predictions and examples across the whole validation set, so the
    # reported accuracy is exact even when the last batch is smaller than the others.
    correct_predictions, seen_examples = 0, 0

    for batch in validation_dataloader:

        # Unpack the batch and make sure every tensor lives on the compute device
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # No gradients are needed for evaluation; this saves memory and time
        with torch.no_grad():

            # Forward pass without labels: the first output element is the logits
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        # The predicted class is the index of the largest logit in each row
        predictions = torch.argmax(logits, dim=1)

        correct_predictions += (predictions == b_labels).sum().item()
        seen_examples += b_labels.size(0)

    # Report the accuracy over the full validation set for this epoch.
    print("Val Accuracy: %.2f" % (correct_predictions / seen_examples))


In [None]:
# Example (parent_comment, comment) pair to classify with the fine-tuned model.
parent_comment = "obamas fema camps have experienced significant delays PERIOD but they should finally be ready on or before november 8 PERIOD"
comment = "they will all be tragic victims of climate changemuahahahaha"

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Tokenize the pair in the sentence-pair format BERT expects.
encoded_input = tokenizer(parent_comment, comment, padding=True, truncation=True, max_length = 384, return_tensors='pt')

# Run on the same device as the rest of the notebook (GPU when available, else CPU).
model = model.to(device)
encoded_input = encoded_input.to(device)

# Make sure dropout is disabled, independent of which cell ran last.
model.eval()

# Feed the model exactly what it saw during fine-tuning: input ids and attention
# mask, with token_type_ids=None. Forwarding the tokenizer's token_type_ids here
# would give the model segment ids it was never trained with.
with torch.no_grad():
    outputs = model(encoded_input['input_ids'],
                    token_type_ids=None,
                    attention_mask=encoded_input['attention_mask'])

In [None]:
import torch.nn.functional as F

# Turn the raw logits into class probabilities along the last (class) dimension.
probs = F.softmax(outputs.logits, dim=-1)

# The predicted label is the index of the most probable class.
predicted_labels = probs.argmax(dim=-1)

print(predicted_labels)

# Test model

In [None]:
# Release the Colab runtime once the notebook has finished, so an idle GPU session
# is not kept allocated. The import only exists on Colab, so outside Colab this
# cell does nothing instead of failing with ImportError.
try:
    from google.colab import runtime
except ImportError:
    runtime = None

if runtime is not None:
    runtime.unassign()