In [1]:
!pip install datasets transformers evaluate accelerate  -qU

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/280.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/280.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from pprint import pprint
from datasets import load_dataset
dataset = load_dataset("raquiba/Sarcasm_News_Headline")
pprint(dataset["train"][0])
pprint(dataset["train"][1])



{'article_link': 'https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205',
 'headline': 'thirtysomething scientists unveil doomsday clock of hair loss',
 'is_sarcastic': 1}
{'article_link': 'https://www.huffingtonpost.com/entry/donna-edwards-inequality_us_57455f7fe4b055bb1170b207',
 'headline': 'dem rep. totally nails why congress is falling short on gender, '
             'racial equality',
 'is_sarcastic': 0}


## PREPROCESSING

In [3]:
dataset = dataset.map(
    lambda example: {
        "text": example["headline"],
        "label": example["is_sarcastic"]
        }, remove_columns=["headline", "article_link", "is_sarcastic"]
)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 28619
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 26709
    })
})

## Fine-Tuning a Pre-Trained Transformer Model

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
model_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

## Lets tokenize
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/26709 [00:00<?, ? examples/s]

In [5]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [6]:
from datetime import datetime

# Initialize the DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)  # Using pad_to_multiple_of is optional

# Setup TrainingArguments and Trainer as previously described, including the data_collator
dt_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
num_epochs = 5
training_args = TrainingArguments(
    output_dir="./results/" + dt_str,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=128,
    warmup_steps=min(500, int(0.1 * num_epochs * len(tokenized_datasets['train']))),
    weight_decay=0.01,
    logging_dir="./logs/" + dt_str,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="tensorboard",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [7]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1871,0.120339,0.956494
2,0.152,0.050492,0.984088
3,0.0657,0.026297,0.992886
4,0.0339,0.008548,0.998203
5,0.0203,0.007329,0.998652


TrainOutput(global_step=4475, training_loss=0.1277098144431125, metrics={'train_runtime': 1179.9117, 'train_samples_per_second': 121.276, 'train_steps_per_second': 3.793, 'total_flos': 2183977981677120.0, 'train_loss': 0.1277098144431125, 'epoch': 5.0})

## Interacting with the Model


In [10]:
from transformers import pipeline

model = AutoModelForSequenceClassification.from_pretrained('/content/results/2024-02-29_05-08-27/checkpoint-1790')

clf = pipeline('text-classification', model=model, tokenizer=tokenizer)


In [11]:
clf("Alabama Supreme Court Justice Invokes ‘VeggieTales’ In Ruling")


[{'label': 'LABEL_0', 'score': 0.9957224130630493}]

In [12]:
clf([
    "Donald Trump Won South Carolina — But There's 1 Big Caveat",
    "Man Sets Himself On Fire In Front Of Israeli Embassy In Washington",
    "Israeli Media Report Progress On Reaching A Temporary Truce In Gaza And A Hostage-Prisoner Exchange",
    "A White Liberal Is Trying To Oust A Progressive Black Congressman. His Comments Could Make That Job Harder.",
    "Climate Change-Fueled Winter Extremes Put 90% Of This Country At 'High Risk'"
])


[{'label': 'LABEL_0', 'score': 0.9980106949806213},
 {'label': 'LABEL_0', 'score': 0.9978170394897461},
 {'label': 'LABEL_1', 'score': 0.7746263742446899},
 {'label': 'LABEL_0', 'score': 0.9979562759399414},
 {'label': 'LABEL_0', 'score': 0.9784660935401917}]

## References
- https://hunterheidenreich.com/posts/sarcasm-detection-with-transformers/
- https://docs.nvidia.com/nemo-framework/user-guide/latest/gemma/dataprep.html