In [1]:
# !wget https://ml-coding-test.s3.eu-west-1.amazonaws.com/webis_train.csv
# !wget https://ml-coding-test.s3.eu-west-1.amazonaws.com/webis_test.csv
     

In [2]:
import pandas as pd

train = pd.read_csv("webis_train.csv", usecols=["postText", "truthClass"])
test = pd.read_csv("webis_test.csv", usecols=["postText", "truthClass"])

train.rename(columns={"postText": "text", "truthClass": "label"}, inplace=True)
test.rename(columns={"postText": "text", "truthClass": "label"}, inplace=True)

In [3]:
train.shape, test.shape

((19538, 2), (18979, 2))

In [4]:
train.isna().sum(), test.isna().sum()

(text     54
 label     0
 dtype: int64,
 text     66
 label     0
 dtype: int64)

In [5]:
train = train.dropna(subset=["text"]).reset_index(drop=True)
test = test.dropna(subset=["text"]).reset_index(drop=True)

In [6]:
import torch
import random
import numpy as np

# Set a fixed seed value for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [7]:
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets, ClassLabel
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

In [8]:
dataset = concatenate_datasets(
    [
        Dataset.from_pandas(train, split="train"),
        Dataset.from_pandas(test, split="test"),
    ]
)

dataset = dataset.cast_column("label", ClassLabel(names=["no-clickbait", "clickbait"]))

Casting the dataset:   0%|          | 0/38397 [00:00<?, ? examples/s]

In [9]:
SAMPLE_SIZE = 15000

dataset = dataset.shuffle(seed=SEED).select([i for i in list(range(SAMPLE_SIZE))])

train_test = dataset.train_test_split(test_size=0.3, stratify_by_column="label")
eval_test = train_test["test"].train_test_split(test_size=0.5)

webis17 = DatasetDict(
    {
        "train": train_test["train"],
        "eval": eval_test["train"],
        "test": eval_test["test"],
    }
)

webis17

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10500
    })
    eval: Dataset({
        features: ['text', 'label'],
        num_rows: 2250
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2250
    })
})

In [10]:
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from datasets import load_metric

MODEL_NAME = "FacebookAI/roberta-base"


# Move the model to the GPU (if available)
device = "mps" if torch.backends.mps.is_available() else "cpu"
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# Define label mappings
num_classes = len(webis17["train"].features["label"].names)
id2label = {id: webis17["train"].features["label"].int2str(id) for id in range(num_classes)}
label2id = {label: id for (id, label) in id2label.items()}


# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_classes,
    device_map=device,
    id2label=id2label,
    label2id=label2id
)


# Tokenize the datasets
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )


tokenized_datasets = webis17.map(tokenize_function, batched=True)

# Load metric
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


# Define training arguments
training_args = TrainingArguments(
    output_dir="./checkpoints",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the fine-tuned model
model.save_pretrained(f"{MODEL_NAME.split('/')[1]}_webis17_tuned")
tokenizer.save_pretrained(f"{MODEL_NAME.split('/')[1]}_webis17_tuned")

# Test the model
test_results = trainer.predict(tokenized_datasets["test"])
print(f"Test results: {test_results.metrics}")

Using device: mps


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/10500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/987 [00:00<?, ?it/s]

{'loss': 0.6456, 'grad_norm': 1.4068024158477783, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.03}
{'loss': 0.6321, 'grad_norm': 4.528217315673828, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.06}
{'loss': 0.6257, 'grad_norm': 0.7532750964164734, 'learning_rate': 3e-06, 'epoch': 0.09}
{'loss': 0.6052, 'grad_norm': 1.9847543239593506, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.12}
{'loss': 0.5798, 'grad_norm': 1.6198363304138184, 'learning_rate': 5e-06, 'epoch': 0.15}
{'loss': 0.5318, 'grad_norm': 4.512450695037842, 'learning_rate': 6e-06, 'epoch': 0.18}
{'loss': 0.4967, 'grad_norm': 2.6624746322631836, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.21}
{'loss': 0.4998, 'grad_norm': 2.8481016159057617, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.24}
{'loss': 0.4336, 'grad_norm': 3.1879513263702393, 'learning_rate': 9e-06, 'epoch': 0.27}
{'loss': 0.3728, 'grad_norm': 4.110686302185059, 'learning_rate': 1e-05, 'epoch': 0.3}
{'loss': 0.4124, 'grad_norm': 5.

  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.33279648423194885, 'eval_accuracy': 0.8546666666666667, 'eval_runtime': 62.3947, 'eval_samples_per_second': 36.061, 'eval_steps_per_second': 0.577, 'epoch': 1.0}
{'loss': 0.3801, 'grad_norm': 5.095461368560791, 'learning_rate': 3.3e-05, 'epoch': 1.0}
{'loss': 0.3025, 'grad_norm': 6.850643157958984, 'learning_rate': 3.4000000000000007e-05, 'epoch': 1.03}
{'loss': 0.3266, 'grad_norm': 4.93298864364624, 'learning_rate': 3.5e-05, 'epoch': 1.06}
{'loss': 0.3388, 'grad_norm': 4.784635066986084, 'learning_rate': 3.6e-05, 'epoch': 1.09}
{'loss': 0.3033, 'grad_norm': 9.057267189025879, 'learning_rate': 3.7e-05, 'epoch': 1.12}
{'loss': 0.3279, 'grad_norm': 4.2189226150512695, 'learning_rate': 3.8e-05, 'epoch': 1.16}
{'loss': 0.2556, 'grad_norm': 9.13725471496582, 'learning_rate': 3.9000000000000006e-05, 'epoch': 1.19}
{'loss': 0.3282, 'grad_norm': 5.4304022789001465, 'learning_rate': 4e-05, 'epoch': 1.22}
{'loss': 0.3053, 'grad_norm': 15.918543815612793, 'learning_rate': 4.1e-05,

  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.32942432165145874, 'eval_accuracy': 0.8577777777777778, 'eval_runtime': 50.362, 'eval_samples_per_second': 44.677, 'eval_steps_per_second': 0.715, 'epoch': 2.0}
{'loss': 0.3041, 'grad_norm': 4.700130462646484, 'learning_rate': 3.357289527720739e-05, 'epoch': 2.01}
{'loss': 0.3335, 'grad_norm': 10.092000961303711, 'learning_rate': 3.254620123203286e-05, 'epoch': 2.04}
{'loss': 0.2398, 'grad_norm': 5.0191497802734375, 'learning_rate': 3.1519507186858315e-05, 'epoch': 2.07}
{'loss': 0.223, 'grad_norm': 4.4642415046691895, 'learning_rate': 3.049281314168378e-05, 'epoch': 2.1}
{'loss': 0.1256, 'grad_norm': 11.553142547607422, 'learning_rate': 2.9466119096509244e-05, 'epoch': 2.13}
{'loss': 0.2267, 'grad_norm': 8.868629455566406, 'learning_rate': 2.8439425051334705e-05, 'epoch': 2.16}
{'loss': 0.2146, 'grad_norm': 4.975587368011475, 'learning_rate': 2.7412731006160163e-05, 'epoch': 2.19}
{'loss': 0.2349, 'grad_norm': 9.018712043762207, 'learning_rate': 2.6386036960985628e-05,

  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.3719605505466461, 'eval_accuracy': 0.8644444444444445, 'eval_runtime': 49.5793, 'eval_samples_per_second': 45.382, 'eval_steps_per_second': 0.726, 'epoch': 3.0}
{'train_runtime': 5458.21, 'train_samples_per_second': 5.771, 'train_steps_per_second': 0.181, 'train_loss': 0.3097732257939641, 'epoch': 3.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.32942432165145874, 'eval_accuracy': 0.8577777777777778, 'eval_runtime': 50.2006, 'eval_samples_per_second': 44.82, 'eval_steps_per_second': 0.717, 'epoch': 3.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Test results: {'test_loss': 0.3352513015270233, 'test_accuracy': 0.8431111111111111, 'test_runtime': 53.9672, 'test_samples_per_second': 41.692, 'test_steps_per_second': 0.667}


In [11]:
# With both the model and tokenizer initialized we are now able to get explanations on an example text.

from transformers_interpret import SequenceClassificationExplainer

cls_explainer = SequenceClassificationExplainer(model.to("cpu"), tokenizer)

In [12]:
word_attributions = cls_explainer(
    "Shocking Revelation: The Secret Ingredient That Could Change Your Life Forever!",
    class_name="clickbait",
)
word_attributions

[('<s>', 0.0),
 ('Sh', 0.07818216743942773),
 ('ocking', 0.7347983709536536),
 ('Revelation', 0.3717169262984062),
 (':', -0.20230018422187643),
 ('The', 0.030308820173817988),
 ('Secret', 0.3827598405571305),
 ('Ing', -0.03546143032901893),
 ('red', 0.12055172008267402),
 ('ient', -0.07413058733821046),
 ('That', 0.0014391813737729623),
 ('Could', 0.12493023668972683),
 ('Change', 0.1096041686176846),
 ('Your', 0.10885278207638402),
 ('Life', 0.059378179536186106),
 ('Forever', 0.04398845452252915),
 ('!', 0.24426642583253733),
 ('', 0.039399164901395334),
 ('</s>', 0.0)]

In [13]:
cls_explainer.predicted_class_index, cls_explainer.predicted_class_name

(array(1), 'clickbait')

In [14]:
cls_explainer.visualize("viz.html", true_class="clickbait");

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
clickbait,clickbait (0.85),clickbait,2.14,#s Sh ocking Revelation : The Secret Ing red ient That Could Change Your Life Forever ! #/s
,,,,


In [15]:
id2label, label2id

({0: 'no-clickbait', 1: 'clickbait'}, {'no-clickbait': 0, 'clickbait': 1})