In [1]:
import os
import pandas as pd

# Prepare the data: read the TSV files into DataFrames and optionally drop the not_propaganda instances.
def load_data_and_return_dataframe(data_dir=".", propaganda_only=False):
    """Read the train and validation TSV files into DataFrames.

    Args:
        data_dir: Directory containing ``propaganda_train.tsv`` and
            ``propaganda_val.tsv``.
        propaganda_only: When true, rows whose ``label`` column equals
            ``"not_propaganda"`` are removed and the index is renumbered
            from zero.

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames; the second one is
        read from the validation file.
    """
    file_names = ("propaganda_train.tsv", "propaganda_val.tsv")

    # Read both splits first, then filter, so a missing file is reported
    # before any column lookup happens.
    frames = [
        pd.read_csv(os.path.join(data_dir, file_name), sep="\t")
        for file_name in file_names
    ]

    if propaganda_only:
        frames = [
            frame[frame["label"] != "not_propaganda"].reset_index(drop=True)
            for frame in frames
        ]

    train_data, test_data = frames
    return train_data, test_data

# Load both splits from the current directory, dropping rows labelled
# "not_propaganda" (propaganda_only=True).
train_data, test_data = load_data_and_return_dataframe(propaganda_only=True)

In [2]:
import random, torch
import numpy as np

# set fixed seed to ensure deterministic behavior
def set_seed(seed=291158):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Args:
        seed: Value passed unchanged to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)
set_seed()

In [3]:
# Same function as in the first approach, except that this time
# the [SEP] token is used to separate the context from the span.

def process_data_for_bert(data):
    """Build "<context> [SEP] <span>" model inputs from tagged sentences.

    Each text is expected to mark the span of interest with ``<BOS>`` and
    ``<EOS>``. The context is the text with both markers removed; the span is
    the stripped text between the first ``<BOS>`` and the first ``<EOS>``.

    If either marker is missing, or ``<EOS>`` comes before ``<BOS>``, the span
    cannot be located. ``str.find`` returns -1 in that case, which would
    otherwise slice an arbitrary fragment of the text, so the whole context is
    used as the span instead.

    Args:
        data: Object whose ``.values`` yields ``(label, text)`` rows, e.g. a
            two-column DataFrame with the label column first.

    Returns:
        A ``(sents, labels)`` tuple of parallel lists.
    """
    bos_tag = "<BOS>"
    eos_tag = "<EOS>"

    sents = []
    labels = []

    for label, text in data.values:
        bos_idx = text.find(bos_tag)
        eos_idx = text.find(eos_tag)

        context = text.replace(bos_tag, "").replace(eos_tag, "").strip()

        if bos_idx == -1 or eos_idx == -1 or eos_idx < bos_idx:
            # Markers missing or out of order: no reliable span boundaries.
            span = context
        else:
            span = text[bos_idx + len(bos_tag):eos_idx].strip()

        sents.append(context + " [SEP] " + span)
        labels.append(label)

    return sents, labels

In [5]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint to fine-tune.
model_name = "distilbert-base-cased"

# Tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Class names in sorted order; a label's position in this list is its integer id.
label_list = sorted(train_data["label"].unique())
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

In [6]:
from datasets import Dataset, DatasetDict
import numpy as np
import torch

# Turn each split into parallel lists of input texts and label names.
X_train, Y_train = process_data_for_bert(train_data)
X_test, Y_test = process_data_for_bert(test_data)

# Replace label names with their integer ids.
Y_train_ids = list(map(label2id.__getitem__, Y_train))
Y_test_ids = list(map(label2id.__getitem__, Y_test))

# Wrap texts and label ids as Hugging Face datasets.
train_dataset = Dataset.from_dict(dict(text=X_train, label=Y_train_ids))
test_dataset = Dataset.from_dict(dict(text=X_test, label=Y_test_ids))

# Tokenization function
def tokenize_function(example):
    """Tokenize a batch of examples, truncating inputs that are too long.

    Padding is deliberately not applied here. Padding every example to the
    model maximum makes each training and evaluation step process mostly
    padding tokens. The Trainer in this notebook is given the tokenizer, so
    its default data collator pads each batch to the longest sequence in
    that batch instead.

    Args:
        example: Batch mapping with a ``"text"`` entry holding the input strings.

    Returns:
        The tokenizer output for the batch (unpadded).
    """
    return tokenizer(
        example["text"],
        truncation=True
    )

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1223 [00:00<?, ? examples/s]

Map:   0%|          | 0/279 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load the pretrained checkpoint with a sequence-classification head that has
# one output per entry in label_list, passing both label mappings along.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    label2id=label2id,
    id2label=id2label
)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# Training configuration: evaluate and save a checkpoint after every epoch,
# track "accuracy" (higher is better) and load the best checkpoint at the end.
training_args = TrainingArguments(
    output_dir="./results",  # checkpoints are written here
    eval_strategy="epoch",
    save_strategy="epoch",  # same cadence as eval_strategy
    learning_rate=2e-5,
    num_train_epochs=15,  # upper bound on the number of epochs
    metric_for_best_model="accuracy",  # key returned by compute_metrics
    greater_is_better=True,
    load_best_model_at_end=True,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Define the compute_metrics function.
# Only the accuracy score is used here
# because the classes are balanced.

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair.

    Args:
        eval_pred: Two-item iterable of model logits and reference label ids.

    Returns:
        Dict with a single ``"accuracy"`` entry.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predicted_ids)
    return {"accuracy": accuracy}

## Train Model

In [None]:
from transformers import Trainer, EarlyStoppingCallback

# Assemble the Trainer from the model, the training configuration, the
# tokenized splits and the accuracy metric, then start fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    # NOTE(review): this split drives per-epoch evaluation (and therefore early
    # stopping and best-checkpoint selection) and is also the split used for
    # the final trainer.predict() report, so that accuracy may be optimistic.
    eval_dataset=tokenized_test,
    # NOTE(review): newer transformers releases appear to prefer
    # `processing_class` over `tokenizer` — confirm against the installed version.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 3 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

## Evaluate Model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
import matplotlib.pyplot as plt

# Predict on the tokenized test split and take the highest-scoring class per example.
preds_output = trainer.predict(tokenized_test)
preds = np.argmax(preds_output.predictions, axis=1)

print(f"Accuracy Score: {accuracy_score(Y_test_ids, preds):.4f}")

# Pin the class order so rows and columns always line up with label_list, even
# when a class is absent from both the references and the predictions.
cm = confusion_matrix(Y_test_ids, preds, labels=list(range(len(label_list))))
# Show class names on the axes instead of bare integer ids.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_list)
disp.plot(xticks_rotation="vertical")
# Save plot; bbox_inches="tight" keeps the rotated tick labels inside the image.
plt.savefig("confusion_matrix_distilbert.png", bbox_inches="tight")

## Save Model

In [None]:
# Save the trained model under the given path for later reuse.
trainer.save_model("distilbert-for-propaganda-type-classification")