In [1]:
!pip install peft
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install datasets

Looking in indexes: https://download.pytorch.org/whl/cu121


In [1]:
import numpy as np
import pandas as pd
import re
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face checkpoint to fine-tune (alternative: 'bert-base-uncased')
model_name = 'distilbert-base-uncased'
# torch.device("cuda") only builds a device handle and is always truthy, so it
# cannot be used to detect a GPU; ask CUDA whether a device is actually usable.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
device  # display the device string selected in the configuration cell

'cuda'

## Load dataset

In [4]:
def load_csv_data(data_path, bias_mapping=None):
    """Load the news-article CSV and return a cleaned, model-ready DataFrame.

    Steps: drop the exported index column (if present), drop articles without
    text, map the textual bias rating to a numeric label, assign one
    ``event_id`` per distinct title, and normalise the ``tags`` column.

    Args:
        data_path: Path of the CSV file. It must provide the columns
            ``title``, ``tags``, ``heading``, ``text`` and ``bias_rating``.
        bias_mapping: Optional dict mapping ``bias_rating`` values to numeric
            labels. Defaults to ``{'left': 1, 'center': 0, 'right': 1}``,
            i.e. left and right share label 1 and center is label 0.
            Ratings missing from the mapping become NaN.

    Returns:
        DataFrame with the columns ``event_id``, ``tags``, ``heading``,
        ``text`` and ``bias_numeric``.
    """
    if bias_mapping is None:
        bias_mapping = {'left': 1, 'center': 0, 'right': 1}

    df = pd.read_csv(data_path)

    # Preprocess dataset
    # errors="ignore": files saved without the pandas index have no ID column
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")  # Drop ID column
    # .copy() so the column assignments below act on an independent frame
    # instead of a slice of the filtered original (SettingWithCopyWarning)
    df = df[df["text"].notna()].copy()  # Drop articles with no text

    # Map bias to numeric value
    df['bias_numeric'] = df['bias_rating'].map(bias_mapping)

    # Group articles by event: titles are numbered in order of first appearance
    unique_titles = df['title'].unique()
    title_to_event_id = {title: idx for idx, title in enumerate(unique_titles)}
    df['event_id'] = df['title'].map(title_to_event_id)

    # Handle missing tags/topics: NaN and the empty list "[]" both become "N/A"
    tags = df["tags"].fillna("N/A")
    tags = tags.where(tags != "[]", "N/A")
    # Raw string: "\[" in a normal string literal is an invalid escape sequence
    df["tags"] = tags.str.replace(r"\[|\]|'", "", regex=True)

    # Select columns
    variables = ["event_id", "tags", "heading", "text", "bias_numeric"]
    return df[variables]

In [5]:
def construct_inputs(df):
    """Build the text fed to the model from each article's heading and body.

    The input frame is left untouched. The result keeps the original index and
    holds two columns: ``input_text`` (heading and content wrapped in literal
    "[CLS]" / "[SEP]" markers) and ``bias_numeric``.
    """
    # A variant that also prepends "Tags: <tags> [SEP]" was tried and disabled.
    heading_part = "[CLS] Heading: " + df['heading']
    content_part = " [SEP] Content: " + df['text'] + " [SEP]"
    with_text = df.assign(input_text=heading_part + content_part)
    return with_text[["input_text", "bias_numeric"]]

In [6]:
def build_dataset(df):
    """Turn an article DataFrame into a ``datasets.Dataset``.

    The frame is first reduced to the ``input_text`` / ``bias_numeric`` columns
    by ``construct_inputs`` and then converted with ``Dataset.from_pandas``.
    """
    return Dataset.from_pandas(construct_inputs(df))

In [7]:
def load_datasets(df, test_size=0.1, eval_size=0.1):
    """Split the articles into train / eval / test datasets by event.

    Whole ``event_id`` groups are assigned to a split, so articles about the
    same event never end up in different splits.

    Args:
        df: DataFrame containing an ``event_id`` column.
        test_size: Share of the event groups held out for the test split.
        eval_size: Share of the *remaining* groups (after removing the test
            groups) used for the evaluation split.

    Returns:
        Tuple ``(train_data, eval_data, test_data)`` of datasets produced by
        ``build_dataset``.
    """
    from sklearn.model_selection import train_test_split

    # One (event_id, sub-frame) pair per event
    event_groups = list(df.groupby('event_id'))

    # First carve out the test groups, then split the rest into train and eval
    remaining, held_out = train_test_split(event_groups, test_size=test_size, random_state=42)
    training, validation = train_test_split(remaining, test_size=eval_size, random_state=42)

    def to_dataset(groups):
        # Stitch the per-event frames back together and convert to a Dataset
        return build_dataset(pd.concat([frame for _, frame in groups]))

    return to_dataset(training), to_dataset(validation), to_dataset(held_out)

In [8]:
def reduce_dataset(df, frac=0.1):
    """Keep a random fraction of the events (all articles of a kept event stay).

    Args:
        df: DataFrame containing an ``event_id`` column.
        frac: Fraction of event groups to keep, in the interval (0, 1].

    Returns:
        A new DataFrame. For ``frac < 1`` the rows of the selected event groups
        are concatenated in the order returned by the split; for ``frac == 1``
        a copy of ``df`` is returned unchanged.

    Raises:
        ValueError: If ``frac`` is not in (0, 1].
    """
    if not 0 < frac <= 1:
        raise ValueError(f"frac must be in the interval (0, 1], got {frac!r}")
    if frac == 1:
        # train_test_split rejects test_size=0.0, so keeping everything is
        # handled here instead of failing inside scikit-learn.
        return df.copy()

    # Kept as a function-local import, as in the rest of this notebook; placed
    # after validation so invalid arguments fail before the import.
    from sklearn.model_selection import train_test_split

    grouped = df.groupby('event_id')
    # Split groups: the "train" side of the split is the part that is kept
    groups, _ = train_test_split(list(grouped), test_size=1.0-frac, random_state=42)
    # Create the reduced frame from the kept groups
    new_df = pd.concat([group[1] for group in groups])

    return new_df

Load CSV file

In [9]:
# Load and clean the articles; the trailing expression shows (rows, columns).
df = load_csv_data(data_path="data/allsides_balanced_news_headlines-texts.csv")
df.shape

(21747, 5)

In [10]:
# Keep half of the events (frac=0.5) to shorten experiments.
# NOTE: this overwrites `df`, so re-running the cell shrinks the data again;
# re-run the CSV loading cell first to start from the full dataset.
df = reduce_dataset(df, frac=0.5)
df.shape

(10868, 5)

In [11]:
# Articles per event (inner value_counts), then how many events have each size.
df["event_id"].value_counts().value_counts()

count
3    3598
2      31
6       2
Name: count, dtype: int64

In [12]:
# Event-level split: 30% of the events go to test, then 10% of the remaining
# events go to evaluation; the rest is used for training.
train_data, eval_data, test_data = load_datasets(df, test_size=0.3, eval_size=0.1)

Preprocess dataset

In [13]:
# Tokenizer matching the checkpoint selected in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [14]:
def preprocess_dataset(dataset):
    """Tokenize a dataset and expose it in the format the Trainer consumes.

    ``input_text`` is tokenized with the module-level ``tokenizer`` (padded and
    truncated to 512 tokens), ``bias_numeric`` is renamed to ``labels``, and
    the ``input_ids``, ``attention_mask`` and ``labels`` columns are set to be
    returned as torch tensors.
    """
    tokenizer_options = dict(padding='max_length', truncation=True, max_length=512)

    def _tokenize_batch(examples):
        # Produces input_ids and attention_mask for a batch of input texts
        return tokenizer(examples['input_text'], **tokenizer_options)

    encoded = dataset.map(_tokenize_batch, batched=True)
    encoded = encoded.rename_column("bias_numeric", "labels")
    encoded.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return encoded

In [15]:
# Tokenize each split with the same preprocessing.
tokenized_train = preprocess_dataset(train_data)
tokenized_eval = preprocess_dataset(eval_data)
tokenized_test = preprocess_dataset(test_data)

Map: 100%|████████████████████████████████████████████████████████████████| 6845/6845 [00:01<00:00, 4483.27 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 761/761 [00:00<00:00, 4825.03 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████| 3262/3262 [00:00<00:00, 3978.66 examples/s]


## Train model

In [16]:
# Load the pretrained checkpoint named by `model_name` (currently DistilBERT)
# with a freshly initialised sequence-classification head.
# NOTE(review): num_labels=3, but the default bias_mapping in load_csv_data
# yields only the labels 0 and 1 — confirm whether a three-class mapping or
# num_labels=2 is intended.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training configuration

In [17]:
# Training configuration, grouped by concern
training_args = TrainingArguments(
    # Output locations and logging
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=10,
    report_to=["none"],
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # Optimisation
    learning_rate=1e-5,
    weight_decay=0.05,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)

In [18]:
# Define the Trainer on the train/eval splits.
# NOTE: no compute_metrics is passed here (it is attached to the trainer only
# after training), so the per-epoch evaluation reports loss only.
trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

Train model

In [19]:
# Fine-tune; evaluation and checkpointing run once per epoch (see training_args).
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5947,0.535162
2,0.5625,0.517653
3,0.5599,0.622643
4,0.1012,0.824195
5,0.2837,0.998823
6,0.7393,1.244926
7,0.1876,1.439857
8,0.4377,1.57254
9,0.178,1.570349
10,0.0019,1.701423


TrainOutput(global_step=17120, training_loss=0.2840318426188932, metrics={'train_runtime': 2567.7064, 'train_samples_per_second': 26.658, 'train_steps_per_second': 6.667, 'total_flos': 9067555142092800.0, 'train_loss': 0.2840318426188932, 'epoch': 10.0})

## Evaluate model

In [20]:
# Evaluate the model on the evaluation split given to the Trainer
# (tokenized_eval); at this point only the loss is reported.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.5176530480384827, 'eval_runtime': 9.5508, 'eval_samples_per_second': 79.679, 'eval_steps_per_second': 19.998, 'epoch': 10.0}


In [21]:
def compute_metrics(eval_pred):
    """Score predictions for the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row
            is the argmax over the last logits axis.

    Returns:
        Dict with ``accuracy``, ``f1`` (weighted average over classes) and
        ``kappa`` (Cohen's kappa).
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    # Metric name -> scorer called as scorer(true_labels, predicted_labels)
    scorers = {
        "accuracy": accuracy_score,
        "f1": lambda y_true, y_pred: f1_score(y_true, y_pred, average="weighted"),
        "kappa": cohen_kappa_score,
    }
    return {name: scorer(labels, predicted) for name, scorer in scorers.items()}

In [22]:
# Assign the compute_metrics function to the already-trained trainer so that
# subsequent evaluate() calls report accuracy, F1 and kappa besides the loss.
trainer.compute_metrics = compute_metrics

In [23]:
# Final evaluation on the held-out test split.
test_results = trainer.evaluate(eval_dataset=tokenized_test)
print("Test Results:", test_results)

Test Results: {'eval_loss': 0.4962586760520935, 'eval_accuracy': 0.826486817903127, 'eval_f1': 0.7939300766051058, 'eval_kappa': 0.2513470378143764, 'eval_runtime': 45.5532, 'eval_samples_per_second': 71.609, 'eval_steps_per_second': 17.913, 'epoch': 10.0}
