In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
from transformers import PreTrainedTokenizerFast, LlamaForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
# Ask the Hugging Face Hub client not to warn about caching without symlinks.
# NOTE(review): this cell runs after `transformers` was imported in the previous cell.
# If the hub client reads this variable at import time, the setting may have no effect —
# confirm, and if so move this assignment above the transformers import.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [4]:
def load_csv_data(data_path):
    """Read the articles CSV and return a cleaned, model-ready DataFrame.

    Cleaning steps, in order:
      1. drop the saved-index column ``Unnamed: 0`` if the file has one;
      2. drop articles whose ``text`` is missing;
      3. map ``bias_rating`` (``left`` / ``center`` / ``right``) to 1 / 0 / 2;
      4. give every distinct ``title`` an integer ``event_id`` (order of first appearance);
      5. replace missing tags (``"[]"`` or NaN) with ``"N/A"``;
      6. drop articles whose bias rating is not one of the three known values.

    Parameters
    ----------
    data_path : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        Columns ``event_id``, ``tags``, ``heading``, ``text``, ``bias_numeric``
        (``bias_numeric`` is int64). The original row index is preserved.
    """
    bias_mapping = {'left': 1, 'center': 0, 'right': 2}
    variables = ["event_id", "tags", "heading", "text", "bias_numeric"]

    df = pd.read_csv(data_path)
    # Drop ID column; errors="ignore" tolerates CSVs exported without the index column
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")
    # Drop articles with no text (copy so the column assignments below hit an independent frame)
    df = df[df["text"].notna()].copy()
    # Map bias to numeric value; ratings outside the mapping become NaN and are removed below
    df['bias_numeric'] = df['bias_rating'].map(bias_mapping)
    # Group articles by event: articles sharing a title share an event_id
    unique_titles = df['title'].unique()
    title_to_event_id = {title: idx for idx, title in enumerate(unique_titles)}
    df['event_id'] = df['title'].map(title_to_event_id)
    # Handle missing tags/topics: an empty list literal and a missing value mean the same thing
    no_tags = df["tags"].isna() | df["tags"].eq("[]")
    df.loc[no_tags, "tags"] = "N/A"
    # Rows without a usable label cannot be trained on (int(NaN) would fail downstream).
    # Done after event ids are assigned so ids of the surviving rows are unaffected.
    df = df[df["bias_numeric"].notna()]
    df = df.astype({"bias_numeric": "int64"})
    # Select columns
    return df[variables]

In [5]:
def preprocess_dataset(example):
    """Turn one article record into a classification prompt and an integer label.

    Parameters
    ----------
    example : mapping
        Must provide ``heading``, ``tags``, ``text`` and ``bias_numeric``.

    Returns
    -------
    dict
        ``input_text``: the prompt (title, topics, content, then the question);
        ``label``: ``bias_numeric`` converted with ``int``.
    """
    prompt_parts = [
        f"Article Title: {example['heading']}\n",
        f"Topics: {example['tags']}\n",
        f"Content: {example['text']}\n",
        "What is the political bias of this article? (Options: 1 = Left, 0 = Center, 2 = Right)",
    ]
    label = int(example['bias_numeric'])
    return {'input_text': "".join(prompt_parts), 'label': label}

In [6]:
def build_dataset(df):
    """Convert a DataFrame into a ``Dataset`` and run ``preprocess_dataset`` over it.

    The mapped function adds ``input_text`` and ``label`` to every example.
    """
    return Dataset.from_pandas(df).map(preprocess_dataset)

In [7]:
def load_datasets(df, test_size=0.1, eval_size=0.1, random_state=42):
    """Split ``df`` into train / eval / test datasets without splitting an event.

    Rows are grouped by ``event_id`` and whole groups are assigned to a split, so
    all articles of one event land in the same split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``event_id`` plus the columns ``build_dataset`` needs.
    test_size : float
        Fraction of events held out for the test split.
    eval_size : float
        Fraction of the events that remain after the test split which go to the
        eval split (not a fraction of the whole data set).
    random_state : int
        Seed passed to both ``train_test_split`` calls; the default reproduces
        the previously hard-coded value.

    Returns
    -------
    tuple
        ``(train_data, eval_data, test_data)`` as built by ``build_dataset``.
    """
    from sklearn.model_selection import train_test_split

    # One (event_id, sub-frame) pair per event; the pairs are what gets shuffled and split
    event_groups = list(df.groupby('event_id'))
    train_groups, test_groups = train_test_split(
        event_groups, test_size=test_size, random_state=random_state
    )
    train_groups, eval_groups = train_test_split(
        train_groups, test_size=eval_size, random_state=random_state
    )

    def _to_dataset(groups):
        # Stitch the selected event frames back together, then convert to a Dataset
        return build_dataset(pd.concat([frame for _, frame in groups]))

    train_data = _to_dataset(train_groups)
    eval_data = _to_dataset(eval_groups)
    test_data = _to_dataset(test_groups)

    return train_data, eval_data, test_data

In [8]:
def reduce_dataset(df, frac=0.1, random_state=42):
    """Return a random subset of ``df`` holding about ``frac`` of its events.

    Sampling is done on whole ``event_id`` groups, so an event is either kept
    with all its rows or dropped entirely.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``event_id`` column.
    frac : float
        Fraction of events to keep, in ``(0, 1]``. ``1.0`` returns a copy of ``df``.
    random_state : int
        Seed for the group shuffle; the default reproduces the previously
        hard-coded value.

    Returns
    -------
    pandas.DataFrame
        The kept rows. For ``frac < 1`` rows are ordered by the shuffled group
        order; for ``frac == 1`` the original order is kept.

    Raises
    ------
    ValueError
        If ``frac`` is not in ``(0, 1]``.
    """
    if not 0.0 < frac <= 1.0:
        raise ValueError(f"frac must be in (0, 1], got {frac!r}")
    if frac == 1.0:
        # train_test_split rejects test_size=0.0, so "keep everything" is handled here
        return df.copy()

    from sklearn.model_selection import train_test_split

    event_groups = list(df.groupby('event_id'))
    # The "train" side of the split is the part we keep; the rest is discarded
    kept_groups, _ = train_test_split(
        event_groups, test_size=1.0 - frac, random_state=random_state
    )
    return pd.concat([frame for _, frame in kept_groups])

### Load dataset

In [9]:
# Read the article CSV (path is relative to the working directory) and clean it
df = load_csv_data(data_path="data/allsides_balanced_news_headlines-texts.csv")
# (rows, columns) of the cleaned frame
df.shape

(21747, 5)

In [10]:
# Significantly reduce the number of elements for testing purposes:
# keep about 35% of the events (whole event groups, not individual rows).
# NOTE: this rebinds `df`, so re-running the cell shrinks the data again —
# re-run the load cell first to get back to the full frame.
df = reduce_dataset(df, frac=0.35)
df.shape

(7613, 5)

In [11]:
# Preview the first rows of the reduced frame
df.head()

Unnamed: 0,event_id,tags,heading,text,bias_numeric
19486,6504,"['Presidential Elections', '2020 Election', '2...",Debates commission plans to cut off mics if Tr...,The commission that oversees the general elect...,1
19487,6504,"['Presidential Elections', '2020 Election', '2...",Debate commission considering cutting candidat...,The presidential debate commission is consider...,2
19488,6504,"['Presidential Elections', '2020 Election', '2...",Presidential debate: Rules to change after Tru...,The commission that oversees US presidential d...,0
20840,6958,"['Gun Control', 'Gun Rights', 'Background Chec...",A Universal-Background-Check Law Would Not Vio...,"The terrible shootings in Gilroy, El Paso, and...",2
20841,6958,"['Gun Control', 'Gun Rights', 'Background Chec...",Trump calls for 'intelligent background checks...,"President Trump on Friday called for ""intellig...",0


In [12]:
# Distinct numbers of articles (with text) per event
df.groupby("event_id")["text"].count().unique()

array([3, 6, 2], dtype=int64)

In [13]:
# Event-level split: 30% of the events go to test, then 10% of the remaining events go to eval
train_data, eval_data, test_data = load_datasets(df, test_size=0.3, eval_size=0.1)

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 4793/4793 [00:00<00:00, 10923.79 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 532/532 [00:00<00:00, 11636.99 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 2288/2288 [00:00<00:00, 10345.21 examples/s]


### Preprocess datasets

In [14]:
# Identifier of the base checkpoint; used below to load both the tokenizer and the model
model_name = 'unsloth/Llama-3.2-1B'

In [15]:
# Load the tokenizer published with the base checkpoint
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)

In [None]:
# The tokenization step pads every example to a fixed length and the model config later
# copies tokenizer.pad_token_id, so a pad token must exist before going further.
if tokenizer.pad_token is None:
    print("No padding token")
    # Reuse the end-of-sequence token for padding: it is already in the vocabulary,
    # so the model's embedding matrix does not need to be resized.
    tokenizer.pad_token = tokenizer.eos_token
else:
    print(tokenizer.pad_token)

In [17]:
def tokenize_function(examples):
    """Tokenize a batch of examples for the classifier.

    Reads ``examples['input_text']``, truncates and pads every entry to exactly
    512 tokens using the notebook-level ``tokenizer``, and carries
    ``examples['label']`` over into the returned encoding.
    """
    batch = tokenizer(
        examples['input_text'],
        truncation=True,
        max_length=512,
        padding='max_length',
    )
    # Keep the target next to input_ids / attention_mask
    batch['label'] = examples['label']
    return batch

In [18]:
%%time
# Tokenize the datasets
tokenized_train = train_data.map(tokenize_function, batched=True)
tokenized_eval = eval_data.map(tokenize_function, batched=True)
tokenized_test = test_data.map(tokenize_function, batched=True)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 4793/4793 [00:00<00:00, 5432.15 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 532/532 [00:00<00:00, 5232.31 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 2288/2288 [00:00<00:00, 5633.30 examples/s]

CPU times: total: 4.67 s
Wall time: 1.86 s





In [None]:
# Display the tokenized training split as a sanity check
tokenized_train

### Prepare LoRA for fine-tuning

In [19]:
# Prepare LoRA configuration
lora_config = LoraConfig(
    # Sequence classification: with this task type PEFT keeps the newly initialised
    # classification head ("score") trainable and stores it with the adapter.
    # Without it only the LoRA matrices train and the random head stays frozen.
    task_type="SEQ_CLS",
    r=8,                       # Rank of the LoRA update matrices
    lora_alpha=32,             # Scaling factor
    lora_dropout=0.1,          # Dropout probability on the LoRA branch
    bias="none",               # Do not train bias terms
    target_modules=["q_proj", "v_proj"]  # Attention projections that receive adapters
)

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=6,  # Gradients from 6 single-example steps are accumulated per update
    num_train_epochs=3,
    learning_rate=1e-4,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Mixed precision needs a CUDA device. The notebook falls back to CPU when none is
    # available, so only enable fp16 when CUDA is present instead of failing on CPU.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to='none' # Disable W&B logging
)

### Train model

In [21]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [22]:
# Load the pre-trained LLaMA checkpoint with a 3-class classification head
# (left / center / right) and place it on the selected device
model = LlamaForSequenceClassification.from_pretrained(model_name, num_labels=3).to(device)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at unsloth/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Copy the tokenizer's pad token id into the model config
# (presumably so the classifier can tell padding from real tokens — confirm against the model docs)
model.config.pad_token_id = tokenizer.pad_token_id

In [24]:
# Wrap the base model with LoRA adapters as described by lora_config.
# NOTE: this rebinds `model`; re-running the cell would wrap the already-wrapped model,
# so re-run the model-loading cell first.
model = get_peft_model(model, lora_config)

In [25]:
# Initialize Trainer with the LoRA-wrapped model and the tokenized train / eval splits.
# No compute_metrics is passed, so evaluation reports the loss only.
# NOTE(review): newer transformers releases appear to rename `tokenizer=` to
# `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer
)

In [None]:
%%time
# Fine-tune the model (3 epochs, evaluated and checkpointed once per epoch per training_args)
trainer.train()

### Save model

In [None]:
# Save the model and tokenizer next to the Trainer checkpoints in ./outputs.
# `model` is the object returned by get_peft_model, so this presumably writes the
# adapter files rather than a full model checkpoint — confirm by inspecting ./outputs.
model.save_pretrained("./outputs")
tokenizer.save_pretrained("./outputs")

#### Test model

In [None]:
# Reload the tokenizer and the fine-tuned model from disk.
from peft import AutoPeftModelForSequenceClassification

tokenizer = PreTrainedTokenizerFast.from_pretrained("./outputs")
# ./outputs was written by save_pretrained on the PEFT-wrapped model, so it is loaded
# through PEFT: the base checkpoint is rebuilt and the saved adapter is attached to it.
# num_labels must be repeated here because the base checkpoint carries no 3-class head
# (the head was newly initialised when the model was first loaded).
model = AutoPeftModelForSequenceClassification.from_pretrained("./outputs", num_labels=3)
# Same pad-token wiring as for the model that was trained
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Switch the reloaded model to evaluation mode
model.eval()

In [None]:
# Run inference on the held-out test split.
# NOTE(review): `trainer` was built before the reload above and still holds the in-memory
# fine-tuned model; the reloaded `model` is not used by this call. Build a new Trainer
# around `model` if the saved artefacts are what should be evaluated.
predictions = trainer.predict(tokenized_test)