In [2]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.utils import load_cleaned_data, split, compute_metrics, load_config
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, XLMRobertaConfig, Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
import numpy as np
import torch
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import pandas as pd

from datasets import Dataset as HFDataset

2025-04-05 07:19:28.467806: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-05 07:19:28.735037: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743830368.830204    1978 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743830368.858201    1978 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743830369.069903    1978 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [4]:
config = load_config("../cfg/xlm_roberta.json")
# Load data from your existing code
data = load_cleaned_data(config["data"]["preprocessed_data_path"])
X = data['full_text']
y = data['label']

print(f"Total samples: {len(data)}")
print(f"Label distribution: {y.value_counts().to_dict()}")

print("\nSample article (non-hyperpartisan):")
print(X[y == 0].iloc[0][:200] + "...")
print("\nSample article (hyperpartisan):")
print(X[y == 1].iloc[0][:200] + "...")

Total samples: 645
Label distribution: {0: 407, 1: 238}

Sample article (non-hyperpartisan):
It's 1968 All Over Again. Almost a half-century ago, in 1968, the United States seemed to be falling apart. The Vietnam War, a bitter and close presidential election, antiwar protests, racial riots, p...

Sample article (hyperpartisan):
Kucinich: Reclaiming the money power. Money ( Image by 401(K) 2013 ) Permission Details DMCA No Pill Can Stop Tinnitus, But This 1 Weird Trick Can The walls are closing in on Congress. Terrifying wall...


In [None]:
# Set up model name and tokenizer from config
model_name = config["model"]["base_model"]
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

# Set up XLM-RoBERTa configuration from config
xlm_roberta_config = XLMRobertaConfig.from_pretrained(model_name)
xlm_roberta_config.classifier_dropout = config["model"]["classifier_dropout"]
xlm_roberta_config.num_labels = config["model"]["num_labels"]

def tokenize_texts(texts, labels, tokenizer, max_length=config["data"]["max_length"]):
    """Tokenize texts and prepare for model input"""
    dataset_dict = {'text': texts.tolist(), 'label': labels.tolist()}
    dataset = HFDataset.from_dict(dataset_dict)
    
    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
    
    # Map tokenization function
    tokenized_dataset = dataset.map(
        tokenize_function, 
        batched=True, 
        remove_columns=['text']
    )
    
    # Format for PyTorch
    tokenized_dataset.set_format(
        type='torch', 
        columns=['input_ids', 'attention_mask', 'label']
    )
    
    return tokenized_dataset

In [None]:
# Set up device
device = None
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
    print("No GPU available, using CPU.")
print(f"Using device: {device}")

Using device: cuda

=== Fold 1/5 ===
Original training data distribution: {0: 326, 1: 190}
Resampled training data distribution: {1: 326, 0: 326}


Map:   0%|          | 0/652 [00:00<?, ? examples/s]

Map:   0%|          | 0/129 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Balanced Accuracy,Precision,Recall,F1
1,No log,0.669778,0.510417,1.0,0.020833,0.040816
2,No log,0.670954,0.783565,0.6,0.9375,0.731707
3,No log,0.549267,0.77392,0.723404,0.708333,0.715789
4,No log,0.780529,0.775463,0.833333,0.625,0.714286
5,No log,1.037616,0.769676,0.733333,0.6875,0.709677



=== Fold 2/5 ===
Original training data distribution: {0: 324, 1: 192}
Resampled training data distribution: {1: 324, 0: 324}


Map:   0%|          | 0/648 [00:00<?, ? examples/s]

Map:   0%|          | 0/129 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Balanced Accuracy,Precision,Recall,F1
1,No log,0.534999,0.799764,0.685185,0.804348,0.74
2,No log,0.465894,0.829885,0.755102,0.804348,0.778947
3,No log,0.716613,0.707177,0.494382,0.956522,0.651852
4,No log,0.49887,0.826218,0.795455,0.76087,0.777778
5,No log,0.904384,0.827528,0.709091,0.847826,0.772277



=== Fold 3/5 ===
Original training data distribution: {0: 330, 1: 186}
Resampled training data distribution: {1: 330, 0: 330}


Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Map:   0%|          | 0/129 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Balanced Accuracy,Precision,Recall,F1
1,No log,0.536349,0.728521,0.608696,0.807692,0.694215
2,No log,0.635398,0.689061,0.541176,0.884615,0.671533
3,No log,0.695423,0.685689,0.534091,0.903846,0.671429
4,No log,0.528287,0.721279,0.564706,0.923077,0.70073
5,No log,0.474441,0.807193,0.857143,0.692308,0.765957
6,No log,0.939301,0.786464,0.657143,0.884615,0.754098
7,0.527500,0.863854,0.791334,0.891892,0.634615,0.741573
8,0.527500,0.916646,0.806943,0.808511,0.730769,0.767677



=== Fold 4/5 ===
Original training data distribution: {0: 322, 1: 194}
Resampled training data distribution: {1: 322, 0: 322}


Map:   0%|          | 0/644 [00:00<?, ? examples/s]

Map:   0%|          | 0/129 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Balanced Accuracy,Precision,Recall,F1
1,No log,0.674998,0.778476,0.58209,0.886364,0.702703
2,No log,0.423328,0.754278,0.787879,0.590909,0.675325
3,No log,0.533896,0.811497,0.857143,0.681818,0.759494
4,No log,0.747375,0.794652,0.875,0.636364,0.736842
5,No log,0.88131,0.833824,0.825,0.75,0.785714
6,No log,1.037119,0.805615,0.833333,0.681818,0.75
7,0.353600,1.077821,0.805214,0.794872,0.704545,0.746988
8,0.353600,1.106031,0.816578,0.8,0.727273,0.761905



=== Fold 5/5 ===
Original training data distribution: {0: 326, 1: 190}
Resampled training data distribution: {1: 326, 0: 326}


Map:   0%|          | 0/652 [00:00<?, ? examples/s]

Map:   0%|          | 0/129 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Balanced Accuracy,Precision,Recall,F1
1,No log,0.6233,0.731867,0.771429,0.5625,0.650602
2,No log,0.457636,0.771991,0.7,0.729167,0.714286
3,No log,0.525558,0.766975,0.586667,0.916667,0.715447
4,No log,0.625526,0.773148,0.903226,0.583333,0.708861
5,No log,0.83118,0.794367,0.804878,0.6875,0.741573
6,No log,0.809329,0.810957,0.829268,0.708333,0.764045
7,0.424600,0.861584,0.80054,0.825,0.6875,0.75
8,0.424600,0.82851,0.815201,0.813953,0.729167,0.769231


In [None]:
# Cross-validation setup from config
n_splits = config["cross_validation"]["n_splits"]
kf = KFold(
    n_splits=n_splits, 
    shuffle=config["cross_validation"]["shuffle"], 
    random_state=config["cross_validation"]["random_state"]
)
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n=== Fold {fold+1}/{n_splits} ===")
    
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
    
    # Apply class balancing if configured
    if config["class_balancing"]["use_balancing"]:
        if config["class_balancing"]["method"] == "oversample":
            ros = RandomOverSampler(random_state=config["cross_validation"]["random_state"])
            train_indices = np.array(range(len(X_train_fold))).reshape(-1, 1)
            train_indices_resampled, y_train_resampled = ros.fit_resample(train_indices, y_train_fold.values)
            train_indices_resampled = train_indices_resampled.flatten()
            X_train_resampled = X_train_fold.iloc[train_indices_resampled].reset_index(drop=True)
            y_train_resampled = pd.Series(y_train_resampled)
            
            print(f"Original training data distribution: {y_train_fold.value_counts().to_dict()}")
            print(f"Resampled training data distribution: {y_train_resampled.value_counts().to_dict()}")
    else:
        # No resampling
        X_train_resampled, y_train_resampled = X_train_fold, y_train_fold
    
    # Tokenize datasets
    train_dataset = tokenize_texts(
        X_train_resampled, 
        y_train_resampled, 
        tokenizer, 
        max_length=config["data"]["max_length"]
    )
    val_dataset = tokenize_texts(
        X_val_fold, 
        y_val_fold, 
        tokenizer, 
        max_length=config["data"]["max_length"]
    )
    
    # Create a fresh model for this fold
    model = XLMRobertaForSequenceClassification.from_pretrained(
        model_name,
        config=xlm_roberta_config
    ).to(device)
    
    # Create data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    
    # Training arguments from config
    training_args = TrainingArguments(
        output_dir=f"{config['training']['output_dir']}/xlm_roberta_fold_{fold+1}",
        learning_rate=config["training"]["learning_rate"],
        per_device_train_batch_size=config["training"]["batch_size"],
        per_device_eval_batch_size=config["training"]["eval_batch_size"],
        num_train_epochs=config["training"]["epochs"],
        weight_decay=config["training"]["weight_decay"],
        evaluation_strategy=config["training"]["evaluation_strategy"],
        save_strategy=config["training"]["save_strategy"],
        load_best_model_at_end=config["training"]["load_best_model_at_end"],
        metric_for_best_model=config["training"]["metric_for_best_model"],
        save_total_limit=config["training"]["save_total_limit"],
        fp16=torch.cuda.is_available(),
    )
    
    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=config["training"]["early_stopping_patience"])]
    )
    
    trainer.train()
    results = trainer.evaluate()
    fold_results.append(results)
    
    # Save model
    trainer.save_model(f"{training_args.output_dir}")
    
    # Clear memory
    del model, trainer
    torch.cuda.empty_cache()

In [5]:
# Analyze cross-validation results
accuracies = [result['balanced_accuracy'] for result in fold_results]
precisions = [result['precision'] for result in fold_results]
recalls = [result['recall'] for result in fold_results]
f1_scores = [result['f1'] for result in fold_results]

print("\nCross-validation summary:")
print(f"Balanced accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f} ± {np.std(precisions):.4f}")
print(f"Recall: {np.mean(recalls):.4f} ± {np.std(recalls):.4f}")
print(f"F1 Score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")

# Visualize metrics across folds
plt.figure(figsize=(10, 6))
fold_nums = list(range(1, n_splits + 1))
plt.plot(fold_nums, accuracies, 'o-', label='Balanced accuracy')
plt.plot(fold_nums, precisions, 'o-', label='Precision')
plt.plot(fold_nums, recalls, 'o-', label='Recall')
plt.plot(fold_nums, f1_scores, 'o-', label='F1 Score')
plt.xlabel('Fold')
plt.ylabel('Score')
plt.title('Model Performance Across Folds')
plt.legend()
plt.grid(True)
plt.show()

KeyError: 'balanced_accuracy'

In [None]:
# Train final model using the original train/test split
X_train, X_test, y_train, y_test = split(X, y)

# Tokenize datasets
train_dataset = tokenize_texts(
    X_train, 
    y_train, 
    tokenizer, 
    max_length=config["data"]["max_length"]
)
test_dataset = tokenize_texts(
    X_test, 
    y_test, 
    tokenizer, 
    max_length=config["data"]["max_length"]
)

# Create final model
final_model = XLMRobertaForSequenceClassification.from_pretrained(
    model_name,
    config=xlm_roberta_config
).to(device)

# Training arguments for final model
final_training_args = TrainingArguments(
    output_dir=f"{config['training']['output_dir']}/xlm_roberta_final",
    learning_rate=config["training"]["learning_rate"],
    per_device_train_batch_size=config["training"]["batch_size"],
    per_device_eval_batch_size=config["training"]["eval_batch_size"],
    num_train_epochs=config["training"]["epochs"],
    weight_decay=config["training"]["weight_decay"],
    evaluation_strategy=config["training"]["evaluation_strategy"],
    save_strategy=config["training"]["save_strategy"],
    load_best_model_at_end=config["training"]["load_best_model_at_end"],
    metric_for_best_model=config["training"]["metric_for_best_model"],
    save_total_limit=config["training"]["save_total_limit"],
    fp16=torch.cuda.is_available(),
)

# Initialize trainer
final_trainer = Trainer(
    model=final_model,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=config["training"]["early_stopping_patience"])]
)

# Train the model
final_trainer.train()

# Evaluate and save the model
test_results = final_trainer.evaluate()
print("\nTest set evaluation:")
print(test_results)

# Save the final model
final_trainer.save_model(final_training_args.output_dir)
tokenizer.save_pretrained(final_training_args.output_dir)# Train final model using the original train/test split
X_train, X_test, y_train, y_test = split(X, y)

# Tokenize datasets
train_dataset = tokenize_texts(
    X_train, 
    y_train, 
    tokenizer, 
    max_length=config["data"]["max_length"]
)
test_dataset = tokenize_texts(
    X_test, 
    y_test, 
    tokenizer, 
    max_length=config["data"]["max_length"]
)

# Create final model
final_model = XLMRobertaForSequenceClassification.from_pretrained(
    model_name,
    config=xlm_roberta_config
).to(device)

# Training arguments for final model
final_training_args = TrainingArguments(
    output_dir=f"{config['training']['output_dir']}/xlm_roberta_final",
    learning_rate=config["training"]["learning_rate"],
    per_device_train_batch_size=config["training"]["batch_size"],
    per_device_eval_batch_size=config["training"]["eval_batch_size"],
    num_train_epochs=config["training"]["epochs"],
    weight_decay=config["training"]["weight_decay"],
    evaluation_strategy=config["training"]["evaluation_strategy"],
    save_strategy=config["training"]["save_strategy"],
    load_best_model_at_end=config["training"]["load_best_model_at_end"],
    metric_for_best_model=config["training"]["metric_for_best_model"],
    save_total_limit=config["training"]["save_total_limit"],
    fp16=torch.cuda.is_available(),
)

# Initialize trainer
final_trainer = Trainer(
    model=final_model,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=config["training"]["early_stopping_patience"])]
)

# Train the model
final_trainer.train()

# Evaluate and save the model
test_results = final_trainer.evaluate()
print("\nTest set evaluation:")
print(test_results)

# Save the final model
final_trainer.save_model(final_training_args.output_dir)
tokenizer.save_pretrained(final_training_args.output_dir)

In [None]:
test_true_labels = test_results['true_labels']
test_predictions = test_results['predictions']

errors = [(i, pred, true) for i, (pred, true) in 
         enumerate(zip(test_predictions, test_true_labels)) if pred != true]

print(f"\nTotal errors: {len(errors)} out of {len(test_predictions)} samples ({len(errors)/len(test_predictions)*100:.2f}%)")

for i, (idx, pred, true) in enumerate(errors[:5]):
    actual_idx = X_test.index[idx]  # Get the original index in the dataset
    text = X_test.iloc[idx][:500] + "..."  # Show first 500 chars
    
    print(f"\nError {i+1}:")
    print(f"Predicted: {'hyperpartisan' if pred == 1 else 'not hyperpartisan'}")
    print(f"Actual: {'hyperpartisan' if true == 1 else 'not hyperpartisan'}")
    print(f"Text snippet: {text}")


In [None]:
def predict_hyperpartisan(text, model=final_model, tokenizer=tokenizer):
    """Predict if a text is hyperpartisan"""
    # Tokenize
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=config["data"]["max_length"],
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    ).to(device)
    
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        prediction = torch.argmax(outputs.logits, dim=1).item()
        confidence = probs[0][prediction].item()
    
    return {
        'prediction': 'hyperpartisan' if prediction == 1 else 'not hyperpartisan',
        'confidence': confidence,
        'label': prediction
    }

sample_texts = [
    X_test.iloc[0],  
    "This article proves that the President is the worst in history and a complete disaster for America.",  # Likely hyperpartisan
    "The Senate voted yesterday on the new healthcare bill, with 45 votes for and 55 against."  # Likely not hyperpartisan
]

for i, text in enumerate(sample_texts):
    result = predict_hyperpartisan(text)
    print(f"\nSample {i+1}:")
    print(f"Prediction: {result['prediction']}")
    print(f"Confidence: {result['confidence']:.4f}")