In [7]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..'))) # add parent directory to path
from utils.utils import load_cleaned_data, metrics, split
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from torch.optim import AdamW
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from transformers import get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

## Now let's load and explore the data:

In [8]:
# Load the preprocessed by-article data and pull out the text and label columns
data = load_cleaned_data("../data/preprocessed/byarticle_clean.tsv")
X, y = data['full_text'], data['label']

print(f"Total samples: {len(data)}")
print(f"Label distribution: {y.value_counts().to_dict()}")

# Preview the first 200 characters of the first article in each class
# (label 0 is printed as non-hyperpartisan, label 1 as hyperpartisan)
print("\nSample article (non-hyperpartisan):")
print(X.loc[y == 0].iloc[0][:200] + "...")
print("\nSample article (hyperpartisan):")
print(X.loc[y == 1].iloc[0][:200] + "...")

Total samples: 645
Label distribution: {0: 407, 1: 238}

Sample article (non-hyperpartisan):
It's 1968 All Over Again. Almost a half-century ago, in 1968, the United States seemed to be falling apart. The Vietnam War, a bitter and close presidential election, antiwar protests, racial riots, p...

Sample article (hyperpartisan):
Kucinich: Reclaiming the money power. Money ( Image by 401(K) 2013 ) Permission Details DMCA No Pill Can Stop Tinnitus, But This 1 Weird Trick Can The walls are closing in on Congress. Terrifying wall...


## Let's set up the tokenizer and create a custom dataset class:

In [9]:
# Checkpoint name shared by the tokenizer here and by every model built later in the notebook
model_name = "xlm-roberta-base"  # or "xlm-roberta-large" for better performance
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
# No model is created in this cell: a fresh XLMRobertaForSequenceClassification
# (num_labels=2) is instantiated per fold in the cross-validation cell and once
# more for the final model.

class HyperpartisanDataset(Dataset):
    """Map-style dataset that tokenizes one article per item on access.

    Args:
        texts: Article texts. May be a pandas Series (looked up positionally
            through ``.iloc``) or any plain indexable sequence such as a list.
        labels: Class labels aligned position-by-position with ``texts``;
            each one is converted with ``int()``.
        tokenizer: Callable invoked with the text plus the keyword arguments
            used in ``__getitem__``; its result must expose ``'input_ids'``
            and ``'attention_mask'`` tensors.
        max_length: Length every sequence is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Fail early instead of silently pairing texts with the wrong labels
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of articles."""
        return len(self.texts)

    @staticmethod
    def _item_at(values, idx):
        """Positional lookup that works for pandas objects and plain sequences."""
        # pandas objects may carry a non-positional index, so prefer .iloc when present
        if hasattr(values, 'iloc'):
            return values.iloc[idx]
        return values[idx]

    def __getitem__(self, idx):
        """Tokenize the article at position ``idx``.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors
            (the tokenizer's batch dimension is flattened away) and a scalar
            ``torch.long`` ``'labels'`` tensor.
        """
        text = str(self._item_at(self.texts, idx))
        label = int(self._item_at(self.labels, idx))

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
# Smoke-test the dataset class on the first five articles.
# NOTE: `test_dataset` is rebound to the real test split in the final-model cell.
test_dataset = HyperpartisanDataset(X.iloc[:5], y.iloc[:5], tokenizer)
sample_item = test_dataset[0]
# Both tensors are flattened to 1-D and padded to max_length (512 by default)
print(f"Input shape: {sample_item['input_ids'].shape}")
print(f"Attention mask shape: {sample_item['attention_mask'].shape}")
print(f"Label: {sample_item['labels']}")

Input shape: torch.Size([512])
Attention mask shape: torch.Size([512])
Label: 1


## Now, let's define our training and evaluation functions:

In [None]:
# Choose the compute device: CUDA when available, otherwise fall back to the CPU
if not torch.cuda.is_available():
    device = torch.device('cpu')
    print("No GPU available, using CPU.")
else:
    # Release cached allocator blocks left over from earlier runs in this kernel
    torch.cuda.empty_cache()
    device = torch.device('cuda')
print(f"Using device: {device}")

def train_epoch(model, data_loader, optimizer, scheduler, device):
    """Run one training pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose a ``loss`` tensor.
        data_loader: Iterable of batches, each a mapping holding the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0

    batches = tqdm(data_loader, desc="Training")
    for batch in batches:
        optimizer.zero_grad()

        # Move the three tensors the model consumes onto the target device
        model_inputs = {
            field: batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        }
        loss = model(**model_inputs).loss
        running_loss += loss.item()

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        batches.set_postfix({'loss': f"{loss.item():.4f}"})

    return running_loss / len(data_loader)

def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` and print a metric summary.

    Runs the model in eval mode without gradient tracking, takes the argmax of
    the logits as the predicted class and computes loss, accuracy and
    binary precision/recall/F1 (``zero_division=0``).

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``loss`` and ``logits``.
        data_loader: Iterable of batches, each a mapping holding the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        device: Device the batch tensors are moved to.

    Returns:
        dict with keys ``'true_labels'`` and ``'predictions'`` (lists of ints),
        ``'loss'`` (mean batch loss), ``'accuracy'``, ``'precision'``,
        ``'recall'`` and ``'f1'``.
    """
    model.eval()

    predictions = []
    true_labels = []
    total_loss = 0

    progress_bar = tqdm(data_loader, desc="Evaluating")
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            # Predicted class = index of the largest logit in each row
            preds = outputs.logits.argmax(dim=1)

            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

            progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})

    avg_loss = total_loss / len(data_loader)
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, average='binary', zero_division=0)
    recall = recall_score(true_labels, predictions, average='binary', zero_division=0)
    f1 = f1_score(true_labels, predictions, average='binary', zero_division=0)

    # np.unique returns NumPy arrays; .tolist() yields plain Python ints so the
    # printed dicts read {0: 81, 1: 48} instead of {np.int64(0): np.int64(81), ...}
    unique_true, counts_true = np.unique(true_labels, return_counts=True)
    unique_pred, counts_pred = np.unique(predictions, return_counts=True)
    true_distribution = dict(zip(unique_true.tolist(), counts_true.tolist()))
    pred_distribution = dict(zip(unique_pred.tolist(), counts_pred.tolist()))

    print(f"Validation Loss: {avg_loss:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"True labels distribution: {true_distribution}")
    print(f"Predicted labels distribution: {pred_distribution}")

    return {
        'true_labels': true_labels,
        'predictions': predictions,
        'loss': avg_loss,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

Using device: cuda


## Let's implement cross-validation:

In [None]:
# Cross-validation configuration
n_splits = 5
SEED = 0  # used for the fold assignment and, offset by the fold number, for torch
kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

BATCH_SIZE = 8
EPOCHS = 8
LEARNING_RATE = 2e-5

# One entry per completed fold: the metrics dict from that fold's last epoch
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n=== Fold {fold+1}/{n_splits} ===")

    # Seed torch per fold so the classifier-head initialisation and the
    # DataLoader shuffling are repeatable across runs (GPU kernels may still
    # introduce small nondeterminism).
    torch.manual_seed(SEED + fold)

    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    print(f"Training samples: {len(X_train_fold)}")
    print(f"Validation samples: {len(X_val_fold)}")

    train_dataset = HyperpartisanDataset(X_train_fold, y_train_fold, tokenizer)
    val_dataset = HyperpartisanDataset(X_val_fold, y_val_fold, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Start every fold from the pretrained checkpoint so folds stay independent
    model = XLMRobertaForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2
    ).to(device)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    # The scheduler is stepped once per batch, so it spans every batch of every epoch
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    for epoch in range(EPOCHS):
        print(f"\nEpoch {epoch+1}/{EPOCHS}")

        train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
        print(f"Training loss: {train_loss:.4f}")

        print("Validation results:")
        val_results = evaluate(model, val_loader, device)

    # Only the final epoch's validation metrics are kept for this fold
    fold_results.append(val_results)

    model.save_pretrained(f"../models/xlm_roberta_fold_{fold+1}")

    # The optimizer (and the scheduler wrapping it) also reference the model's
    # parameters and hold the Adam state; drop all three so the memory can be
    # reclaimed before the next model is loaded.
    del model, optimizer, scheduler
    torch.cuda.empty_cache()


=== Fold 1/5 ===
Training samples: 516
Validation samples: 129


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/8


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Training loss: 0.6522
Validation results:


Evaluating:   0%|          | 0/17 [00:00<?, ?it/s]

Validation Loss: 0.7135
Accuracy: 0.6279
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
True labels distribution: {np.int64(0): np.int64(81), np.int64(1): np.int64(48)}
Predicted labels distribution: {np.int64(0): np.int64(129)}

Epoch 2/8


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Training loss: 0.6108
Validation results:


Evaluating:   0%|          | 0/17 [00:00<?, ?it/s]

Validation Loss: 0.5610
Accuracy: 0.7287
Precision: 0.5942
Recall: 0.8542
F1 Score: 0.7009
True labels distribution: {np.int64(0): np.int64(81), np.int64(1): np.int64(48)}
Predicted labels distribution: {np.int64(0): np.int64(60), np.int64(1): np.int64(69)}

Epoch 3/8


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Training loss: 0.5064
Validation results:


Evaluating:   0%|          | 0/17 [00:00<?, ?it/s]

Validation Loss: 0.4481
Accuracy: 0.7597
Precision: 0.6667
Recall: 0.7083
F1 Score: 0.6869
True labels distribution: {np.int64(0): np.int64(81), np.int64(1): np.int64(48)}
Predicted labels distribution: {np.int64(0): np.int64(78), np.int64(1): np.int64(51)}

Epoch 4/8


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Training loss: 0.3804
Validation results:


Evaluating:   0%|          | 0/17 [00:00<?, ?it/s]

Validation Loss: 0.5353
Accuracy: 0.7907
Precision: 0.7692
Recall: 0.6250
F1 Score: 0.6897
True labels distribution: {np.int64(0): np.int64(81), np.int64(1): np.int64(48)}
Predicted labels distribution: {np.int64(0): np.int64(90), np.int64(1): np.int64(39)}

Epoch 5/8


Training:   0%|          | 0/65 [00:00<?, ?it/s]

## Let's analyze the cross-validation results:

In [None]:
# Analyze cross-validation results: collect each metric across the recorded folds
accuracies = [result['accuracy'] for result in fold_results]
precisions = [result['precision'] for result in fold_results]
recalls = [result['recall'] for result in fold_results]
f1_scores = [result['f1'] for result in fold_results]

# Mean ± standard deviation over folds
print("\nCross-validation summary:")
print(f"Accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f} ± {np.std(precisions):.4f}")
print(f"Recall: {np.mean(recalls):.4f} ± {np.std(recalls):.4f}")
print(f"F1 Score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")

# Visualize metrics across folds
plt.figure(figsize=(10, 6))
# Build the x-axis from the folds actually recorded: if the training cell was
# interrupted, fold_results holds fewer than n_splits entries and a fixed
# range(1, n_splits + 1) would make plt.plot fail on mismatched lengths.
fold_nums = list(range(1, len(fold_results) + 1))
plt.plot(fold_nums, accuracies, 'o-', label='Accuracy')
plt.plot(fold_nums, precisions, 'o-', label='Precision')
plt.plot(fold_nums, recalls, 'o-', label='Recall')
plt.plot(fold_nums, f1_scores, 'o-', label='F1 Score')
plt.xticks(fold_nums)  # one integer tick per fold
plt.xlabel('Fold')
plt.ylabel('Score')
plt.title('Model Performance Across Folds')
plt.legend()
plt.grid(True)
plt.show()

## Now let's train our final model on the original train/test split:

In [None]:
# Hold-out split produced by the project's `split` helper
X_train, X_test, y_train, y_test = split(X, y)

# Seed torch so the classifier-head initialisation and the training shuffle are
# repeatable between runs (GPU kernels may still add small nondeterminism).
FINAL_MODEL_SEED = 0
torch.manual_seed(FINAL_MODEL_SEED)

train_dataset = HyperpartisanDataset(X_train, y_train, tokenizer)
test_dataset = HyperpartisanDataset(X_test, y_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

final_model = XLMRobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
).to(device)

# Prepare optimizer and scheduler (the scheduler is stepped once per batch,
# so it spans every batch of every epoch)
optimizer = AdamW(final_model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Training loop
print("\nTraining final model...")
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")

    # Train
    train_loss = train_epoch(final_model, train_loader, optimizer, scheduler, device)
    print(f"Training loss: {train_loss:.4f}")

# Save the final model together with its tokenizer so the directory is self-contained
final_model.save_pretrained("../models/xlm_roberta_final")
tokenizer.save_pretrained("../models/xlm_roberta_final")

# Evaluate on test set
print("\nTest set evaluation:")
test_results = evaluate(final_model, test_loader, device)

## Let's look at the most challenging examples - the ones where our model made mistakes:

In [None]:
# Analyze errors: collect (position, predicted, true) for every misclassified test article
test_true_labels = test_results['true_labels']
test_predictions = test_results['predictions']

errors = [(i, pred, true) for i, (pred, true) in
          enumerate(zip(test_predictions, test_true_labels)) if pred != true]

# Guard the percentage against an empty test set
error_pct = len(errors) / len(test_predictions) * 100 if test_predictions else 0.0
print(f"\nTotal errors: {len(errors)} out of {len(test_predictions)} samples ({error_pct:.2f}%)")

# Examine a few errors
for i, (idx, pred, true) in enumerate(errors[:5]):
    # `idx` is a position in the test split; map it back to the index label in X
    # so the article can be looked up in the full dataset.
    actual_idx = X_test.index[idx]
    text = X_test.iloc[idx][:500] + "..."  # Show first 500 chars

    print(f"\nError {i+1}:")
    print(f"Dataset index: {actual_idx}")
    print(f"Predicted: {'hyperpartisan' if pred == 1 else 'not hyperpartisan'}")
    print(f"Actual: {'hyperpartisan' if true == 1 else 'not hyperpartisan'}")
    print(f"Text snippet: {text}")

## Finally, let's implement a simple prediction function for new articles:

In [None]:
def predict_hyperpartisan(text, model, tokenizer, device, max_length=512):
    """Classify a single article as hyperpartisan or not.

    Args:
        text: Article text to classify.
        model: Module called with ``input_ids`` and ``attention_mask``; its
            output must expose ``logits`` with one row per input.
        tokenizer: Callable invoked with the same keyword arguments that
            ``HyperpartisanDataset`` uses.
        device: Device the encoded tensors are moved to.
        max_length: Length the input is truncated/padded to. Defaults to 512,
            the length used by ``HyperpartisanDataset`` during training; the
            previous default of 1024 padded every input beyond what
            xlm-roberta-base accepts. Values above the tokenizer's
            ``model_max_length`` (when it exposes one) are clamped to it.

    Returns:
        dict with ``'prediction'`` (``'hyperpartisan'`` or
        ``'not hyperpartisan'``), ``'confidence'`` (softmax probability of the
        predicted class) and ``'label'`` (predicted class index).
    """
    model.eval()

    # Never request more tokens than the tokenizer says the model can take
    tokenizer_limit = getattr(tokenizer, 'model_max_length', None)
    if isinstance(tokenizer_limit, int) and tokenizer_limit > 0:
        max_length = min(max_length, tokenizer_limit)

    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        # Single input, so row 0 holds the only set of class probabilities
        probabilities = torch.softmax(logits, dim=1)[0]

    result = logits.argmax(dim=1).item()
    confidence = probabilities[result].item()

    return {
        'prediction': 'hyperpartisan' if result == 1 else 'not hyperpartisan',
        'confidence': confidence,
        'label': result
    }

# Try the prediction helper on one held-out article and two hand-written sentences
sample_texts = [
    X_test.iloc[0],  # Test with a real example
    "This article proves that the President is the worst in history and a complete disaster for America.",  # Likely hyperpartisan
    "The Senate voted yesterday on the new healthcare bill, with 45 votes for and 55 against."  # Likely not hyperpartisan
]

# Print the predicted class and its softmax confidence for each sample
for i, text in enumerate(sample_texts):
    result = predict_hyperpartisan(text, final_model, tokenizer, device)
    print(f"\nSample {i+1}:")
    print(f"Prediction: {result['prediction']}")
    print(f"Confidence: {result['confidence']:.4f}")