# Training the RoBERTa Model
In this notebook, we:
- Wrap tokenized data into PyTorch datasets
- Load a pre-trained `roberta-base` model
- Fine-tune it on our labeled fake news dataset
- Evaluate performance using accuracy and F1 score


In [None]:
import torch
import time
import pickle
import sys
import os
import numpy as np
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn import CrossEntropyLoss, BCEWithLogitsLoss
from torch.nn.utils import clip_grad_norm_
from torch.optim import AdamW
from transformers import RobertaForSequenceClassification, RobertaTokenizer, get_scheduler
from sklearn.metrics import accuracy_score, f1_score, classification_report
from torch.cuda.amp import GradScaler, autocast
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

In [None]:
# Tokenizer for the same checkpoint name the classifier is loaded from; it is
# written out next to the fine-tuned model in the final cell of this notebook.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [None]:
def _load_artifact(name):
    """Unpickle ``artifacts/<name>.pkl`` and return the stored object.

    NOTE(review): ``pickle.load`` can run arbitrary code while deserialising, so
    these files must come from a trusted source.
    """
    with open(f"artifacts/{name}.pkl", "rb") as f:
        return pickle.load(f)


# Tokenised inputs and labels for each split.
train_encodings = _load_artifact("train_encodings")
train_labels = _load_artifact("train_labels")
val_encodings = _load_artifact("val_encodings")
val_labels = _load_artifact("val_labels")
test_encodings = _load_artifact("test_encodings")
test_labels = _load_artifact("test_labels")
# DataFrames for the train/validation splits; their 'label' column is inspected later.
train_df = _load_artifact("train_df")
val_df = _load_artifact("val_df")


## Create Custom Dataset Class

We need to wrap the `input_ids`, `attention_mask`, and `labels` into a single PyTorch `Dataset` object.  
This makes it easy to iterate over batches during training.


In [None]:
class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with class labels.

    ``encodings`` is a mapping from field name to an indexable collection holding
    one entry per example; ``labels`` is an indexable collection that is expected
    to have the same number of entries.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label collection defines how many examples the dataset exposes.
        return len(self.labels)

    def __getitem__(self, idx):
        # One example = every encoding field at position idx, plus its label.
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        # Labels are stored as int64 tensors under the 'labels' key.
        example['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return example


In [None]:
# Wrap each split's encodings and labels in a NewsDataset.
train_dataset = NewsDataset(encodings=train_encodings, labels=train_labels)
val_dataset = NewsDataset(encodings=val_encodings, labels=val_labels)
test_dataset = NewsDataset(encodings=test_encodings, labels=test_labels)

## Load Pretrained RoBERTa Model with Classification Head

We’ll now load `roberta-base` and add a classification head for **binary classification (2 labels)**.


In [None]:
# Sequence-classification model loaded from `roberta-base`, configured for two labels.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
)

In [None]:
# Batches of 8 examples; only the training split is shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)

In [None]:
# AdamW over every model parameter with a learning rate of 2e-5.
optimizer = AdamW(params=model.parameters(), lr=2e-5)

In [None]:
# Ask the shell which Python interpreter(s) it finds on PATH.
# NOTE(review): `where` is the Windows lookup command (POSIX shells use `which`) — confirm
# the target platform; the next cell prints sys.executable, which works on any OS.
!where python

In [None]:
# Report the interpreter and the PyTorch/CUDA build this kernel is running against.
cuda_ready = torch.cuda.is_available()
environment_facts = (
    ("Python executable:", sys.executable),
    ("Torch version:", torch.__version__),
    ("CUDA version:", torch.version.cuda),
    ("CUDA available:", cuda_ready),
)
for caption, value in environment_facts:
    print(caption, value)
if cuda_ready:
    # Only query the device name when CUDA is usable; index 0 is the first device.
    print("Device:", torch.cuda.get_device_name(0))

In [None]:
# Print per-class example counts for the training and validation splits.
for split_title, split_df in (
    ("Train label distribution:", train_df),
    ("Validation label distribution:", val_df),
):
    print(split_title)
    print(split_df['label'].value_counts())

In [None]:
# Prefer the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

## Training Loop

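We'll train the model with mixed precision for a fixed number of epochs:
- Use `scaler.scale(loss).backward()` to compute gradients on the scaled loss
- Use `scaler.step(optimizer)` followed by `scaler.update()` to update the weights
- Track the running and average training loss per epoch; evaluation metrics are computed in a separate validation cell after training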


In [None]:
# Weight for the positive class: (# label-0 examples) / (# label-1 examples) in train_df.
# NOTE(review): the training loop in this notebook optimises `outputs.loss` returned by
# the model, so `criterion` is not consumed there — confirm whether it is still needed.
label_counts = train_df['label'].value_counts()
neg_to_pos_ratio = label_counts[0] / label_counts[1]
pos_weight = torch.tensor([neg_to_pos_ratio]).to(device)

criterion = BCEWithLogitsLoss(pos_weight=pos_weight)  # Handles class imbalance

In [None]:
# Inverse-frequency class weights, rescaled so that they sum to 1.
# NOTE(review): the counts are hard-coded — presumably the label-0 / label-1 sizes of the
# training split; confirm against train_df. `loss_fn` is not referenced by the training
# loop in this notebook.
class_counts = torch.tensor([18785, 17133], dtype=torch.float)
inverse_freq = 1.0 / class_counts
class_weights = (inverse_freq / inverse_freq.sum()).to(device)

loss_fn = CrossEntropyLoss(weight=class_weights)

In [None]:
# Smoke test: time a single no-grad forward pass on one training batch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # <- Make sure the model is on the same device

model.eval()
with torch.no_grad():
    batch = next(iter(train_loader))

    # Only move tensor items to the device
    batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}

    # CUDA work is queued asynchronously: drain anything pending before starting the
    # clock, and wait for the forward pass to actually finish before stopping it.
    # Without the synchronize calls only the kernel-launch overhead would be measured.
    if device.type == "cuda":
        torch.cuda.synchronize()
    start_time = time.perf_counter()  # monotonic, high-resolution timer
    outputs = model(**batch)  # batch includes 'labels', so the loss is computed as well
    if device.type == "cuda":
        torch.cuda.synchronize()
    end_time = time.perf_counter()

print(f"Single forward pass time: {end_time - start_time:.4f} seconds")

In [None]:
# Fine-tune the classifier with (optional) mixed precision and a plateau LR schedule.

# Mixed precision via torch.cuda.amp only applies on CUDA; with enabled=False the scaler
# and the autocast context become pass-throughs, so the same loop also runs on CPU.
use_amp = device.type == "cuda"
scaler = GradScaler(enabled=use_amp)
epochs = 10

# Halve the learning rate when the epoch-average training loss stops improving.
# The `verbose` flag is deprecated in recent PyTorch releases, so learning-rate changes
# are reported explicitly after each scheduler step below.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

# Do not rely on an earlier cell having moved the model; .to() is a no-op if it already has.
model.to(device)

# Fail fast with a clear message instead of a NameError on avg_loss further down.
if len(train_loader) == 0:
    raise ValueError("train_loader yielded no batches; nothing to train on")

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    avg_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")

    for batch_idx, batch in enumerate(progress_bar):
        batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
        optimizer.zero_grad()

        with autocast(enabled=use_amp):
            # Let model compute loss itself
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["labels"]
            )
            loss = outputs.loss

        # Scale the loss before backward, then let the scaler drive the optimizer step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Read the scalar loss once per step and reuse it for both statistics.
        batch_loss = loss.item()
        total_loss += batch_loss
        avg_loss = total_loss / (batch_idx + 1)
        progress_bar.set_postfix(loss=batch_loss, avg_loss=avg_loss)

        if epoch == 0 and batch_idx == 0:
            print("\n Batch Tensor Devices (first batch only):")
            for k, v in batch.items():
                print(f"{k}: shape={v.shape}, dtype={v.dtype}, device={v.device}")

    print(f"\n Epoch {epoch + 1} - Avg Loss: {avg_loss:.4f}")

    # Step the plateau scheduler once per epoch on the monitored metric; without this
    # call the schedule configured above never takes effect.
    lr_before = optimizer.param_groups[0]["lr"]
    scheduler.step(avg_loss)
    lr_after = optimizer.param_groups[0]["lr"]
    if lr_after < lr_before:
        print(f" Learning rate reduced: {lr_before:.2e} -> {lr_after:.2e}")

In [None]:
# Evaluate the fine-tuned model on the validation split.
model.eval()
pred_labels = []
true_labels = []

# autocast from torch.cuda.amp only applies on CUDA; disable it elsewhere so the
# context is a plain pass-through on CPU-only machines.
use_amp = device.type == "cuda"

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Validating", leave=False):
        batch = {k: v.to(device) for k, v in batch.items()}
        labels = batch['labels'].cpu().numpy()

        with autocast(enabled=use_amp):
            outputs = model(**batch).logits
            probs = torch.softmax(outputs, dim=1).cpu().numpy()  # Softmax for multi-class

        # Predicted class = index of the highest probability in each row.
        preds = np.argmax(probs, axis=1)

        pred_labels.extend(preds)
        true_labels.extend(labels)

# Results
f1 = f1_score(true_labels, pred_labels, average="macro")
print(f"\n✅ Validation F1 Score: {f1:.4f}")
print("🔍 Predicted label distribution:", np.unique(pred_labels, return_counts=True))
print("🔍 True label distribution:", np.unique(true_labels, return_counts=True))
# Pin labels=[0, 1] so the two target names always line up with the reported rows, even
# when only one class occurs among the true and predicted labels (otherwise the report
# raises on the name/class count mismatch).
# NOTE(review): assumes label 0 = 'fake' and label 1 = 'real' — confirm against preprocessing.
print(
    "\n📊 Classification Report:\n",
    classification_report(
        true_labels,
        pred_labels,
        labels=[0, 1],
        target_names=['fake', 'real'],
    ),
)

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
save_directory = "../model"

# Model first, then the tokenizer loaded at the top of the notebook.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")