In [1]:
# Mount Google Drive into the Colab runtime. The source, data, model and log
# paths used by the other cells in this notebook all live under
# /content/drive/MyDrive, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# Imports from src
import os
import sys
# Make the project's `src` directory importable. The membership check keeps
# the cell idempotent: re-running it no longer appends a duplicate entry to
# sys.path each time.
SRC_DIR = '/content/drive/MyDrive/fake-news-multimodal/src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [3]:
from dataloader.text_dataloader import load_text_data
from models.XML_Roberta.XLM_R_Model import build_xlm_model, get_tokenizer
from train.train_roberta import train_text_model
from evaluate.evaluate_roberta import compute_text_metrics


In [4]:
import os
import torch
from torch.utils.data import TensorDataset

# Paths (all under the mounted Drive project folder)
TEXT_DATA_PATH = "/content/drive/MyDrive/fake-news-multimodal/textdata"
MODEL_SAVE_PATH = "/content/drive/MyDrive/fake-news-multimodal/models"
# Spelled "xlm_roberta" (XLM-R, not XML) so this cell and the standalone
# training cell further down share one log directory.
LOG_DIR = "/content/drive/MyDrive/fake-news-multimodal/logs/xlm_roberta"

def run_pipeline():
    """Run the text-only XLM-RoBERTa workflow: load data, build, train, report.

    Steps:
        1. Load the preprocessed train/test datasets with ``load_text_data()``.
        2. Build the classifier with ``build_xlm_model()``.
        3. Make sure ``MODEL_SAVE_PATH`` and ``LOG_DIR`` exist, then call
           ``train_text_model(model, train_dataset, test_dataset)``, which
           returns ``(trained_model, evaluation_results)``.
        4. Print every entry of ``evaluation_results``.

    The pipeline prints a message and stops, without building or training
    anything, when either dataset split is ``None``.

    Returns:
        None. Results are reported on stdout only.
    """
    # Step 1: load data
    print("1. Loading Preprocessed Text Data...")
    train_dataset, test_dataset = load_text_data()

    # Both splits are handed to train_text_model, so a missing split of either
    # kind makes the run impossible; stop before building the model.
    if train_dataset is None or test_dataset is None:
        print("Pipeline halted due to missing data.")
        return

    # Step 2: build model
    print("\n2. Building XLM-RoBERTa Model...")
    model = build_xlm_model()

    # Step 3: train, save, and evaluate
    print("\n3. Starting Training Process...")
    os.makedirs(MODEL_SAVE_PATH, exist_ok=True)
    os.makedirs(LOG_DIR, exist_ok=True)

    # train_text_model returns the trained model and the final evaluation
    # results; only the results are needed for the summary below.
    _trained_model, evaluation_results = train_text_model(model, train_dataset, test_dataset)

    # Step 4: display results
    print("\n--- Final Test Set Results Summary ---")
    for key, value in evaluation_results.items():
        # Clean up the key name for display (e.g. 'eval_f1' -> 'F1').
        label = key.replace('eval_', '').capitalize()
        try:
            shown = f"{value:.4f}"
        except (TypeError, ValueError):
            # Non-numeric entries cannot take a float format spec; show them
            # as-is instead of aborting the summary.
            shown = str(value)
        print(f"{label}: {shown}")

    print(f"\n✅ Pipeline execution complete. Best model saved to: {MODEL_SAVE_PATH}")

# Inside a notebook kernel __name__ is "__main__", so executing this cell runs
# the whole pipeline immediately (the recorded output follows this cell).
if __name__ == "__main__":
    run_pipeline()

1. Loading Preprocessed Text Data...
Loaded training dataset with 3400 samples.
Loaded testing dataset with 600 samples.

2. Building XLM-RoBERTa Model...
Building model architecture: xlm-roberta-base


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model initialized with 2 output classes.

3. Starting Training Process...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

  trainer = Trainer(



--- Starting XLM-RoBERTa Fine-Tuning ---




Step,Training Loss
50,0.702
100,0.6601
150,0.5056
200,0.3394
250,0.2267
300,0.2107
350,0.1714
400,0.1649
450,0.1813
500,0.2104



--- Running Final Evaluation on Test Set ---





Fusion-ready BEST MODEL saved to: /content/drive/MyDrive/fake-news-multimodal/models/xlm_roberta/text_encoder_fusion.pt
üìÅ HuggingFace BEST MODEL saved to: /content/drive/MyDrive/fake-news-multimodal/models/xlm_roberta/hf_format

--- Final Test Set Results Summary ---
Loss: 0.0935
Accuracy: 0.9833
F1: 0.9833
Precision: 0.9866
Recall: 0.9800
Runtime: 435.9589
Samples_per_second: 1.3760
Steps_per_second: 0.0870
Epoch: 3.0000

‚úÖ Pipeline execution complete. Best model saved to: /content/drive/MyDrive/fake-news-multimodal/models


In [None]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from evaluate.evaluate_roberta import compute_text_metrics  # your metrics function

# Checkpoint to fine-tune and the size of its classification head
MODEL_NAME, NUM_LABELS = "xlm-roberta-base", 2

# Inputs: pre-tokenized tensors for each split
TRAIN_PATH = "/content/drive/MyDrive/fake-news-multimodal/textdata/train_text_tensors.pt"
TEST_PATH  = "/content/drive/MyDrive/fake-news-multimodal/textdata/test_text_tensors.pt"

# Outputs: saved model / checkpoints and training logs
MODEL_SAVE_PATH = "/content/drive/MyDrive/fake-news-multimodal/models"
LOG_DIR = "/content/drive/MyDrive/fake-news-multimodal/logs/xlm_roberta"

# --- Step 1: load tensors ---
# map_location="cpu" lets the files load on a CPU-only runtime even if the
# tensors were saved from a GPU session.
# NOTE(review): torch.load unpickles the file, so only load .pt files that
# were produced by this project.
train_data = torch.load(TRAIN_PATH, map_location="cpu")
test_data  = torch.load(TEST_PATH, map_location="cpu")

print(f"Train samples: {train_data['labels'].shape[0]}")
print(f"Test samples: {test_data['labels'].shape[0]}")

# --- Step 2: wrap the tensors in a dictionary-style Dataset ---
class TextTensorDataset(Dataset):
    """Dataset over pre-tokenized text that yields one dict per sample.

    Args:
        encodings: Mapping that provides ``'input_ids'`` and
            ``'attention_mask'`` entries, each indexable by sample position.
        labels: Per-sample labels; ``labels.shape[0]`` is the dataset length.

    Each item is ``{'input_ids': ..., 'attention_mask': ..., 'labels': ...}``
    with all three values taken at the same index.
    """

    # Encoding entries copied into every sample, in output order.
    _ENCODING_FIELDS = ('input_ids', 'attention_mask')

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        sample_count = self.labels.shape[0]
        return sample_count

    def __getitem__(self, idx):
        sample = {field: self.encodings[field][idx] for field in self._ENCODING_FIELDS}
        sample['labels'] = self.labels[idx]
        return sample

# Build one dataset per split: only the two model inputs are forwarded as
# encodings; the labels are passed separately.
train_dataset = TextTensorDataset(
    encodings={field: train_data[field] for field in ('input_ids', 'attention_mask')},
    labels=train_data['labels'],
)
test_dataset = TextTensorDataset(
    encodings={field: test_data[field] for field in ('input_ids', 'attention_mask')},
    labels=test_data['labels'],
)

# --- Step 3: load tokenizer & model ---
# Both come from the MODEL_NAME checkpoint; the model gets a sequence-classification
# head with NUM_LABELS outputs. The recorded output of this cell warns that the
# classifier.* weights are newly initialized, i.e. the head is not part of the
# pretrained checkpoint and is learned during fine-tuning.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
print(f"Model initialized with {NUM_LABELS} output classes.")

# --- Step 4: training arguments ---
# NOTE(review): with 3400 training samples, batch size 16 and 3 epochs there are
# roughly 640 optimizer steps (assuming one device and no gradient
# accumulation), so warmup_steps=500 spends most of training in warmup --
# confirm this is intended.
# NOTE(review): no evaluation/save strategy or load_best_model_at_end is set
# here, so metric_for_best_model presumably has no effect -- confirm against
# the installed transformers version.
training_args = TrainingArguments(
    # where checkpoints and logs go
    output_dir=MODEL_SAVE_PATH,
    logging_dir=LOG_DIR,
    logging_steps=50,
    save_total_limit=1,
    report_to="none",  # disables WandB logging
    # schedule
    num_train_epochs=3,
    warmup_steps=500,
    # batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # optimizer
    learning_rate=2e-5,
    weight_decay=0.01,
    # model selection
    metric_for_best_model="f1",
)

# --- Step 5: Trainer ---
# Recent transformers releases deprecate Trainer's `tokenizer=` argument in
# favour of `processing_class=`. Pick whichever name the installed version
# accepts so the cell runs without the deprecation warning on new releases
# and still works on older ones.
import inspect

_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_text_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# --- Step 6: train the model ---
print("\n--- Starting XLM-RoBERTa Fine-Tuning ---")
trainer.train()

# --- Step 7: save the final model ---
# No best-checkpoint selection is configured in this cell
# (load_best_model_at_end is not set), so this writes the model as it stands
# after the last training step, not a "best" checkpoint.
trainer.save_model(MODEL_SAVE_PATH)
print(f"✅ Model saved to: {MODEL_SAVE_PATH}")

# --- Step 8: evaluate on the test set ---
eval_results = trainer.evaluate(test_dataset)
print("\n--- Final Test Set Metrics ---")
for k, v in eval_results.items():
    try:
        formatted = f"{v:.4f}"
    except (TypeError, ValueError):
        # Non-numeric entries cannot take a float format spec; print them raw
        # rather than aborting the report.
        formatted = str(v)
    print(f"{k}: {formatted}")


Train samples: 3400
Test samples: 600


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Model initialized with 2 output classes.

--- Starting XLM-RoBERTa Fine-Tuning ---


Step,Training Loss
50,0.7043
100,0.6724
150,0.5507
200,0.3842
250,0.2632
300,0.1849
350,0.2207
400,0.195
450,0.1533
500,0.0819


‚úÖ Model saved to: /content/drive/MyDrive/fake-news-multimodal/models





--- Final Test Set Metrics ---
eval_loss: 0.1255
eval_accuracy: 0.9717
eval_f1: 0.9714
eval_precision: 0.9797
eval_recall: 0.9633
eval_runtime: 341.1801
eval_samples_per_second: 1.7590
eval_steps_per_second: 0.1110
epoch: 3.0000
