<a href="https://colab.research.google.com/github/Zerixxx8995/multimodal-fake-news-detector/blob/main/notebooks/02_phase3_text_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Phase 3 Observation (Text-only Baseline)

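- The text-only DistilBERT baseline collapses to predicting a single class (label 0) for every validation sample; the dataset is balanced (1,000 examples per label), so there is no majority class to fall back on.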
- Precision, recall, and F1-score for the positive class drop to zero by the second epoch (from roughly 0.43 after the first).
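- This is expected: the synthetic headlines carry little label-relevant linguistic signal (for example, the same headline "A normal everyday scene." appears with both labels), so the training loss stays near ln 2 ≈ 0.693, i.e. chance level.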
- This result motivates the need for image-based and multimodal models.


In [6]:
import pandas as pd

# Read the full multimodal table, then keep an independent copy of only the
# two columns the text-only baseline uses.
df = pd.read_csv("multimodal_dataset.csv")
text_df = df.loc[:, ["headline", "label"]].copy()

# Sanity check: first rows, followed by the per-label example counts.
print(text_df.head(), text_df["label"].value_counts(), sep="\n")

                           headline  label
0  A shocking and misleading scene.      1
1          A normal everyday scene.      0
2  A shocking and misleading scene.      1
3          A normal everyday scene.      0
4          A normal everyday scene.      1
label
1    1000
0    1000
Name: count, dtype: int64


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split
# repeatable.
train_df, val_df = train_test_split(
    text_df, test_size=0.2, random_state=42, stratify=text_df["label"]
)

# Report the size of each split.
print(*map(len, (train_df, val_df)))


1600 400


In [9]:
!pip install transformers torch scikit-learn




In [10]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import DistilBertTokenizerFast, DistilBertModel
from sklearn.metrics import precision_recall_fscore_support


In [11]:
# Fixed token length that every headline is padded / truncated to when it is
# passed through the tokenizer by the dataset class below.
MAX_LEN = 16

# Tokenizer belonging to the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [12]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of strings.
        labels: Indexable collection of numeric labels, aligned with ``texts``.
        tokenizer: Callable accepting ``(text, truncation=, padding=,
            max_length=, return_tensors=)`` and returning a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length every encoded text is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of items, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and a float ``label`` for item ``idx``."""
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )

        # The tokenizer is asked for tensors with a leading batch axis of size
        # one; drop it so a DataLoader can stack items itself.
        item = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        # Float dtype so the label can be fed directly to a BCE-style loss.
        item["label"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


In [13]:
# Wrap each split's headline / label columns in a TextDataset that shares the
# same tokenizer and fixed sequence length.
train_dataset, val_dataset = (
    TextDataset(
        split["headline"].tolist(),
        split["label"].tolist(),
        tokenizer,
        MAX_LEN,
    )
    for split in (train_df, val_df)
)

# Batches of 16; only the training loader shuffles between epochs.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)


In [14]:
class DistilBERTClassifier(nn.Module):
    """DistilBERT encoder followed by a single-logit linear head.

    Args:
        model_name: Checkpoint identifier passed to
            ``DistilBertModel.from_pretrained``. Defaults to
            ``"distilbert-base-uncased"``, the value that was previously
            hard-coded, so existing no-argument callers are unaffected.
    """

    def __init__(self, model_name="distilbert-base-uncased"):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(model_name)
        # Size the head from the encoder's own configuration rather than a
        # hard-coded 768 so the layer always matches the loaded checkpoint.
        self.classifier = nn.Linear(self.bert.config.dim, 1)

    def forward(self, input_ids, attention_mask):
        """Return one raw (pre-sigmoid) logit per input sequence.

        Args:
            input_ids: Token-id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            The linear head's output with its trailing size-1 axis removed.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Use the hidden state at position 0 of every sequence as the
        # sequence-level representation fed to the classifier.
        cls_output = outputs.last_hidden_state[:, 0]
        logits = self.classifier(cls_output)
        return logits.squeeze(-1)


In [15]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the classifier and move its parameters to the selected device.
model = DistilBERTClassifier()
model = model.to(device)

# The model emits one raw logit per example, so use the loss that applies the
# sigmoid internally; AdamW updates every model parameter.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [16]:
def train_epoch(model, loader):
    """Train ``model`` for one pass over ``loader`` and return the mean batch loss.

    Relies on the notebook-level ``optimizer``, ``criterion`` and ``device``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        loader: Sized iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` tensors.

    Returns:
        Sum of the per-batch loss values divided by ``len(loader)``.
    """
    model.train()
    running_loss = 0

    for batch in loader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        ids, mask, targets = (
            batch[key].to(device)
            for key in ("input_ids", "attention_mask", "label")
        )

        batch_loss = criterion(model(ids, mask), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


In [17]:
def evaluate(model, loader):
    """Score ``model`` on ``loader`` and return binary precision, recall and F1.

    Relies on the notebook-level ``device``. Predictions are obtained by
    thresholding the sigmoid of the model's logits at 0.5.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        loader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` tensors.

    Returns:
        Tuple ``(precision, recall, f1)`` for the positive class. A metric
        that is undefined (for example precision when nothing is predicted
        positive) is reported as 0 without emitting a warning.
    """
    model.eval()
    preds, true = [], []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            logits = model(input_ids, attention_mask)
            probs = torch.sigmoid(logits)

            # Collect plain Python ints for both lists so the metric function
            # receives predictions and targets of the same type.
            preds.extend((probs > 0.5).long().cpu().tolist())
            # Labels are only compared on the host, so they are not moved to
            # the device first.
            true.extend(batch["label"].long().tolist())

    # zero_division=0 keeps the reported value (0) for an undefined metric
    # but suppresses the UndefinedMetricWarning raised by the default "warn".
    p, r, f, _ = precision_recall_fscore_support(
        true, preds, average="binary", zero_division=0
    )
    return p, r, f


In [18]:
# Two passes over the training data; after each pass, score the model on the
# validation split and print a short report.
for epoch in range(2):
    loss = train_epoch(model, train_loader)
    p, r, f = evaluate(model, val_loader)

    print(
        f"Epoch {epoch+1}",
        f"Loss: {loss:.4f}",
        f"Precision: {p:.3f}",
        f"Recall: {r:.3f}",
        f"F1-score: {f:.3f}",
        "-" * 30,
        sep="\n",
    )


Epoch 1
Loss: 0.6946
Precision: 0.440
Recall: 0.425
F1-score: 0.433
------------------------------
Epoch 2
Loss: 0.6950
Precision: 0.000
Recall: 0.000
F1-score: 0.000
------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
