In [2]:
# Install necessary libraries (if not installed)
# !pip install transformers tensorflow scikit-learn pandas numpy torch

# Import libraries
import numpy as np
import pandas as pd
import re
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 1. Create a Fake News dataset
data = {
    "text": [
        "Breaking: Scientists discover a cure for COVID-19!",  # Fake
        "NASA confirms water on the moon, groundbreaking discovery!",  # Real
        "Experts warn of stock market collapse due to secret government policies.",  # Fake
        "Government announces new tax relief for small businesses.",  # Real
        "Shocking: Aliens found living in Area 51!",  # Fake
        "New study finds link between exercise and improved mental health.",  # Real
        "Politician caught hiding millions in offshore accounts.",  # Fake
        "Medical researchers develop breakthrough cancer treatment.",  # Real
        "Secret messages found in ancient pyramids predict end of the world!",  # Fake
        "Tech company launches revolutionary AI that changes programming forever.",  # Real,
    ],
    "label": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  # 1 = Fake, 0 = Real
}

df = pd.DataFrame(data)

# 2. Tokenization using BERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Custom dataset class for PyTorch
class FakeNewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=20):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Split data
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size=0.3, random_state=42)

# Create dataset objects
train_dataset = FakeNewsDataset(X_train.tolist(), y_train.tolist(), tokenizer)
test_dataset = FakeNewsDataset(X_test.tolist(), y_test.tolist(), tokenizer)

# 3. Load Pre-trained DistilBERT Model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2).to(device)

# 4. Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=50,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=5,
    evaluation_strategy="epoch",
    save_strategy="no"
)

# 5. Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# 6. Train the Model
trainer.train()

# 7. Evaluate the Model
def compute_accuracy(model, dataset):
    model.eval()
    dataloader = DataLoader(dataset, batch_size=2)
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())

    return accuracy_score(true_labels, predictions)

bert_accuracy = compute_accuracy(model, test_dataset)
print(f"BERT Model Accuracy: {bert_accuracy:.2f}")

# 8. Test with new headlines
new_headlines = [
    "Scientists invent teleportation technology, changing travel forever!",
    "World Health Organization announces breakthrough in malaria vaccine.",
    "Shocking discovery: Atlantis city found under the ocean!",
    "New smartphone released with groundbreaking AI features."
]

# Tokenize and Predict
new_encodings = tokenizer(new_headlines, padding=True, truncation=True, max_length=20, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**new_encodings)
    predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()

# Print Predictions
for headline, pred in zip(new_headlines, predictions):
    category = "Fake News" if pred == 1 else "Real News"
    print(f"'{headline}' → {category}")


Using device: cpu


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.675916
2,0.699100,0.676274
3,0.655500,0.714644
4,0.553800,0.658355
5,0.329000,0.666103
6,0.329000,0.66872
7,0.183600,0.793753
8,0.080800,0.97779
9,0.042400,1.128682
10,0.019800,1.268721


BERT Model Accuracy: 0.67
'Scientists invent teleportation technology, changing travel forever!' → Fake News
'World Health Organization announces breakthrough in malaria vaccine.' → Real News
'Shocking discovery: Atlantis city found under the ocean!' → Fake News
'New smartphone released with groundbreaking AI features.' → Real News


Why use BERT model? Where does it make sense to use it?