In [1]:
# Step 1: Imports
from huggingface_hub import login
import os
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    DistilBertTokenizer, DistilBertForSequenceClassification,
    BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
)
from tqdm import tqdm
from torch.optim import AdamW
import numpy as np

# Step 2: Download NLTK Resources
nltk.download("stopwords")
nltk.download("punkt")
# Download the specific punkt_tab resource needed by word_tokenize's internal sent_tokenize
nltk.download("punkt_tab")

# Step 3: Load Data
true_news = pd.read_csv("/content/sample_data/True.csv")
fake_news = pd.read_csv("/content/sample_data/Fake.csv")

true_news["label"] = 1
fake_news["label"] = 0

df = pd.concat([true_news, fake_news], axis=0)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Step 4: Enhanced Text Preprocessing
def preprocess_text(text):
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace

    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    # Keep only alphabetic words with length > 2
    filtered_words = [word for word in word_tokens
                     if word.isalpha() and word not in stop_words and len(word) > 2]

    return ' '.join(filtered_words)

df["cleaned_text"] = df["text"].apply(preprocess_text)
df = df.dropna(subset=['cleaned_text', 'label'])

# Step 5: TF-IDF + Random Forest Baseline
X = df["cleaned_text"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

rf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)

print("\n=== Random Forest Results ===")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")

# ========== Transformer Models ==========

class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

def train_model(model, tokenizer, train_texts, train_labels,
                val_texts, val_labels, max_len=256,
                batch_size=16, epochs=3, learning_rate=2e-5):
    print(f"\n=== Training {model.__class__.__name__} ===")
    train_dataset = NewsDataset(
        texts=train_texts,
        labels=train_labels,
        tokenizer=tokenizer,
        max_len=max_len
    )

    val_dataset = NewsDataset(
        texts=val_texts,
        labels=val_labels,
        tokenizer=tokenizer,
        max_len=max_len
    )

    # Create dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2
    )

    # Setup device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    best_accuracy = 0
    history = {'train_loss': [], 'val_loss': [], 'val_acc': [], 'val_f1': []}

    for epoch in range(epochs):
        print(f'\nEpoch {epoch + 1}/{epochs}')
        print('-' * 10)

        # Training phase
        model.train()
        running_loss = 0.0

        for batch in tqdm(train_loader, desc='Training'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            running_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = running_loss / len(train_loader)
        history['train_loss'].append(avg_train_loss)
        print(f'Training loss: {avg_train_loss:.4f}')

        # Validation phase
        model.eval()
        running_loss = 0.0
        correct_predictions = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                running_loss += loss.item()

                _, preds = torch.max(outputs.logits, dim=1)
                correct_predictions += torch.sum(preds == labels)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = running_loss / len(val_loader)
        val_accuracy = correct_predictions.double() / len(val_dataset)
        val_f1 = f1_score(all_labels, all_preds)

        history['val_loss'].append(avg_val_loss)
        history['val_acc'].append(val_accuracy)
        history['val_f1'].append(val_f1)

        print(f'Validation loss: {avg_val_loss:.4f}')
        print(f'Validation Accuracy: {val_accuracy:.4f}')
        print(f'Validation F1 Score: {val_f1:.4f}')

        # Save best model
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), 'best_model_state.bin')
            print('New best model saved!')

    print(f'\nBest validation accuracy: {best_accuracy:.4f}')
    return history

# Split data for transformers
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['cleaned_text'],
    df['label'],
    test_size=0.1,
    random_state=42,
    stratify=df['label']
)

# Add .reset_index(drop=True) to ensure continuous integer indices for DataLoader
train_texts = train_texts.reset_index(drop=True)
val_texts = val_texts.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True)
val_labels = val_labels.reset_index(drop=True)


# Step 6B: Train BERT
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2
)

bert_history = train_model(
    model=bert_model,
    tokenizer=bert_tokenizer,
    train_texts=train_texts,
    train_labels=train_labels,
    val_texts=val_texts,
    val_labels=val_labels,
    max_len=256,
    batch_size=16,  # Smaller batch size due to BERT's larger size
    epochs=2,
    learning_rate=3e-5  # Slightly lower learning rate for BERT
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



=== Random Forest Results ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4696
           1       1.00      1.00      1.00      4284

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

Accuracy: 0.9973
F1 Score: 0.9972


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Training BertForSequenceClassification ===

Epoch 1/2
----------


Training: 100%|██████████| 2526/2526 [27:46<00:00,  1.52it/s]


Training loss: 0.0144


Validation: 100%|██████████| 281/281 [00:57<00:00,  4.87it/s]


Validation loss: 0.0041
Validation Accuracy: 0.9991
Validation F1 Score: 0.9991
New best model saved!

Epoch 2/2
----------


Training: 100%|██████████| 2526/2526 [27:50<00:00,  1.51it/s]


Training loss: 0.0025


Validation: 100%|██████████| 281/281 [00:57<00:00,  4.88it/s]


Validation loss: 0.0043
Validation Accuracy: 0.9993
Validation F1 Score: 0.9993
New best model saved!

Best validation accuracy: 0.9993


In [2]:
def evaluate_model(model, tokenizer, test_texts, test_labels, max_len=256, batch_size=32):
    test_dataset = NewsDataset(
        texts=test_texts,
        labels=test_labels,
        tokenizer=tokenizer,
        max_len=max_len
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Testing'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            _, preds = torch.max(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    print("\n=== Test Set Results ===")
    print(classification_report(all_labels, all_preds))
    print(f"Accuracy: {accuracy_score(all_labels, all_preds):.4f}")
    print(f"F1 Score: {f1_score(all_labels, all_preds):.4f}")


In [3]:
# Example test split
test_texts, _, test_labels, _ = train_test_split(
    df['cleaned_text'], df['label'], test_size=0.1, random_state=42, stratify=df['label']
)

# Load best model state (if saved)
bert_model.load_state_dict(torch.load('best_model_state.bin'))

# Evaluate
evaluate_model(bert_model, bert_tokenizer, test_texts.reset_index(drop=True), test_labels.reset_index(drop=True))


Testing: 100%|██████████| 1263/1263 [12:01<00:00,  1.75it/s]


=== Test Set Results ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     21133
           1       1.00      1.00      1.00     19275

    accuracy                           1.00     40408
   macro avg       1.00      1.00      1.00     40408
weighted avg       1.00      1.00      1.00     40408

Accuracy: 1.0000
F1 Score: 0.9999





In [4]:
bert_model.save_pretrained('./bert_fake_news_model')
bert_tokenizer.save_pretrained('./bert_fake_news_model')


('./bert_fake_news_model/tokenizer_config.json',
 './bert_fake_news_model/special_tokens_map.json',
 './bert_fake_news_model/vocab.txt',
 './bert_fake_news_model/added_tokens.json')

In [5]:
def predict_news(model, tokenizer, text, max_len=256):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, prediction = torch.max(outputs.logits, dim=1)

    return "Real News" if prediction.item() == 1 else "Fake News"


In [6]:
# Use the trained BERT model and tokenizer for prediction
predict_news(bert_model, bert_tokenizer, "Breaking: New study proves COVID-19 vaccine saves lives")

'Fake News'