<a href="https://colab.research.google.com/github/bhaskareddy652/Fake_News_Detection_Using_BERT/blob/main/Fake.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

# Load datasets
true_news = pd.read_csv("/content/sample_data/True.csv")
fake_news = pd.read_csv("/content/sample_data/Fake.csv")

# Add labels
true_news["True.csv"] = 1  # 1 for real news
fake_news["label"] = 0  # 0 for fake news

# Combine datasets
df = pd.concat([true_news, fake_news], axis=0)

# Shuffle
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df

Unnamed: 0,title,text,subject,date,True.csv,label
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,"July 21, 2017",,0.0
1,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,"May 7, 2016",,0.0
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,"December 3, 2016",,0.0
3,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,politicsNews,"October 6, 2017",1.0,
4,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,"Apr 25, 2017",,0.0
...,...,...,...,...,...,...
44893,Nigeria says U.S. agrees delayed $593 million ...,ABUJA (Reuters) - The United States has formal...,worldnews,"December 27, 2017",1.0,
44894,Boiler Room #62 – Fatal Illusions,Tune in to the Alternate Current Radio Network...,Middle-east,"June 29, 2016",,0.0
44895,ATHEISTS SUE GOVERNOR OF TEXAS Over Display on...,I m convinced the Freedom From Religion group...,Government News,"Feb 27, 2016",,0.0
44896,Republican tax plan would deal financial hit t...,WASHINGTON (Reuters) - The Republican tax plan...,politicsNews,"November 2, 2017",1.0,


In [3]:
print(f"Class distribution:\n{df['label'].value_counts()}")

Class distribution:
label
0.0    23481
Name: count, dtype: int64


In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("stopwords")
nltk.download("punkt")
# Download the 'punkt_tab' resource
nltk.download('punkt_tab')

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove URLs, special chars
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"\@\w+|\#", "", text)
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    text = " ".join([word for word in word_tokens if word not in stop_words])
    return text

df["cleaned_text"] = df["text"].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
df = df.dropna(subset=['label'])
# Split data
X = df["cleaned_text"]
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_tfidf, y_train)

# Evaluate
y_pred = rf.predict(X_test_tfidf)
print("Random Forest Results:")
print(classification_report(y_test, y_pred))

Random Forest Results:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      4697

    accuracy                           1.00      4697
   macro avg       1.00      1.00      1.00      4697
weighted avg       1.00      1.00      1.00      4697



In [8]:
pip install --upgrade transformers



In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split

# 1. Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# 2. Tokenize the text
tokens = tokenizer(
    list(df['cleaned_text']),
    padding=True,
    truncation=True,
    max_length=256,  # Reduced for speed
    return_tensors="pt"
)

labels = df['label'].values

# 3. Split the data
train_input_ids, val_input_ids, train_attention_mask, val_attention_mask, train_labels, val_labels = train_test_split(
    tokens['input_ids'], tokens['attention_mask'], labels, test_size=0.1, random_state=42
)

# 4. Dataset class
class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "label": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# 5. Create datasets and loaders
train_dataset = NewsDataset(train_input_ids, train_attention_mask, train_labels)
val_dataset = NewsDataset(val_input_ids, val_attention_mask, val_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# 6. Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 7. Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# 8. Training loop
epochs = 2  # You can increase to 3 later
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}")
    model.train()

    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Training Loss: {loss.item():.4f}")

    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            val_loss += loss.item()
            preds = torch.argmax(logits, dim=-1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    print(f"Validation Loss: {val_loss / len(val_loader):.4f}, Accuracy: {correct / total:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
Training Loss: 0.7961
Training Loss: 0.7122
Training Loss: 0.6243
Training Loss: 0.5436
Training Loss: 0.4633
Training Loss: 0.3847
Training Loss: 0.3253
Training Loss: 0.2825
Training Loss: 0.2585
Training Loss: 0.2024
Training Loss: 0.1653
Training Loss: 0.1512
Training Loss: 0.1107
Training Loss: 0.1408
Training Loss: 0.1150
Training Loss: 0.0887
Training Loss: 0.0877
Training Loss: 0.0533
Training Loss: 0.0578
Training Loss: 0.0639
Training Loss: 0.0456
Training Loss: 0.0452
Training Loss: 0.0388
Training Loss: 0.0294
Training Loss: 0.0308
Training Loss: 0.0238
Training Loss: 0.0249
Training Loss: 0.0237
Training Loss: 0.0191
Training Loss: 0.0246
Training Loss: 0.0161
Training Loss: 0.0214
Training Loss: 0.0202
Training Loss: 0.0175
Training Loss: 0.0147
Training Loss: 0.0148
Training Loss: 0.0222
Training Loss: 0.0136
Training Loss: 0.0143
Training Loss: 0.0131
Training Loss: 0.0140
Training Loss: 0.0110
Training Loss: 0.0113
Training Loss: 0.0111
Training Loss: 0.0103
T

In [10]:
# prompt: tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# use this tokenizer we have used distlibert and use bertsrquenceclassification

from transformers import BertTokenizer, BertForSequenceClassification
# ... (rest of your existing code)

# 1. Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# ... (rest of your existing code)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
Training Loss: 0.6237
Training Loss: 0.5432
Training Loss: 0.4695
Training Loss: 0.4012
Training Loss: 0.3360
Training Loss: 0.2518
Training Loss: 0.2147
Training Loss: 0.1790
Training Loss: 0.1383
Training Loss: 0.1087
Training Loss: 0.1025
Training Loss: 0.0721
Training Loss: 0.0596
Training Loss: 0.0687
Training Loss: 0.0580
Training Loss: 0.0765
Training Loss: 0.0497
Training Loss: 0.0322
Training Loss: 0.0332
Training Loss: 0.0254
Training Loss: 0.0220
Training Loss: 0.0190
Training Loss: 0.0276
Training Loss: 0.0211
Training Loss: 0.0185
Training Loss: 0.0177
Training Loss: 0.0131
Training Loss: 0.0116
Training Loss: 0.0152
Training Loss: 0.0101
Training Loss: 0.0092
Training Loss: 0.0123
Training Loss: 0.0156
Training Loss: 0.0106
Training Loss: 0.0090
Training Loss: 0.0107
Training Loss: 0.0068
Training Loss: 0.0064
Training Loss: 0.0087
Training Loss: 0.0068
Training Loss: 0.0059
Training Loss: 0.0055
Training Loss: 0.0052
Training Loss: 0.0051
Training Loss: 0.0061
T

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
tokens = tokenizer(
    list(df['cleaned_text']),
    padding=True,
    truncation=True,
    # max_length is changed to 512 to match BERT's expected input size
    max_length=512,
    return_tensors="pt"
)

labels = df['label'].values

# 3. Split the data
train_input_ids, val_input_ids, train_attention_mask, val_attention_mask, train_labels, val_labels = train_test_split(
    tokens['input_ids'], tokens['attention_mask'], labels, test_size=0.1, random_state=42
)

# 4. Dataset class
class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "label": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# 5. Create datasets and loaders
train_dataset = NewsDataset(train_input_ids, train_attention_mask, train_labels)
val_dataset = NewsDataset(val_input_ids, val_attention_mask, val_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# 6. Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 7. Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# 8. Training loop
epochs = 3  # You can increase to 3 later
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}")
    model.train()

    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Training Loss: {loss.item():.4f}")

    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            val_loss += loss.item()
            preds = torch.argmax(logits, dim=-1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    print(f"Validation Loss: {val_loss / len(val_loader):.4f}, Accuracy: {correct / total:.4f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'df' is not defined