# Fake News Detection

## Section 1: Importing the dependencies

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import torch
from tqdm import tqdm

### Section 1.1: NLTK Setup

In [2]:
# Download the NLTK stopwords corpus only when it is not already available locally;
# the LookupError raised by nltk.data.find signals that the resource is missing.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Section 1.2: Stopwords Preview

In [3]:
# Preview the English stop-word list; these are the words stemming() filters out later.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

## Section 2: Data Loading and Merging

In [4]:
# Load both CSVs
# The paths are relative, so both files must be in the notebook's working directory.
# Rows from True.csv are labelled 1 and rows from Fake.csv are labelled 0 in the next cell.
true_news = pd.read_csv('True.csv')
fake_news = pd.read_csv('Fake.csv')

In [5]:
# Add label column
# Encoding used throughout the notebook: 1 = REAL, 0 = FAKE.
true_news['label'] = 1
fake_news['label'] = 0

In [6]:
# Combine the datasets
# Rows are stacked (axis=0). Each frame keeps its own index here, so index values repeat
# until the shuffle cell rebuilds the index with reset_index(drop=True).
news_dataset = pd.concat([true_news, fake_news], axis=0)

In [7]:
# Shuffle the dataset
# random_state=42 makes the row order reproducible; the old index is discarded and
# replaced by a fresh 0..n-1 index.
news_dataset = shuffle(news_dataset, random_state=42).reset_index(drop=True)

## Section 3: Column Cleanup and Preprocessing

In [8]:
# Add dummy id column
# Sequential 1-based identifier assigned in the current (already shuffled) row order.
news_dataset['id'] = range(1, len(news_dataset)+1)

In [9]:
# Select required columns
# Any other column loaded from the CSVs is dropped from this point on.
news_dataset = news_dataset[['id', 'title', 'text', 'label']]

In [10]:
# Drop rows with missing label (just in case)
# Labels were assigned as constants (0 / 1) to every row earlier, so this is a defensive step.
news_dataset = news_dataset.dropna(subset=['label'])

In [11]:
# Fill missing values in text columns
# Empty strings keep the later astype(str) concatenation from turning NaN into the text 'nan'.
news_dataset['title'] = news_dataset['title'].fillna('')
news_dataset['text'] = news_dataset['text'].fillna('')

In [12]:
# Create 'content' column
# content = title + one space + text; this combined string is what both models consume.
news_dataset['content'] = news_dataset['title'].astype(str) + ' ' + news_dataset['text'].astype(str)

In [13]:
# Keep a second copy of the combined text. No cleaning is applied to 'content' anywhere
# in this notebook's DataFrame, so 'raw_content' and 'content' hold identical strings.
news_dataset['raw_content'] = news_dataset['content']

## Section 4: Final Feature and Label Split

In [14]:
# Separate the target column (Y) from the remaining feature columns (X)
Y = news_dataset['label']
X = news_dataset.drop(columns=['label'])

In [15]:
# Class-balance check: number of rows per label (0 = FAKE, 1 = REAL).
news_dataset['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,23481
1,21417


In [16]:
# Sanity-check dimensions: X holds every column except 'label'; Y is the label column.
print("X shape:", X.shape)
print("Y shape:", Y.shape)

X shape: (44898, 5)
Y shape: (44898,)


In [17]:
# Peek at the first five feature rows.
print("\nSample X:\n", X.head())


Sample X:
    id                                              title  \
0   1   BREAKING: GOP Chairman Grassley Has Had Enoug...   
1   2   Failed GOP Candidates Remembered In Hilarious...   
2   3   Mike Pence’s New DC Neighbors Are HILARIOUSLY...   
3   4  California AG pledges to defend birth control ...   
4   5  AZ RANCHERS Living On US-Mexico Border Destroy...   

                                                text  \
0  Donald Trump s White House is in chaos, and th...   
1  Now that Donald Trump is the presumptive GOP n...   
2  Mike Pence is a huge homophobe. He supports ex...   
3  SAN FRANCISCO (Reuters) - California Attorney ...   
4  Twisted reasoning is all that comes from Pelos...   

                                             content  \
0   BREAKING: GOP Chairman Grassley Has Had Enoug...   
1   Failed GOP Candidates Remembered In Hilarious...   
2   Mike Pence’s New DC Neighbors Are HILARIOUSLY...   
3  California AG pledges to defend birth control ...   
4  AZ RANC

In [18]:
# Peek at the first five labels (0 = FAKE, 1 = REAL).
print("\nSample Y:\n", Y.head())


Sample Y:
 0    0
1    0
2    0
3    1
4    0
Name: label, dtype: int64


## Section 5: Text Preprocessing and Stemming

In [19]:
# Single Porter stemmer instance shared by every stemming() call.
port_stem = PorterStemmer()

In [20]:
stop_words = set(stopwords.words('english'))  # built once; a set keeps membership tests fast

def stemming(content):
    """Return `content` as a space-joined string of stemmed, non-stop-word tokens.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, stop words are discarded, and each
    remaining token is passed through the shared Porter stemmer.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()

    kept_stems = []
    for token in tokens:
        if token in stop_words:
            continue
        kept_stems.append(port_stem.stem(token))

    return ' '.join(kept_stems)

## Section 6: Raw-Text Preservation and Dataset Splitting

In [21]:
# 'raw_content' and 'content' currently hold the same unmodified "title + text" string
# (no cleaning step runs before this cell); 'label' is the target.

X_raw = news_dataset['raw_content'].values   # For BERT
X_content = news_dataset['content'].values   # To apply stemming AFTER split
# NOTE: Y is rebound here from the pandas Series created earlier to a plain NumPy array.
Y = news_dataset['label'].values

# Create consistent indices for splitting both raw and processed data
# Splitting positional indices once guarantees that the raw text, the text to be stemmed
# and the labels all use exactly the same train/test rows. stratify=Y keeps the class
# ratio the same in both splits; random_state=2 makes the split reproducible.
indices = np.arange(len(X_content))
train_idx, test_idx = train_test_split(indices, test_size=0.2, stratify=Y, random_state=2)

# Split raw content for BERT
X_train_raw = X_raw[train_idx]
X_test_raw = X_raw[test_idx]

# Split unstemmed content for traditional ML models
X_train_content = X_content[train_idx]
X_test_content = X_content[test_idx]

Y_train = Y[train_idx]
Y_test = Y[test_idx]

## Section 7: Apply Stemming After Splitting

In [22]:
# Run the stemming() preprocessing over each split separately, after the train/test split.
X_train_stemmed = np.array([stemming(text) for text in X_train_content])
X_test_stemmed = np.array([stemming(text) for text in X_test_content])

## Section 8: Text Vectorization Using TF-IDF

In [23]:
# Convert stemmed text into numerical features
# The vocabulary and IDF weights are learned from the training split only, so nothing
# from the test split influences the features.
vectorizer = TfidfVectorizer()

# fit_transform learns the vocabulary and vectorizes the training text in a single pass,
# instead of tokenizing the whole training corpus twice (fit, then transform).
X_train_vect = vectorizer.fit_transform(X_train_stemmed)
X_test_vect = vectorizer.transform(X_test_stemmed)

## Section 9: Training the Model - Logistic Regression

In [24]:
# Logistic-regression classifier created with the library's default hyperparameters.
model = LogisticRegression()

In [25]:
# Train the model
# Fitted on the TF-IDF features and labels of the training split only.
model.fit(X_train_vect, Y_train)

## Section 10: Evaluation – Logistic Regression

In [26]:
# Evaluation on Training Data
train_preds = model.predict(X_train_vect)
train_accuracy = accuracy_score(Y_train, train_preds)
print(f"Training Accuracy: {train_accuracy:.4f}")

# Evaluation on Test Data
test_preds = model.predict(X_test_vect)
test_accuracy = accuracy_score(Y_test, test_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Classification Report
# target_names follow the label order 0, 1 -> "FAKE", "REAL".
print("\nClassification Report:")
print(classification_report(Y_test, test_preds, target_names=["FAKE", "REAL"]))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(Y_test, test_preds))

# ROC-AUC Score (for probability-based models)
if hasattr(model, "predict_proba"):
    y_probs = model.predict_proba(X_test_vect)[:, 1]  # Probabilities for class 1 (REAL), the positive class
    roc_auc = roc_auc_score(Y_test, y_probs)
    print(f"ROC-AUC Score: {roc_auc:.4f}")


Training Accuracy: 0.9916
Test Accuracy: 0.9865

Classification Report:
              precision    recall  f1-score   support

        FAKE       0.99      0.99      0.99      4696
        REAL       0.98      0.99      0.99      4284

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

Confusion Matrix:
[[4631   65]
 [  56 4228]]
ROC-AUC Score: 0.9984


In [27]:
# Show predictions for first 5 test samples
# Each X_test_raw entry is the full "title + text" string, so only a short preview is
# printed rather than the whole article.
PREVIEW_CHARS = 200

for i in range(min(5, len(X_test_raw))):
    print(f"\n News Preview: {X_test_raw[i][:PREVIEW_CHARS]}")
    # Reuse the test predictions computed in the evaluation cell instead of re-running
    # the classifier row by row; this also keeps the cell independent of whatever the
    # name `model` is bound to later in the notebook.
    pred = test_preds[i]
    label = "FAKE" if pred == 0 else "REAL"
    actual = "FAKE" if Y_test[i] == 0 else "REAL"
    print(f"Predicted: {label} | Actual: {actual}")



 News Title: Shifting sands: What is changing in Saudi Arabia? (Reuters) - Saudi Arabia s Crown Prince Mohammed bin Salman has ordered a crackdown on corruption, the latest in a wave of frenetic changes in the kingdom over the past 2-1/2 years. Prince Mohammed says he is determined to remodel his conservative country into a modern state no longer dependent on oil. As his father, King Salman bin Abdulaziz, has handed the 32-year-old Prince Mohammed more and more power over the past three years, the ambitious young leader has taken on everything from economic reforms to waging war in neighboring Yemen. Here is what you need to know. Prince Mohammed capped his rapid rise to power in June this year by replacing his elder cousin Prince Mohammed bin Nayef, widely known as MbN, as crown prince. A source close to King Salman said MbN s dismissal was  in the higher interests of the state  because he was incapacitated by morphine and cocaine addiction, a legacy of an assassination attempt that 

In [28]:
def predict_news(news_text):
    """Classify one news text with the TF-IDF + logistic-regression model and print the verdict.

    The text is preprocessed with stemming() and vectorized with the fitted
    `vectorizer` before being passed to `model`. Nothing is returned; the input,
    the FAKE/REAL verdict and (when available) the class-1 probability are printed.
    """
    features = vectorizer.transform([stemming(news_text)])

    prediction = model.predict(features)[0]

    # Second probability column = class 1, reported below as the "Real Probability".
    prob = None
    if hasattr(model, "predict_proba"):
        prob = model.predict_proba(features)[0][1]

    if prediction == 0:
        verdict_line = "The news is classified as: FAKE"
    else:
        verdict_line = "The news is classified as: REAL"

    print("\nInput News Title:")
    print(news_text)

    print("\nPrediction:")
    print(verdict_line)

    if prob is not None:
        print(f"Confidence (Real Probability): {prob:.4f}")

In [29]:
# Example test: a factual-sounding article run through the logistic-regression pipeline.
custom_news = "Pfizer and BioNTech announce positive results from late-stage COVID-19 vaccine trial. The companies reported that the vaccine was more than 90% effective in preventing COVID-19 among participants without evidence of prior infection. The Phase 3 clinical trial enrolled 43,538 participants, with no serious safety concerns reported. The vaccine candidate is based on mRNA technology and requires two doses administered three weeks apart. Pfizer plans to seek Emergency Use Authorization from the U.S. FDA. Global health experts have hailed the development as a significant milestone in the fight against the pandemic. Pfizer and BioNTech also confirmed their commitment to equitable global access, including low- and middle-income countries."
predict_news(custom_news)


Input News Title:
Pfizer and BioNTech announce positive results from late-stage COVID-19 vaccine trial. The companies reported that the vaccine was more than 90% effective in preventing COVID-19 among participants without evidence of prior infection. The Phase 3 clinical trial enrolled 43,538 participants, with no serious safety concerns reported. The vaccine candidate is based on mRNA technology and requires two doses administered three weeks apart. Pfizer plans to seek Emergency Use Authorization from the U.S. FDA. Global health experts have hailed the development as a significant milestone in the fight against the pandemic. Pfizer and BioNTech also confirmed their commitment to equitable global access, including low- and middle-income countries.

Prediction:
The news is classified as: REAL
Confidence (Real Probability): 0.6094


In [30]:
# Example custom input: an obviously fabricated story run through the logistic-regression pipeline.
custom_news = "Government confirms lizard people control major banks and media. In a shocking revelation, the Prime Minister's office released a statement confirming the existence of reptilian humanoids infiltrating global financial institutions and news networks. These shape-shifting beings, originating from the Draco constellation, have allegedly been manipulating world events for decades. The statement claims their ultimate goal is to enslave humanity and establish a New World Order. The Ministry of Defense has deployed elite units to capture and expose the reptilians, promising transparency and public safety in the coming weeks. Meanwhile, conspiracy theorists claim they were right all along."
predict_news(custom_news)


Input News Title:
Government confirms lizard people control major banks and media. In a shocking revelation, the Prime Minister's office released a statement confirming the existence of reptilian humanoids infiltrating global financial institutions and news networks. These shape-shifting beings, originating from the Draco constellation, have allegedly been manipulating world events for decades. The statement claims their ultimate goal is to enslave humanity and establish a New World Order. The Ministry of Defense has deployed elite units to capture and expose the reptilians, promising transparency and public safety in the coming weeks. Meanwhile, conspiracy theorists claim they were right all along.

Prediction:
The news is classified as: FAKE
Confidence (Real Probability): 0.2920


## Section 11: Fake News Detection using Pretrained BERT

In [31]:
# Load Model & Tokenizer
# NOTE: the checkpoint is used as-is for inference; nothing in this notebook fine-tunes
# it, so the "training" split below is simply a second evaluation split.
model_name = "jy46604790/Fake-News-Bert-Detect"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Bound to `bert_model` (not `model`) so the fitted logistic-regression `model` that
# predict_news() relies on is not overwritten by this cell.
bert_model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

# Use pipeline with truncation handling
# Inputs longer than 512 tokens are truncated rather than raising.
classifier = pipeline(
    "text-classification",
    model=bert_model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    truncation=True,
    max_length=512,
    padding=True
)

# Define label map
label_map = {'LABEL_0': 0, 'LABEL_1': 1}  # 0 = FAKE, 1 = REAL

batch_size = 8


def clean_batch(batch):
    """Return `batch` as a list of strings, mapping None entries to ''."""
    return [str(x) if x is not None else "" for x in batch]


def bert_predict_labels(texts, desc, chunk_size=batch_size):
    """Classify `texts` with the BERT pipeline and return a list of 0/1 labels.

    Args:
        texts: sliceable sequence of raw article strings.
        desc: caption shown on the tqdm progress bar.
        chunk_size: number of texts sent to the pipeline per call.

    Raises:
        ValueError: if the pipeline emits a label that is not in `label_map`.
    """
    preds = []
    for start in tqdm(range(0, len(texts), chunk_size), desc=desc):
        results = classifier(clean_batch(texts[start:start + chunk_size]))
        for result in results:
            label = result['label']
            if label not in label_map:
                raise ValueError(f"Unexpected label: {label}")
            preds.append(label_map[label])
    return preds


# Evaluation on Training Data
# NOTE: train_accuracy / test_accuracy are rebound here; after this cell they hold the
# BERT figures, not the logistic-regression ones.
bert_train_preds = bert_predict_labels(X_train_raw, desc="BERT Train Batches")

train_accuracy = accuracy_score(Y_train, bert_train_preds)
print(f"\nTraining Accuracy: {train_accuracy:.4f}")

# Evaluation on Test Data
bert_test_preds = bert_predict_labels(X_test_raw, desc="BERT Test Batches")

test_accuracy = accuracy_score(Y_test, bert_test_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Classification Report
print("\n[BERT] Classification Report:")
print(classification_report(Y_test, bert_test_preds, target_names=["FAKE", "REAL"]))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(Y_test, bert_test_preds))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Device set to use cuda:0
BERT Train Batches:   0%|          | 0/4490 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

  return forward_call(*args, **kwargs)
BERT Train Batches:   0%|          | 10/4490 [00:02<17:31,  4.26it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
BERT Train Batches: 100%|██████████| 4490/4490 [16:01<00:00,  4.67it/s]



Training Accuracy: 0.9995


  return forward_call(*args, **kwargs)
BERT Test Batches: 100%|██████████| 1123/1123 [04:00<00:00,  4.66it/s]

Test Accuracy: 0.9990

[BERT] Classification Report:
              precision    recall  f1-score   support

        FAKE       1.00      1.00      1.00      4696
        REAL       1.00      1.00      1.00      4284

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

Confusion Matrix:
[[4690    6]
 [   3 4281]]





In [32]:
# Show predictions for first 5 test samples
# Each X_test_raw entry is the full "title + text" string, so only a short preview is
# printed rather than the whole article.
PREVIEW_CHARS = 200

for i in range(min(5, len(X_test_raw))):
    print(f"\nNews Preview: {X_test_raw[i][:PREVIEW_CHARS]}")
    predicted_label = "FAKE" if bert_test_preds[i] == 0 else "REAL"
    actual_label = "FAKE" if Y_test[i] == 0 else "REAL"
    print(f"Predicted: {predicted_label} | Actual: {actual_label}")


News Title: Shifting sands: What is changing in Saudi Arabia? (Reuters) - Saudi Arabia s Crown Prince Mohammed bin Salman has ordered a crackdown on corruption, the latest in a wave of frenetic changes in the kingdom over the past 2-1/2 years. Prince Mohammed says he is determined to remodel his conservative country into a modern state no longer dependent on oil. As his father, King Salman bin Abdulaziz, has handed the 32-year-old Prince Mohammed more and more power over the past three years, the ambitious young leader has taken on everything from economic reforms to waging war in neighboring Yemen. Here is what you need to know. Prince Mohammed capped his rapid rise to power in June this year by replacing his elder cousin Prince Mohammed bin Nayef, widely known as MbN, as crown prince. A source close to King Salman said MbN s dismissal was  in the higher interests of the state  because he was incapacitated by morphine and cocaine addiction, a legacy of an assassination attempt that l

In [33]:
# Custom Prediction Function
def predict_news_bert(news_text):
    """Classify one news text with the BERT pipeline and print the verdict.

    Args:
        news_text: the article text passed to `classifier`.

    Prints the input, the FAKE/REAL verdict and the score the pipeline attached to
    the returned label. Nothing is returned.

    Raises:
        ValueError: if the pipeline returns a label other than LABEL_0 / LABEL_1,
            matching the batch evaluation loop instead of silently reporting REAL.
    """
    label_names = {'LABEL_0': "FAKE", 'LABEL_1': "REAL"}

    pred = classifier(news_text)[0]
    raw_label = pred['label']
    if raw_label not in label_names:
        raise ValueError(f"Unexpected label: {raw_label}")
    label = label_names[raw_label]
    score = pred['score']

    print("\nInput News Title:")
    print(news_text)

    print("\nPrediction:")
    print(f"The news is classified as: {label}")
    print(f"Confidence Score: {score:.4f}")

In [34]:
# Example custom input: the same factual-sounding article, this time classified by the BERT pipeline.
custom_news = "Pfizer and BioNTech announce positive results from late-stage COVID-19 vaccine trial. The companies reported that the vaccine was more than 90% effective in preventing COVID-19 among participants without evidence of prior infection. The Phase 3 clinical trial enrolled 43,538 participants, with no serious safety concerns reported. The vaccine candidate is based on mRNA technology and requires two doses administered three weeks apart. Pfizer plans to seek Emergency Use Authorization from the U.S. FDA. Global health experts have hailed the development as a significant milestone in the fight against the pandemic. Pfizer and BioNTech also confirmed their commitment to equitable global access, including low- and middle-income countries."
predict_news_bert(custom_news)


Input News Title:
Pfizer and BioNTech announce positive results from late-stage COVID-19 vaccine trial. The companies reported that the vaccine was more than 90% effective in preventing COVID-19 among participants without evidence of prior infection. The Phase 3 clinical trial enrolled 43,538 participants, with no serious safety concerns reported. The vaccine candidate is based on mRNA technology and requires two doses administered three weeks apart. Pfizer plans to seek Emergency Use Authorization from the U.S. FDA. Global health experts have hailed the development as a significant milestone in the fight against the pandemic. Pfizer and BioNTech also confirmed their commitment to equitable global access, including low- and middle-income countries.

Prediction:
The news is classified as: REAL
Confidence Score: 0.5012


  return forward_call(*args, **kwargs)


In [35]:
# Example custom input: the same fabricated story, this time classified by the BERT pipeline.
custom_news = "Government confirms lizard people control major banks and media. In a shocking revelation, the Prime Minister's office released a statement confirming the existence of reptilian humanoids infiltrating global financial institutions and news networks. These shape-shifting beings, originating from the Draco constellation, have allegedly been manipulating world events for decades. The statement claims their ultimate goal is to enslave humanity and establish a New World Order. The Ministry of Defense has deployed elite units to capture and expose the reptilians, promising transparency and public safety in the coming weeks. Meanwhile, conspiracy theorists claim they were right all along."
predict_news_bert(custom_news)


Input News Title:
Government confirms lizard people control major banks and media. In a shocking revelation, the Prime Minister's office released a statement confirming the existence of reptilian humanoids infiltrating global financial institutions and news networks. These shape-shifting beings, originating from the Draco constellation, have allegedly been manipulating world events for decades. The statement claims their ultimate goal is to enslave humanity and establish a New World Order. The Ministry of Defense has deployed elite units to capture and expose the reptilians, promising transparency and public safety in the coming weeks. Meanwhile, conspiracy theorists claim they were right all along.

Prediction:
The news is classified as: FAKE
Confidence Score: 0.9993
