# Fake News Detection

## Section 1: Importing the dependencies

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import torch
from tqdm import tqdm

### Section 1.1: NLTK Setup

In [None]:
# Make sure the NLTK stopword corpus is available: look it up locally first
# and download it only when the lookup fails.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

### Section 1.2: Stopwords Preview

In [None]:
# Preview the English stopword list that stemming() later filters out.
print(stopwords.words('english'))

## Section 2: Data Loading and Merging

In [None]:
# Load both CSVs (paths are relative to the current working directory).
# Later cells select the 'title' and 'text' columns, so both files must
# provide them.
true_news = pd.read_csv('True.csv')
fake_news = pd.read_csv('Fake.csv')

In [None]:
# Add label column: 1 = real news, 0 = fake news.
# The same encoding is used by every report and preview further down.
true_news['label'] = 1
fake_news['label'] = 0

In [None]:
# Combine the datasets by stacking them row-wise (axis=0).
# Each frame keeps its own row index here, so index values repeat until the
# index is rebuilt after shuffling.
news_dataset = pd.concat([true_news, fake_news], axis=0)

In [None]:
# Shuffle the dataset so real and fake rows are interleaved (fixed seed for
# reproducibility), then replace the index with a fresh 0..n-1 range.
news_dataset = shuffle(news_dataset, random_state=42).reset_index(drop=True)

## Section 3: Column Cleanup and Preprocessing

In [None]:
# Add dummy id column: a 1-based running number in the shuffled row order.
news_dataset['id'] = range(1, len(news_dataset)+1)

In [None]:
# Select required columns; any other column coming from the CSVs is dropped.
news_dataset = news_dataset[['id', 'title', 'text', 'label']]

In [None]:
# Drop rows with missing label (just in case).
# Labels were assigned as constants when the CSVs were loaded, so this is a
# defensive step only.
news_dataset = news_dataset.dropna(subset=['label'])

In [None]:
# Fill missing values in text columns with empty strings so missing entries
# do not leak into the combined text built from these two columns.
news_dataset['title'] = news_dataset['title'].fillna('')
news_dataset['text'] = news_dataset['text'].fillna('')

In [None]:
# Create 'content' column: title and body joined by a single space.
# This combined text is what both models classify.
news_dataset['content'] = news_dataset['title'].astype(str) + ' ' + news_dataset['text'].astype(str)

In [None]:
# Store the combined text a second time under 'raw_content' (the column later
# fed to BERT). No cleaning is applied, so it is identical to 'content'.
news_dataset['raw_content'] = news_dataset['content']

## Section 4: Final Feature and Label Split

In [None]:
# Split into X (every column except the target) and Y (the target).
# `columns=` already names the axis, so the redundant `axis=1` is not passed.
X = news_dataset.drop(columns='label')
Y = news_dataset['label']

In [None]:
# Class balance check: number of rows per label value (1 = real, 0 = fake).
news_dataset['label'].value_counts()

In [None]:
# Sanity check: X and Y must have the same number of rows.
print("X shape:", X.shape)
print("Y shape:", Y.shape)

In [None]:
# Preview the first feature rows.
print("\nSample X:\n", X.head())

In [None]:
# Preview the first target values.
print("\nSample Y:\n", Y.head())

## Section 5: Text Preprocessing and Stemming

In [None]:
# Shared Porter stemmer instance; stemming() calls it for every kept word.
port_stem = PorterStemmer()

In [None]:
# English stopwords held in a set so the per-word membership test in
# stemming() is a constant-time lookup.
stop_words = set(stopwords.words('english')) # define once globally for performance

def stemming(content):
    """Normalise raw text into a space-separated string of stemmed words.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, words found in the
    module-level ``stop_words`` set are discarded, and each remaining word is
    reduced to its stem with the module-level ``port_stem``.

    Args:
        content: The text to normalise (a string).

    Returns:
        The stemmed words joined by single spaces; an empty string when no
        word survives the filtering.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()

    stems = []
    for token in tokens:
        if token in stop_words:
            continue
        stems.append(port_stem.stem(token))

    return ' '.join(stems)

## Section 6: Raw-Text Preservation and Dataset Splitting

In [None]:
# Build the arrays that get split into train and test partitions.
# NOTE: 'raw_content' was assigned directly from 'content', so X_raw and
# X_content hold identical, uncleaned text at this point; 'label' is the target.

X_raw = news_dataset['raw_content'].values   # For BERT
X_content = news_dataset['content'].values   # To apply stemming AFTER split
Y = news_dataset['label'].values

# Create consistent indices for splitting both raw and processed data.
# Row positions are split (rather than the arrays themselves) so every array
# below is indexed with exactly the same partition. 20% of the rows go to the
# test set, stratified on the label, with a fixed seed.
indices = np.arange(len(X_content))
train_idx, test_idx = train_test_split(indices, test_size=0.2, stratify=Y, random_state=2)

# Split raw content for BERT
X_train_raw = X_raw[train_idx]
X_test_raw = X_raw[test_idx]

# Split unstemmed content for traditional ML models
X_train_content = X_content[train_idx]
X_test_content = X_content[test_idx]

# Targets for the same partition
Y_train = Y[train_idx]
Y_test = Y[test_idx]

## Section 7: Apply Stemming After Splitting

In [None]:
# Run stemming() over every document of each partition; the results are the
# inputs of the TF-IDF vectorizer.
X_train_stemmed = np.array([stemming(text) for text in X_train_content])
X_test_stemmed = np.array([stemming(text) for text in X_test_content])

## Section 8: Text Vectorization Using TF-IDF

In [None]:
# Convert stemmed text into numerical TF-IDF features.
# The vocabulary and IDF weights are learned from the training partition only
# and then applied unchanged to the test partition. fit_transform learns and
# encodes the training corpus in a single pass, instead of analysing the same
# documents twice with fit() followed by transform().
vectorizer = TfidfVectorizer()

X_train_vect = vectorizer.fit_transform(X_train_stemmed)
X_test_vect = vectorizer.transform(X_test_stemmed)

## Section 9: Training the Model - Logistic Regression

In [None]:
# Logistic-regression classifier created with no arguments, i.e. with the
# library's default settings. predict_news() reads it through this global name.
model = LogisticRegression()

In [None]:
# Train the model on the TF-IDF features of the training partition.
model.fit(X_train_vect, Y_train)

## Section 10: Evaluation – Logistic Regression

In [None]:
# Evaluation on Training Data (accuracy on the rows the model was fitted on)
train_preds = model.predict(X_train_vect)
train_accuracy = accuracy_score(Y_train, train_preds)
print(f"Training Accuracy: {train_accuracy:.4f}")

# Evaluation on Test Data (held-out rows)
test_preds = model.predict(X_test_vect)
test_accuracy = accuracy_score(Y_test, test_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Classification Report (target_names follow label order: 0 = FAKE, 1 = REAL)
print("\nClassification Report:")
print(classification_report(Y_test, test_preds, target_names=["FAKE", "REAL"]))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(Y_test, test_preds))

# ROC-AUC Score (for probability-based models)
if hasattr(model, "predict_proba"):
    y_probs = model.predict_proba(X_test_vect)[:, 1]  # Probabilities for class 1 (REAL)
    roc_auc = roc_auc_score(Y_test, y_probs)
    print(f"ROC-AUC Score: {roc_auc:.4f}")


In [None]:
# Show predictions for the first 5 test samples.
# The predictions computed during evaluation (test_preds) are reused instead of
# calling the classifier again row by row: this matches the BERT preview and
# keeps the cell working even after the global `model` has been rebound.
# The preview length is capped by the size of the test partition so a tiny
# dataset cannot raise an IndexError.
# NOTE: X_test_raw holds title + body, so "News Title" prints the full text.
for i in range(min(5, len(X_test_raw))):
    print(f"\n News Title: {X_test_raw[i]}")
    pred = test_preds[i]
    label = "FAKE" if pred == 0 else "REAL"
    actual = "FAKE" if Y_test[i] == 0 else "REAL"
    print(f"Predicted: {label} | Actual: {actual}")


In [None]:
def predict_news(news_text):
    """Classify one piece of news text with the TF-IDF + logistic-regression model.

    The text is passed through stemming(), encoded with the fitted
    module-level ``vectorizer`` and scored by the module-level ``model``.
    The input text, the predicted class and - when the model exposes
    ``predict_proba`` - the probability of class 1 (REAL) are printed.

    Args:
        news_text: The article text to classify.

    Returns:
        None. All results are written to standard output.
    """
    features = vectorizer.transform([stemming(news_text)])

    prediction = model.predict(features)[0]
    if hasattr(model, "predict_proba"):
        # Column 1 is the probability of label 1, i.e. REAL.
        prob = model.predict_proba(features)[0][1]
    else:
        prob = None

    print("\nInput News Title:")
    print(news_text)

    print("\nPrediction:")
    verdict = (
        "The news is classified as: FAKE"
        if prediction == 0
        else "The news is classified as: REAL"
    )
    print(verdict)

    if prob is not None:
        print(f"Confidence (Real Probability): {prob:.4f}")

In [None]:
# Example: text written in the style of a genuine news report, classified
# with the logistic-regression predictor.
custom_news = "Pfizer and BioNTech announce positive results from late-stage COVID-19 vaccine trial. The companies reported that the vaccine was more than 90% effective in preventing COVID-19 among participants without evidence of prior infection. The Phase 3 clinical trial enrolled 43,538 participants, with no serious safety concerns reported. The vaccine candidate is based on mRNA technology and requires two doses administered three weeks apart. Pfizer plans to seek Emergency Use Authorization from the U.S. FDA. Global health experts have hailed the development as a significant milestone in the fight against the pandemic. Pfizer and BioNTech also confirmed their commitment to equitable global access, including low- and middle-income countries."
predict_news(custom_news)

In [None]:
# Example: an invented conspiracy-style story, classified with the
# logistic-regression predictor.
custom_news = "Government confirms lizard people control major banks and media. In a shocking revelation, the Prime Minister's office released a statement confirming the existence of reptilian humanoids infiltrating global financial institutions and news networks. These shape-shifting beings, originating from the Draco constellation, have allegedly been manipulating world events for decades. The statement claims their ultimate goal is to enslave humanity and establish a New World Order. The Ministry of Defense has deployed elite units to capture and expose the reptilians, promising transparency and public safety in the coming weeks. Meanwhile, conspiracy theorists claim they were right all along."
predict_news(custom_news)

## Section 11: Fake News Detection using Pretrained BERT

In [None]:
# Load Model & Tokenizer
# The network is bound to `bert_model` rather than `model`, so the
# logistic-regression classifier that predict_news() reads from the global
# `model` is not overwritten when this cell runs.
model_name = "jy46604790/Fake-News-Bert-Detect"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

# Use pipeline with truncation handling: inputs are truncated to at most
# 512 tokens, so long articles are classified on their leading part only.
classifier = pipeline(
    "text-classification",
    model=bert_model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    truncation=True,
    max_length=512,
    padding=True
)

# Define label map
label_map = {'LABEL_0': 0, 'LABEL_1': 1}  # 0 = FAKE, 1 = REAL

# Number of texts handed to the pipeline per call.
batch_size = 8


def clean_batch(batch):
    """Return `batch` as a list of strings, replacing None entries with ""."""
    return [str(x) if x is not None else "" for x in batch]


def _bert_predict_labels(texts, desc):
    """Classify `texts` in chunks of `batch_size` and return integer labels.

    Args:
        texts: Sliceable sequence of raw article texts.
        desc: Caption shown on the tqdm progress bar.

    Returns:
        A list with one entry per input text: 0 for FAKE, 1 for REAL.

    Raises:
        ValueError: If the pipeline returns a label missing from `label_map`.
    """
    preds = []
    for start in tqdm(range(0, len(texts), batch_size), desc=desc):
        batch = clean_batch(texts[start:start + batch_size])
        for result in classifier(batch):
            label = result['label']
            if label not in label_map:
                raise ValueError(f"Unexpected label: {label}")
            preds.append(label_map[label])
    return preds


# Evaluation on Training Data
# No fine-tuning happens in this cell: this is the accuracy of the pretrained
# checkpoint on the training partition.
bert_train_preds = _bert_predict_labels(X_train_raw, "BERT Train Batches")

train_accuracy = accuracy_score(Y_train, bert_train_preds)
print(f"\nTraining Accuracy: {train_accuracy:.4f}")

# Evaluation on Test Data
bert_test_preds = _bert_predict_labels(X_test_raw, "BERT Test Batches")

test_accuracy = accuracy_score(Y_test, bert_test_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Classification Report
print("\n[BERT] Classification Report:")
print(classification_report(Y_test, bert_test_preds, target_names=["FAKE", "REAL"]))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(Y_test, bert_test_preds))

In [None]:
# Show predictions for first 5 test samples, using the labels already
# collected in bert_test_preds (0 = FAKE, 1 = REAL).
# NOTE: X_test_raw holds title + body, so "News Title" prints the full text.
for i in range(5):
    print(f"\nNews Title: {X_test_raw[i]}")
    predicted_label = "FAKE" if bert_test_preds[i] == 0 else "REAL"
    actual_label = "FAKE" if Y_test[i] == 0 else "REAL"
    print(f"Predicted: {predicted_label} | Actual: {actual_label}")

In [None]:
# Custom Prediction Function
def predict_news_bert(news_text):
    """Classify one piece of news text with the pretrained BERT pipeline.

    The text is sent to the module-level ``classifier``; the label it returns
    is validated against ``label_map`` (the same check the batch evaluation
    performs) instead of treating every label other than 'LABEL_0' as REAL.
    The input text, the predicted class and the score reported by the
    pipeline for that class are printed.

    Args:
        news_text: The article text to classify.

    Returns:
        None. All results are written to standard output.

    Raises:
        ValueError: If the pipeline returns a label missing from ``label_map``.
    """
    pred = classifier(news_text)[0]

    raw_label = pred['label']
    if raw_label not in label_map:
        raise ValueError(f"Unexpected label: {raw_label}")
    label = "FAKE" if label_map[raw_label] == 0 else "REAL"
    score = pred['score']

    print("\nInput News Title:")
    print(news_text)

    print("\nPrediction:")
    print(f"The news is classified as: {label}")
    print(f"Confidence Score: {score:.4f}")

In [None]:
# Example: text written in the style of a genuine news report, classified
# with the BERT predictor (same text as the logistic-regression example).
custom_news = "Pfizer and BioNTech announce positive results from late-stage COVID-19 vaccine trial. The companies reported that the vaccine was more than 90% effective in preventing COVID-19 among participants without evidence of prior infection. The Phase 3 clinical trial enrolled 43,538 participants, with no serious safety concerns reported. The vaccine candidate is based on mRNA technology and requires two doses administered three weeks apart. Pfizer plans to seek Emergency Use Authorization from the U.S. FDA. Global health experts have hailed the development as a significant milestone in the fight against the pandemic. Pfizer and BioNTech also confirmed their commitment to equitable global access, including low- and middle-income countries."
predict_news_bert(custom_news)

In [None]:
# Example: an invented conspiracy-style story, classified with the BERT
# predictor (same text as the logistic-regression example).
custom_news = "Government confirms lizard people control major banks and media. In a shocking revelation, the Prime Minister's office released a statement confirming the existence of reptilian humanoids infiltrating global financial institutions and news networks. These shape-shifting beings, originating from the Draco constellation, have allegedly been manipulating world events for decades. The statement claims their ultimate goal is to enslave humanity and establish a New World Order. The Ministry of Defense has deployed elite units to capture and expose the reptilians, promising transparency and public safety in the coming weeks. Meanwhile, conspiracy theorists claim they were right all along."
predict_news_bert(custom_news)