<a href="https://colab.research.google.com/github/alwaysalearner1234/ML01/blob/main/ML04Fake_News_Detection_using_NLP_%2B_ML_Classifiers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 📌 Fake News Detection using NLP + ML Classifiers

# Step 1: Install dependencies (if not already installed in Colab)
!pip install nltk scikit-learn pandas numpy

# Step 2: Import Libraries
import pandas as pd
import numpy as np
import nltk
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

nltk.download('stopwords')
from nltk.corpus import stopwords

# Step 3: Load Dataset (Kaggle Fake News Dataset or any CSV with 'text' + 'label')
# The dataset is uploaded from the local system through Colab's upload widget.
import io

from google.colab import files
uploaded = files.upload()

# files.upload() is expected to return a {filename: file bytes} mapping. Read the
# file that was actually uploaded instead of assuming it is named 'fake_news.csv'
# (Colab may also rename a re-uploaded file, e.g. 'fake_news (1).csv').
if uploaded:
    csv_name = "fake_news.csv" if "fake_news.csv" in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[csv_name]))
else:
    # Nothing was uploaded in this run: fall back to a copy already on disk.
    df = pd.read_csv("fake_news.csv")

print(df.head())

# Step 4: Text Preprocessing
def clean_text(text):
    """Normalise one raw article string before TF-IDF vectorisation.

    The value is stringified and lower-cased, then stripped of square-bracketed
    spans, URLs, HTML tags, punctuation and every token that contains a digit.
    Line breaks become spaces so that the last word of one line and the first
    word of the next remain separate tokens.

    Args:
        text: Raw text; non-string values are converted with ``str()``.

    Returns:
        The cleaned, lower-cased string.
    """
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)  # remove text in brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove links
    text = re.sub(r'<.*?>+', '', text)  # remove HTML tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # remove punctuation
    # Replace (rather than delete) newlines: deleting them glues adjacent words together.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)  # remove words with numbers
    return text

# Normalise every article body with clean_text before it is vectorised.
df['text'] = df['text'].apply(clean_text)

# Step 5: Split Data
X = df['text']
y = df['label']  # labels: 'FAKE' or 'REAL'

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: TF-IDF Vectorization
# The vectorizer is fitted on the training split only and reused for the test
# split and for the sample input below.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 7: Train Classifier
# Option 1: Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_tfidf, y_train)
y_pred_lr = log_reg.predict(X_test_tfidf)

# Option 2: Passive Aggressive Classifier (fast for text classification)
# random_state is fixed (same seed as the split) so repeated runs give the same model.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac.fit(X_train_tfidf, y_train)
y_pred_pac = pac.predict(X_test_tfidf)

# Step 8: Evaluate Models
print("\n🔹 Logistic Regression Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

print("\n🔹 Passive Aggressive Classifier Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_pac))
print(classification_report(y_test, y_pred_pac))

# Step 9: Test with a Sample Input
sample = ["Breaking news! Scientists discovered water on Mars!"]
# Apply the same cleaning as the training text so the features are comparable.
sample_tfidf = vectorizer.transform([clean_text(headline) for headline in sample])
print("\nSample Prediction:", pac.predict(sample_tfidf)[0])  # FAKE or REAL



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


TypeError: 'NoneType' object is not subscriptable

In [None]:
# Open Colab's file-upload widget. The call is the last expression of the cell,
# so whatever it returns is shown as the cell output.
from google.colab import files
files.upload()

In [None]:
# Fake News Detection using NLP + Classification Algorithms
# Runs in Google Colab; expects the dataset to already be present at /content/Fake.csv

!pip install datasets scikit-learn

from datasets import load_dataset
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# -------------------------------------------------
# 1. Load Dataset (local CSV in the Colab runtime)
# -------------------------------------------------
# The file must provide a 'text' column and a 'label' column: the preprocessing
# and feature/label steps further down index both of them.
df = pd.read_csv("/content/Fake.csv")

# Fail early with an explicit message instead of a bare KeyError later on.
missing_columns = {"text", "label"} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"/content/Fake.csv is missing required column(s) {sorted(missing_columns)}; "
        f"found columns {list(df.columns)}"
    )

print(df.head())

# -------------------------------------------------
# 2. Preprocess Text
# -------------------------------------------------
def clean_text(text):
    """Lower-case ``text`` and strip links, punctuation and digits from it.

    Any value is accepted; it is converted with ``str()`` first.

    Returns:
        The cleaned string (surrounding whitespace is not trimmed).
    """
    link_pattern = re.compile(r"http\S+|www\S+|https\S+", flags=re.MULTILINE)
    punctuation_table = str.maketrans('', '', string.punctuation)

    lowered = str(text).lower()
    without_links = link_pattern.sub('', lowered)
    without_punctuation = without_links.translate(punctuation_table)
    return re.sub(r'\d+', '', without_punctuation)

# Clean every article body in place before vectorising.
df["text"] = df["text"].apply(clean_text)

# -------------------------------------------------
# 3. Split Features & Labels
# -------------------------------------------------
X, y = df["text"], df["label"]  # 'label' is assumed to be the column holding the labels

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# -------------------------------------------------
# 4. Vectorization (TF-IDF)
# -------------------------------------------------
# Vocabulary and IDF weights come from the training split only.
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# -------------------------------------------------
# 5. Train Model (Logistic Regression)
# -------------------------------------------------
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# -------------------------------------------------
# 6. Evaluate Model
# -------------------------------------------------
y_pred = model.predict(X_test_tfidf)

# Each metric is computed right before it is printed, in the listed order.
for heading, metric in (
    ("\nAccuracy:", accuracy_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))

# -------------------------------------------------
# 7. Test on Custom Input
# -------------------------------------------------
def predict_news(news_text):
    """Classify a single piece of news text with the fitted TF-IDF + model pair.

    The text is cleaned with ``clean_text``, vectorised with the module-level
    ``tfidf`` and scored with the module-level ``model``.

    Args:
        news_text: Raw news text to classify.

    Returns:
        ``"✅ Real News"`` when the predicted label denotes the real class,
        otherwise ``"❌ Fake News"``.
    """
    cleaned = clean_text(news_text)
    vectorized = tfidf.transform([cleaned])
    prediction = model.predict(vectorized)[0]
    # The label column may be numeric (1 = real) or textual (e.g. 'REAL'); a plain
    # `== 1` comparison would report every textual prediction as fake.
    is_real = str(prediction).strip().lower() in {"1", "1.0", "real", "true"}
    return "✅ Real News" if is_real else "❌ Fake News"

# Try the classifier on two hand-written headlines.
plausible_headline = "The government just announced a new healthcare policy."
outlandish_headline = "Aliens have landed in New York City and taken over the White House!"

print("\nCustom Test 1:", predict_news(plausible_headline))
print("Custom Test 2:", predict_news(outlandish_headline))

In [None]:
!pip install datasets scikit-learn

from datasets import load_dataset
import pandas as pd
import re, string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# -------------------------------------------------
# 1. Load Dataset (HuggingFace)
# -------------------------------------------------
dataset = load_dataset("liar")

# Per the original note the dataset has 'train', 'validation' and 'test' splits;
# only 'train' and 'test' are turned into DataFrames here.
df_train, df_test = (
    pd.DataFrame(dataset[split_name]) for split_name in ("train", "test")
)

train_shape, test_shape = df_train.shape, df_test.shape
print("✅ Training data shape:", train_shape)
print("✅ Testing data shape:", test_shape)
print(df_train.head())

# -------------------------------------------------
# 2. Preprocess Text
# -------------------------------------------------
def clean_text(text):
    """Return a normalised copy of ``text``; any non-string input yields ``""``.

    Strings are lower-cased, stripped of links, punctuation and digits, and
    finally trimmed of leading/trailing whitespace.
    """
    if isinstance(text, str):
        lowered = text.lower()
        no_links = re.sub(r"http\S+|www\S+|https\S+", '', lowered)
        no_punctuation = no_links.translate(str.maketrans('', '', string.punctuation))
        no_digits = re.sub(r'\d+', '', no_punctuation)
        return no_digits.strip()
    return ""

# Add a cleaned 'text' column, derived from 'statement', to both splits.
for frame in (df_train, df_test):
    frame["text"] = frame["statement"].apply(clean_text)

# -------------------------------------------------
# 3. Features & Labels
# -------------------------------------------------
# NOTE(review): the original comment listed the codes as 0 = pants-fire, 1 = false,
# 2 = barely-true, 3 = half-true, 4 = mostly-true, 5 = true. Confirm that ordering
# against dataset["train"].features["label"].names before relying on it.
X_train = df_train["text"]
y_train = df_train["label"]
X_test = df_test["text"]
y_test = df_test["label"]

# -------------------------------------------------
# 4. TF-IDF
# -------------------------------------------------
# Vocabulary and IDF weights come from the training split only.
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# -------------------------------------------------
# 5. Train Model
# -------------------------------------------------
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# -------------------------------------------------
# 6. Evaluate
# -------------------------------------------------
y_pred = model.predict(X_test_tfidf)

# Each metric is computed right before it is printed, in the listed order.
for heading, metric in (
    ("\n✅ Accuracy:", accuracy_score),
    ("\n✅ Classification Report:\n", classification_report),
    ("\n✅ Confusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))

# -------------------------------------------------
# 7. Custom Test
# -------------------------------------------------
def predict_news(news_text):
    """Classify one statement and return a human-readable truthfulness rating.

    The text is cleaned with ``clean_text``, vectorised with the module-level
    ``tfidf`` and scored with the module-level ``model``. The predicted integer
    code is translated to its class name through the label feature of the
    loaded ``dataset`` rather than through a hand-written integer table, so the
    rating always follows the dataset's own encoding.

    Args:
        news_text: Raw statement to classify.

    Returns:
        A display string for the predicted class; if the class name is not one
        of the six known ratings, the raw class name is returned.
    """
    cleaned = clean_text(news_text)
    vectorized = tfidf.transform([cleaned])
    prediction = int(model.predict(vectorized)[0])

    # NOTE(review): the previous fixed table assumed 0 = pants-fire ... 5 = true.
    # The dataset card is believed to order the classes differently (false,
    # half-true, mostly-true, true, barely-true, pants-fire) - confirm. Looking
    # the name up from the dataset avoids depending on either ordering.
    class_name = dataset["train"].features["label"].int2str(prediction)

    display_by_class = {
        "pants-fire": "🔥 Pants-Fire Fake",
        "false": "❌ Fake",
        "barely-true": "⚠ Barely-True",
        "half-true": "🔸 Half-True",
        "mostly-true": "✅ Mostly-True",
        "true": "🟢 True",
    }
    return display_by_class.get(class_name, class_name)

# Try the classifier on two hand-written statements.
plausible_statement = "The government just announced a new healthcare policy."
outlandish_statement = "Aliens have landed in New York City and taken over the White House!"

print("\nCustom Test 1:", predict_news(plausible_statement))
print("Custom Test 2:", predict_news(outlandish_statement))

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re, string

# -------------------------------------------------
# 1. Build our own dataset (small demo)
# -------------------------------------------------
# Each entry pairs a headline with its label: 1 = Real, 0 = Fake.
labelled_headlines = [
    ("The government announced a new healthcare policy.", 1),
    ("Aliens landed in New York and took over the White House!", 0),
    ("Scientists discovered a cure for cancer after 20 years of research.", 1),
    ("Celebrity found alive on Mars after disappearing 10 years ago.", 0),
    ("Stock markets are showing positive growth this quarter.", 1),
    ("Man claims to have a time machine built in his garage.", 0),
    ("Schools to reopen from next Monday after holidays.", 1),
    ("Dinosaurs seen alive in the Amazon rainforest.", 0),
]

data = {
    "text": [headline for headline, _ in labelled_headlines],
    "label": [label for _, label in labelled_headlines],
}

df = pd.DataFrame(data)

# -------------------------------------------------
# 2. Preprocess function
# -------------------------------------------------
def clean_text(text):
    """Lower-case ``text``, drop links, punctuation and digits, then trim it.

    ``text`` must be a string: ``.lower()`` is called on it directly.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = re.sub(r"http\S+|www\S+|https\S+", '', text.lower())
    cleaned = re.sub(r'\d+', '', cleaned.translate(strip_punctuation))
    return cleaned.strip()

# Clean every headline in place before vectorising.
df["text"] = df["text"].apply(clean_text)

# -------------------------------------------------
# 3. Features & Labels
# -------------------------------------------------
X = df["text"]
y = df["label"]

# -------------------------------------------------
# 4. TF-IDF
# -------------------------------------------------
tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf.fit_transform(X)

# -------------------------------------------------
# 5. Train Model
# -------------------------------------------------
model = LogisticRegression().fit(X_tfidf, y)

# -------------------------------------------------
# 6. Evaluate
# -------------------------------------------------
# Note: predictions are made on the same rows the model was fitted on, so these
# numbers describe the fit to the training data, not held-out performance.
y_pred = model.predict(X_tfidf)

for heading, metric in (
    ("\n✅ Accuracy:", accuracy_score),
    ("\n✅ Classification Report:\n", classification_report),
    ("\n✅ Confusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y, y_pred))

# -------------------------------------------------
# 7. Custom Prediction
# -------------------------------------------------
def predict_news(news_text):
    """Classify one headline with the fitted ``tfidf`` vectorizer and ``model``.

    Returns ``"✅ Real News"`` when the predicted label equals 1, otherwise
    ``"❌ Fake News"``.
    """
    features = tfidf.transform([clean_text(news_text)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "✅ Real News"
    return "❌ Fake News"

# Try it out on two unseen headlines.
plausible_headline = "NASA announced a new mission to the Moon."
outlandish_headline = "Aliens have captured the Eiffel Tower."

print("\nCustom Test 1:", predict_news(plausible_headline))
print("Custom Test 2:", predict_news(outlandish_headline))

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re, string

# -------------------------------------------------
# 1. Build our own dataset (small demo)
# -------------------------------------------------
# Headlines alternate Real, Fake, Real, Fake, ... in this order.
headlines = [
    "The government announced a new healthcare policy.",
    "Aliens landed in New York and took over the White House!",
    "Scientists discovered a cure for cancer after 20 years of research.",
    "Celebrity found alive on Mars after disappearing 10 years ago.",
    "Stock markets are showing positive growth this quarter.",
    "Man claims to have a time machine built in his garage.",
    "Schools to reopen from next Monday after holidays.",
    "Dinosaurs seen alive in the Amazon rainforest.",
]
labels = [1, 0] * 4  # 1 = Real, 0 = Fake

data = {"text": headlines, "label": labels}

df = pd.DataFrame.from_dict(data)

# -------------------------------------------------
# 2. Preprocess function
# -------------------------------------------------
def clean_text(text):
    """Normalise a headline: lower-case, remove links/punctuation/digits, trim.

    ``text`` must be a string: ``.lower()`` is called on it directly.
    """
    result = text.lower()
    result = re.compile(r"http\S+|www\S+|https\S+").sub('', result)
    # Keep only the characters that are not ASCII punctuation.
    result = ''.join(char for char in result if char not in string.punctuation)
    result = re.compile(r'\d+').sub('', result)
    return result.strip()

# Clean every headline in place before vectorising.
df["text"] = df["text"].apply(clean_text)

# -------------------------------------------------
# 3. Features & Labels
# -------------------------------------------------
X = df["text"]
y = df["label"]

# -------------------------------------------------
# 4. TF-IDF
# -------------------------------------------------
tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf.fit_transform(X)

# -------------------------------------------------
# 5. Train Model
# -------------------------------------------------
model = LogisticRegression()
model.fit(X_tfidf, y)

# -------------------------------------------------
# 6. Evaluate
# -------------------------------------------------
# Note: the model is scored on the very rows it was trained on, so the figures
# below are training-set metrics.
y_pred = model.predict(X_tfidf)

accuracy = accuracy_score(y, y_pred)
print("\n✅ Accuracy:", accuracy)

report = classification_report(y, y_pred)
print("\n✅ Classification Report:\n", report)

matrix = confusion_matrix(y, y_pred)
print("\n✅ Confusion Matrix:\n", matrix)

# -------------------------------------------------
# 7. Custom Prediction
# -------------------------------------------------
def predict_news(news_text):
    """Return a verdict string for ``news_text`` using ``tfidf`` and ``model``.

    The text is cleaned, vectorised and classified; a predicted label equal to
    1 maps to ``"✅ Real News"`` and anything else to ``"❌ Fake News"``.
    """
    verdicts = ("❌ Fake News", "✅ Real News")  # index 1 <=> predicted label == 1
    cleaned = clean_text(news_text)
    predicted_label = model.predict(tfidf.transform([cleaned]))[0]
    return verdicts[int(predicted_label == 1)]

# Try it out on two unseen headlines.
first_verdict = predict_news("NASA announced a new mission to the Moon.")
print("\nCustom Test 1:", first_verdict)

second_verdict = predict_news("Aliens have captured the Eiffel Tower.")
print("Custom Test 2:", second_verdict)