<a href="https://colab.research.google.com/github/alwaysalearner1234/ML01/blob/main/ML04(2_Kaggle)Fake_News_Detection_using_NLP_%2B_ML_Classifiers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files

# files.upload() returns a dict of {filename: raw file bytes}. Leaving the call as
# the last expression of the cell makes the notebook echo that dict, which writes
# the contents of kaggle.json (username + API key) into the saved cell output.
# Bind the result instead and report only the file names.
uploaded = files.upload()  # Select kaggle.json
print("Uploaded:", list(uploaded))

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
# Create the per-user Kaggle config directory (-p: no error if it already exists).
!mkdir -p ~/.kaggle
# Copy the uploaded credentials file from the working directory into that directory.
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset
!unzip fake-and-real-news-dataset.zip -d fake_news

Dataset URL: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset
License(s): CC-BY-NC-SA-4.0
Downloading fake-and-real-news-dataset.zip to /content
  0% 0.00/41.0M [00:00<?, ?B/s]
100% 41.0M/41.0M [00:00<00:00, 800MB/s]
Archive:  fake-and-real-news-dataset.zip
  inflating: fake_news/Fake.csv      
  inflating: fake_news/True.csv      


In [None]:
import pandas as pd
import numpy as np
import re, string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# -------------------------------------------------
# 1. Load Dataset
# -------------------------------------------------
# Read both CSVs and tag every row with its class: 1 = real news, 0 = fake news.
true_news = pd.read_csv("fake_news/True.csv").assign(label=1)
fake_news = pd.read_csv("fake_news/Fake.csv").assign(label=0)

# Stack the two sources, then shuffle with a fixed seed so the row order is
# repeatable, and renumber the rows from 0.
stacked = pd.concat([true_news, fake_news], axis=0)
df = stacked.sample(frac=1, random_state=42).reset_index(drop=True)

print("Dataset shape:", df.shape)
print(df.head())

# -------------------------------------------------
# 2. Clean Text
# -------------------------------------------------
def clean_text(text):
    """Normalise a piece of text before vectorisation.

    The text is lower-cased, link-like tokens (anything starting with
    ``http``/``www``) are dropped, ASCII punctuation and digit runs are
    deleted, and surrounding whitespace is trimmed. Inner whitespace is left
    as-is, so removed tokens can leave double spaces behind.

    Args:
        text: The raw text. Any non-``str`` value (``None``, NaN, numbers...)
            is accepted and treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    # Guard clause: missing / non-text values become an empty document.
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    without_links = re.sub(r"http\S+|www\S+|https\S+", '', lowered)
    # Punctuation is deleted (not replaced by a space), so "u.s." becomes "us".
    without_punct = without_links.translate(str.maketrans('', '', string.punctuation))
    without_digits = re.sub(r'\d+', '', without_punct)
    return without_digits.strip()

# Fold each headline into its article body, then normalise the merged string.
df["text"] = (df["title"].fillna('') + " " + df["text"].fillna('')).apply(clean_text)

# -------------------------------------------------
# 3. Split Data
# -------------------------------------------------
X, y = df["text"], df["label"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -------------------------------------------------
# 4. TF-IDF Vectorization
# -------------------------------------------------
# The vectorizer is fitted on the training texts only; the test texts are just
# transformed with the already-fitted vectorizer.
tfidf = TfidfVectorizer(stop_words="english", max_df=0.7)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# -------------------------------------------------
# 5. Logistic Regression Model
# -------------------------------------------------
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# -------------------------------------------------
# 6. Evaluation
# -------------------------------------------------
# NOTE(review): the data preview shows a real article opening with a
# "(Reuters)" dateline; presumably such source markers are strong class cues --
# confirm the score below is not driven mainly by them.
y_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("\n✅ Accuracy:", test_accuracy)
print("\n✅ Classification Report:\n", test_report)
print("\n✅ Confusion Matrix:\n", test_confusion)

# -------------------------------------------------
# 7. Custom Prediction
# -------------------------------------------------
def predict_news(news_text):
    """Classify a single piece of news text with the fitted vectorizer and model.

    The text is passed through ``clean_text``, vectorised with the
    module-level ``tfidf`` and scored by the module-level ``model``.

    Args:
        news_text: The raw headline or article text.

    Returns:
        ``"✅ Real News"`` when the predicted label equals 1, otherwise
        ``"❌ Fake News"``.
    """
    cleaned = clean_text(news_text)
    features = tfidf.transform([cleaned])  # the vectorizer expects an iterable of documents
    label = model.predict(features)[0]
    if label == 1:
        return "✅ Real News"
    return "❌ Fake News"

# Two hand-written headlines used as a quick smoke test of predict_news.
custom_examples = [
    "The government just passed a new law on climate change.",
    "Aliens have invaded New York City and captured the mayor!",
]
for number, headline in enumerate(custom_examples, start=1):
    # Only the first line is preceded by a blank line.
    lead = "\n" if number == 1 else ""
    print(f"{lead}Custom Test {number}:", predict_news(headline))

Dataset shape: (44898, 5)
                                               title  \
0   BREAKING: GOP Chairman Grassley Has Had Enoug...   
1   Failed GOP Candidates Remembered In Hilarious...   
2   Mike Pence’s New DC Neighbors Are HILARIOUSLY...   
3  California AG pledges to defend birth control ...   
4  AZ RANCHERS Living On US-Mexico Border Destroy...   

                                                text       subject  \
0  Donald Trump s White House is in chaos, and th...          News   
1  Now that Donald Trump is the presumptive GOP n...          News   
2  Mike Pence is a huge homophobe. He supports ex...          News   
3  SAN FRANCISCO (Reuters) - California Attorney ...  politicsNews   
4  Twisted reasoning is all that comes from Pelos...      politics   

               date  label  
0     July 21, 2017      0  
1       May 7, 2016      0  
2  December 3, 2016      0  
3  October 6, 2017       1  
4      Apr 25, 2017      0  

✅ Accuracy: 0.9856347438752784

✅ Classif

In [None]:
# Fake News Detection using NLP + Classification Algorithms
# Ready to run in Google Colab (no need to upload dataset manually)

!pip install scikit-learn

# from datasets import load_dataset # No longer needed
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# -------------------------------------------------
# 1. Load Dataset (local CSV files under ./fake_news)
# -------------------------------------------------
true_news = pd.read_csv("fake_news/True.csv")
fake_news = pd.read_csv("fake_news/Fake.csv")

# Tag every row with its class before merging: 1 = real news, 0 = fake news.
for frame, class_id in ((true_news, 1), (fake_news, 0)):
    frame["label"] = class_id

# Stack both sources, shuffle with a fixed seed, and renumber the rows from 0.
stacked = pd.concat([true_news, fake_news], axis=0)
shuffled = stacked.sample(frac=1, random_state=42)
df = shuffled.reset_index(drop=True)


print("Dataset shape:", df.shape)
print(df.head())


# -------------------------------------------------
# 2. Preprocess Text
# -------------------------------------------------
def clean_text(text):
    """Return a normalised copy of ``text`` for TF-IDF vectorisation.

    Steps, in order: lower-case, drop link-like tokens, delete ASCII
    punctuation, delete digit runs, trim leading/trailing whitespace.

    Args:
        text: The raw text; non-string values are tolerated.

    Returns:
        The cleaned string, or an empty string for non-string input.
    """
    if not isinstance(text, str):
        return ""  # non-string input is treated as an empty document

    punctuation_table = str.maketrans('', '', string.punctuation)
    # Lower-case first, then strip links.
    no_links = re.sub(r"http\S+|www\S+|https\S+", '', text.lower(), flags=re.MULTILINE)
    # Punctuation goes before digits, matching the original order of operations.
    no_digits = re.sub(r'\d+', '', no_links.translate(punctuation_table))
    return no_digits.strip()

# Prepend each headline to its article body, then clean the merged string.
headline_and_body = df["title"].fillna('') + " " + df["text"].fillna('')
df["text"] = headline_and_body.apply(clean_text)


# -------------------------------------------------
# 3. Split Features & Labels
# -------------------------------------------------
# 80/20 train/test split with a fixed seed so the partition is repeatable.
features, labels = df["text"], df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)


# -------------------------------------------------
# 4. Vectorization (TF-IDF)
# -------------------------------------------------
# Fit on the training texts only; the test texts are transformed with the
# already-fitted vectorizer.
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# -------------------------------------------------
# 5. Train Model (Logistic Regression)
# -------------------------------------------------
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# -------------------------------------------------
# 6. Evaluate Model
# -------------------------------------------------
# All three metrics below are computed on the held-out test rows.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("\nAccuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)

# -------------------------------------------------
# 7. Test on Custom Input
# -------------------------------------------------
def predict_news(news_text):
    """Label one news text as real or fake.

    Uses the module-level ``clean_text``, ``tfidf`` and ``model`` objects, so
    those must already be defined and fitted when this is called.

    Args:
        news_text: The raw headline or article text.

    Returns:
        ``"✅ Real News"`` if the model predicts label 1, else ``"❌ Fake News"``.
    """
    # The vectorizer takes a collection of documents, hence the one-element list.
    document_vector = tfidf.transform([clean_text(news_text)])
    is_real = model.predict(document_vector)[0] == 1
    return "✅ Real News" if is_real else "❌ Fake News"

# Smoke-test the classifier on two hand-written headlines.
policy_headline = "The government just announced a new healthcare policy."
aliens_headline = "Aliens have landed in New York City and taken over the White House!"
print("\nCustom Test 1:", predict_news(policy_headline))
print("Custom Test 2:", predict_news(aliens_headline))

Dataset shape: (44898, 5)
                                               title  \
0   BREAKING: GOP Chairman Grassley Has Had Enoug...   
1   Failed GOP Candidates Remembered In Hilarious...   
2   Mike Pence’s New DC Neighbors Are HILARIOUSLY...   
3  California AG pledges to defend birth control ...   
4  AZ RANCHERS Living On US-Mexico Border Destroy...   

                                                text       subject  \
0  Donald Trump s White House is in chaos, and th...          News   
1  Now that Donald Trump is the presumptive GOP n...          News   
2  Mike Pence is a huge homophobe. He supports ex...          News   
3  SAN FRANCISCO (Reuters) - California Attorney ...  politicsNews   
4  Twisted reasoning is all that comes from Pelos...      politics   

               date  label  
0     July 21, 2017      0  
1       May 7, 2016      0  
2  December 3, 2016      0  
3  October 6, 2017       1  
4      Apr 25, 2017      0  

Accuracy: 0.9856347438752784

Classificat

In [None]:
# Single cell for Kaggle dataset download + unzip.
# The archive is downloaded into ./data (--force is passed to the download) and
# extracted into ./data/fake_news (-o makes unzip overwrite existing files
# without prompting) -- presumably so the cell can be re-run safely.
# NOTE(review): the loading cells in this notebook read "fake_news/True.csv" and
# "fake_news/Fake.csv", not "./data/fake_news/..." -- confirm which location is intended.
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset -p ./data --force
!unzip -o ./data/fake-and-real-news-dataset.zip -d ./data/fake_news

Dataset URL: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset
License(s): CC-BY-NC-SA-4.0
Downloading fake-and-real-news-dataset.zip to ./data
  0% 0.00/41.0M [00:00<?, ?B/s]
100% 41.0M/41.0M [00:00<00:00, 908MB/s]
Archive:  ./data/fake-and-real-news-dataset.zip
  inflating: ./data/fake_news/Fake.csv  
  inflating: ./data/fake_news/True.csv  


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re, string

# -------------------------------------------------
# 1. Build a tiny hand-written demo dataset
# -------------------------------------------------
# Each entry pairs a headline with its class: 1 = Real, 0 = Fake.
samples = [
    ("The government announced a new healthcare policy.", 1),
    ("Aliens landed in New York and took over the White House!", 0),
    ("Scientists discovered a cure for cancer after 20 years of research.", 1),
    ("Celebrity found alive on Mars after disappearing 10 years ago.", 0),
    ("Stock markets are showing positive growth this quarter.", 1),
    ("Man claims to have a time machine built in his garage.", 0),
    ("Schools to reopen from next Monday after holidays.", 1),
    ("Dinosaurs seen alive in the Amazon rainforest.", 0),
]

# Column-oriented view of the same samples, as expected by the DataFrame constructor.
data = {
    "text": [headline for headline, _ in samples],
    "label": [label for _, label in samples],
}

df = pd.DataFrame(data)

# -------------------------------------------------
# 2. Preprocess function
# -------------------------------------------------
def clean_text(text):
    """Normalise a piece of text before vectorisation.

    The text is lower-cased, link-like tokens (anything starting with
    ``http``/``www``) are dropped, ASCII punctuation and digit runs are
    deleted, and surrounding whitespace is trimmed.

    Args:
        text: The raw text. Non-string values (``None``, NaN, numbers...) are
            accepted and treated as missing, matching the other ``clean_text``
            definitions in this notebook.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    # Without this guard a None/NaN value raises AttributeError on .lower().
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    return text.strip()

# Normalise every demo headline with the same cleaner that predict_news applies.
df["text"] = df["text"].apply(clean_text)

# -------------------------------------------------
# 3. Features & Labels
# -------------------------------------------------
X, y = df["text"], df["label"]

# -------------------------------------------------
# 4. TF-IDF
# -------------------------------------------------
tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf.fit_transform(X)

# -------------------------------------------------
# 5. Train Model
# -------------------------------------------------
model = LogisticRegression()
model.fit(X_tfidf, y)

# -------------------------------------------------
# 6. Evaluate (training set only)
# -------------------------------------------------
# This demo has no held-out split: the predictions below are made on the exact
# rows the model was fitted on. The scores are therefore a sanity check of the
# pipeline, not an estimate of accuracy on unseen text, and are labelled as
# training-set metrics so they are not mistaken for one.
y_pred = model.predict(X_tfidf)

print("\n✅ Training Accuracy:", accuracy_score(y, y_pred))
print("\n✅ Training Classification Report:\n", classification_report(y, y_pred))
print("\n✅ Training Confusion Matrix:\n", confusion_matrix(y, y_pred))

# -------------------------------------------------
# 7. Custom Prediction
# -------------------------------------------------
def predict_news(news_text):
    """Classify one headline with the demo vectorizer and model.

    Relies on the module-level ``clean_text``, ``tfidf`` and ``model``.

    Args:
        news_text: The raw headline text.

    Returns:
        ``"✅ Real News"`` when the predicted label is 1, ``"❌ Fake News"``
        otherwise.
    """
    verdicts = {True: "✅ Real News", False: "❌ Fake News"}
    cleaned = clean_text(news_text)
    # transform() takes a collection of documents, so wrap the single text in a list.
    predicted_label = model.predict(tfidf.transform([cleaned]))[0]
    return verdicts[bool(predicted_label == 1)]

# Try the classifier on two headlines that are not in the demo dataset.
moon_headline = "NASA announced a new mission to the Moon."
tower_headline = "Aliens have captured the Eiffel Tower."
print("\nCustom Test 1:", predict_news(moon_headline))
print("Custom Test 2:", predict_news(tower_headline))


✅ Accuracy: 1.0

✅ Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         4

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8


✅ Confusion Matrix:
 [[4 0]
 [0 4]]

Custom Test 1: ✅ Real News
Custom Test 2: ❌ Fake News
