**Project Title**: *AI-Powered Fake News Detection System*

**Name**- ***Ajay Kumar***

In [1]:
# Mount Google Drive at /content/drive; the CSV files loaded later are read from
# a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Setup and Installations

In [2]:
!pip install beautifulsoup4
!pip install tldextract
!sudo apt install tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


Data Loading and Preparation

In [3]:
import pandas as pd
from google.colab import drive

# --- Step 1: Connect to Google Drive ---
# Calling drive.mount again when the drive is already mounted only reports that
# fact, so this cell can be re-run on its own.
print("Connecting to Google Drive...")
drive.mount('/content/drive')
print("✅ Drive connected!")

# --- Step 2: Define file paths ---
# Folder inside Google Drive that holds both CSV files. If your folder has a
# different name, change this one value; the help text below is built from it.
base_path = '/content/drive/My Drive/colab_project/'
fake_file_path = base_path + 'Fake.csv'
real_file_path = base_path + 'True.csv'

print("\n--- Step 3: Loading files from Drive ---")

try:
    df_fake = pd.read_csv(fake_file_path)
    print("✅ Successfully loaded Fake.csv from Drive.")
    df_real = pd.read_csv(real_file_path)
    print("✅ Successfully loaded True.csv from Drive.")
except FileNotFoundError:
    print("\n❌ ERROR: File not found. Please double-check two things:")
    print(f"1. That this folder exists in your Google Drive: {base_path}")
    print("2. That 'Fake.csv' and 'True.csv' are inside that folder.")
    # Stop here: without the data, `df` would be undefined and the next cell
    # would fail with an unrelated-looking NameError.
    raise
except Exception as e:
    print(f"\nAn unexpected error occurred: {e}")
    # Re-raise so the full traceback is visible and "Run All" halts at the cause.
    raise

# --- Step 4: Combine and shuffle the data ---
# Label convention used by every later cell: 0 = real news, 1 = fake news.
df_real['label'] = 0
df_fake['label'] = 1
df = pd.concat([df_real, df_fake], ignore_index=True)
# frac=1 returns every row in random order; the fixed seed makes the shuffle repeatable.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print("\n✅ Success! Data is combined and ready.")
display(df.head())

Connecting to Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Drive connected!

--- Step 3: Loading files from Drive ---
✅ Successfully loaded Fake.csv from Drive.
✅ Successfully loaded True.csv from Drive.

✅ Success! Data is combined and ready.


Unnamed: 0,title,text,subject,date,label
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,"July 21, 2017",1
1,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,"May 7, 2016",1
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,"December 3, 2016",1
3,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,politicsNews,"October 6, 2017",0
4,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,"Apr 25, 2017",1


Data Cleaning and Feature Extraction

In [4]:
import re
import nltk
from nltk.corpus import stopwords

# --- Step 1: Combine title and text ---
# Rows without body text are dropped.
df.dropna(subset=['text'], inplace=True)
# A missing title must not wipe out the row: NaN + str is NaN, and the cleaning
# function defined below would then fail when it calls .lower() on a float.
# Treat a missing title as an empty string instead.
df['full_text'] = df['title'].fillna('') + ' ' + df['text']

# --- Step 2: Download stopwords ---
nltk.download('stopwords')
# A set gives constant-time membership checks in the per-token filter.
stop_words = set(stopwords.words('english'))

# --- Step 3: Create a text cleaning function ---
def preprocess_text(text):
    """Normalise raw article text before it is vectorised.

    The input is lower-cased, every character that is neither a word
    character nor whitespace is deleted, and tokens present in the
    module-level ``stop_words`` collection are dropped.

    Args:
        text: The string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Deleting (not replacing) punctuation fuses the neighbours:
    # "us-mexico" becomes "usmexico", "cat's" becomes "cats".
    stripped = re.sub(r'[^\w\s]', '', text.lower())
    return " ".join(token for token in stripped.split() if token not in stop_words)

# --- Step 4: Apply the function to the text ---
# This might take a minute or two to run on all the articles
print("Cleaning the text data... please wait.")
# preprocess_text is called once per row; 'clean_text' is the column the
# train/test split reads as its features.
df['clean_text'] = df['full_text'].apply(preprocess_text)

print("\n✅ Text cleaning complete!")
# Display the original text, the cleaned text, and the label
display(df[['full_text', 'clean_text', 'label']].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cleaning the text data... please wait.

✅ Text cleaning complete!


Unnamed: 0,full_text,clean_text,label
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,breaking gop chairman grassley enough demands ...,1
1,Failed GOP Candidates Remembered In Hilarious...,failed gop candidates remembered hilarious moc...,1
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,mike pences new dc neighbors hilariously troll...,1
3,California AG pledges to defend birth control ...,california ag pledges defend birth control ins...,0
4,AZ RANCHERS Living On US-Mexico Border Destroy...,az ranchers living usmexico border destroy nan...,1


Model Training and Comparison

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Define features (X) and target (y)
# X is the cleaned article text; y is the 0/1 label column.
X = df['clean_text']
y = df['label']

# 2. Split the data into training and testing sets
# 20% of the rows are held out for testing; the fixed seed keeps the split repeatable.
# NOTE(review): no `stratify` argument is passed, so the class ratio of the two
# parts is left to chance -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Initialize the TF-IDF Vectorizer
# We'll limit it to the top 5000 most frequent words to keep it efficient
vectorizer = TfidfVectorizer(max_features=5000)

# 4. Fit and transform the training data, then transform the test data
# The vocabulary is fitted on the training rows only; the test rows are only
# transformed with it, so nothing about the test set feeds into the features.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("Data has been split and converted to numerical features.")
print(f"Shape of training data (X_train_tfidf): {X_train_tfidf.shape}")
print(f"Shape of testing data (X_test_tfidf): {X_test_tfidf.shape}")

Data has been split and converted to numerical features.
Shape of training data (X_train_tfidf): (35918, 5000)
Shape of testing data (X_test_tfidf): (8980, 5000)


Building the Final Prediction System - 1

In [6]:
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1. Initialize the model
# The fixed random_state keeps repeated runs comparable.
model = PassiveAggressiveClassifier(max_iter=1000, random_state=42)

# 2. Train the model
# Fitted on the TF-IDF matrix of the training split only.
print("Training the model...")
model.fit(X_train_tfidf, y_train)
print("✅ Model training complete!")

# 3. Make predictions on the test data
print("\nMaking predictions on the test data...")
y_pred = model.predict(X_test_tfidf)
print("✅ Predictions are made!")

# 4. Evaluate the model's performance
# Every metric below compares y_pred with the held-out labels in y_test.
print("\n--- Model Evaluation ---")
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Display the Confusion Matrix
print("\nConfusion Matrix:")
# Note: 0 = Real, 1 = Fake
print(confusion_matrix(y_test, y_pred))

# Display the Classification Report
# target_names are given in label order: 0 -> 'Real', 1 -> 'Fake'.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Real', 'Fake']))

Training the model...
✅ Model training complete!

Making predictions on the test data...
✅ Predictions are made!

--- Model Evaluation ---
Accuracy: 0.9950 (99.50%)

Confusion Matrix:
[[4292   19]
 [  26 4643]]

Classification Report:
              precision    recall  f1-score   support

        Real       0.99      1.00      0.99      4311
        Fake       1.00      0.99      1.00      4669

    accuracy                           0.99      8980
   macro avg       0.99      1.00      0.99      8980
weighted avg       0.99      0.99      0.99      8980



 Building the Final Prediction System - 2

In [8]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1. Initialize the LightGBM model
# Library defaults are used apart from the fixed seed.
lgbm_model = LGBMClassifier(random_state=42)

# 2. Train the model
# Uses the same TF-IDF training matrix and labels as the previous model, so the
# two sets of scores are measured on the same split.
print("Training the LightGBM model...")
lgbm_model.fit(X_train_tfidf, y_train)
print("✅ Model training complete!")

# 3. Make predictions on the test data
print("\nMaking predictions with the new model...")
y_pred_lgbm = lgbm_model.predict(X_test_tfidf)
print("✅ Predictions are made!")

# 4. Evaluate the new model's performance
# Same three reports as for the first model; label order is 0 = Real, 1 = Fake.
print("\n--- LightGBM Model Evaluation ---")
accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm)
print(f"Accuracy: {accuracy_lgbm:.4f} ({accuracy_lgbm*100:.2f}%)")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_lgbm))

print("\nClassification Report:")
print(classification_report(y_test, y_pred_lgbm, target_names=['Real', 'Fake']))

Training the LightGBM model...
[LightGBM] [Info] Number of positive: 18812, number of negative: 17106
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 5.971955 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 820828
[LightGBM] [Info] Number of data points in the train set: 35918, number of used features: 5000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.523749 -> initscore=0.095066
[LightGBM] [Info] Start training from score 0.095066
✅ Model training complete!

Making predictions with the new model...
✅ Predictions are made!

--- LightGBM Model Evaluation ---
Accuracy: 0.9972 (99.72%)

Confusion Matrix:
[[4304    7]
 [  18 4651]]

Classification Report:
              precision    recall  f1-score   support

        Real       1.00      1.00      1.00      4311
        Fake       1.00      1.00      1.00      4669

    accuracy                           1.00      8980
   macro avg       1.00      1.



In [None]:
# A simple list of known satire or unreliable domains
# Each entry is a bare "domain.suffix" string (no scheme, no "www"), which is the
# form check_source_credibility builds from a URL before looking it up here.
# NOTE(review): a later cell assigns UNRELIABLE_DOMAINS again with the same five
# entries -- keep the two lists in sync or drop one of them.
UNRELIABLE_DOMAINS = [
    'theonion.com',       # Satire
    'clickhole.com',      # Satire
    'infowars.com',       # Known for misinformation
    'naturalnews.com',    # Known for pseudoscience
    'worldnewsdailyreport.com' # Known for fake stories
]

In [None]:
# Install a library to help with parsing URLs
# NOTE(review): the setup cell at the top of the notebook already installs
# tldextract, so this line is redundant after a full run.
!pip install tldextract

import tldextract

def check_source_credibility(url):
    """Report whether a URL's domain is on the unreliable-source list.

    The domain is rebuilt from the ``domain`` and ``suffix`` parts returned by
    ``tldextract.extract`` (the subdomain part, e.g. ``www``, is not used) and
    compared, ignoring case, with the entries of ``UNRELIABLE_DOMAINS``.

    Args:
        url: The article URL (or bare host name) to inspect.

    Returns:
        A one-line, human-readable verdict string: a red warning when the
        domain is listed, a green note when it is not, or a neutral note when
        no domain could be extracted at all.
    """
    extracted = tldextract.extract(url)
    # Join only the parts that are present. Hosts without a public suffix
    # (e.g. 'localhost' or a placeholder string) would otherwise be reported
    # with a dangling trailing dot.
    present_parts = [part for part in (extracted.domain, extracted.suffix) if part]
    domain = ".".join(present_parts).lower()

    if not domain:
        return f"⚪ Could not determine a domain from '{url}'."

    # Normalise the list side as well so the match does not depend on how an
    # entry happens to be capitalised.
    if domain in {entry.lower() for entry in UNRELIABLE_DOMAINS}:
        return f"🔴 Warning: The source '{domain}' is on a list of known unreliable or satire websites."
    return f"🟢 Source '{domain}' not found on our basic unreliable list."

Validation Using Link

In [None]:
import re
import nltk
from nltk.corpus import stopwords
import requests
from bs4 import BeautifulSoup
import tldextract

# --- Helper Function 1: The Text Cleaner ---
# Same stopword setup as in the data-cleaning cell; running it again rebinds
# `stop_words` to a set built from the same NLTK English list.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    """Clean a piece of text exactly as the training data was cleaned.

    Steps: lower-case, delete every character that is neither a word character
    nor whitespace, split on whitespace, and drop tokens found in the
    module-level ``stop_words`` collection.

    Args:
        text: The string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    kept_tokens = filter(lambda token: token not in stop_words, without_punctuation.split())
    return " ".join(kept_tokens)

# --- Helper Function 2: The Core Predictor ---
def predict_news(article_text, model_to_use, vectorizer_to_use):
    """Classify one article and return a human-readable verdict.

    Args:
        article_text: Raw article text; it is cleaned with ``preprocess_text``
            before being vectorised.
        model_to_use: Object whose ``predict`` method is called on the
            vectorised text.
        vectorizer_to_use: Object whose ``transform`` method is called on a
            one-element list holding the cleaned text.

    Returns:
        The "FAKE" message when the first predicted label equals 1, otherwise
        the "REAL" message.
    """
    features = vectorizer_to_use.transform([preprocess_text(article_text)])
    predicted_label = model_to_use.predict(features)[0]
    return "🔴 This news is likely FAKE." if predicted_label == 1 else "🟢 This news is likely REAL."

# --- Helper Function 3: The Link Scraper (UPDATED) ---
def predict_from_link(url, model_to_use, vectorizer_to_use):
    """Download a web page and classify the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        model_to_use: Classifier forwarded to ``predict_news``.
        vectorizer_to_use: Vectorizer forwarded to ``predict_news``.

    Returns:
        The verdict string from ``predict_news``, or a message describing why
        no verdict could be produced (status code other than 200, no paragraph
        text on the page, or a ``requests`` error while fetching).
    """
    # A browser-like User-Agent is sent with the request.
    # NOTE(review): presumably because some sites refuse the default client -- confirm.
    browser_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    try:
        page = requests.get(url, headers=browser_headers, timeout=10)
        if page.status_code != 200:
            return f"Could not retrieve article (Status code: {page.status_code})"

        parsed = BeautifulSoup(page.content, 'html.parser')
        # Only <p> elements are used as the article body.
        article_text = ' '.join(node.get_text() for node in parsed.find_all('p'))
        if article_text.strip():
            return predict_news(article_text, model_to_use, vectorizer_to_use)
        return "Could not find any article text on the page."
    except requests.RequestException as e:
        return f"An error occurred while fetching the URL: {e}"

# --- Helper Function 4: The Source Checker ---
def check_source_credibility(url):
    """Report whether a URL's domain is on the unreliable-source list.

    The domain is rebuilt from the ``domain`` and ``suffix`` parts returned by
    ``tldextract.extract`` (the subdomain part, e.g. ``www``, is not used) and
    compared, ignoring case, with the entries of ``UNRELIABLE_DOMAINS``.

    Args:
        url: The article URL (or bare host name) to inspect.

    Returns:
        A one-line, human-readable verdict string: a red warning when the
        domain is listed, a green note when it is not, or a neutral note when
        no domain could be extracted at all.
    """
    extracted = tldextract.extract(url)
    # Join only the parts that are present. Hosts without a public suffix
    # (e.g. 'localhost' or a placeholder string) would otherwise be reported
    # with a dangling trailing dot.
    present_parts = [part for part in (extracted.domain, extracted.suffix) if part]
    domain = ".".join(present_parts).lower()

    if not domain:
        return f"⚪ Could not determine a domain from '{url}'."

    # Normalise the list side as well so the match does not depend on how an
    # entry happens to be capitalised.
    if domain in {entry.lower() for entry in UNRELIABLE_DOMAINS}:
        return f"🔴 Warning: The source '{domain}' is on a list of known unreliable or satire websites."
    return f"🟢 Source '{domain}' not found on our basic unreliable list."

print("✅ All helper functions have been updated!")

In [None]:
# Domains that the source checker reports as unreliable or satire.
# Entries are bare "domain.suffix" strings, one per line.
UNRELIABLE_DOMAINS = [
    'theonion.com',
    'clickhole.com',
    'infowars.com',
    'naturalnews.com',
    'worldnewsdailyreport.com',
]

# The main function that uses all the helpers
def get_final_verdict(url, model_to_use, vectorizer_to_use):
    """Print a two-part report for one URL.

    First the verdict of ``check_source_credibility`` for the URL is printed,
    then the verdict of ``predict_from_link`` for the page text. Nothing is
    returned; the report is written with ``print`` only.

    Args:
        url: Address of the article to analyse.
        model_to_use: Classifier forwarded to ``predict_from_link``.
        vectorizer_to_use: Vectorizer forwarded to ``predict_from_link``.
    """
    print(f"--- Analyzing URL: {url} ---")

    # Part 1: look the source domain up in the unreliable list.
    print(f"Source Analysis: {check_source_credibility(url)}")

    # Part 2: fetch the page and classify its text.
    print(f"Text Analysis: {predict_from_link(url, model_to_use, vectorizer_to_use)}\n")


# --- Let's Test the Full System! ---
# Two sample links: one from a mainstream news site and one from a satire site
# that is on the unreliable list. Both are analysed with the LightGBM model and
# the TF-IDF vectorizer fitted earlier in the notebook.
real_news_url = "https://www.reuters.com/world/us/biden-will-seek-release-wsj-reporter-gershkovich-putin-meeting-2024-06-13/"
satire_url = "https://www.theonion.com/frustrated-man-cant-believe-he-has-to-keep-remembering-1851296562"

# Report for the real news link
get_final_verdict(real_news_url, lgbm_model, vectorizer)

# Visual separator between the two reports
print("\n" + "="*50 + "\n")

# Report for the satire link
get_final_verdict(satire_url, lgbm_model, vectorizer)

In [None]:
#
# THIS IS YOUR INPUT BOX
#
# 1. Find a news article URL online.
# 2. Paste it between the quotation marks below.
# 3. Run this cell to get the analysis.
#

my_url_to_test = "Link_to_test"


# --- This code will run the analysis on the link you provided above ---
# Only full http(s) URLs can be fetched. Anything else (including the untouched
# placeholder above) would only produce a fetch error next to a source verdict
# for something that is not a website, so ask for a real link instead.
url_to_analyze = my_url_to_test.strip()
if url_to_analyze.lower().startswith(("http://", "https://")):
    get_final_verdict(url_to_analyze, lgbm_model, vectorizer)
else:
    print("⚠️ Please paste a full article URL (starting with http:// or https://) into my_url_to_test and run this cell again.")