In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import re
import joblib
import string

In [None]:
import zipfile
import os

# Where the downloaded archive lives and which folder it is unpacked into.
zip_path = "fake-and-real-news-dataset.zip"
extract_path = "news_dataset"

# Unpack every member of the archive into `extract_path`; the context manager
# closes the archive handle once extraction is done.
with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extract_path)

# Last expression of the cell: show the top-level entries that were unpacked.
os.listdir(extract_path)


['News _dataset']

In [None]:
import pandas as pd

# Read the two CSV files found under the extraction folder.
fake_df = pd.read_csv(os.path.join(extract_path, "News _dataset/Fake.csv"))
real_df = pd.read_csv(os.path.join(extract_path, "News _dataset/True.csv"))

# Tag every row with its class: 0 = fake article, 1 = real article.
fake_df["label"], real_df["label"] = 0, 1

# Stack both frames, shuffle all rows with a fixed seed so the order is
# reproducible, and renumber the index from 0.
df = (
    pd.concat([fake_df, real_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",0
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",1
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",1
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",0
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",1


In [None]:
# Drop the columns that are not used further on in this notebook.
# Because this cell rebinds `df`, a second execution would find the columns
# already removed; errors="ignore" keeps the cell safe to re-run instead of
# raising KeyError.
df = df.drop(columns=["title", "subject", "date"], errors="ignore")

# Preview the remaining columns.
df.head()

Unnamed: 0,text,label
0,"21st Century Wire says Ben Stein, reputable pr...",0
1,WASHINGTON (Reuters) - U.S. President Donald T...,1
2,(Reuters) - Puerto Rico Governor Ricardo Rosse...,1
3,"On Monday, Donald Trump once again embarrassed...",0
4,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",1


In [None]:
# Note: stop words are NOT removed by clean_text.

def clean_text(text):
    """Normalise one raw article string into lowercase, space-separated words.

    Steps, in the order they are applied (the order matters):

    1. Lowercase the text.
    2. Remove ``<...>`` tags and ``[...]`` bracketed spans. This has to happen
       while the delimiter characters are still present, i.e. before the
       generic non-word replacement in step 4.
    3. Remove URLs (anything starting with ``http`` or ``www`` up to the next
       whitespace).
    4. Replace every remaining non-word character with a space, then delete
       underscores (the only ``string.punctuation`` character that ``\\W``
       does not cover).
    5. Remove every token that contains a digit.
    6. Collapse whitespace runs to a single space and strip both ends. This is
       done last so the removals above cannot leave stray or leading spaces.

    Parameters
    ----------
    text : str
        Raw text of one article. Any non-string value (e.g. ``None`` or a NaN
        from a missing cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``""`` when nothing is left or the input was not a
        string.
    """
    # Missing cells are not strings; treat them as empty text rather than
    # failing on `.lower()`.
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Markup and bracketed spans first: their delimiters would otherwise be
    # turned into spaces below and these patterns could never match.
    text = re.sub(r"<.*?>+", "", text)
    text = re.sub(r"\[.*?\]", "", text)

    # URLs ("http\S+" also covers "https...").
    text = re.sub(r"http\S+|www\S+", "", text)

    # Non-word characters become separators; whatever punctuation is still
    # left afterwards (only "_") is deleted.
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"[%s]" % re.escape(string.punctuation), "", text)

    # Drop tokens containing digits, e.g. "21st" or "2017".
    text = re.sub(r"\w*\d\w*", "", text)

    # Normalise whitespace last so earlier removals leave no gaps behind.
    return re.sub(r"\s+", " ", text).strip()

# Run the cleaner over every article, replacing the raw text column in place.
df["text"] = df["text"].map(clean_text)

# Preview the cleaned text.
df.head()


Unnamed: 0,text,label
0,century wire says ben stein reputable profess...,0
1,washington reuters u s president donald trump ...,1
2,reuters puerto rico governor ricardo rossello ...,1
3,on monday donald trump once again embarrassed ...,0
4,glasgow scotland reuters most u s presidential...,1


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up in each partition.
print(f"Train: {len(X_train)}, Test: {len(X_test)}")


Train: 35918, Test: 8980


In [None]:
# Fit the TF-IDF vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts (the test split is never fitted on).
vectorizer = TfidfVectorizer()
X_train_tokens, X_test_tokens = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


In [None]:
# Logistic regression with default settings, trained on the TF-IDF features.
logreg = LogisticRegression()
logreg.fit(X=X_train_tokens, y=y_train)

In [None]:
# Predict labels for the held-out texts and compare them with the true labels.
predictions = logreg.predict(X_test_tokens)
lr_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Logistic Regression Accuracy: {lr_accuracy}")


Logistic Regression Accuracy: 0.984521158129176


In [None]:
# Per-class precision, recall, F1 and support on the held-out split.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4710
           1       0.98      0.99      0.98      4270

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980

