In [1]:
!pip install kagglehub scikit-learn pandas joblib

Defaulting to user installation because normal site-packages is not writeable
Collecting kagglehub
  Downloading kagglehub-0.4.2-py3-none-any.whl.metadata (38 kB)
Collecting kagglesdk<1.0,>=0.1.14 (from kagglehub)
  Downloading kagglesdk-0.1.15-py3-none-any.whl.metadata (13 kB)
Collecting protobuf (from kagglesdk<1.0,>=0.1.14->kagglehub)
  Downloading protobuf-6.33.5-cp310-abi3-win_amd64.whl.metadata (593 bytes)
Downloading kagglehub-0.4.2-py3-none-any.whl (69 kB)
Downloading kagglesdk-0.1.15-py3-none-any.whl (160 kB)
Downloading protobuf-6.33.5-cp310-abi3-win_amd64.whl (437 kB)
Installing collected packages: protobuf, kagglesdk, kagglehub

   ---------------------------------------- 0/3 [protobuf]
   ---------------------------------------- 0/3 [protobuf]
   ---------------------------------------- 0/3 [protobuf]
   ---------------------------------------- 0/3 [protobuf]
   ---------------------------------------- 0/3 [protobuf]
   ---------------------------------------- 0/3 [protobu

In [2]:
import pandas as pd
import numpy as np
import re
import joblib
import kagglehub

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [3]:
path = kagglehub.dataset_download(
    "clmentbisaillon/fake-and-real-news-dataset"
)

print("Path to dataset files:", path)


Downloading from https://www.kaggle.com/api/v1/datasets/download/clmentbisaillon/fake-and-real-news-dataset?dataset_version_number=1...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 41.0M/41.0M [00:03<00:00, 11.4MB/s]

Extracting files...





Path to dataset files: C:\Users\aadar\.cache\kagglehub\datasets\clmentbisaillon\fake-and-real-news-dataset\versions\1


In [4]:
fake_df = pd.read_csv(f"{path}/Fake.csv")
true_df = pd.read_csv(f"{path}/True.csv")

fake_df["label"] = 0   # Fake news
true_df["label"] = 1   # Real news

df = pd.concat([fake_df, true_df], axis=0)
df = df.sample(frac=1).reset_index(drop=True)

df.head()


Unnamed: 0,title,text,subject,date,label
0,U.S. judge in California blocks Trump's order ...,(Reuters) - A federal court judge in Californi...,politicsNews,"November 21, 2017",1
1,Donna Brazile CRUSHES Karl Rove For Making ‘J...,This is why Republicans lost the black vote an...,News,"June 6, 2016",0
2,SAY WHAT? Organization With Ties To Muslim Bro...,Just another attempt by CAIR to make it appear...,politics,"Nov 2, 2015",0
3,Fact Checkers Clear Hillary Of ‘Big Oil Bribes’,Among the slew of baseless attacks launched ag...,News,"April 2, 2016",0
4,FLASHBACK: BERNIE SANDERS’ Socialist Democrat ...,If I found out my college-age daughter was att...,politics,"Jan 23, 2016",0


In [5]:
def clean_text(text):
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [6]:
df["clean_text"] = df["text"].apply(clean_text)


In [7]:
X = df["clean_text"]
y = df["label"]

vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.7,
    min_df=2
)

X_vec = vectorizer.fit_transform(X)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y,
    test_size=0.2,
    random_state=42
)


In [9]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [10]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [11]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Accuracy: 0.9867483296213808

Classification Report:

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4734
           1       0.99      0.99      0.99      4246

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [15]:
joblib.dump(vectorizer, "fake_vectorizer.pkl")
joblib.dump(model, "fake_news_model.pkl")

print("✅ Model and vectorizer saved successfully")


✅ Model and vectorizer saved successfully


In [13]:
def predict_news(text):
    text = clean_text(text)
    vec = vectorizer.transform([text])
    pred = model.predict(vec)[0]
    prob = model.predict_proba(vec).max()

    return {
        "label": "Fake" if pred == 0 else "True",
        "confidence": round(float(prob), 3)
    }

predict_news("Breaking: Scientists confirm water found on Mars")


{'label': 'Fake', 'confidence': 0.887}

In [14]:
import sklearn
sklearn.__version__


'1.5.1'