<a href="https://colab.research.google.com/github/alwaysalearner1234/ML01/blob/main/ML04Fake_News_Detection_using_NLP_%2B_ML_Classifiers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re, string

# -------------------------------------------------
# 1. Apna Dataset banalo (chhota demo)
# -------------------------------------------------
data = {
    "text": [
        "The government announced a new healthcare policy.",   # Real
        "Aliens landed in New York and took over the White House!", # Fake
        "Scientists discovered a cure for cancer after 20 years of research.", # Real
        "Celebrity found alive on Mars after disappearing 10 years ago.", # Fake
        "Stock markets are showing positive growth this quarter.", # Real
        "Man claims to have a time machine built in his garage.", # Fake
        "Schools to reopen from next Monday after holidays.", # Real
        "Dinosaurs seen alive in the Amazon rainforest.", # Fake
    ],
    "label": [1, 0, 1, 0, 1, 0, 1, 0]  # 1 = Real, 0 = Fake
}

df = pd.DataFrame(data)

# -------------------------------------------------
# 2. Preprocess function
# -------------------------------------------------
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    return text.strip()

df["text"] = df["text"].apply(clean_text)

# -------------------------------------------------
# 3. Features & Labels
# -------------------------------------------------
X, y = df["text"], df["label"]

# -------------------------------------------------
# 4. TF-IDF
# -------------------------------------------------
tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf.fit_transform(X)

# -------------------------------------------------
# 5. Train Model
# -------------------------------------------------
model = LogisticRegression()
model.fit(X_tfidf, y)

# -------------------------------------------------
# 6. Evaluate
# -------------------------------------------------
y_pred = model.predict(X_tfidf)

print("\n✅ Accuracy:", accuracy_score(y, y_pred))
print("\n✅ Classification Report:\n", classification_report(y, y_pred))
print("\n✅ Confusion Matrix:\n", confusion_matrix(y, y_pred))

# -------------------------------------------------
# 7. Custom Prediction
# -------------------------------------------------
def predict_news(news_text):
    news_text = clean_text(news_text)
    vectorized = tfidf.transform([news_text])
    prediction = model.predict(vectorized)[0]
    return "✅ Real News" if prediction == 1 else "❌ Fake News"

# Test karo
print("\nCustom Test 1:", predict_news("NASA announced a new mission to the Moon."))
print("Custom Test 2:", predict_news("Aliens have captured the Eiffel Tower."))


✅ Accuracy: 1.0

✅ Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         4

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8


✅ Confusion Matrix:
 [[4 0]
 [0 4]]

Custom Test 1: ✅ Real News
Custom Test 2: ❌ Fake News
