In [6]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords


In [7]:
# Download stopwords if not already
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Step 2: Load datasets
true_df = pd.read_csv("/content/True.csv")
fake_df = pd.read_csv("/content/Fake.csv")


In [9]:
# Add labels
true_df['label'] = 1  # Real news
fake_df['label'] = 0  # Fake news

# Combine datasets
df = pd.concat([true_df, fake_df], ignore_index=True)

# Optional: Shuffle the dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)



In [10]:
# Step 3: Preprocess the text
def clean_text(text):
    text = text.lower()
    text = "".join([char for char in text if char not in string.punctuation])
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

df['clean_text'] = df['text'].apply(clean_text)


In [11]:
# Step 4: Vectorize using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])

# Labels
y = df['label']


In [14]:
# Step 5: Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
# Step 6: Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": LinearSVC()
}



In [15]:
# Step 7: Train and evaluate each model
for name, model in models.items():
    print(f"\n=== {name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")



=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4669
           1       0.98      0.99      0.99      4311

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

Accuracy: 0.9878

=== Naive Bayes ===
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      4669
           1       0.93      0.95      0.94      4311

    accuracy                           0.94      8980
   macro avg       0.94      0.94      0.94      8980
weighted avg       0.94      0.94      0.94      8980

Accuracy: 0.9438

=== Random Forest ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4669
           1       1.00      1.00      1.00      4311

    accuracy                           1.00      8980
   macro avg       1.

In [16]:
# Step 8: Ensemble model - Voting Classifier
ensemble = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(max_iter=1000)),
        ('nb', MultinomialNB()),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('svm', LinearSVC())
    ],
    voting='hard'
)

print("\n=== Ensemble Voting Classifier ===")
ensemble.fit(X_train, y_train)
y_pred_ensemble = ensemble.predict(X_test)
print(classification_report(y_test, y_pred_ensemble))
print(f"Accuracy: {accuracy_score(y_test, y_pred_ensemble):.2f}")


=== Ensemble Voting Classifier ===
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4669
           1       0.99      0.99      0.99      4311

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

Accuracy: 0.99


In [18]:
import pickle

# Save the trained model
with open("fake_news_model.pkl", "wb") as f:
    pickle.dump(model, f)

# Save the TF-IDF vectorizer as "scaler.pkl"
with open("scaler.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("✅ Model saved as fake_news_model.pkl and vectorizer saved as scaler.pkl")


✅ Model saved as fake_news_model.pkl and vectorizer saved as scaler.pkl
