In [1]:
# Install third-party packages with pip through an IPython shell escape.
# NOTE(review): versions are unpinned, and `%pip install` is the form that
# targets the running kernel's environment -- consider switching. `flask` and
# `numpy` are not imported by any cell visible here; confirm they are needed.
!pip install pandas numpy scikit-learn flask joblib nltk


Defaulting to user installation because normal site-packages is not writeable


In [2]:
#Load and Combine Dataset
import pandas as pd

# Load the two source files; each one holds articles of a single class.
fake = pd.read_csv("Fake.csv")
real = pd.read_csv("True.csv")

# Attach the target label to every row before the frames are merged.
fake['label'] = 0  # Fake news
real['label'] = 1  # Real news

# Stack the frames. ignore_index=True gives the result a fresh 0..n-1 index;
# without it both halves keep their own 0-based labels, so index values repeat
# and label-based lookups such as df.loc[0] return two rows instead of one.
df = pd.concat([fake, real], axis=0, ignore_index=True)
df = df[['text', 'label']]  # Only use 'text' and 'label'

# Show first few rows
df.head()


Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [3]:
#Text Cleaning and Preprocessing
import string
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus only when it is not already installed, so
# re-running this cell neither needs network access nor repeats the download.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Define cleaning function
# Translation table that deletes every ASCII punctuation character; built once
# here instead of filtering character by character on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Holds the NLTK English stopword set after it has been loaded the first time.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text, stop_words=None):
    """Normalise one document for vectorisation.

    The text is lowercased, ASCII punctuation (``string.punctuation``) is
    removed, the result is split on whitespace, and stopwords are dropped.

    Parameters
    ----------
    text : str
        Raw document. Any non-string value (for example ``None`` or the NaN
        that pandas uses for a missing cell) is treated as an empty document.
    stop_words : container of str, optional
        Words to discard, compared against the lowercased, punctuation-free
        tokens. Defaults to NLTK's English stopword list, which is loaded on
        first use and then reused for every later call.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``''`` if none remain.
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        # Loading the corpus per call is what made row-wise .apply slow.
        stop_words = _english_stop_words()
    words = text.lower().translate(_PUNCTUATION_TABLE).split()
    return ' '.join(word for word in words if word not in stop_words)

# Run every article body through clean_text and store the result alongside it
raw_articles = df['text']
df['clean_text'] = raw_articles.apply(clean_text)

# Preview the raw text next to its cleaned form and label
preview_columns = ['text', 'clean_text', 'label']
df[preview_columns].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\syeds\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Unnamed: 0,text,clean_text,label
0,Donald Trump just couldn t wish all Americans ...,donald trump wish americans happy new year lea...,0
1,House Intelligence Committee Chairman Devin Nu...,house intelligence committee chairman devin nu...,0
2,"On Friday, it was revealed that former Milwauk...",friday revealed former milwaukee sheriff david...,0
3,"On Christmas day, Donald Trump announced that ...",christmas day donald trump announced would bac...,0
4,Pope Francis used his annual Christmas Day mes...,pope francis used annual christmas day message...,0


In [4]:
#Convert Text to Numbers
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize clean text using TF-IDF, capping the vocabulary at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)
# fit_transform returns a SciPy sparse matrix and it is kept sparse on purpose:
# calling .toarray() would allocate a dense rows x 5000 float64 array (about
# 1.8 GB for the ~45k rows implied by the 8,980-row test split), while
# train_test_split and MultinomialNB both accept sparse input directly.
X = vectorizer.fit_transform(df['clean_text'])
y = df['label']

# NOTE(review): the vectorizer is fitted on all rows before the split, so its
# vocabulary and IDF weights also reflect the test rows. Consider splitting
# the text first and fitting on the training part only.
# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Define and apply the text cleaning function
# NOTE: a function named clean_text is also defined in the earlier
# "Text Cleaning and Preprocessing" cell; running this cell replaces it.
# Keep the two definitions in sync, or delete one of them.

# Translation table that deletes every ASCII punctuation character; built once
# here instead of filtering character by character on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Holds the NLTK English stopword set after it has been loaded the first time.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text, stop_words=None):
    """Normalise one document for vectorisation.

    The text is lowercased, ASCII punctuation (``string.punctuation``) is
    removed, the result is split on whitespace, and stopwords are dropped.

    Parameters
    ----------
    text : str
        Raw document. Any non-string value (for example ``None`` or the NaN
        that pandas uses for a missing cell) is treated as an empty document.
    stop_words : container of str, optional
        Words to discard, compared against the lowercased, punctuation-free
        tokens. Defaults to NLTK's English stopword list, which is loaded on
        first use and then reused for every later call.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``''`` if none remain.
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        # Loading the corpus per call is what made row-wise .apply slow.
        stop_words = _english_stop_words()
    words = text.lower().translate(_PUNCTUATION_TABLE).split()
    return ' '.join(word for word in words if word not in stop_words)

# Recompute the cleaned column from the raw article text
article_bodies = df['text']
df['clean_text'] = article_bodies.apply(clean_text)

# Show the original text, its cleaned version, and the label for the first rows
columns_to_show = ['text', 'clean_text', 'label']
df[columns_to_show].head()


Unnamed: 0,text,clean_text,label
0,Donald Trump just couldn t wish all Americans ...,donald trump wish americans happy new year lea...,0
1,House Intelligence Committee Chairman Devin Nu...,house intelligence committee chairman devin nu...,0
2,"On Friday, it was revealed that former Milwauk...",friday revealed former milwaukee sheriff david...,0
3,"On Christmas day, Donald Trump announced that ...",christmas day donald trump announced would bac...,0
4,Pope Francis used his annual Christmas Day mes...,pope francis used annual christmas day message...,0


In [6]:
#Vectorize the text using TF-IDF and split the data:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorization, capping the vocabulary at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)
# fit_transform returns a SciPy sparse matrix and it is kept sparse on purpose:
# calling .toarray() would allocate a dense rows x 5000 float64 array (about
# 1.8 GB for the ~45k rows implied by the 8,980-row test split), while
# train_test_split and MultinomialNB both accept sparse input directly.
X = vectorizer.fit_transform(df['clean_text'])
y = df['label']

# NOTE(review): the vectorizer is fitted on all rows before the split, so its
# vocabulary and IDF weights also reflect the test rows. Consider splitting
# the text first and fitting on the training part only.
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
#Train the Machine Learning Model
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Fit a multinomial Naive Bayes classifier on the training split
# (fit returns the estimator itself, so construction and training chain)
model = MultinomialNB().fit(X_train, y_train)

# Label the held-out split
y_pred = model.predict(X_test)

# Compute overall accuracy and the per-class breakdown, then report both
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)
print("✅ Accuracy:", test_accuracy)
print("\n📊 Classification Report:\n", per_class_report)


✅ Accuracy: 0.9462138084632516

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.94      0.95      4733
           1       0.94      0.95      0.94      4247

    accuracy                           0.95      8980
   macro avg       0.95      0.95      0.95      8980
weighted avg       0.95      0.95      0.95      8980



In [8]:
#Save the Model & Vectorizer for Deployment
import joblib

# Write the classifier and then the TF-IDF vectorizer to disk with joblib
artifacts = (
    (model, "fake_news_model.pkl"),
    (vectorizer, "vectorizer.pkl"),
)
for fitted_object, target_file in artifacts:
    joblib.dump(fitted_object, target_file)

print("✅ Model and vectorizer saved successfully!")


✅ Model and vectorizer saved successfully!
