In [2]:
import os

# Set Kaggle API key location
os.environ['KAGGLE_CONFIG_DIR'] = "C:\\Users\\Sarvadnya\\Project ML\\.kaggle"

# Verify by listing Kaggle files
!kaggle datasets list


ref                                                          title                                              size  lastUpdated          downloadCount  voteCount  usabilityRating  
-----------------------------------------------------------  ------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
asinow/car-price-dataset                                     Car Price Dataset                                 135KB  2025-01-26 19:53:28           5706         73  1.0              
anandshaw2001/netflix-movies-and-tv-shows                    Netflix Movies and TV Shows                         1MB  2025-01-03 10:33:01          15857        420  1.0              
andrexibiza/grocery-sales-dataset                            Grocery Sales Database                            223MB  2025-01-31 19:04:00           1549         32  1.0              
ashaychoudhary/diabetes-prediction-in-america-dataset        Diabetes Prediction in A

In [3]:
import zipfile
import os

# Define paths
zip_path = r"C:\Users\Sarvadnya\Project ML\Fake_News_Dataset.zip"
extract_path = r"C:\Users\Sarvadnya\Project ML"

# Extract the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Dataset extracted successfully!")


Dataset extracted successfully!


In [4]:
import os

# List extracted files
files = os.listdir(extract_path)
print("Extracted Files:", files)


Extracted Files: ['.ipynb_checkpoints', '.kaggle', 'FakeNewsDetection.ipynb', 'fake_and_real_news.csv', 'Fake_News_Dataset.zip']


In [5]:
import pandas as pd

# Define dataset path
dataset_path = r"C:\Users\Sarvadnya\Project ML\fake_and_real_news.csv"

# Load dataset
df = pd.read_csv(dataset_path)

# Display first 5 rows
df.head()


Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [6]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9900 entries, 0 to 9899
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    9900 non-null   object
 1   label   9900 non-null   object
dtypes: object(2)
memory usage: 154.8+ KB


In [7]:
import re
import string

# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    text = re.sub(r'\n', '', text)  # Remove newlines
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing numbers
    return text

# Apply text cleaning
df['clean_text'] = df['Text'].apply(clean_text)

# Show cleaned data
df[['Text', 'clean_text']].head()


Unnamed: 0,Text,clean_text
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,top trump surrogate brutally stabs him in the...
1,U.S. conservative leader optimistic of common ...,us conservative leader optimistic of common gr...
2,"Trump proposes U.S. tax overhaul, stirs concer...",trump proposes us tax overhaul stirs concerns ...
3,Court Forces Ohio To Allow Millions Of Illega...,court forces ohio to allow millions of illega...
4,Democrats say Trump agrees to work on immigrat...,democrats say trump agrees to work on immigrat...


In [8]:
from sklearn.preprocessing import LabelEncoder

# Convert Fake/Real to 0 and 1
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Display label encoding
df[['label', 'label_encoded']].drop_duplicates()


Unnamed: 0,label,label_encoded
0,Fake,0
1,Real,1


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Transform text data
X = tfidf_vectorizer.fit_transform(df['clean_text'])

# Target variable
y = df['label_encoded']

# Check the shape of transformed data
X.shape


(9900, 5000)

In [10]:
from sklearn.model_selection import train_test_split

# Split into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the training and testing sets
X_train.shape, X_test.shape


((7920, 5000), (1980, 5000))

In [11]:
from sklearn.linear_model import LogisticRegression

# Initialize the model
model = LogisticRegression()

# Train the model on the training set
model.fit(X_train, y_train)


In [12]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Display classification report
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(report)


Accuracy: 0.9914
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       973
           1       0.99      1.00      0.99      1007

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980



In [13]:
import joblib

# Save the trained model
model_filename = "C:/Users/Sarvadnya/Project ML/fake_news_model.pkl"
joblib.dump(model, model_filename)

print(f"Model saved successfully at {model_filename}")


Model saved successfully at C:/Users/Sarvadnya/Project ML/fake_news_model.pkl


In [14]:
!pip install flask joblib numpy pandas


Collecting flask
  Downloading flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting Werkzeug>=3.1 (from flask)
  Downloading werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Collecting itsdangerous>=2.2 (from flask)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Downloading flask-3.1.0-py3-none-any.whl (102 kB)
Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Downloading werkzeug-3.1.3-py3-none-any.whl (224 kB)
Installing collected packages: Werkzeug, itsdangerous, flask
Successfully installed Werkzeug-3.1.3 flask-3.1.0 itsdangerous-2.2.0


In [16]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Train on sample data (replace this with actual training data if needed)
sample_texts = ["Fake news example", "Real news example"]
vectorizer.fit(sample_texts)

# Save the vectorizer
joblib.dump(vectorizer, "C:/Users/Sarvadnya/Project ML/tfidf_vectorizer.pkl")

print("Vectorizer saved successfully!")


Vectorizer saved successfully!


In [17]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd

# Load dataset
df = pd.read_csv("fake_and_real_news.csv")

# Combine title and text for better results
df['content'] = df['title'] + " " + df['text']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['label'], test_size=0.2, random_state=42)

# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Logistic Regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Save the model and vectorizer
with open("fake_news_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("✅ Model and vectorizer saved successfully!")


KeyError: 'title'

In [18]:
import pandas as pd

# Load dataset
df = pd.read_csv("fake_and_real_news.csv")

# Show the first few rows and column names
print(df.head())
print("\nColumn Names:", df.columns)


                                                Text label
0   Top Trump Surrogate BRUTALLY Stabs Him In The...  Fake
1  U.S. conservative leader optimistic of common ...  Real
2  Trump proposes U.S. tax overhaul, stirs concer...  Real
3   Court Forces Ohio To Allow Millions Of Illega...  Fake
4  Democrats say Trump agrees to work on immigrat...  Real

Column Names: Index(['Text', 'label'], dtype='object')


In [20]:
df['content'] = df['Text']  # Use only the 'Text' column


In [21]:
print(df.head())  # Show the first few rows
print("\nColumn Names:", df.columns)  # Show all column names


                                                Text label  \
0   Top Trump Surrogate BRUTALLY Stabs Him In The...  Fake   
1  U.S. conservative leader optimistic of common ...  Real   
2  Trump proposes U.S. tax overhaul, stirs concer...  Real   
3   Court Forces Ohio To Allow Millions Of Illega...  Fake   
4  Democrats say Trump agrees to work on immigrat...  Real   

                                             content  
0   Top Trump Surrogate BRUTALLY Stabs Him In The...  
1  U.S. conservative leader optimistic of common ...  
2  Trump proposes U.S. tax overhaul, stirs concer...  
3   Court Forces Ohio To Allow Millions Of Illega...  
4  Democrats say Trump agrees to work on immigrat...  

Column Names: Index(['Text', 'label', 'content'], dtype='object')


In [22]:
df.drop(columns=['Text'], inplace=True)
print(df.head())  # Verify the changes


  label                                            content
0  Fake   Top Trump Surrogate BRUTALLY Stabs Him In The...
1  Real  U.S. conservative leader optimistic of common ...
2  Real  Trump proposes U.S. tax overhaul, stirs concer...
3  Fake   Court Forces Ohio To Allow Millions Of Illega...
4  Real  Democrats say Trump agrees to work on immigrat...


In [23]:
df.dropna(inplace=True)
print("Remaining rows after dropping missing values:", len(df))


Remaining rows after dropping missing values: 9900


In [25]:
df['content'] = df['content'].str.lower()


In [26]:
import re

df['content'] = df['content'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))


In [27]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

df['content'] = df['content'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sarvadnya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
!pip install scikit-learn   # Run this only if scikit-learn is not installed

from sklearn.feature_extraction.text import TfidfVectorizer


ERROR: Invalid requirement: '#': Expected package name at the start of dependency specifier
    #
    ^


In [29]:
!pip install scikit-learn





In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Use top 5000 words
X = vectorizer.fit_transform(df['content']).toarray()


In [31]:
print(X.shape)  # Check dimensions of transformed data
print(X[:5])    # Print first 5 rows of vectorized data


(9900, 5000)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [32]:
from sklearn.model_selection import train_test_split

y = df['label']  # Target labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train Set Shape:", X_train.shape)
print("Test Set Shape:", X_test.shape)


Train Set Shape: (7920, 5000)
Test Set Shape: (1980, 5000)


In [33]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

print("Model training completed!")


Model training completed!


In [34]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.9955

Classification Report:
              precision    recall  f1-score   support

        Fake       1.00      0.99      1.00       973
        Real       0.99      1.00      1.00      1007

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980



In [35]:
import joblib

# Save the trained model
joblib.dump(model, "C:/Users/Sarvadnya/Project ML/fake_news_model.pkl")

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, "C:/Users/Sarvadnya/Project ML/tfidf_vectorizer.pkl")

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [36]:
import joblib

# Save the trained model
joblib.dump(model, "C:/Users/Sarvadnya/Project ML/fake_news_model.pkl")

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, "C:/Users/Sarvadnya/Project ML/tfidf_vectorizer.pkl")

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [37]:
import numpy as np
import pandas as pd
import joblib
import sklearn
import flask  # If using Flask for deployment
import pickle


In [38]:
# Load saved model
loaded_model = joblib.load("C:/Users/Sarvadnya/Project ML/fake_news_model.pkl")

# Load saved vectorizer
loaded_vectorizer = joblib.load("C:/Users/Sarvadnya/Project ML/tfidf_vectorizer.pkl")

# Test prediction
sample_text = ["This is a test news article"]
sample_vectorized = loaded_vectorizer.transform(sample_text)
prediction = loaded_model.predict(sample_vectorized)

print("Prediction Output:", prediction)


Prediction Output: ['Fake']


In [39]:
sample_text = ["Breaking news: AI detects fake articles!"]
sample_vectorized = loaded_vectorizer.transform(sample_text)
prediction = loaded_model.predict(sample_vectorized)


In [40]:
pip install streamlit


Collecting streamlitNote: you may need to restart the kernel to use updated packages.

  Using cached streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Using cached streamlit-1.42.0-py2.py3-none-any.whl (9.6 MB)
Installing collected packages: streamlit
Successfully installed streamlit-1.42.0


In [41]:
import pickle

# Save the vectorizer again
vectorizer_path = "C:/Users/Sarvadnya/Project ML/tfidf_vectorizer.pkl"
with open(vectorizer_path, "wb") as file:
    pickle.dump(vectorizer, file)

print("✅ Vectorizer saved successfully!")


✅ Vectorizer saved successfully!


In [42]:
import pickle

# Define the path where the model.pkl file is stored
model_path = "C:/Users/Sarvadnya/Project ML/model.pkl"

try:
    # Try loading the model
    with open(model_path, "rb") as file:
        model = pickle.load(file)
    
    # Print model type to verify it is a classifier
    print("✅ Model Loaded Successfully:", type(model))

    # Check if model has a .predict() method
    if hasattr(model, 'predict'):
        print("✅ Model is ready for predictions!")
    else:
        print("❌ Model does NOT have a 'predict()' method. Resave the correct model.")

except Exception as e:
    print("🚨 Error loading model:", e)


🚨 Error loading model: [Errno 2] No such file or directory: 'C:/Users/Sarvadnya/Project ML/model.pkl'


In [43]:
import pickle

# Define the correct save path
model_path = "C:/Users/Sarvadnya/Project ML/model.pkl"

# Save the trained model again
with open(model_path, "wb") as file:
    pickle.dump(model, file)

print("✅ Model saved successfully at:", model_path)


✅ Model saved successfully at: C:/Users/Sarvadnya/Project ML/model.pkl


In [44]:
import pickle

# Define the path where the model.pkl file is stored
model_path = "C:/Users/Sarvadnya/Project ML/model.pkl"

try:
    # Try loading the model
    with open(model_path, "rb") as file:
        model = pickle.load(file)
    
    # Print model type to verify it is a classifier
    print("✅ Model Loaded Successfully:", type(model))

    # Check if model has a .predict() method
    if hasattr(model, 'predict'):
        print("✅ Model is ready for predictions!")
    else:
        print("❌ Model does NOT have a 'predict()' method. Resave the correct model.")

except Exception as e:
    print("🚨 Error loading model:", e)


✅ Model Loaded Successfully: <class 'sklearn.linear_model._logistic.LogisticRegression'>
✅ Model is ready for predictions!


In [45]:
import pickle

# Define the correct save path
model_path = "C:/Users/Sarvadnya/Project ML/model.pkl"

# Save the trained model again
with open(model_path, "wb") as file:
    pickle.dump(model, file)

print("✅ Model saved successfully at:", model_path)


✅ Model saved successfully at: C:/Users/Sarvadnya/Project ML/model.pkl


In [46]:
import pickle

# Load the model
model_path = "C:/Users/Sarvadnya/Project ML/model.pkl"
model = pickle.load(open(model_path, "rb"))

# Check the type of model
print(type(model))  # It should NOT be <class 'numpy.ndarray'>


<class 'sklearn.linear_model._logistic.LogisticRegression'>


In [47]:
import pickle

# Define the correct file path
model_path = "C:/Users/Sarvadnya/Project ML/model.pkl"

# Save the trained model
pickle.dump(model, open(model_path, "wb"))

print("✅ Model saved successfully at:", model_path)


✅ Model saved successfully at: C:/Users/Sarvadnya/Project ML/model.pkl


In [48]:
# Load the model
loaded_model = pickle.load(open(model_path, "rb"))

# Check the type of model
print(type(loaded_model))  # Expected output: <class 'sklearn.linear_model._logistic.LogisticRegression'>


<class 'sklearn.linear_model._logistic.LogisticRegression'>


In [49]:
import pickle

# Save the trained model
with open("C:/Users/Sarvadnya/Project ML/model.pkl", "wb") as file:
    pickle.dump(model, file)

print("Model saved successfully!")


Model saved successfully!


In [50]:
import pickle

# Load the saved model
with open("C:/Users/Sarvadnya/Project ML/model.pkl", "rb") as file:
    model = pickle.load(file)

print("Model loaded successfully!")


Model loaded successfully!


In [51]:
import pickle

# Load the saved model
with open("C:/Users/Sarvadnya/Project ML/model.pkl", "rb") as file:
    model = pickle.load(file)

print("Model loaded successfully!")


Model loaded successfully!


In [52]:
import pickle

# Load the model
with open("C:/Users/Sarvadnya/Project ML/model.pkl", "rb") as file:
    model = pickle.load(file)

# Load the vectorizer
with open("C:/Users/Sarvadnya/Project ML/tfidf_vectorizer.pkl", "rb") as file:
    vectorizer = pickle.load(file)

# Sample text input
input_text = "Breaking News: Stock market crashes due to economic downturn!"

# Transform the input
transformed_text = vectorizer.transform([input_text])

# Make the prediction
prediction = model.predict(transformed_text)[0]

print("Prediction:", prediction)


Prediction: Fake


In [53]:
import pickle

# Save the trained model
with open("C:/Users/Sarvadnya/Project ML/model.pkl", "wb") as file:
    pickle.dump(model, file)

print("Model saved successfully!")



Model saved successfully!


In [54]:
import pickle

# Load the saved model
with open("C:/Users/Sarvadnya/Project ML/model.pkl", "rb") as file:
    model = pickle.load(file)

print("Model loaded successfully!")


Model loaded successfully!


In [55]:
import pickle

# Load model
with open("C:/Users/Sarvadnya/Project ML/model.pkl", "rb") as file:
    model = pickle.load(file)

print(type(model))  # Check the type


<class 'sklearn.linear_model._logistic.LogisticRegression'>


In [56]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions on test data
y_pred = model.predict(X_test)

# Evaluate performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9954545454545455
Classification Report:
               precision    recall  f1-score   support

        Fake       1.00      0.99      1.00       973
        Real       0.99      1.00      1.00      1007

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980



In [57]:
# Test with a known Fake news sample
fake_news = ["Breaking: Celebrity endorses unverified product with false claims!"]
transformed_fake = vectorizer.transform(fake_news)
pred_fake = model.predict(transformed_fake)

print("Prediction for Fake News sample:", pred_fake)  # Expected: 0 (Fake)


Prediction for Fake News sample: ['Fake']
