In [9]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download necessary NLTK resources (first-time setup)
# nltk.download('stopwords')
# nltk.download('punkt')

# Load datasets
fake_df = pd.read_csv("news/Fake.csv")
true_df = pd.read_csv("news/True.csv")

# Add labels (1 = Fake, 0 = Real)
fake_df["label"] = 1
true_df["label"] = 0

# Combine datasets and shuffle
df = pd.concat([fake_df, true_df], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


In [10]:
# Initialize stemmer and stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Tokenization
    words = word_tokenize(text)
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Stemming (reducing words to root form)
    words = [stemmer.stem(word) for word in words]
    return " ".join(words)

# Apply preprocessing to dataset
df["clean_text"] = df["text"].apply(preprocess_text)

# Display sample
print(df[["text", "clean_text", "label"]].head())


                                                text  \
0  21st Century Wire says Ben Stein, reputable pr...   
1  WASHINGTON (Reuters) - U.S. President Donald T...   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...   
3  On Monday, Donald Trump once again embarrassed...   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...   

                                          clean_text  label  
0  21st centuri wire say ben stein reput professo...      1  
1  washington reuter us presid donald trump remov...      0  
2  reuter puerto rico governor ricardo rossello s...      0  
3  monday donald trump embarrass countri accident...      1  
4  glasgow scotland reuter us presidenti candid g...      0  


In [11]:
df.to_csv("cleaned_news.csv", index=False)


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load preprocessed dataset
df = pd.read_csv("cleaned_news.csv")
# Drop any rows with NaN values in the text column
df = df.dropna(subset=["clean_text"])
# Reset index after dropping rows
df = df.reset_index(drop=True)

print(f"Cleaned dataset size: {df.shape[0]} samples")

# Split into features (X) and labels (y)
X = df["clean_text"]  # Preprocessed text
y = df["label"]  # 1 = Fake, 0 = Real

# Convert text into TF-IDF vectors
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))  # Unigrams + Bigrams
X_tfidf = vectorizer.fit_transform(X)

# Split into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")


Cleaned dataset size: 44266 samples
Training samples: 35412, Test samples: 8854


In [4]:
# Initialize Logistic Regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation metrics
print(f"🔹 Accuracy: {accuracy:.4f}")
print(f"🔹 Precision: {precision:.4f}")
print(f"🔹 Recall: {recall:.4f}")
print(f"🔹 F1 Score: {f1:.4f}")


🔹 Accuracy: 0.9880
🔹 Precision: 0.9894
🔹 Recall: 0.9872
🔹 F1 Score: 0.9883


In [5]:
import joblib

# Save model and vectorizer
joblib.dump(model, "fake_news_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("✅ Model and vectorizer saved!")


✅ Model and vectorizer saved!


In [7]:
import string
def predict_fake_news(text):
    # Load saved model and vectorizer
    model = joblib.load("fake_news_model.pkl")
    vectorizer = joblib.load("tfidf_vectorizer.pkl")

    # Preprocess the input text
    clean_text = text.lower().translate(str.maketrans("", "", string.punctuation))

    # Convert text to TF-IDF vector
    text_tfidf = vectorizer.transform([clean_text])

    # Make prediction
    prediction = model.predict(text_tfidf)[0]

    return "Fake News" if prediction == 1 else "Real News"

# Example usage
test_article = "Breaking: Scientists discover a new planet that could sustain life!"
print(f"📰 Prediction: {predict_fake_news(test_article)}")


📰 Prediction: Fake News
