In [96]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [97]:
# Load the fake and true datasets from their CSV files
fake_data, true_data = (
    pd.read_csv(csv_path) for csv_path in ("Fake.csv", "True.csv")
)

In [98]:
# Preprocess text (removing special characters, extra spaces)
def clean_text(text):
    """Normalize a pandas Series of raw text.

    Steps, applied element-wise through the ``.str`` accessor:
      1. delete every character that is neither a word character nor whitespace
         (note: ``\\w`` keeps letters, digits and the underscore),
      2. collapse each run of whitespace (spaces, tabs, newlines) to one space,
      3. lowercase,
      4. strip leading/trailing whitespace.

    Parameters
    ----------
    text : pd.Series
        Series of strings. Missing entries stay missing, so callers can still
        drop them afterwards.

    Returns
    -------
    pd.Series
        A new Series with the same index; the input is not modified.
    """
    return (
        text.str.replace(r"[^\w\s]", "", regex=True)  # Remove special characters
        # Deleting punctuation such as " - " leaves double spaces behind, and raw
        # text may contain newlines/tabs; squeeze all of those to a single space.
        .str.replace(r"\s+", " ", regex=True)
        .str.lower()  # Convert to lowercase
        .str.strip()  # Remove leading/trailing spaces
    )

# Replace the "text" column of each dataset with its cleaned version
fake_data = fake_data.assign(text=clean_text(fake_data["text"]))
true_data = true_data.assign(text=clean_text(true_data["text"]))

In [99]:
# Tag every row with its class label: fake -> 1, real -> 0
fake_data = fake_data.assign(label=1)
true_data = true_data.assign(label=0)

In [100]:
# Stack both labelled frames row-wise and renumber the rows from 0
data = pd.concat([fake_data, true_data], ignore_index=True)

In [101]:
# Randomly reorder all rows (fixed seed) so fake and real samples are mixed,
# then renumber the rows from 0
data = (
    data
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [102]:
# Drop rows whose "text" is missing (NaN).
# Rebinding instead of inplace=True keeps the step chainable and avoids mutating
# a frame that other names might still reference.
# NOTE: only NaN is removed here. A text that was blank or whitespace-only is ""
# after clean_text's strip(), is not NaN, and therefore stays in the data.
data = data.dropna(subset=["text"])

In [103]:
# Features come from the "text" column, targets from the "label" column
X = data["text"]
y = data["label"]

In [104]:
# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same fake/real ratio, and fix the seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [105]:
# TF-IDF features: the vectorizer is fitted on the training split only and the
# fitted instance is then reused, unchanged, to transform the test split
vectorizer = TfidfVectorizer(
    max_df=0.7,
    stop_words="english",
)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [106]:
# Candidate classifiers, keyed by the display name printed in the reports
models = dict(
    [
        ("LinearSVC", LinearSVC(random_state=42)),
        ("Naive Bayes", MultinomialNB()),
        ("Logistic Regression", LogisticRegression(random_state=42, max_iter=1000)),
    ]
)

In [107]:
# Train and evaluate each model
# Every entry of `models` is fitted on the training split and scored on the
# held-out split; results are printed, nothing is stored besides the fitted models.
for model_name, model in models.items():
    print(f"Training {model_name}...")
    # Fit on the TF-IDF matrix built from the training split
    model.fit(X_train_vectorized, y_train)
    # Predict on the test split, transformed with the same fitted vectorizer
    predictions = model.predict(X_test_vectorized)
    # Per-class precision/recall/F1 (label 1 = fake, label 0 = real)
    print(f"{model_name} Classification Report:\n{classification_report(y_test, predictions)}")
    # Rows are the true labels, columns the predicted labels, both ordered 0 then 1
    print(f"{model_name} Confusion Matrix:\n{confusion_matrix(y_test, predictions)}")

Training LinearSVC...
LinearSVC Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      4284
           1       1.00      0.99      1.00      4696

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

LinearSVC Confusion Matrix:
[[4272   12]
 [  24 4672]]
Training Naive Bayes...
Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      4284
           1       0.96      0.94      0.95      4696

    accuracy                           0.95      8980
   macro avg       0.95      0.95      0.95      8980
weighted avg       0.95      0.95      0.95      8980

Naive Bayes Confusion Matrix:
[[4102  182]
 [ 285 4411]]
Training Logistic Regression...
Logistic Regression Classification Report:
              precision    recall  f1-score  

In [108]:
# Test the models on new text
def predict_new_text(file_path, vectorizer, models):
    """Classify the contents of one text file with every model and print the results.

    Parameters
    ----------
    file_path : str
        Path of a UTF-8 text file; its entire contents are treated as one document.
    vectorizer
        Fitted vectorizer exposing ``transform``.
    models : dict
        Maps a display name to a fitted estimator exposing ``predict``.

    Returns
    -------
    None
        One line per model is printed: ``FAKE`` when the predicted label is 1,
        ``REAL`` otherwise.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()

    # Wrap the document in a one-element Series so it goes through the same
    # cleaning as the training data, then turn it into a feature row
    features = vectorizer.transform(clean_text(pd.Series(raw_text)))

    for model_name, model in models.items():
        prediction = model.predict(features)
        print(f"{model_name} Prediction: {'FAKE' if prediction[0] == 1 else 'REAL'}")

In [109]:
# Classify the contents of mytest.txt with every trained model
predict_new_text(file_path="mytest.txt", vectorizer=vectorizer, models=models)

LinearSVC Prediction: REAL
Naive Bayes Prediction: REAL
Logistic Regression Prediction: REAL
