In [2]:
# Import libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load datasets
# Use 'engine="python"' and 'on_bad_lines="skip"' to handle potential parsing errors in the CSV files.
# This is particularly useful for files with inconsistent quoting or newlines within fields.
try:
    fake = pd.read_csv('Fake.csv', engine='python', on_bad_lines='skip')
    real = pd.read_csv('True.csv', engine='python', on_bad_lines='skip')
except FileNotFoundError:
    print("Error: Dataset files 'Fake.csv' or 'True.csv' not found.")
    print("Please ensure the files are in the correct directory.")
    # Exit the script or handle the error as needed
    exit()


# Add labels: 0 = FAKE, 1 = REAL
fake['label'] = 0
real['label'] = 1

# Combine and shuffle
data = pd.concat([fake, real])
data = data.sample(frac=1).reset_index(drop=True)

# Clean text
def clean_text(text):
    # Ensure text is a string before applying regex
    if isinstance(text, str):
        text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove non-alphabetic characters
        text = text.lower()  # Convert to lowercase
    else:
        # Handle non-string values, e.g., convert to empty string
        text = ""
    return text


data['text'] = data['text'].apply(clean_text)

# Split features and labels
X = data['text']
y = data['label']

# Vectorize using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_vec = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# Train classifier
model = PassiveAggressiveClassifier(max_iter=50)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {acc*100:.2f}%")
print("Confusion Matrix:")
print(cm)

Accuracy: 99.39%
Confusion Matrix:
[[4725   28]
 [  27 4200]]
