In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import pickle
import os
import warnings

In [2]:
# Read each corpus and tag it in the same step: 1 marks true news, 0 marks fake news.
true_df = pd.read_csv('data/true.csv', low_memory=False).assign(label=1)
fake_df = pd.read_csv('data/fake.csv', low_memory=False).assign(label=0)

# Stack both corpora into a single frame; ignore_index renumbers the rows.
df = pd.concat([true_df, fake_df], ignore_index=True)

In [3]:
# Report the combined row count and the size of each source corpus, one per line.
print(
    f"Total samples: {len(df)}",
    f"True news: {len(true_df)}",
    f"Fake news: {len(fake_df)}",
    sep="\n",
)

Total samples: 54633
True news: 31131
Fake news: 23502


In [4]:
# Replace missing titles and bodies with empty strings so they can be
# concatenated safely in the next step.
df = df.assign(
    title=df['title'].fillna(''),
    text=df['text'].fillna(''),
)

In [5]:
# Combined text per row: the title, a single space, then the article body.
df['content'] = df['title'].add(' ').add(df['text'])

In [6]:
# Drop rows whose combined text is empty or whitespace-only. The explicit
# .copy() makes the result an independent frame, so the column assignments
# performed on df afterwards do not trigger pandas' SettingWithCopyWarning.
df = df[df['content'].str.strip() != ''].copy()

In [7]:
# Normalise the text in a single pass over the column:
#   1. lower-case everything,
#   2. turn every character that is not a letter or whitespace into a space,
#   3. collapse each run of whitespace to one space,
#   4. trim leading/trailing whitespace.
df['content'] = (
    df['content']
    .str.lower()
    .str.replace(r'[^a-zA-Z\s]', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [8]:
# Keep only rows whose cleaned text is longer than 50 characters.
df = df.loc[df['content'].str.len().gt(50)]

print(f"After preprocessing: {len(df)} samples")

After preprocessing: 50033 samples


In [9]:
# Model input is the cleaned text; the target is the 0/1 label column.
X, y = df['content'], df['label']

In [10]:
# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
# stratify=y keeps the true/fake proportion the same in both partitions; the
# two classes are not balanced, so an unstratified split can skew the ratio
# between the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [11]:
# TF-IDF over unigrams and bigrams with English stop words removed.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
    max_features=5000,  # vocabulary cap (previously 10000)
    min_df=2,           # skip terms found in fewer than 2 documents
    max_df=0.95,        # skip terms found in more than 95% of documents
)

In [12]:
# Fit the vectorizer on the training texts only, then reuse that fitted state
# to transform the held-out texts. The tuple is evaluated left to right, so
# the fit happens before the test transform.
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [15]:
from pycaret.classification import setup, compare_models

In [14]:
# Initialise the PyCaret classification experiment on df with 'label' as target.
# NOTE(review): df is passed whole. The recorded setup summary reports 174
# original columns, 172 of them treated as categorical features. Presumably
# only the cleaned 'content' text was meant to drive the model - confirm, and
# restrict the columns handed to setup() if so.
clf_setup = setup(
    data=df,
    target='label',
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Binary
3,Original data shape,"(50033, 174)"
4,Transformed data shape,"(50033, 749)"
5,Transformed train set shape,"(35023, 749)"
6,Transformed test set shape,"(15010, 749)"
7,Ordinal features,10
8,Numeric features,1
9,Categorical features,172


In [16]:
# Run PyCaret's model comparison and keep its return value as best_model.
# NOTE(review): in the recorded leaderboard several models (knn, ada, dt, rf,
# lightgbm) show recall 1.0 at roughly 0.67 accuracy, which looks like they
# predict the positive class for almost every row - worth checking the
# features the experiment was set up with.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9989,0.9997,0.998,1.0,0.999,0.9979,0.9979,85.927
ridge,Ridge Classifier,0.9989,0.0,0.998,1.0,0.999,0.9979,0.9979,55.278
lda,Linear Discriminant Analysis,0.9989,0.9997,0.998,1.0,0.999,0.9979,0.9979,50.61
svm,SVM - Linear Kernel,0.9987,0.0,0.9976,1.0,0.9988,0.9974,0.9974,47.738
et,Extra Trees Classifier,0.8368,0.9997,0.9999,0.7709,0.8687,0.6649,0.7087,45.217
knn,K Neighbors Classifier,0.6798,0.6589,1.0,0.6237,0.7682,0.3308,0.4451,76.044
ada,Ada Boost Classifier,0.6754,0.6542,1.0,0.6205,0.7658,0.3212,0.4374,47.668
dt,Decision Tree Classifier,0.6728,0.6514,1.0,0.6186,0.7644,0.3155,0.4327,68.172
rf,Random Forest Classifier,0.6721,0.8062,1.0,0.6181,0.764,0.314,0.4315,58.787
lightgbm,Light Gradient Boosting Machine,0.6721,0.6507,1.0,0.6181,0.764,0.314,0.4315,40.923


In [None]:
# Candidate classifiers keyed by display name. Entries are added one at a
# time, in the order: logistic regression, random forest, naive Bayes, SVM.
models = {}

models['Logistic Regression'] = LogisticRegression(
    solver='liblinear',
    C=1.0,  # regularization parameter
    max_iter=1000,
    random_state=42,
)

models['Random Forest'] = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,  # depth is capped to limit overfitting
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)

models['Naive Bayes'] = MultinomialNB(alpha=1.0)

models['SVM'] = SVC(
    kernel='linear',
    C=1.0,  # regularization parameter
    probability=True,
    random_state=42,
)