# Fake vs Real Review Classifier - Complete Model Comparison

## 1. Import Required Libraries

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from scipy.sparse import hstack

# XGBoost (install with: pip install xgboost)
try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
except ImportError:
    print("WARNING:  XGBoost not installed. Run: pip install xgboost")
    XGBOOST_AVAILABLE = False
from scipy.special import expit

print(" Libraries imported successfully")

 Libraries imported successfully


## 2. Load and Filter Dataset

In [3]:
# Read the labeled reviews CSV into a DataFrame.
print("Loading dataset...")
df = pd.read_csv("../data/processed/labeled_reviews.csv")

# Report the raw size, then the class balance and the label-confidence buckets.
# Each heading and its value counts are emitted on separate lines via sep="\n".
print(f"Original dataset shape: {df.shape}")
print("\nLabel distribution:", df["fake_label"].value_counts(), sep="\n")
print("\nConfidence distribution:", df["label_confidence"].value_counts(), sep="\n")

Loading dataset...
Original dataset shape: (882403, 21)

Label distribution:
fake_label
0    723325
1    159078
Name: count, dtype: int64

Confidence distribution:
label_confidence
high_real    528623
uncertain    252974
high_fake    100806
Name: count, dtype: int64


### 2.1 Filter to High Confidence Samples

In [4]:
print("Filtering to high confidence samples...")

# Keep only rows whose label_confidence is one of the two high-confidence
# buckets; .copy() makes the result independent of `df`.
is_high_confidence = df["label_confidence"].isin(["high_real", "high_fake"])
df_filtered = df[is_high_confidence].copy()

# Number of rows dropped by the filter, reported as a count and a percentage.
n_removed = len(df) - len(df_filtered)
print(f"Filtered dataset size: {len(df_filtered)}")
print(f"Removed: {n_removed} samples ({n_removed / len(df) * 100:.2f}%)")
print("\nFiltered label distribution:", df_filtered["fake_label"].value_counts(), sep="\n")

Filtering to high confidence samples...
Filtered dataset size: 629429
Removed: 252974 samples (28.67%)

Filtered label distribution:
fake_label
0    528623
1    100806
Name: count, dtype: int64


## 3. Shared Preprocessing Pipeline

### 3.1 Select Features

In [5]:
# Target: the binary fake_label column
y = df_filtered["fake_label"]

# Text input: the review body, coerced to str
X_text = df_filtered["review_text"].astype(str)

# Numeric inputs used alongside the text
numeric_columns = ["review_length", "rating_deviation"]
X_numeric = df_filtered[numeric_columns]

print(f"Text features: {len(X_text)} samples")
print(f"Numeric features: {X_numeric.shape}")
print(f"Target distribution:\n{y.value_counts()}")

Text features: 629429 samples
Numeric features: (629429, 2)
Target distribution:
fake_label
0    528623
1    100806
Name: count, dtype: int64


### 3.2 Train-Test Split

In [6]:
# 80/20 split of text, numeric features and target in one call so that the
# three stay row-aligned. Stratifying on y keeps the class ratio the same in
# both splits; the fixed random_state makes the split repeatable.
(
    X_text_train, X_text_test,
    X_num_train, X_num_test,
    y_train, y_test,
) = train_test_split(X_text, X_numeric, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training samples: {y_train.shape[0]}")
print(f"Test samples: {y_test.shape[0]}")

Training samples: 503543
Test samples: 125886


### 3.3 TF-IDF Vectorization

In [7]:
# Unigram + bigram TF-IDF, English stop words removed, capped at 5000 terms.
tfidf = TfidfVectorizer(stop_words="english", max_features=5000, ngram_range=(1, 2))

# Fit on the training text only, then reuse the fitted vectorizer on the test
# text so that no test-set information enters the fit.
X_tfidf_train = tfidf.fit_transform(X_text_train)
X_tfidf_test = tfidf.transform(X_text_test)

print(f"TF-IDF train shape: {X_tfidf_train.shape}")
print(f"TF-IDF test shape: {X_tfidf_test.shape}")

TF-IDF train shape: (503543, 5000)
TF-IDF test shape: (125886, 5000)


### 3.4 Scale Numeric Features

In [9]:
# Fit the scaler on the training split only, then apply the same fitted
# scaler to both the training and the test numeric features.
scaler = StandardScaler().fit(X_num_train)
X_num_train_scaled = scaler.transform(X_num_train)
X_num_test_scaled = scaler.transform(X_num_test)

print("Numeric features scaled")

Numeric features scaled


### 3.5 Combine Features

In [10]:
# Column-wise concatenation: TF-IDF columns first, scaled numeric columns last.
train_parts = [X_tfidf_train, X_num_train_scaled]
test_parts = [X_tfidf_test, X_num_test_scaled]

X_train = hstack(train_parts)
X_test = hstack(test_parts)

print(f"Combined train shape: {X_train.shape}")
print(f"Combined test shape: {X_test.shape}")
print("Preprocessing complete - ready to train models!")

Combined train shape: (503543, 5002)
Combined test shape: (125886, 5002)
Preprocessing complete - ready to train models!


## 4. Model 1: Logistic Regression (Baseline)


In [11]:
print("Training Logistic Regression...")

# class_weight="balanced" compensates for the skewed label distribution;
# max_iter is raised to 2000 to give the solver room to converge.
lr_model = LogisticRegression(class_weight="balanced", max_iter=2000, random_state=42)
lr_model.fit(X_train, y_train)

# Predict on the held-out test split and score those predictions.
lr_pred = lr_model.predict(X_test)
lr_accuracy, lr_precision, lr_recall, lr_f1 = (
    score(y_test, lr_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    "\n Logistic Regression Results:",
    f"   Accuracy:  {lr_accuracy:.4f}",
    f"   Precision: {lr_precision:.4f}",
    f"   Recall:    {lr_recall:.4f}",
    f"   F1-Score:  {lr_f1:.4f}",
    sep="\n",
)

Training Logistic Regression...

 Logistic Regression Results:
   Accuracy:  0.9222
   Precision: 0.6906
   Recall:    0.9315
   F1-Score:  0.7931


## 5. Model 2: Linear SVM (Text Specialist)


In [12]:
print("Training Linear SVM...")

# Linear SVM with balanced class weights; dual=False selects the primal
# formulation of the optimization problem.
svm_model = LinearSVC(dual=False, class_weight="balanced", max_iter=2000, random_state=42)
svm_model.fit(X_train, y_train)

# Predict on the held-out test split and score those predictions.
svm_pred = svm_model.predict(X_test)
svm_accuracy, svm_precision, svm_recall, svm_f1 = (
    score(y_test, svm_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    "\nLinear SVM Results:",
    f"   Accuracy:  {svm_accuracy:.4f}",
    f"   Precision: {svm_precision:.4f}",
    f"   Recall:    {svm_recall:.4f}",
    f"   F1-Score:  {svm_f1:.4f}",
    sep="\n",
)

Training Linear SVM...

Linear SVM Results:
   Accuracy:  0.9240
   Precision: 0.6953
   Recall:    0.9357
   F1-Score:  0.7978


## 6. Model 3: Decision Tree (Simple Non-Linear)

In [13]:
print("Training Decision Tree...")

# `time` is not part of the notebook's import cell, so it is imported here.
# DecisionTreeClassifier and the metric functions are already imported there
# and are not re-imported in this cell.
import time

# Depth- and leaf-size-limited tree with balanced class weights.
dt_model = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=10,
    class_weight="balanced",
    random_state=42
)

# Time the fit() call only, so that "Training Time" below really is training
# time and does not also include prediction and metric computation.
# perf_counter() is a monotonic clock intended for measuring durations.
start_time = time.perf_counter()
# X_train is passed as the sparse matrix built earlier; it is deliberately
# not converted to a dense array.
dt_model.fit(X_train, y_train)
end_time = time.perf_counter()

dt_pred = dt_model.predict(X_test)

# Test-set metrics; zero_division=0 yields 0 instead of a warning if the
# positive class is never predicted.
dt_accuracy = accuracy_score(y_test, dt_pred)
dt_precision = precision_score(y_test, dt_pred, zero_division=0)
dt_recall = recall_score(y_test, dt_pred, zero_division=0)
dt_f1 = f1_score(y_test, dt_pred, zero_division=0)

print("\nDecision Tree Results:")
print(f"   Accuracy:  {dt_accuracy:.4f}")
print(f"   Precision: {dt_precision:.4f}")
print(f"   Recall:    {dt_recall:.4f}")
print(f"   F1-Score:  {dt_f1:.4f}")
print(f"   Training Time: {end_time - start_time:.2f} seconds")


Training Decision Tree...

Decision Tree Results:
   Accuracy:  0.9344
   Precision: 0.7241
   Recall:    0.9537
   F1-Score:  0.8232
   Training Time: 67.96 seconds


## 7. Model 4: Random Forest (Ensemble Power)

In [14]:
print("Training Random Forest...")

# `time` is not part of the notebook's import cell, so it is imported here.
# RandomForestClassifier and the metric functions are already imported there
# and are not re-imported in this cell.
import time

# 100 depth-limited trees with balanced class weights, trained in parallel.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=10,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1   # uses all CPU cores
)

# Time the fit() call only, so that "Training Time" below really is training
# time and does not also include prediction and metric computation.
# perf_counter() is a monotonic clock intended for measuring durations.
start_time = time.perf_counter()
# X_train is passed as the sparse matrix built earlier; it is deliberately
# not converted to a dense array.
rf_model.fit(X_train, y_train)
end_time = time.perf_counter()

rf_pred = rf_model.predict(X_test)

# Test-set metrics; zero_division=0 yields 0 instead of a warning if the
# positive class is never predicted.
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred, zero_division=0)
rf_recall = recall_score(y_test, rf_pred, zero_division=0)
rf_f1 = f1_score(y_test, rf_pred, zero_division=0)

print("\nRandom Forest Results:")
print(f"   Accuracy:  {rf_accuracy:.4f}")
print(f"   Precision: {rf_precision:.4f}")
print(f"   Recall:    {rf_recall:.4f}")
print(f"   F1-Score:  {rf_f1:.4f}")
print(f"   Training Time: {end_time - start_time:.2f} seconds")

Training Random Forest...

Random Forest Results:
   Accuracy:  0.9185
   Precision: 0.6893
   Recall:    0.8945
   F1-Score:  0.7786
   Training Time: 11.39 seconds


## 8. Model 5: XGBoost (Non-Linear Power)


In [15]:
if not XGBOOST_AVAILABLE:
    # XGBoost could not be imported: leave the metrics as None so the
    # comparison cell can detect that this model was skipped.
    print(" XGBoost not available. Install with: pip install xgboost")
    xgb_accuracy = xgb_precision = xgb_recall = xgb_f1 = None
else:
    print("Training XGBoost...")

    # Negative-to-positive ratio of the training labels, passed to XGBoost
    # as scale_pos_weight to offset the class imbalance.
    n_negative = len(y_train[y_train == 0])
    n_positive = len(y_train[y_train == 1])
    scale_pos_weight = n_negative / n_positive

    xgb_model = XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
        scale_pos_weight=scale_pos_weight,
        tree_method='hist',
        eval_metric="logloss",
        random_state=42,
        verbosity=0
    )
    xgb_model.fit(X_train, y_train)

    # Predict on the held-out test split and score those predictions.
    xgb_pred = xgb_model.predict(X_test)
    xgb_accuracy, xgb_precision, xgb_recall, xgb_f1 = (
        score(y_test, xgb_pred)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    )

    print(
        "\n XGBoost Results:",
        f"   Accuracy:  {xgb_accuracy:.4f}",
        f"   Precision: {xgb_precision:.4f}",
        f"   Recall:    {xgb_recall:.4f}",
        f"   F1-Score:  {xgb_f1:.4f}",
        sep="\n",
    )

Training XGBoost...

 XGBoost Results:
   Accuracy:  0.9567
   Precision: 0.8079
   Recall:    0.9572
   F1-Score:  0.8762


## 9. Model Comparison

Side-by-side comparison of all trained models (five when XGBoost is installed, otherwise four).

In [16]:
print("MODEL COMPARISON SUMMARY")

# Metric columns of the comparison table, in display order.
METRIC_COLUMNS = ['Accuracy', 'Precision', 'Recall', 'F1-Score']

# Test-set metrics of the models that are always trained: one entry per model,
# with the scores listed in METRIC_COLUMNS order.
model_scores = [
    ('Logistic Regression', (lr_accuracy, lr_precision, lr_recall, lr_f1)),
    ('Linear SVM', (svm_accuracy, svm_precision, svm_recall, svm_f1)),
    ('Decision Tree', (dt_accuracy, dt_precision, dt_recall, dt_f1)),
    ('Random Forest', (rf_accuracy, rf_precision, rf_recall, rf_f1)),
]

# XGBoost is optional: include it only if it was importable and actually
# produced metrics (the XGBoost cell sets them to None when it is skipped).
if 'XGBOOST_AVAILABLE' in globals() and XGBOOST_AVAILABLE and xgb_f1 is not None:
    model_scores.append(('XGBoost', (xgb_accuracy, xgb_precision, xgb_recall, xgb_f1)))

# Column-oriented dict: 'Model' plus one list of values per metric.
comparison_data = {'Model': [name for name, _ in model_scores]}
for position, metric in enumerate(METRIC_COLUMNS):
    comparison_data[metric] = [scores[position] for _, scores in model_scores]

comparison_df = pd.DataFrame(comparison_data)

# Concatenate instead of passing two arguments to print(): the default
# separator would put a space in front of the header row and shift it one
# column out of alignment with the data rows.
print("\n" + comparison_df.to_string(index=False))

# ===== BEST MODEL PER METRIC =====
# idxmax() returns the first row holding the maximum, so on a tie the model
# listed first is reported.
print("\nBEST MODELS PER METRIC")
for metric in METRIC_COLUMNS:
    best_idx = comparison_df[metric].idxmax()
    best_model = comparison_df.loc[best_idx, 'Model']
    best_value = comparison_df.loc[best_idx, metric]
    print(f"{metric:12} → {best_model:20} ({best_value:.4f})")

MODEL COMPARISON SUMMARY

               Model  Accuracy  Precision   Recall  F1-Score
Logistic Regression  0.922184   0.690568 0.931501  0.793141
         Linear SVM  0.924034   0.695304 0.935717  0.797792
      Decision Tree  0.934385   0.724082 0.953723  0.823187
      Random Forest  0.918545   0.689332 0.894549  0.778646
            XGBoost  0.956699   0.807921 0.957195  0.876246

BEST MODELS PER METRIC
Accuracy     → XGBoost              (0.9567)
Precision    → XGBoost              (0.8079)
Recall       → XGBoost              (0.9572)
F1-Score     → XGBoost              (0.8762)


### 9.1 Confusion Matrices

In [18]:
print("CONFUSION MATRICES")

# confusion_matrix is already imported in the notebook's import cell.

def print_confusion_matrix(model_name, y_true, y_pred):
    """Print a binary confusion matrix together with its TN/FP/FN/TP counts.

    Parameters
    ----------
    model_name : str
        Label printed above the matrix.
    y_true : array-like
        True labels, encoded as 0 (negative) and 1 (positive).
    y_pred : array-like
        Predicted labels, using the same 0/1 encoding.

    Returns
    -------
    None
        The matrix, a legend and the four counts are written to stdout.
    """
    # Fix the label order explicitly so the result is always a 2x2 matrix laid
    # out as [[TN, FP], [FN, TP]]. Without `labels`, a y_true/y_pred pair that
    # contains a single class produces a 1x1 matrix and the four-way unpack
    # below raises ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    print(f"\n{model_name}:")
    print(cm)
    print("[[TN  FP]\n [FN  TP]]")
    print(f"TN = {tn}, FP = {fp}, FN = {fn}, TP = {tp}")


# Confusion matrix on the test split for every model that is always trained.

# ===== Logistic Regression =====
print_confusion_matrix("Logistic Regression", y_test, lr_pred)

# ===== Linear SVM =====
print_confusion_matrix("Linear SVM", y_test, svm_pred)

# ===== Decision Tree =====
print_confusion_matrix("Decision Tree", y_test, dt_pred)

# ===== Random Forest =====
print_confusion_matrix("Random Forest", y_test, rf_pred)

# ===== XGBoost =====
# xgb_pred only exists when XGBoost was importable and the model was trained;
# guard the call so this cell does not fail with NameError otherwise.
if XGBOOST_AVAILABLE:
    print_confusion_matrix("XGBoost", y_test, xgb_pred)
else:
    print("\nXGBoost: skipped (XGBoost is not installed)")

CONFUSION MATRICES

Logistic Regression:
[[97310  8415]
 [ 1381 18780]]
[[TN  FP]
 [FN  TP]]
TN = 97310, FP = 8415, FN = 1381, TP = 18780

Linear SVM:
[[97458  8267]
 [ 1296 18865]]
[[TN  FP]
 [FN  TP]]
TN = 97458, FP = 8267, FN = 1296, TP = 18865

Decision Tree:
[[98398  7327]
 [  933 19228]]
[[TN  FP]
 [FN  TP]]
TN = 98398, FP = 7327, FN = 933, TP = 19228

Random Forest:
[[97597  8128]
 [ 2126 18035]]
[[TN  FP]
 [FN  TP]]
TN = 97597, FP = 8128, FN = 2126, TP = 18035

XGBoost:
[[101137   4588]
 [   863  19298]]
[[TN  FP]
 [FN  TP]]
TN = 101137, FP = 4588, FN = 863, TP = 19298
