# Fake vs Real Review Classifier - Complete Model Comparison

This notebook implements and compares **three different classifiers**:
1. **Logistic Regression** - Baseline
2. **Linear SVM** - Text specialist
3. **XGBoost** - Non-linear power

All models share the same preprocessing pipeline for efficiency.

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from scipy.sparse import hstack

# XGBoost (install with: pip install xgboost)
# XGBoost is treated as an optional dependency: a failed import is recorded in
# XGBOOST_AVAILABLE instead of stopping the notebook, and the later XGBoost,
# comparison and recommendation cells check that flag before using it.
try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
except ImportError:
    print("WARNING:  XGBoost not installed. Run: pip install xgboost")
    XGBOOST_AVAILABLE = False
# expit is the logistic sigmoid; the Linear SVM cell applies it to the
# decision_function scores before thresholding.
from scipy.special import expit

print(" Libraries imported successfully")

## 2. Load and Filter Dataset

In [None]:
# Read the labelled review table, then summarise its size and how the rows are
# spread over the label and label-confidence columns.
print("Loading dataset...")
df = pd.read_csv("../data/processed/labeled_reviews.csv")

print(f"Original dataset shape: {df.shape}")
for summary_heading, summary_column in (
    ("\nLabel distribution:", "fake_label"),
    ("\nConfidence distribution:", "label_confidence"),
):
    print(summary_heading)
    print(df[summary_column].value_counts())

### 2.1 Filter to High Confidence Samples

**Critical Improvement:** Keep only high-confidence labels to reduce noise.

In [None]:
print("Filtering to high confidence samples...")

# Rows whose confidence tag is anything other than high_real / high_fake are
# dropped; .copy() detaches the result from the original frame.
high_confidence_mask = df['label_confidence'].isin(['high_real', 'high_fake'])
df_filtered = df.loc[high_confidence_mask].copy()

n_removed = len(df) - len(df_filtered)
print(f"Filtered dataset size: {len(df_filtered)}")
print(f"Removed: {n_removed} samples ({n_removed/len(df)*100:.2f}%)")
print("\nFiltered label distribution:")
print(df_filtered['fake_label'].value_counts())

## 3. Shared Preprocessing Pipeline

### 3.1 Select Features

In [None]:
# Text feature: the raw review body, cast to str so every entry is a string
# before it reaches the vectorizer.
X_text = df_filtered["review_text"].astype(str)

# Numerical features: the two engineered columns used alongside the text.
numeric_columns = ["review_length", "rating_deviation"]
X_numeric = df_filtered[numeric_columns]

# Target: the fake/real label column.
y = df_filtered["fake_label"]

print(f"Text features: {len(X_text)} samples")
print(f"Numeric features: {X_numeric.shape}")
print(f"Target distribution:\n{y.value_counts()}")

### 3.2 Train-Test Split

In [None]:
# One call splits text, numeric features and labels together so the three stay
# row-aligned; stratify=y keeps the label proportions the same in both parts.
split_parts = train_test_split(
    X_text,
    X_numeric,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
(X_text_train, X_text_test,
 X_num_train, X_num_test,
 y_train, y_test) = split_parts

print(f"Training samples: {len(y_train)}")
print(f"Test samples: {len(y_test)}")

### 3.3 TF-IDF Vectorization

In [None]:
# Unigram + bigram TF-IDF capped at 5000 terms, with English stop words removed.
tfidf_settings = {
    "stop_words": "english",
    "max_features": 5000,
    "ngram_range": (1, 2),
}
tfidf = TfidfVectorizer(**tfidf_settings)

# The vocabulary and IDF weights are learned from the training text only; the
# test text is projected onto that fitted vocabulary.
X_tfidf_train = tfidf.fit_transform(X_text_train)
X_tfidf_test = tfidf.transform(X_text_test)

for split_name, tfidf_matrix in (("train", X_tfidf_train), ("test", X_tfidf_test)):
    print(f"TF-IDF {split_name} shape: {tfidf_matrix.shape}")

### 3.4 Scale Numeric Features

In [None]:
# Fit the scaler on the training rows only, then apply the same fitted
# transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_num_train)
X_num_train_scaled = scaler.transform(X_num_train)
X_num_test_scaled = scaler.transform(X_num_test)

print(" Numeric features scaled")

### 3.5 Combine Features

In [None]:
# Place the sparse TF-IDF columns and the dense scaled numeric columns side by
# side. format="csr" is requested explicitly: without it SciPy picks the output
# format itself (COO for this sparse + dense mix), which cannot be row-indexed
# and would be converted again by each estimator that consumes it.
X_train = hstack([X_tfidf_train, X_num_train_scaled], format="csr")
X_test = hstack([X_tfidf_test, X_num_test_scaled], format="csr")

print(f"Combined train shape: {X_train.shape}")
print(f"Combined test shape: {X_test.shape}")
print("\n Preprocessing complete - ready to train models!")

## 4. Model 1: Logistic Regression (Baseline)

**Why:** Simple, interpretable baseline for binary classification.

In [None]:
print("Training Logistic Regression...")

# class_weight="balanced" reweights the classes by inverse frequency.
lr_model = LogisticRegression(class_weight="balanced", max_iter=2000, random_state=42)
lr_model.fit(X_train, y_train)

# Column 1 of predict_proba belongs to the second entry of lr_model.classes_
# (label 1 when the labels are 0/1). A row is flagged only when that score
# reaches 0.65, i.e. a stricter cut-off than the default 0.5.
lr_pred_proba = lr_model.predict_proba(X_test)[:, 1]
lr_pred = (lr_pred_proba >= 0.65).astype(int)

# Score the thresholded predictions against the held-out labels.
lr_accuracy, lr_precision, lr_recall, lr_f1 = (
    score_fn(y_test, lr_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n Logistic Regression Results (threshold=0.65):")
print(f"   Accuracy:  {lr_accuracy:.4f}")
print(f"   Precision: {lr_precision:.4f}")
print(f"   Recall:    {lr_recall:.4f}")
print(f"   F1-Score:  {lr_f1:.4f}")

## 5. Model 2: Linear SVM (Text Specialist)

**Why:** Designed for high-dimensional sparse data (TF-IDF).

In [None]:
print("Training Linear SVM...")

svm_model = LinearSVC(
    class_weight="balanced",
    random_state=42,
    max_iter=2000,
    dual=False
)

svm_model.fit(X_train, y_train)

# LinearSVC has no predict_proba, so the signed decision_function margins are
# squashed into (0, 1) with the logistic sigmoid (expit, imported in the first
# code cell).
# NOTE: this is a monotone rescaling of the margin, not a calibrated
# probability. The 0.65 cut-off below is equivalent to requiring
# decision_function(x) >= ln(0.65 / 0.35) ~= 0.62, so it is not directly
# comparable to a 0.65 threshold on a model with true probability outputs.
# If calibrated probabilities are needed, consider wrapping the model in
# sklearn.calibration.CalibratedClassifierCV.
svm_decision = svm_model.decision_function(X_test)
svm_pred_proba = expit(svm_decision)  # sigmoid-squashed margin score
svm_pred = (svm_pred_proba >= 0.65).astype(int)

# Calculate metrics
svm_accuracy = accuracy_score(y_test, svm_pred)
svm_precision = precision_score(y_test, svm_pred)
svm_recall = recall_score(y_test, svm_pred)
svm_f1 = f1_score(y_test, svm_pred)

print("\n Linear SVM Results (threshold=0.65):")
print(f"   Accuracy:  {svm_accuracy:.4f}")
print(f"   Precision: {svm_precision:.4f}")
print(f"   Recall:    {svm_recall:.4f}")
print(f"   F1-Score:  {svm_f1:.4f}")

## 6. Model 3: XGBoost (Non-Linear Power)

**Why:** Captures non-linear feature interactions.

**Note:** Requires `pip install xgboost`

In [None]:
if not XGBOOST_AVAILABLE:
    # Without the library the metric names are still defined (as None) so that
    # later cells can test them instead of hitting a NameError.
    print("WARNING:  XGBoost not available. Install with: pip install xgboost")
    xgb_accuracy = xgb_precision = xgb_recall = xgb_f1 = None
else:
    print("Training XGBoost...")

    # Class-imbalance weight: number of label-0 training rows per label-1 row.
    n_negative = len(y_train[y_train == 0])
    n_positive = len(y_train[y_train == 1])
    scale_pos_weight = n_negative / n_positive

    xgb_settings = dict(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        scale_pos_weight=scale_pos_weight,
        eval_metric="logloss",
        random_state=42,
        tree_method='hist',
        verbosity=0,
    )
    xgb_model = XGBClassifier(**xgb_settings)
    xgb_model.fit(X_train, y_train)

    # Same 0.65 cut-off on the second predict_proba column as the other models.
    xgb_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
    xgb_pred = (xgb_pred_proba >= 0.65).astype(int)

    # Score the thresholded predictions against the held-out labels.
    xgb_accuracy, xgb_precision, xgb_recall, xgb_f1 = (
        score_fn(y_test, xgb_pred)
        for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
    )

    print("\n XGBoost Results (threshold=0.65):")
    print(f"   Accuracy:  {xgb_accuracy:.4f}")
    print(f"   Precision: {xgb_precision:.4f}")
    print(f"   Recall:    {xgb_recall:.4f}")
    print(f"   F1-Score:  {xgb_f1:.4f}")

## 7. Model Comparison

Side-by-side comparison of all three models.

In [None]:
rule = "=" * 70
print(rule)
print("MODEL COMPARISON SUMMARY")
print(rule)

# One tuple per trained model: (name, accuracy, precision, recall, F1).
model_rows = [
    ('Logistic Regression', lr_accuracy, lr_precision, lr_recall, lr_f1),
    ('Linear SVM', svm_accuracy, svm_precision, svm_recall, svm_f1),
]
# XGBoost joins the table only when it was actually trained.
if XGBOOST_AVAILABLE and xgb_f1 is not None:
    model_rows.append(('XGBoost', xgb_accuracy, xgb_precision, xgb_recall, xgb_f1))

# Pivot the rows into the column -> list-of-values mapping the frame is built from.
column_names = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score']
comparison_data = {
    column: [row[position] for row in model_rows]
    for position, column in enumerate(column_names)
}

comparison_df = pd.DataFrame(comparison_data)
print("\n", comparison_df.to_string(index=False))

# For every metric column, report the model holding the highest value.
print("\n" + rule)
print("BEST MODELS PER METRIC")
print(rule)
for metric in column_names[1:]:
    best_idx = comparison_df[metric].idxmax()
    best_model = comparison_df.at[best_idx, 'Model']
    best_value = comparison_df.at[best_idx, metric]
    print(f"{metric:12} → {best_model:20} ({best_value:.4f})")

print("\n" + rule)

### 7.1 Percentage Improvements

In [None]:
print("IMPROVEMENTS OVER LOGISTIC REGRESSION BASELINE")
print("=" * 70)


def _pct_improvement(score, baseline):
    """Return the relative change of ``score`` over ``baseline`` in percent.

    A baseline of exactly 0 (for example a precision of 0 when the baseline
    model predicted no positives at the chosen threshold) has no defined
    relative change, so NaN is returned instead of dividing by zero.
    """
    if baseline == 0:
        return float("nan")
    return ((score - baseline) / baseline) * 100


def _print_improvements(model_name, acc_imp, prec_imp, rec_imp, f1_imp):
    """Print the four relative changes for one model; undefined ones show as n/a."""
    def _fmt(value):
        return "n/a" if np.isnan(value) else f"{value:+.2f}%"

    print(f"\n {model_name} vs Logistic Regression:")
    print(f"   Accuracy:  {_fmt(acc_imp)}")
    print(f"   Precision: {_fmt(prec_imp)}")
    print(f"   Recall:    {_fmt(rec_imp)}")
    print(f"   F1-Score:  {_fmt(f1_imp)}")


# SVM vs LR
svm_acc_imp = _pct_improvement(svm_accuracy, lr_accuracy)
svm_prec_imp = _pct_improvement(svm_precision, lr_precision)
svm_rec_imp = _pct_improvement(svm_recall, lr_recall)
svm_f1_imp = _pct_improvement(svm_f1, lr_f1)

_print_improvements("Linear SVM", svm_acc_imp, svm_prec_imp, svm_rec_imp, svm_f1_imp)

# XGBoost vs LR (only when XGBoost was actually trained)
if XGBOOST_AVAILABLE and xgb_f1 is not None:
    xgb_acc_imp = _pct_improvement(xgb_accuracy, lr_accuracy)
    xgb_prec_imp = _pct_improvement(xgb_precision, lr_precision)
    xgb_rec_imp = _pct_improvement(xgb_recall, lr_recall)
    xgb_f1_imp = _pct_improvement(xgb_f1, lr_f1)

    _print_improvements("XGBoost", xgb_acc_imp, xgb_prec_imp, xgb_rec_imp, xgb_f1_imp)

print("\n" + "=" * 70)

### 7.2 Confusion Matrices

In [None]:
print("CONFUSION MATRICES")
print("=" * 70)

# Legend printed under each matrix; it matches the matrix layout when the
# labels are 0 (real) and 1 (fake) - NOTE(review): confirm the label encoding.
cell_legend = "[[TN  FP]\n [FN  TP]]"

print("\n Logistic Regression:")
lr_cm = confusion_matrix(y_test, lr_pred)
print(lr_cm, cell_legend, sep="\n")

print("\n Linear SVM:")
svm_cm = confusion_matrix(y_test, svm_pred)
print(svm_cm, cell_legend, sep="\n")

# XGBoost predictions exist only when the model was trained.
if XGBOOST_AVAILABLE and xgb_f1 is not None:
    print("\n XGBoost:")
    xgb_cm = confusion_matrix(y_test, xgb_pred)
    print(xgb_cm, cell_legend, sep="\n")

print("\n" + "=" * 70)

## 8. Recommendations

General guidance on when each model is a good fit, followed by the model that achieved the best F1-score on this test set.

In [None]:
section_rule = "=" * 70
print("MODEL SELECTION GUIDE")
print(section_rule)

# Static guidance text: one (heading, bullet lines) pair per model.
selection_guide = [
    ("\n Use Logistic Regression when:", (
        "   • You need a quick baseline",
        "   • Interpretability is critical",
        "   • Fast training/prediction is required",
    )),
    ("\n Use Linear SVM when:", (
        "   • You have high-dimensional text data",
        "   • You want better performance than LR",
        "   • Interpretability is still important",
    )),
]
if XGBOOST_AVAILABLE:
    selection_guide.append(("\n Use XGBoost when:", (
        "   • You need maximum performance",
        "   • You can afford longer training time",
        "   • You want feature importance insights",
        "   • You have high-confidence labels",
    )))

for guide_heading, guide_bullets in selection_guide:
    print(guide_heading, *guide_bullets, sep="\n")

# The recommended model is the comparison-table row with the highest F1-score.
best_f1_idx = comparison_df['F1-Score'].idxmax()
best_model = comparison_df.at[best_f1_idx, 'Model']
best_f1_value = comparison_df.at[best_f1_idx, 'F1-Score']

print("\n" + section_rule)
print(f" RECOMMENDED MODEL: {best_model}")
print(f"   Best F1-Score: {best_f1_value:.4f}")
print(section_rule)

## 9. Summary

**Key Takeaways:**
- All models trained on high-confidence labels only
- Shared preprocessing pipeline for efficiency
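- Model capacity increases from LR → SVM → XGBoost; check the comparison table in Section 7 to confirm whether the metrics actually improve in that order on this dataset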
- Choose model based on your priorities (speed vs performance vs interpretability)

**Next Steps:**
- Save the best model for deployment
- Perform hyperparameter tuning
- Try ensemble methods