# Fake vs Real Classifier


### 1. Import Required Libraries

In [1]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from scipy.sparse import hstack


### 2. Load Weakly Labeled Dataset

In [2]:
# Read the weakly labeled review table from disk, then report its size,
# its column names, and a preview of the first rows.
print("Loading dataset...")
df = pd.read_csv("../data/processed/labeled_reviews.csv")

print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    sep="\n",
)
print(df.head())

Loading dataset...
Dataset shape: (882403, 21)

Columns: ['user_id', 'product_id', 'rating', 'review_text', 'summary', 'verified', 'review_timestamp', 'clean_review_text', 'review_date', 'review_length', 'rule_short_extreme', 'review_day', 'daily_count', 'rule_high_frequency', 'product_mean_rating', 'rating_deviation', 'rule_rating_deviation', 'rule_duplicate', 'fake_score', 'fake_label', 'label_confidence']
          user_id  product_id  rating  \
0  A1D4G1SNUZWQOT  7106116521       5   
1  A3DDWDH9PX2YX2  7106116521       2   
2  A2MWC41EW7XL15  7106116521       4   
3  A2UH2QQ275NV45  7106116521       2   
4   A89F3LQADZBS5  7106116521       3   

                                         review_text  \
0                             exactly what i needed.   
1  i agree with the other review, the opening is ...   
2  love these... i am going to order another pack...   
3                                too tiny an opening   
4                                               okay   

    

### 3. Select Required Features

In [16]:

# Select Required Features
# Builds the three aligned inputs used downstream: the raw review text,
# two numeric columns, and the binary target.
print("Preparing features...")

# Text Feature
# Fill missing reviews with "" BEFORE casting to str: `astype(str)` alone turns
# NaN into the literal string "nan", which TF-IDF would count as a real token.
X_text = df["review_text"].fillna("").astype(str)

# Numerical Features
# NOTE(review): `fake_label` is a weak label, and the column names
# `rule_short_extreme` / `rule_rating_deviation` suggest the labeling rules were
# computed from these same two quantities. If so, the classifier partly
# re-learns the labeling rules (label leakage) -- confirm against the labeling
# notebook before trusting the reported metrics.
X_numeric = df[["review_length", "rating_deviation"]]

# Target
y = df["fake_label"]

print(f"Text features: {len(X_text)} samples")
print(f"Numeric features shape: {X_numeric.shape}")
print(f"Target distribution:\n{y.value_counts()}")


Preparing features...
Text features: 882403 samples
Numeric features shape: (882403, 2)
Target distribution:
fake_label
0    723325
1    159078
Name: count, dtype: int64


### 4. Train-Test Split


In [17]:
print("Splitting data...")

# A single call splits text, numeric features, and labels together so the rows
# stay aligned across all three. Stratifying on `y` keeps the class ratio equal
# in both partitions; the fixed seed makes the 80/20 split repeatable.
(
    X_text_train, X_text_test,
    X_num_train, X_num_test,
    y_train, y_test,
) = train_test_split(
    X_text, X_numeric, y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"Training samples: {y_train.shape[0]}")
print(f"Test samples: {y_test.shape[0]}")
print(f"\nTrain label distribution:\n{y_train.value_counts(normalize=True)}")

Splitting data...
Training samples: 705922
Test samples: 176481

Train label distribution:
fake_label
0    0.819722
1    0.180278
Name: proportion, dtype: float64


### 5. TF-IDF Vectorization


In [6]:
print("Creating TF-IDF features...")

# Unigrams and bigrams, English stop words dropped, vocabulary capped at 5000.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000, stop_words="english")

# The vocabulary and IDF weights are learned on the training split only;
# the test split is projected onto that fitted vocabulary.
X_tfidf_train = tfidf.fit_transform(X_text_train)
X_tfidf_test = tfidf.transform(X_text_test)

print(
    f"TF-IDF train shape: {X_tfidf_train.shape}",
    f"TF-IDF test shape: {X_tfidf_test.shape}",
    sep="\n",
)

Creating TF-IDF features...
TF-IDF train shape: (705922, 5000)
TF-IDF test shape: (176481, 5000)


In [7]:
# Scale Numeric Features
# Standardize the numeric columns so they are on a scale comparable to the
# TF-IDF values they will be stacked with.
print("Scaling numeric features...")

scaler = StandardScaler()
# Mean/variance are estimated on the training split only, then reused for test.
X_num_train_scaled = scaler.fit_transform(X_num_train)
X_num_test_scaled = scaler.transform(X_num_test)

# Plain string: the original used an f-string with no placeholders.
print("Numeric features scaled.")
# Sanity check: training columns should now have mean ~0 and std 1.
print(f"Train mean: {X_num_train_scaled.mean(axis=0)}")
print(f"Train std: {X_num_train_scaled.std(axis=0)}")

Scaling numeric features...
Numeric features scaled.
Train mean: [-1.72572256e-17  6.38663296e-16]
Train std: [1. 1.]


### 6. Combine Text + Numeric Features


In [9]:
print("Combining features...")

# Column-wise stack: TF-IDF columns first, then the scaled numeric columns.
# `hstack` returns COO by default, which cannot be row-sliced and which
# scikit-learn converts again on each fit/predict call. Requesting CSR here
# performs that conversion once and leaves the matrices indexable.
X_train = hstack([X_tfidf_train, X_num_train_scaled], format="csr")
X_test = hstack([X_tfidf_test, X_num_test_scaled], format="csr")

print(f"Combined train features shape: {X_train.shape}")
print(f"Combined test features shape: {X_test.shape}")

Combining features...
Combined train features shape: (705922, 5002)
Combined test features shape: (176481, 5002)


### 7. Train Logistic Regression Model


In [10]:
print("Training model...")

# Baseline linear classifier on the combined sparse features.
# - class_weight="balanced" compensates for the class imbalance in the target.
# - max_iter=2000 is the iteration cap (raised from an earlier value of 1000).
# - random_state is pinned for repeatability.
model = LogisticRegression(class_weight="balanced", max_iter=2000, random_state=42)
model.fit(X_train, y_train)

print("Model training complete.")

Training model...
Model training complete.


### 8. Make Predictions

In [11]:
print("Making predictions...")

# Hard class predictions for every row of the held-out test matrix.
y_pred = model.predict(X_test)

print(f"Predictions made: {y_pred.shape[0]}")

Making predictions...
Predictions made: 176481


### 9. Evaluation Metrics


In [12]:
print("EVALUATION METRICS")

# Score the held-out predictions. Precision, recall, and F1 use scikit-learn's
# default positive class (label 1).
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy:  {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall:    {recall:.4f}",
    f"F1-Score:  {f1:.4f}",
    sep="\n",
)

EVALUATION METRICS
Accuracy:  0.8110
Precision: 0.4856
Recall:    0.8166
F1-Score:  0.6090


### 10. Confusion Matrix

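**Interpretation** (positive class = fake, i.e. `fake_label == 1`):
- **TP (True Positive):** Fake review correctly flagged as fake
- **FP (False Positive):** Genuine review incorrectly flagged as fake
- **FN (False Negative):** Fake review missed (predicted as genuine)
- **TN (True Negative):** Genuine review correctly identified as genuine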

In [13]:
print("CONFUSION MATRIX")

# Rows are true labels and columns are predicted labels, in sorted label order
# (0 then 1); that ordering is what the legend printed below describes.
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:", cm, "\n[[TN  FP]", " [FN  TP]]", sep="\n")

CONFUSION MATRIX
Confusion Matrix:
[[117142  27523]
 [  5836  25980]]

[[TN  FP]
 [FN  TP]]


### 11. Summary


In [15]:
# Recap of the scores computed in the evaluation cell; nothing is recomputed.
print("BASELINE CLASSIFIER PERFORMANCE SUMMARY")
print(
    f"Accuracy:  {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall:    {recall:.4f}",
    f"F1-Score:  {f1:.4f}",
    sep="\n",
)


BASELINE CLASSIFIER PERFORMANCE SUMMARY
Accuracy:  0.8110
Precision: 0.4856
Recall:    0.8166
F1-Score:  0.6090
