In [1]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("../EDA/final_features_with_labels.csv")
print("Dataset shape:", df.shape)

# Drop 'phq8_score' (we're doing classification now)
df = df.drop(columns=['phq8_score'])

# Separate features and target
X = df.drop(columns=['participant_id', 'phq8_binary'])
y = df['phq8_binary']

# Check for missing values
print("Missing values in features:", X.isnull().sum().sum())
print("Missing values in target:", y.isnull().sum())

# Check class distribution
print("Class distribution:\n", y.value_counts())

Dataset shape: (78, 402)
Missing values in features: 0
Missing values in target: 0
Class distribution:
 phq8_binary
1    40
0    38
Name: count, dtype: int64


In [2]:
from sklearn.preprocessing import StandardScaler

# Apply Standard Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Sanity check
print("X_scaled shape:", X_scaled.shape)
print("Sample scaled values:\n", X_scaled[:2])

X_scaled shape: (78, 399)
Sample scaled values:
 [[-1.31677239 -0.37766258 -0.37795489 -0.5042921   0.10229364 -0.82450259
   1.21594485  0.02377023  1.02702809 -0.41094428 -0.31739128  0.76740897
   0.33733659  0.07421528 -0.50991719  1.46748232  1.70975581 -0.24603016
  -0.68111349  0.76527241  0.74878898 -0.1208162  -0.60782038 -0.11960901
   1.19304108  0.20491568  0.92966859 -0.33084613 -0.24135471 -1.01918388
  -0.42575143 -0.30460835 -0.70419468  0.19302153 -1.00629916  0.59419513
  -1.47152497 -1.06918662 -0.18674083  0.93598005 -0.35506097 -0.10842956
  -0.25221284  0.73711465  0.4804145   1.85965257  1.12238672 -1.11754195
   1.50223259  0.29861037  0.11231986 -0.83226254 -0.61854826  2.77851451
   0.03577349  2.37267473 -0.83036748 -1.24876238  0.14399131  0.43560985
  -1.20345848  2.23488209  0.33758199  1.38891907 -1.2790472   1.24073423
   0.00745768 -0.34289132 -1.81237347 -0.10919865  0.39914677 -0.06902611
  -0.30727947  0.55222289  0.3611993  -0.12100819  0.31525288  

MODELS PLANNING TO USE:

| Model                                        | Why We're Using It                        | Strengths                                             |
| -------------------------------------------- | ----------------------------------------- | ----------------------------------------------------- |
| 🔸 **SVM (Support Vector Machine)**          | Handles high-dimensional data well        | Good with small datasets, finds optimal boundary      |
| 🔸 **LightGBM**                              | Fast, handles noisy high-dimensional data | Great with many features, built-in regularization     |
| 🔸 **Logistic Regression (L2)**              | Simple, interpretable baseline            | Can work well with proper scaling, avoids overfitting |
| 🔸 **Voting Ensemble (SVM + LGBM + LogReg)** | Combines multiple models for robustness   | Reduces bias/variance, improves generalization        |


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

# # STEP 1: Load and preprocess
# df = pd.read_csv("EDA/features.csv")
# X = df.drop(columns=['participant_id', 'phq8_score', 'phq8_binary'])
# y = df['phq8_binary']

# # STEP 2: Feature Scaling
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# STEP 3: Define Models
models = {
    "SVM (Linear)": SVC(kernel='linear', probability=True, random_state=42),
    # "LightGBM": LGBMClassifier(random_state=42),
    "Logistic (L2)": LogisticRegression(penalty='l2', max_iter=1000, random_state=42),
}

# STEP 4: Voting Ensemble
voting = VotingClassifier(
    estimators=[('svm', models["SVM (Linear)"]),
                # ('lgbm', models["LightGBM"]),
                ('logreg', models["Logistic (L2)"])],
    voting='soft'
)
models["Voting Ensemble"] = voting

# STEP 5: Evaluate Each Model
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("🔍 5-Fold Cross-Validation Results\n")
for name, model in models.items():
    f1_scores = cross_val_score(model, X_scaled, y, cv=skf, scoring='f1')
    acc_scores = cross_val_score(model, X_scaled, y, cv=skf, scoring='accuracy')
    
    print(f"📌 {name}")
    print(f"   ✅ Mean Accuracy: {acc_scores.mean():.4f}")
    print(f"   🎯 Mean F1 Score: {f1_scores.mean():.4f}")
    print("-" * 50)


🔍 5-Fold Cross-Validation Results

📌 SVM (Linear)
   ✅ Mean Accuracy: 0.5358
   🎯 Mean F1 Score: 0.5627
--------------------------------------------------
📌 Logistic (L2)
   ✅ Mean Accuracy: 0.5233
   🎯 Mean F1 Score: 0.5544
--------------------------------------------------
📌 Voting Ensemble
   ✅ Mean Accuracy: 0.5233
   🎯 Mean F1 Score: 0.5544
--------------------------------------------------
