In [1]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("../EDA/final_features_with_labels.csv")
print("Dataset shape:", df.shape)

# Drop 'phq8_score' (we're doing classification now)
df = df.drop(columns=['phq8_score'])

# Separate features and target
X = df.drop(columns=['participant_id', 'phq8_binary'])
y = df['phq8_binary']

# Check for missing values
print("Missing values in features:", X.isnull().sum().sum())
print("Missing values in target:", y.isnull().sum())

# Check class distribution
print("Class distribution:\n", y.value_counts())

Dataset shape: (119, 402)
Missing values in features: 0
Missing values in target: 0
Class distribution:
 phq8_binary
0    60
1    59
Name: count, dtype: int64


In [2]:
from sklearn.preprocessing import StandardScaler

# Apply Standard Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Sanity check
print("X_scaled shape:", X_scaled.shape)
print("Sample scaled values:\n", X_scaled[:2])

X_scaled shape: (119, 399)
Sample scaled values:
 [[-1.35986083 -0.38167097 -0.40013599 -0.5827175   0.063736   -0.88409772
   1.2964311  -0.01696282  1.24046929 -0.41603474 -0.33552881  1.00452126
   0.31881237 -0.0313718  -0.46768604  1.5524155   1.72558708 -0.28717188
  -0.6625099   0.77091904  0.7383851  -0.09354983 -0.39120243 -0.09114678
   1.2023181   0.07143761  0.93260601 -0.38835408 -0.26145164 -1.16169782
  -0.39784516 -0.33646232 -0.76457215  0.19078856 -1.03879824  0.59659978
  -1.49363314 -1.05493209 -0.23458989  0.98028671 -0.27862815 -0.12242382
  -0.23855562  0.78797272  0.59639936  1.84056788  1.07102955 -1.02806865
   1.48614194  0.35643309 -0.0773993  -0.73394069 -0.59250361  2.95817174
  -0.01523424  2.52450991 -0.7473162  -1.14729798  0.20179246  0.49868479
  -1.199556    2.41656493  0.18483749  1.38407909 -1.25554288  1.19220995
   0.00616795 -0.32395166 -1.54880091 -0.0634082   0.4600402  -0.03223983
  -0.38509671  0.60837899  0.29771948 -0.09536796  0.44160766 

MODELS PLANNING TO USE:

| Model                                        | Why We're Using It                        | Strengths                                             |
| -------------------------------------------- | ----------------------------------------- | ----------------------------------------------------- |
| 🔸 **SVM (Support Vector Machine)**          | Handles high-dimensional data well        | Good with small datasets, finds optimal boundary      |
| 🔸 **LightGBM**                              | Fast, handles noisy high-dimensional data | Great with many features, built-in regularization     |
| 🔸 **Logistic Regression (L2)**              | Simple, interpretable baseline            | Can work well with proper scaling, avoids overfitting |
| 🔸 **Voting Ensemble (SVM + LGBM + LogReg)** | Combines multiple models for robustness   | Reduces bias/variance, improves generalization        |


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

# STEP 3: Define Models
models = {
    "SVM (Linear)": SVC(kernel='linear', probability=True, random_state=42),
    # "LightGBM": LGBMClassifier(random_state=42),
    "Logistic (L2)": LogisticRegression(penalty='l2', max_iter=1000, random_state=42),
}

# STEP 4: Voting Ensemble
voting = VotingClassifier(
    estimators=[('svm', models["SVM (Linear)"]),
                # ('lgbm', models["LightGBM"]),
                ('logreg', models["Logistic (L2)"])],
    voting='soft'
)
models["Voting Ensemble"] = voting

# STEP 5: Evaluate Each Model
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("🔍 5-Fold Cross-Validation Results\n")
for name, model in models.items():
    f1_scores = cross_val_score(model, X_scaled, y, cv=skf, scoring='f1')
    acc_scores = cross_val_score(model, X_scaled, y, cv=skf, scoring='accuracy')
    
    print(f"📌 {name}")
    print(f"   ✅ Mean Accuracy: {acc_scores.mean():.4f}")
    print(f"   🎯 Mean F1 Score: {f1_scores.mean():.4f}")
    print("-" * 50)


🔍 5-Fold Cross-Validation Results

📌 SVM (Linear)
   ✅ Mean Accuracy: 0.5116
   🎯 Mean F1 Score: 0.4457
--------------------------------------------------
📌 Logistic (L2)
   ✅ Mean Accuracy: 0.5116
   🎯 Mean F1 Score: 0.4498
--------------------------------------------------
📌 Voting Ensemble
   ✅ Mean Accuracy: 0.5116
   🎯 Mean F1 Score: 0.4498
--------------------------------------------------
