In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# 1. Load dataset
df = pd.read_csv("../preprocessing/expanded_dataset.csv")

# 2. Select features and labels
X = df[['daily_screen_time', 'session_duration', 'app_switches', 'night_activity']]
y = df['label']

# 3. Split into train/test BEFORE applying SMOTE, so that no synthetic sample
#    derived from a test row can end up in the training data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 4. Build a pipeline: Scaling -> SMOTE -> Random Forest
#    SMOTE chooses neighbours and interpolates by distance in feature space,
#    so the features are standardised first; otherwise the feature with the
#    largest raw range would dominate the neighbour search.
#    The imblearn Pipeline applies the sampler only while fitting, so
#    predict/predict_proba never resample the data they are given.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

# 5. Cross-validation on TRAINING data
#    The whole pipeline is passed in, so scaling and oversampling are re-fitted
#    on the training part of every fold and the validation part stays untouched.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring='f1')
print("Cross-validation F1 scores:", cv_scores)
print("Average CV F1:", cv_scores.mean())

# 6. Fit the pipeline on the full training data
pipe.fit(X_train, y_train)

# 7. Evaluate on the untouched TEST set
y_pred = pipe.predict(X_test)
# Column 1 of predict_proba is the probability of the second class in
# pipe.classes_ (assumed to be the positive label 1 -- confirm for this dataset).
y_proba = pipe.predict_proba(X_test)[:, 1]

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


Cross-validation scores: [0.96969697 1.         1.         1.         1.        ]
Average cross-validation score: 0.9939393939393939
Accuracy: 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        18

    accuracy                           1.00        33
   macro avg       1.00      1.00      1.00        33
weighted avg       1.00      1.00      1.00        33

