# Main Code

In [3]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.pipeline import Pipeline

from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,
    ExtraTreesClassifier, StackingClassifier, VotingClassifier
)
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

df = pd.read_csv("/content/Raw Dataset.csv")
df.columns = [col.split(". ", 1)[-1].strip() for col in df.columns]

stress_cols = df.columns[7:17]
anxiety_cols = df.columns[17:24]
depression_cols = df.columns[24:33]
selected_cols = list(stress_cols) + list(anxiety_cols) + list(depression_cols)

def convert_response(resp):
    mapping = {
        "0 - Never": 0, "1 - Almost Never": 1, "2 - Sometimes": 2,
        "3 - Fairly Often": 3, "4 - Very Often": 4,
        "0 - Not at all": 0, "1 - Several days": 1,
        "2 - More than half the days": 2, "3 - Nearly every day": 3
    }
    return mapping.get(resp, np.nan)

for col in selected_cols:
    df[col] = df[col].apply(convert_response)

imputer = KNNImputer(n_neighbors=5)
df[selected_cols] = imputer.fit_transform(df[selected_cols])

def remove_outliers(df, cols):
    Q1 = df[cols].quantile(0.25)
    Q3 = df[cols].quantile(0.75)
    IQR = Q3 - Q1
    return df[~((df[cols] < (Q1 - 1.5 * IQR)) | (df[cols] > (Q3 + 1.5 * IQR))).any(axis=1)]

df = remove_outliers(df, selected_cols)

df["Stress_Score"] = df[stress_cols].sum(axis=1)
df["Anxiety_Score"] = df[anxiety_cols].sum(axis=1)
df["Depression_Score"] = df[depression_cols].sum(axis=1)

def classify(score, mild, moderate):
    if score <= mild: return "Low"
    elif score <= moderate: return "Moderate"
    else: return "Severe"

df["Stress_Level"] = df["Stress_Score"].apply(lambda x: classify(x, 13, 26))
df["Anxiety_Level"] = df["Anxiety_Score"].apply(lambda x: classify(x, 7, 14))
df["Depression_Level"] = df["Depression_Score"].apply(lambda x: classify(x, 9, 18))

X = df[selected_cols]
y = df[["Stress_Level", "Anxiety_Level", "Depression_Level"]]

os.makedirs("models/cv_models", exist_ok=True)

models = [
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("KNN", KNeighborsClassifier()),
    ("SVM", SVC(probability=True, random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("Naive Bayes", GaussianNB()),
    ("Extra Trees", ExtraTreesClassifier(random_state=42)),
    ("QDA", QuadraticDiscriminantAnalysis()),
    ("LDA", LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')),
    ("Ridge Classifier", RidgeClassifier(alpha=1))
]

# Stratified K-Fold Cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

label_encoders = {label: LabelEncoder().fit(y[label]) for label in y.columns}

# Loop through each label (target)
for label in y.columns:
    print(f"\nProcessing target: {label}")

    le = label_encoders[label]
    y_encoded = le.transform(y[label])

    selector = SelectKBest(f_classif, k=15)
    X_selected = selector.fit_transform(X, y_encoded)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # PCA
    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    # SMOTE + ENN Resampling (via Pipeline)
    smote = SMOTE(random_state=42)
    enn = EditedNearestNeighbours(n_neighbors=3)
    resampling_pipeline = Pipeline(steps=[('smote', smote), ('enn', enn)])

    X_resampled, y_resampled = resampling_pipeline.fit_resample(X_pca, y_encoded)

    X_train, X_val, y_train, y_val = train_test_split(
        X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
    )

    # Hyperparameter tuning for Random Forest
    param_grid_rf = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }
    grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=3, n_jobs=-1, verbose=2)
    grid_rf.fit(X_train, y_train)
    print(f"Best Random Forest Hyperparameters: {grid_rf.best_params_}")
    print(f"Best Random Forest Accuracy: {grid_rf.best_score_}")

    # stacking classifier
    base_learners = [
        ('rf', RandomForestClassifier(n_estimators=200, random_state=42)),
        ('svc', SVC(probability=True, random_state=42)),
        ('mlp', MLPClassifier(hidden_layer_sizes=(256, 128, 64), activation='relu', solver='adam',
                   alpha=0.0001, max_iter=500, learning_rate_init=0.001, random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('ada', AdaBoostClassifier(random_state=42))
    ]
    meta_learner = LogisticRegression()
    stacking_clf = StackingClassifier(estimators=base_learners, final_estimator=meta_learner)

    stacking_clf.fit(X_train, y_train)
    stacking_acc = stacking_clf.score(X_val, y_val)
    print(f"Stacking Classifier (with more models) | Accuracy: {stacking_acc:.4f}")
    joblib.dump(stacking_clf, f"models/cv_models/{label}_StackingClassifier.pkl")

    # Voting ensemble
    ensemble = VotingClassifier(estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('svc', SVC(probability=True, random_state=42)),
        ('mlp', MLPClassifier(hidden_layer_sizes=(256, 128, 64), activation='relu', solver='adam',
                   alpha=0.0001, max_iter=500, learning_rate_init=0.001, random_state=42))
    ], voting='soft')

    ensemble.fit(X_train, y_train)
    ensemble_acc = ensemble.score(X_val, y_val)
    print(f"Voting Ensemble | Accuracy: {ensemble_acc:.4f}")
    joblib.dump(ensemble, f"models/cv_models/{label}_VotingEnsemble.pkl")

    # Cross-validation with Repeated Stratified K-Fold
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
    scores = cross_val_score(stacking_clf, X_resampled, y_resampled, cv=cv, n_jobs=-1, scoring='accuracy')
    print(f"Stacking Classifier | Cross-validated accuracy: {np.mean(scores):.4f} ± {np.std(scores):.4f}")



Processing target: Stress_Level
Fitting 3 folds for each of 36 candidates, totalling 108 fits


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Stress_Score"] = df[stress_cols].sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Anxiety_Score"] = df[anxiety_cols].sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Depression_Score"] = df[depression_cols].sum(axis=1)
A value is trying to be set on a copy of a slice

Best Random Forest Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Random Forest Accuracy: 0.9841214699279215
Stacking Classifier (with more models) | Accuracy: 0.9918
Voting Ensemble | Accuracy: 0.9898
Stacking Classifier | Cross-validated accuracy: 0.9917 ± 0.0041

Processing target: Anxiety_Level
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best Random Forest Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Random Forest Accuracy: 0.9731046227321594
Stacking Classifier (with more models) | Accuracy: 1.0000
Voting Ensemble | Accuracy: 0.9971
Stacking Classifier | Cross-validated accuracy: 0.9950 ± 0.0049

Processing target: Depression_Level
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best Random Forest Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best Random Forest Accu