In [34]:
# train_model.py
import json
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# === 1) Load dataset ===
CSV_PATH = "Stress_Dataset.csv"
TARGET_COL = "Which type of stress do you primarily experience?"

df = pd.read_csv(CSV_PATH)

# If there are duplicated column names (e.g., anxiety appears twice), keep the first
df = df.loc[:, ~df.columns.duplicated()]

# Ensure target column exists
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found in CSV. Found: {list(df.columns)}")

# === 2) Split features/target ===
# Features are all columns EXCEPT the target (25 features)
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Save feature order for the Flask app to keep consistency
feature_order = list(X.columns)
with open("feature_order.json", "w", encoding="utf-8") as f:
    json.dump(feature_order, f, ensure_ascii=False, indent=2)

# === 3) Encode target labels ===
le = LabelEncoder()
y_enc = le.fit_transform(y)

# === 4) Train/test split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

# === 5) Scale features ===
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# === 6) Train model (ExtraTrees – fast, strong baseline) ===
model = ExtraTreesClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
model.fit(X_train_s, y_train)

# === 7) Evaluate ===
y_pred = model.predict(X_test_s)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# === 8) Persist artifacts ===
joblib.dump(model, "model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(le, "label_encoder.pkl")

print("✅ Saved: model.pkl, scaler.pkl, label_encoder.pkl, feature_order.json")


Accuracy: 0.9408

Classification Report:
                                                                                  precision    recall  f1-score   support

Distress (Negative Stress) - Stress that causes anxiety and impairs well-being.       1.00      0.67      0.80         6
   Eustress (Positive Stress) - Stress that motivates and enhances performance.       0.94      1.00      0.97       154
                       No Stress - Currently experiencing minimal to no stress.       1.00      0.11      0.20         9

                                                                       accuracy                           0.94       169
                                                                      macro avg       0.98      0.59      0.66       169
                                                                   weighted avg       0.94      0.94      0.92       169

Confusion Matrix:
 [[  4   2   0]
 [  0 154   0]
 [  0   8   1]]
✅ Saved: model.pkl, scaler.pkl, label_encod