In [1]:
!pip install pandas scikit-learn joblib





[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd

# Read the survey dataset from the working directory
df = pd.read_csv('balanced_mental_health_dataset.csv')

# Summarise dimensions and column names, then preview the leading rows
column_names = list(df.columns)
print("Shape:", df.shape)
print("Columns:", column_names)
df.head()


Shape: (192000, 32)
Columns: ['PHQ9_Item_1', 'PHQ9_Item_2', 'PHQ9_Item_3', 'PHQ9_Item_4', 'PHQ9_Item_5', 'PHQ9_Item_6', 'PHQ9_Item_7', 'PHQ9_Item_8', 'PHQ9_Item_9', 'GAD7_Item_1', 'GAD7_Item_2', 'GAD7_Item_3', 'GAD7_Item_4', 'GAD7_Item_5', 'GAD7_Item_6', 'GAD7_Item_7', 'PHQ9_Total', 'PHQ9_Severity', 'GAD7_Total', 'GAD7_Severity', 'Age_Group', 'Gender', 'Employment_Status', 'Physical_Activity', 'Social_Interaction', 'Chronic_Health_Conditions', 'Medication_Usage', 'Sleep_Quality', 'Stress_Level', 'Substance_Use', 'Mood_Rating', 'Sleep_Hours']


Unnamed: 0,PHQ9_Item_1,PHQ9_Item_2,PHQ9_Item_3,PHQ9_Item_4,PHQ9_Item_5,PHQ9_Item_6,PHQ9_Item_7,PHQ9_Item_8,PHQ9_Item_9,GAD7_Item_1,...,Employment_Status,Physical_Activity,Social_Interaction,Chronic_Health_Conditions,Medication_Usage,Sleep_Quality,Stress_Level,Substance_Use,Mood_Rating,Sleep_Hours
0,0,1,0,0,0,0,0,2,0,1,...,Unemployed,5.0,5.0,0,0,3.0,3.0,,9.0,6.6
1,0,0,0,0,0,0,2,0,0,2,...,Unemployed,4.0,4.0,0,0,5.0,2.0,,8.0,7.9
2,0,1,0,0,0,0,0,0,0,1,...,Unemployed,5.0,5.0,0,0,5.0,1.0,,8.0,6.2
3,2,0,0,0,0,0,0,1,1,0,...,Unemployed,5.0,5.0,0,1,4.0,2.0,,9.0,5.9
4,0,0,1,0,2,1,0,0,0,0,...,Unemployed,3.0,4.0,1,0,5.0,2.0,,9.0,5.7


In [3]:
from sklearn.preprocessing import LabelEncoder

# Questionnaire item columns: PHQ-9 has items 1..9, GAD-7 has items 1..7
phq_cols  = [f'PHQ9_Item_{i}' for i in range(1,10)]
gad_cols  = [f'GAD7_Item_{i}' for i in range(1,8)]

# Other predictors (demographic / lifestyle columns)
demo_cols = [
    'Age_Group','Gender','Employment_Status','Physical_Activity','Social_Interaction',
    'Chronic_Health_Conditions','Medication_Usage','Sleep_Quality','Stress_Level',
    'Substance_Use','Mood_Rating','Sleep_Hours'
]

feature_cols = phq_cols + gad_cols + demo_cols
target_dep   = 'PHQ9_Severity'
target_anx   = 'GAD7_Severity'

# Encode categorical predictors as integer codes.
# Each fitted encoder is kept in `feature_encoders` (column name -> encoder) so
# the exact same string -> integer mapping can be re-applied to new data later;
# without it the mapping is lost as soon as this cell finishes.
# Any non-numeric column is encoded, which covers both `object` columns and
# pandas' dedicated string dtype.
# NOTE: astype(str) turns missing values into the literal string 'nan', so
# missing entries become a category of their own.
feature_encoders = {}
for col in demo_cols:
    if not pd.api.types.is_numeric_dtype(df[col]):
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col].astype(str))
        feature_encoders[col] = encoder

# Encode targets; le_dep / le_anx hold the class names for decoding predictions.
# NOTE: this overwrites the target columns of `df` in place. Re-running this
# cell without reloading `df` would refit the encoders on the integer codes and
# replace the class names in `classes_` with integers.
le_dep = LabelEncoder()
df[target_dep] = le_dep.fit_transform(df[target_dep])
le_anx = LabelEncoder()
df[target_anx] = le_anx.fit_transform(df[target_anx])

# Check a sample
df[feature_cols + [target_dep, target_anx]].head()


Unnamed: 0,PHQ9_Item_1,PHQ9_Item_2,PHQ9_Item_3,PHQ9_Item_4,PHQ9_Item_5,PHQ9_Item_6,PHQ9_Item_7,PHQ9_Item_8,PHQ9_Item_9,GAD7_Item_1,...,Social_Interaction,Chronic_Health_Conditions,Medication_Usage,Sleep_Quality,Stress_Level,Substance_Use,Mood_Rating,Sleep_Hours,PHQ9_Severity,GAD7_Severity
0,0,1,0,0,0,0,0,2,0,1,...,5.0,0,0,3.0,3.0,3,9.0,6.6,1,1
1,0,0,0,0,0,0,2,0,0,2,...,4.0,0,0,5.0,2.0,3,8.0,7.9,1,1
2,0,1,0,0,0,0,0,0,0,1,...,5.0,0,0,5.0,1.0,3,8.0,6.2,1,1
3,2,0,0,0,0,0,0,1,1,0,...,5.0,0,1,4.0,2.0,3,9.0,5.9,1,1
4,0,0,1,0,2,1,0,0,0,0,...,4.0,1,0,5.0,2.0,3,9.0,5.7,1,1


In [4]:
from sklearn.model_selection import train_test_split

# One feature matrix shared by both tasks, plus one label vector per task
X = df[feature_cols]
y_dep, y_anx = df[target_dep], df[target_anx]

# Both tasks use the same hold-out fraction and seed; only the label used for
# stratification differs between the two splits.
split_kwargs = dict(test_size=0.2, random_state=42)

# Depression split, stratified on the depression label
X_train_dep, X_test_dep, y_train_dep, y_test_dep = train_test_split(
    X, y_dep, stratify=y_dep, **split_kwargs
)

# Anxiety split, stratified on the anxiety label
X_train_anx, X_test_anx, y_train_anx, y_test_anx = train_test_split(
    X, y_anx, stratify=y_anx, **split_kwargs
)

print("Train/Test sizes:", X_train_dep.shape, X_test_dep.shape)


Train/Test sizes: (153600, 28) (38400, 28)


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# A single estimator object is fitted twice: first on the depression labels,
# then on the anxiety labels (the second fit replaces the first).
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Depression: fit, predict on the held-out rows, report per-class metrics
dep_preds = rf.fit(X_train_dep, y_train_dep).predict(X_test_dep)
dep_class_names = list(map(str, le_dep.classes_))
print("=== Random Forest: Depression ===")
print(classification_report(y_test_dep, dep_preds, target_names=dep_class_names))

# Anxiety: same procedure on the anxiety split
anx_preds = rf.fit(X_train_anx, y_train_anx).predict(X_test_anx)
anx_class_names = list(map(str, le_anx.classes_))
print("=== Random Forest: Anxiety ===")
print(classification_report(y_test_anx, anx_preds, target_names=anx_class_names))


=== Random Forest: Depression ===
                   precision    recall  f1-score   support

             Mild       0.88      0.90      0.89      7680
          Minimal       0.97      0.95      0.96      7680
         Moderate       0.84      0.84      0.84      7680
Moderately Severe       0.84      0.88      0.86      7680
           Severe       0.97      0.92      0.94      7680

         accuracy                           0.90     38400
        macro avg       0.90      0.90      0.90     38400
     weighted avg       0.90      0.90      0.90     38400

=== Random Forest: Anxiety ===
              precision    recall  f1-score   support

        Mild       0.92      0.94      0.93      9600
     Minimal       0.99      0.97      0.98      9600
    Moderate       0.90      0.93      0.92      9600
      Severe       0.99      0.94      0.96      9600

    accuracy                           0.95     38400
   macro avg       0.95      0.95      0.95     38400
weighted avg       0.

In [6]:
import joblib

# Fit one dedicated forest per task on that task's training split
# (the held-out test rows are not used here).
best_dep = RandomForestClassifier(n_estimators=100, random_state=42)
best_anx = RandomForestClassifier(n_estimators=100, random_state=42)
best_dep.fit(X_train_dep, y_train_dep)
best_anx.fit(X_train_anx, y_train_anx)

# Persist each fitted model and its target label encoder under a fixed filename
artifacts = {
    'rf_depression_model.pkl': best_dep,
    'rf_anxiety_model.pkl': best_anx,
    'le_depression.pkl': le_dep,
    'le_anxiety.pkl': le_anx,
}
for filename, obj in artifacts.items():
    joblib.dump(obj, filename)

print("Random Forest models and label encoders saved.")


Random Forest models and label encoders saved.


In [8]:
import pandas as pd
import joblib
import numpy as np

# Restore the persisted classifiers and target encoders from the working directory.
# NOTE: joblib files are pickles; only load files from a trusted source.
model_dep, model_anx = (
    joblib.load(path)
    for path in ('rf_depression_model.pkl', 'rf_anxiety_model.pkl')
)
le_dep, le_anx = (
    joblib.load(path)
    for path in ('le_depression.pkl', 'le_anxiety.pkl')
)

# Column order expected by the models: PHQ-9 items, GAD-7 items, then the
# remaining predictors.
feature_cols = (
    [f'PHQ9_Item_{i}' for i in range(1, 10)]
    + [f'GAD7_Item_{i}' for i in range(1, 8)]
    + [
        'Age_Group', 'Gender', 'Employment_Status', 'Physical_Activity',
        'Social_Interaction', 'Chronic_Health_Conditions', 'Medication_Usage',
        'Sleep_Quality', 'Stress_Level', 'Substance_Use', 'Mood_Rating',
        'Sleep_Hours',
    ]
)

def predict_mental_health(input_dict):
    """Predict depression and anxiety severity labels for one respondent.

    Parameters
    ----------
    input_dict : dict
        Maps every name in ``feature_cols`` to a single value. Values are
        passed to the models unchanged, so they must already be in the numeric
        form the models accept. Keys that are not in ``feature_cols`` are
        ignored.

    Returns
    -------
    dict
        ``{'Depression_Severity': ..., 'Anxiety_Severity': ...}`` where each
        value is the class label decoded by ``le_dep`` / ``le_anx``.

    Raises
    ------
    ValueError
        If one or more names in ``feature_cols`` are absent from
        ``input_dict``.
    """
    # Fail loudly on incomplete input: the DataFrame constructor below would
    # otherwise fill absent (or misspelled) features with NaN without warning.
    missing = [col for col in feature_cols if col not in input_dict]
    if missing:
        raise ValueError(f"Missing required features: {missing}")

    # One-row frame with columns in the order given by feature_cols
    df_input = pd.DataFrame([input_dict], columns=feature_cols)
    dep_i = model_dep.predict(df_input)[0]
    anx_i = model_anx.predict(df_input)[0]
    return {
        'Depression_Severity': le_dep.inverse_transform([dep_i])[0],
        'Anxiety_Severity':    le_anx.inverse_transform([anx_i])[0]
    }

# Smoke test: run the first held-out depression row through the predictor
first_row = X_test_dep.iloc[0]
sample = first_row.to_dict()
print("Input sample:", sample)
prediction = predict_mental_health(sample)
print("Prediction:", prediction)


Input sample: {'PHQ9_Item_1': 3.0, 'PHQ9_Item_2': 0.0, 'PHQ9_Item_3': 1.0, 'PHQ9_Item_4': 2.0, 'PHQ9_Item_5': 3.0, 'PHQ9_Item_6': 3.0, 'PHQ9_Item_7': 1.0, 'PHQ9_Item_8': 3.0, 'PHQ9_Item_9': 2.0, 'GAD7_Item_1': 3.0, 'GAD7_Item_2': 3.0, 'GAD7_Item_3': 1.0, 'GAD7_Item_4': 3.0, 'GAD7_Item_5': 1.0, 'GAD7_Item_6': 2.0, 'GAD7_Item_7': 2.0, 'Age_Group': 0.0, 'Gender': 2.0, 'Employment_Status': 2.0, 'Physical_Activity': 3.0, 'Social_Interaction': 3.0, 'Chronic_Health_Conditions': 1.0, 'Medication_Usage': 1.0, 'Sleep_Quality': 1.0, 'Stress_Level': 6.0, 'Substance_Use': 0.0, 'Mood_Rating': 3.0, 'Sleep_Hours': 5.2}
Prediction: {'Depression_Severity': 'Severe', 'Anxiety_Severity': 'Severe'}
