What Does Leave (ordinal catagorical) show ?

In [3]:
import pandas as pd

# Load your data
df = pd.read_csv("/Users/adityachawla/Desktop/mental_health_predictor/HealthSurvey.csv")
df = df[(df["Age"] >= 18) & (df["Age"] <= 65)]
df.fillna({"self_employed": "No", "work_interfere": "Don't know"}, inplace=True)

# Keep only relevant columns
leave_df = df[["leave", "treatment"]].dropna()

# Label encode 'leave' and 'treatment'
from sklearn.preprocessing import LabelEncoder
le_leave = LabelEncoder()
le_treat = LabelEncoder()

leave_df["leave_enc"] = le_leave.fit_transform(leave_df["leave"].astype(str))
leave_df["treat_enc"] = le_treat.fit_transform(leave_df["treatment"])

# Group and calculate treatment % by leave level
result = leave_df.groupby("leave_enc")["treat_enc"].mean().reset_index()
result.columns = ["Leave Level (Encoded)", "% Seeking Treatment"]
print(result)




# People who answered “somewhat easy” (1) or “somewhat difficult” (3) were the most likely to seek treatment.

# Highest treatment % is at encoded value = 3, not at the extreme end (4 = “very difficult”).

# “Very difficult” (4) surprisingly shows a drop in treatment (only ~50%).



# Why might that happen?
# Because real-world behavior is complex:

# People with somewhat difficult leave may still try to get help.

# But those with very difficult leave may avoid treatment out of fear, pressure, or lack of support.

# So difficulty might increase distress, but not always lead to action.





   Leave Level (Encoded)  % Seeking Treatment
0                      0             0.450980
1                      1             0.648000
2                      2             0.492424
3                      3             0.680412
4                      4             0.497537


Q: Is the dataset imbalanced in terms of the target class?

In [4]:
df["treatment"].value_counts(normalize=True)


treatment
Yes    0.5048
No     0.4952
Name: proportion, dtype: float64

In [17]:
with open("treatment_analysis_report.txt", "w") as f:
    for col in features:
        f.write("═" * 70 + "\n")
        f.write(f"📊 Feature: {col.upper()}\n\n")

        mapping = {i: label for i, label in enumerate(encoders[col].classes_)}
        grouped = df.groupby(f"{col}_enc")["treatment_enc"].mean().reset_index()
        grouped.columns = ["Encoded Value", "% Seeking Treatment"]
        grouped["Original Category"] = grouped["Encoded Value"].map(mapping)
        grouped = grouped[["Original Category", "Encoded Value", "% Seeking Treatment"]]
        grouped["% Seeking Treatment"] = (grouped["% Seeking Treatment"] * 100).round(2).astype(str) + "%"

        f.write(grouped.to_string(index=False) + "\n")

        f.write("\n🔢 Label Encoding Mapping:\n")
        for i, label in enumerate(encoders[col].classes_):
            f.write(f"  {label} → {i}\n")
    f.write("═" * 70)


In [20]:
# Count and treatment rate per category
grouped = df.groupby("benefits").agg(
    People_in_Group=("treatment", "count"),
    People_Sought_Treatment=("treatment", lambda x: (x == "Yes").sum())
)

grouped["% Seeking Treatment"] = (grouped["People_Sought_Treatment"] / grouped["People_in_Group"]) * 100
print(grouped)


            People_in_Group  People_Sought_Treatment  % Seeking Treatment
benefits                                                                 
Don't know              407                      151            37.100737
No                      371                      179            48.247978
Yes                     472                      301            63.771186
