In [None]:
# =========================================================
# 1. IMPORT LIBRARIES
# =========================================================
import pandas as pd
import numpy as np

# =========================================================
# 2. LOAD CSV
# =========================================================
# Read the raw form responses from the CSV export.
df = pd.read_csv("Form_Responses.csv")

# Work on an independent copy restricted to the score and label columns,
# so columns added later do not touch the raw frame `df`.
cols = [
    "Depression_Score",
    "Anxiety_Score",
    "Total_Score",
    "Depressed_Anxious",
]
df_scores = df.loc[:, cols].copy()

df_scores.head()


Unnamed: 0,Depression_Score,Anxiety_Score,Total_Score,Depressed_Anxious
0,27,21,48,Severe
1,4,5,9,Minimal and Mild
2,6,6,12,Minimal and Mild
3,27,21,48,Severe
4,0,0,0,Minimal and Mild


In [2]:
# =========================================================
# 3. CHECK HOW TOTAL_SCORE IS FORMED
#    Hypothesis: Total_Score = Depression_Score + Anxiety_Score
# =========================================================
# Recompute the hypothesised total as the element-wise sum of the two subscales.
df_scores["Depression_plus_Anxiety"] = df_scores["Depression_Score"].add(
    df_scores["Anxiety_Score"]
)

# Count the rows whose stored total disagrees with the recomputed sum.
is_mismatch = df_scores["Total_Score"].ne(df_scores["Depression_plus_Anxiety"])
mismatch_count = is_mismatch.sum()
print("Number of rows where Total_Score != Depression_Score + Anxiety_Score:", mismatch_count)

# Preview the frame, now including the recomputed column.
df_scores.head()


Number of rows where Total_Score != Depression_Score + Anxiety_Score: 0


Unnamed: 0,Depression_Score,Anxiety_Score,Total_Score,Depressed_Anxious,Depression_plus_Anxiety
0,27,21,48,Severe,48
1,4,5,9,Minimal and Mild,9
2,6,6,12,Minimal and Mild,12
3,27,21,48,Severe,48
4,0,0,0,Minimal and Mild,0


In [3]:
# =========================================================
# 4. CLASS DISTRIBUTION
# =========================================================
# Number of respondents carrying each Depressed_Anxious label.
label_column = df_scores["Depressed_Anxious"]
label_column.value_counts()


Depressed_Anxious
Minimal and Mild    212
Moderate             53
Severe               47
Name: count, dtype: int64

In [4]:
# =========================================================
# 5. SUMMARY OF TOTAL_SCORE BY EACH CATEGORY
# =========================================================
# Per-label descriptive statistics of Total_Score; the min/max columns expose
# the score range covered by each label.
summary_stats = ["min", "max", "mean", "median", "count"]
group_summary = (
    df_scores
    .groupby("Depressed_Anxious")["Total_Score"]
    .agg(summary_stats)
    .sort_index()
)

group_summary


Unnamed: 0_level_0,min,max,mean,median,count
Depressed_Anxious,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Minimal and Mild,0,19,10.75,11.0,212
Moderate,20,29,24.320755,24.0,53
Severe,30,48,37.765957,38.0,47


In [5]:
# =========================================================
# 6. DEFINE RULE-BASED CLASSIFIER USING TOTAL_SCORE
# =========================================================
def classify_from_total(
    total_score: float,
    mild_max: float = 19,
    moderate_max: float = 29,
) -> "str | float":
    """
    Reconstruct the Depressed_Anxious label from a total score.

    With the default cut-offs the function reproduces the ranges observed
    in the dataset:
      - 0–19   -> 'Minimal and Mild'
      - 20–29  -> 'Moderate'
      - 30–48  -> 'Severe'

    The cut-offs are parameters so the same three-band rule can be reused
    on a different scale (for example ``mild_max=9, moderate_max=19`` for a
    0–30 questionnaire).

    Parameters
    ----------
    total_score : float
        Score to classify. No range check is applied: any value
        ``<= mild_max`` (including negatives) is 'Minimal and Mild' and any
        value ``> moderate_max`` is 'Severe'.
    mild_max : float, default 19
        Highest score (inclusive) labelled 'Minimal and Mild'.
    moderate_max : float, default 29
        Highest score (inclusive) labelled 'Moderate'.

    Returns
    -------
    str or float
        The severity label, or ``np.nan`` when ``total_score`` is missing.

    Raises
    ------
    ValueError
        If ``mild_max`` is greater than ``moderate_max``.
    """
    # Inverted thresholds would silently make the 'Moderate' band unreachable.
    if mild_max > moderate_max:
        raise ValueError("mild_max must not be greater than moderate_max")

    # Missing scores stay missing rather than being forced into a band.
    if pd.isna(total_score):
        return np.nan

    if total_score <= mild_max:
        return "Minimal and Mild"
    if total_score <= moderate_max:
        return "Moderate"
    return "Severe"

# Label every row by mapping its Total_Score through the rule-based classifier.
df_scores["Rule_Based_Level"] = df_scores["Total_Score"].map(classify_from_total)

# Show the original and reconstructed labels side by side for the first ten rows.
df_scores.loc[:, ["Total_Score", "Depressed_Anxious", "Rule_Based_Level"]].head(10)


Unnamed: 0,Total_Score,Depressed_Anxious,Rule_Based_Level
0,48,Severe,Severe
1,9,Minimal and Mild,Minimal and Mild
2,12,Minimal and Mild,Minimal and Mild
3,48,Severe,Severe
4,0,Minimal and Mild,Minimal and Mild
5,40,Severe,Severe
6,10,Minimal and Mild,Minimal and Mild
7,15,Minimal and Mild,Minimal and Mild
8,47,Severe,Severe
9,15,Minimal and Mild,Minimal and Mild


In [6]:
# =========================================================
# 7. VALIDATE RULE AGAINST ORIGINAL LABELS
# =========================================================

# 1) Exact match rate: fraction of rows where both labels agree
labels_agree = df_scores["Depressed_Anxious"].eq(df_scores["Rule_Based_Level"])
match_rate = labels_agree.mean()
print(f"Match rate between rule-based labels and original labels: {match_rate:.3f}")

# 2) Confusion table: original labels as rows, rule-based labels as columns
confusion_table = pd.crosstab(
    index=df_scores["Depressed_Anxious"],
    columns=df_scores["Rule_Based_Level"],
    rownames=["Original label"],
    colnames=["Rule-based label"],
)

confusion_table


Match rate between rule-based labels and original labels: 1.000


Rule-based label,Minimal and Mild,Moderate,Severe
Original label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Minimal and Mild,212,0,0
Moderate,0,53,0
Severe,0,0,47


### 8. Mapping from dataset scores (0–48) to questionnaire scores (0–30)

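The original dataset uses `Total_Score` in the range 0–48, computed as the sum of `Depression_Score` and `Anxiety_Score` (the check in section 3 found zero mismatching rows). 
Based on the existing `Depressed_Anxious` labels, three non-overlapping intervals were identified; applying them as a rule reproduces all 312 original labels exactly: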

- 0–19  : Minimal and Mild  
- 20–29 : Moderate  
- 30–48 : Severe  

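For the interactive "Know Yourself" questionnaire in Streamlit, a shorter 10-item checklist was designed 
with scores ranging from 0 to 30. To keep the interpretation consistent and easy to understand, 
the same three-band structure was kept with round cut-offs at 10 and 20. These are an approximation rather than an exact 
linear rescaling of the dataset cut-offs, which would place the boundaries near 12.5 and 18.75 on the 0–30 scale: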

- 0–9   → "Minimal and Mild"  
- 10–19 → "Moderate"  
- 20–30 → "Severe"  

This ensures that a higher score still reflects a higher level of distress, while maintaining three 
clearly separated wellness levels.
