# Setup

In [3]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

In [4]:
def find_project_root(start: Path | None = None) -> Path:
    """Return the repo root by searching upward for markers."""
    p = (start or Path.cwd()).resolve()
    markers = {".git", "environment.yml", "README.md"}
    while True:
        if any((p / m).exists() for m in markers):
            return p
        if p.parent == p:
            # fallback: use start if nothing found
            return (start or Path.cwd()).resolve()
        p = p.parent

# Allow a manual override via the HBN_PROJ_ROOT env var. An unset, empty, or
# whitespace-only value falls back to auto-detection: without the `or`, an
# empty string would silently become Path("."), i.e. whatever the kernel's
# working directory happens to be. `expanduser()` lets the override use "~".
ROOT = Path(os.environ.get("HBN_PROJ_ROOT", "").strip() or find_project_root()).expanduser()

# --- Paths for INPUT data ---
CORE_PATH   = ROOT / "data" / "processed" / "hbn_core_view_v1.csv"
CLUSTER_PATH = ROOT / "results" / "kmeans_model" / "cluster_assignments.csv"
FLAGS_PATH  = ROOT / "data" / "processed" / "hbn_diag_flags_neuro_anx.csv"

# --- Paths for OUTPUT (this notebook) ---
RESULTS_DIR = ROOT / "results" / "kmeans_diagnosis"
# Create the output directory up front so later cells can write into it.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Two candidate locations for the merged table; pick one when saving.
MERGED_PROCESSED_PATH = ROOT / "data" / "processed" / "hbn_core_clusters_diag.csv"
MERGED_INTERIM_PATH   = ROOT / "data" / "interim"   / "hbn_core_clusters_diag.csv"

# Echo the resolved locations so a reader can confirm what will be read/written.
print("Project root: ", ROOT)
print("Core data   : ", CORE_PATH)
print("Clusters    : ", CLUSTER_PATH)
print("Diag flags  : ", FLAGS_PATH)
print("Results dir : ", RESULTS_DIR)

Project root:  /Users/yizj/Desktop/hbn_project
Core data   :  /Users/yizj/Desktop/hbn_project/data/processed/hbn_core_view_v1.csv
Clusters    :  /Users/yizj/Desktop/hbn_project/results/kmeans_model/cluster_assignments.csv
Diag flags  :  /Users/yizj/Desktop/hbn_project/data/processed/hbn_diag_flags_neuro_anx.csv
Results dir :  /Users/yizj/Desktop/hbn_project/results/kmeans_diagnosis


In [5]:
# Load the three inputs from the paths configured above.
df_core = pd.read_csv(CORE_PATH)
clusters = pd.read_csv(CLUSTER_PATH)
diag_flags = pd.read_csv(FLAGS_PATH)

# Standardize ID in all three so case/whitespace differences do not cause
# spurious join misses.
# NOTE(review): astype(str) turns a missing _EID into the literal "NAN" —
# confirm none of the inputs has null IDs.
for frame in [df_core, clusters, diag_flags]:
    frame["_EID"] = frame["_EID"].astype(str).str.upper().str.strip()

# Merge core + clusters (inner join: only rows whose _EID has a cluster label)
df_core = df_core.merge(clusters[["_EID", "cluster"]], on="_EID", how="inner")

# Merge diagnosis flags (left join: rows without flags keep NaN in flag columns)
df_all = df_core.merge(diag_flags, on="_EID", how="left")

# Persist first, then inspect. A bare expression is only displayed when it is
# the LAST statement of a cell, so the preview must come after the write.
df_all.to_csv(MERGED_PROCESSED_PATH, index=False)  # or MERGED_INTERIM_PATH

print("Merged shape:", df_all.shape)
df_all.head()

# Prevalence & Maintenance

## Helpers

In [6]:
def cramers_v_bias_corrected(table: pd.DataFrame) -> float:
    """Bias-corrected Cramér's V for a contingency table of counts.

    The chi-square statistic comes from ``chi2_contingency`` with its default
    settings; ``phi^2 = chi2 / n`` and the row/column counts are then shrunk
    by the ``(n - 1)`` bias-correction terms before forming V.

    Parameters
    ----------
    table : pd.DataFrame
        Observed counts (rows x columns), e.g. the output of ``pd.crosstab``.

    Returns
    -------
    float
        The corrected V, or NaN when it is undefined: a single observation,
        or a table whose corrected dimensions leave nothing to normalise by
        (a single row or column, or as many rows/columns as observations).
    """
    chi2, _, _, _ = chi2_contingency(table)
    n = table.to_numpy().sum()
    r, k = table.shape

    # Every correction term divides by (n - 1); a single observation leaves
    # the statistic undefined.
    if n <= 1:
        return np.float64(np.nan)

    phi2 = chi2 / n
    # bias correction
    phi2_corr = max(0.0, phi2 - (k - 1) * (r - 1) / (n - 1))
    r_corr = r - (r - 1) ** 2 / (n - 1)
    k_corr = k - (k - 1) ** 2 / (n - 1)

    denom = min(k_corr - 1, r_corr - 1)
    # Degenerate table: previously this was a 0/0 division that emitted a
    # RuntimeWarning and produced NaN incidentally. Return NaN explicitly.
    if denom <= 0:
        return np.float64(np.nan)

    return np.sqrt(phi2_corr / denom)

def chi2_cramers_v(x: pd.Series, y: pd.Series):
    """Cross-tabulate ``x`` against ``y`` and test the two for association.

    Returns
    -------
    tuple
        ``(chi2, p, dof, V, tab)``: the chi-square statistic, p-value and
        degrees of freedom from ``chi2_contingency`` on the crosstab, the
        bias-corrected Cramér's V of that crosstab, and the crosstab itself.
    """
    contingency = pd.crosstab(x, y)
    statistic, p_value, degrees_of_freedom, _expected = chi2_contingency(contingency)
    effect_size = cramers_v_bias_corrected(contingency)
    return statistic, p_value, degrees_of_freedom, effect_size, contingency

def standardized_residuals_from_table(obs: pd.DataFrame) -> pd.DataFrame:
    """Per-cell residuals of an observed contingency table.

    Each cell is ``(observed - expected) / sqrt(expected)``, where the
    expected counts are those returned by ``chi2_contingency`` for ``obs``.
    The result carries the same index and columns as ``obs``.
    """
    counts = obs.values
    # Only the expected-frequency array (last element of the result) is needed.
    *_, expected_counts = chi2_contingency(counts)
    residuals = (counts - expected_counts) / np.sqrt(expected_counts)
    return pd.DataFrame(residuals, index=obs.index, columns=obs.columns)