In [5]:
import pandas as pd
import numpy as np

# Load the interim CSV (the path is relative to the current working directory).
df = pd.read_csv("data/interim/NIH_TempDisc_pheno_diagnosis_withTD.csv")

# 1) Drop columns with ≥90% missing
# col_missing holds, for every column of df, the fraction of rows that are NaN.
col_missing = df.isna().mean()
hard_drop = col_missing.index[col_missing.ge(0.90)].tolist()
df1 = df.drop(columns=hard_drop)

# 2) Build feature tiers from the per-column missing fraction:
#    core     -> at most 20% missing
#    extended -> more than 20% and at most 40% missing
core_cols = col_missing.index[col_missing.le(0.20)].tolist()
extended_cols = col_missing.index[
    col_missing.gt(0.20) & col_missing.le(0.40)
].tolist()

# Pick working feature set
must_have = [
    "_EID",
    "Age",
    "Sex",
    "logk_mean",
    "ed50_mean",
    "k_mean",
    "k_abs_diff",
]
# Core-tier NIH columns whose name mentions one of the three listed tasks.
nih_candidates = [
    col
    for col in core_cols
    if col.startswith("NIH_final,NIH_")
    and any(task in col for task in ("Flanker", "Processing", "List_Sort"))
]
# De-duplicate and sort so the column order is deterministic.
feat_core = sorted({*must_have, *nih_candidates})

# 3) Make complete-case views
def complete_case_view(cols, min_row_frac=0.9, data=None):
    """Return the rows that are sufficiently complete across ``cols``.

    A row is kept when the fraction of non-missing values among ``cols``
    (ignoring the ``"_EID"`` identifier column) is at least ``min_row_frac``.
    Note that this is a strict complete-case filter only when
    ``min_row_frac=1.0``; lower thresholds tolerate some missing values.

    Parameters
    ----------
    cols : list of str
        Columns to include in the returned view. ``"_EID"`` may be present
        and is returned, but it does not count toward row completeness.
    min_row_frac : float, default 0.9
        Minimum fraction of non-missing values a row must have to be kept.
    data : pandas.DataFrame, optional
        Frame to filter. When omitted, the notebook-level ``df1`` is used,
        which matches the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The selected rows of the source frame, restricted to ``cols`` (in the
        given order) and keeping the original index labels.

    Raises
    ------
    KeyError
        If any name in ``cols`` is not a column of the source frame.
    """
    frame = df1 if data is None else data

    required = [c for c in cols if c != "_EID"]
    if not required:
        # With no feature columns the row-wise mean would be NaN for every
        # row and the comparison would silently discard all of them; there is
        # nothing to be incomplete about, so keep every row instead.
        return frame.loc[:, cols]

    keep = frame[required].notna().mean(axis=1) >= min_row_frac
    return frame.loc[keep, cols]

core_view = complete_case_view(feat_core, min_row_frac=0.9)
print("Core view shape:", core_view.shape)

# 4) Missingness report by feature (for the chosen set)
# Percent of NaN per feature among the retained rows, rounded to one decimal
# place and listed from most to least missing.
miss_report = core_view[feat_core].isna().mean() * 100
miss_report = miss_report.round(1).sort_values(ascending=False)
print(miss_report)

# 5) Optional extended view if power stays OK
# Core features plus the extended-tier NIH columns for the same three tasks;
# rows only need 85% of these features to be present.
feat_ext = sorted(
    set(feat_core).union(
        col
        for col in extended_cols
        if col.startswith("NIH_final,NIH_")
        and any(task in col for task in ("Flanker", "Processing", "List_Sort"))
    )
)
ext_view = complete_case_view(feat_ext, min_row_frac=0.85)
print("Extended view shape:", ext_view.shape)

Core view shape: (1975, 27)
k_abs_diff                                 3.4
Age                                        0.1
Sex                                        0.1
NIH_final,NIH_List_Sort_Inst_Status        0.1
NIH_final,NIH_Processing_Age_Corr_Stnd     0.0
k_mean                                     0.0
ed50_mean                                  0.0
_EID                                       0.0
NIH_final,NIH_Processing_Uncorr_Stnd       0.0
NIH_final,NIH_Processing_Raw               0.0
NIH_final,NIH_Processing_Itm_Cnt           0.0
NIH_final,NIH_Processing_Inst_Status       0.0
NIH_final,NIH_Processing_Inst_Breakoff     0.0
NIH_final,NIH_Processing_Computed_Score    0.0
NIH_final,NIH_List_Sort_Uncorr_Stnd        0.0
NIH_final,NIH_Flanker_Age_Corr_Stnd        0.0
NIH_final,NIH_List_Sort_Raw                0.0
NIH_final,NIH_List_Sort_Itm_Cnt            0.0
NIH_final,NIH_List_Sort_Inst_Breakoff      0.0
NIH_final,NIH_List_Sort_Age_Corr_Stnd      0.0
NIH_final,NIH_Flanker_Uncorr_Stn

In [7]:
from pathlib import Path
import json, hashlib

import os

# Project root: defaults to ~/Desktop/hbn_project, but can be redirected by
# setting the HBN_PROJECT_ROOT environment variable so the notebook also runs
# on machines that do not share this directory layout.
PROJECT_ROOT = Path(os.environ.get("HBN_PROJECT_ROOT", Path.home() / "Desktop/hbn_project"))

PROCESSED = PROJECT_ROOT / "data/processed"
RESULTS   = PROJECT_ROOT / "results"
# Create both output directories (and any missing parents); no-op if present.
PROCESSED.mkdir(parents=True, exist_ok=True)
RESULTS.mkdir(parents=True, exist_ok=True)

# Versioned filenames
tag = "v1"
core_path = PROCESSED / f"hbn_core_view_{tag}.csv"
ext_path  = PROCESSED / f"hbn_extended_view_{tag}.csv"

# Write both views without the pandas index column.
core_view.to_csv(core_path, index=False)
ext_view.to_csv(ext_path, index=False)

# Last expression of the cell: display where the files were written.
core_path, ext_path

(PosixPath('/Users/yizj/Desktop/hbn_project/data/processed/hbn_core_view_v1.csv'),
 PosixPath('/Users/yizj/Desktop/hbn_project/data/processed/hbn_extended_view_v1.csv'))