<a href="https://colab.research.google.com/github/cbanuelos/datathon-neuroncdocs/blob/main/ChronicIllnessSubdomainLabels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Chronic Illness Dataset
Human Input Relabeling for Analysis

cell 1: loading packages

cell 2: definitions

cell 3: Conditions Relabeling & CSV generation

cell 4: Symptoms Relabeling & CSV generation

cell 5: Tags Relabeling & CSV generation

cell 6: Food Relabeling & CSV generation

cell 7: Treatment Relabeling & CSV generation

In [None]:
!pip -q install sentence-transformers pandas scikit-learn

import re, unicodedata, math, json
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.mixture import GaussianMixture
import warnings


warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message=r".*'force_all_finite' was renamed to 'ensure_all_finite'.*",
    module=r"sklearn\.utils\.deprecation"
)


  $\max \{ \mathrm{core}_k(a),\ \mathrm{core}_k(b),\ \tfrac{1}{\alpha}\, d(a,b) \}$.


In [None]:
# Configuration: every tunable used by the relabeling cells lives here.

# Name of the SentenceTransformer model used to embed terms and anchors.
EMBED_MODEL = 'all-MiniLM-L6-v2'

# Fraction of the highest scores that auto_threshold feeds to the GMM.
TOP_P_KEEP = 0.90

# Minimum gap between the best and second-best cosine score for a term to be
# assigned to its best domain.
ASSIGN_MARGIN = 0.05

# Absolute minimum cosine score a term must reach to be assigned at all.
SOFT_ACCEPT_FLOOR = 0.45

# Seed passed to GaussianMixture so its initialization is reproducible.
RANDOM_STATE = 42


def normalize_text(x: str) -> str:
  """
  Canonicalize one raw user-entered term before it is embedded.

  Steps, in order:
    1) apply Unicode NFKC normalization
    2) trim surrounding whitespace and lowercase
    3) collapse every run of whitespace into a single space
    4) drop non-word characters from the very start and very end
       (punctuation inside the term, e.g. "me/cfs", is kept)

  x: the raw term
  returns: the normalized term; may be the empty string
  """
  folded = unicodedata.normalize("NFKC", x).strip().lower()
  single_spaced = re.sub(r'\s+', ' ', folded)
  return re.sub(r'^[^\w]+|[^\w]+$', '', single_spaced)



def make_prototype(anchor_list, encoder=None):
  """
  Create a unit-norm centroid from anchor strings

  anchor_list: non-empty sequence of anchor phrases describing one domain
  encoder: optional object with a SentenceTransformer-style
           ``encode(texts, convert_to_numpy=..., normalize_embeddings=...)``
           method. When omitted, the notebook-level ``model`` is used, so
           existing one-argument calls behave as before.

  returns: (centroid, se) where ``se`` is the array of anchor embeddings
           returned by the encoder and ``centroid`` is their mean rescaled
           to (approximately) unit length
  raises: ValueError if ``anchor_list`` is empty (the mean of zero
          embeddings would otherwise produce a NaN centroid)
  """
  if len(anchor_list) == 0:
    raise ValueError("anchor_list must contain at least one anchor string")
  if encoder is None:
    # Fall back to the SentenceTransformer created at notebook level.
    encoder = model
  se = encoder.encode(anchor_list, convert_to_numpy=True, normalize_embeddings=True)
  centroid = se.mean(axis=0)
  # The small epsilon guards against division by zero for a zero-length mean.
  centroid = centroid / (np.linalg.norm(centroid) + 1e-9)
  return centroid, se

def auto_threshold(s: pd.Series, keep_top_p=TOP_P_KEEP, random_state=RANDOM_STATE):
  """
  Pick a score cut-off that separates strong from weak matches.

  s: similarity scores of every term against one domain
  keep_top_p: fraction of the highest scores used to fit the mixture
  random_state: seed forwarded to GaussianMixture

  With fewer than 100 scores, or if the mixture fit raises, the result is
  max(0.5, 80th percentile of s). Otherwise a two-component Gaussian mixture
  is fitted to the top scores (at least 50 of them) and the midpoint between
  the two component means, clamped to [0.35, 0.80], is returned.
  """
  def quantile_fallback():
    return max(0.5, s.quantile(0.8))

  n_scores = len(s)
  if n_scores < 100:
    return quantile_fallback()

  descending = np.sort(s.values)[::-1]
  n_keep = max(50, int(n_scores * keep_top_p))
  head = descending[:n_keep].reshape(-1, 1)
  try:
    mixture = GaussianMixture(n_components=2, random_state=random_state)
    mixture.fit(head)
    component_means = np.sort(mixture.means_.flatten())
    midpoint = float(component_means.mean())
    # clamp the midpoint into [0.35, 0.80]
    return min(max(midpoint, 0.35), 0.80)
  except Exception:
    return quantile_fallback()


In [None]:
# CONDITIONS RELABELING

# Read the condition value counts and pull out the non-null names as strings
cond_col = "Condition_name"
conditions = pd.read_csv("condition_value_counts.csv")
raw_terms = conditions[cond_col].dropna().astype(str)

# Pair each original term with its normalized form, discard terms that
# normalize to the empty string, keep the first original per normalized form,
# and number the surviving rows 0..n-1.
df_terms = (
    pd.DataFrame({
        "term_original": raw_terms,
        "term_norm": raw_terms.map(normalize_text),
    })
    .loc[lambda frame: frame["term_norm"] != ""]
    .drop_duplicates("term_norm")
    .reset_index(drop=True)
    .assign(row_id=lambda frame: np.arange(len(frame)))
)
terms = df_terms["term_norm"].tolist()

print(f"Loaded {len(terms)} unique normalized terms.")


# Anchor phrases for each condition domain. Later in this cell every list is
# embedded by make_prototype and averaged into one prototype vector per domain.
# The dict keys become the "best_domain" labels written to
# conditions_clusters.csv, and the key order fixes the column order of the
# score matrix.
# NOTE(review): several phrases appear under more than one domain (e.g. "sle",
# "scleroderma", "degenerative disc disease", "widespread pain",
# "urinary tract infection"). Shared anchors presumably pull those prototypes
# closer together, which may shrink the margin compared against
# ASSIGN_MARGIN -- confirm this overlap is intended.
domain_anchors = {
    "pots_dysautonomia": [
        "pots","postural orthostatic tachycardia syndrome","postural orthostatic tachycardia",
        "dysautonomia","orthostatic intolerance","orthostatic hypotension","postural hypotension",
        "inappropriate sinus tachycardia","ist","tachycardia when standing","palpitations upright",
        "neurocardiogenic syncope","syncope","presyncope","heart racing on standing"
    ],
    "eds_hypermobility": [
        "ehlers-danlos syndrome","heds","hypermobile ehlers-danlos",
    "joint hypermobility","hypermobility syndrome","joint hyperlaxity"
    ],
    "mast_cell_mcas": [
        "mast cell activation syndrome","mcas","mast cell activation disorder","histamine intolerance",
        "hives and angioedema","chronic idiopathic urticaria","ciu","angioedema","pressure urticaria",
        "allodynia from histamine","flushing from histamine"
    ],
    # NOTE(review): the key says "reflux" but the anchors are general GI
    # symptoms -- confirm the label.
    "reflux": [
        "heartburn","indigestion","bloating",
        "abdominal pain","stomach pain","stomach ache","nausea","vomiting","diarrhea","constipation",
        "functional dyspepsia","digestive issues","stomach cramps","gas pains"
    ],
    "ibs": [
        "irritable bowel syndrome", "ibs", "ibs-c", "ibs-d"
    ],
    "ibd": [
        "inflammatory bowel disease", "ulcerative colitis", "crohn's disease",
        "microscopic colitis", "collagenous colitis", "indeterminate colitis", "ibd","diverticulitis"
    ],
    "gastroparesis":[
        "gastroparesis","idiopathic gastroparesis","delayed gastric emptying","gastric dysmotility"
    ],
    "sibo_leaky_gut": [
        "small intestinal bacterial overgrowth", "sibo", "leaky gut"
    ],
    "food_allergy_intolerance": [
        "food allergy","food allergies","food intolerances","lactose intolerance","fructose intolerance",
        "fructose malabsorption","gluten intolerance"
    ],
    # NOTE(review): anchors are embedded as written (they are not passed
    # through normalize_text), so each phrase is listed here in two casings --
    # confirm the capitalised duplicates are intentional.
    "liver_disease": [
        "hepatitis", "Hepatitis", "fatty liver", "Fatty liver",
        "cirrhosis", "Cirrhosis"
    ],
    "endometriosis_gyne": [
        "endometriosis", "adenomyosis", "pelvic pain", "chronic pelvic pain",
        "period pain", "menstrual cramps", "dysmenorrhea", "menorrhagia",
        "ovarian cysts"
    ],

    "interstitial_cystitis": [
        "interstitial cystitis", "bladder pain", "painful urination",
        "urinary urgency", "urinary frequency"
    ],
    "vulvovaginal_pain_infection": [
        "vulvodynia", "vaginal pain", "vaginal burning",
        "yeast infection", "vaginal yeast infection",
        "bacterial vaginosis", "vaginal irritation"
    ],
    "urinary_tract_infection": [
        "urinary tract infection", "uti", "bladder infection",
        "urination pain", "urinary burning"
    ],
    "derm_inflammatory": [
        "eczema", "atopic dermatitis", "contact dermatitis",
        "seborrheic dermatitis", "psoriasis", "hidradenitis suppurativa",
        "rosacea", "folliculitis"
    ],
    "derm_allergic_pruritic": [
        "hives", "urticaria", "rash", "pruritus", "itchy skin","itch",
        "skin sensitivity", "allergic rash", "contact allergy"
    ],
    "acne": [
        "acne","cystic acne",
    ],
    "autoimmune_connective": [
        "systemic lupus erythematosus","sle","lupus","sjogren's syndrome","sjogrens","mixed connective tissue disease",
        "mctd","undifferentiated connective tissue disease","uctd","scleroderma","systemic sclerosis",
        "behcet's disease","dermatomyositis","polymyositis","autoimmune disease","autoimmune issues"
    ],
    "inflammatory_arthritis": [
        "rheumatoid arthritis","ra","psoriatic arthritis","ankylosing spondylitis",
        "axial spondyloarthritis","axial spa","reactive arthritis","enteropathic arthritis",
        "inflammatory arthritis","palindromic rheumatism","spondyloarthropathy"
    ],
    "osteoarthritis_spine": [
        "osteoarthritis","oa","degenerative disc disease","ddd","cervical spondylosis","lumbar spondylosis",
        "spinal stenosis","spondylolisthesis","bulging disc","herniated disc"
    ],
    "neuropathy_crps": [
        "peripheral neuropathy","diabetic neuropathy","small fiber neuropathy","sfpn",
        "neuropathic pain","nerve pain","paresthesia","tingling","numbness","crps","rsd","complex regional pain syndrome"
    ],
    "fnd_neurologic_other": [
        "functional neurological disorder","fnd","functional movement disorder","myoclonus",
        "opsoclonus myoclonus","tremor","spasticity","dystonia","essential tremor","chiari malformation",
        "visual snow","balance problems","light sensitivity","vertigo","bppv","cervicogenic symptoms"
    ],
    "multiple_sclerosis": [
        "multiple sclerosis","ms","demyelinating disease","optic neuritis","uhthoff phenomenon"
    ],
    "epilepsy_seizure": [
        "epilepsy","seizure disorder","seizures","complex partial seizures","absence seizure",
        "cataplexy","epileptic","convulsion","temporal lobe seizure"
    ],
    # NOTE(review): this key contains a space while the other keys use
    # underscores, and it is written verbatim as the output label. The plain
    # abbreviation "pnes" is absent; "pn es" / "pnnes" may be deliberate
    # misspellings or typos -- confirm.
    "nonepileptic seizure":[
    "psychogenic nonepileptic seizures","pn es","pnnes"
    ],
    "migraine_headache": [
        "migraine","migraines","chronic migraine","chronic daily headache","new daily persistent headache",
        "ndph","vestibular migraine","migraine with aura","hemiplegic migraine","ocular migraine",
        "cluster headache","tension headache","headache","head pain","pressure in head"
    ],
    "parkinson_movement": [
        "parkinson's disease","parkinsonism","spastic paraplegia","movement disorder"
    ],
    # Includes what look like common misspellings of "fibromyalgia".
    "fibromyalgia": [
        "fibromyalgia","fibro","fibro flare","fibro fog","fibromylgia","fibromialgia","fibromyaglia",
        "fibromyalgie","fybromyalgia","fibro pain","widespread pain",
        "tender points","achiness","muscle aches","muscle pain"
    ],
    "me_cfs": [
        "me/cfs","chronic fatigue syndrome","myalgic encephalomyelitis","me cfs","cfs",
        "post exertional malaise","pem","fatigue syndrome","fatigue all the time","exhaustion",
        "energy level low","m.e","severe fatigue","post exertion mala","systemic exertion intolerance disease"
    ],
    "sleep_disorders": [
        "insomnia","sleep apnea","obstructive sleep apnea","central sleep apnea","narcolepsy",
        "hypersomnia","idiopathic hypersomnia","restless legs syndrome","rls","periodic limb movements",
        "night terrors","parasomnias","delayed sleep phase","sleep problems","lack of sleep"
    ],
    "depression": [
        "depression","major depressive disorder","clinical depression","dysthymia","MDD", "seasonal affective disorder",
        "persistent depressive disorder","low mood","depressed mood","mood disorder"
    ],
    "anxiety": [
        "anxiety","generalized anxiety disorder","gad","panic disorder","panic attacks",
        "social anxiety disorder","agoraphobia","health anxiety","worry","phobia","social anxiety"
    ],
    "ptsd": [
        "post-traumatic stress disorder","ptsd","complex ptsd","cptsd","trauma response",
        "flashbacks","nightmares","hypervigilance"
    ],
    "ocd": [
        "obsessive compulsive disorder","ocd","obsessions","compulsions","trichotillomania","dermatillomania",
        "body focused repetitive behaviors","bfrb","skin picking"
    ],
    "bipolar_spectrum": [
        "bipolar disorder","bipolar type 1","bipolar type ii","bipolar 2","hypomania","mania","cyclothymia"
    ],
    "adhd": [
        "adhd","add","adult adhd","attention deficit disorder"
    ],
    "asd": [
        "autism spectrum disorder","asd","aspergers",
        "sensory processing disorder","spd","selective mutism"
    ],
    "schizophrenia_psychotic": [
        "psychosis","delusional disorder","schizoaffective disorder","schizophrenia","auditory hallucinations","hallucinations","paranoia","schizoaffective"
    ],
    "cardiovascular_arrhythmia": [
        "arrhythmia","heart arrhythmias","atrial fibrillation","afib","mitral valve prolapse","mvp",
        "palpitations","sinus tachycardia","sinus bradycardia","prolonged qt","pvcs","rapid heart rate"
    ],
    "bp_hypertension_hypotension": [
        "hypertension","high blood pressure","low blood pressure","hypotension",
    ],
    "cardiac_other": [
        "cardiomyopathy","heart failure","ischemic heart disease","coronary artery disease",
        "pericarditis","chest pain","chest tightness"
    ],
    "thyroid_disorders": [
        "hypothyroidism","hashimoto's disease","hashimoto’s thyroiditis","hashimotos","hypothyroid",
        "hyperthyroidism","graves disease","goiter","low thyroid","thyroid"
    ],
    "diabetes": [
        "type 1 diabetes","type 2 diabetes","diabetes","insulin resistance","hypoglycemia",
        "reactive hypoglycemia","diabetic complications"
    ],
    "adrenal": [
        "adrenal insufficiency","addison's disease","cushing","secondary adrenal insufficiency",
        "cushing’s","low cortisol","endocrine disorder","pituitary disorders","hypopituitarism"
    ],
    "pcos": [
        "pcos","polycystic ovary syndrome","polycystic ovaries"
    ],
    "menopause_hormonal": [
        "menopause", "perimenopause", "hot flashes", "hormone spike", "irregular cycles",
        "hormone imbalance"
    ],
    "pmdd":[
        "pmdd","premenstrual dysphoric disorder"
    ],
    "asthma": [
        "asthma","asthma attack","exercise-induced asthma","allergic rhinitis",
    ],
    "respiratory":[
        "hay fever","sinusitis","chronic sinusitis","sinus headaches","cough","shortness of breath"
    ],
    "allergy":[
        "seasonal allergies",
        "dust mite allergy","mold allergy"
    ],
    # NOTE(review): "pulmonary pipe disease" is an unusual phrase -- confirm it
    # is a term users actually enter.
    "copd":[
        "chronic obstructive pulmonary disease","copd","pulmonary pipe disease"
    ],
    # NOTE(review): the "(if present)" suffix looks like leftover drafting text
    # and is embedded as part of the anchor -- confirm.
    "primary_immunology": [
        "cvid","common variable immunodeficiency","immune deficiency","selective iga deficiency (if present)",
        "recurrent infections"
    ],
    "lyme": [
        "lyme disease","chronic lyme","bartonella","babesia"
    ],
    "post_viral_infection": [
        "post viral fatigue","mycoplasma pneumoniae","chronic infection","mono","mononucleosis", "epstein-barr virus","ebv","cmv","cytomegalovirus"
    ],
    "covid":[
        "covid","post covid", "long covid"
    ],
    "back_spine_pain": [
        "back pain", "lower back pain", "upper back pain",
        "lumbar pain", "spinal pain", "sciatica",
        "degenerative disc disease", "herniated disc", "spinal stenosis"
    ],

    "neck_shoulder_pain": [
        "neck pain", "shoulder pain", "frozen shoulder",
        "rotator cuff injury", "tendinitis", "bursitis",
        "cervical spondylosis", "trapezius strain"
    ],
    "joint_limb_pain": [
        "joint pain", "hip pain", "knee pain", "ankle pain",
        "arthralgia",
        "plantar fasciitis", "piriformis syndrome", "tmj disorder"
    ],
    "chronic_pain": [
        "chronic pain", "pain disorder", "widespread pain", "pain syndrome",
        "chronic widespread pain", "chronic pain syndrome"
    ],
    # NOTE(review): most anchors here also appear under "autoimmune_connective".
    "autoimmune_rheumatologic": [
        "sle",
        "sjogren's syndrome", "scleroderma", "systemic sclerosis",
        "vasculitis"
    ],
    "gu_kidney_bladder": [
        "recurrent uti","urinary tract infection","urinary incontinence","overactive bladder","interstitial cystitis",
        "kidney stones","chronic kidney disease","ckd"
    ],
    "hematology": [
        "anemia","iron deficiency anemia","pernicious anemia","low white blood count","thrombophilia",
        "antiphospholipid syndrome"
    ],
    # NOTE(review): "uterine fibroids " carries a trailing space, and its
    # placement under oncology should be confirmed.
    "oncology": [
        "breast cancer","melanoma","uterine fibroids ",
        "leukemia","lymphoma","cancer general"
    ],
    "unknown_undiagnosed": [
         "undiagnosed","mysterious condition","unknown","the undiagnosed thing"
         ]
}

# Embed every normalized term; normalize_embeddings=True asks the encoder for
# L2-normalized rows, so a dot product below is a cosine similarity.
model = SentenceTransformer(EMBED_MODEL)
emb_all = model.encode(
    terms, batch_size=1024, show_progress_bar=True,
    convert_to_numpy=True, normalize_embeddings=True
)

# One prototype per domain: the centroid plus the anchor embeddings it came from
domain_proto = {}
for d, anchors in domain_anchors.items():
    centroid, seed_mat = make_prototype(anchors)
    domain_proto[d] = {"centroid": centroid, "seeds": seed_mat, "anchors": anchors}

dom_names = list(domain_proto.keys())
proto_mat = np.stack([domain_proto[d]["centroid"] for d in dom_names], axis=0)

# cosine similarity since embeddings are normalized
scores = emb_all @ proto_mat.T   # (N x D)
scores_df = pd.DataFrame(scores, columns=[f"score_{d}" for d in dom_names])
scores_df["term_norm"] = terms


# Per-domain acceptance threshold derived from that domain's score distribution
domain_thresholds = {d: auto_threshold(scores_df[f"score_{d}"]) for d in dom_names}

print("Auto thresholds:")
for d, t in domain_thresholds.items():
  print(f"  {d:24s} -> {t:.3f}")

# Assign the best domain
score_mat = scores_df[[f"score_{d}" for d in dom_names]].values
top_idx = score_mat.argmax(axis=1)
top_score = score_mat.max(axis=1)
# second-largest score per row; needs at least two domains
second_score = np.partition(score_mat, -2, axis=1)[:, -2]
margin = top_score - second_score

# A term keeps its best domain only if the score clears that domain's
# threshold, beats the runner-up by ASSIGN_MARGIN, and reaches the absolute
# floor; otherwise it is labelled "Other".
assigned = []
for i in range(len(terms)):
    dom = dom_names[top_idx[i]]
    ok = (
        top_score[i] >= domain_thresholds[dom] and
        margin[i] >= ASSIGN_MARGIN and
        top_score[i] >= SOFT_ACCEPT_FLOOR
    )
    assigned.append(dom if ok else "Other")

assign_df = pd.DataFrame({
    "term_norm": terms,
    "best_domain": assigned,
})
# keep row_id + term_original by merging from df_terms
assign_df = df_terms.merge(assign_df, on="term_norm", how="left")


# Keep only the terms that were assigned to a real domain.
# (A loop that used to follow here computed per-domain row ids and positions
# but never used them; it has been removed as dead code.)
condition_df = assign_df[assign_df["best_domain"] != "Other"].copy().reset_index(drop=True)

out1 = condition_df[["term_original", "best_domain"]].sort_values("best_domain", ascending=[True])
out1.to_csv("conditions_clusters.csv", index=False)

print("Saved:")
print(" - conditions_clusters.csv")



Loaded 9171 unique normalized terms.


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Auto thresholds:
  pots_dysautonomia        -> 0.350
  eds_hypermobility        -> 0.350
  mast_cell_mcas           -> 0.350
  reflux                   -> 0.350
  ibs                      -> 0.350
  ibd                      -> 0.350
  gastroparesis            -> 0.350
  sibo_leaky_gut           -> 0.350
  food_allergy_intolerance -> 0.350
  liver_disease            -> 0.350
  endometriosis_gyne       -> 0.350
  interstitial_cystitis    -> 0.350
  vulvovaginal_pain_infection -> 0.350
  urinary_tract_infection  -> 0.350
  derm_inflammatory        -> 0.350
  derm_allergic_pruritic   -> 0.350
  acne                     -> 0.350
  autoimmune_connective    -> 0.350
  inflammatory_arthritis   -> 0.350
  osteoarthritis_spine     -> 0.350
  neuropathy_crps          -> 0.350
  fnd_neurologic_other     -> 0.350
  multiple_sclerosis       -> 0.350
  epilepsy_seizure         -> 0.350
  nonepileptic seizure     -> 0.350
  migraine_headache        -> 0.350
  parkinson_movement       -> 0.350
  fibrom

In [None]:
# SYMPTOMS RELABELING

# Read the symptom value counts and pull out the non-null names as strings
symp_col = "Symptom_name"
symptoms = pd.read_csv("symptom_value_counts.csv")
raw_terms = symptoms[symp_col].dropna().astype(str)

# Pair each original term with its normalized form, discard terms that
# normalize to the empty string, keep the first original per normalized form,
# and number the surviving rows 0..n-1.
df_terms = (
    pd.DataFrame({
        "term_original": raw_terms,
        "term_norm": raw_terms.map(normalize_text),
    })
    .loc[lambda frame: frame["term_norm"] != ""]
    .drop_duplicates("term_norm")
    .reset_index(drop=True)
    .assign(row_id=lambda frame: np.arange(len(frame)))
)
terms = df_terms["term_norm"].tolist()

print(f"Loaded {len(terms)} unique normalized terms.")

# Anchor phrases for each symptom domain. Later in this cell every list is
# embedded by make_prototype and averaged into one prototype vector per domain.
# The dict keys become the "best_domain" labels written to
# symptoms_clusters.csv, and the key order fixes the column order of the
# score matrix.
# NOTE(review): several phrases are listed under more than one domain (e.g.
# "cold intolerance" / "heat intolerance", "tremor" / "muscle twitching",
# "executive dysfunction", "rectal bleeding", "sore throat", "bowel urgency").
# Shared anchors presumably pull those prototypes closer together, which may
# shrink the margin compared against ASSIGN_MARGIN -- confirm this is intended.
domain_anchors = {
    "negative_affect": [
        "low mood", "depressed mood", "depression", "anhedonia", "hopelessness", "sadness", "tearfulness" ],

    "anxiety_fear_panic": [
        "anxiety", "panic attacks", "worry", "restlessness","racing thoughts", "hypervigilance", "fear","phobia"],

    "stress_tension": [
        "stress", "stressed", "tension", "overwhelmed"],

    "positive_affect": [
        "calm", "relaxed", "happy", "grateful", "energetic"],

    "inflammatory_swelling": [
        "swelling", "redness", "inflammation", "tenderness", "limited movement"],

    "appetite_weight_change": [
        "loss of appetite", "increased appetite", "weight gain", "weight loss"],

    "hair_skin_nails": [
        "hair loss", "thinning hair", "brittle nails", "nail changes"],

    "infectious_feverish": [
        "fever", "chills", "flu-like", "sore throat", "infection","lymph node swelling"],

    "derealization_depersonalization": [
        "derealization", "depersonalization", "out of body", "detached"],

    "autonomic_pots_syncope": [
        "postural dizziness", "lightheadedness", "near syncope", "syncope", "orthostatic intolerance", "palpitations on standing",
        "tachycardia", "heart racing", "orthostatic hypotension", "coat hanger pain"],

    "cardiac_chest_palpitations": [
        "chest pain", "chest tightness", "chest pressure", "palpitations", "rapid heartbeat", "irregular heartbeat"],

    "respiratory_dyspnea_cough": [
        "shortness of breath", "breathlessness", "wheezing", "dry cough", "productive cough", "air hunger", "hyperventilation"],

    "ent_nasal_sinus_throat": [
        "nasal congestion", "runny nose", "post nasal drip", "sinus pressure","sinus pain", "sore throat", "hoarseness"],

    "sleep_disorders": [
        "insomnia", "trouble falling asleep", "trouble staying asleep","unrefreshing sleep", "restless sleep", "sleep paralysis",
        "excessive daytime sleepiness", "night terrors"],

    "ocd": [
        "obsessive thoughts", "compulsions", "skin picking","trichotillomania", "intrusive thoughts"],

    "psychosis_paranoia": [
        "hallucinations", "auditory hallucinations", "paranoia","psychosis", "delusional ideas"],

    "adhd_attention": [
        "inattention", "lack of focus", "poor concentration", "executive dysfunction", "hyperactivity"],

    "fatigue_exertional": [
        "fatigue", "exhaustion", "post exertional malaise", "exercise intolerance", "physical tiredness"],

    "cognitive_dysfunction": [
        "brain fog", "memory loss", "forgetfulness", "difficulty concentrating", "word finding difficulty", "impaired cognition", "confusion"],

    "headache_migraine": [
        "headache", "migraine", "tension headache", "migraine aura", "ocular migraine", "cluster headache","pressure in head"],

    "dizziness_vertigo_balance": [
        "dizziness", "vertigo", "off balance", "loss of balance", "light headedness"],

    "neuropathic_paresthesia": [
        "neuropathic pain", "paresthesia", "tingling", "numbness", "burning feet", "electric shocks"],

    "tremor_spasm_twitch": [
        "tremor", "muscle spasms", "muscle twitching", "muscle cramps", "spasticity"],

    "vision_ophthalmic": [
        "blurred vision", "double vision", "visual disturbances","photophobia", "eye pain", "eye strain", "floaters"],

    "tinnitus_ear": [
        "tinnitus", "ear ringing", "ear pressure","ear fullness"],

    "msk_back_neck": [
        "back pain", "lower back pain", "upper back pain", "neck pain", "neck stiffness", "lumbar pain", "back spasms"],

    "msk_joint_limb": [
        "joint pain", "joint stiffness", "joint swelling", "knee pain", "hip pain", "ankle pain", "shoulder pain", "wrist pain", "elbow pain", "finger joint pain"],

    "msk_myalgia_widespread": [
        "muscle pain", "myalgia", "achiness", "widespread pain", "generalized pain", "fibro pain"],

    "regional_tendon_bursa": [
        "plantar fasciitis", "costochondritis","tendinitis", "bursitis", "tmj pain", "sacroiliac pain"],

    "gi_upper_reflux": [
        "heartburn", "acid reflux", "indigestion","nausea", "vomiting", "upset stomach"],

    "gi_bowel_habit": [
        "diarrhea", "constipation", "alternating diarrhea and constipation", "loose stool", "bowel urgency", "bowel incontinence"],

    "gi_pain_bloat_cramp": [
        "abdominal pain", "stomach pain", "stomach ache", "abdominal cramps", "bloating", "gas pains", "flatulence"],

    "gi_rectal_hemorrhoid": [
        "rectal bleeding", "hemorrhoids", "rectal pain","painful bowel movement", "mucus in stool" ],

    "liver_gallbladder": [
        "liver pain", "liver swelling", "liver congestion","right upper quadrant pain"],

    "urinary_bladder": [
        "urinary urgency", "urinary frequency", "painful urination", "bladder pain", "overactive bladder", "urinary incontinence"],

    "gyne_menstrual_pelvic": [
        "pelvic pain", "period pain", "menstrual cramps", "menorrhagia", "vaginal pain", "vaginal bleeding",
        "vaginal discharge", "vaginal itching"],

    "derm_rash_pruritus": [
        "rash", "itchy skin", "pruritus",
        "skin sensitivity", "skin pain", "hives"
    ],
    "derm_inflammatory": [
        "eczema", "psoriasis", "seborrheic dermatitis",
        "folliculitis", "acne", "cystic acne"
    ],
    "allergy_hypersensitivity": [
        "allergic reaction", "allergies", "hay fever",
        "itchy eyes", "swollen eyes", "facial flushing"
    ],
    "sensory_hypersensitivity": [
        "light sensitivity", "sound sensitivity",
        "smell sensitivity", "sensory overload", "tactile sensitivity"
    ],

    "endocrine_metabolic": [
        "cold intolerance", "heat intolerance",
        "hypoglycemia", "low blood sugar", "excessive thirst"
    ],

    "bleeding_bruising": [
        "easy bruising", "nosebleed", "bleeding gums",
        "heavy periods", "rectal bleeding"
    ],
    "raynaud_circulation": [
        "cold hands", "cold feet", "poor circulation",
        "raynaud's", "cold extremities"
    ],
    # NOTE(review): overlaps with "endocrine_metabolic" and
    # "raynaud_circulation" above.
    "temperature_regulation": [
        "hot flashes", "cold intolerance", "heat intolerance",
        "overheating", "night sweats", "chills", "temperature dysregulation",
        "cold hands", "cold feet"
    ],
    # NOTE(review): overlaps with "tremor_spasm_twitch" and
    # "dizziness_vertigo_balance" above.
    "motor_coordination": [
        "tremor", "spasms", "muscle twitching", "weakness", "loss of balance",
        "unsteady gait", "poor coordination", "muscle stiffness"
    ],
    "executive_dysfunction": [
        "executive dysfunction", "difficulty planning",
        "trouble organizing", "overwhelm", "mental fatigue"
    ],
    "motivation_drive": [
        "low motivation", "lack of motivation", "low interest",
        "difficulty initiating tasks", "apathy"
    ],
    # NOTE(review): overlaps with "gi_bowel_habit" and "urinary_bladder" above.
    "autonomic_visceral": [
        "bowel urgency", "bowel incontinence",
        "urinary urgency", "urinary frequency", "bladder pain",
        "nausea on standing", "digestive distress"
    ],
    "arthritis_inflammatory_joint": [
        "arthritis", "rheumatoid arthritis", "osteoarthritis",
        "psoriatic arthritis", "reactive arthritis",
        "joint inflammation", "joint deformity", "morning stiffness",
        "arthritic pain", "inflamed joints"
    ]


}

# Embed every normalized symptom term; the encoder is asked for normalized rows
model = SentenceTransformer(EMBED_MODEL)
emb_all = model.encode(
    terms,
    batch_size=1024,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

# Build one prototype record per domain from its anchor phrases
domain_proto = {}
for d, anchors in domain_anchors.items():
    centroid, seed_mat = make_prototype(anchors)
    domain_proto[d] = dict(centroid=centroid, seeds=seed_mat, anchors=anchors)

dom_names = list(domain_proto)
proto_mat = np.stack([domain_proto[d]["centroid"] for d in dom_names], axis=0)

# Dot products of normalized vectors are cosine similarities: (N terms x D domains)
scores = np.matmul(emb_all, proto_mat.T)
scores_df = pd.DataFrame(scores, columns=[f"score_{d}" for d in dom_names])
scores_df["term_norm"] = terms

# Per-domain acceptance threshold derived from that domain's score column
domain_thresholds = {}
for d in dom_names:
    domain_thresholds[d] = auto_threshold(scores_df[f"score_{d}"])

print("Auto thresholds:")
for d, t in domain_thresholds.items():
    print(f"  {d:24s} -> {t:.3f}")

# Best and runner-up score for every term
score_mat = scores_df[[f"score_{d}" for d in dom_names]].values
top_idx = np.argmax(score_mat, axis=1)
top_score = np.max(score_mat, axis=1)
second_score = np.partition(score_mat, -2, axis=1)[:, -2]
margin = top_score - second_score

# A term keeps its best domain only when it reaches the absolute floor, beats
# the runner-up by ASSIGN_MARGIN and clears the domain's own threshold.
assigned = []
for i in range(len(terms)):
    dom = dom_names[top_idx[i]]
    ok = (
        top_score[i] >= SOFT_ACCEPT_FLOOR
        and margin[i] >= ASSIGN_MARGIN
        and top_score[i] >= domain_thresholds[dom]
    )
    if ok:
        assigned.append(dom)
    else:
        assigned.append("Other")

# Attach the labels to df_terms so row_id and term_original are kept
assign_df = df_terms.merge(
    pd.DataFrame({"term_norm": terms, "best_domain": assigned}),
    on="term_norm",
    how="left",
)

# Drop the terms that fell through to "Other"
symptom_df = assign_df.loc[assign_df["best_domain"] != "Other"].copy().reset_index(drop=True)

# Write the accepted term -> domain pairs, ordered by domain
out1 = symptom_df[["term_original", "best_domain"]].sort_values(by="best_domain", ascending=[True])
out1.to_csv("symptoms_clusters.csv", index=False)

print("Saved:")
print(" - symptoms_clusters.csv")



Loaded 22643 unique normalized terms.


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Auto thresholds:
  negative_affect          -> 0.350
  anxiety_fear_panic       -> 0.350
  stress_tension           -> 0.350
  positive_affect          -> 0.350
  inflammatory_swelling    -> 0.350
  appetite_weight_change   -> 0.350
  hair_skin_nails          -> 0.350
  infectious_feverish      -> 0.350
  derealization_depersonalization -> 0.350
  autonomic_pots_syncope   -> 0.350
  cardiac_chest_palpitations -> 0.350
  respiratory_dyspnea_cough -> 0.350
  ent_nasal_sinus_throat   -> 0.350
  sleep_disorders          -> 0.350
  ocd                      -> 0.350
  psychosis_paranoia       -> 0.350
  adhd_attention           -> 0.350
  fatigue_exertional       -> 0.350
  cognitive_dysfunction    -> 0.350
  headache_migraine        -> 0.350
  dizziness_vertigo_balance -> 0.350
  neuropathic_paresthesia  -> 0.370
  tremor_spasm_twitch      -> 0.350
  vision_ophthalmic        -> 0.350
  tinnitus_ear             -> 0.350
  msk_back_neck            -> 0.350
  msk_joint_limb           -> 0.350


In [None]:

# TAGS CLUSTERS RELABELING
tags = pd.read_csv("tag_value_counts.csv")

# Use the first candidate column name that exists in the file. Fail with a
# message naming the columns that were found instead of a bare IndexError.
tag_col_candidates = ["Tag_name", "tag", "Tag", "tag_name"]
tag_col = next((c for c in tag_col_candidates if c in tags.columns), None)
if tag_col is None:
    raise KeyError(
        f"tag_value_counts.csv has none of the expected tag columns "
        f"{tag_col_candidates}; found {list(tags.columns)}"
    )

raw_terms = tags[tag_col].dropna().astype(str)

# Keep map to original for output
df_terms = pd.DataFrame({
    "term_original": raw_terms,
    "term_norm": raw_terms.map(normalize_text)
})
# Drop terms that normalize to the empty string and keep the first original
# spelling for each normalized form.
df_terms = (
    df_terms[df_terms["term_norm"] != ""]
    .drop_duplicates("term_norm")
    .reset_index(drop=True)
)
df_terms["row_id"] = np.arange(len(df_terms))   # row_id aligns to emb_all
terms = df_terms["term_norm"].tolist()          # emb_all will align to THIS order

print(f"Loaded {len(terms)} unique normalized terms.")


# Domain anchors

domain_anchors = {
    "stress_emotional": [
        "stress", "stressed", "work stress", "family stress", "relationship stress",
        "overwhelmed", "argument", "conflict", "worried", "anxious", "anxiety",
        "depressed", "sad", "hopeless", "emotional", "frustrated", "angry",
        "irritable", "grumpy", "crying", "lonely", "burnout", "tension", "grief", "down"
    ],

    "mental_focus_productivity": [
        "productive", "unproductive", "focused", "distracted",
        "unmotivated", "no motivation", "motivation", "couldn't focus",
        "projecting", "worked on computer"
    ],

    "social_connection": [
        "saw friends", "socialized", "social interaction", "family time",
        "talked with friend", "visited family", "went out", "social gathering",
        "church", "date", "party", "relationship stress"
    ],

    "sleep_quality": [
        "poor sleep", "bad sleep", "broken sleep", "no sleep", "insomnia",
        "slept badly", "short sleep", "restless sleep", "overslept", "little sleep",
        "slept in", "average sleep", "good sleep", "nap", "napped",
        "daytime nap", "nightmares", "woke up early", "slept all day"
    ],

    "fatigue_low_energy": [
        "tired", "very tired", "fatigued", "exhausted", "drained",
        "low energy", "no energy", "lethargic", "wiped out",
        "rest day", "lazy day", "bed day", "crash", "extreme fatigue"
    ],

    "menstrual_cycle": [
        "period", "pms", "menstruation", "cramps", "spotting", "ovulation",
        "heavy period", "first day of period", "premenstrual", "menstrual cycle"
    ],

    "pain_musculoskeletal": [
        "pain", "in pain", "bad pain day", "sore", "ache", "aching", "hurt",
        "back pain", "shoulder pain", "neck pain", "hip pain", "joint pain",
        "muscle pain", "fibro", "fibro flare", "sciatica", "stiff", "burning knees"
    ],

    "symptom_general": [
        "nausea", "nauseated", "vomited", "dizzy", "dizziness", "lightheaded",
        "weak", "weakness", "shaky", "tingling", "itchy", "itching",
        "body aches", "chest pain", "stomach pain", "abdominal pain", "bloating","tachycardia","palpitations"
    ],

    "illness_infection": [
        "sick", "cold", "flu", "fever", "sore throat", "cough", "infection",
        "sinus infection", "stuffy nose", "have a cold", "cold/flu", "ill", "virus"
    ],

    "weather_environment": [
        "hot weather", "cold weather", "humid", "humidity", "rainy day",
        "sunny", "sunshine", "storm", "temperature change",
        "weather change", "heat", "cold", "barometric pressure", "sun fatigue","no sun","sat in sun"
    ],

    "dietary_triggers": [
        "gluten", "gluten free", "dairy", "ate dairy", "sugar", "coffee",
        "caffeine", "alcohol", "drank alcohol", "hungover", "beer",
        "chocolate", "ate out", "restaurant", "fast food", "processed food",
        "whole30", "diet", "ate breakfast", "ate lunch", "ate dinner", "food intolerance", "probiotic"
    ],

    "hydration": [
        "dehydrated", "not enough water", "low water intake",
        "having trouble drinking", "hydrated"
    ],

    # "menstrual_cycle" is already defined earlier in this dict with the same
    # anchors; repeating the key here would only overwrite that entry silently.

    "physical_activity": [
        "exercise", "worked out", "workout", "gym", "yoga", "running",
        "walked", "walking", "long walk", "biking", "hiking", "swimming",
        "strength training", "physical therapy", "stretches", "cardio",
        "overdid it", "high activity", "physio", "tai chi", "pilates"
    ],

    "household_chores": [
        "cleaning", "laundry", "washed dishes", "housework", "vacuumed",
        "yard work", "cooked dinner", "unloaded dishwasher", "grocery shopping",
        "chores", "cooking", "baking", "organizing", "caregiving","daughter","son"
    ],

    "work_school": [
        "work", "worked", "busy at work", "no work", "day off work",
        "school", "exam", "studied", "missed school", "homework",
        "lots of work", "office day", "work from home", "deadline"
    ],

    "travel_schedule": [
        "travel", "travel day", "vacation", "trip", "car travel",
        "long drive", "flight", "jet lag", "time zone change",
        "long car ride", "packing", "off-roading"
    ],

    "technology_usage": [
        "screen time", "computer", "phone", "scrolling", "social media",
        "gaming", "video games", "tv", "watched tv", "worked on computer",
        "twitter", "online", "internet"
    ],

    "meals_timing": [
        "skipped meal", "skipped breakfast", "late dinner",
        "late night snack", "midnight snack", "no appetite",
        "poor appetite", "hungry", "ate out", "fasting", "snack"
    ],

    "therapy_treatment": [
        "doctor appointment", "doctor’s appointment", "physio",
        "physical therapy", "therapy", "psychotherapy", "chiropractor",
        "acupuncture", "massage therapy", "counseling", "medical appointment","consultation"
    ],

    "medication_factors": [
        "missed dose", "skipped dose", "new medication",
        "dose change", "medication change", "missed meds",
        "took meds", "took medication", "dose increase", "dose reduction","dose lowered","dose highered"
    ],

    "medication_effects": [
        "side effects", "pain meds", "ibuprofen", "antibiotic",
        "antidepressant", "changed meds", "meds kicked in",
        "meds not working", "new meds working", "painkillers"
    ],

    "relaxation_restoration": [
        "meditation", "meditated", "relaxed", "relaxing", "rested",
        "massage", "acupuncture", "light therapy", "bath", "took a bath",
        "self care", "rest day", "rested all day", "stretching",
        "tai chi", "yoga nidra", "light box"
    ],

    "allergens_irritants": [
        "allergies", "pollen", "dust", "smoke", "pollution",
        "perfume", "fragrance", "pet dander"
    ],
    "infusion_treatment": [
    "ivig", "infusion", "drip", "iv therapy", "port access",
    "infusion reaction", "sore arm from drip"
],

"creative_hobbies": [
    "music", "singing", "played", "art", "painting", "drawing",
    "jewellery making", "craft", "creative day", "writing"
],

"cognitive_function": [
    "brain fog", "brain stutter", "scattered", "couldn't concentrate",
    "hard to make decisions", "memory issues", "forgetful",
    "mental clarity", "focused thinking"
],
}


# --- Tags: embed terms, score against domain prototypes, assign, export ---
#
# Reads from earlier in this cell: `terms` / `df_terms` (normalized tag terms)
# and `domain_anchors` (domain label -> anchor phrases).
# Writes tags_clusters.csv with one row per accepted term.

# NOTE(review): the encoder is re-instantiated in every section cell;
# make_prototype() reads this global `model`.
model = SentenceTransformer(EMBED_MODEL)
emb_all = model.encode(terms, batch_size=1024, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)

# One prototype per domain, built by make_prototype() from the anchor phrases.
domain_proto = {}
for d, anchors in domain_anchors.items():
    centroid, seed_mat = make_prototype(anchors)
    domain_proto[d] = {"centroid": centroid, "seeds": seed_mat, "anchors": anchors}

dom_names = list(domain_proto.keys())
proto_mat = np.stack([domain_proto[d]["centroid"] for d in dom_names], axis=0)  # D x dim

# Similarity of every term (rows) to every domain centroid (columns).
scores = emb_all @ proto_mat.T  # N x D
score_cols = [f"score_{d}" for d in dom_names]
scores_df = pd.DataFrame(scores, columns=score_cols)
scores_df["term_norm"] = terms

# auto_threshold() is applied to each domain's score column independently.
domain_thresholds = {d: auto_threshold(scores_df[f"score_{d}"]) for d in dom_names}

print("Auto thresholds:")
# Pad to the longest label so the "->" column lines up for every domain.
name_width = max(len(d) for d in dom_names)
for d, t in domain_thresholds.items():
    print(f"  {d:{name_width}s} -> {t:.3f}")

# Domain assignment with rejection: a term keeps its best-scoring domain only
# if that score clears the domain threshold AND the global floor AND beats the
# runner-up domain by at least ASSIGN_MARGIN; otherwise it becomes "Other".
score_mat = scores_df[score_cols].values
top_idx = score_mat.argmax(axis=1)
top_score = score_mat.max(axis=1)
second_score = np.partition(score_mat, -2, axis=1)[:, -2]  # second-highest score per row
margin = top_score - second_score

assigned = []
for best_k, best_s, gap in zip(top_idx, top_score, margin):
    dom = dom_names[best_k]
    ok = (
        best_s >= domain_thresholds[dom] and
        gap >= ASSIGN_MARGIN and
        best_s >= SOFT_ACCEPT_FLOOR
    )
    assigned.append(dom if ok else "Other")

assign_df = pd.DataFrame({
    "term_norm": terms,
    "best_domain": assigned,
})

# Merge back originals
assign_df = df_terms.merge(assign_df, on="term_norm", how="left")

# Only terms that were confidently assigned are exported.
tag_df = assign_df[assign_df["best_domain"] != "Other"].copy().reset_index(drop=True)

out1 = tag_df[["term_original", "best_domain"]].sort_values("best_domain", ascending=True)
out1.to_csv("tags_clusters.csv", index=False)

print("Saved:")
print(" - tags_clusters.csv")


Loaded 33464 unique normalized terms.


Batches:   0%|          | 0/33 [00:00<?, ?it/s]

Auto thresholds:
  stress_emotional   -> 0.350
  mental_focus_productivity -> 0.350
  social_connection  -> 0.350
  sleep_quality      -> 0.391
  fatigue_low_energy -> 0.350
  menstrual_cycle    -> 0.350
  pain_musculoskeletal -> 0.350
  symptom_general    -> 0.350
  illness_infection  -> 0.350
  weather_environment -> 0.350
  dietary_triggers   -> 0.350
  hydration          -> 0.350
  physical_activity  -> 0.350
  household_chores   -> 0.350
  work_school        -> 0.350
  travel_schedule    -> 0.350
  technology_usage   -> 0.350
  meals_timing       -> 0.350
  therapy_treatment  -> 0.350
  medication_factors -> 0.350
  medication_effects -> 0.350
  relaxation_restoration -> 0.350
  allergens_irritants -> 0.350
  infusion_treatment -> 0.350
  creative_hobbies   -> 0.350
  cognitive_function -> 0.350
Saved:
 - tags_clusters.csv


In [None]:
# FOODS CLUSTERING/RENAMING
#
# Load the food value counts and build `df_terms` / `terms`: one row per
# distinct normalized food string, keeping an original spelling alongside it.

foods = pd.read_csv("food_value_counts.csv")

# Accept any of these column-name variants; the first one present wins.
food_col_candidates = ["Food_name", "food", "Food", "food_name"]
food_col = next((c for c in food_col_candidates if c in foods.columns), None)
if food_col is None:
    # Fail with a message that names the problem instead of a bare IndexError.
    raise KeyError(
        f"food_value_counts.csv has none of the expected columns {food_col_candidates}; "
        f"found {list(foods.columns)}"
    )

raw_terms = foods[food_col].dropna().astype(str)

df_terms = pd.DataFrame({
    "term_original": raw_terms,
    "term_norm": raw_terms.map(normalize_text)
})
df_terms = (
    df_terms[df_terms["term_norm"] != ""]   # drop strings that normalize to nothing
    .drop_duplicates("term_norm")           # first original spelling is kept per normalized term
    .reset_index(drop=True)
)
df_terms["row_id"] = np.arange(len(df_terms))
terms = df_terms["term_norm"].tolist()

print(f"Loaded {len(terms)} unique normalized terms.")

# Food domains: label -> anchor phrases. Each list is passed to
# make_prototype() further down in this cell to build that domain's centroid.
# Key order defines the column order of the score matrix, and phrase order is
# kept exactly as authored.
#
# NOTE(review): several phrases are anchors of more than one domain (flagged
# inline). Shared anchors pull the affected centroids toward each other, which
# may shrink the top-vs-second margin for related terms - confirm this is
# intended before changing ASSIGN_MARGIN.
domain_anchors = {
    # NOTE(review): "gatorade" / "powerade" / "decaf coffee" sit under this
    # label - confirm the domain is meant as "beverages" rather than caffeine only.
    "caffeine": [
        "coffee", "espresso", "latte", "cappuccino",
        "americano", "decaf coffee",
        "tea", "black tea", "green tea", "chai", "chai latte", "earl grey",
        "iced coffee", "iced tea",
        "energy drink", "gatorade",
        "coke", "diet coke", "pepsi", "cola", "soda",
        "mountain dew", "dr pepper", "powerade",
    ],

    # "ice cream" is also an anchor of "dairy".
    "sugar_sweets": [
        "sugar", "chocolate", "dark chocolate",
        "ice cream", "cookies", "cake", "candy", "dessert",
        "brownie", "donut", "pastry", "poptart",
        "sweet tea", "syrup", "honey", "maple syrup", "jam",
        "frosted flakes",
        "kanelbulle", "rabarberpaj", "kladdkaka",
        "croissant", "bearclaw", "cheesecake", "paj", "pie",
        "cookie dough", "ice cream bar", "blizzard", "mcflurry", "cupcake",
    ],

    "alcohol": [
        "alcohol", "beer",
        "wine", "red wine", "white wine", "champagne",
        "vodka", "rum", "whiskey", "cocktail",
        "cider", "liquor", "pale ale",
    ],

    # Shares "bagel", "toast", "cereal", "pancake", "waffle" with "breakfast",
    # "bagel" / "cereal" with "carbs_starches", and "burrito" / "quesadilla"
    # with "ethnic_dish".
    "gluten_grains": [
        "bread", "pasta", "noodles", "bagel", "toast",
        "pizza", "crackers", "flour", "cereal", "muffin",
        "wrap", "sandwich", "biscuit",
        "mac and cheese", "lasagna", "spaghetti",
        "pancake", "waffle",
        "tortilla", "burrito", "quesadilla",
    ],

    # "yogurt" is also an anchor of "fermented_probiotic".
    "dairy": [
        "milk", "cheese", "butter", "yogurt",
        "cream", "ice cream", "cream cheese", "cottage cheese",
        "half and half", "sour cream", "whipped cream",
        "dairy", "lactose", "milkshake", "cheddar",
    ],

    "fried_processed": [
        "fries", "french fries", "chips", "crisps",
        "burger", "nuggets", "hotdog",
        "fast food", "junk food",
        "doritos", "pringles",
        "hashbrowns", "fried chicken", "onion rings", "tater tots",
        "corn chips", "cheese curls",
    ],

    "fruit": [
        "apple", "banana",
        "berries", "strawberries", "blueberries",
        "grapes", "orange", "pineapple", "melon",
        "peach", "pear", "mango", "nectarine",
        "watermelon", "raspberries", "fruit salad",
    ],

    # "pepper" is also an anchor of "condiment_spice".
    "vegetable": [
        "broccoli", "spinach", "carrots", "peas",
        "green beans", "cauliflower", "kale", "zucchini",
        "asparagus", "lettuce", "cucumber", "tomato",
        "onion", "garlic",
        "pepper", "bell pepper",
        "avocado", "potato", "sweet potato", "corn", "beetroot",
    ],

    "meat_protein": [
        "chicken", "beef", "pork", "ham", "turkey", "steak",
        "fish", "salmon", "tuna", "shrimp",
        "sausage", "bacon",
        "eggs", "egg", "egg sandwich", "omelet",
        "mackerel", "trout", "haddock", "catfish",
        "crab", "prawn", "lobster",
        "fish fingers", "meatballs",
    ],

    "plant_protein": [
        "tofu", "tempeh",
        "lentils", "beans", "black beans", "chickpeas", "hummus", "edamame",
        "nuts", "almonds", "cashews", "peanuts",
        "seeds", "chia seeds", "pumpkin seeds", "sunflower seeds",
        "quorn", "halloumi", "soya", "vegbiff", "vegan nuggets",
        "lentil curry", "chickpea curry", "bean chili",
    ],

    "condiment_spice": [
        "olive oil",
        "ketchup", "mustard", "mayo", "mayonnaise",
        "salt", "pepper", "cinnamon",
        "salsa", "gravy", "spices", "herbs",
        "vinegar", "apple cider vinegar", "soy sauce", "cilantro",
        "dressing", "ranch", "caesar dressing", "mustard dressing",
        "balsamic", "vinaigrette", "hollandaise", "sriracha",
        "barbecue sauce", "bbq sauce", "sweet and sour",
        "ranch dip", "teriyaki sauce",
    ],

    # "optavia" / "jenny craig" are also anchors of "ready_meal_packaged".
    "diet_pattern": [
        "gluten free", "dairy free", "no gluten", "no sugar",
        "low carb", "high carbs", "keto",
        "vegetarian", "vegan", "whole30",
        "processed food",
        "low fodmap", "fodmap", "paleo",
        "optavia", "jenny craig", "clean eating",
    ],

    "fermented_probiotic": [
        "kombucha", "kefir", "sauerkraut", "kimchi",
        "yogurt", "probiotic", "fermented",
        "pickles", "pickle",
    ],

    "hydration": [
        "water", "seltzer", "sparkling water", "soda water",
        "coconut water", "electrolyte drink",
        "hydration", "dehydrated",
    ],

    "carbs_starches": [
        "rice", "brown rice", "white rice",
        "oats", "oatmeal", "porridge",
        "quinoa", "potatoes", "mashed potatoes", "starch", "risotto",
        "gröt", "oatbran", "weetabix", "oatabix",
        "cereal", "quaker oats", "rice pops", "oat flakes",
        "wholewheat toast", "bagel",
    ],

    "snacks_small_meals": [
        "granola", "granola bar", "protein bar", "trail mix",
        "snack", "cracker", "pretzel", "popcorn",
        "peanut butter", "almond butter",
        "protein shake", "smoothie", "fruit smoothie",
    ],

    "breakfast": [
        "breakfast", "cereal", "oatmeal", "toast",
        "pancake", "waffle", "scrambled eggs", "bagel",
        "coffee with milk", "breakfast sandwich",
    ],

    "supplement_nutrients": [
        "vitamin d", "vitamin b12",
        "iron", "magnesium",
        "omega 3", "fish oil",
        "supplement",
    ],

    "fast_food_restaurant": [
        "mcdonalds", "burger king", "subway",
        "pizza hut", "dominos", "chick-fil-a",
        "sonic", "jersey mike's", "perkins",
        "whataburger", "giordano's", "dos coyotes",
        "chipotle", "panda express", "taco bell",
    ],

    "ethnic_dish": [
        "curry", "bolognese",
        "tamal", "enchilada", "fajita", "taco", "quesadilla", "burrito",
        "souvlaki", "teriyaki", "ramen", "pho",
        "empanada", "dimsum", "falafel", "naan",
        "pad thai", "sushi", "couscous",
        "shawarma", "pita", "gyro", "dumpling",
    ],

    "ready_meal_packaged": [
        "frozen meal", "microwave meal",
        "ready meal", "packaged meal",
        "tideford", "pieminister", "slimfast",
        "optavia", "jenny craig",
        "tv dinner", "prepackaged", "boxed meal",
    ],
}


# --- Foods: embed terms, score against domain prototypes, assign, export ---
#
# Reads `terms` / `df_terms` and `domain_anchors` defined above in this cell.
# Writes foods_clusters.csv with one row per accepted term.

# NOTE(review): the encoder is re-instantiated in every section cell;
# make_prototype() reads this global `model`.
model = SentenceTransformer(EMBED_MODEL)
emb_all = model.encode(terms, batch_size=1024, show_progress_bar=True,
                       convert_to_numpy=True, normalize_embeddings=True)

# One prototype per domain, built by make_prototype() from the anchor phrases.
domain_proto = {}
for d, anchors in domain_anchors.items():
    centroid, seed_mat = make_prototype(anchors)
    domain_proto[d] = {"centroid": centroid, "seeds": seed_mat, "anchors": anchors}

dom_names = list(domain_proto.keys())
proto_mat = np.stack([domain_proto[d]["centroid"] for d in dom_names], axis=0)  # D x dim

# Similarity of every term (rows) to every domain centroid (columns).
scores = emb_all @ proto_mat.T  # N x D
score_cols = [f"score_{d}" for d in dom_names]
scores_df = pd.DataFrame(scores, columns=score_cols)
scores_df["term_norm"] = terms

# auto_threshold() is applied to each domain's score column independently.
domain_thresholds = {d: auto_threshold(scores_df[f"score_{d}"]) for d in dom_names}

print("Auto thresholds:")
for d, t in domain_thresholds.items():
    print(f"  {d:24s} -> {t:.3f}")

# Domain assignment with rejection: a term keeps its best-scoring domain only
# if that score clears the domain threshold AND the global floor AND beats the
# runner-up domain by at least ASSIGN_MARGIN; otherwise it becomes "Other".
score_mat = scores_df[score_cols].values
top_idx = score_mat.argmax(axis=1)
top_score = score_mat.max(axis=1)
second_score = np.partition(score_mat, -2, axis=1)[:, -2]  # second-highest score per row
margin = top_score - second_score

assigned = []
for best_k, best_s, gap in zip(top_idx, top_score, margin):
    dom = dom_names[best_k]
    ok = (
        best_s >= domain_thresholds[dom] and
        gap >= ASSIGN_MARGIN and
        best_s >= SOFT_ACCEPT_FLOOR
    )
    assigned.append(dom if ok else "Other")

assign_df = pd.DataFrame({
    "term_norm": terms,
    "best_domain": assigned,
    "top_score": top_score,
    "second_score": second_score,
    "margin": margin
})
# term_norm was de-duplicated when df_terms was built, so this join must be
# exactly one-to-one; `validate` turns any violation into an error instead of
# silently duplicated rows.
assign_df = df_terms.merge(assign_df, on="term_norm", how="left", validate="one_to_one")

# Only terms that were confidently assigned are exported.
food_df = assign_df[assign_df["best_domain"] != "Other"].copy().reset_index(drop=True)

out1 = food_df[["term_original", "best_domain"]].sort_values("best_domain", ascending=True)
out1.to_csv("foods_clusters.csv", index=False)

print("Saved:")
print(" - foods_clusters.csv")


Loaded 45198 unique normalized terms.


Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Auto thresholds:
  caffeine                 -> 0.380
  sugar_sweets             -> 0.418
  alcohol                  -> 0.350
  gluten_grains            -> 0.430
  dairy                    -> 0.403
  fried_processed          -> 0.420
  fruit                    -> 0.392
  vegetable                -> 0.423
  meat_protein             -> 0.390
  plant_protein            -> 0.418
  condiment_spice          -> 0.419
  diet_pattern             -> 0.382
  fermented_probiotic      -> 0.386
  hydration                -> 0.350
  carbs_starches           -> 0.429
  snacks_small_meals       -> 0.422
  breakfast                -> 0.415
  supplement_nutrients     -> 0.350
  fast_food_restaurant     -> 0.376
  ethnic_dish              -> 0.384
  ready_meal_packaged      -> 0.389
Saved:
 - foods_clusters.csv


In [None]:
# TREATMENTS RENAME/CLUSTERING
#
# Load the treatment value counts and build `df_terms` / `terms`: one row per
# distinct normalized treatment string, keeping an original spelling alongside it.

treatments = pd.read_csv("treatment_value_counts.csv")

# Accept any of these column-name variants; the first one present wins.
treat_col_candidates = ["Treatment_name", "treatment", "Treatment", "treatment_name"]
treat_col = next((c for c in treat_col_candidates if c in treatments.columns), None)
if treat_col is None:
    # Fail with a message that names the problem instead of a bare IndexError.
    raise KeyError(
        f"treatment_value_counts.csv has none of the expected columns {treat_col_candidates}; "
        f"found {list(treatments.columns)}"
    )

raw_terms = treatments[treat_col].dropna().astype(str)

df_terms = pd.DataFrame({
    "term_original": raw_terms,
    "term_norm": raw_terms.map(normalize_text)
})
df_terms = (
    df_terms[df_terms["term_norm"] != ""]   # drop strings that normalize to nothing
    .drop_duplicates("term_norm")           # first original spelling is kept per normalized term
    .reset_index(drop=True)
)
df_terms["row_id"] = np.arange(len(df_terms))
terms = df_terms["term_norm"].tolist()

print(f"Loaded {len(terms)} unique normalized terms.")

# Treatment domains: label -> anchor phrases. Each list is passed to
# make_prototype() further down in this cell to build that domain's centroid.
# Key order defines the column order of the score matrix, and phrase order is
# kept exactly as authored.
#
# NOTE(review): some entries look like misspellings ("asprin", "propanolol",
# "pepci") - presumably included on purpose to match how users type them;
# confirm before "correcting" any of them.
# NOTE(review): a few phrases are anchors of two domains (flagged inline),
# which pulls those centroids toward each other.
domain_anchors = {
    "treatment_pain_antiinflammatory": [
        "ibuprofen", "advil", "motrin", "nurofen",
        "naproxen", "aleve",
        "diclofenac", "voltaren", "celebrex", "meloxicam", "mobic",
        "arthrotec", "ketorolac",
        "aspirin", "asprin", "excedrin",
        "tylenol", "paracetamol", "acetaminophen", "panadol",
        "codeine", "tramadol", "cocodamol", "panadeine", "opioid",
        "oxycodone", "percocet", "vicodin", "morphine", "fentanyl",
        "painkiller", "pain medicine", "pain management",
    ],

    # "lamotrigine" is also an anchor of "treatment_neuropathic_anticonvulsant";
    # "hydroxyzine" is also an anchor of "treatment_respiratory_allergy".
    "treatment_antidepressant_anxiolytic": [
        "amitriptyline", "nortriptyline", "mirtazapine", "trazodone",
        "duloxetine", "cymbalta", "venlafaxine", "effexor", "pristiq",
        "bupropion", "wellbutrin",
        "ssri", "fluoxetine", "prozac",
        "citalopram", "celexa", "escitalopram", "lexapro",
        "sertraline", "zoloft", "paroxetine", "paxil",
        "fluvoxamine", "trintellix",
        "abilify", "seroquel", "quetiapine",
        "lamictal", "lamotrigine", "lithium", "depakote", "valproate",
        "buspirone", "buspar",
        "diazepam", "valium", "lorazepam", "ativan",
        "alprazolam", "xanax", "clonazepam", "klonopin",
        "benzodiazepine", "hydroxyzine",
    ],

    "treatment_neuropathic_anticonvulsant": [
        "gabapentin", "neurontin", "pregabalin", "lyrica",
        "carbamazepine", "tegretol",
        "topiramate", "topamax",
        "oxcarbazepine", "keppra", "lamotrigine",
        "baclofen", "tizanidine",
        "cyclobenzaprine", "flexeril", "robaxin", "muscle relaxant",
    ],

    "treatment_hormone_endocrine": [
        "levothyroxine", "synthroid", "wp thyroid", "armour thyroid",
        "thyronorm", "cytomel",
        "progesterone", "estrogen", "estradiol",
        "birth control", "contraceptive", "microgynon", "junel", "visanne",
        "testosterone", "pregnenolone", "dhea", "t shot",
        "hormone replacement",
    ],

    "treatment_cardiovascular": [
        "propranolol", "propanolol", "metoprolol", "bisoprolol",
        "atenolol", "nebivolol",
        "verapamil", "diltiazem", "amlodipine",
        "losartan", "lisinopril", "hydrochlorothiazide",
        "diuretic", "beta blocker",
        "midodrine", "fludrocortisone", "florinef", "ivabradine",
        "xarelto", "blood thinner",
        "statin", "atorvastatin",
    ],

    "treatment_gastrointestinal": [
        "omeprazole", "prilosec", "pantoprazole", "protonix",
        "esomeprazole", "nexium", "lansoprazole",
        "zantac", "ranitidine", "famotidine", "pepci",
        "gaviscon", "tums",
        "miralax", "laxido", "senna", "colace", "stool softener",
        "imodium", "loperamide", "buscopan",
        "ondansetron", "zofran", "domperidone", "metoclopramide",
        "antacid", "digestive enzyme", "enzyme", "pepcid",
    ],

    "treatment_immune_autoimmune": [
        "prednisone", "prednisolone", "methylprednisolone", "medrol",
        "hydrocortisone", "corticosteroid",
        "plaquenil", "hydroxychloroquine",
        "methotrexate", "azathioprine", "imuran",
        "cellcept", "tacrolimus", "cyclosporine",
        "humira", "cosentyx", "biologic",
        "sulfasalazine", "mesalamine", "salofalk", "pentasa",
        "entocort", "budesonide",
    ],

    "treatment_supplements_vitamins": [
        "vitamin d", "vitamin d3", "vitamin b12", "b-complex",
        "vitamin c", "vitamin e", "vitamin a", "vitamin k2",
        "folate", "folic acid",
        "iron", "ferrous", "floradix",
        "magnesium", "calcium", "zinc", "selenium", "manganese",
        "coq10", "ubiquinol",
        "omega 3", "fish oil", "krill oil",
        "probiotic", "probiotics",
        "multivitamin", "multi vitamin",
        "riboflavin", "niacin", "thiamine", "b6", "b1", "b5",
        "vitamin b complex", "cod liver oil",
        "electrolytes", "salt tablets",
    ],

    "treatment_herbal_natural": [
        "turmeric", "curcumin", "ashwagandha", "ginseng",
        "licorice", "echinacea", "milk thistle",
        "chlorella", "spirulina",
        "oregano oil", "dandelion", "elderberry", "ginger",
        "peppermint oil", "apple cider vinegar", "black seed oil",
        "maca", "quercetin", "withania", "samento",
        "herbal supplement",
    ],

    # "stretching" is also an anchor of "treatment_physical_rehab";
    # "aromatherapy" is also an anchor of "treatment_alternative_complementary".
    "treatment_sleep_relaxation": [
        "melatonin", "ambien", "zolpidem", "zopiclone", "lunesta",
        "sleep aid", "rest",
        "meditation", "mindfulness", "yoga",
        "breathing exercises", "progressive muscle relaxation",
        "massage", "heat pack", "epsom salt",
        "bath", "hot bath", "aromatherapy", "sauna", "stretching",
    ],

    "treatment_respiratory_allergy": [
        "antihistamine", "loratadine", "claritin",
        "cetirizine", "zyrtec",
        "fexofenadine", "allegra", "desloratadine",
        "hydroxyzine", "benadryl", "diphenhydramine",
        "flonase", "nasonex",
        "ventolin", "albuterol", "salbutamol",
        "symbicort", "flutiform", "fluticasone",
        "inhaler", "nasal spray", "epipen",
    ],

    "treatment_physical_rehab": [
        "physical therapy", "physiotherapy",
        "chiropractic", "osteopathy",
        "stretching", "exercise", "walking", "pilates",
        "elliptical", "rowing machine",
        "tens unit", "compression stockings", "ice pack",
        "chiropractic adjustment", "spinal decompression", "heat pad",
    ],

    "treatment_infusions_injections": [
        "ivig", "infusion", "iv fluids", "injection",
        "steroid shot", "iron infusion", "testosterone injection",
        "vitamin infusion", "iv therapy", "biologic injection",
    ],

    "treatment_diet_metabolic": [
        "gluten free diet", "low fodmap", "low fat diet",
        "ketogenic", "whole30", "clean eating", "fodmap friendly",
        "supplement shake", "protein shake", "meal replacement",
    ],

    "treatment_alternative_complementary": [
        "cbd", "cbd oil", "medical cannabis", "marijuana", "cannabis", "hemp oil",
        "acupuncture", "craniosacral", "reiki",
        "chi gong", "tai chi", "breath work",
        "prayer", "aromatherapy",
    ],

    "treatment_medical_appointments": [
        "doctor appointment", "specialist visit",
        "gp", "dermatologist", "psychiatrist",
        "therapy session", "follow-up appointment",
    ],
}


# --- Treatments: embed terms, score against domain prototypes, assign, export ---
#
# Reads `terms` / `df_terms` and `domain_anchors` defined above in this cell.
# Writes treatments_clusters.csv with one row per accepted term.

# NOTE(review): the encoder is re-instantiated in every section cell;
# make_prototype() reads this global `model`.
model = SentenceTransformer(EMBED_MODEL)
emb_all = model.encode(terms, batch_size=1024, show_progress_bar=True,
                       convert_to_numpy=True, normalize_embeddings=True)

# One prototype per domain, built by make_prototype() from the anchor phrases.
domain_proto = {}
for d, anchors in domain_anchors.items():
    centroid, seed_mat = make_prototype(anchors)
    domain_proto[d] = {"centroid": centroid, "seeds": seed_mat, "anchors": anchors}

dom_names = list(domain_proto.keys())
proto_mat = np.stack([domain_proto[d]["centroid"] for d in dom_names], axis=0)

# Similarity of every term (rows) to every domain centroid (columns).
scores = emb_all @ proto_mat.T   # N x D
score_cols = [f"score_{d}" for d in dom_names]
scores_df = pd.DataFrame(scores, columns=score_cols)
scores_df["term_norm"] = terms

# auto_threshold() is applied to each domain's score column independently.
domain_thresholds = {d: auto_threshold(scores_df[f"score_{d}"]) for d in dom_names}

print("Auto thresholds:")
# Pad to the longest label so the "->" column lines up for every domain.
name_width = max(len(d) for d in dom_names)
for d, t in domain_thresholds.items():
    print(f"  {d:{name_width}s} -> {t:.3f}")

# Domain assignment with rejection: a term keeps its best-scoring domain only
# if that score clears the domain threshold AND the global floor AND beats the
# runner-up domain by at least ASSIGN_MARGIN; otherwise it becomes "Other".
score_mat = scores_df[score_cols].values
top_idx = score_mat.argmax(axis=1)
top_score = score_mat.max(axis=1)
second_score = np.partition(score_mat, -2, axis=1)[:, -2]  # second-highest score per row
margin = top_score - second_score

assigned = []
for best_k, best_s, gap in zip(top_idx, top_score, margin):
    dom = dom_names[best_k]
    ok = (
        best_s >= domain_thresholds[dom] and
        gap >= ASSIGN_MARGIN and
        best_s >= SOFT_ACCEPT_FLOOR
    )
    assigned.append(dom if ok else "Other")

assign_df = pd.DataFrame({
    "term_norm": terms,
    "best_domain": assigned,
})
# term_norm was de-duplicated when df_terms was built, so this join must be
# exactly one-to-one; `validate` turns any violation into an error instead of
# silently duplicated rows.
assign_df = df_terms.merge(assign_df, on="term_norm", how="left", validate="one_to_one")

# Only terms that were confidently assigned are exported.
treatment_df = assign_df[assign_df["best_domain"] != "Other"].copy().reset_index(drop=True)

out1 = treatment_df[["term_original", "best_domain"]].sort_values("best_domain", ascending=True)
out1.to_csv("treatments_clusters.csv", index=False)

print("Saved:")
print(" - treatments_clusters.csv")

Loaded 8070 unique normalized terms.


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Auto thresholds:
  treatment_pain_antiinflammatory  -> 0.402
  treatment_antidepressant_anxiolytic -> 0.406
  treatment_neuropathic_anticonvulsant -> 0.393
  treatment_hormone_endocrine      -> 0.350
  treatment_cardiovascular         -> 0.397
  treatment_gastrointestinal       -> 0.394
  treatment_immune_autoimmune      -> 0.398
  treatment_supplements_vitamins   -> 0.365
  treatment_herbal_natural         -> 0.387
  treatment_sleep_relaxation       -> 0.350
  treatment_respiratory_allergy    -> 0.407
  treatment_physical_rehab         -> 0.350
  treatment_infusions_injections   -> 0.350
  treatment_diet_metabolic         -> 0.350
  treatment_alternative_complementary -> 0.350
  treatment_medical_appointments   -> 0.350
Saved:
 - treatments_clusters.csv
