In [42]:
import pandas as pd
import numpy as np

In [4]:
data = pd.read_csv('nacc_60plus.csv', low_memory=False)

In [5]:
data.shape

(161700, 1024)

# Form A1

In [6]:
demographics_features = {
    'subject_demographics' : [
        'SEX',
        'PRIMLANG',
        'EDUC',
        'MARISTAT',
        'INDEPEND',
        'RESIDENC',
        'NACCAGE'
    ]
}

# Form A2

In [7]:
co_participant_demograpgics = {
    'relationship': [
        'INRELTO', 
        'INKNOWN'
    ],
    
    'living': ['INLIVWTH']
}

# Form A3

In [8]:
family_history_features = {
    'impairement_within_family': ['NACCFAM'],
    
    'mutation_evidence': [
        'NACCFADM',  # AD
        'NACCFFTD',  # FTLD
        'NACCOM' # Other
    ]
    
}

# Form A4

In [9]:
medication_features = {
    'meds': ['ANYMEDS'],

    "antihypertensive_medications": [
        "NACCAHTN",  # any type
        "NACCHTNC",  # combination therapy
        "NACCACEI",  # ACE inhibitor
        "NACCAAAS",  # antiadrenergic agent
        "NACCBETA",  # beta-blocker
        "NACCCCBS",  # calcium channel blocker
        # "NACCEIUR",  # diuretic
        "NACCVASD",  # vasodilator
        "NACCANGI"   # angiotensin II inhibitor
    ],
    
    "lipid_lowering_medications": [
        "NACCLIPL"
    ],
    
    "anti_inflammatory_medications": [
        "NACCNSD"  # nonsteroidal anti-inflammatory
    ],
    
    "anticoagulant_antiplatelet_agents": [
        "NACCAC"
    ],
    
    "psychiatric_medications": [
        "NACCADEP",  # antidepressant
        "NACCAPSY",  # antipsychotic
        "NACCAANX"   # anxiolytic/sedative/hypnotic
    ],
    
    "neurological_medications": [
        "NACCADMD",  # Alzheimer's medications
        "NACCPDMD"   # antiparkinson agent
    ],
    
    "hormone_therapies": [
        "NACCEMD",   # estrogen therapy
        "NACCEPMD"   # estrogen + progestin
    ],
    
    "diabetes_medications": [
        "NACCDBMD"
    ]
}

# Form A5

In [10]:
health_history_features = {
    'smoking': [
        'TOBAC30', 
        'TOBAC100', 
        'SMOKYRS', 
        'PACKSPER',  
        'QUITSMOK', 
    ],

    'alcohol': [
        'ALCOCCAS', 
        'ALCFREQ'
    ],

    'cardiovascular': [
        'CVHATT',
        'HATTMULT',
        'CVAFIB',
        'CVANGIO',
        'CVBYPASS',
        'CVPACDEF',
        'CVPACE',
        'CVCHF',
        'CVANGINA',
        'CVHVALVE',
        'CVOTHR'
    ],

    'cerebrovascular': [
        'CBSTROKE', # Strokes
        'STROKMUL', # More than once
        'CBTIA', # TIA
        'TIAMULT' # More than once
    ],

    'neurologic_conditions': [
        'PD', # Parkinson's Disease
        'PDOTHR', # Other parkinsonism disorders
        'SEIZURES', 
        'NACCTBI', # history of TBI
        'NCOTHR' # other
    ],

    'other_medical_conditions': [
        'DIABETES', 'DIABTYPE',
        'HYPERTEN',
        'HYPERCHO',
        'B12DEF',
        'THYROID',
        'ARTHRIT', 'ARTHTYPE',
        'INCONTU', # Incontinence — urinary
        'INCONTF', # Incontinence — bowel
        'APNEA',
        'RBD',
        'INSOMN',
        'OTHSLEEP'
    ],

    'substance_abuse': [
        'ALCOHOL',
        'ABUSOTHR'
    ],

    'psychiatric_conditions': [
        'PTSD',
        'BIPOLAR',
        'SCHIZ',
        'DEP2YRS',
        'DEPOTHR',
        'ANXIETY',
        'OCD',
        'NPSYDEV',
        'PSYCDIS' # other
    ]
}

# Form B1

In [11]:
physical_measurment_features = {
    'bmi': ['NACCBMI'],
    'blood_pressure': [
        'BPSYS', # systolic
        'BPDIAS', # diastolic
    ],

    'heart_rate': ['HRATE'],

    'vision': [
        'VISION',
        'VISCORR', # wear corrective lense
        'VISWCORR' # working properly after weaaring corrective lenses
    ],

    'hearing': [
        'HEARING',
        'HEARAID',
        'HEARWAID'
    ]
}

# Form B2 and B3 are not in the packet

# Form B4

In [12]:
cdr_ftld_features = {
    'cdr_scores': [
        'MEMORY',
        'ORIENT',
        'JUDGMENT',
        'COMMUN',
        'HOMEHOBB',
        'PERSCARE',
        'CDRSUM', # sum of boxes
        'CDRGLOB' # global cdr
    ],
    
    'ftld_scores': [
        'COMPORT',
        'CDRLANG'
    ]
}

# Form B5

In [13]:
behavioural_assessment_features = {
    # NPI (Neuropsychiatric Inventory)
    'npi_symptoms': [
        'DEL',  # Delusions
        'DELSEV',
        # 'HALLUC',    # Hallucinations       
        'HALLSEV',
        'AGIT',      # Agitation
        'AGITSEV',
        'DEPD',      # Depression
        'DEPDSEV',
        'ANX',       # Anxiety
        'ANXSEV',
        'ELAT',      # Elation
        'ELATSEV',
        'APA',       # Apathy
        'APASEV',
        'DISN',  # Disinhibition
        'DISNSEV',
        'IRR',       # Irritability
        'IRRSEV',
        'MOT',     # Motor disturbance
        'MOTSEV',
        'NITE',      # Nighttime behaviors
        'NITESEV',
        'APP',        # Appetite changes
        'APPSEV',

    ]
}

# Form B6 (not including in out dataset)

# Form B7

In [14]:
functional_assessment_features = {
    # FAQ (Functional Activities Questionnaire)
    'faq_items': [
        'BILLS',     # Managing bills/finances
        'TAXES',     # Handling taxes
        'SHOPPING',  # Shopping
        'GAMES',     # Games/hobbiesssssss
        'STOVE',     # Kitchen appliances
        'MEALPREP',  # Meal preparation
        'EVENTS',    # Remembering events
        'PAYATTN',   # Paying attention
        'REMDATES',  # Remembering dates
        'TRAVEL',    # Traveling
    ]
}

# Form B8

In [15]:
neurological_examination_features = {
    'neurological_examination': [
        'NACCNREX',   # Abnormal findings - Q1
        'PARKSIGN',   # Parkinsonian signs - Q2
        'BRADY',      # Bradykinesia - Q2d
        'PARKGAIT',   # Parkinsonian gait disorder - Q2e
        'POSTINST',   # Postural instability - Q2f
        'CVDSIGNS',   # Neurological signs relate to cerebrovascular disease - Q3
        'CORTDEF',    # Cortical cognitive deficit  - Q3a
        'SIVDFIND',   # Focal or other neurological findings - Q3b
        'CVDMOTL',    # Motor (may include weakness of combination of face, arm, and leg; reflex changes, etc.) — left side - Q3c1
        'CVDMOTR',    # Motor (may include weakness of combination of face, arm, and leg; reflex changes, etc.) — right side - Q3c2
        'CORTVISL',   # Cortical visual field loss — left side - Q3d1
        'CORTVISR',   # Cortical visual field loss — right side - Q3d2
        'SOMATL',     # Somatosensory loss — left side - Q3e1
        'SOMATR',     # Somatosensory loss — right side - Q3e2
        'POSTCORT',   # Higher cortical visual problem suggesting posterior cortical atrophy or apraxia of gaze - Q4
        'PSPCBS',     # Findings suggestive of progressive supranuclear palsy (PSP), corticobasal syndrome (CBS), or other related disorders - Q5
        'ALSFIND',    # Findings suggesting ALS - Q6
        'GAITNPH',    # gait apraxia - Q7
        'OTHNEUR',    # Other findings - Q8
    ]
}

 # 'RESTTRL',    # left arm tremor - Q2a1
        # 'RESTTRR',    # right arm tremor - Q2a2
        # 'SLOWINGL',   # Left side Slowing of fine motor movements- Q2b1
        # 'SLOWINGR',   # Right side Slowing of fine motor movements - Q2b2
        # 'RIGIDL',     # Left arm Rigidity - Q2c1
        # 'RIGIDR',     # Right arm Rigidity - Q2c2

In [39]:
# List of features to check
neurological_examination_features = [
    'NACCNREX', 'PARKSIGN', 'BRADY', 'PARKGAIT', 'POSTINST', 'CVDSIGNS',
    'CORTDEF', 'SIVDFIND', 'CVDMOTL', 'CVDMOTR', 'CORTVISL', 'CORTVISR',
    'SOMATL', 'SOMATR', 'POSTCORT', 'PSPCBS', 'ALSFIND', 'GAITNPH', 'OTHNEUR'
]

# Filter to include only existing columns in your data
existing_features = [col for col in neurological_examination_features if col in data.columns]

# Count nulls for all those features
null_counts = data[existing_features].isnull().sum()

print(null_counts)

NACCNREX     0
PARKSIGN    13
BRADY       13
PARKGAIT    13
POSTINST    13
CVDSIGNS    13
CORTDEF     13
SIVDFIND    13
CVDMOTL     13
CVDMOTR     13
CORTVISL    13
CORTVISR    13
SOMATL      13
SOMATR      13
POSTCORT    13
PSPCBS      13
ALSFIND     13
GAITNPH     13
OTHNEUR     13
dtype: int64


In [40]:
minus4_counts = (data[existing_features] == -4).sum()
print(minus4_counts)

NACCNREX    20965
PARKSIGN    96255
BRADY       96255
PARKGAIT    96255
POSTINST    96255
CVDSIGNS    96255
CORTDEF     96255
SIVDFIND    96255
CVDMOTL     96255
CVDMOTR     96255
CORTVISL    96255
CORTVISR    96255
SOMATL      96255
SOMATR      96255
POSTCORT    96255
PSPCBS      96255
ALSFIND     96255
GAITNPH     96255
OTHNEUR     96255
dtype: int64


In [43]:
data[existing_features] = data[existing_features].replace(-4, np.nan)

In [44]:
# Filter to include only existing columns in your data
existing_features = [col for col in neurological_examination_features if col in data.columns]

# Count nulls for all those features
null_counts = data[existing_features].isnull().sum()

print(null_counts)

NACCNREX    20965
PARKSIGN    96268
BRADY       96268
PARKGAIT    96268
POSTINST    96268
CVDSIGNS    96268
CORTDEF     96268
SIVDFIND    96268
CVDMOTL     96268
CVDMOTR     96268
CORTVISL    96268
CORTVISR    96268
SOMATL      96268
SOMATR      96268
POSTCORT    96268
PSPCBS      96268
ALSFIND     96268
GAITNPH     96268
OTHNEUR     96268
dtype: int64


# Form B9

In [169]:
clinician_judgment_features = {
    'memory_subject_co_parti': [
        'DECSUB',    # decline in memory - Q1
        'DECIN',     # decline in subject’s memory - Q2
    ],
    'cognitive_symptoms': [
        'DECCLCOG',  #  meaningful impairment in cognition - Q3
        'COGMEM',    # memory - Q4a
        'COGORI',    # orientation - Q4b
        'COGJUDG',   # executive function — judgment, planning, or problem-solving - Q4c
        'COGLANG',   # language - Q4d
        'COGVIS',    # visuospatial function - Q4e
        'COGATTN',   # attention or concentration - Q4f
        'COGFLUC',   # currently has fluctuating cognition - Q4g
        'COGFLAGO',  # age the fluctuating cognition begin? - 4g1
        'COGOTHR',   # other cognitive - Q4h
        'NACCCOGF',  # predominant symptom  - Q5
        'COGMODE',   # Mode of onset of cognitive symptoms - Q6
        'DECAGE',    # age did the cognitive decline begin - Q7
    ],
    'behavioral_symptoms': [
        'DECCLBE',   # behavioral symptoms - Q8
        'BEAPATHY',  # change in behavior — Apathy, withdrawal - Q9a
        'BEDEP',     # Depressed mood - Q9b
        'BEVHALL',   # Psychosis - Visual hallucinations - Q9C1
        'BEVWELL',   # hallucinations well-formed and detailed? - Q9c1a
        'BEVHAGO',   # age these hallucinations begin - Q9c1b
        'BEAHALL',   # Psychosis —  Auditory hallucinations - Q9c2
        'BEDEL',     # Psychosis — Abnormal, false, or delusional beliefs - Q9c3
        'BEDISIN',   # Disinhibition - Q9d
        'BEIRRIT',   # Irritability - Q9e
        'BEAGIT',    # Agitation - Q9f
        'BEPERCH',   # Personality change - Q9g
        'BEREM',     # REM sleep behavior disorder - Q9h
        'BEREMAGO',  # age the REM sleep behavior disorder begin? - Q9h1
        'BEANX',     # Anxiety - Q9i
        'BEOTHR',    # Other - Q9j
        'NACCBEHF',  # predominant symptom = decline in the subject’s behavior - Q10
        'BEMODE',    # Mode of onset of behavioral symptoms - Q11
        # 'BEMODEX',   # Other - Q11a
        'BEAGE',     # age the behavioral symptoms begin - Q12
    ],
    'motor_symptoms': [
        'DECCLMOT',  # experiencing any motor symptoms - Q13
        'MOGAIT',    # Gait disorder - Q14a
        'MOFALLS',   # Falls - Q14b
        'MOTREM',    # Tremor - Q14c
        'MOSLOW',    # Slowness - Q14d
        'NACCMOTF',  # subject’s motor function - Q15
        'MOMODE',    # Mode of onset of motor symptoms - Q16
        'MOMOPARK',  # suggestive of Parkinsonism - Q17
        'PARKAGE',   # age Parkinsonism begins - Q17a
        'MOMOALS',   # amyotrophic lateral sclerosis? - Q18
        'ALSAGE',    # age - ALS begin - Q18a
        'MOAGE',     # age the motor changes begin? - Q19
    ],
    'decline_course_predominant_domain': [
        'COURSE',    # Overall decline of cognitive/behavioral/motor syndrome - Q20
        'FRSTCHG',   # Q21
    ],
    'lewy_body_frontotemporal': [
        'LBDEVAL',   # Lewy body disease - Q22
        'FTLDEVAL',  # frontotemporal lobar degeneration - Q23
    ]
}

# Form C2 - DO NOT USE THIS FORM

In [170]:
neuropsychological_battery_scores = {
    'montreal_cognitive_assessment' : [
        'MOCACOMP',   # Was any part of MoCA administered? - Q1a
        'MOCAVIS',    # Subject was unable to complete one or more sections due to visual impairment - Q1d
        'MOCAHEAR',   # Subject was unable to complete one or more sections due to hearing impairmen - Q1c
        'MOCATOTS',   # MoCA Total Raw Score — uncorrected - Q1f
        'NACCMOCA',   # MoCA Total Score — corrected for education
        'MOCATRAI',   # MoCA: Visuospatial/executive — Trails - Q1g
        'MOCACUBE',   # MoCA: Visuospatial/executive — Cube - Q1h
        'MOCACLOC',   # MoCA: Visuospatial/executive —  Clock contour - Q1i
        'MOCACLON',   # MoCA: Visuospatial/executive —  Clock numbers - Q1j
        'MOCACLOH',   # MoCA: Language — Naming - Q1L
        'MOCAREGI',   # MoCA: Memory — Registration (two trials) - Q1m
        'MOCADIGI',   # MoCA: Attention — Digits - Q1n
        'MOCALETT',   # MoCA: Attention — Letter A - Q1o
        'MOCASER7',   # MoCA: Attention — Serial 7s - Q1p
        'MOCAREPE',   # MoCA: Language — Repetition - Q1q
        'MOCAFLUE',   # MoCA: Language — Fluency - Q1r
        'MOCAABST',   # MoCA: Abstraction - Q1s
        'MOCARECN',   # MoCA: Delayed recall — No cue - Q1t
        'MOCARECC',   # MoCA: Delayed recall — Category cue - Q1u
        'MOCARECR',   # MoCA: Delayed recall — Recognition - Q1v
        'MOCAORDT',   # MoCA: Orientation — Date - Q1w
        'MOCAORMO',   # MoCA: Orientation — Month - Q1x
        'MOCAORYR',   # MoCA: Orientation — Year - Q1y
        'MOCAORDY',   # MoCA: Orientation — Day - Q1z
        'MOCAORPL',   # MoCA: Orientation — Place - Q1aa
        'MOCAORCT',   # MoCA: Orientation — City - Q1bb
    ],
    'craft_story_21_immediate' : [
        'CRAFTVRS',   # Craft Story 21 Recall (Immediate) — Total story units recalled, verbatim scoring - Q3a
        'CRAFTURS',   # Craft Story 21 Recall (Immediate) — Total  story units recalled, paraphrase scoring - Q3b
    ],
    'benson_complex_figure_copy' : [
        'UDSBENTC'   #  Total score for copy of Benson figure - Q4a
    ],
    'number_span_test_forward': [
        'DIGFORCT',   # Number Span Test: Forward — Number of correct trials - Q5a
        'DIGFORSL',   # Number Span Test: Forward — Longest span forward - Q5b
    ],
    'number_span_test_backward': [
        'DIGBACCT',   # Number Span Test: Backward — Number of correct trials - Q6a
        'DIGBACLS',   # Number Span Test: Backward — Longest span forward - Q6b
    ],
    'category_fluency_test': [
        'ANIMALS',    # Animals — Total number of animals named in 60 seconds - Q7a
        'VEG',        # Vegetables — Total number of vegetables named in 60 seconds - Q7b
    ],
    'trail_making_test': [
        'TRAILA',     # Trail Making Test Part A — Total number of seconds to complete - Q8a
        'TRAILARR',   # Part A — Number of commission errors - Q8a1
        'TRAILALI',   # Part A — Number of correct lines - A8a2
        'TRAILB',     # Trail Making Test Part B — Total number of seconds to complete - Q8b
        'TRAILBRR',   # Part B — Number of commission errors - Q8b1
        'TRAILBLI',   # Part A — Number of correct lines - A8b2
    ],
    'craft_story_21_delayed' : [
        'CRAFTDVR',   # Craft Story 21 Recall (Delayed) — Total story units recalled, verbatim scoring - Q9a
        'CRAFTDRE',   # Craft Story 21 Recall (Delayed) — Total story units recalled, paraphrase scoring - Q9b
        'CRAFTDTI',   # Craft Story 21 Recall (Delayed) —  Delay time - Q9c
        'CRAFTCUE',   # Craft Story 21 Recall (Delayed) — Cue (boy) needed - Q9d
    ],
    'benson_complex_figure_recall' : [
        'UDSBENTD',   # Total score for 10- to 15-minute delayed drawing of Benson figure  - Q10a
        'UDSBENRS',   # Recognized original stimulus from among four options - Q10b
    ],
    'multilingual_naming_test' : [
        'MINTTOTS',   # Total score - Q11a
        'MINTTOTW',   # Total correct without semantic cue - Q11b
        'MINTSCNG',   # Semantic cues: Number given - Q11c
        'MINTSCNC',   # Semantic cues: Number correct with cue - Q11d
        'MINTPCNG',   # Phonemic cues: Number given - Q11e
        'MINTPCNC',   # Phonemic cues: Number correct with cue - Q11f
    ],
    'verbal_fluency_test': [
        'UDSVERFC',   # Number of correct F-words generated in 1 minute - Q12a
        'UDSVERFN',   # Number of F-words repeated in 1 minute - Q12b
        'UDSVERNF',   # Number of non-F-words and rule violation errors in 1 minute - Q12c
        'UDSVERLC',   # Number of correct L-words generated in 1 minute - Q12d
        'UDSVERLR',   # Number of L-words repeated in 1 minute - Q12e
        'UDSVERLN',   # Number of non-L-words and rule violation - Q12f
        'UDSVERTN',   # Total number of correct F-words and  - Q12g
        'UDSVERTE',   # Total number of F-word and L-word repetition errors - Q12h
        'UDSVERTI',   # Total number of non-F/L-words and rule violation errors - Q12i
    ],
    'overall_appraisal': [
        'COGSTAT',   #  the subject’s cognitive status is deemed - Q13a
    ]
}

# Form D1

In [171]:
# Section 1
clinician_diagnosis_features = {
    'cognitive_status': [
        'NORMCOG',    # Normal cognition and behavior - Q 2
        'NACCNORM',   # NACC derived -  Normal cognition at all visits to date - Q 2
        'DEMENTED',   # Met criteria for dementia - Q 3
        'NACCIDEM',   # NACC derived Incident dementia during UDS follow-up - Q 3
        'NACCUDSD',   # NACC derived -  Cognitive status at UDS visit - Q. [2, 3, 5a-5e]
        'AMNDEM',     # Amnestic multidomain dementia syndrome - Q 4a
        'PCA',        # Posterior cortical atrophy syndrome (or primary visual presentation) - Q 4b
        'NACCPPA',    # NACC derived - Primary progressive aphasia (PPA) with cognitive impairment - Q 4c
        'NACCPPAG',   # NACC derived — PPA subtype according - Q 4c1
        'NACCBVFT',   # Dementia syndrome — behavioral variant FTD syndrome (bvFTD) - Q 4d
        'NACCLBDS',   # Dementia syndrome — Lewy body dementia syndrome - Q 4e
        'NAMNDEM',    # Dementia syndrome — Non-amnestic multidomain dementia, not PCA, PPA, bvFTD, or DLB syndrome - Q 4e
        'NACCTMCI',   # Mild cognitive impairment (MCI) type - Q. 5a-d MCI type
        'NACCMCII',    # NACC derived - Incident MCI during UDS follow-up - Q [5a, 5b, 5c, 5d]
        'NACCMCIL',   # MCI — language - Q. [5b1, 5c1, 5d1]
        'NACCMCIA',   # MCI attention - Q. [5b2, 5c2, 5d2]
        'NACCMCIE',   # MCI executive function - Q. [5b3, 5c3, 5d3]
        'NACCMCIV',   # MCI visuospatial - Q. [5b4, 5c4, 5d4]
        'IMPNOMCI',   # Cognitively impaired, not MCI Q. 5e
    ],
     'etiologic_diagnoses': [
        'NACCETPR',  # NACC derived  
        'NACCALZD',  # NACC derived  - Alzheimer’s diseas - Q 11
        'NACCALZP',  # NACC derived - Primary, contributing, or non-contributing cause of observed cognitive impairment — Alzheimer’s disease (AD) - Q 11a
        'NACCLBDE',  # NACC derived  Lewy body disease - Q 12
        'NACCLBDP',  # NACC derived -Primary/contributing/non-contributing — Lewy body disease (LBD) - Q 12 a
        'PARK',      # Parkinson’s disease present - Q 12b
        'MSA',       # Multiple system atrophy (MSA) - Q 13
        'PSP',       # primary supranuclear palsy (PSP) - Q 14a
        'CORT',      # Presumptive etiologic diagnosis — Corticobasal degeneration (CBD) - Q 14b
        'FTLDMO',    # FTLD with motor neuron disease (MND) - W 14c
        'CVD',       # Vascular brain injury (VBI)  - Q 15
        'DOWNS',     # Down syndrome - Q 17
        'HUNT',      # Huntington’s disease Numeric longitudinal - Q 18
        'PRION',     # Prion disease - Q 19
        'BRNINJ',    # Traumatic brain injury (TBI) - Q 20
        'EPILEP',    # Epilepsy - Q 22
        'NEOP',      # Presumptive etiologic diagnosis — CNS neoplasm - Q 23
        'HIV',       # Q24
        'DEP',       # Depression - Q 26
        'BIPOLDX',   # Bipolar disorder - Q 27
        'ANXIET',    # Anxiety - Q 29
        'DELIR',     # Delirium - Q 30
        'PTSDDX',    # Post traumatic stress disorder (PTSD) - Q 31
        'ALCDEM',    # Cognitive impairment due to alcohol abuse - Q 33
        'MEDS',      # Cognitive impairment due to medications - Q 36
    ],

    'inheritance': [
        'NACCADMU', # AD Mutation
        'NACCFTDM' # FTLD Mutation
    ]
}

# Form D2

In [172]:
medical_conditions_features = {
    'medical_conditions_procedures': [
        'CANCER',   # Cancer present in the last 12 months - Q1
        'DIABET',   # Diabetes - Q2
        'MYOINF',   # Myocardial infarct - Q3
        'CONGHRT',  # Congestive heart failure - Q4
        'AFIBRILL', # Atrial fibrillation - Q5
        'HYPERT',   # Hypertension - Q6
        'ANGINA',   # Angina - Q7
        'HYPCHOL',  # Hypercholesterolemia - Q8
        'VB12DEF',  # B12 deficiency - Q9
        'THYDIS',   # Thyroid disease - Q10
        'ARTH',     # Arthritis - Q11
        'ARTYPE',   # Arthritis type - Q11a 
        'URINEINC', # Incontinence — urinary - Q12
        'BOWLINC',  # Incontinence — bowel - Q13
        'SLEEPAP',  # Sleep apnea present - Q14
        'REMDIS',   # REM sleep behavior disorder (RBD) - Q15
        'HYPOSOM',  # Hyposomnia/insomnia - Q16
        'SLEEPOTH', # Other sleep disorder - Q17
        'ANGIOCP',  # Carotid procedure - Q18 - Not directly related but chance for heart issue
        'ANGIOPCI', # Percutaneous coronary intervention - Q19 - Not directly related
        'PACEMAKE', # pacemaker and/or defibrillator - Q20
        'HVALVE',   # heart valve replacement or repair - Q21
        'ANTIENC',  # Antibody-mediated encephalopathy  - Q22
    ]
}

# Variable not to use 

In [173]:
 # ---- everything focus on PSP - so do we want to focus on PSP? - Form B8
        # 'PSPCBS',     # Findings suggestive of progressive supranuclear palsy (PSP), corticobasal syndrome (CBS), or other related disorders - Q5
        # 'EYEPSP',     # Eye movement changes consistent with PSP - Q5a - need to think about it - Q5a
        # 'DYSPSP',     # Dysarthria consistent with PSP - Q5b
        # 'AXIALPSP',   # Axial rigidity consistent with PSP - Q5c
        # 'GAITPSP',    # Gait disorder consistent with PSP - Q5d
        # 'APRAXSP',    # Apraxia of speech - Q5e
        # 'APRAXL',     # Apraxia consistent with CBS — left side - Q5f1
        # 'APRAXR',     # Apraxia consistent with CBS — right side - Q5f2
        # 'CORTSENL',   # Cortical sensory deficits consistent with CBS — left side - Q5g1
        # 'CORTSENR',   # Cortical sensory deficits consistent with CBS — right side - Q5g2
        # 'ATAXL',      # Ataxia consistent with CBS — left side - Q5h1
        # 'ATAXR',      # Ataxia consistent with CBS — right side - Q5h2
        # 'ALIENLML',   # Alien limb consistent with CBS — left side - Q5i1
        # 'ALIENLMR',   # Alien limb consistent with CBS — right sid - Q5i2
        # 'DYSTONL',    # Dystonia consistent with CBS, PSP, or related disorder — left side - Q5j1
        # 'DYSTONR',    # Dystonia consistent with CBS, PSP, or related disorder — right side- Q5j2
        # 'MYOCLLT',    # Myoclonus consistent with CBS — left side - Q5k1
        # 'MYOCLRT',    # Myoclonus consistent with CBS — right side - Q5k2
        
        # 'FOCLDEF',    # Deprecated in V3
        # 'GAITDIS',    # Deprecated in V3
        # 'EYEMOVE'     # Deprecated in V3

In [174]:
# Form D1 - Section 3

#         'PROBAD',    # Presumptive etiologic diagnosis of the cognitive disorder — Probable Alzheimer’s disease - Q 11
#         'PROBADIF',  # Primary, contributing, or non-contributing cause of cognitive impairment — Probable Alzheimer’s disease - Q 11a      
#         'FTLDNOS',   # Presumptive etiologic diagnosis of the cognitive disorder — FTLD not otherwise specified (NOS) - Q 14d
#         'FTLDNOIF',  # Primary, contributing, or non-contributing cause of cognitive impairment — FTLD not otherwise specified (NOS) - Q 14d1
#         'FTLDSUBT',  # FTLD subtype - Q 14e
#         'FTLDSUBX',  # Other FTLD subtype, specify - Q 14e1
#         'CVD',       # Presumptive etiologic diagnosis — Vascular brain injury (VBI)  - Q 15
#         'CVDIF',     # Primary, contributing, or non-contributing cause of cognitive impairment — vascular brain injury - Q 15a
#         'PREVSTK',   # Previous symptomatic stroke - Q 15b
#         'STROKDEC',  # Temporal relationship between stroke and cognitive decline - Q 15b1
#         'STKIMAG',   # Confirmation of stroke by neuroimaging - Q 15b2
#         'INFNETW',   # Imaging evidence of cystic infarction in cognitive network(s) - Q 15c
#         'INFWMH',    # Imaging evidence of cystic infarction - Stroke damage, white matter disease, poor thinking skills - Q 15d
#         'ESSTREM',   # Presumptive etiologic diagnosis —  Essential tremor - Q 16
#         'ESSTREIF',  # Primary, contributing, or non-contributing cause of cognitive impairment — Essential tremor - Q 16a

#         'BRNINCTE',  # Symptoms consistent with chronic traumatic encephalopathy (CTE) - Q 20b
#         'HYCEPH',    # Presumptive etiologic diagnosis of the cognitive disorder — Normal-pressure hydrocephalus (NPH) - Q 21
#         'HYCEPHIF',  # Primary, contributing, or non-contributing cause of cognitive impairment — Normalpressure hydrocephalus (NPH) - Q 21a 
#         
#         
#         'NEOPSTAT',  # CNS neoplasm — benign or malignant - Q23b

#         'OTHCOG',    # Presumptive etiologic diagnosis — Other neurological, genetic, or infectious condition - Q 25
#         'OTHCOGIF',  # Primary, contributing, or non-contributing cause of cognitive impairment — Other neurological, genetic, or infectious condition - Q 25a
#         'OTHCOGX',   # Presumptive etiologic diagnosis of the cognitive disorder — Other neurological, genetic, or infectious condition (specify)  Q 25b
#         'DEPIF',     # Primary, contributing, or non-contributing cause of cognitive impairment — Depression - Q26 a
#         'DEPTREAT',  # Depression — Treated or untreated - Q 26b
#         
#         'BIPOLDIF',  # Primary, contributing, or non-contributing cause of cognitive impairment — bipolar disorder - Q 27a
#         'SCHIZOP',   # Presumptive etiologic diagnosis — Schizophrenia or other psychosis - Q 28
#         'SCHIZOIF',  # Primary, contributing, or non-contributing cause of cognitive impairment — Schizophrenia or other psychosis - Q 28a
#         
#         'ANXIETIF',  # Primary, contributing, or non-contributing cause of cognitive impairment — Anxiety - Q 29a
#         
#         'DELIRIF',   # Primary, contributing, or non-contributing cause of cognitive impairment — Delirium - Q 30a
#         'PTSDDXIF',  # Primary, contributing, or non-contributing cause of cognitive impairment — PTSD - Q 31a
#         'OTHPSY',    # Presumptive etiologic diagnosis — Other psychiatric disease - Q 32
#         'OTHPSYIF',  # Primary, contributing, or non-contributing cause of cognitive impairment — Other psychiatric disease - Q 32a
#         'OTHPSYX',   # Presumptive etiologic diagnosis of the cognitive disorder — Other psychiatric disease (specify) - Q 32b
#        
#         'ALCDEMIF',  # Primary, contributing, or non-contributing cause of cognitive impairment — Alcohol abuse - Q 33a
#         'ALCABUSE',  # Current alcohol abuse - Q 33b
#         'IMPSUB',    # Presumptive etiologic diagnosis of the cognitive disorder — Cognitive impairment due to other substance abuse - Q 34
#         'IMPSUBIF',  # Primary, contributing, or non-contributing cause of cognitive impairment — Other substance abuse - Q 34a
#         'DYSILL',    # Presumptive etiologic diagnosis of the cognitive disorder — Cognitive impairment due to systemic disease/medical illness - Q 35
#         'DYSILLIF',  # Primary, contributing, or non-contributing cause of cognitive impairment — systemic disease/medical illness - Q 35a
#         'MEDSIF',    # Primary, contributing, or non-contributing cause of cognitive impairment — medications - Q 36a
#         'DEMUN',     # Deprecated in V3 -  Presumptive etiologic diagnosis of the cognitive disorder — Undetermined etiology
#         'DEMUNIF',   # Deprecated in V3 -  Primary, contributing, or non-contributing cause of cognitive impairment — Undetermined etiology
#         'COGOTH',    # Presumptive etiologic diagnosis of the cognitive disorder — Other 1 (specify) - Q 37
#         'COGOTHIF',  # Primary, contributing, or non-contributing cause of cognitive impairment — Other 1 (specify) - Q 37a
#         'COGOTHX',   # Other presumptive etiologic diagnosis of the cognitive disorder 1, specify - Q 37b
#         'COGOTH2',   # Presumptive etiologic diagnosis of the cognitive disorder — Other 2 (specify) - Q 38
#         'COGOTH2F',  # Primary, contributing, or non-contributing cause of cognitive impairment — Other 2 (specify) - Q 38a
#         'COGOTH2X',  # Other presumptive etiologic diagnosis of the cognitive disorder 2, specify - Q 38b
#         'COGOTH3',   # Presumptive etiologic diagnosis of the cognitive disorder — Other 3 (specify) - Q 39
#         'COGOTH3F',  # Primary, contributing, or non-contributing cause of cognitive impairment — Other 3 (specify) - Q 39a


# Feature Selection

In [175]:
all_feature_dicts = [
    demographics_features,
    co_participant_demograpgics,
    family_history_features,
    medication_features,
    health_history_features,
    physical_measurment_features,
    cdr_ftld_features,
    behavioural_assessment_features,
    functional_assessment_features,
    neurological_examination_features,
    clinician_judgment_features,
    clinician_diagnosis_features,
    medical_conditions_features
    
]

selected_features = []
for feature_group in all_feature_dicts:
    for feature_list in feature_group.values():
        selected_features.extend(feature_list)

selected_features = list(set(selected_features))
df_selected = data[selected_features]
df_selected.head()


Unnamed: 0,DEPD,NACCOM,MOTSEV,NACCLIPL,BEIRRIT,BEMODE,MOTREM,NACCMCIV,BEDISIN,BPSYS,...,NACCNORM,NACCCCBS,VB12DEF,BEDEL,DISN,NACCFFTD,APP,CBTIA,BEREMAGO,BEVHALL
0,0,9,8,0,0,0,0,1,0,160.0,...,0,1,0.0,0,0,0,0,0,888,0
1,0,9,8,1,0,0,0,0,0,175.0,...,0,0,0.0,0,0,0,0,-4,888,0
2,-4,9,-4,0,0,0,0,8,0,149.0,...,1,0,0.0,0,-4,0,-4,0,888,0
3,0,-4,8,0,0,0,0,8,0,888.0,...,0,0,8.0,0,0,0,1,9,888,0
4,0,9,8,0,0,0,0,8,0,148.0,...,1,0,0.0,0,0,0,0,0,888,0


In [176]:
df_selected.shape

(161700, 277)

# Missing value analysis

In [177]:
(df_selected.isnull().mean() * 100).round(2)

DEPD        0.0
NACCOM      0.0
MOTSEV      0.0
NACCLIPL    0.0
BEIRRIT     0.0
           ... 
NACCFFTD    0.0
APP         0.0
CBTIA       0.0
BEREMAGO    0.0
BEVHALL     0.0
Length: 277, dtype: float64

In [178]:
pd.set_option('display.max_rows', None)  # Show all rows
print(df_selected.isnull().sum())
pd.reset_option('display.max_rows')      # Reset after if needed


DEPD          0
NACCOM        0
MOTSEV        0
NACCLIPL      0
BEIRRIT       0
BEMODE        0
MOTREM        0
NACCMCIV      0
BEDISIN       0
BPSYS       389
CORT          0
CDRGLOB       0
BEPERCH       0
COGFLAGO      0
NACCETPR      0
NACCBETA      0
INSOMN        0
SHOPPING      0
BEDEP         0
BIPOLAR       0
OTHNEUR      13
RBD           0
THYDIS        1
NAMNDEM       0
CVPACE        0
COGORI        0
PSYCDIS       0
COMMUN        0
IRR           0
SEIZURES      0
NACCTMCI      0
CVD           0
SLEEPOTH      1
ARTHTYPE      0
CVDSIGNS     13
NACCMCIE      0
HRATE       389
EDUC          0
BEANX         0
SOMATR       13
NACCMCIL      0
CORTVISR     13
NACCAANX      0
ANGIOPCI      1
PD            0
DEPDSEV       0
THYROID       0
PDOTHR        0
SIVDFIND     13
TOBAC100      0
CVDMOTL      13
FTLDEVAL      0
INCONTF       0
CVCHF         0
COMPORT       0
ANTIENC       1
NACCVASD      0
ANXIETY       0
DIABETES      0
NACCTBI       0
PAYATTN       0
NACCCOGF      0
ELATSEV 

In [179]:
df_selected['NACCALZD'].value_counts()

NACCALZD
8    78213
1    56592
0    26895
Name: count, dtype: int64