In [1]:
import argparse
import os
import re
import sys
import time

# Make the parent directory importable before the project-local imports below.
sys.path.append("../")

import numpy as np
import pandas as pd
from loguru import logger

import common
from code_mapping import *
from config import base_data_dict, DATA_DIR, PHENOTYPE_DIR, random_state
from utils.general_functs import save_pickle, load_pickle, combine_lists

In [2]:
# Read the diagnosis and medication concept dictionaries (tab-separated files
# whose locations come from base_data_dict).
dx_dict, med_dict = (
    pd.read_csv(base_data_dict[dict_key], sep="\t")
    for dict_key in ('DX_dict_file', 'MED_dict_file')
)

In [45]:
# Show every dictionary entry whose PheCode equals 290.11.
dx_dict.loc[dx_dict['PheCode'].eq(290.11)]

Unnamed: 0,concept_code,concept_name,concept_path,concept_type,concept_ontology,ICD10_code,PheCode,ADRD_DX
3565,ICD9:331.0,Alzheimer's disease,Diagnoses \ Diseases of the nervous system (g0...,DX,ICD-9-CM,ICD10:G30.9,290.11,0
22240,ICD10:G30.0,Alzheimer's disease with early onset,Diagnoses \ Diseases of the nervous system (g0...,DX,ICD-10-CM,ICD10:G30.0,290.11,1
22244,ICD10:G30.1,Alzheimer's disease with late onset,Diagnoses \ Diseases of the nervous system (g0...,DX,ICD-10-CM,ICD10:G30.1,290.11,1
22728,ICD10:G30.9,"Alzheimer's disease, unspecified",Diagnoses \ Diseases of the nervous system (g0...,DX,ICD-10-CM,ICD10:G30.9,290.11,1
22878,ICD10:G30.8,Other alzheimer's disease,Diagnoses \ Diseases of the nervous system (g0...,DX,ICD-10-CM,ICD10:G30.8,290.11,1
65001,LPA:1009,Alzheimers disease-LMR 1009,Diagnoses \ Diseases of the nervous system (g0...,DX,LMR Problem List,ICD10:G30,290.11,0
65813,LPA:867,Alzheimer's disease-LMR 867,Diagnoses \ Diseases of the nervous system (g0...,DX,LMR Problem List,ICD10:G30,290.11,0


In [11]:
# Show every dictionary entry whose concept code contains "ICD10:E13".
is_e13_code = dx_dict['concept_code'].str.contains("ICD10:E13")
dx_dict[is_e13_code]

Unnamed: 0,concept_code,concept_name,concept_path,concept_type,concept_ontology,ICD10_code,PheCode
19830,ICD10:E13.01,Other specified diabetes mellitus with hyperos...,"Diagnoses \ Endocrine, nutritional and metabol...",DX,ICD-10-CM,ICD10:E13.01,250.2
19835,ICD10:E13.311,Other specified diabetes mellitus with unspeci...,"Diagnoses \ Endocrine, nutritional and metabol...",DX,ICD-10-CM,ICD10:E13.311,250.7
19843,ICD10:E13.3299,Other specified diabetes mellitus with mild no...,"Diagnoses \ Endocrine, nutritional and metabol...",DX,ICD-10-CM,ICD10:E13.3299,250.7
19854,ICD10:E13.341,Other specified diabetes mellitus with severe ...,"Diagnoses \ Endocrine, nutritional and metabol...",DX,ICD-10-CM,ICD10:E13.341,250.7
19858,ICD10:E13.621,Other specified diabetes mellitus with foot ulcer,"Diagnoses \ Endocrine, nutritional and metabol...",DX,ICD-10-CM,ICD10:E13.621,250.2
19861,ICD10:E13.9,Other specified diabetes mellitus without comp...,"Diagnoses \ Endocrine, nutritional and metabol...",DX,ICD-10-CM,ICD10:E13.9,250.2
19934,ICD10:E13.22,Other specified diabetes mellitus with diabeti...,"Diagnoses \ Endocrine, nutritional and metabol...",DX,ICD-10-CM,ICD10:E13.22,250.22
19944,ICD10:E13.339,Other specified diabetes mellitus with moderat...,"Diagnoses \ Endocrine, nutritional and metabol...",DX,ICD-10-CM,ICD10:E13.339,250.7
19953,ICD10:E13.359,Other specified diabetes mellitus with prolife...,"Diagnoses \ Endocrine, nutritional and metabol...",DX,ICD-10-CM,ICD10:E13.359,250.7
19957,ICD10:E13.44,Other specified diabetes mellitus with diabeti...,"Diagnoses \ Endocrine, nutritional and metabol...",DX,ICD-10-CM,ICD10:E13.44,250.24


In [203]:
# Compiled patterns that flag ADRD concept codes (Alzheimer's disease and the
# dementia codes listed below). Each pattern is anchored at the start of the
# code and accepts the code with or without its vocabulary prefix
# ("ICD9:", "ICD10:", "LPA:").
#
# ICD-9 and LPA patterns end in `$` and therefore match whole codes only.
# The ICD-10 patterns written as `(\.|$)` / `(\.0|$)` / `(\.1|$)` are prefix
# matches: they accept the bare category or any code below that prefix.
# NOTE(review): the two G31 patterns both accept the bare category "G31";
# confirm that the unqualified category is meant to count as ADRD.
ADRD_REGEX = [
    # ICD-9-CM codes
    re.compile(r'^(ICD9:)?331\.0$'), # Alzheimer's disease
    # The fourth digit is one of 0, 2 or 3. A character class needs no `|`
    # separators; writing `[0|2|3]` would also accept a literal "|".
    re.compile(r'^(ICD9:)?290\.[023](\d+)?$'), # Senile dementia
    re.compile(r'^(ICD9:)?290\.1(\d+)?$'), # Presenile dementia
    re.compile(r'^(ICD9:)?290\.4(\d+)?$'), # Vascular dementia
    re.compile(r'^(ICD9:)?294\.1([0-1])?$'), # Dementia in conditions classified elsewhere
    re.compile(r'^(ICD9:)?294\.2([0-1])?$'), # Dementia, unspecified
    re.compile(r'^(ICD9:)?331\.1([0-9])?$'), # Frontotemporal dementia (e.g., Pick's disease)
    re.compile(r'^(ICD9:)?331\.82$'), # Dementia with Lewy bodies

    # ICD-10-CM codes
    re.compile(r'^(ICD10:)?G30(\.|$)'), # Alzheimer's disease
    re.compile(r'^(ICD10:)?F01(\.|$)'), # Vascular dementia
    re.compile(r'^(ICD10:)?F02(\.|$)'), # Dementia in other diseases classified elsewhere
    re.compile(r'^(ICD10:)?F03(\.|$)'), # Unspecified dementia
    re.compile(r'^(ICD10:)?G31(\.0|$)'), # Frontotemporal dementia (e.g., Pick's disease)
    re.compile(r'^(ICD10:)?G31(\.1|$)'), # Senile degeneration of brain, not elsewhere classified
    re.compile(r'^(ICD10:)?G31\.83$'), # Dementia with Lewy bodies

    # LPA codes (LMR Problem List)
    re.compile(r'^(LPA:)?1009$'), # Alzheimers disease-LMR 1009
    re.compile(r'^(LPA:)?867$'), # Alzheimer's disease-LMR 867
]

# Flag dictionary rows whose concept code is matched by ADRD_REGEX and store
# the flag as an integer column, then display the flagged rows.
# (match_any_regex is a project helper; presumably returns one boolean per code.)
adrd_hits = match_any_regex(dx_dict["concept_code"], ADRD_REGEX)
dx_dict["ADRD_DX"] = adrd_hits.astype(int)
dx_dict.loc[dx_dict['ADRD_DX'].eq(1)]

Unnamed: 0,concept_code,concept_name,concept_path,concept_type,concept_ontology,ICD10_code,PheCode,ADRD_DX
2559,ICD9:290.1,Presenile dementia,"Diagnoses \ Mental, behavioral and neurodevelo...",DX,ICD-9-CM,ICD10:F03,290.10,1
2568,ICD9:290.10,"Presenile dementia, uncomplicated","Diagnoses \ Mental, behavioral and neurodevelo...",DX,ICD-9-CM,ICD10:F03.90,290.10,1
2574,ICD9:290.0,"Senile dementia, uncomplicated","Diagnoses \ Mental, behavioral and neurodevelo...",DX,ICD-9-CM,ICD10:F03.90,290.13,1
2582,ICD9:290.11,Presenile dementia with delirium,"Diagnoses \ Mental, behavioral and neurodevelo...",DX,ICD-9-CM,ICD10:F03.90,290.10,1
2587,ICD9:290.13,Presenile dementia with depressive features,"Diagnoses \ Mental, behavioral and neurodevelo...",DX,ICD-9-CM,ICD10:F03.90,290.10,1
...,...,...,...,...,...,...,...,...
22732,ICD10:G31.01,Pick's disease,Diagnoses \ Diseases of the nervous system (g0...,DX,ICD-10-CM,ICD10:G31.01,290.12,1
22878,ICD10:G30.8,Other alzheimer's disease,Diagnoses \ Diseases of the nervous system (g0...,DX,ICD-10-CM,ICD10:G30.8,290.11,1
22880,ICD10:G31.83,Dementia with lewy bodies,Diagnoses \ Diseases of the nervous system (g0...,DX,ICD-10-CM,ICD10:G31.83,290.12,1
65001,LPA:1009,Alzheimers disease-LMR 1009,Diagnoses \ Diseases of the nervous system (g0...,DX,LMR Problem List,ICD10:G30,290.11,1


In [204]:
# Write the ADRD-flagged dictionary rows (code, name, ontology, PheCode) to CSV.
adrd_export_cols = ['concept_code', 'concept_name', 'concept_ontology', 'PheCode']
adrd_rows = dx_dict.loc[dx_dict['ADRD_DX'] == 1, adrd_export_cols]
adrd_rows.to_csv("../../phenotyping/adrd_codes.csv", index=False)

In [2]:
# Run options. parse_known_args() returns unrecognised command-line entries in
# `unkwn` instead of raising, so unrelated entries on sys.argv do not abort
# the cell.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-c', '--cohort',
    default='MDD', type=str, help='Cohort selection',
)
parser.add_argument(
    '-cd', '--cutoff_date',
    default="1900-01-01", type=str, help='Data cutoff date',
)
parser.add_argument(
    '-lw', '--lookback_window',
    default=2, type=int, help='Lookback window',
)
args, unkwn = parser.parse_known_args()

# Interactive overrides: these replace whatever was parsed above.
vars(args).update(
    cohort="CAD",
    cutoff_date="1900-01-01",
    lookback_window=1,
)
print(args)

Namespace(cohort='CAD', cutoff_date='1900-01-01', lookback_window=1)


## Create base data

In [3]:
# Columns kept from the diagnosis, medication and procedure event tables;
# the lab table additionally keeps its value columns.
event_cols = ['subject_num', 'concept_code', 'sstart_date']
lab_cols = event_cols + ['valueflag', 'valtype', 'nval']

# Locations of the raw tab-separated exports, looked up in base_data_dict.
DX_file, MED_file, LAB_file, PROC_file, DEMO_file = (
    base_data_dict[file_key]
    for file_key in (
        "DX_orig_file", "MED_orig_file", "LAB_orig_file",
        "PROC_orig_file", "DEMO_orig_file",
    )
)
# Directory that holds this cohort's base data.
BASE_data_path = os.path.join(DATA_DIR, args.cohort.upper(), 'base_data')

logger.info("Loading original data files")
df_dx = pd.read_csv(DX_file, usecols=event_cols, sep='\t')
df_med = pd.read_csv(MED_file, usecols=event_cols, sep='\t')
df_lab = pd.read_csv(LAB_file, usecols=lab_cols, sep='\t')
df_proc = pd.read_csv(PROC_file, usecols=event_cols, sep='\t')
# The demographics table is read with all of its columns.
df_demo = pd.read_csv(DEMO_file, sep='\t')

[32m2025-07-31 02:33:24.795[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mLoading original data files[0m


In [4]:
# Cap on how many case IDs and how many control IDs are kept for this run.
# Set to None to keep every patient (slicing with [:None] keeps the full list).
MAX_PIDS_PER_LABEL = 1000

print(f"Loading {args.cohort} case and control data")
# NOTE(review): the recorded run of this cell echoed the read_csv line below as
# the source of a warning (text not preserved). If it is a mixed-dtype warning,
# consider passing explicit dtypes.
df_cohort = pd.read_csv(os.path.join(PHENOTYPE_DIR, f"{args.cohort}_pids.csv"))
# NOTE(review): the cutoff passed here is fixed at 2017-01-01 and is independent
# of args.cutoff_date; confirm this is intended.
df_cohort, cohort_pids = common.filter_cohort(df_cohort, cutoff_date = '2017-01-01')

logger.info(f"Selecting only {args.cohort} patients")
# Unique patient IDs per label, truncated to the cap above. The cohort_pids
# value returned by filter_cohort is replaced by the capped selection.
case_pids = df_cohort[df_cohort['label']==1]['subject_num'].unique().tolist()[:MAX_PIDS_PER_LABEL]
control_pids = df_cohort[df_cohort['label']==0]['subject_num'].unique().tolist()[:MAX_PIDS_PER_LABEL]
cohort_pids = case_pids + control_pids

# Restrict every raw table to the selected patients.
df_dx = df_dx[df_dx['subject_num'].isin(cohort_pids)]
df_med = df_med[df_med['subject_num'].isin(cohort_pids)]
df_lab = df_lab[df_lab['subject_num'].isin(cohort_pids)]
df_proc = df_proc[df_proc['subject_num'].isin(cohort_pids)]
df_demo = df_demo[df_demo['subject_num'].isin(cohort_pids)]

print(len(cohort_pids), "patients in total")
print(len(case_pids), "cases")
print(len(control_pids), "controls")

Loading MDD case and control data


  df_cohort = pd.read_csv(os.path.join(PHENOTYPE_DIR, f"{args.cohort}_pids.csv"))
[32m2025-07-31 02:59:22.926[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mSelecting only MDD patients[0m


Controls (all retained): 2742646
Cases before filtering: 233924
Cases after filtering (>= 2017-01-01 00:00:00): 120492
Cases excluded: 113432
Total cohort size: 2863138
2000 patients in total
1000 cases
1000 controls


In [5]:
logger.info("Mapping codes")


def _read_concept_dict(dict_key):
    """Read the tab-separated concept dictionary registered under dict_key."""
    return pd.read_csv(base_data_dict[dict_key], sep="\t")


dx_dict = _read_concept_dict('DX_dict_file')
med_dict = _read_concept_dict('MED_dict_file')
lab_dict = _read_concept_dict('LAB_dict_file')
# proc_dict is loaded but is not passed to cpt4_mapping below.
proc_dict = _read_concept_dict('PROC_dict_file')

# Translate the concept codes of each event table with its project mapper.
df_dx = phecode_mapping(df_dx, dx_dict)
df_med = rxnorm_mapping(df_med, med_dict)
df_lab = loinc_mapping(df_lab, lab_dict)
df_proc = cpt4_mapping(df_proc)

[32m2025-07-31 03:00:26.037[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mMapping codes[0m


In [6]:
logger.info("Grouping datasets by PID")


def _group_by_pid(frame, data_type):
    """Run common.groupby_pid on one event table with the shared cohort and cutoff date."""
    return common.groupby_pid(frame, df_cohort, data_type, args.cutoff_date)


# Each table is replaced by its grouped version, one table at a time.
df_dx = _group_by_pid(df_dx, "DX")
df_med = _group_by_pid(df_med, "MED")
df_lab = _group_by_pid(df_lab, "LAB")
df_proc = _group_by_pid(df_proc, "PROC")

[32m2025-07-31 03:00:30.666[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mGrouping datasets by PID[0m


In [7]:
logger.info("Grouping concepts by dates")

# Apply the same project helper to every modality; each table is rebound to
# the helper's result before the next one is processed.
_regroup = common.group_concepts
df_dx = _regroup(df_dx)
df_med = _regroup(df_med)
df_lab = _regroup(df_lab)
df_proc = _regroup(df_proc)

[32m2025-07-31 03:00:40.658[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mGrouping concepts by dates[0m


In [8]:
# Writing the balanced ID list to disk is switched off by default. Flip the flag
# to persist it; the message printed at the end reports what actually happened.
SAVE_BALANCED_PIDS = False

logger.info("Balancing case and control numbers")
# Patient IDs listed in the MGBB file are handed to the balancing helper.
mgbb_pids = np.loadtxt(base_data_dict['MGBB_pids_file'])
df_dx, df_med, df_lab, df_proc = common.balance_case_control_data(
    df_dx, df_med, df_lab, df_proc, mgbb_pids, random_state=random_state
)

# The balanced cohort is defined by the patients remaining in the DX table.
cohort_pids = df_dx.subject_num.unique().tolist()
output_file = f"{args.cohort.upper()}_balanced_pids.txt"
output_path = os.path.join(BASE_data_path, output_file)
if SAVE_BALANCED_PIDS:
    np.savetxt(output_path, cohort_pids)
    print(f"Saved balanced patient IDs to {output_path}")
else:
    print(f"Balanced patient IDs NOT saved (target would be {output_path})")

[32m2025-07-31 03:00:43.521[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mBalancing case and control numbers[0m


1411 patients after excluding MGBB participants
Found 839 cases and 572 controls
Insufficient controls (572) for 839 cases. 
Balanced cohort contains 1411 patients
1411 patients in DX after sampling
1275 patients in MED after sampling
1100 patients in LAB after sampling
1391 patients in PROC after sampling
Saved balanced patient IDs to /home/bw720/nde_traj/data/MDD/base_data/MDD_balanced_pids.txt


In [None]:
logger.info("Saving dataframes of different data types in pickle format")
# One pickle per modality, all written into BASE_data_path.
for modality_frame, pickle_name in (
    (df_dx, "RPDRml__DX_base_v2024.1.pkl"),
    (df_med, "RPDRml__MED_base_v2024.1.pkl"),
    (df_lab, "RPDRml__LAB_base_v2024.1.pkl"),
    (df_proc, "RPDRml__PROC_base_v2024.1.pkl"),
):
    save_pickle(modality_frame, os.path.join(BASE_data_path, pickle_name))
print("Saved")

In [9]:
# Join type used for every merge in this cell.
join_how = "outer"

# DX + MED: overlapping column names receive the _dx / _med suffixes.
df1 = df_dx.drop(columns=['concept_codes']).merge(
    df_med.drop(columns=['concept_codes']),
    how=join_how, on='subject_num', suffixes=('_dx', '_med'),
)
# Prefer the DX-side value and fall back to the MED-side value where it is missing.
df1['index_date'] = df1['index_date_dx'].fillna(df1['index_date_med'])
df1['label'] = df1['label_dx'].fillna(df1['label_med'])

# LAB + PROC: label and index_date are dropped from both sides before the join,
# so they are taken from the DX/MED side only.
lab_proc_unused = ['concept_codes', 'label', 'index_date']
df2 = df_lab.drop(columns=lab_proc_unused).merge(
    df_proc.drop(columns=lab_proc_unused),
    how=join_how, on='subject_num', suffixes=('_lab', '_proc'),
)

# All four modalities side by side, keyed on subject_num.
df = df1.merge(df2, how=join_how, on='subject_num')

In [10]:
def process_row(row):
    """Build per-modality and combined code timelines for one merged patient row.

    For each modality (dx, med, proc, lab) the row's `start_dates_*` and
    `concepts_by_date_*` values are passed through
    common.deduplicate_dates_with_codes, and the resulting date and code
    sequences are checked to have equal length. The four results are then
    concatenated (order: dx, lab, med, proc) and handed to combine_lists.

    Returns a pd.Series holding the per-modality dates and codes plus
    `all_dates` (first element of every combined entry) and `all_codes`
    (second element of every combined entry, flattened by one level).
    """
    dedupe = common.deduplicate_dates_with_codes

    phe_dates, phe_codes = dedupe(row['start_dates_dx'], row['concepts_by_date_dx'])
    rxnorm_dates, rxnorm_codes = dedupe(row['start_dates_med'], row['concepts_by_date_med'])
    cpt4_dates, cpt4_codes = dedupe(row['start_dates_proc'], row['concepts_by_date_proc'])
    loinc_dates, loinc_codes = dedupe(row['start_dates_lab'], row['concepts_by_date_lab'])

    # Every modality must come back with one code entry per date.
    for modality_dates, modality_codes in (
        (phe_dates, phe_codes),
        (rxnorm_dates, rxnorm_codes),
        (cpt4_dates, cpt4_codes),
        (loinc_dates, loinc_codes),
    ):
        assert len(modality_codes) == len(modality_dates)

    # Concatenate the modalities and merge them into one timeline.
    merged_dates = phe_dates + loinc_dates + rxnorm_dates + cpt4_dates
    merged_codes = phe_codes + loinc_codes + rxnorm_codes + cpt4_codes
    combined = combine_lists(merged_dates, merged_codes)

    timeline_dates = [entry[0] for entry in combined]
    timeline_codes = []
    for entry in combined:
        # entry[1] is a collection of code groups; flatten it by one level.
        flattened = []
        for code_group in entry[1]:
            for code in code_group:
                flattened.append(code)
        timeline_codes.append(flattened)

    result = {}
    result['dx_codes'] = phe_codes
    result['dx_dates'] = phe_dates
    result['med_codes'] = rxnorm_codes
    result['med_dates'] = rxnorm_dates
    result['proc_codes'] = cpt4_codes
    result['proc_dates'] = cpt4_dates
    result['lab_codes'] = loinc_codes
    result['lab_dates'] = loinc_dates
    result['all_dates'] = timeline_dates
    result['all_codes'] = timeline_codes
    return pd.Series(result)

# Expand every patient row into its timeline columns, then keep only the
# identifying columns of the merged frame next to them.
df_temp = df.apply(process_row, axis=1)
patient_key_cols = ['subject_num', 'index_date', 'label']
df = pd.concat([df.loc[:, patient_key_cols], df_temp], axis=1)

In [11]:
# Attach the selected demographic columns to each patient; the left join keeps
# every row of the timeline frame.
demo_cols = [
    'subject_num',
    'gender', 'sbirth_date', 'race', 'marital_status', 'ethnicity',
    'currentzip_medianincome_2010', 'public_payer',
    'visit_count', 'notes_ct',
    'icd_first_sdate', 'cpt_first_sdate',
    'biobank_genotyped',
]
df_final = pd.merge(df, df_demo[demo_cols], on='subject_num', how='left')

# Convert the date-like columns to datetime values.
for date_col in ('index_date', 'sbirth_date', 'icd_first_sdate', 'cpt_first_sdate'):
    df_final[date_col] = pd.to_datetime(df_final[date_col])

df_final = df_final.rename(columns={'sbirth_date': 'birth_date'})

## Data sampling

In [3]:
# Cohort directory and its base_data sub-directory.
DATA_path = os.path.join(DATA_DIR, args.cohort.upper())
BASE_data_path = os.path.join(DATA_path, 'base_data')

# Load the combined base frame and report how many rows carry each label.
df = load_pickle(os.path.join(BASE_data_path, "RPDRml__ALL_base_v2024.2.pkl"))
n_base_cases = int((df['label'] == 1).sum())
n_base_controls = int((df['label'] == 0).sum())
print(f"Number of {args.cohort} cases = {n_base_cases}")
print(f"Number of {args.cohort} controls = {n_base_controls}")

Number of CAD cases = 199151
Number of CAD controls = 2640404


In [4]:
logger.info(f"Sampling patient EHR history with {args.lookback_window} year(s) lookback window")
df_new = common.ehr_sampling(
    df,
    random_state=random_state,
    lookback_window=args.lookback_window,
)

# Split the sampled frame by label and report the sizes.
df_cas = df_new.loc[df_new['label'].eq(1)]
df_ctl = df_new.loc[df_new['label'].eq(0)]
print(f"\n{len(df_new)} after data sampling")
print(f"    Number of {args.cohort} cases = {len(df_cas)}")
print(f"    Number of {args.cohort} controls = {len(df_ctl)}")

logger.info("Filtering patients by age")
def diff_funct(row):
    """Return the whole days between the first and last entries of row['all_dates']."""
    timeline = row['all_dates']
    earliest, latest = timeline[0], timeline[-1]
    return (latest - earliest).days
# Days between each patient's first and last sampled record.
df_new['ehr_duration'] = df_new.apply(diff_funct, axis=1)

# Demographic columns copied over from the base frame
# ('ehr_duration', 'icd_first_sdate' and 'cpt_first_sdate' are not merged).
merge_cols = [
    'subject_num',
    'gender', 'race', 'marital_status', 'ethnicity',
    'currentzip_medianincome_2010', 'public_payer',
]
df_new = df_new.merge(df[merge_cols], on='subject_num')

# Age is derived from the prediction date and the birth date.
df_new['age'] = df_new.apply(
    lambda patient: common.calc_date_diff(patient['pred_date'], patient['birth_date']),
    axis=1,
)

# Require a minimum age of 18 for all patients.
is_adult = df_new['age'] >= 18
df_new = df_new[is_adult].reset_index(drop=True)

# Drop patients whose dx_codes list is empty.
has_dx_codes = df_new['dx_codes'].str.len() > 0
df_new = df_new[has_dx_codes].reset_index(drop=True)

df_cas = df_new.loc[df_new['label'].eq(1)]
df_ctl = df_new.loc[df_new['label'].eq(0)]
print(f"\n{len(df_new)} after data filtering")
print(f"    Number of {args.cohort} cases = {len(df_cas)}")
print(f"    Number of {args.cohort} controls = {len(df_ctl)}")

[32m2025-10-06 10:15:01.996[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mSampling patient EHR history with 2 year(s) lookback window[0m



2464226 after data sampling


[32m2025-10-06 10:44:56.409[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mFiltering patients by age[0m


    Number of MDD cases = 176171
    Number of MDD controls = 2288055

2100297 after data filtering
    Number of MDD cases = 151630
    Number of MDD controls = 1948667


In [15]:
# Age-bin edges handed to the matcher.
# NOTE(review): patients younger than 18 are filtered out earlier in this
# notebook, so the first bin can only receive boundary ages; how the edges are
# closed depends on match_by_demographics — confirm.
custom_bins = [0, 18, 25, 40, 60, 75, float('inf')]

# Match cases to controls on the listed demographic columns.
matched_df = match_by_demographics(
    df_new,
    label_col='label',
    date_col='pred_date',
    demographic_cols={
        'age': 'age',
        'gender': 'gender',
        'race': 'race',
        'ethnicity': 'ethnicity'
    },
    age_bins=custom_bins,
    matching_ratio=1,  # 1:1 matching
    # Use the project-wide seed from config, as the other stochastic steps in
    # this notebook do, instead of a separate hard-coded value.
    random_state=random_state
)


DEMOGRAPHIC MATCHING SUMMARY
Matching on: year + age, gender, race, ethnicity

Original dataset:
  Cases: 151,630
  Controls: 1,948,667

Matching results:
  Matched cases: 129,119
  Matched controls: 129,119
  Unmatched cases: 22,511

Matching performance:
  Achieved ratio: 1:1.00
  Match rate: 85.2% of cases matched

Stratification details:
  Total unique strata: 2,086
  Strata with matches: 1,381
  Strata with no controls: 327
  Strata with insufficient controls: 167

Top 5 strata by unmatched cases:
  2003_40-60_F_White_Non-Hispanic: 880 unmatched (13 controls available)
  2004_40-60_F_White_Non-Hispanic: 859 unmatched (37 controls available)
  2002_40-60_F_White_Non-Hispanic: 819 unmatched (15 controls available)
  2001_40-60_F_White_Non-Hispanic: 712 unmatched (9 controls available)
  2000_40-60_F_White_Non-Hispanic: 685 unmatched (9 controls available)



In [35]:
# Matched rows per landmark year, one column per label.
year_label_counts = matched_df.groupby(['landmark_year', 'label']).size().unstack(fill_value=0)
print(year_label_counts)

label            0.0    1.0
landmark_year              
1999              18     18
2000              39     39
2001              33     33
2002              55     55
2003              76     76
2004             376    376
2005            4638   4638
2006            4762   4762
2007            4943   4943
2008            5602   5602
2009            6276   6276
2010            6350   6350
2011            7384   7384
2012            7259   7259
2013            7325   7325
2014            7713   7713
2015            8415   8415
2016           10074  10074
2017           10466  10466
2018           10913  10913
2019            8453   8453
2020            7394   7394
2021            5696   5696
2022            3831   3831
2023            1028   1028


In [32]:
# Matched rows per gender, one column per label.
gender_label_counts = matched_df.groupby(['gender', 'label']).size().unstack(fill_value=0)
print(gender_label_counts)

label     0.0    1.0
gender              
F       86262  86262
M       42855  42855
U           1      1
X           1      1


In [36]:
# Matched rows per race, one column per label.
race_label_counts = matched_df.groupby(['race', 'label']).size().unstack(fill_value=0)
print(race_label_counts)

label       0.0     1.0
race                   
Asian      3262    3262
Black      8420    8420
Other     14784   14784
Unknown     292     292
White    102361  102361


In [34]:
# Matched rows per ethnicity, one column per label.
ethnicity_label_counts = matched_df.groupby(['ethnicity', 'label']).size().unstack(fill_value=0)
print(ethnicity_label_counts)

label            0.0     1.0
ethnicity                   
Hispanic        5815    5815
Non-Hispanic  123304  123304


In [38]:
# Matched rows per age bin, one column per label.
# The recorded run emitted a warning on this line (text not preserved). age_bin
# is displayed as intervals, so it is presumably categorical; observed=False is
# passed explicitly to keep the current output (all bins listed) rather than
# rely on the groupby default for categorical keys.
print(matched_df.groupby(['age_bin', 'label'], observed=False).size().unstack(fill_value=0))

label           0.0    1.0
age_bin                   
(0.0, 18.0]    1214   1500
(18.0, 25.0]  13506  12937
(25.0, 40.0]  30535  30579
(40.0, 60.0]  45134  45687
(60.0, 75.0]  25421  25487
(75.0, inf]   13309  12929


  print(matched_df.groupby(['age_bin', 'label']).size().unstack(fill_value=0))


In [39]:
# Display the first matched patient record. The rendered output includes
# subject_num and birth_date, so review it before sharing the notebook.
first_matched_row = matched_df.iloc[0]
first_matched_row

subject_num                                                            11822411.0
pred_date                                                     2011-01-27 00:00:00
index_date                                                    2011-04-28 00:00:00
birth_date                                                    1932-03-27 00:00:00
all_dates                       [2009-05-17 00:00:00, 2009-05-29 00:00:00, 200...
all_codes                       [[PheCode:401.1, PheCode:272.11, PheCode:401.1...
dx_dates                        [2009-05-17 00:00:00, 2009-07-10 00:00:00, 200...
dx_codes                        [[PheCode:401.1, PheCode:272.11, PheCode:401.1...
med_dates                       [2009-05-29 00:00:00, 2009-05-30 00:00:00, 200...
med_codes                       [[RXNORM:83367], [RXNORM:17767], [RXNORM:46041...
lab_dates                       [2009-05-17 00:00:00, 2009-09-06 00:00:00, 201...
lab_codes                       [[LOINC:13457-7|N, LOINC:13457-7|N, LOINC:1920...
proc_dates      

In [40]:
# Number of matched rows per label value.
matched_label_counts = matched_df['label'].value_counts()
print(matched_label_counts)

label
1.0    129119
0.0    129119
Name: count, dtype: int64
