In [1]:
import pandas as pd
import glob
import os

folder_path = "doi_10_5061_dryad_jq2bvq8kp__v20250411"

all_files = glob.glob(os.path.join(folder_path, "*.csv"))
all_df_files = {}
all_features = []
for file in all_files:
    df = pd.read_csv(file, nrows = 5)
    label = os.path.splitext(os.path.basename(file))[0]
    all_df_files[label] = df
    all_features.extend(df.columns.tolist())

unique_features = set(all_features)

print(f'features = {unique_features}')
print(f'No of features = {len(unique_features)}')

for file, df in all_df_files.items():
    print(f'file: {file}')
    print(df.dtypes)
    

features = {'first_temp', 'last_wbc', 'first_hgb', 'last_sysbp', 'Q25_lactate', 'first_resprate', 'last_procalcitonin', 'medication_category', 'culture_description', 'Q75_wbc', 'last_plt', 'last_heartrate', 'Q25_heartrate', 'last_diasbp', 'Antibiotic', 'median_heartrate', 'median_plt', 'median_bun', 'median_diasbp', 'median_na', 'median_cr', 'Q25_na', 'resistant_time_to_culturetime', 'last_neutrophils', 'Q25_sysbp', 'Organism', 'median_hgb', 'median_resprate', 'medication_time_to_culturetime', 'median_sysbp', 'Q75_neutrophils', 'last_bun', 'median_neutrophils', 'antibiotic_subtype_category', 'gender', 'procedure_time_to_culturetime', 'last_resprate', 'last_lactate', 'Q75_hgb', 'Q75_heartrate', 'Q75_cr', 'first_bun', 'median_temp', 'Q75_procalcitonin', 'procedure_description', 'Q25_bun', 'time_to_culturetime', 'Q25_resprate', 'organism', 'adi_state_rank', 'prior_organism', 'Q75_na', 'first_cr', 'first_hco3', 'prior_infecting_organism_days_to_culutre', 'Q75_resprate', 'median_hco3', 'Q75

In [1]:
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

files_info = [
    {
        "name": "cultures_cohort",
        "path": "microbiology_cultures_cohort.csv",
        "merge_on": linked_features, 
        "dtype": {
            "ordering_mode": "object",
            "culture_description": "object",
            "was_positive": "Int32",
            "organism": "object",
            "antibiotic": "object",
            "susceptibility": "object"
        }
    },
    {
        "name": "ward_info",
        "path": "microbiology_cultures_ward_info.csv",
        "merge_on": linked_features, 
        "dtype": {
            "hosp_ward_IP": "Int32",
            "hosp_ward_OP": "Int32",
            "hosp_ward_ER": "Int32",
            "hosp_ward_ICU": "Int32"
        }
    },
    {
        "name": "prior_med",
        "path": "microbiology_cultures_prior_med.csv",
        "merge_on": linked_features,
        "dtype": {
            "medication_name": "object",
            "medication_time_to_culturetime": "Int32",
            "medication_category": "object"
        }
    },
    {
        "name": "microbial_resistance",
        "path": "microbiology_cultures_microbial_resistance.csv",
        "merge_on": linked_features, 
        "dtype": {
            "organism": "object",
            "antibiotic": "object",
            "resistant_time_to_culturetime": "Int32"
        }
    },
    {
        "name": "cultures_demographics",
        "path": "microbiology_cultures_demographics.csv",
        "merge_on": linked_features[:3], 
        "dtype": {
            "age": "object",
            "gender": "object"
        }
    },
     {
        "name": "cultures_labs",
        "path": "microbiology_cultures_labs.csv",
        "merge_on": linked_features[:3], 
        "dtype": {
            "Period_Day": "Int32",
            "Q75_wbc": "object",
            "Q25_wbc": "object",
            "median_wbc": "object",
            "Q25_neutrophils": "float32",
            "Q75_neutrophils": "float32",
            "median_neutrophils": "float32",
            "Q25_lymphocytes": "float32",
            "Q75_lymphocytes": "float32",
            "median_lymphocytes": "float32",
            "Q25_hgb": "object",
            "Q75_hgb": "object",
            "median_hgb": "object",
            "Q25_plt": "object",
            "Q75_plt": "object",
            "median_plt": "object",
            "Q75_na": "object",
            "Q25_na": "object",
            "median_na": "object",
            "Q75_hco3": "object",
            "Q25_hco3": "object",
            "median_hco3": "object",
            "Q75_bun": "object",
            "Q25_bun": "object",
            "median_bun": "object",
            "Q75_cr": "object",
            "Q25_cr": "object",
            "median_cr": "object",
            "Q75_lactate": "object",
            "Q25_lactate": "object",
            "median_lactate": "object",
            "Q75_procalcitonin": "object",
            "Q25_procalcitonin": "object",
            "median_procalcitonin": "object",
            "first_procalcitonin": "object",
            "last_procalcitonin": "object",
            "first_lactate":"object",
            "last_cr":"object",
            "first_cr":"object",
            "last_bun":"object",
            "first_bun":"object",
            "last_hco3":"object",
            "first_hco3":"object",
            "last_na":"object",
            "first_na":"object",
            "last_plt":"object",
            "first_plt":"object",
            "last_hgb":"object",
            "first_hgb":"object",
            "last_lymphocytes":"object",
            "first_lymphocytes":"object",
            "last_neutrophils":"object",
            "first_neutrophils":"object",
            "last_wbc":"object",
            "first_wbc":"object" 
        }
    },
    {
        "name": "cultures_vitals",
        "path": "microbiology_cultures_vitals.csv",
        "merge_on": linked_features[:3],
        "dtype": {
            "Q25_heartrate": "object",
            "Q75_heartrate": "object",
            "median_heartrate": "object",
            "Q25_resprate": "object",
            "Q75_resprate": "object",
            "median_resprate": "object",
            "Q25_temp": "object",
            "Q75_temp": "object",
            "median_temp": "object",
            "Q25_sysbp": "float32",
            "Q75_sysbp": "float32",
            "median_sysbp": "float32",
            "Q25_diasbp": "float32",
            "Q75_diasbp": "float32",
            "median_diasbp": "float32",
            "first_diasbp": "object",
            "last_diasbp": "object",
            "last_sysbp": "object",
            "first_sysbp": "object",
            "last_temp": "object",
            "first_temp": "object",
            "last_resprate": "object",
            "first_resprate": "object",
            "last_heartrate": "object",
            "first_heartrate": "object"
        }
    },
     {
        "name": "antibiotic_class_exposure",
        "path": "microbiology_cultures_antibiotic_class_exposure.csv",
        "merge_on": linked_features, 
        "dtype": {
            "medication_category": "object",
            "medication_name": "object",
            "antibiotic_class": "object",
            "time_to_culturetime": "Int32"
        }
    },
    {
        "name": "antibiotic_subtype_exposure",
        "path": "microbiology_cultures_antibiotic_subtype_exposure.csv",
        "merge_on": linked_features, 
        "dtype": {
            "medication_category": "object",
            "medication_name": "object",
            "antibiotic_subtype": "object",
            "antibiotic_subtype_category": "object",
            "medication_time_to_cultureTime": "Int32"
        }
    },
    {
        "name": "prior_infecting_organism",
        "path": "microbiology_culture_prior_infecting_organism.csv",
        "merge_on": linked_features, 
        "dtype": {
            "prior_organism": "object",
            "prior_infecting_organism_days_to_culutre": "Int32"
        }
    },
    {
        "name": "cultures_comorbidity",
        "path": "microbiology_cultures_comorbidity.csv",
        "merge_on": linked_features, 
        "dtype": {
            "comorbidity_component": "object",
            "comorbidity_component_start_days_culture": "Int32",
            "comorbidity_component_end_days_culture": "float32"
        }
    },
    {
        "name": "cultures_priorprocedures.csv",
        "path": "microbiology_cultures_priorprocedures.csv",
        "merge_on": linked_features, 
        "dtype": {
            "procedure_description": "object",
            "procedure_time_to_culturetime": "Int32"
        }
    },
    {
        "name": "adi_scores",
        "path": "microbiology_cultures_adi_scores.csv",
        "merge_on": linked_features, 
        "dtype": {
            "adi_score": "object",
            "adi_state_rank": "object"
        }
    },
    {
        "name": "nursing_home_visits",
        "path": "microbiology_cultures_adi_scores.csv",
        "merge_on": linked_features, 
        "dtype": {
            "nursing_home_visit_culture": "Int32"
        }
    },
    {
        "name": "implied_susceptibility",
        "path": "microbiology_cultures_implied_susceptibility.csv",
        "merge_on": linked_features[:3], 
        "dtype": {
            "organism": "object",
            "antibiotic": "object",
            "susceptibility": "object",
            "implied_susceptibility": "object"
        }
    }
    #, {
    #     "name": "implied_susceptibility_rules",
    #     "path": "microbiology_cultures_implied_susceptibility.csv",
    #     "merge_on": None , 
    #     "dtype": {
    #         "Organism": "object",
    #         "Antibiotic": "object",
    #         "Susceptibility": "object",
    #     }
    # }
]


In [21]:
import glob
import os
input_folder = 'doi_10_5061_dryad_jq2bvq8kp__v20250411/Processed_ARMD_Dataset/'
output_file = 'output/merged_sample_data.csv'

linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

csv_files = glob.glob(input_folder + "*.csv")

for file in csv_files:
    print(os.path.basename(file))

sample_microbiology_cultures_adi_scores.csv
sample_microbiology_cultures_cohort.csv
sample_microbiology_cultures_comorbidity.csv
sample_microbiology_cultures_demographics.csv
sample_microbiology_cultures_implied_susceptibility.csv
sample_microbiology_cultures_labs.csv
sample_microbiology_cultures_microbial_resistance.csv
sample_microbiology_cultures_nursing_home_visits.csv
sample_microbiology_cultures_priorprocedures.csv
sample_microbiology_cultures_prior_med.csv
sample_microbiology_cultures_vitals.csv


In [54]:
import pandas as pd
output_file = 'output/'
df = pd.read_parquet(output_file)
print(df.shape)
print(df.dtypes)

(2902702, 87)
anon_id                           string[python]
pat_enc_csn_id_coded                       int64
order_proc_id_coded                        int64
order_time_jittered_utc           string[python]
ordering_mode_x                   string[python]
                                       ...      
resistant_time_to_culturetime            float64
nursing_home_visit_culture               float64
medication_name                   string[python]
medication_time_to_culturetime           float64
medication_category               string[python]
Length: 87, dtype: object


# To make sure the size of sample after each join

In [69]:
import dask.dataframe as dd

output_file = 'output/'
df = dd.read_parquet(output_file)
print("Available columns:", df.columns.tolist())

Available columns: ['pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc', 'ordering_mode_x', 'culture_description_x', 'was_positive_x', 'organism_x', 'antibiotic_x', 'susceptibility_x', 'adi_score', 'adi_state_rank', 'ordering_mode_y', 'culture_description_y', 'was_positive_y', 'organism_y', 'antibiotic_y', 'susceptibility_y', 'age', 'gender', 'organism_left', 'antibiotic_left', 'susceptibility', 'implied_susceptibility', 'Period_Day', 'Q75_wbc', 'Q25_wbc', 'median_wbc', 'Q25_neutrophils', 'Q75_neutrophils', 'median_neutrophils', 'Q25_lymphocytes', 'Q75_lymphocytes', 'median_lymphocytes', 'Q25_hgb', 'Q75_hgb', 'median_hgb', 'Q25_plt', 'Q75_plt', 'median_plt', 'Q75_na', 'Q25_na', 'median_na', 'Q75_hco3', 'Q25_hco3', 'median_hco3', 'Q75_bun', 'Q25_bun', 'median_bun', 'Q75_cr', 'Q25_cr', 'median_cr', 'Q75_lactate', 'Q25_lactate', 'median_lactate', 'Q75_procalcitonin', 'Q25_procalcitonin', 'median_procalcitonin', 'first_procalcitonin', 'last_procalcitonin', 'last_lactat

In [29]:
import glob
import os
sample_folder = 'new_sample_one/'
sample_files = glob.glob(sample_folder + "*.parquet")
for file in sample_files:
    print(os.path.basename(file))

adi_scores.parquet
antibiotic_class_exposure.parquet
antibiotic_subtype_exposure.parquet
cultures_cohort.parquet
cultures_comorbidity.parquet
cultures_demographics.parquet
cultures_labs.parquet
cultures_priorprocedures.csv.parquet
cultures_vitals.parquet
implied_susceptibility.parquet
microbial_resistance.parquet
nursing_home_visits.parquet
prior_infecting_organism.parquet
prior_med.parquet
ward_info.parquet


In [3]:

import pandas as pd

sample_folder = 'new_sample_one/'
df = pd.read_parquet(sample_folder + "cultures_cohort.parquet")
print(df.shape)

(2662, 10)


In [5]:
df.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,order_time_jittered_utc,ordering_mode,culture_description,was_positive,organism,antibiotic,susceptibility
0,JC1873593,131236477728,531304610,2017-07-09 21:06:00+00:00,Outpatient,URINE,1,KLEBSIELLA PNEUMONIAE,Nitrofurantoin,Intermediate
1,JC604528,131126526250,476505092,2015-10-17 23:27:00+00:00,Outpatient,RESPIRATORY,1,MUCOID PSEUDOMONAS AERUGINOSA,Cefepime,Susceptible
2,JC902494,131278170812,637729583,2019-10-26 23:09:00+00:00,Outpatient,RESPIRATORY,1,MUCOID PSEUDOMONAS AERUGINOSA,Doripenem,Susceptible
3,JC902494,131316125170,745830011,2021-08-20 21:52:00+00:00,Outpatient,RESPIRATORY,1,ACHROMOBACTER XYLOSOXIDANS,Aztreonam,Resistant
4,JC902494,131242685986,546152566,2017-12-09 21:51:00+00:00,Outpatient,RESPIRATORY,1,ACHROMOBACTER XYLOSOXIDANS,Tobramycin,Resistant


In [31]:
import dask.dataframe as dd
import glob
import os 

merged_folder = 'merged_sample_one/'
merged_sample_file = 'merged_ARMD.parquet'

df = dd.read_parquet(merged_folder + merged_sample_file)
unique_patient_ids = df['anon_id'].unique()
unique_patient_ids = unique_patient_ids.compute()
print("# of patients:", len(unique_patient_ids))


# of patients: 100


In [33]:
df.compute()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,order_time_jittered_utc,ordering_mode,culture_description,was_positive,organism,antibiotic,susceptibility,hosp_ward_IP,hosp_ward_OP,hosp_ward_ER,hosp_ward_ICU,medication_name,medication_time_to_culturetime,medication_category,resistant_time_to_culturetime,age,gender,Period_Day,Q75_wbc,Q25_wbc,median_wbc,Q25_neutrophils,Q75_neutrophils,median_neutrophils,Q25_lymphocytes,Q75_lymphocytes,median_lymphocytes,Q25_hgb,Q75_hgb,median_hgb,Q25_plt,Q75_plt,median_plt,Q75_na,Q25_na,median_na,Q75_hco3,Q25_hco3,median_hco3,Q75_bun,Q25_bun,median_bun,Q75_cr,Q25_cr,median_cr,Q75_lactate,Q25_lactate,median_lactate,Q75_procalcitonin,Q25_procalcitonin,median_procalcitonin,first_procalcitonin,last_procalcitonin,last_lactate,first_lactate,last_cr,first_cr,last_bun,first_bun,last_hco3,first_hco3,last_na,first_na,last_plt,first_plt,last_hgb,first_hgb,last_lymphocytes,first_lymphocytes,last_neutrophils,first_neutrophils,last_wbc,first_wbc,Q25_heartrate,Q75_heartrate,median_heartrate,Q25_resprate,Q75_resprate,median_resprate,Q25_temp,Q75_temp,median_temp,Q25_sysbp,Q75_sysbp,median_sysbp,Q25_diasbp,Q75_diasbp,median_diasbp,first_diasbp,last_diasbp,last_sysbp,first_sysbp,last_temp,first_temp,last_resprate,first_resprate,last_heartrate,first_heartrate,antibiotic_class,time_to_culturetime
0,JC1873593,131236477728,531304610,2017-07-09 21:06:00+00:00,Outpatient,URINE,1,KLEBSIELLA PNEUMONIAE,Nitrofurantoin,Intermediate,0,1,0,0,Cefazolin,75,CEF,,65-74 years,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Beta Lactam,75
1,JC1873593,131236477728,531304610,2017-07-09 21:06:00+00:00,Outpatient,URINE,1,KLEBSIELLA PNEUMONIAE,Nitrofurantoin,Intermediate,0,1,0,0,Cefazolin,75,CEF,,65-74 years,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Beta Lactam,75
2,JC1873593,131236477728,531304610,2017-07-09 21:06:00+00:00,Outpatient,URINE,1,KLEBSIELLA PNEUMONIAE,Nitrofurantoin,Intermediate,0,1,0,0,Cefazolin,75,CEF,,65-74 years,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Beta Lactam,97
3,JC1873593,131236477728,531304610,2017-07-09 21:06:00+00:00,Outpatient,URINE,1,KLEBSIELLA PNEUMONIAE,Nitrofurantoin,Intermediate,0,1,0,0,Cefazolin,75,CEF,,65-74 years,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Fluoroquinolone,74
4,JC1873593,131236477728,531304610,2017-07-09 21:06:00+00:00,Outpatient,URINE,1,KLEBSIELLA PNEUMONIAE,Nitrofurantoin,Intermediate,0,1,0,0,Cefazolin,97,CEF,,65-74 years,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Beta Lactam,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17347583,JC2501862,131291199517,671962833,2020-06-25 01:43:00+00:00,Inpatient,URINE,1,ESCHERICHIA COLI,Trimethoprim/Sulfamethoxazole,Resistant,1,0,1,0,,,,26,18-24 years,0,14,18.7,18.7,18.7,,,,,,,11800.0,11800.0,11800.0,236.0,236.0,236.0,135.0,135.0,135.0,23.0,23.0,23.0,4.0,4.0,4.0,0.63,0.63,0.63,,,,,,,,,,,,0.63,,,,,,,,,,,,,,,,,112.0,115.0,114.0,18.0,26.0,21.0,99.5,99.5,99.5,109.0,117.0,111.0,61.0,70.0,67.0,,,,,,,26.0,,,130.0,Beta Lactam,26
17347584,JC2501862,131291199517,671962833,2020-06-25 01:43:00+00:00,Inpatient,URINE,1,ESCHERICHIA COLI,Trimethoprim/Sulfamethoxazole,Resistant,1,0,1,0,,,,26,18-24 years,0,14,18.7,18.7,18.7,,,,,,,11800.0,11800.0,11800.0,236.0,236.0,236.0,135.0,135.0,135.0,23.0,23.0,23.0,4.0,4.0,4.0,0.63,0.63,0.63,,,,,,,,,,,,0.63,,,,,,,,,,,,,,,,,112.0,115.0,114.0,18.0,26.0,21.0,99.5,99.5,99.5,109.0,117.0,111.0,61.0,70.0,67.0,,70.0,109.0,,,,,,,130.0,Nitrofuran,26
17347585,JC2501862,131291199517,671962833,2020-06-25 01:43:00+00:00,Inpatient,URINE,1,ESCHERICHIA COLI,Trimethoprim/Sulfamethoxazole,Resistant,1,0,1,0,,,,26,18-24 years,0,14,18.7,18.7,18.7,,,,,,,11800.0,11800.0,11800.0,236.0,236.0,236.0,135.0,135.0,135.0,23.0,23.0,23.0,4.0,4.0,4.0,0.63,0.63,0.63,,,,,,,,,,,,0.63,,,,,,,,,,,,,,,,,112.0,115.0,114.0,18.0,26.0,21.0,99.5,99.5,99.5,109.0,117.0,111.0,61.0,70.0,67.0,,70.0,109.0,,,,,,,130.0,Beta Lactam,26
17347586,JC2501862,131291199517,671962833,2020-06-25 01:43:00+00:00,Inpatient,URINE,1,ESCHERICHIA COLI,Trimethoprim/Sulfamethoxazole,Resistant,1,0,1,0,,,,26,18-24 years,0,14,18.7,18.7,18.7,,,,,,,11800.0,11800.0,11800.0,236.0,236.0,236.0,135.0,135.0,135.0,23.0,23.0,23.0,4.0,4.0,4.0,0.63,0.63,0.63,,,,,,,,,,,,0.63,,,,,,,,,,,,,,,,,112.0,115.0,114.0,18.0,26.0,21.0,99.5,99.5,99.5,109.0,117.0,111.0,61.0,70.0,67.0,,72.0,124.0,,,,,,,130.0,Nitrofuran,26
