In [1]:
import pandas as pd
import glob
import os

# Folder that holds the extracted CSV files (relative to the working directory).
folder_path = "doi_10_5061_dryad_jq2bvq8kp__v20250411"

# glob.glob returns matches in arbitrary, OS-dependent order; sorting keeps the
# preview dict and the printed report stable from one run to the next.
all_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
if not all_files:
    # An empty result usually means a wrong working directory or folder name.
    print(f'No CSV files found in {folder_path!r}')

all_df_files = {}   # file label (base name without extension) -> 5-row preview frame
all_features = []   # every column name seen, duplicates included
for file in all_files:
    # Read only a 5-row sample: enough to list the columns without loading the
    # whole file. Dtypes inferred from so few rows are only a first guess.
    df = pd.read_csv(file, nrows=5)
    label = os.path.splitext(os.path.basename(file))[0]
    all_df_files[label] = df
    all_features.extend(df.columns.tolist())

unique_features = set(all_features)

# Print the names sorted: the iteration order of a set of strings is not
# stable between interpreter runs, which makes the output hard to compare.
print(f'features = {sorted(unique_features)}')
print(f'No of features = {len(unique_features)}')

# Per-file report of the dtypes pandas inferred from the sample rows.
for file, df in all_df_files.items():
    print(f'file: {file}')
    print(df.dtypes)
    

features = {'first_temp', 'last_wbc', 'first_hgb', 'last_sysbp', 'Q25_lactate', 'first_resprate', 'last_procalcitonin', 'medication_category', 'culture_description', 'Q75_wbc', 'last_plt', 'last_heartrate', 'Q25_heartrate', 'last_diasbp', 'Antibiotic', 'median_heartrate', 'median_plt', 'median_bun', 'median_diasbp', 'median_na', 'median_cr', 'Q25_na', 'resistant_time_to_culturetime', 'last_neutrophils', 'Q25_sysbp', 'Organism', 'median_hgb', 'median_resprate', 'medication_time_to_culturetime', 'median_sysbp', 'Q75_neutrophils', 'last_bun', 'median_neutrophils', 'antibiotic_subtype_category', 'gender', 'procedure_time_to_culturetime', 'last_resprate', 'last_lactate', 'Q75_hgb', 'Q75_heartrate', 'Q75_cr', 'first_bun', 'median_temp', 'Q75_procalcitonin', 'procedure_description', 'Q25_bun', 'time_to_culturetime', 'Q25_resprate', 'organism', 'adi_state_rank', 'prior_organism', 'Q75_na', 'first_cr', 'first_hco3', 'prior_infecting_organism_days_to_culutre', 'Q75_resprate', 'median_hco3', 'Q75

In [1]:
import pandas as pd
import glob
import os

# Key columns shared by the tables. Each entry below uses either all four
# columns or only the first three (i.e. without 'order_time_jittered_utc')
# as its "merge_on" keys.
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

# One descriptor per CSV file:
#   "name"     - short label for the table (no file extension)
#   "path"     - CSV file name (no folder component; presumably resolved
#                against the data folder by the loader - TODO confirm)
#   "merge_on" - key columns for this table
#   "dtype"    - column -> dtype mapping (presumably forwarded to
#                pd.read_csv - TODO confirm against the loading code)
# NOTE(review): a plain "int64" dtype cannot hold missing values, so
# pd.read_csv raises if such a column contains NaN; switch to the nullable
# "Int64" dtype if that happens.
files_info = [
    {
        "name": "cultures_cohort",
        "path": "microbiology_cultures_cohort.csv",
        "merge_on": linked_features,
        "dtype": {
            "ordering_mode": "category",
            "culture_description": "category",
            "was_positive": "int64",
            "organism": "category",
            "antibiotic": "category",
            "susceptibility": "category"
        }
    },
    {
        "name": "adi_scores",
        "path": "microbiology_cultures_adi_scores.csv",
        "merge_on": linked_features,
        "dtype": {
            "adi_score": "category",
            "adi_state_rank": "category"
        }
    },
    {
        "name": "antibiotic_class_exposure",
        "path": "microbiology_cultures_antibiotic_class_exposure.csv",
        "merge_on": linked_features,
        "dtype": {
            "medication_category": "category",
            "medication_name": "category",
            "antibiotic_class": "category",
            "time_to_culturetime": "int64"
        }
    },
    {
        # Label fixed: it used to read "ntibiotic_subtype_exposure".
        "name": "antibiotic_subtype_exposure",
        "path": "microbiology_cultures_antibiotic_subtype_exposure.csv",
        "merge_on": linked_features,
        "dtype": {
            "medication_category": "category",
            "medication_name": "category",
            "antibiotic_subtype": "category",
            "antibiotic_subtype_category": "category",
            # NOTE(review): capital "T" differs from the
            # "medication_time_to_culturetime" key used for prior_med;
            # confirm against the CSV header.
            "medication_time_to_cultureTime": "int64"
        }
    },
    {
        "name": "cultures_comorbidity",
        "path": "microbiology_cultures_comorbidity.csv",
        "merge_on": linked_features,
        "dtype": {
            "comorbidity_component": "category",
            "comorbidity_component_start_days_culture": "int64",
            "comorbidity_component_end_days_culture": "float64"
        }
    },
    {
        "name": "cultures_demographics",
        "path": "microbiology_cultures_demographics.csv",
        "merge_on": linked_features[:3],
        "dtype": {
            "age": "category",
            "gender": "category"
        }
    },
    {
        "name": "implied_susceptibility",
        "path": "microbiology_cultures_implied_susceptibility.csv",
        "merge_on": linked_features[:3],
        "dtype": {
            "organism": "category",
            "antibiotic": "category",
            "susceptibility": "category",
            "implied_susceptibility": "category"
        }
    },
    {
        "name": "cultures_labs",
        "path": "microbiology_cultures_labs.csv",
        "merge_on": linked_features[:3],
        # NOTE(review): there is no "last_lactate" entry although
        # "first_lactate" is listed; confirm whether that is intentional.
        "dtype": {
            "Period_Day": "int64",
            "Q75_wbc": "category",
            "Q25_wbc": "category",
            "median_wbc": "category",
            "Q25_neutrophils": "float64",
            "Q75_neutrophils": "float64",
            "median_neutrophils": "float64",
            "Q25_lymphocytes": "float64",
            "Q75_lymphocytes": "float64",
            "median_lymphocytes": "float64",
            "Q25_hgb": "category",
            "Q75_hgb": "category",
            "median_hgb": "category",
            "Q25_plt": "category",
            "Q75_plt": "category",
            "median_plt": "category",
            "Q75_na": "category",
            "Q25_na": "category",
            "median_na": "category",
            "Q75_hco3": "category",
            "Q25_hco3": "category",
            "median_hco3": "category",
            "Q75_bun": "category",
            "Q25_bun": "category",
            "median_bun": "category",
            "Q75_cr": "category",
            "Q25_cr": "category",
            "median_cr": "category",
            "Q75_lactate": "category",
            "Q25_lactate": "category",
            "median_lactate": "category",
            "Q75_procalcitonin": "category",
            "Q25_procalcitonin": "category",
            "median_procalcitonin": "category",
            "first_procalcitonin": "category",
            "last_procalcitonin": "category",
            "first_lactate": "category",
            "last_cr": "category",
            "first_cr": "category",
            "last_bun": "category",
            "first_bun": "category",
            "last_hco3": "category",
            "first_hco3": "category",
            "last_na": "category",
            "first_na": "category",
            "last_plt": "category",
            "first_plt": "category",
            "last_hgb": "category",
            "first_hgb": "category",
            "last_lymphocytes": "category",
            "first_lymphocytes": "category",
            "last_neutrophils": "category",
            "first_neutrophils": "category",
            "last_wbc": "category",
            "first_wbc": "category"
        }
    },
    {
        # Label fixed: the ".csv" suffix belongs to "path", not to the name.
        "name": "microbial_resistance",
        "path": "microbiology_cultures_microbial_resistance.csv",
        "merge_on": linked_features,
        "dtype": {
            "organism": "category",
            "antibiotic": "category",
            "resistant_time_to_culturetime": "int64"
        }
    },
    {
        # Label fixed: the ".csv" suffix belongs to "path", not to the name.
        "name": "cultures_priorprocedures",
        "path": "microbiology_cultures_priorprocedures.csv",
        "merge_on": linked_features,
        "dtype": {
            "procedure_description": "category",
            "procedure_time_to_culturetime": "int64"
        }
    },
    {
        "name": "prior_med",
        "path": "microbiology_cultures_prior_med.csv",
        "merge_on": linked_features,
        "dtype": {
            "medication_name": "category",
            "medication_time_to_culturetime": "int64",
            "medication_category": "category"
        }
    },
    {
        "name": "cultures_vitals",
        "path": "microbiology_cultures_vitals.csv",
        "merge_on": linked_features[:3],
        "dtype": {
            "Q25_heartrate": "category",
            "Q75_heartrate": "category",
            "median_heartrate": "category",
            "Q25_resprate": "category",
            "Q75_resprate": "category",
            "median_resprate": "category",
            "Q25_temp": "category",
            "Q75_temp": "category",
            "median_temp": "category",
            "Q25_sysbp": "float64",
            "Q75_sysbp": "float64",
            "median_sysbp": "float64",
            "Q25_diasbp": "float64",
            "Q75_diasbp": "float64",
            "median_diasbp": "float64",
            "first_diasbp": "category",
            "last_diasbp": "category",
            "last_sysbp": "category",
            "first_sysbp": "category",
            "last_temp": "category",
            "first_temp": "category",
            "last_resprate": "category",
            "first_resprate": "category",
            "last_heartrate": "category",
            "first_heartrate": "category"
        }
    },
    {
        "name": "ward_info",
        "path": "microbiology_cultures_ward_info.csv",
        "merge_on": linked_features,
        "dtype": {
            "hosp_ward_IP": "int64",
            "hosp_ward_OP": "int64",
            "hosp_ward_ER": "int64",
            "hosp_ward_ICU": "int64"
        }
    },
    {
        "name": "prior_infecting_organism",
        # NOTE(review): "culture" is singular here, unlike the other file
        # names; confirm the actual file name on disk.
        "path": "microbiology_culture_prior_infecting_organism.csv",
        "merge_on": linked_features,
        "dtype": {
            "prior_organism": "category",
            # NOTE(review): the "culutre" spelling is kept as-is; it must
            # match the CSV header exactly - confirm before changing.
            "prior_infecting_organism_days_to_culutre": "int64"
        }
    }
]

