# Take a sample of ARMD Dataset.

In [16]:

linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

files_info = [
    {
        "name": "cultures_cohort",
        "path": "microbiology_cultures_cohort.csv",
        "merge_on": linked_features, 
        "dtype": {
            "ordering_mode": "category",
            "culture_description": "category",
            "was_positive": "int64",
            "organism": "category",
            "antibiotic": "category",
            "susceptibility": "category"
        }
    },
    {
        "name": "adi_scores",
        "path": "microbiology_cultures_adi_scores.csv",
        "merge_on": linked_features, 
        "dtype": {
            "adi_score": "category",
            "adi_state_rank": "category"
        }
    },
    {
        "name": "antibiotic_class_exposure",
        "path": "microbiology_cultures_antibiotic_class_exposure.csv",
        "merge_on": linked_features, 
        "dtype": {
            "medication_category": "category",
            "medication_name": "category",
            "antibiotic_class": "category",
            "time_to_culturetime": "int64"
        }
    },
    {
        "name": "ntibiotic_subtype_exposure",
        "path": "microbiology_cultures_antibiotic_subtype_exposure.csv",
        "merge_on": linked_features, 
        "dtype": {
            "medication_category": "category",
            "medication_name": "category",
            "antibiotic_subtype": "category",
            "antibiotic_subtype_category": "category",
            "medication_time_to_cultureTime": "int64"
        }
    },
    {
        "name": "cultures_comorbidity",
        "path": "microbiology_cultures_comorbidity.csv",
        "merge_on": linked_features, 
        "dtype": {
            "comorbidity_component": "category",
            "comorbidity_component_start_days_culture": "int64",
            "comorbidity_component_end_days_culture": "float64"
        }
    },
    {
        "name": "cultures_demographics",
        "path": "microbiology_cultures_demographics.csv",
        "merge_on": linked_features[:3], 
        "dtype": {
            "age": "category",
            "gender": "category"
        }
    },
    {
        "name": "implied_susceptibility",
        "path": "microbiology_cultures_implied_susceptibility.csv",
        "merge_on": linked_features[:3], 
        "dtype": {
            "organism": "category",
            "antibiotic": "category",
            "susceptibility": "category",
            "implied_susceptibility": "category"
        }
    },
    {
        "name": "cultures_labs",
        "path": "microbiology_cultures_labs.csv",
        "merge_on": linked_features[:3], 
        "dtype": {
            "Period_Day": "int64",
            "Q75_wbc": "category",
            "Q25_wbc": "category",
            "median_wbc": "category",
            "Q25_neutrophils": "float64",
            "Q75_neutrophils": "float64",
            "median_neutrophils": "float64",
            "Q25_lymphocytes": "float64",
            "Q75_lymphocytes": "float64",
            "median_lymphocytes": "float64",
            "Q25_hgb": "category",
            "Q75_hgb": "category",
            "median_hgb": "category",
            "Q25_plt": "category",
            "Q75_plt": "category",
            "median_plt": "category",
            "Q75_na": "category",
            "Q25_na": "category",
            "median_na": "category",
            "Q75_hco3": "category",
            "Q25_hco3": "category",
            "median_hco3": "category",
            "Q75_bun": "category",
            "Q25_bun": "category",
            "median_bun": "category",
            "Q75_cr": "category",
            "Q25_cr": "category",
            "median_cr": "category",
            "Q75_lactate": "category",
            "Q25_lactate": "category",
            "median_lactate": "category",
            "Q75_procalcitonin": "category",
            "Q25_procalcitonin": "category",
            "median_procalcitonin": "category",
            "first_procalcitonin": "category",
            "last_procalcitonin": "category",
            "first_lactate":"category",
            "last_cr":"category",
            "first_cr":"category",
            "last_bun":"category",
            "first_bun":"category",
            "last_hco3":"category",
            "first_hco3":"category",
            "last_na":"category",
            "first_na":"category",
            "last_plt":"category",
            "first_plt":"category",
            "last_hgb":"category",
            "first_hgb":"category",
            "last_lymphocytes":"category",
            "first_lymphocytes":"category",
            "last_neutrophils":"category",
            "first_neutrophils":"category",
            "last_wbc":"category",
            "first_wbc":"category" 
        }
    },
    {
        "name": "microbial_resistance.csv",
        "path": "microbiology_cultures_microbial_resistance.csv",
        "merge_on": linked_features, 
        "dtype": {
            "organism": "category",
            "antibiotic": "category",
            "resistant_time_to_culturetime": "int64"
        }
    },
    {
        "name": "cultures_priorprocedures.csv",
        "path": "microbiology_cultures_priorprocedures.csv",
        "merge_on": linked_features, 
        "dtype": {
            "procedure_description": "category",
            "procedure_time_to_culturetime": "int64"
        }
    },
    {
        "name": "prior_med",
        "path": "microbiology_cultures_prior_med.csv",
        "merge_on": linked_features,
        "dtype": {
            "medication_name": "category",
            "medication_time_to_culturetime": "int64",
            "medication_category": "category"
        }
    },
    {
        "name": "cultures_vitals",
        "path": "microbiology_cultures_vitals.csv",
        "merge_on": linked_features[:3],
        "dtype": {
            "Q25_heartrate": "category",
            "Q75_heartrate": "category",
            "median_heartrate": "category",
            "Q25_resprate": "category",
            "Q75_resprate": "category",
            "median_resprate": "category",
            "Q25_temp": "category",
            "Q75_temp": "category",
            "median_temp": "category",
            "Q25_sysbp": "float64",
            "Q75_sysbp": "float64",
            "median_sysbp": "float64",
            "Q25_diasbp": "float64",
            "Q75_diasbp": "float64",
            "median_diasbp": "float64",
            "first_diasbp": "category",
            "last_diasbp": "category",
            "last_sysbp": "category",
            "first_sysbp": "category",
            "last_temp": "category",
            "first_temp": "category",
            "last_resprate": "category",
            "first_resprate": "category",
            "last_heartrate": "category",
            "first_heartrate": "category"
        }
    },
    {
        "name": "ward_info",
        "path": "microbiology_cultures_ward_info.csv",
        "merge_on": linked_features, 
        "dtype": {
            "hosp_ward_IP": "int64",
            "hosp_ward_OP": "int64",
            "hosp_ward_ER": "int64",
            "hosp_ward_ICU": "int64"
        }
    },
    {
        "name": "prior_infecting_organism",
        "path": "microbiology_culture_prior_infecting_organism.csv",
        "merge_on": linked_features, 
        "dtype": {
            "prior_organism": "category",
            "prior_infecting_organism_days_to_culutre": "int64"
        }
    }
]



In [54]:
import pandas as pd
import os

output_folder = 'doi_10_5061_dryad_jq2bvq8kp__v20250411/Processed_ARMD_Dataset/'
file = 'sample_microbiology_cultures_cohort.csv'

for filename in os.listdir(output_folder):
    file_path = os.path.join(output_folder, filename)
    if os.path.isfile(file_path) and filename == file:
        os.remove(file_path)

folder_path = 'doi_10_5061_dryad_jq2bvq8kp__v20250411/'
# Load the main culture file
cohort = pd.read_csv(folder_path + "microbiology_cultures_cohort.csv")

# Keep only cultures with a known susceptibility result
cohort = cohort[cohort["susceptibility"].isin(["Resistant", "Susceptible", "Intermediate"])]

# Select the first culture per patient (can be changed to latest or random)
cohort_sorted = cohort.sort_values("order_time_jittered_utc")
one_culture_per_patient = cohort_sorted.groupby("anon_id").first().reset_index()

# Take a sample of 10,000 patients
patient_sample = one_culture_per_patient.sample(n=10000, random_state=42)
patient_sample.to_csv(os.path.join(folder_path+'Processed_ARMD_Dataset/sample_microbiology_cultures_cohort.csv'), index=False)

# Extract linking keys
sample_order_ids = patient_sample["order_proc_id_coded"].unique()
sample_anon_ids = patient_sample["anon_id"].unique()


In [56]:
import pandas as pd
import os
import shutil


output_folder = 'doi_10_5061_dryad_jq2bvq8kp__v20250411/Processed_ARMD_Dataset/'
keep_file = 'sample_microbiology_cultures_cohort.csv'

for filename in os.listdir(output_folder):
    file_path = os.path.join(output_folder, filename)
    if os.path.isfile(file_path) and filename != keep_file:
        os.remove(file_path)

# List of files related to order_proc_id_coded
order_level_files = [
    "microbiology_cultures_labs.csv",
    "microbiology_cultures_vitals.csv",
    "microbiology_cultures_prior_med.csv",
    "microbiology_cultures_microbial_resistance.csv",
    "microbiology_cultures_priorprocedures.csv",
    "microbiology_cultures_implied_susceptibility.csv"
]

# List of files related to anon_id
patient_level_files = [
    "microbiology_cultures_demographics.csv",
    "microbiology_cultures_comorbidity.csv",
    "microbiology_cultures_adi_scores.csv",
    "microbiology_cultures_nursing_home_visits.csv"
]

# def getDatatype(file):
#     for f in files_info:
#         if f['path']== file:
#             return f['dtype']
#     return None 


# Filter order-level files
chunk_size = 10000

for file in order_level_files:
    first_chunk = True 
    for chunk in pd.read_csv(folder_path + file, chunksize=chunk_size, low_memory=False):
        filtered_df = chunk[chunk["order_proc_id_coded"].isin(sample_order_ids)]
        filtered_df.to_csv(output_folder+'sample_'+file, mode='a', index=False, header=first_chunk)
        first_chunk = False


for file in patient_level_files:
    first_chunk = True 
    for chunk in pd.read_csv(folder_path + file, chunksize=chunk_size, low_memory=False):
        filtered_df = chunk[chunk["anon_id"].isin(sample_anon_ids)]
        filtered_df.to_csv(output_folder+'sample_'+file, mode='a', index=False, header=first_chunk)
        first_chunk = False


**sample files**

    "sample_microbiology_cultures_cohort.csv"
    "sample_microbiology_cultures_labs.csv"
    "sample_microbiology_cultures_vitals.csv"
    "sample_microbiology_cultures_prior_med.csv"
    "sample_microbiology_cultures_microbial_resistance.csv"
    "sample_microbiology_cultures_priorprocedures.csv"
    "sample_microbiology_cultures_implied_susceptibility.csv"
    "sample_microbiology_cultures_demographics.csv"
    "sample_microbiology_cultures_comorbidity.csv"
    "sample_microbiology_cultures_adi_scores.csv"
    "sample_microbiology_cultures_nursing_home_visits.csv"

In [None]:
import duckdb
import glob
import os

# Setup
input_folder = 'doi_10_5061_dryad_jq2bvq8kp__v20250411/Processed_ARMD_Dataset/'
output_file = 'output/merged_sample_data.parquet'
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Connect to DuckDB
con = duckdb.connect('my_joined_data.db')

# Load CSVs as views
csv_files = glob.glob(input_folder + "*.csv")
views = {}

for file in csv_files:
    name = os.path.basename(file).replace("sample_microbiology_cultures_", "").replace(".csv", "")
    table_name = name if name != "cohort" else "cohort"
    views[table_name] = table_name

    if "demographics" in file:
        # Override types for demographics where gender is misdetected
        con.execute(f"""
            CREATE OR REPLACE VIEW {table_name} AS
            SELECT * FROM read_csv_auto('{file}', types={{'gender': 'VARCHAR'}}, nullstr='Null')
        """)
    else:
        con.execute(f"""
            CREATE OR REPLACE VIEW {table_name} AS
            SELECT * FROM read_csv_auto('{file}', nullstr='Null')
        """)

# Construct SELECT clause
select_clause = ",\n    ".join([f"{alias}.*" for alias in views])

# Start from 'cohort' and join the rest using ON
join_query = f"SELECT\n    {select_clause}\nFROM cohort"
for table_name in views:
    if table_name == "cohort":
        continue
    join_key = "order_proc_id_coded" if table_name in [
        "labs", "vitals", "prior_med", "microbial_resistance", "priorprocedures", "implied_susceptibility"
    ] else "anon_id"
    join_query += f"\nLEFT JOIN {table_name} ON cohort.{join_key} = {table_name}.{join_key}"

# Export to Parquet
con.execute(f"""
    COPY (
        {join_query}
    )
    TO '{output_file}' (FORMAT PARQUET)
""")

print(f"Join completed. Output saved to {output_file}")


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [31]:

import pandas as pd
output_file = 'output/merged_sample_data.parquet'
df = pd.read_parquet(output_file)

ArrowInvalid: Could not open Parquet input source '<Buffer>': Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.

In [35]:
import pyarrow.parquet as pq
try:
    pq.read_table(output_file)
except Exception as e:
    print(f"Error: {e}")

Error: Could not open Parquet input source 'output/merged_sample_data.parquet': Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.
