# Take a sample of ARMD Dataset.

In [16]:

linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

files_info = [
    {
        "name": "cultures_cohort",
        "path": "microbiology_cultures_cohort.csv",
        "merge_on": linked_features, 
        "dtype": {
            "ordering_mode": "category",
            "culture_description": "category",
            "was_positive": "int64",
            "organism": "category",
            "antibiotic": "category",
            "susceptibility": "category"
        }
    },
    {
        "name": "adi_scores",
        "path": "microbiology_cultures_adi_scores.csv",
        "merge_on": linked_features, 
        "dtype": {
            "adi_score": "category",
            "adi_state_rank": "category"
        }
    },
    {
        "name": "antibiotic_class_exposure",
        "path": "microbiology_cultures_antibiotic_class_exposure.csv",
        "merge_on": linked_features, 
        "dtype": {
            "medication_category": "category",
            "medication_name": "category",
            "antibiotic_class": "category",
            "time_to_culturetime": "int64"
        }
    },
    {
        "name": "ntibiotic_subtype_exposure",
        "path": "microbiology_cultures_antibiotic_subtype_exposure.csv",
        "merge_on": linked_features, 
        "dtype": {
            "medication_category": "category",
            "medication_name": "category",
            "antibiotic_subtype": "category",
            "antibiotic_subtype_category": "category",
            "medication_time_to_cultureTime": "int64"
        }
    },
    {
        "name": "cultures_comorbidity",
        "path": "microbiology_cultures_comorbidity.csv",
        "merge_on": linked_features, 
        "dtype": {
            "comorbidity_component": "category",
            "comorbidity_component_start_days_culture": "int64",
            "comorbidity_component_end_days_culture": "float64"
        }
    },
    {
        "name": "cultures_demographics",
        "path": "microbiology_cultures_demographics.csv",
        "merge_on": linked_features[:3], 
        "dtype": {
            "age": "category",
            "gender": "category"
        }
    },
    {
        "name": "implied_susceptibility",
        "path": "microbiology_cultures_implied_susceptibility.csv",
        "merge_on": linked_features[:3], 
        "dtype": {
            "organism": "category",
            "antibiotic": "category",
            "susceptibility": "category",
            "implied_susceptibility": "category"
        }
    },
    {
        "name": "cultures_labs",
        "path": "microbiology_cultures_labs.csv",
        "merge_on": linked_features[:3], 
        "dtype": {
            "Period_Day": "int64",
            "Q75_wbc": "category",
            "Q25_wbc": "category",
            "median_wbc": "category",
            "Q25_neutrophils": "float64",
            "Q75_neutrophils": "float64",
            "median_neutrophils": "float64",
            "Q25_lymphocytes": "float64",
            "Q75_lymphocytes": "float64",
            "median_lymphocytes": "float64",
            "Q25_hgb": "category",
            "Q75_hgb": "category",
            "median_hgb": "category",
            "Q25_plt": "category",
            "Q75_plt": "category",
            "median_plt": "category",
            "Q75_na": "category",
            "Q25_na": "category",
            "median_na": "category",
            "Q75_hco3": "category",
            "Q25_hco3": "category",
            "median_hco3": "category",
            "Q75_bun": "category",
            "Q25_bun": "category",
            "median_bun": "category",
            "Q75_cr": "category",
            "Q25_cr": "category",
            "median_cr": "category",
            "Q75_lactate": "category",
            "Q25_lactate": "category",
            "median_lactate": "category",
            "Q75_procalcitonin": "category",
            "Q25_procalcitonin": "category",
            "median_procalcitonin": "category",
            "first_procalcitonin": "category",
            "last_procalcitonin": "category",
            "first_lactate":"category",
            "last_cr":"category",
            "first_cr":"category",
            "last_bun":"category",
            "first_bun":"category",
            "last_hco3":"category",
            "first_hco3":"category",
            "last_na":"category",
            "first_na":"category",
            "last_plt":"category",
            "first_plt":"category",
            "last_hgb":"category",
            "first_hgb":"category",
            "last_lymphocytes":"category",
            "first_lymphocytes":"category",
            "last_neutrophils":"category",
            "first_neutrophils":"category",
            "last_wbc":"category",
            "first_wbc":"category" 
        }
    },
    {
        "name": "microbial_resistance.csv",
        "path": "microbiology_cultures_microbial_resistance.csv",
        "merge_on": linked_features, 
        "dtype": {
            "organism": "category",
            "antibiotic": "category",
            "resistant_time_to_culturetime": "int64"
        }
    },
    {
        "name": "cultures_priorprocedures.csv",
        "path": "microbiology_cultures_priorprocedures.csv",
        "merge_on": linked_features, 
        "dtype": {
            "procedure_description": "category",
            "procedure_time_to_culturetime": "int64"
        }
    },
    {
        "name": "prior_med",
        "path": "microbiology_cultures_prior_med.csv",
        "merge_on": linked_features,
        "dtype": {
            "medication_name": "category",
            "medication_time_to_culturetime": "int64",
            "medication_category": "category"
        }
    },
    {
        "name": "cultures_vitals",
        "path": "microbiology_cultures_vitals.csv",
        "merge_on": linked_features[:3],
        "dtype": {
            "Q25_heartrate": "category",
            "Q75_heartrate": "category",
            "median_heartrate": "category",
            "Q25_resprate": "category",
            "Q75_resprate": "category",
            "median_resprate": "category",
            "Q25_temp": "category",
            "Q75_temp": "category",
            "median_temp": "category",
            "Q25_sysbp": "float64",
            "Q75_sysbp": "float64",
            "median_sysbp": "float64",
            "Q25_diasbp": "float64",
            "Q75_diasbp": "float64",
            "median_diasbp": "float64",
            "first_diasbp": "category",
            "last_diasbp": "category",
            "last_sysbp": "category",
            "first_sysbp": "category",
            "last_temp": "category",
            "first_temp": "category",
            "last_resprate": "category",
            "first_resprate": "category",
            "last_heartrate": "category",
            "first_heartrate": "category"
        }
    },
    {
        "name": "ward_info",
        "path": "microbiology_cultures_ward_info.csv",
        "merge_on": linked_features, 
        "dtype": {
            "hosp_ward_IP": "int64",
            "hosp_ward_OP": "int64",
            "hosp_ward_ER": "int64",
            "hosp_ward_ICU": "int64"
        }
    },
    {
        "name": "prior_infecting_organism",
        "path": "microbiology_culture_prior_infecting_organism.csv",
        "merge_on": linked_features, 
        "dtype": {
            "prior_organism": "category",
            "prior_infecting_organism_days_to_culutre": "int64"
        }
    }
]



In [54]:
import pandas as pd
import os

output_folder = 'doi_10_5061_dryad_jq2bvq8kp__v20250411/Processed_ARMD_Dataset/'
file = 'sample_microbiology_cultures_cohort.csv'

for filename in os.listdir(output_folder):
    file_path = os.path.join(output_folder, filename)
    if os.path.isfile(file_path) and filename == file:
        os.remove(file_path)

folder_path = 'doi_10_5061_dryad_jq2bvq8kp__v20250411/'
# Load the main culture file
cohort = pd.read_csv(folder_path + "microbiology_cultures_cohort.csv")

# Keep only cultures with a known susceptibility result
cohort = cohort[cohort["susceptibility"].isin(["Resistant", "Susceptible", "Intermediate"])]

# Select the first culture per patient (can be changed to latest or random)
cohort_sorted = cohort.sort_values("order_time_jittered_utc")
one_culture_per_patient = cohort_sorted.groupby("anon_id").first().reset_index()

# Take a sample of 10,000 patients
patient_sample = one_culture_per_patient.sample(n=10000, random_state=42)
patient_sample.to_csv(os.path.join(folder_path+'Processed_ARMD_Dataset/sample_microbiology_cultures_cohort.csv'), index=False)

# Extract linking keys
sample_order_ids = patient_sample["order_proc_id_coded"].unique()
sample_anon_ids = patient_sample["anon_id"].unique()


In [56]:
import pandas as pd
import os
import shutil


output_folder = 'doi_10_5061_dryad_jq2bvq8kp__v20250411/Processed_ARMD_Dataset/'
keep_file = 'sample_microbiology_cultures_cohort.csv'

for filename in os.listdir(output_folder):
    file_path = os.path.join(output_folder, filename)
    if os.path.isfile(file_path) and filename != keep_file:
        os.remove(file_path)

# List of files related to order_proc_id_coded
order_level_files = [
    "microbiology_cultures_labs.csv",
    "microbiology_cultures_vitals.csv",
    "microbiology_cultures_prior_med.csv",
    "microbiology_cultures_microbial_resistance.csv",
    "microbiology_cultures_priorprocedures.csv",
    "microbiology_cultures_implied_susceptibility.csv"
]

# List of files related to anon_id
patient_level_files = [
    "microbiology_cultures_demographics.csv",
    "microbiology_cultures_comorbidity.csv",
    "microbiology_cultures_adi_scores.csv",
    "microbiology_cultures_nursing_home_visits.csv"
]

# def getDatatype(file):
#     for f in files_info:
#         if f['path']== file:
#             return f['dtype']
#     return None 



# Filter order-level files
chunk_size = 10000

for file in order_level_files:
    first_chunk = True 
    for chunk in pd.read_csv(folder_path + file, chunksize=chunk_size, low_memory=False):
        filtered_df = chunk[chunk["order_proc_id_coded"].isin(sample_order_ids)]
        filtered_df.to_csv(output_folder+'sample_'+file, mode='a', index=False, header=first_chunk)
        first_chunk = False


for file in patient_level_files:
    first_chunk = True 
    for chunk in pd.read_csv(folder_path + file, chunksize=chunk_size, low_memory=False):
        filtered_df = chunk[chunk["anon_id"].isin(sample_anon_ids)]
        filtered_df.to_csv(output_folder+'sample_'+file, mode='a', index=False, header=first_chunk)
        first_chunk = False


**sample files**

    "sample_microbiology_cultures_cohort.csv"
    "sample_microbiology_cultures_labs.csv"
    "sample_microbiology_cultures_vitals.csv"
    "sample_microbiology_cultures_prior_med.csv"
    "sample_microbiology_cultures_microbial_resistance.csv"
    "sample_microbiology_cultures_priorprocedures.csv"
    "sample_microbiology_cultures_implied_susceptibility.csv"
    "sample_microbiology_cultures_demographics.csv"
    "sample_microbiology_cultures_comorbidity.csv"
    "sample_microbiology_cultures_adi_scores.csv"
    "sample_microbiology_cultures_nursing_home_visits.csv"

In [1]:
import pandas as pd

# Define paths
input_folder = 'doi_10_5061_dryad_jq2bvq8kp__v20250411/Processed_ARMD_Dataset/'
output_file = 'merged_ARMD_sample.csv'

# First, identify all columns in all files to detect potential conflicts
all_columns = {}
order_files = ["labs", "vitals", "prior_med", "microbial_resistance", "priorprocedures", "implied_susceptibility"]
patient_files = ["demographics", "comorbidity", "adi_scores", "nursing_home_visits"]

# Load main cohort to get base columns
cohort_sample = pd.read_csv(input_folder + "sample_microbiology_cultures_cohort.csv", nrows=1)
base_columns = set(cohort_sample.columns)

# Process in chunks
chunk_size = 1000
first_chunk = True

for chunk in pd.read_csv(input_folder + "sample_microbiology_cultures_cohort.csv", chunksize=chunk_size):
    # Merge order-level files
    for file in order_files:
        df = pd.read_csv(input_folder + f"sample_microbiology_cultures_{file}.csv")
        
        # Identify overlapping columns (excluding the merge key)
        overlapping_cols = set(df.columns).intersection(set(chunk.columns)) - {'order_proc_id_coded'}
        
        # Rename overlapping columns in the right dataframe
        if overlapping_cols:
            rename_dict = {col: f"{file}_{col}" for col in overlapping_cols}
            df = df.rename(columns=rename_dict)
        
        chunk = pd.merge(chunk, df, on="order_proc_id_coded", how="inner")
    
    # Merge patient-level files
    for file in patient_files:
        df = pd.read_csv(input_folder + f"sample_microbiology_cultures_{file}.csv")
        
        # Identify overlapping columns (excluding the merge key)
        overlapping_cols = set(df.columns).intersection(set(chunk.columns)) - {'anon_id'}
        
        # Rename overlapping columns in the right dataframe
        if overlapping_cols:
            rename_dict = {col: f"{file}_{col}" for col in overlapping_cols}
            df = df.rename(columns=rename_dict)
        
        chunk = pd.merge(chunk, df, on="anon_id", how="inner")
    
    # Save each merged chunk
    chunk.to_csv(output_file, mode='a', index=False, header=first_chunk)
    first_chunk = False

MemoryError: Unable to allocate 10.1 GiB for an array with shape (1354396703,) and data type int64