# ARMD Dataset Preprocessing

In [2]:
import pyarrow.parquet as pq
import glob
import os

id_columns = ['pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']
sample_folder='output/'
parquet_files = glob.glob(sample_folder + "*.parquet") 
sample_file = 'output/'+ os.path.basename(parquet_files[0])

print(f'Sample file: {os.path.basename(parquet_files[0])}')

table = pq.ParquetFile(sample_file)
df_batch = None
for batch in table.iter_batches(batch_size=5):
    df_batch = batch.to_pandas()
    break

df_batch.head()

Sample file: part.0.parquet


Unnamed: 0,pat_enc_csn_id_coded,order_proc_id_coded,order_time_jittered_utc,ordering_mode_x,culture_description_x,was_positive_x,organism_x,antibiotic_x,susceptibility_x,adi_score,...,first_diasbp,last_diasbp,last_sysbp,first_sysbp,last_temp,first_temp,last_resprate,first_resprate,last_heartrate,first_heartrate
0,131331407544,790098057,2022-05-14 02:29:00+00:00,Inpatient,URINE,1,ESCHERICHIA COLI,Ceftriaxone,Susceptible,17,...,,,,,,,,,77.0,79.0
1,131331407544,790098057,2022-05-14 02:29:00+00:00,Inpatient,URINE,1,ESCHERICHIA COLI,Ceftriaxone,Susceptible,17,...,,,,,,,,,77.0,79.0
2,131331407544,790098057,2022-05-14 02:29:00+00:00,Inpatient,URINE,1,ESCHERICHIA COLI,Ceftriaxone,Susceptible,17,...,,,,,,,,,77.0,79.0
3,131331407544,790098057,2022-05-14 02:29:00+00:00,Inpatient,URINE,1,ESCHERICHIA COLI,Ceftriaxone,Susceptible,17,...,,,,,,,,,77.0,79.0
4,131331407544,790098057,2022-05-14 02:29:00+00:00,Inpatient,URINE,1,ESCHERICHIA COLI,Ceftriaxone,Susceptible,17,...,,,,,,,,,77.0,79.0


# Prepare the target 
The target is `susceptibility` column 

In [4]:
import pandas as pd

sample_folder='output/'
parquet_files = glob.glob(sample_folder + "*.parquet") 
sample_file = 'output/'+ os.path.basename(parquet_files[0])

df = pd.read_parquet(sample_file, columns=['susceptibility'])
df['susceptibility'].value_counts()

susceptibility
Susceptible                     1249228
Resistant                        934967
Null                             459244
Intermediate                     259190
No Interpretation                    58
Susceptible - Dose Dependent         15
Name: count, dtype: Int64

Map susceptibility values as:
- `'Susceptible' → 'S'`
- `'Resistant' → 'R'`
- `'Intermediate' → 'I'`

Drop:
- `'Null'`
- `'No Interpretation'`
- `'Susceptible - Dose Dependent'`

In [11]:
import pyarrow.parquet as pq
import pyarrow as pa
import os
import glob
import pandas as pd

sample_folder = 'output/'
cleaned_folder = 'cleaned_output/'
os.makedirs(cleaned_folder, exist_ok=True)

# Target mapping
keep_values = {
    'Susceptible': 'S',
    'Resistant': 'R',
    'Intermediate': 'I'
}

batch_size = 100_000
parquet_files = glob.glob(os.path.join(sample_folder, "*.parquet"))

for file_path in parquet_files:
    print(f"Processing: {os.path.basename(file_path)}")

    table = pq.ParquetFile(file_path)
    output_path = os.path.join(cleaned_folder, os.path.basename(file_path))
    writer = None
    base_schema = None

    for batch in table.iter_batches(batch_size=batch_size):
        df = batch.to_pandas()

        # Filter + map susceptibility
        df = df[df['susceptibility'].isin(keep_values)].copy()
        df['susceptibility_label'] = df['susceptibility'].map(keep_values)

        if df.empty:
            continue

        # Convert to pyarrow Table
        batch_table = pa.Table.from_pandas(df, preserve_index=False)

        if writer is None:
            base_schema = batch_table.schema
            writer = pq.ParquetWriter(output_path, base_schema)
        else:
            # Cast to initial schema to prevent mismatch
            batch_table = batch_table.cast(base_schema)

        writer.write_table(batch_table)

    if writer:
        writer.close()
        print(f"Saved cleaned file: {output_path}")
    else:
        print(f"No valid rows written: {file_path}")


Processing: part.0.parquet
Saved cleaned file: cleaned_output/part.0.parquet


In [16]:
import pandas as pd

df = pd.read_parquet('cleaned_output/your_file_name.parquet')
print(df.shape)
print(df['susceptibility_label'].value_counts())


FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_output/your_file_name.parquet'