# ARMD Dataset Preprocessing

In [3]:
import pyarrow.parquet as pq
import glob
import os

# Identifier columns of the merged dataset (kept for reference in later cells).
id_columns = ['pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

# Pick the first parquet part found in the merged-result folder.
sample_folder = 'output/merged_result.parquet/'
parquet_files = glob.glob(sample_folder + "*.parquet")
sample_name = os.path.basename(parquet_files[0])
sample_file = sample_folder + sample_name

print(f'Sample file: {sample_name}')

# Pull only the first 5-row record batch instead of loading the whole file.
table = pq.ParquetFile(sample_file)
batch = next(table.iter_batches(batch_size=5), None)
df_batch = None if batch is None else batch.to_pandas()

df_batch.head()

Sample file: part.0.parquet


Unnamed: 0,pat_enc_csn_id_coded,order_proc_id_coded,order_time_jittered_utc,ordering_mode_x,culture_description_x,was_positive_x,organism_x,antibiotic_x,susceptibility_x,adi_score,...,first_diasbp,last_diasbp,last_sysbp,first_sysbp,last_temp,first_temp,last_resprate,first_resprate,last_heartrate,first_heartrate
0,131331407544,790098057,2022-05-14 02:29:00+00:00,Inpatient,URINE,1,ESCHERICHIA COLI,Ceftriaxone,Susceptible,17,...,,,,,,,,,77.0,79.0
1,131331407544,790098057,2022-05-14 02:29:00+00:00,Inpatient,URINE,1,ESCHERICHIA COLI,Ceftriaxone,Susceptible,17,...,,,,,,,,,77.0,79.0
2,131331407544,790098057,2022-05-14 02:29:00+00:00,Inpatient,URINE,1,ESCHERICHIA COLI,Ceftriaxone,Susceptible,17,...,,,,,,,,,77.0,79.0
3,131331407544,790098057,2022-05-14 02:29:00+00:00,Inpatient,URINE,1,ESCHERICHIA COLI,Ceftriaxone,Susceptible,17,...,,,,,,,,,77.0,79.0
4,131331407544,790098057,2022-05-14 02:29:00+00:00,Inpatient,URINE,1,ESCHERICHIA COLI,Ceftriaxone,Susceptible,17,...,,,,,,,,,77.0,79.0


# Prepare the target 
The target is the `susceptibility` column.

In [None]:
import pandas as pd

# Inspect the raw distribution of the target column in one parquet part.
sample_folder = 'output/merged_result.parquet/'
parquet_files = glob.glob(sample_folder + "*.parquet")
# The part files live inside `sample_folder`; prefixing the basename with
# plain 'output/' pointed at a path the glob above never matched.
sample_file = sample_folder + os.path.basename(parquet_files[0])

# Read only the target column to keep memory usage low.
df = pd.read_parquet(sample_file, columns=['susceptibility'])
df['susceptibility'].value_counts()

Map susceptibility values as:
- `'Susceptible' → 'S'`
- `'Resistant' → 'R'`

Drop:
- `'Intermediate' → 'I'`
- `'Null'`
- `'No Interpretation'`
- `'Susceptible - Dose Dependent'`

In [None]:
import pyarrow.parquet as pq
import pyarrow as pa
import os
import glob
import pandas as pd

# Stream every merged parquet part in batches, keep only rows whose
# `susceptibility` is 'Susceptible' or 'Resistant', add the mapped
# `susceptibility_label` column ('S' / 'R'), and write the result to
# `cleaned_folder` under the same file name.
sample_folder = 'output/merged_result.parquet/'
cleaned_folder = 'cleaned_output/'
os.makedirs(cleaned_folder, exist_ok=True)

# Target mapping: any value that is not a key here is dropped.
keep_values = {
    'Susceptible': 'S',
    'Resistant': 'R',
}

batch_size = 100_000
parquet_files = glob.glob(os.path.join(sample_folder, "*.parquet"))

for file_path in parquet_files:
    print(f"Processing: {os.path.basename(file_path)}")

    table = pq.ParquetFile(file_path)
    output_path = os.path.join(cleaned_folder, os.path.basename(file_path))
    writer = None       # created lazily, on the first non-empty batch
    base_schema = None  # schema of the first written batch; later batches are cast to it

    try:
        for batch in table.iter_batches(batch_size=batch_size):
            df = batch.to_pandas()

            # Filter + map susceptibility
            df = df[df['susceptibility'].isin(keep_values)].copy()
            df['susceptibility_label'] = df['susceptibility'].map(keep_values)

            if df.empty:
                continue

            # Convert to pyarrow Table
            batch_table = pa.Table.from_pandas(df, preserve_index=False)

            if writer is None:
                base_schema = batch_table.schema
                writer = pq.ParquetWriter(output_path, base_schema)
            else:
                # Cast to initial schema to prevent mismatch
                batch_table = batch_table.cast(base_schema)

            writer.write_table(batch_table)
    finally:
        # Always close the writer, even if a batch fails part-way through,
        # so the file handle is released and the parquet footer is written.
        if writer is not None:
            writer.close()

    if writer is not None:
        print(f"Saved cleaned file: {output_path}")
    else:
        print(f"No valid rows written: {file_path}")


In [5]:
import pandas as pd
import os
import glob

# Sanity-check the cleaned output: load only the label column of the first
# cleaned part and report its shape and class counts.
sample_clean_folder = 'cleaned_output/'
parquet_clean_files = glob.glob(sample_clean_folder + "*.parquet")
first_clean_name = os.path.basename(parquet_clean_files[0])
sample_clean_file = sample_clean_folder + first_clean_name

df = pd.read_parquet(sample_clean_file, columns=['susceptibility_label'])
label_counts = df['susceptibility_label'].value_counts()
print(df.shape)
print(label_counts)


(2184195, 1)
susceptibility_label
S    1249228
R     934967
Name: count, dtype: int64


# Visualize class distribution:

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the label counts, with 'S' shown before 'R'.
ax = sns.countplot(data=df, x='susceptibility_label', order=['S', 'R'])
ax.set_title("Distribution of Susceptibility Categories")
ax.set_ylabel("Count")
plt.show()


# All Features:

In [None]:
# Load the first cleaned part in full and list every available column.
sample_clean_folder = 'cleaned_output/'
parquet_clean_files = glob.glob(sample_clean_folder + "*.parquet")
first_clean_name = os.path.basename(parquet_clean_files[0])
sample_clean_file = sample_clean_folder + first_clean_name

df = pd.read_parquet(sample_clean_file)
print(list(df.columns))

# Feature selection

**Target Column**

- `susceptibility_label` → binary (`S` vs `R`), derived from `susceptibility` in the cleaning step

**Core Predictive Features**
These are critical microbiological context:

- `organism_x` or `organism_left` or `organism_right` (choose one consistent source)
- `antibiotic_x` or `antibiotic_left` or `antibiotic_right` (same as above)
- `resistant_time_to_culturetime` (temporal resistance clue)

**Demographic & Socioeconomic Features**
These can capture patterns in resistance across populations:

- `age`
- `gender`
- `adi_score`, `adi_state_rank` (Area Deprivation Index)

**Lab Results (Quantitative Features)**
Time-windowed lab stats are excellent:

- WBC, neutrophils, lymphocytes, HGB, PLT, Na, HCO3, BUN, CR, lactate, procalcitonin
- Use median or all three (`Q25_`, `median_`, `Q75_`)
- Also useful: `first_` and `last_` versions (can show dynamics)

I may drop redundant summaries or keep only the most informative (e.g., median + first).

**Vital Signs**
These can indicate infection severity:
- `median_temp`, `median_resprate`, `median_heartrate`, `median_sysbp`, `median_diasbp`
(I can optionally include quartiles or trends using first/last)

**Treatment Context**
Pre-treatment can influence resistance:

- `medication_name` (categorical, may need embedding or frequency encoding)
- `medication_category` (higher-level encoding)
- `medication_time_to_culturetime` (temporal feature)

**Administrative/Metadata to Drop**
Should be excluded:
- `pat_enc_csn_id_coded` 
- `order_proc_id_coded` 
- `order_time_jittered_utc` 
- Any redundant `_x`, `_y`, `_left`, `_right` columns, if duplicates exist

**Add delta features**

- df[`'delta_wbc'`] = df[`'last_wbc'`] - df[`'first_wbc'`]
- df[`'delta_cr'`] = df[`'last_cr'`] - df[`'first_cr'`]
- df[`'delta_lactate'`] = df[`'last_lactate'`] - df[`'first_lactate'`]
- df[`'delta_procalcitonin'`] = df[`'last_procalcitonin'`] - df[`'first_procalcitonin'`]


In [None]:
import pandas as pd
import os
import glob

# Build a reduced dataset: a curated feature list, plus last-minus-first
# "delta" features for a few labs, plus the target label. The result is
# written to a single parquet file.
sample_clean_folder = 'cleaned_output/'
parquet_clean_files = glob.glob(sample_clean_folder + "*.parquet")
sample_clean_file = sample_clean_folder + os.path.basename(parquet_clean_files[0])

# Labs for which a delta (last - first) feature is derived.
delta_labs = ['wbc', 'cr', 'lactate', 'procalcitonin']

# Feature groups, concatenated below in the order they should appear.
microbiology_cols = ['organism_x', 'antibiotic_x', 'resistant_time_to_culturetime']
demographic_cols = ['age', 'gender', 'adi_score', 'adi_state_rank']
median_lab_cols = [
    'median_wbc', 'median_neutrophils', 'median_lymphocytes',
    'median_hgb', 'median_plt', 'median_na',
    'median_hco3', 'median_bun', 'median_cr',
    'median_lactate', 'median_procalcitonin',
]
median_vital_cols = [
    'median_heartrate', 'median_resprate', 'median_temp',
    'median_sysbp', 'median_diasbp',
]
treatment_cols = ['medication_category', 'medication_time_to_culturetime']
# first_/last_ readings needed for the delta calculation, paired per lab.
first_last_cols = [f'{edge}_{lab}' for lab in delta_labs for edge in ('first', 'last')]

base_features = (
    microbiology_cols
    + demographic_cols
    + median_lab_cols
    + median_vital_cols
    + treatment_cols
    + first_last_cols
)

# Target column is loaded alongside the features.
target_column = 'susceptibility_label'
all_columns = base_features + [target_column]

df = pd.read_parquet(sample_clean_file, columns=all_columns)
print("Before delta:", df.shape)

# Coerce the first/last readings to numeric; unparseable values become NaN.
numeric_cols = list(first_last_cols)
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Delta = last reading minus first reading, one column per lab.
delta_features = []
for lab in delta_labs:
    delta_name = f'delta_{lab}'
    df[delta_name] = df[f'last_{lab}'] - df[f'first_{lab}']
    delta_features.append(delta_name)

# Final column order: base features, then deltas, then the target.
selected_features = base_features + delta_features
df = df[selected_features + [target_column]]
print("After delta:", df.shape)

# Persist the reduced dataset.
output_path = "selected_features_with_deltas.parquet"
df.to_parquet(output_path, index=False)

print(f"✅ Saved selected features with deltas to: {output_path}")


In [None]:
df.head()

## Feature Engineering

In [21]:
import pandas as pd
import os
import glob
from sklearn.preprocessing import PolynomialFeatures
import xgboost as xgb
import shap

# Feature-engineering pipeline for one cleaned parquet part:
#   1. load a curated set of columns plus the 'S'/'R' target label,
#   2. coerce lab/vital/demographic columns to numeric,
#   3. derive relative trends, clinical ratios and pairwise interaction terms,
#   4. rank numeric features by mean |SHAP| from an XGBoost classifier,
#   5. save the manually chosen core features plus the top SHAP features.
# Each engineering step is best-effort: on failure it prints the error and
# the pipeline continues without that step's columns.

# --- Load Data ---
sample_clean_folder = 'cleaned_output/'
parquet_clean_files = glob.glob(sample_clean_folder + "*.parquet")
sample_clean_file = sample_clean_folder + os.path.basename(parquet_clean_files[0])

# Load all needed columns (expanded for feature engineering)
base_features = [
    # Microbiology
    'organism_x', 'antibiotic_x', 'resistant_time_to_culturetime', 'was_positive_x',

    # Demographics
    'age', 'gender', 'adi_score', 'adi_state_rank',

    # Labs (medians + first/last for trends)
    'median_wbc', 'median_neutrophils', 'median_lymphocytes',
    'median_hgb', 'median_plt', 'median_na', 'median_hco3',
    'median_bun', 'median_cr', 'median_lactate', 'median_procalcitonin',
    'first_wbc', 'last_wbc', 'first_neutrophils', 'last_neutrophils',
    'first_lymphocytes', 'last_lymphocytes', 'first_lactate', 'last_lactate',
    'first_cr', 'last_cr', 'first_procalcitonin', 'last_procalcitonin',

    # Vitals
    'median_heartrate', 'median_resprate', 'median_temp',
    'median_sysbp', 'median_diasbp',

    # Treatment
    'medication_category', 'medication_time_to_culturetime', 'nursing_home_visit_culture'
]

target_column = 'susceptibility_label'
df = pd.read_parquet(sample_clean_file, columns=base_features + [target_column])
print("Initial shape:", df.shape)

# --- Convert ALL Numeric Columns ---
numeric_cols = [
    'median_wbc', 'median_neutrophils', 'median_lymphocytes',
    'median_hgb', 'median_plt', 'median_na', 'median_hco3',
    'median_bun', 'median_cr', 'median_lactate', 'median_procalcitonin',
    'first_wbc', 'last_wbc', 'first_neutrophils', 'last_neutrophils',
    'first_lymphocytes', 'last_lymphocytes', 'first_lactate', 'last_lactate',
    'first_cr', 'last_cr', 'first_procalcitonin', 'last_procalcitonin',
    'median_heartrate', 'median_resprate', 'median_temp',
    'median_sysbp', 'median_diasbp',
    'age', 'adi_score'
]

# Convert to numeric (coerce errors to NaN)
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Validate conversion
print("\nData types after conversion:")
print(df[numeric_cols].dtypes)

# --- Feature Engineering ---
# 1. Calculate Trends (with safe division)
def safe_divide(a, b):
    """Divide Series `a` by Series `b`, replacing exact zeros in `b` with 1e-6.

    Missing values in either operand propagate to the result.
    """
    return a / b.replace(0, 1e-6)

try:
    df['wbc_trend'] = safe_divide((df['last_wbc'] - df['first_wbc']), df['first_wbc'])
    df['lactate_trend'] = safe_divide((df['last_lactate'] - df['first_lactate']), df['first_lactate'])
    df['cr_trend'] = safe_divide((df['last_cr'] - df['first_cr']), df['first_cr'])
    df['procalcitonin_trend'] = safe_divide((df['last_procalcitonin'] - df['first_procalcitonin']), df['first_procalcitonin'])
except Exception as e:
    print(f"Error calculating trends: {e}")

# 2. Clinical Ratios (with error handling)
try:
    df['neutrophil_to_lymphocyte_ratio'] = safe_divide(df['median_neutrophils'], df['median_lymphocytes'])
    df['bun_to_cr_ratio'] = safe_divide(df['median_bun'], df['median_cr'])
except Exception as e:
    print(f"Error calculating ratios: {e}")

# 3. Interaction Terms (with error handling)
# NOTE(review): PolynomialFeatures presumably rejects missing values, and the
# one-hot expansion of organism x antibiotic can be very wide; if either is a
# problem this step reports the error and is skipped — confirm on real data.
try:
    interaction_terms = ['organism_x', 'antibiotic_x', 'median_lactate', 'median_procalcitonin']
    df_interactions = pd.get_dummies(df[interaction_terms], columns=['organism_x', 'antibiotic_x'])
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    interaction_features = poly.fit_transform(df_interactions)
    interaction_cols = poly.get_feature_names_out(df_interactions.columns)
    # Reuse df's index so the column-wise concat aligns row-for-row.
    df_interactions = pd.DataFrame(interaction_features, columns=interaction_cols, index=df.index)
    # The transformer output also contains the degree-1 inputs; drop any that
    # already exist in df so the concat does not create duplicate column names.
    df_interactions = df_interactions.loc[:, ~df_interactions.columns.isin(df.columns)]
    df = pd.concat([df, df_interactions], axis=1)
except Exception as e:
    print(f"Error creating interaction terms: {e}")

# --- Feature Selection ---
top_features = []  # Fallback to manual selection if SHAP ranking fails
try:
    # Prepare Data for SHAP Analysis
    X = df.drop(columns=[target_column]).select_dtypes(include=['number'])
    # The stored labels are the strings 'S' / 'R'; the classifier and the
    # class-weight arithmetic need integers, so encode Resistant as the
    # positive class (1) and Susceptible as 0.
    y = (df[target_column] == 'R').astype(int)

    n_pos = int(y.sum())
    n_neg = len(y) - n_pos
    if n_pos == 0 or n_neg == 0:
        raise ValueError("both 'S' and 'R' labels are required for feature selection")

    # Train XGBoost with Class Balancing (weight = negatives / positives)
    model = xgb.XGBClassifier(
        scale_pos_weight=n_neg / n_pos,
        random_state=42,
        enable_categorical=True
    )
    model.fit(X, y)

    # SHAP Importance: mean absolute SHAP value per feature, top 30
    explainer = shap.Explainer(model)
    shap_values = explainer(X)
    top_features = pd.Series(shap_values.abs.mean(0).values, index=X.columns).nlargest(30).index.tolist()

except Exception as e:
    print(f"Error during feature selection: {e}")
    top_features = []  # Fallback to manual selection

# --- Final Feature Set ---
candidate_features = [
    # Core Features
    'organism_x', 'antibiotic_x', 'resistant_time_to_culturetime',
    'age', 'adi_score',

    # Engineered Features (if available)
    'lactate_trend', 'cr_trend', 'neutrophil_to_lymphocyte_ratio',

    # Top Features from SHAP (if available)
    *top_features,
]

# Keep only features that exist, and drop repeats while preserving order:
# a SHAP-ranked feature may also be in the manual list, and duplicate column
# names cannot be written to parquet.
final_features = list(dict.fromkeys(f for f in candidate_features if f in df.columns))
df_final = df[final_features + [target_column]]

# --- Save ---
output_path = "selected_features_optimized.parquet"
df_final.to_parquet(output_path, index=False)
print(f"\n✅ Saved optimized features to: {output_path}")
print("Final features used:", final_features)
print("Final shape:", df_final.shape)

Initial shape: (2184195, 40)

Data types after conversion:
median_wbc              Float64
median_neutrophils      Float64
median_lymphocytes      Float64
median_hgb              Float64
median_plt              Float64
median_na               Float64
median_hco3             Float64
median_bun              Float64
median_cr               Float64
median_lactate          Float64
median_procalcitonin    Float64
first_wbc               Float64
last_wbc                Float64
first_neutrophils       Float64
last_neutrophils        Float64
first_lymphocytes       Float64
last_lymphocytes        Float64
first_lactate           Float64
last_lactate            Float64
first_cr                Float64
last_cr                 Float64
first_procalcitonin     Float64
last_procalcitonin      Float64
median_heartrate        float32
median_resprate         float32
median_temp             float32
median_sysbp            float32
median_diasbp           float32
age                     Float64
adi_score    