In [1]:
# 2_data_harmonization.ipynb
# Notebook version of 2_data_harmonization.py for JupyterLab

# ----------------------------------
# 1. Import libraries
# ----------------------------------
import pandas as pd
import os

# ----------------------------------
# 2. Load curated pediatric PK master and metadata
# ----------------------------------
# Input and output locations used by this harmonization step.
CURATED_FILE = '/data/curated/pediatric_pk_master.csv'
METADATA_FILE = '/data/raw/metadata.csv'
HARMONIZED_FILE = '/data/curated/pediatric_pk_master_harmonized.csv'

# Create the output folder now so the save step does not fail on a missing directory.
output_dir = os.path.dirname(HARMONIZED_FILE)
os.makedirs(output_dir, exist_ok=True)

# Read the curated PK master table and the per-study metadata table.
df = pd.read_csv(CURATED_FILE)
metadata = pd.read_csv(METADATA_FILE)

# Show the first rows of each input as a quick sanity check.
for title, frame in (
    ('Curated dataset preview:', df),
    ('Metadata preview:', metadata),
):
    print(title)
    display(frame.head())

# ----------------------------------
# 3. Harmonize clearance units
# ----------------------------------
# Back-fill weight-normalized clearance in L/h/kg from the mL/min/kg column for
# rows where only the latter is available. Existing L/h/kg values are never
# overwritten.
if 'cl_ml_per_min_per_kg' in df.columns:
    if 'cl_l_per_h_per_kg' not in df.columns:
        # A master file that only carries mL/min/kg would otherwise be saved
        # without any harmonized clearance column.
        df['cl_l_per_h_per_kg'] = float('nan')
    mask = df['cl_l_per_h_per_kg'].isna() & df['cl_ml_per_min_per_kg'].notna()
    # 1 mL/min/kg = 60 mL/h/kg = 0.06 L/h/kg
    df.loc[mask, 'cl_l_per_h_per_kg'] = df.loc[mask, 'cl_ml_per_min_per_kg'] * 0.06

# ----------------------------------
# 4. Harmonize route names
# ----------------------------------
# Canonical route names keyed by the lower-case spellings seen in the raw data.
ROUTE_MAP = {
    'iv': 'intravenous',
    'oral': 'oral',
    'po': 'oral',
    'sc': 'subcutaneous',
}


def _harmonize_route(value):
    """Return the canonical route name for one raw ``route`` cell.

    Missing values are returned unchanged. Other values are converted to text,
    stripped of surrounding whitespace and lower-cased before the lookup, so
    entries such as ``'IV '`` or ``' po'`` still resolve. Routes that are not
    in ``ROUTE_MAP`` are returned in that normalized form.
    """
    if pd.isna(value):
        return value
    key = str(value).strip().lower()
    return ROUTE_MAP.get(key, key)


df['route_harmonized'] = df['route'].map(_harmonize_route)

# ----------------------------------
# 5. Merge with metadata and harmonize pathways
# ----------------------------------
# Short pathway codes keyed by the free-text pathway labels used in the metadata.
PATHWAY_MAP = {
    'CYP3A4 hepatic metabolism': 'hepatic_CYP3A4',
    'UGT2B7 hepatic metabolism': 'hepatic_UGT2B7',
    'UGT/renal': 'hepatic_UGT_renal',
    'renal': 'renal',
    'hepatic/biliary': 'hepatic_biliary',
    'CYP2D6 hepatic metabolism': 'hepatic_CYP2D6',
}

# Build a one-row-per-study lookup before merging:
# - rows without a study_id are dropped, because merge matches null keys to each
#   other and would attach their pathway to every PK row lacking a study_id;
# - exact duplicate (study_id, pathway) rows are collapsed, because each repeat
#   would otherwise duplicate the matching PK rows.
pathway_lookup = (
    metadata[['study_id', 'pathway']]
    .dropna(subset=['study_id'])
    .drop_duplicates()
)

# Merge on study_id. validate='many_to_one' raises pandas.errors.MergeError if a
# study still maps to more than one pathway, instead of silently multiplying rows.
df = df.merge(pathway_lookup, on='study_id', how='left', validate='many_to_one')

# Map known labels to their short code; unknown labels pass through (stripped),
# and missing pathways stay missing.
df['pathway_harmonized'] = df['pathway'].map(
    lambda x: PATHWAY_MAP.get(str(x).strip(), str(x).strip()) if pd.notna(x) else x
)

# ----------------------------------
# 6. Fill missing age and weight with median per drug
# ----------------------------------
# Impute only the missing cells with the median of the same drug.
# The per-drug median is computed separately and passed to fillna, so observed
# values are never replaced. Assigning the groupby/transform result directly
# would blank out observed values on rows whose 'drug' is missing, because
# groupby excludes null keys and transform yields NaN for those rows.
# Rows with no drug, or whose drug has no observed value, stay missing.
# NOTE(review): age_days is not back-filled to match an imputed age_years;
# confirm downstream consumers rely on age_years only.
for column in ('age_years', 'weight_kg'):
    if column in df.columns:
        per_drug_median = df.groupby('drug')[column].transform('median')
        df[column] = df[column].fillna(per_drug_median)

# ----------------------------------
# 7. Save harmonized dataset
# ----------------------------------
# Write the harmonized table without the DataFrame index, then report where it went.
df.to_csv(path_or_buf=HARMONIZED_FILE, index=False)
print(f'Harmonized dataset saved to {HARMONIZED_FILE}')

# Show the first rows of the final table.
harmonized_preview = df.head()
display(harmonized_preview)


Curated dataset preview:


Unnamed: 0,study_id,drug,subject_id,age_years,age_days,weight_kg,cl_l_per_h_per_kg,t_half_h,route,aggregated_row,...,source,source_file,age_group,cl_ml_per_min_per_kg,cl_l_per_h,cmin_ng_per_ml,visit,dose_mg,auc_mg_h_per_l,cmax_mg_per_l
0,vet_et_al_2014_agg,midazolam,,0.006,2.2,2.5,0.14,,iv,True,...,https://pmc.ncbi.nlm.nih.gov/articles/PMC3948203/,midazolam_raw.csv,,,,,,,,
1,vet_et_al_2014_agg,midazolam,,2.0,730.0,12.0,0.28,,iv,True,...,https://pmc.ncbi.nlm.nih.gov/articles/PMC3948203/,midazolam_raw.csv,,,,,,,,
2,morph_iv_surgery_1998_agg,morphine,morph_1998_1_1,0.005,,,,,iv,True,...,https://journals.lww.com/anesthesia-analgesia/...,morphine_raw.csv,1-7 days,9.2,,,,,,
3,morph_iv_surgery_1998_agg,morphine,morph_1998_1_2,0.005,,,,,iv,True,...,https://journals.lww.com/anesthesia-analgesia/...,morphine_raw.csv,1-7 days,9.2,,,,,,
4,morph_iv_surgery_1998_agg,morphine,morph_1998_1_3,0.005,,,,,iv,True,...,https://journals.lww.com/anesthesia-analgesia/...,morphine_raw.csv,1-7 days,9.2,,,,,,


Metadata preview:


Unnamed: 0,study_id,drug,source_file,pathway,elimination,formulation,notes
0,genta_neonate_2013_agg,gentamicin,gentamicin_raw.csv,renal,renal,iv,Neonatal clearance
1,metoprolol_fda_bpca_agg,metoprolol,metoprolol_raw.csv,CYP2D6 hepatic metabolism,hepatic/renal,oral,Pediatric summary Cmin/Cmax
2,vet_et_al_2014_agg,midazolam,midazolam_raw.csv,CYP3A4 hepatic metabolism,hepatic,iv,Critically ill neonates/children
3,morph_iv_surgery_1998_agg,morphine,morphine_raw.csv,UGT2B7 hepatic metabolism,hepatic,iv,Postoperative infants
4,simva_pbpk_children_2019_agg,simvastatin,simvastatin_raw.csv,hepatic/biliary,hepatic,oral,Children/adolescents PBPK model


Harmonized dataset saved to /Users/cmontefusco/Pediatric PK Data Repository/data/curated/pediatric_pk_master_harmonized.csv


Unnamed: 0,study_id,drug,subject_id,age_years,age_days,weight_kg,cl_l_per_h_per_kg,t_half_h,route,aggregated_row,...,cl_ml_per_min_per_kg,cl_l_per_h,cmin_ng_per_ml,visit,dose_mg,auc_mg_h_per_l,cmax_mg_per_l,route_harmonized,pathway,pathway_harmonized
0,vet_et_al_2014_agg,midazolam,,0.006,2.2,2.5,0.14,,iv,True,...,,,,,,,,intravenous,CYP3A4 hepatic metabolism,hepatic_CYP3A4
1,vet_et_al_2014_agg,midazolam,,2.0,730.0,12.0,0.28,,iv,True,...,,,,,,,,intravenous,CYP3A4 hepatic metabolism,hepatic_CYP3A4
2,morph_iv_surgery_1998_agg,morphine,morph_1998_1_1,0.005,,,0.552,,iv,True,...,9.2,,,,,,,intravenous,UGT2B7 hepatic metabolism,hepatic_UGT2B7
3,morph_iv_surgery_1998_agg,morphine,morph_1998_1_2,0.005,,,0.552,,iv,True,...,9.2,,,,,,,intravenous,UGT2B7 hepatic metabolism,hepatic_UGT2B7
4,morph_iv_surgery_1998_agg,morphine,morph_1998_1_3,0.005,,,0.552,,iv,True,...,9.2,,,,,,,intravenous,UGT2B7 hepatic metabolism,hepatic_UGT2B7
