# Static Preprocessing 

In [None]:
import pandas as pd
import os
import sys
import json
from pathlib import Path

# The notebook lives in notebooks/, so the repository root is the parent of the
# current working directory (assumes the kernel was started from notebooks/).
REPO_ROOT = Path.cwd().parent
SRC_PATH = REPO_ROOT / "src"

# Put src/ at the front of sys.path so project imports resolve.
# Guarded so that re-running this cell does not stack duplicate entries.
if str(SRC_PATH) not in sys.path:
    sys.path.insert(0, str(SRC_PATH))


In [None]:
from preprocessing.static_preprocessing import load_static_data

# Repository root, resolved relative to the notebook's working directory
# (assumes the kernel's cwd is notebooks/).
repo_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Load the static preprocessing config. The encoding is pinned to UTF-8 so the
# file parses the same way regardless of the platform's default locale encoding.
config_path = os.path.join(repo_root, "configs", "static_preprocessing_params.json")
with open(config_path, "r", encoding="utf-8") as f:
    config = json.load(f)

# Resolve the configured input and output directories against the repo root.
in_dir = os.path.join(repo_root, config["paths"]["in_dir"])
out_dir = os.path.join(repo_root, config["paths"]["out_dir"])

# load_static_data is a project helper; judging by the names it is unpacked
# into, it returns one object per static source (exact types not visible here).
patients, admissions, hosp_diagnosis, drgcodes, icustays, edstays, ed_diagnosis, record_list = load_static_data(config)



Found CSV files:
ed_diagnosis.csv
hosp_admissions.csv
hosp_patient.csv
edstays.csv
drgcodes.csv
record_list.csv
hosp_diagnosis_icd.csv
icustays.csv


## Clean ECG Reports 
- Flatten the reports into a single column, clean the dtypes, and remove missing leads.

In [156]:
# Build full path to the ECG record list CSV
# (the file name comes from the "static_sources" section of the config).
ecg_record_filename = config["static_sources"]["record_list"]
ecg_record_path = os.path.join(in_dir, ecg_record_filename)

# Read the raw record list, print its (rows, columns) shape, then preview the first rows.
df_ecg_record = pd.read_csv(ecg_record_path)
print(df_ecg_record.shape)
df_ecg_record.head()

(800035, 5)


Unnamed: 0,subject_id,study_id,file_name,ecg_time,path
0,10000032,40689238,40689238,2180-07-23 08:44:00,files/p1000/p10000032/s40689238/40689238
1,10000032,44458630,44458630,2180-07-23 09:54:00,files/p1000/p10000032/s44458630/44458630
2,10000032,49036311,49036311,2180-08-06 09:07:00,files/p1000/p10000032/s49036311/49036311
3,10000117,45090959,45090959,2181-03-04 17:14:00,files/p1000/p10000117/s45090959/45090959
4,10000117,48446569,48446569,2183-09-18 13:52:00,files/p1000/p10000117/s48446569/48446569


In [161]:
# NOTE(review): the last recorded run of this cell failed with
# "ImportError: cannot import name 'clean_cols_types'" — confirm the helper still
# exists under this name in src/preprocessing/static_preprocessing.py.
from preprocessing.static_preprocessing import clean_cols_types

# Run the project helper on the raw ECG record list (presumably it normalizes
# column dtypes, going by its name — verify against the module) and preview the result.
df_ecg_record_cleaned = clean_cols_types(df_ecg_record)
df_ecg_record_cleaned.head()

ImportError: cannot import name 'clean_cols_types' from 'preprocessing.static_preprocessing' (/Users/brandonng/Documents/GitHub/ClinicalDigitalTwin/notebooks/../src/preprocessing/static_preprocessing.py)

## Cleaning Hospital Module

### Hospital Diagnosis

In [158]:
from preprocessing.static_preprocessing import clean_diagnosis_data

# Locate the hospital diagnosis CSV named in the config's "static_sources" section.
hosp_diagnosis_filename = config["static_sources"]["hosp_diagnosis"]
hosp_diagnosis_path = os.path.join(in_dir, hosp_diagnosis_filename)

# Raw diagnosis rows exactly as stored on disk.
df_hosp_diagnosis = pd.read_csv(hosp_diagnosis_path)

# Project helper; in the recorded output its result has list-valued
# hosp_icd_codes_diagnosis / hosp_diagnosis columns alongside subject_id and hadm_id.
df_diagnosis_cleaned = clean_diagnosis_data(df_hosp_diagnosis)

df_diagnosis_cleaned.head()


Unnamed: 0,subject_id,hadm_id,hosp_icd_codes_diagnosis,hosp_diagnosis
0,10000032,22595853,"[5723, 78959, 5715, 07070, 496, 29680, 30981, ...","[Portal hypertension, Other ascites, Cirrhosis..."
1,10000032,22841357,"[07071, 78959, 2875, 2761, 496, 5715, V08, 3051]",[Unspecified viral hepatitis C with hepatic co...
2,10000032,25742920,"[07054, 78959, V462, 5715, 2767, 2761, 496, V0...",[Chronic hepatitis C without mention of hepati...
3,10000032,29079034,"[45829, 07044, 7994, 2761, 78959, 2767, 3051, ...","[Other iatrogenic hypotension, Chronic hepatit..."
4,10000117,22927623,"[R1310, R0989, K31819, K219, K449, F419, I341,...","[Dysphagia, unspecified, Other specified sympt..."


### Hospital Admissions

In [None]:
# NOTE(review): the last recorded run of this cell failed with
# "ImportError: cannot import name 'preprocess_admissions'" — confirm the helper
# still exists under this name in src/preprocessing/static_preprocessing.py.
from preprocessing.static_preprocessing import preprocess_admissions

# Locate the admissions CSV named in the config's "static_sources" section.
hosp_admissions_filename = config["static_sources"]["admissions"]
hosp_admissions_path = os.path.join(in_dir, hosp_admissions_filename)

# Raw admissions rows exactly as stored on disk.
df_hosp_admissions = pd.read_csv(hosp_admissions_path)

# The cleaned frame is the left-hand input of the admissions/patient merge further down.
df_hosp_admissions_cleaned = preprocess_admissions(df_hosp_admissions)
df_hosp_admissions_cleaned.head()

ImportError: cannot import name 'preprocess_admissions' from 'preprocessing.static_preprocessing' (/Users/brandonng/Documents/GitHub/ClinicalDigitalTwin/notebooks/../src/preprocessing/static_preprocessing.py)

In [None]:
# NOTE(review): the last recorded run of this cell failed with
# "ImportError: cannot import name 'preprocess_patient'" — confirm the helper
# still exists under this name in src/preprocessing/static_preprocessing.py.
from preprocessing.static_preprocessing import preprocess_patient

# Locate the patients CSV named in the config's "static_sources" section.
hos_patient_filename = config["static_sources"]["patients"]
hos_patient_path = os.path.join(in_dir, hos_patient_filename)

# Raw patient rows exactly as stored on disk.
# NOTE(review): the name df_hosp_patient is rebound to the merged
# admissions + patient frame in the "Merge dfs" section.
df_hosp_patient = pd.read_csv(hos_patient_path)

# The cleaned frame is the right-hand input of the admissions/patient merge further down.
df_hosp_patient_cleaned = preprocess_patient(df_hosp_patient)
df_hosp_patient_cleaned.head()

ImportError: cannot import name 'preprocess_patient' from 'preprocessing.static_preprocessing' (/Users/brandonng/Documents/GitHub/ClinicalDigitalTwin/notebooks/../src/preprocessing/static_preprocessing.py)

In [None]:
# Locate the ED diagnosis CSV named in the config's "static_sources" section.
ed_diagnosis_filename = config["static_sources"]["ed_diagnosis"]
ed_diagnosis_path = os.path.join(in_dir, ed_diagnosis_filename)

# Read the table as-is (no cleaning step is applied in this cell) and preview it.
ed_diagnosis_patient = pd.read_csv(ed_diagnosis_path)
ed_diagnosis_patient.head()

Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_version,icd_title
0,16253470,37248432,1,35,9,ERYSIPELAS
1,10396349,36517744,1,42,9,HIV DISEASE
2,10429665,38305760,1,42,9,HIV DISEASE
3,10446182,31712472,1,42,9,HIV DISEASE
4,11053554,34764404,1,42,9,HIV DISEASE


In [None]:
# Inspect the column dtypes pandas inferred for the raw ED diagnosis table.
ed_diagnosis_patient.dtypes

subject_id      int64
stay_id         int64
seq_num         int64
icd_code       object
icd_version     int64
icd_title      object
dtype: object

## Merge DataFrames

In [None]:
# Attach patient attributes to every admission. The default inner join on
# subject_id is kept; validate="many_to_one" makes pandas raise a MergeError if
# the cleaned patient table ever carries more than one row per subject_id,
# which would otherwise silently duplicate admissions.
# death_time takes `deathtime` where present and falls back to `dod` where it
# is missing; both source columns are then dropped.
# Built as one chain (no inplace mutation) so a failure part-way through cannot
# leave a half-transformed frame bound to the name.
df_hosp_patient = (
    df_hosp_admissions_cleaned
    .merge(df_hosp_patient_cleaned, on="subject_id", validate="many_to_one")
    .assign(death_time=lambda merged: merged["deathtime"].combine_first(merged["dod"]))
    .drop(columns=["dod", "deathtime"])
)
df_hosp_patient.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,admission_type,admit_provider_id,discharge_location,race,edregtime,edouttime,gender,anchor_age,death_time
0,10106244,26713233,2147-05-09 10:34:00,2147-05-12 13:43:00,DIRECT EMER.,,HOME,WHITE,NaT,NaT,F,60,NaT
1,13700703,20448599,2172-09-25 01:01:00,2172-10-03 13:25:00,OBSERVATION ADMIT,,HOME,WHITE,2172-09-24 17:38:00,2172-09-25 03:07:00,F,56,NaT
2,15443666,27961368,2168-12-30 23:30:00,2169-01-05 16:02:00,OBSERVATION ADMIT,,HOME HEALTH CARE,BLACK/AFRICAN AMERICAN,2168-12-30 11:19:00,2168-12-31 01:22:00,F,75,2175-03-01
3,16299919,26977065,2193-05-15 08:37:00,2193-05-17 16:03:00,OBSERVATION ADMIT,,HOSPICE,BLACK/AFRICAN AMERICAN,2193-05-15 04:36:00,2193-05-15 14:27:00,F,91,2194-02-03
4,14149715,24191358,2181-10-25 19:37:00,2181-10-29 14:38:00,OBSERVATION ADMIT,P00230,SKILLED NURSING FACILITY,WHITE,2181-10-25 08:48:00,2181-10-26 15:18:00,F,64,2186-05-01


In [None]:
# (rows, columns) of the merged admissions + patient frame, before diagnoses are joined.
df_hosp_patient.shape

(395931, 13)

In [None]:
# Left join keeps every admission; admissions with no row in the cleaned
# diagnosis table get NaN in hosp_icd_codes_diagnosis / hosp_diagnosis.
# Open question carried over from the original note: are those gaps only
# admissions whose diagnoses were recorded in the ICU or ED tables instead?
# validate="many_to_one" guards the assumption that df_diagnosis_cleaned has at
# most one row per (subject_id, hadm_id), so the join cannot add rows.
df_hosp_patient_diag = df_hosp_patient.merge(
    df_diagnosis_cleaned,
    on=["subject_id", "hadm_id"],
    how="left",
    validate="many_to_one",
)
df_hosp_patient_diag

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,admission_type,admit_provider_id,discharge_location,race,edregtime,edouttime,gender,anchor_age,death_time,hosp_icd_codes_diagnosis,hosp_diagnosis
0,10106244,26713233,2147-05-09 10:34:00,2147-05-12 13:43:00,DIRECT EMER.,,HOME,WHITE,NaT,NaT,F,60,NaT,"[I25110, I2542, I501, T82593A, J9811, I9589, Y...",[Atherosclerotic heart disease of native coron...
1,13700703,20448599,2172-09-25 01:01:00,2172-10-03 13:25:00,OBSERVATION ADMIT,,HOME,WHITE,2172-09-24 17:38:00,2172-09-25 03:07:00,F,56,NaT,"[K224, R1013, E039, G4733, G894, D72829, Y848,...","[Dyskinesia of esophagus, Epigastric pain, Hyp..."
2,15443666,27961368,2168-12-30 23:30:00,2169-01-05 16:02:00,OBSERVATION ADMIT,,HOME HEALTH CARE,BLACK/AFRICAN AMERICAN,2168-12-30 11:19:00,2168-12-31 01:22:00,F,75,2175-03-01,"[T82538A, E1140, I509, I482, I10, I714, K5900,...",[Leakage of other cardiac and vascular devices...
3,16299919,26977065,2193-05-15 08:37:00,2193-05-17 16:03:00,OBSERVATION ADMIT,,HOSPICE,BLACK/AFRICAN AMERICAN,2193-05-15 04:36:00,2193-05-15 14:27:00,F,91,2194-02-03,"[I120, N179, N185, E1121, F0390, Z794, Z87891,...",[Hypertensive chronic kidney disease with stag...
4,14149715,24191358,2181-10-25 19:37:00,2181-10-29 14:38:00,OBSERVATION ADMIT,P00230,SKILLED NURSING FACILITY,WHITE,2181-10-25 08:48:00,2181-10-26 15:18:00,F,64,2186-05-01,"[S12120A, I4891, W109XXA, Y92009, I10, E785, I...","[Other displaced dens fracture, initial encoun..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395926,16711441,22500229,2179-11-16 17:15:00,2179-11-18 12:45:00,OBSERVATION ADMIT,P99Z33,HOME HEALTH CARE,HISPANIC/LATINO - DOMINICAN,2179-11-16 12:06:00,2179-11-16 19:06:00,M,58,NaT,"[L02612, L03116, I10, E119, S90872A, E785, W57...","[Cutaneous abscess of left foot, Cellulitis of..."
395927,18766758,25415341,2131-12-08 18:55:00,2131-12-12 16:15:00,OBSERVATION ADMIT,P99Z33,HOME HEALTH CARE,WHITE,2131-12-08 14:28:00,2131-12-08 19:46:00,M,34,NaT,"[E11621, L97511, M216X1, L97421, L03115, Z9481...","[Type 2 diabetes mellitus with foot ulcer, Non..."
395928,11286186,23566382,2157-04-08 09:15:00,2157-04-09 17:00:00,DIRECT OBSERVATION,P99Z33,,WHITE,NaT,NaT,F,49,NaT,"[S92341A, S92351A, W19XXXA, Y929, G40802, K219...","[Displaced fracture of fourth metatarsal bone,..."
395929,16578860,26155863,2150-12-07 03:38:00,2150-12-08 16:11:00,DIRECT OBSERVATION,P99Z33,,HISPANIC/LATINO - DOMINICAN,NaT,NaT,M,64,NaT,"[M86171, I70261, I10, F17210, Z7902, Z95828, E...","[Other acute osteomyelitis, right ankle and fo..."
