# Static Preprocessing 

In [12]:
import pandas as pd
import os
import sys
import json
from pathlib import Path

# Notebook is in notebooks/, so repo root is parent
REPO_ROOT = Path.cwd().parent
SRC_PATH = REPO_ROOT / "src"

# Insert src at the front of sys.path so imports work
sys.path.insert(0, str(SRC_PATH))


In [13]:
from preprocessing.static_preprocessing import load_static_data

# Get repo root relative to the current notebook
repo_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Load static preprocessing config
config_path = os.path.join(repo_root, "configs", "static_preprocessing_params.json")
with open(config_path, "r") as f:
    config = json.load(f)

# Set input and output directories
in_dir = os.path.join(repo_root, config["paths"]["in_dir"])
out_dir = os.path.join(repo_root, config["paths"]["out_dir"])

# Pass in_dir as the first argument
hosp_patients_df, hosp_admissions_df, hosp_diagnosis_df, hosp_drgcodes_df, icustays_df, edstays_df, ed_diagnosis_df, record_list_df = load_static_data(in_dir, config)

## Clean ECG Reports 

In [14]:
from preprocessing.static_preprocessing import clean_cols_types

record_list_df_cleaned = clean_cols_types(record_list_df)
record_list_df_cleaned.head()

Unnamed: 0,subject_id,study_id,file_name,ecg_time,path
0,10000032,40689238,40689238,2180-07-23 08:44:00,files/p1000/p10000032/s40689238/40689238
1,10000032,44458630,44458630,2180-07-23 09:54:00,files/p1000/p10000032/s44458630/44458630
2,10000032,49036311,49036311,2180-08-06 09:07:00,files/p1000/p10000032/s49036311/49036311
3,10000117,45090959,45090959,2181-03-04 17:14:00,files/p1000/p10000117/s45090959/45090959
4,10000117,48446569,48446569,2183-09-18 13:52:00,files/p1000/p10000117/s48446569/48446569


In [40]:
record_list_df_cleaned.dtypes

subject_id             int64
study_id               int64
file_name              int64
ecg_time      datetime64[ns]
path          string[python]
dtype: object

## Cleaning Hospital Module

### Hospital Admissions

In [15]:
from preprocessing.static_preprocessing import preprocess_admissions

hosp_admissions_df_cleaned = preprocess_admissions(hosp_admissions_df)
hosp_admissions_df_cleaned.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,discharge_location,race,edregtime,edouttime
0,10106244,26713233,2147-05-09 10:34:00,2147-05-12 13:43:00,NaT,DIRECT EMER.,,HOME,WHITE,NaT,NaT
1,13700703,20448599,2172-09-25 01:01:00,2172-10-03 13:25:00,NaT,OBSERVATION ADMIT,,HOME,WHITE,2172-09-24 17:38:00,2172-09-25 03:07:00
2,15443666,27961368,2168-12-30 23:30:00,2169-01-05 16:02:00,NaT,OBSERVATION ADMIT,,HOME HEALTH CARE,BLACK/AFRICAN AMERICAN,2168-12-30 11:19:00,2168-12-31 01:22:00
3,16299919,26977065,2193-05-15 08:37:00,2193-05-17 16:03:00,NaT,OBSERVATION ADMIT,,HOSPICE,BLACK/AFRICAN AMERICAN,2193-05-15 04:36:00,2193-05-15 14:27:00
4,14149715,24191358,2181-10-25 19:37:00,2181-10-29 14:38:00,NaT,OBSERVATION ADMIT,P00230,SKILLED NURSING FACILITY,WHITE,2181-10-25 08:48:00,2181-10-26 15:18:00


In [41]:
hosp_admissions_df_cleaned.dtypes

subject_id                     int64
hadm_id                        int64
admittime             datetime64[ns]
dischtime             datetime64[ns]
deathtime             datetime64[ns]
admission_type        string[python]
admit_provider_id     string[python]
discharge_location    string[python]
race                  string[python]
edregtime             datetime64[ns]
edouttime             datetime64[ns]
dtype: object

### Hopsital Patient

In [16]:
from preprocessing.static_preprocessing import preprocess_patient

hosp_patients_df_cleaned = preprocess_patient(hosp_patients_df)
hosp_patients_df_cleaned.head()

Unnamed: 0,subject_id,gender,anchor_age,dod
0,11289691,F,18,NaT
1,11806971,F,18,NaT
2,12107404,F,18,NaT
3,12143996,F,18,NaT
4,13117076,F,18,NaT


In [42]:
hosp_patients_df_cleaned.dtypes

subject_id             int64
gender        string[python]
anchor_age             int64
dod           datetime64[ns]
dtype: object

### Hospital Drgcodes

In [None]:
from preprocessing.static_preprocessing import preprocess_drgcodes

hosp_drgcodes_df_cleaned = preprocess_drgcodes(hosp_drgcodes_df)

Unnamed: 0,subject_id,hadm_id,drg_type,drg_code,description,drg_severity,drg_mortality
0,10004235,22187210,HCFA,864,FEVER,,
1,10013643,27433745,HCFA,864,FEVER,,
2,10014610,27408652,HCFA,864,FEVER,,
3,10014610,28254713,HCFA,864,FEVER,,
4,10033552,20193539,HCFA,864,FEVER,,


In [43]:
hosp_drgcodes_df_cleaned.dtypes

subject_id            int64
hadm_id               int64
drg_code_apr         object
description_apr      object
drg_severity_apr     object
drg_mortality_apr    object
drg_code_hcfa        object
description_hcfa     object
dtype: object

### Hospital Diagnosis 

In [64]:
hosp_diagnosis_df.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,long_title
0,10000935,26381316,27,78052,9,"Insomnia, unspecified"
1,10000980,25242409,27,44021,9,Atherosclerosis of native arteries of the extr...
2,10000980,25242409,28,27800,9,"Obesity, unspecified"
3,10000980,25242409,29,V8522,9,"Body Mass Index 26.0-26.9, adult"
4,10000980,25242409,30,72992,9,Nontraumatic hematoma of soft tissue


In [None]:
from preprocessing.static_preprocessing import clean_diagnosis_data

hosp_diagnosis_df_cleaned = clean_diagnosis_data(hosp_diagnosis_df, 'hosp')
hosp_diagnosis_df_cleaned.head()


Unnamed: 0,subject_id,hadm_id,hosp_icd_codes_diagnosis,hosp_diagnosis
0,10000032,22595853,"[5723, 78959, 5715, 07070, 496, 29680, 30981, ...","[Portal hypertension, Other ascites, Cirrhosis..."
1,10000032,22841357,"[07071, 78959, 2875, 2761, 496, 5715, V08, 3051]",[Unspecified viral hepatitis C with hepatic co...
2,10000032,25742920,"[07054, 78959, V462, 5715, 2767, 2761, 496, V0...",[Chronic hepatitis C without mention of hepati...
3,10000032,29079034,"[45829, 07044, 7994, 2761, 78959, 2767, 3051, ...","[Other iatrogenic hypotension, Chronic hepatit..."
4,10000117,22927623,"[R1310, R0989, K31819, K219, K449, F419, I341,...","[Dysphagia, unspecified, Other specified sympt..."


In [44]:
hosp_diagnosis_df_cleaned.dtypes

subject_id                   int64
hadm_id                      int64
hosp_icd_codes_diagnosis    object
hosp_diagnosis              object
dtype: object

## ICU 

In [58]:
# from preprocessing.static_preprocessing import preprocess_icustays

def add_prefix_to_columns(df, prefix):
    """
    Add prefix to all columns except subject_id and hadm_id.
    """
    exclude_cols = ['subject_id', 'hadm_id']
    
    rename_dict = {
        col: f"{prefix}_{col}" 
        for col in df.columns 
        if col not in exclude_cols
    }
    
    return clean_cols_types(df.rename(columns=rename_dict))

icustays_df_cleaned = add_prefix_to_columns(icustays_df, 'icu')
icustays_df_cleaned.head()

Unnamed: 0,subject_id,hadm_id,icu_stay_id,icu_first_careunit,icu_last_careunit,icu_intime,icu_outtime,icu_los
0,10270644,20019675,35548343,PACU,PACU,2159-12-03 16:20:31,2159-12-08 17:28:42,5.04735
1,10368426,21588639,39194905,PACU,PACU,2164-12-30 13:29:21,2164-12-30 14:00:38,0.021725
2,10640410,25898987,34344828,PACU,PACU,2112-02-03 12:55:23,2112-02-08 15:14:54,5.096887
3,10691194,24438843,37799251,PACU,PACU,2147-06-01 17:38:48,2147-06-01 17:58:44,0.013843
4,11162329,26304963,39444424,PACU,PACU,2137-06-11 19:49:23,2137-06-12 14:54:47,0.795417


In [59]:
icustays_df_cleaned.dtypes

subject_id                     int64
hadm_id                        int64
icu_stay_id                    int64
icu_first_careunit    string[python]
icu_last_careunit     string[python]
icu_intime            datetime64[ns]
icu_outtime           datetime64[ns]
icu_los                      float64
dtype: object

## Emergency Department
### ED Stays

In [60]:
edstays_df_cleaned = add_prefix_to_columns(edstays_df, 'ed')
edstays_df_cleaned.head()

Unnamed: 0,subject_id,hadm_id,ed_stay_id,ed_intime,ed_outtime,ed_gender,ed_race,ed_arrival_transport,ed_disposition
0,10049341,20677333.0,34255415,2171-04-07 17:48:00,2171-04-08 09:31:00,F,ASIAN,WALK IN,HOME
1,10049341,,35767475,2170-08-29 18:20:00,2170-08-29 22:46:00,F,ASIAN,WALK IN,HOME
2,10049341,,36382949,2171-11-19 20:09:00,2171-11-20 00:03:00,F,ASIAN,WALK IN,HOME
3,10049341,,36490047,2174-11-29 19:39:00,2174-11-30 00:49:00,F,ASIAN,WALK IN,HOME
4,10049341,,37283116,2174-01-26 20:10:00,2174-01-27 00:34:00,F,ASIAN,WALK IN,HOME


In [61]:
edstays_df_cleaned.dtypes

subject_id                       int64
hadm_id                        float64
ed_stay_id                       int64
ed_intime               datetime64[ns]
ed_outtime              datetime64[ns]
ed_gender               string[python]
ed_race                 string[python]
ed_arrival_transport    string[python]
ed_disposition          string[python]
dtype: object

### ED Diagnosis

In [None]:
ed_diagnosis_df_cleaned = clean_diagnosis_data(ed_diagnosis_df, 'ed')
ed_diagnosis_df_cleaned.head()

KeyError: 'hadm_id'

In [62]:
ed_diagnosis_df.head()

Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_version,icd_title
0,16253470,37248432,1,35,9,ERYSIPELAS
1,10396349,36517744,1,42,9,HIV DISEASE
2,10429665,38305760,1,42,9,HIV DISEASE
3,10446182,31712472,1,42,9,HIV DISEASE
4,11053554,34764404,1,42,9,HIV DISEASE


## Merge Hospital, ICU, and ED Data

In [20]:
df_hosp_patient = df_hosp_admissions_cleaned.merge(df_hosp_patient_cleaned, on="subject_id")
df_hosp_patient['death_time'] = df_hosp_patient['deathtime'].combine_first(df_hosp_patient['dod'])
df_hosp_patient.drop(columns=['dod', 'deathtime'], inplace=True)
df_hosp_patient.head()

NameError: name 'df_hosp_admissions_cleaned' is not defined

In [None]:
df_hosp_patient.shape

(395931, 13)

In [None]:
# missing values only because no diagnosis maybe in icu or ed?

df_hosp_patient_diag = df_hosp_patient.merge(df_diagnosis_cleaned, on=["subject_id", "hadm_id"], how="left")
df_hosp_patient_diag

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,admission_type,admit_provider_id,discharge_location,race,edregtime,edouttime,gender,anchor_age,death_time,hosp_icd_codes_diagnosis,hosp_diagnosis
0,10106244,26713233,2147-05-09 10:34:00,2147-05-12 13:43:00,DIRECT EMER.,,HOME,WHITE,NaT,NaT,F,60,NaT,"[I25110, I2542, I501, T82593A, J9811, I9589, Y...",[Atherosclerotic heart disease of native coron...
1,13700703,20448599,2172-09-25 01:01:00,2172-10-03 13:25:00,OBSERVATION ADMIT,,HOME,WHITE,2172-09-24 17:38:00,2172-09-25 03:07:00,F,56,NaT,"[K224, R1013, E039, G4733, G894, D72829, Y848,...","[Dyskinesia of esophagus, Epigastric pain, Hyp..."
2,15443666,27961368,2168-12-30 23:30:00,2169-01-05 16:02:00,OBSERVATION ADMIT,,HOME HEALTH CARE,BLACK/AFRICAN AMERICAN,2168-12-30 11:19:00,2168-12-31 01:22:00,F,75,2175-03-01,"[T82538A, E1140, I509, I482, I10, I714, K5900,...",[Leakage of other cardiac and vascular devices...
3,16299919,26977065,2193-05-15 08:37:00,2193-05-17 16:03:00,OBSERVATION ADMIT,,HOSPICE,BLACK/AFRICAN AMERICAN,2193-05-15 04:36:00,2193-05-15 14:27:00,F,91,2194-02-03,"[I120, N179, N185, E1121, F0390, Z794, Z87891,...",[Hypertensive chronic kidney disease with stag...
4,14149715,24191358,2181-10-25 19:37:00,2181-10-29 14:38:00,OBSERVATION ADMIT,P00230,SKILLED NURSING FACILITY,WHITE,2181-10-25 08:48:00,2181-10-26 15:18:00,F,64,2186-05-01,"[S12120A, I4891, W109XXA, Y92009, I10, E785, I...","[Other displaced dens fracture, initial encoun..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395926,16711441,22500229,2179-11-16 17:15:00,2179-11-18 12:45:00,OBSERVATION ADMIT,P99Z33,HOME HEALTH CARE,HISPANIC/LATINO - DOMINICAN,2179-11-16 12:06:00,2179-11-16 19:06:00,M,58,NaT,"[L02612, L03116, I10, E119, S90872A, E785, W57...","[Cutaneous abscess of left foot, Cellulitis of..."
395927,18766758,25415341,2131-12-08 18:55:00,2131-12-12 16:15:00,OBSERVATION ADMIT,P99Z33,HOME HEALTH CARE,WHITE,2131-12-08 14:28:00,2131-12-08 19:46:00,M,34,NaT,"[E11621, L97511, M216X1, L97421, L03115, Z9481...","[Type 2 diabetes mellitus with foot ulcer, Non..."
395928,11286186,23566382,2157-04-08 09:15:00,2157-04-09 17:00:00,DIRECT OBSERVATION,P99Z33,,WHITE,NaT,NaT,F,49,NaT,"[S92341A, S92351A, W19XXXA, Y929, G40802, K219...","[Displaced fracture of fourth metatarsal bone,..."
395929,16578860,26155863,2150-12-07 03:38:00,2150-12-08 16:11:00,DIRECT OBSERVATION,P99Z33,,HISPANIC/LATINO - DOMINICAN,NaT,NaT,M,64,NaT,"[M86171, I70261, I10, F17210, Z7902, Z95828, E...","[Other acute osteomyelitis, right ankle and fo..."
