# Static Preprocessing 

In [1]:
import pandas as pd
import os
import sys
import json
from pathlib import Path


# Notebook is in notebooks/, so repo root is parent
REPO_ROOT = Path.cwd().parent
SRC_PATH = REPO_ROOT / "src"

# Insert src at the front of sys.path so imports work
sys.path.insert(0, str(SRC_PATH))


In [2]:
import importlib
import preprocessing.static_preprocessing as sp

importlib.reload(sp)

<module 'preprocessing.static_preprocessing' from '/home/syamala/private/ClinicalDigitalTwin/src/preprocessing/static_preprocessing.py'>

In [3]:
from preprocessing.static_preprocessing import load_static_data

# Get repo root relative to the current notebook
repo_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Load static preprocessing config
config_path = os.path.join(repo_root, "configs", "static_preprocessing_params.json")
with open(config_path, "r") as f:
    config = json.load(f)

# Set input and output directories
in_dir = os.path.join(repo_root, config["paths"]["in_dir"])
out_dir = os.path.join(repo_root, config["paths"]["out_dir"])

# Pass in_dir as the first argument
hosp_patients_df, hosp_admissions_df, hosp_diagnosis_df, hosp_drgcodes_df, icustays_df, edstays_df, ed_diagnosis_df, record_list_df = load_static_data(in_dir, config)

## Clean ECG Reports 

In [4]:
from preprocessing.static_preprocessing import clean_cols_types

record_list_df_cleaned = clean_cols_types(record_list_df)
record_list_df_cleaned.head()

Unnamed: 0,subject_id,study_id,file_name,ecg_time,path
0,10000032,40689238,40689238,2180-07-23 08:44:00,files/p1000/p10000032/s40689238/40689238
1,10000032,44458630,44458630,2180-07-23 09:54:00,files/p1000/p10000032/s44458630/44458630
2,10000032,49036311,49036311,2180-08-06 09:07:00,files/p1000/p10000032/s49036311/49036311
3,10000117,45090959,45090959,2181-03-04 17:14:00,files/p1000/p10000117/s45090959/45090959
4,10000117,48446569,48446569,2183-09-18 13:52:00,files/p1000/p10000117/s48446569/48446569


In [5]:
record_list_df_cleaned.dtypes

subject_id             int64
study_id               int64
file_name              int64
ecg_time      datetime64[ns]
path          string[python]
dtype: object

In [6]:
record_list_df_cleaned.shape

(381300, 5)

## Cleaning Hospital Module

### Hospital Admissions

In [7]:
from preprocessing.static_preprocessing import preprocess_admissions

hosp_admissions_df_cleaned = preprocess_admissions(hosp_admissions_df)
hosp_admissions_df_cleaned.head()

Unnamed: 0,subject_id,hadm_id,hosp_admittime,hosp_dischtime,deathtime,admission_type,discharge_location,race,edregtime,edouttime
0,10106244,26713233,2147-05-09 10:34:00,2147-05-12 13:43:00,NaT,DIRECT EMER.,HOME,WHITE,NaT,NaT
1,13700703,20448599,2172-09-25 01:01:00,2172-10-03 13:25:00,NaT,OBSERVATION ADMIT,HOME,WHITE,2172-09-24 17:38:00,2172-09-25 03:07:00
2,15443666,27961368,2168-12-30 23:30:00,2169-01-05 16:02:00,NaT,OBSERVATION ADMIT,HOME HEALTH CARE,BLACK/AFRICAN AMERICAN,2168-12-30 11:19:00,2168-12-31 01:22:00
3,16299919,26977065,2193-05-15 08:37:00,2193-05-17 16:03:00,NaT,OBSERVATION ADMIT,HOSPICE,BLACK/AFRICAN AMERICAN,2193-05-15 04:36:00,2193-05-15 14:27:00
4,14149715,24191358,2181-10-25 19:37:00,2181-10-29 14:38:00,NaT,OBSERVATION ADMIT,SKILLED NURSING FACILITY,WHITE,2181-10-25 08:48:00,2181-10-26 15:18:00


In [8]:
hosp_admissions_df_cleaned.dtypes

subject_id                     int64
hadm_id                        int64
hosp_admittime        datetime64[ns]
hosp_dischtime        datetime64[ns]
deathtime             datetime64[ns]
admission_type        string[python]
discharge_location    string[python]
race                  string[python]
edregtime             datetime64[ns]
edouttime             datetime64[ns]
dtype: object

### Hopsital Patient

In [9]:
from preprocessing.static_preprocessing import preprocess_patient

hosp_patients_df_cleaned = preprocess_patient(hosp_patients_df)
hosp_patients_df_cleaned.head()

Unnamed: 0,subject_id,gender,anchor_age,dod
0,11289691,F,18,NaT
1,11806971,F,18,NaT
2,12107404,F,18,NaT
3,12143996,F,18,NaT
4,13117076,F,18,NaT


In [10]:
hosp_patients_df_cleaned.dtypes

subject_id             int64
gender        string[python]
anchor_age             int64
dod           datetime64[ns]
dtype: object

### Hospital Drgcodes

In [11]:
hosp_drgcodes_df[hosp_drgcodes_df['drg_type'] == 'APR'].shape

(156105, 7)

In [12]:
len(hosp_drgcodes_df)

446929

In [13]:
#NO APR?
from preprocessing.static_preprocessing import preprocess_drgcodes
hosp_drgcodes_df_cleaned = preprocess_drgcodes(hosp_drgcodes_df)

In [14]:
hosp_drgcodes_df_cleaned.dtypes

subject_id                    int64
hadm_id                       int64
drg_code_apr         string[python]
description_apr      string[python]
drg_severity_apr     string[python]
drg_mortality_apr    string[python]
drg_code_hcfa        string[python]
description_hcfa     string[python]
dtype: object

### Hospital Diagnosis 

In [15]:
hosp_diagnosis_df

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,long_title
0,10000935,26381316,27,78052,9,"Insomnia, unspecified"
1,10000980,25242409,27,44021,9,Atherosclerosis of native arteries of the extr...
2,10000980,25242409,28,27800,9,"Obesity, unspecified"
3,10000980,25242409,29,V8522,9,"Body Mass Index 26.0-26.9, adult"
4,10000980,25242409,30,72992,9,Nontraumatic hematoma of soft tissue
...,...,...,...,...,...,...
4843782,19994379,27052619,26,T502X5A,10,Adverse effect of carbonic-anhydrase inhibitor...
4843783,19994379,27334101,26,F329,10,"Major depressive disorder, single episode, uns..."
4843784,19995012,26194582,26,K580,10,Irritable bowel syndrome with diarrhea
4843785,19997473,27787494,26,Y92230,10,Patient room in hospital as the place of occur...


In [16]:
from preprocessing.static_preprocessing import clean_diagnosis_data

hosp_diagnosis_df_cleaned = clean_diagnosis_data(hosp_diagnosis_df, 'hosp')
hosp_diagnosis_df_cleaned.head()


Unnamed: 0,subject_id,hadm_id,hosp_icd_codes_diagnosis,hosp_diagnosis
0,10000032,22595853,"[5723, 78959, 5715, 07070, 496, 29680, 30981, ...","[Portal hypertension, Other ascites, Cirrhosis..."
1,10000032,22841357,"[07071, 78959, 2875, 2761, 496, 5715, V08, 3051]",[Unspecified viral hepatitis C with hepatic co...
2,10000032,25742920,"[07054, 78959, V462, 5715, 2767, 2761, 496, V0...",[Chronic hepatitis C without mention of hepati...
3,10000032,29079034,"[45829, 07044, 7994, 2761, 78959, 2767, 3051, ...","[Other iatrogenic hypotension, Chronic hepatit..."
4,10000117,22927623,"[R1310, R0989, K31819, K219, K449, F419, I341,...","[Dysphagia, unspecified, Other specified sympt..."


In [17]:
hosp_diagnosis_df_cleaned.dtypes

subject_id                   int64
hadm_id                      int64
hosp_icd_codes_diagnosis    object
hosp_diagnosis              object
dtype: object

## ICU 

### ICU Stays

In [18]:
icustays_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
0,10270644,20019675,35548343,PACU,PACU,2159-12-03 16:20:31,2159-12-08 17:28:42,5.04735
1,10368426,21588639,39194905,PACU,PACU,2164-12-30 13:29:21,2164-12-30 14:00:38,0.021725
2,10640410,25898987,34344828,PACU,PACU,2112-02-03 12:55:23,2112-02-08 15:14:54,5.096887
3,10691194,24438843,37799251,PACU,PACU,2147-06-01 17:38:48,2147-06-01 17:58:44,0.013843
4,11162329,26304963,39444424,PACU,PACU,2137-06-11 19:49:23,2137-06-12 14:54:47,0.795417


In [19]:
from preprocessing.static_preprocessing import preprocess_icustays

icustays_df_agg = preprocess_icustays(icustays_df)
icustays_df_agg.head()

Unnamed: 0,subject_id,hadm_id,icu_stay_id,icu_first_careunit,icu_last_careunit,icu_los,icu_count
0,10270644,20019675,[35548343],[PACU],[PACU],[5.047349537037037],1
1,10368426,21588639,"[39194905, 34185188, 38171960]","[PACU, Coronary Care Unit (CCU), Medical Inten...","[PACU, Coronary Care Unit (CCU), Medical Inten...","[0.021724537037037, 1.1422569444444444, 0.1307...",3
2,10640410,25898987,"[34344828, 35065956]","[PACU, Trauma SICU (TSICU)]","[PACU, Trauma SICU (TSICU)]","[5.096886574074073, 0.9944097222222222]",2
3,10691194,24438843,"[37799251, 34379928, 33563673]","[PACU, Coronary Care Unit (CCU), Surgical Inte...","[PACU, Coronary Care Unit (CCU), Surgical Inte...","[0.0138425925925925, 3.1843402777777774, 2.077...",3
4,11162329,26304963,"[39444424, 37586359]","[PACU, Neuro Intermediate]","[PACU, Neuro Intermediate]","[0.7954166666666665, 3.072453703703704]",2


In [20]:
icustays_df_agg.dtypes

subject_id             int64
hadm_id                int64
icu_stay_id           object
icu_first_careunit    object
icu_last_careunit     object
icu_los               object
icu_count              int64
dtype: object

## Emergency Department
### ED Stays

In [21]:
edstays_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,gender,race,arrival_transport,disposition
0,10049341,20677333.0,34255415,2171-04-07 17:48:00,2171-04-08 09:31:00,F,ASIAN,WALK IN,HOME
1,10049341,,35767475,2170-08-29 18:20:00,2170-08-29 22:46:00,F,ASIAN,WALK IN,HOME
2,10049341,,36382949,2171-11-19 20:09:00,2171-11-20 00:03:00,F,ASIAN,WALK IN,HOME
3,10049341,,36490047,2174-11-29 19:39:00,2174-11-30 00:49:00,F,ASIAN,WALK IN,HOME
4,10049341,,37283116,2174-01-26 20:10:00,2174-01-27 00:34:00,F,ASIAN,WALK IN,HOME


In [22]:
from preprocessing.static_preprocessing import add_prefix_to_columns

edstays_df_cleaned = add_prefix_to_columns(edstays_df, 'ed')
edstays_df_cleaned.head()

Unnamed: 0,subject_id,hadm_id,ed_stay_id,ed_intime,ed_outtime,ed_gender,ed_race,ed_arrival_transport,ed_disposition
0,10049341,20677333.0,34255415,2171-04-07 17:48:00,2171-04-08 09:31:00,F,ASIAN,WALK IN,HOME
1,10049341,,35767475,2170-08-29 18:20:00,2170-08-29 22:46:00,F,ASIAN,WALK IN,HOME
2,10049341,,36382949,2171-11-19 20:09:00,2171-11-20 00:03:00,F,ASIAN,WALK IN,HOME
3,10049341,,36490047,2174-11-29 19:39:00,2174-11-30 00:49:00,F,ASIAN,WALK IN,HOME
4,10049341,,37283116,2174-01-26 20:10:00,2174-01-27 00:34:00,F,ASIAN,WALK IN,HOME


In [23]:
edstays_df_cleaned.dtypes

subject_id                int64
hadm_id                 float64
ed_stay_id                int64
ed_intime                object
ed_outtime               object
ed_gender                object
ed_race                  object
ed_arrival_transport     object
ed_disposition           object
dtype: object

### ED Diagnosis

In [24]:
ed_diagnosis_df

Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_version,icd_title
0,16253470,37248432,1,035,9,ERYSIPELAS
1,10396349,36517744,1,042,9,HIV DISEASE
2,10429665,38305760,1,042,9,HIV DISEASE
3,10446182,31712472,1,042,9,HIV DISEASE
4,11053554,34764404,1,042,9,HIV DISEASE
...,...,...,...,...,...,...
378039,10254490,37321273,2,R200,10,Anesthesia of skin
378040,10257888,31820982,2,R200,10,Anesthesia of skin
378041,10269787,34285825,2,R200,10,Anesthesia of skin
378042,10301090,37711989,2,R200,10,Anesthesia of skin


In [25]:
ed_diagnosis_df_cleaned = clean_diagnosis_data(ed_diagnosis_df, 'ed')
ed_diagnosis_df_cleaned.head()

Unnamed: 0,subject_id,ed_stay_id,ed_icd_codes_diagnosis,ed_diagnosis
0,10000032,32952584,[4589],[HYPOTENSION NOS]
1,10000032,33258284,[5728],"[OTH SEQUELA, CHR LIV DIS]"
2,10000032,35968195,[5715],[CIRRHOSIS OF LIVER NOS]
3,10000032,38112554,[78959],[OTHER ASCITES]
4,10000032,39399961,[78097],[ALTERED MENTAL STATUS ]


## Merge Hospital, ICU, and ED Data

In [26]:
from preprocessing.static_preprocessing import merge_hosp

# def merge_hosp(admissions_df, patients_df, diagnosis_df, drgcodes_df, icustays_df, edstays_df, ed_diagnosis_df):

hosp_master_df = merge_hosp(hosp_admissions_df_cleaned, hosp_patients_df_cleaned, 
                            hosp_diagnosis_df_cleaned, hosp_drgcodes_df_cleaned, icustays_df_agg, 
                            edstays_df_cleaned, ed_diagnosis_df_cleaned)

print(hosp_master_df.shape)
hosp_master_df.head(5)

(530387, 34)


Unnamed: 0,subject_id,hadm_id,hosp_admittime,hosp_dischtime,admission_type,discharge_location,race,edregtime,edouttime,gender,...,icu_count,ed_stay_id,ed_intime,ed_outtime,ed_gender,ed_race,ed_arrival_transport,ed_disposition,ed_icd_codes_diagnosis,ed_diagnosis
0,10106244,26713233.0,2147-05-09 10:34:00,2147-05-12 13:43:00,DIRECT EMER.,HOME,WHITE,NaT,NaT,F,...,1.0,,,,,,,,,
1,13700703,20448599.0,2172-09-25 01:01:00,2172-10-03 13:25:00,OBSERVATION ADMIT,HOME,WHITE,2172-09-24 17:38:00,2172-09-25 03:07:00,F,...,,34625166.0,2172-09-24 17:38:00,2172-09-25 03:07:00,F,WHITE,WALK IN,ADMITTED,[R109],[Unspecified abdominal pain]
2,15443666,27961368.0,2168-12-30 23:30:00,2169-01-05 16:02:00,OBSERVATION ADMIT,HOME HEALTH CARE,BLACK/AFRICAN AMERICAN,2168-12-30 11:19:00,2168-12-31 01:22:00,F,...,1.0,37230309.0,2168-12-30 11:19:00,2168-12-30 23:32:17,F,BLACK/AFRICAN AMERICAN,WALK IN,ADMITTED,[R109],[Unspecified abdominal pain]
3,16299919,26977065.0,2193-05-15 08:37:00,2193-05-17 16:03:00,OBSERVATION ADMIT,HOSPICE,BLACK/AFRICAN AMERICAN,2193-05-15 04:36:00,2193-05-15 14:27:00,F,...,,32751575.0,2193-05-15 04:36:00,2193-05-15 08:38:20,F,BLACK/AFRICAN AMERICAN,AMBULANCE,ADMITTED,[I10],[Essential (primary) hypertension]
4,14149715,24191358.0,2181-10-25 19:37:00,2181-10-29 14:38:00,OBSERVATION ADMIT,SKILLED NURSING FACILITY,WHITE,2181-10-25 08:48:00,2181-10-26 15:18:00,F,...,,33911360.0,2181-10-25 08:48:00,2181-10-25 19:41:27,F,WHITE,WALK IN,ADMITTED,[S12110A],"[Anterior displaced Type II dens fracture, ini..."


In [27]:
# Check for duplicates
duplicates = hosp_master_df.duplicated(subset=['subject_id', 'hadm_id', 'ed_stay_id'], keep=False)

num_duplicates = duplicates.sum()
print(f"Number of non-unique [subject_id, hadm_id, ed_stay_id] pairs: {num_duplicates}")

Number of non-unique [subject_id, hadm_id, ed_stay_id] pairs: 0


In [28]:
missing_ed_times = hosp_master_df[
    hosp_master_df['edregtime'].notna() &
    hosp_master_df['edouttime'].notna() &
    (hosp_master_df['ed_intime'].isna() | hosp_master_df['ed_outtime'].isna())
]

# Number of affected rows
print(f"Rows with missing ed_intime or ed_outtime: {len(missing_ed_times)}")

# Optional: show the first few problematic rows
missing_ed_times[['subject_id','hadm_id','edregtime','edouttime','ed_intime','ed_outtime']].head()

Rows with missing ed_intime or ed_outtime: 111370


Unnamed: 0,subject_id,hadm_id,edregtime,edouttime,ed_intime,ed_outtime
7,15845632,28189199.0,2124-10-04 19:30:00,2124-10-05 04:10:00,,
13,15973356,20838764.0,2183-09-29 08:18:00,2183-09-29 23:36:00,,
204,10060829,20996159.0,2169-03-21 00:28:00,2169-03-21 12:56:00,,
205,10223685,24644368.0,2110-05-23 17:34:00,2110-05-24 04:35:00,,
206,10230028,25107071.0,2118-05-16 19:12:00,2118-05-17 03:13:00,,


In [29]:
missing_ed_times = hosp_master_df[
    hosp_master_df['ed_intime'].notna() &
    hosp_master_df['ed_outtime'].notna() &
    (hosp_master_df['edregtime'].isna() | hosp_master_df['edouttime'].isna())
]

# Number of affected rows
print(f"Rows with missing edregtime or edouttime: {len(missing_ed_times)}")

# Optional: show the first few problematic rows
missing_ed_times[['subject_id','hadm_id','edregtime','edouttime','ed_intime','ed_outtime']].head(5)

Rows with missing edregtime or edouttime: 133982


Unnamed: 0,subject_id,hadm_id,edregtime,edouttime,ed_intime,ed_outtime
396405,10049341,,NaT,NaT,2170-08-29 18:20:00,2170-08-29 22:46:00
396406,10049341,,NaT,NaT,2171-11-19 20:09:00,2171-11-20 00:03:00
396407,10049341,,NaT,NaT,2174-11-29 19:39:00,2174-11-30 00:49:00
396408,10049341,,NaT,NaT,2174-01-26 20:10:00,2174-01-27 00:34:00
396409,10054496,,NaT,NaT,2124-05-22 19:54:00,2124-05-23 00:08:00


In [30]:
# Check rows where subject_id and hadm_id exist, but ed_intime != edregtime
mismatched_ed_times = hosp_master_df[
    hosp_master_df['subject_id'].notna() &
    hosp_master_df['hadm_id'].notna() &
    hosp_master_df['ed_intime'].notna() &
    hosp_master_df['edregtime'].notna() &
    (hosp_master_df['ed_intime'] != hosp_master_df['edregtime'])
]

print(f"Rows with mismatched ed_intime and edregtime: {len(mismatched_ed_times)}")

# Optional: inspect
mismatched_ed_times[['subject_id','hadm_id', 'edregtime','ed_intime','edouttime', 'ed_outtime']].head(5)


Rows with mismatched ed_intime and edregtime: 480


Unnamed: 0,subject_id,hadm_id,edregtime,ed_intime,edouttime,ed_outtime
85,15869025,28089637.0,2182-05-08 01:26:00,2182-05-07 19:53:00,2182-05-09 21:05:00,2182-05-08 01:26:00
7879,10460981,25922807.0,2137-07-25 00:45:00,2137-07-24 08:21:00,2137-07-25 11:54:00,2137-07-25 00:45:00
8040,14599343,28533227.0,2155-03-03 22:03:00,2155-03-03 15:18:00,2155-03-06 14:17:00,2155-03-03 22:03:00
8106,16233333,29449621.0,2115-02-08 01:11:00,2115-02-07 09:35:00,2115-02-08 06:46:00,2115-02-08 01:11:00
8284,13115186,25235424.0,2112-09-24 12:47:00,2112-09-24 11:14:00,2112-09-25 11:28:00,2112-09-24 12:47:00


In [31]:
hosp_master_df[['subject_id','hadm_id','hosp_admittime', 'hosp_dischtime', 'edregtime','edouttime', 'ed_intime','ed_outtime']].head(10)

Unnamed: 0,subject_id,hadm_id,hosp_admittime,hosp_dischtime,edregtime,edouttime,ed_intime,ed_outtime
0,10106244,26713233.0,2147-05-09 10:34:00,2147-05-12 13:43:00,NaT,NaT,,
1,13700703,20448599.0,2172-09-25 01:01:00,2172-10-03 13:25:00,2172-09-24 17:38:00,2172-09-25 03:07:00,2172-09-24 17:38:00,2172-09-25 03:07:00
2,15443666,27961368.0,2168-12-30 23:30:00,2169-01-05 16:02:00,2168-12-30 11:19:00,2168-12-31 01:22:00,2168-12-30 11:19:00,2168-12-30 23:32:17
3,16299919,26977065.0,2193-05-15 08:37:00,2193-05-17 16:03:00,2193-05-15 04:36:00,2193-05-15 14:27:00,2193-05-15 04:36:00,2193-05-15 08:38:20
4,14149715,24191358.0,2181-10-25 19:37:00,2181-10-29 14:38:00,2181-10-25 08:48:00,2181-10-26 15:18:00,2181-10-25 08:48:00,2181-10-25 19:41:27
5,14446098,20543394.0,2182-04-04 20:11:00,2182-05-07 19:00:00,NaT,NaT,,
6,12224488,25909420.0,2158-10-29 15:59:00,2158-11-01 15:45:00,2158-10-28 20:22:00,2158-10-29 18:01:00,2158-10-28 20:22:00,2158-10-29 18:01:00
7,15845632,28189199.0,2124-10-05 02:44:00,2124-10-12 15:00:00,2124-10-04 19:30:00,2124-10-05 04:10:00,,
8,18131667,28337235.0,2195-11-18 02:58:00,2195-11-27 13:34:00,2195-11-17 21:04:00,2195-11-18 04:51:00,2195-11-17 21:04:00,2195-11-18 02:59:02
9,11371788,21071834.0,2185-12-31 20:42:00,2186-01-02 16:00:00,2185-12-31 15:59:00,2185-12-31 23:36:00,2185-12-31 15:59:00,2185-12-31 20:43:32


In [32]:
from preprocessing.static_preprocessing import merge_ecg

static_master = merge_ecg(hosp_master_df, record_list_df_cleaned)
static_master.head()

Unnamed: 0,subject_id,hadm_id,hosp_admittime,hosp_dischtime,admission_type,discharge_location,race,edregtime,edouttime,gender,...,ed_stay_id,ed_intime,ed_outtime,ed_gender,ed_race,ed_arrival_transport,ed_disposition,ed_icd_codes_diagnosis,ed_diagnosis,ecg_study_ids
0,10106244,26713233.0,2147-05-09 10:34:00,2147-05-12 13:43:00,DIRECT EMER.,HOME,WHITE,NaT,NaT,F,...,,NaT,NaT,,,,,,,"[40600970.0, 44859244.0, 48644999.0, 49164244.0]"
1,13700703,20448599.0,2172-09-25 01:01:00,2172-10-03 13:25:00,OBSERVATION ADMIT,HOME,WHITE,2172-09-24 17:38:00,2172-09-25 03:07:00,F,...,34625166.0,2172-09-24 17:38:00,2172-09-25 03:07:00,F,WHITE,WALK IN,ADMITTED,[R109],[Unspecified abdominal pain],[45997419.0]
2,15443666,27961368.0,2168-12-30 23:30:00,2169-01-05 16:02:00,OBSERVATION ADMIT,HOME HEALTH CARE,BLACK/AFRICAN AMERICAN,2168-12-30 11:19:00,2168-12-31 01:22:00,F,...,37230309.0,2168-12-30 11:19:00,2168-12-30 23:32:17,F,BLACK/AFRICAN AMERICAN,WALK IN,ADMITTED,[R109],[Unspecified abdominal pain],[]
3,16299919,26977065.0,2193-05-15 08:37:00,2193-05-17 16:03:00,OBSERVATION ADMIT,HOSPICE,BLACK/AFRICAN AMERICAN,2193-05-15 04:36:00,2193-05-15 14:27:00,F,...,32751575.0,2193-05-15 04:36:00,2193-05-15 08:38:20,F,BLACK/AFRICAN AMERICAN,AMBULANCE,ADMITTED,[I10],[Essential (primary) hypertension],[]
4,14149715,24191358.0,2181-10-25 19:37:00,2181-10-29 14:38:00,OBSERVATION ADMIT,SKILLED NURSING FACILITY,WHITE,2181-10-25 08:48:00,2181-10-26 15:18:00,F,...,33911360.0,2181-10-25 08:48:00,2181-10-25 19:41:27,F,WHITE,WALK IN,ADMITTED,[S12110A],"[Anterior displaced Type II dens fracture, ini...",[46669291.0]


In [33]:
# Flatten all ECG lists into a single list
all_ecgs = [ecg for sublist in static_master['ecg_study_ids'] for ecg in sublist]
print(all_ecgs[:10])
# Count unique study_ids
num_unique_ecgs = len(set(all_ecgs))
num_ecgs = len(all_ecgs)
print(f"Number of study_ids: {num_ecgs}")
print(f"Number of unique study_ids: {num_unique_ecgs}")

# 468176

[40600970.0, 44859244.0, 48644999.0, 49164244.0, 45997419.0, 46669291.0, 41547518.0, 43566083.0, 44908258.0, 48547934.0]
Number of study_ids: 223725
Number of unique study_ids: 223725


In [34]:
print(static_master.shape)
static_master.head()

(530387, 35)


Unnamed: 0,subject_id,hadm_id,hosp_admittime,hosp_dischtime,admission_type,discharge_location,race,edregtime,edouttime,gender,...,ed_stay_id,ed_intime,ed_outtime,ed_gender,ed_race,ed_arrival_transport,ed_disposition,ed_icd_codes_diagnosis,ed_diagnosis,ecg_study_ids
0,10106244,26713233.0,2147-05-09 10:34:00,2147-05-12 13:43:00,DIRECT EMER.,HOME,WHITE,NaT,NaT,F,...,,NaT,NaT,,,,,,,"[40600970.0, 44859244.0, 48644999.0, 49164244.0]"
1,13700703,20448599.0,2172-09-25 01:01:00,2172-10-03 13:25:00,OBSERVATION ADMIT,HOME,WHITE,2172-09-24 17:38:00,2172-09-25 03:07:00,F,...,34625166.0,2172-09-24 17:38:00,2172-09-25 03:07:00,F,WHITE,WALK IN,ADMITTED,[R109],[Unspecified abdominal pain],[45997419.0]
2,15443666,27961368.0,2168-12-30 23:30:00,2169-01-05 16:02:00,OBSERVATION ADMIT,HOME HEALTH CARE,BLACK/AFRICAN AMERICAN,2168-12-30 11:19:00,2168-12-31 01:22:00,F,...,37230309.0,2168-12-30 11:19:00,2168-12-30 23:32:17,F,BLACK/AFRICAN AMERICAN,WALK IN,ADMITTED,[R109],[Unspecified abdominal pain],[]
3,16299919,26977065.0,2193-05-15 08:37:00,2193-05-17 16:03:00,OBSERVATION ADMIT,HOSPICE,BLACK/AFRICAN AMERICAN,2193-05-15 04:36:00,2193-05-15 14:27:00,F,...,32751575.0,2193-05-15 04:36:00,2193-05-15 08:38:20,F,BLACK/AFRICAN AMERICAN,AMBULANCE,ADMITTED,[I10],[Essential (primary) hypertension],[]
4,14149715,24191358.0,2181-10-25 19:37:00,2181-10-29 14:38:00,OBSERVATION ADMIT,SKILLED NURSING FACILITY,WHITE,2181-10-25 08:48:00,2181-10-26 15:18:00,F,...,33911360.0,2181-10-25 08:48:00,2181-10-25 19:41:27,F,WHITE,WALK IN,ADMITTED,[S12110A],"[Anterior displaced Type II dens fracture, ini...",[46669291.0]


In [35]:
static_master['ed_diagnosis']

0                                                       NaN
1                              [Unspecified abdominal pain]
2                              [Unspecified abdominal pain]
3                        [Essential (primary) hypertension]
4         [Anterior displaced Type II dens fracture, ini...
                                ...                        
530382                                           [HEADACHE]
530383    [Hemoptysis, Type 2 diabetes mellitus without ...
530384                                  [OTHER CONVULSIONS]
530385      [SYNCOPE AND COLLAPSE, ORTHOSTATIC HYPOTENSION]
530386                               [SYNCOPE AND COLLAPSE]
Name: ed_diagnosis, Length: 530387, dtype: object

In [36]:
#Trying entity extraction!
from preprocessing.clinical_entity_extraction import apply_entity_extraction

In [37]:
static_master = apply_entity_extraction(static_master)

In [39]:
static_master.columns

Index(['subject_id', 'hadm_id', 'hosp_admittime', 'hosp_dischtime',
       'admission_type', 'discharge_location', 'race', 'edregtime',
       'edouttime', 'gender', 'anchor_age', 'death_time',
       'hosp_icd_codes_diagnosis', 'hosp_diagnosis', 'drg_code_apr',
       'description_apr', 'drg_severity_apr', 'drg_mortality_apr',
       'drg_code_hcfa', 'description_hcfa', 'icu_stay_id',
       'icu_first_careunit', 'icu_last_careunit', 'icu_los', 'icu_count',
       'ed_stay_id', 'ed_intime', 'ed_outtime', 'ed_gender', 'ed_race',
       'ed_arrival_transport', 'ed_disposition', 'ed_icd_codes_diagnosis',
       'ed_diagnosis', 'ecg_study_ids', 'hosp_diagnosis_entities',
       'num_hosp_diagnosis_entities', 'ed_entities',
       'num_ed_diagnosis_entities'],
      dtype='object')

In [59]:
static_master['hosp_diagnosis_entities']

0         [ventricular, hypertension, heart, coronary, a...
1                                                        []
2         [atrial, vascular, valve, fibrillation, heart,...
3                      [hypertensive, hypertension, oxygen]
4         [atrial, fibrillation, artery, heart, coronary...
                                ...                        
530382                                                   []
530383                                                   []
530384                                                   []
530385                                                   []
530386                                                   []
Name: hosp_diagnosis_entities, Length: 530387, dtype: object

In [66]:
non_empty = static_master[static_master['ed_entities'].apply(len) != 0]['ed_entities']
for i, val in enumerate(non_empty.head(10)):  # print first 10 only
    print(f"{i}:", val)

0: {'Arrhythmia': [], 'Ischemic': [], 'Heart Failure': [], 'Chest Pain / Symptoms': ['chest pain'], 'Vascular / Embolic': [], 'Cardiac Arrest': [], 'Structural / Cardiomyopathy': []}
1: {'Arrhythmia': [], 'Ischemic': [], 'Heart Failure': [], 'Chest Pain / Symptoms': ['chest pain'], 'Vascular / Embolic': [], 'Cardiac Arrest': [], 'Structural / Cardiomyopathy': []}
2: {'Arrhythmia': [], 'Ischemic': [], 'Heart Failure': [], 'Chest Pain / Symptoms': ['syncope', 'collapse'], 'Vascular / Embolic': [], 'Cardiac Arrest': [], 'Structural / Cardiomyopathy': []}
3: {'Arrhythmia': [], 'Ischemic': [], 'Heart Failure': [], 'Chest Pain / Symptoms': [], 'Vascular / Embolic': ['embolism', 'infarct'], 'Cardiac Arrest': [], 'Structural / Cardiomyopathy': []}
4: {'Arrhythmia': [], 'Ischemic': [], 'Heart Failure': [], 'Chest Pain / Symptoms': ['chest pain', 'chest pain nos'], 'Vascular / Embolic': [], 'Cardiac Arrest': [], 'Structural / Cardiomyopathy': []}
5: {'Arrhythmia': [], 'Ischemic': [], 'Heart Fail