# Static Preprocessing 

In [None]:
import pandas as pd
import os
import sys
import json
from pathlib import Path


# The notebook lives in notebooks/, so the repository root is the parent of the
# current working directory (this assumes the kernel was started in notebooks/).
REPO_ROOT = Path.cwd().parent
SRC_PATH = REPO_ROOT / "src"

# Put src/ at the front of sys.path so the `preprocessing` package is importable.
# Any existing entry for src/ is dropped first, so re-running this cell keeps a
# single entry at position 0 instead of stacking duplicates.
_src_entry = str(SRC_PATH)
sys.path[:] = [_src_entry] + [entry for entry in sys.path if entry != _src_entry]


In [38]:
from preprocessing.static_preprocessing import load_static_data

# Repo root is one level above the notebook's working directory
repo_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Load the static preprocessing config. The encoding is given explicitly so the
# JSON file is read the same way regardless of the machine's locale default.
config_path = os.path.join(repo_root, "configs", "static_preprocessing_params.json")
with open(config_path, "r", encoding="utf-8") as f:
    config = json.load(f)

# Input and output directories come from the config and are joined onto the repo root
in_dir = os.path.join(repo_root, config["paths"]["in_dir"])
out_dir = os.path.join(repo_root, config["paths"]["out_dir"])

# load_static_data takes in_dir as its first argument and returns eight
# DataFrames, unpacked here in the order they are returned.
(
    hosp_patients_df,
    hosp_admissions_df,
    hosp_diagnosis_df,
    hosp_drgcodes_df,
    icustays_df,
    edstays_df,
    ed_diagnosis_df,
    record_list_df,
) = load_static_data(in_dir, config)

## Clean ECG Reports 

In [39]:
# Clean the ECG record list with the project helper and preview the first rows.
# clean_cols_types is defined in src/preprocessing; presumably it casts the
# column dtypes (see the dtypes check in the next cell) — confirm in the module.
from preprocessing.static_preprocessing import clean_cols_types

record_list_df_cleaned = clean_cols_types(record_list_df)
record_list_df_cleaned.head()

Unnamed: 0,subject_id,study_id,file_name,ecg_time,path
0,10000032,40689238,40689238,2180-07-23 08:44:00,files/p1000/p10000032/s40689238/40689238
1,10000032,44458630,44458630,2180-07-23 09:54:00,files/p1000/p10000032/s44458630/44458630
2,10000032,49036311,49036311,2180-08-06 09:07:00,files/p1000/p10000032/s49036311/49036311
3,10000117,45090959,45090959,2181-03-04 17:14:00,files/p1000/p10000117/s45090959/45090959
4,10000117,48446569,48446569,2183-09-18 13:52:00,files/p1000/p10000117/s48446569/48446569


In [40]:
# Inspect the column dtypes of the cleaned ECG record list.
record_list_df_cleaned.dtypes

subject_id             int64
study_id               int64
file_name              int64
ecg_time      datetime64[ns]
path          string[python]
dtype: object

In [41]:
# (rows, columns) of the cleaned ECG record list.
record_list_df_cleaned.shape

(800035, 5)

## Cleaning Hospital Module

### Hospital Admissions

In [42]:
# Run the project's admissions preprocessing on the loaded admissions table
# and preview the first rows of the result.
from preprocessing.static_preprocessing import preprocess_admissions

hosp_admissions_df_cleaned = preprocess_admissions(hosp_admissions_df)
hosp_admissions_df_cleaned.head()

Unnamed: 0,subject_id,hadm_id,hosp_admittime,hosp_dischtime,deathtime,admission_type,discharge_location,race,edregtime,edouttime
0,10106244,26713233,2147-05-09 10:34:00,2147-05-12 13:43:00,NaT,DIRECT EMER.,HOME,WHITE,NaT,NaT
1,13700703,20448599,2172-09-25 01:01:00,2172-10-03 13:25:00,NaT,OBSERVATION ADMIT,HOME,WHITE,2172-09-24 17:38:00,2172-09-25 03:07:00
2,15443666,27961368,2168-12-30 23:30:00,2169-01-05 16:02:00,NaT,OBSERVATION ADMIT,HOME HEALTH CARE,BLACK/AFRICAN AMERICAN,2168-12-30 11:19:00,2168-12-31 01:22:00
3,16299919,26977065,2193-05-15 08:37:00,2193-05-17 16:03:00,NaT,OBSERVATION ADMIT,HOSPICE,BLACK/AFRICAN AMERICAN,2193-05-15 04:36:00,2193-05-15 14:27:00
4,14149715,24191358,2181-10-25 19:37:00,2181-10-29 14:38:00,NaT,OBSERVATION ADMIT,SKILLED NURSING FACILITY,WHITE,2181-10-25 08:48:00,2181-10-26 15:18:00


In [43]:
# Inspect the column dtypes of the cleaned admissions table.
hosp_admissions_df_cleaned.dtypes

subject_id                     int64
hadm_id                        int64
hosp_admittime        datetime64[ns]
hosp_dischtime        datetime64[ns]
deathtime             datetime64[ns]
admission_type        string[python]
discharge_location    string[python]
race                  string[python]
edregtime             datetime64[ns]
edouttime             datetime64[ns]
dtype: object

### Hospital Patient

In [44]:
# Run the project's patient preprocessing on the loaded patients table
# and preview the first rows of the result.
from preprocessing.static_preprocessing import preprocess_patient

hosp_patients_df_cleaned = preprocess_patient(hosp_patients_df)
hosp_patients_df_cleaned.head()

Unnamed: 0,subject_id,gender,anchor_age,dod
0,11289691,F,18,NaT
1,11806971,F,18,NaT
2,12107404,F,18,NaT
3,12143996,F,18,NaT
4,13117076,F,18,NaT


In [45]:
# Inspect the column dtypes of the cleaned patients table.
hosp_patients_df_cleaned.dtypes

subject_id             int64
gender        string[python]
anchor_age             int64
dod           datetime64[ns]
dtype: object

### Hospital Drgcodes

In [46]:
# Run the project's DRG-code preprocessing. Nothing is displayed by this cell;
# the resulting dtypes are inspected in the next one.
from preprocessing.static_preprocessing import preprocess_drgcodes

hosp_drgcodes_df_cleaned = preprocess_drgcodes(hosp_drgcodes_df)

In [47]:
# Inspect the column dtypes of the cleaned DRG-code table.
hosp_drgcodes_df_cleaned.dtypes

subject_id                    int64
hadm_id                       int64
drg_code_apr         string[python]
description_apr      string[python]
drg_severity_apr     string[python]
drg_mortality_apr    string[python]
drg_code_hcfa        string[python]
description_hcfa     string[python]
dtype: object

### Hospital Diagnosis 

In [48]:
# Look at the hospital diagnosis table as loaded, before it is cleaned in the
# next cell (pandas shows a truncated first/last-rows view for a frame this size).
hosp_diagnosis_df

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,long_title
0,10000935,26381316,27,78052,9,"Insomnia, unspecified"
1,10000980,25242409,27,44021,9,Atherosclerosis of native arteries of the extr...
2,10000980,25242409,28,27800,9,"Obesity, unspecified"
3,10000980,25242409,29,V8522,9,"Body Mass Index 26.0-26.9, adult"
4,10000980,25242409,30,72992,9,Nontraumatic hematoma of soft tissue
...,...,...,...,...,...,...
4843782,19994379,27052619,26,T502X5A,10,Adverse effect of carbonic-anhydrase inhibitor...
4843783,19994379,27334101,26,F329,10,"Major depressive disorder, single episode, uns..."
4843784,19995012,26194582,26,K580,10,Irritable bowel syndrome with diarrhea
4843785,19997473,27787494,26,Y92230,10,Patient room in hospital as the place of occur...


In [49]:
# Clean the hospital diagnosis table and preview it. The 'hosp' argument appears
# to become the prefix of the generated columns (hosp_icd_codes_diagnosis and
# hosp_diagnosis in the preview) — confirm in clean_diagnosis_data.
from preprocessing.static_preprocessing import clean_diagnosis_data

hosp_diagnosis_df_cleaned = clean_diagnosis_data(hosp_diagnosis_df, 'hosp')
hosp_diagnosis_df_cleaned.head()


Unnamed: 0,subject_id,hadm_id,hosp_icd_codes_diagnosis,hosp_diagnosis
0,10000032,22595853,"[5723, 78959, 5715, 07070, 496, 29680, 30981, ...","[Portal hypertension, Other ascites, Cirrhosis..."
1,10000032,22841357,"[07071, 78959, 2875, 2761, 496, 5715, V08, 3051]",[Unspecified viral hepatitis C with hepatic co...
2,10000032,25742920,"[07054, 78959, V462, 5715, 2767, 2761, 496, V0...",[Chronic hepatitis C without mention of hepati...
3,10000032,29079034,"[45829, 07044, 7994, 2761, 78959, 2767, 3051, ...","[Other iatrogenic hypotension, Chronic hepatit..."
4,10000117,22927623,"[R1310, R0989, K31819, K219, K449, F419, I341,...","[Dysphagia, unspecified, Other specified sympt..."


In [16]:
# Inspect the column dtypes of the cleaned hospital diagnosis table.
hosp_diagnosis_df_cleaned.dtypes

subject_id                   int64
hadm_id                      int64
hosp_icd_codes_diagnosis    object
hosp_diagnosis              object
dtype: object

## ICU 

### ICU Stays

In [17]:
# Preview the ICU stays table as loaded, before aggregation.
icustays_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
0,10270644,20019675,35548343,PACU,PACU,2159-12-03 16:20:31,2159-12-08 17:28:42,5.04735
1,10368426,21588639,39194905,PACU,PACU,2164-12-30 13:29:21,2164-12-30 14:00:38,0.021725
2,10640410,25898987,34344828,PACU,PACU,2112-02-03 12:55:23,2112-02-08 15:14:54,5.096887
3,10691194,24438843,37799251,PACU,PACU,2147-06-01 17:38:48,2147-06-01 17:58:44,0.013843
4,11162329,26304963,39444424,PACU,PACU,2137-06-11 19:49:23,2137-06-12 14:54:47,0.795417


In [18]:
# Aggregate the ICU stays with the project helper and preview the result.
# In the preview the stay-level columns hold lists and an icu_count column is
# present — presumably one row per (subject_id, hadm_id); confirm in preprocess_icustays.
from preprocessing.static_preprocessing import preprocess_icustays

icustays_df_agg = preprocess_icustays(icustays_df)
icustays_df_agg.head()

Unnamed: 0,subject_id,hadm_id,icu_stay_id,icu_first_careunit,icu_last_careunit,icu_los,icu_count
0,10270644,20019675,[35548343],[PACU],[PACU],[5.047349537037037],1
1,10368426,21588639,"[39194905, 34185188, 38171960]","[PACU, Coronary Care Unit (CCU), Medical Inten...","[PACU, Coronary Care Unit (CCU), Medical Inten...","[0.021724537037037, 1.1422569444444444, 0.1307...",3
2,10640410,25898987,"[34344828, 35065956]","[PACU, Trauma SICU (TSICU)]","[PACU, Trauma SICU (TSICU)]","[5.096886574074074, 0.9944097222222222]",2
3,10691194,24438843,"[37799251, 34379928, 33563673]","[PACU, Coronary Care Unit (CCU), Surgical Inte...","[PACU, Coronary Care Unit (CCU), Surgical Inte...","[0.0138425925925925, 3.1843402777777774, 2.077...",3
4,11162329,26304963,"[39444424, 37586359]","[PACU, Neuro Intermediate]","[PACU, Neuro Intermediate]","[0.7954166666666665, 3.0724537037037036]",2


In [19]:
# Inspect the column dtypes of the aggregated ICU stays table.
icustays_df_agg.dtypes

subject_id             int64
hadm_id                int64
icu_stay_id           object
icu_first_careunit    object
icu_last_careunit     object
icu_los               object
icu_count              int64
dtype: object

## Emergency Department
### ED Stays

In [20]:
# Preview the ED stays table as loaded, before its columns are prefixed.
edstays_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,gender,race,arrival_transport,disposition
0,10049341,20677333.0,34255415,2171-04-07 17:48:00,2171-04-08 09:31:00,F,ASIAN,WALK IN,HOME
1,10049341,,35767475,2170-08-29 18:20:00,2170-08-29 22:46:00,F,ASIAN,WALK IN,HOME
2,10049341,,36382949,2171-11-19 20:09:00,2171-11-20 00:03:00,F,ASIAN,WALK IN,HOME
3,10049341,,36490047,2174-11-29 19:39:00,2174-11-30 00:49:00,F,ASIAN,WALK IN,HOME
4,10049341,,37283116,2174-01-26 20:10:00,2174-01-27 00:34:00,F,ASIAN,WALK IN,HOME


In [21]:
# Prefix the ED-stay columns with 'ed' and preview the result. In the preview
# subject_id and hadm_id keep their names while the remaining columns gain an
# 'ed_' prefix.
from preprocessing.static_preprocessing import add_prefix_to_columns

edstays_df_cleaned = add_prefix_to_columns(edstays_df, 'ed')
edstays_df_cleaned.head()

Unnamed: 0,subject_id,hadm_id,ed_stay_id,ed_intime,ed_outtime,ed_gender,ed_race,ed_arrival_transport,ed_disposition
0,10049341,20677333.0,34255415,2171-04-07 17:48:00,2171-04-08 09:31:00,F,ASIAN,WALK IN,HOME
1,10049341,,35767475,2170-08-29 18:20:00,2170-08-29 22:46:00,F,ASIAN,WALK IN,HOME
2,10049341,,36382949,2171-11-19 20:09:00,2171-11-20 00:03:00,F,ASIAN,WALK IN,HOME
3,10049341,,36490047,2174-11-29 19:39:00,2174-11-30 00:49:00,F,ASIAN,WALK IN,HOME
4,10049341,,37283116,2174-01-26 20:10:00,2174-01-27 00:34:00,F,ASIAN,WALK IN,HOME


In [22]:
# Inspect the column dtypes of the prefixed ED stays table.
# NOTE(review): in the saved output ed_intime / ed_outtime are still `object`,
# unlike the datetime64 time columns of the admissions table — confirm they are
# parsed before being compared with edregtime / edouttime later on.
edstays_df_cleaned.dtypes

subject_id                int64
hadm_id                 float64
ed_stay_id                int64
ed_intime                object
ed_outtime               object
ed_gender                object
ed_race                  object
ed_arrival_transport     object
ed_disposition           object
dtype: object

### ED Diagnosis

In [23]:
# Look at the ED diagnosis table as loaded, before it is cleaned in the next
# cell (pandas shows a truncated first/last-rows view for a frame this size).
ed_diagnosis_df

Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_version,icd_title
0,16253470,37248432,1,035,9,ERYSIPELAS
1,10396349,36517744,1,042,9,HIV DISEASE
2,10429665,38305760,1,042,9,HIV DISEASE
3,10446182,31712472,1,042,9,HIV DISEASE
4,11053554,34764404,1,042,9,HIV DISEASE
...,...,...,...,...,...,...
659063,17816374,34934902,9,Z9641,10,Presence of insulin pump (external) (internal)
659064,17763551,37535744,9,Z96653,10,"Presence of artificial knee joint, bilateral"
659065,14213786,31284900,9,V1988XA,10,Pedl cyclst (driver) injured in oth transport ...
659066,15903451,35111209,9,V499XXA,10,Car occupant (driver) (passenger) injured in u...


In [24]:
# Clean the ED diagnoses with the 'ed' prefix and preview the result.
# clean_diagnosis_data is the helper imported in the Hospital Diagnosis section,
# so that cell must have been run first.
ed_diagnosis_df_cleaned = clean_diagnosis_data(ed_diagnosis_df, 'ed')
ed_diagnosis_df_cleaned.head()

Unnamed: 0,subject_id,ed_stay_id,ed_icd_codes_diagnosis,ed_diagnosis
0,10000032,32952584,"[4589, 07070, V08]","[HYPOTENSION NOS, UNSPECIFIED VIRAL HEPATITIS ..."
1,10000032,33258284,"[5728, 78959, 07070, V08]","[OTH SEQUELA, CHR LIV DIS, OTHER ASCITES, UNSP..."
2,10000032,35968195,"[5715, 78900, V08]","[CIRRHOSIS OF LIVER NOS, ABDOMINAL PAIN UNSPEC..."
3,10000032,38112554,"[78959, 07070, 5715, V08]","[OTHER ASCITES, UNSPECIFIED VIRAL HEPATITIS C ..."
4,10000032,39399961,"[78097, 34830]","[ALTERED MENTAL STATUS , ENCEPHALOPATHY, UNSPE..."


## Merge Hospital, ICU, and ED Data

In [25]:
from preprocessing.static_preprocessing import merge_hosp

# Cleaned / aggregated tables in the positional order merge_hosp takes them:
# admissions, patients, diagnoses, DRG codes, ICU stays, ED stays, ED diagnoses.
merge_inputs = (
    hosp_admissions_df_cleaned,
    hosp_patients_df_cleaned,
    hosp_diagnosis_df_cleaned,
    hosp_drgcodes_df_cleaned,
    icustays_df_agg,
    edstays_df_cleaned,
    ed_diagnosis_df_cleaned,
)
hosp_master_df = merge_hosp(*merge_inputs)

# Report the size of the merged table, then preview its first rows
print(hosp_master_df.shape)
hosp_master_df.head(5)

(530387, 34)


Unnamed: 0,subject_id,hadm_id,hosp_admittime,hosp_dischtime,admission_type,discharge_location,race,edregtime,edouttime,gender,...,icu_count,ed_stay_id,ed_intime,ed_outtime,ed_gender,ed_race,ed_arrival_transport,ed_disposition,ed_icd_codes_diagnosis,ed_diagnosis
0,10000032,22595853.0,2180-05-06 22:23:00,2180-05-07 17:15:00,URGENT,HOME,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,F,...,,33258284.0,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,AMBULANCE,ADMITTED,"[5728, 78959, 07070, V08]","[OTH SEQUELA, CHR LIV DIS, OTHER ASCITES, UNSP..."
1,10000032,22841357.0,2180-06-26 18:27:00,2180-06-27 18:49:00,EW EMER.,HOME,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,F,...,,38112554.0,2180-06-26 15:54:00,2180-06-26 21:31:00,F,WHITE,AMBULANCE,ADMITTED,"[78959, 07070, 5715, V08]","[OTHER ASCITES, UNSPECIFIED VIRAL HEPATITIS C ..."
2,10000032,25742920.0,2180-08-05 23:44:00,2180-08-07 17:50:00,EW EMER.,HOSPICE,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,F,...,,35968195.0,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,AMBULANCE,ADMITTED,"[5715, 78900, V08]","[CIRRHOSIS OF LIVER NOS, ABDOMINAL PAIN UNSPEC..."
3,10000032,29079034.0,2180-07-23 12:35:00,2180-07-25 17:55:00,EW EMER.,HOME,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,F,...,1.0,32952584.0,2180-07-22 16:24:00,2180-07-23 05:54:00,F,WHITE,AMBULANCE,HOME,"[4589, 07070, V08]","[HYPOTENSION NOS, UNSPECIFIED VIRAL HEPATITIS ..."
4,10000032,29079034.0,2180-07-23 12:35:00,2180-07-25 17:55:00,EW EMER.,HOME,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,F,...,1.0,39399961.0,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,AMBULANCE,ADMITTED,"[78097, 34830]","[ALTERED MENTAL STATUS , ENCEPHALOPATHY, UNSPE..."


In [26]:
# Check that (subject_id, hadm_id, ed_stay_id) identifies each row of the merged table.
# keep=False flags every member of a duplicated group, so the sum below is a
# count of affected rows, not of distinct keys.
key_columns = ['subject_id', 'hadm_id', 'ed_stay_id']
duplicates = hosp_master_df.duplicated(subset=key_columns, keep=False)

num_duplicates = duplicates.sum()
print(f"Number of rows with a non-unique (subject_id, hadm_id, ed_stay_id) key: {num_duplicates}")

Number of non-unique [subject_id, hadm_id, ed_stay_id] pairs: 0


In [27]:
# Rows that have both admission-side ED times (edregtime, edouttime) but lack
# at least one of the ED-stay times (ed_intime, ed_outtime).
admission_times_present = hosp_master_df[['edregtime', 'edouttime']].notna().all(axis=1)
edstay_time_missing = hosp_master_df[['ed_intime', 'ed_outtime']].isna().any(axis=1)
missing_ed_times = hosp_master_df[admission_times_present & edstay_time_missing]

# How many rows are affected
print(f"Rows with missing ed_intime or ed_outtime: {len(missing_ed_times)}")

# Preview a few of the affected rows
preview_columns = ['subject_id', 'hadm_id', 'edregtime', 'edouttime', 'ed_intime', 'ed_outtime']
missing_ed_times[preview_columns].head()

Rows with missing ed_intime or ed_outtime: 111370


Unnamed: 0,subject_id,hadm_id,edregtime,edouttime,ed_intime,ed_outtime
11,10000635,20642640.0,2143-12-23 07:43:00,2143-12-24 12:52:00,,
16,10000826,20032235.0,2146-12-05 11:09:00,2146-12-05 20:54:00,,
18,10000826,28289260.0,2146-12-30 17:06:00,2146-12-31 02:10:00,,
21,10000935,24955974.0,2183-11-07 01:05:00,2183-11-07 11:19:00,,
24,10000935,29541074.0,2183-10-28 03:30:00,2183-10-28 10:39:00,,


In [28]:
# The reverse check: rows that have both ED-stay times (ed_intime, ed_outtime)
# but lack at least one admission-side ED time (edregtime, edouttime).
# Note: this rebinds `missing_ed_times`, replacing the result of the previous cell.
edstay_times_present = hosp_master_df[['ed_intime', 'ed_outtime']].notna().all(axis=1)
admission_time_missing = hosp_master_df[['edregtime', 'edouttime']].isna().any(axis=1)
missing_ed_times = hosp_master_df[edstay_times_present & admission_time_missing]

# How many rows are affected
print(f"Rows with missing edregtime or edouttime: {len(missing_ed_times)}")

# Preview a few of the affected rows
preview_columns = ['subject_id', 'hadm_id', 'edregtime', 'edouttime', 'ed_intime', 'ed_outtime']
missing_ed_times[preview_columns].head(5)

Rows with missing edregtime or edouttime: 133982


Unnamed: 0,subject_id,hadm_id,edregtime,edouttime,ed_intime,ed_outtime
7,10000117,,NaT,NaT,2183-07-17 10:30:00,2183-07-17 11:31:00
8,10000285,,NaT,NaT,2161-11-08 14:19:00,2161-11-08 21:06:00
9,10000285,,NaT,NaT,2159-11-26 14:22:00,2159-11-26 19:17:00
13,10000635,,NaT,NaT,2138-09-29 10:54:00,2138-09-29 16:53:00
14,10000635,,NaT,NaT,2141-08-15 11:32:00,2141-08-15 17:06:00


In [29]:
# Rows where subject_id, hadm_id, ed_intime and edregtime are all present,
# yet the ED-stay start (ed_intime) differs from the admission-side edregtime.
required_columns = ['subject_id', 'hadm_id', 'ed_intime', 'edregtime']
all_required_present = hosp_master_df[required_columns].notna().all(axis=1)
start_times_differ = hosp_master_df['ed_intime'] != hosp_master_df['edregtime']
mismatched_ed_times = hosp_master_df[all_required_present & start_times_differ]

print(f"Rows with mismatched ed_intime and edregtime: {len(mismatched_ed_times)}")

# Preview a few mismatching rows, start and end times side by side
inspect_columns = ['subject_id', 'hadm_id', 'edregtime', 'ed_intime', 'edouttime', 'ed_outtime']
mismatched_ed_times[inspect_columns].head(5)


Rows with mismatched ed_intime and edregtime: 480


Unnamed: 0,subject_id,hadm_id,edregtime,ed_intime,edouttime,ed_outtime
3,10000032,29079034.0,2180-07-23 05:54:00,2180-07-22 16:24:00,2180-07-23 14:00:00,2180-07-23 05:54:00
586,10011189,27462671.0,2189-07-02 13:08:00,2189-07-02 12:51:00,2189-07-02 19:38:00,2189-07-02 13:08:00
3724,10076617,21474221.0,2165-09-25 17:45:00,2165-09-25 10:38:00,2165-09-26 00:14:00,2165-09-25 17:45:00
3917,10080392,24961269.0,2126-05-09 05:24:00,2126-05-08 10:27:00,2126-05-09 14:30:00,2126-05-09 05:24:00
6541,10134742,22748673.0,2128-03-24 12:56:00,2128-03-23 20:23:00,2128-03-24 20:36:00,2128-03-24 12:56:00


In [30]:
# Side-by-side view of the hospital, admission-side ED and ED-stay timestamps
time_columns = [
    'subject_id',
    'hadm_id',
    'hosp_admittime',
    'hosp_dischtime',
    'edregtime',
    'edouttime',
    'ed_intime',
    'ed_outtime',
]
hosp_master_df[time_columns].head(10)

Unnamed: 0,subject_id,hadm_id,hosp_admittime,hosp_dischtime,edregtime,edouttime,ed_intime,ed_outtime
0,10000032,22595853.0,2180-05-06 22:23:00,2180-05-07 17:15:00,2180-05-06 19:17:00,2180-05-06 23:30:00,2180-05-06 19:17:00,2180-05-06 23:30:00
1,10000032,22841357.0,2180-06-26 18:27:00,2180-06-27 18:49:00,2180-06-26 15:54:00,2180-06-26 21:31:00,2180-06-26 15:54:00,2180-06-26 21:31:00
2,10000032,25742920.0,2180-08-05 23:44:00,2180-08-07 17:50:00,2180-08-05 20:58:00,2180-08-06 01:44:00,2180-08-05 20:58:00,2180-08-06 01:44:00
3,10000032,29079034.0,2180-07-23 12:35:00,2180-07-25 17:55:00,2180-07-23 05:54:00,2180-07-23 14:00:00,2180-07-22 16:24:00,2180-07-23 05:54:00
4,10000032,29079034.0,2180-07-23 12:35:00,2180-07-25 17:55:00,2180-07-23 05:54:00,2180-07-23 14:00:00,2180-07-23 05:54:00,2180-07-23 14:00:00
5,10000117,22927623.0,2181-11-15 02:05:00,2181-11-15 14:52:00,2181-11-14 21:51:00,2181-11-15 09:57:00,2181-11-14 21:51:00,2181-11-15 02:06:42
6,10000117,27988844.0,2183-09-18 18:10:00,2183-09-21 16:30:00,2183-09-18 08:41:00,2183-09-18 20:20:00,2183-09-18 08:41:00,2183-09-18 20:20:00
7,10000117,,NaT,NaT,NaT,NaT,2183-07-17 10:30:00,2183-07-17 11:31:00
8,10000285,,NaT,NaT,NaT,NaT,2161-11-08 14:19:00,2161-11-08 21:06:00
9,10000285,,NaT,NaT,NaT,NaT,2159-11-26 14:22:00,2159-11-26 19:17:00


In [31]:
# Attach the cleaned ECG record list to the merged hospital table and preview it.
# The preview shows a new list-valued `ecg_study_ids` column; how ECGs are
# matched to rows is decided inside merge_ecg (not visible in this notebook).
from preprocessing.static_preprocessing import merge_ecg

static_master = merge_ecg(hosp_master_df, record_list_df_cleaned)
static_master.head()

Unnamed: 0,subject_id,hadm_id,hosp_admittime,hosp_dischtime,admission_type,discharge_location,race,edregtime,edouttime,gender,...,ed_stay_id,ed_intime,ed_outtime,ed_gender,ed_race,ed_arrival_transport,ed_disposition,ed_icd_codes_diagnosis,ed_diagnosis,ecg_study_ids
0,10000032,22595853.0,2180-05-06 22:23:00,2180-05-07 17:15:00,URGENT,HOME,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,F,...,33258284.0,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,AMBULANCE,ADMITTED,"[5728, 78959, 07070, V08]","[OTH SEQUELA, CHR LIV DIS, OTHER ASCITES, UNSP...",[]
1,10000032,22841357.0,2180-06-26 18:27:00,2180-06-27 18:49:00,EW EMER.,HOME,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,F,...,38112554.0,2180-06-26 15:54:00,2180-06-26 21:31:00,F,WHITE,AMBULANCE,ADMITTED,"[78959, 07070, 5715, V08]","[OTHER ASCITES, UNSPECIFIED VIRAL HEPATITIS C ...",[]
2,10000032,25742920.0,2180-08-05 23:44:00,2180-08-07 17:50:00,EW EMER.,HOSPICE,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,F,...,35968195.0,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,AMBULANCE,ADMITTED,"[5715, 78900, V08]","[CIRRHOSIS OF LIVER NOS, ABDOMINAL PAIN UNSPEC...",[49036311]
3,10000032,29079034.0,2180-07-23 12:35:00,2180-07-25 17:55:00,EW EMER.,HOME,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,F,...,32952584.0,2180-07-22 16:24:00,2180-07-23 05:54:00,F,WHITE,AMBULANCE,HOME,"[4589, 07070, V08]","[HYPOTENSION NOS, UNSPECIFIED VIRAL HEPATITIS ...",[]
4,10000032,29079034.0,2180-07-23 12:35:00,2180-07-25 17:55:00,EW EMER.,HOME,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,F,...,39399961.0,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,AMBULANCE,ADMITTED,"[78097, 34830]","[ALTERED MENTAL STATUS , ENCEPHALOPATHY, UNSPE...","[40689238, 44458630]"


In [32]:
# Gather every ECG study id from the per-row lists into one flat list
all_ecgs = []
for study_ids in static_master['ecg_study_ids']:
    all_ecgs.extend(study_ids)
print(all_ecgs[:10])

# Total vs. distinct study ids; equal counts mean no study id appears on two rows
num_ecgs = len(all_ecgs)
num_unique_ecgs = len(set(all_ecgs))
print(f"Number of study_ids: {num_ecgs}")
print(f"Number of unique study_ids: {num_unique_ecgs}")

# NOTE(review): an earlier note in this cell recorded 468096, while the saved
# output reports 468176 — confirm which figure is expected.

[49036311, 40689238, 44458630, 48446569, 42709053, 48339811, 44095784, 40539087, 43681375, 44069449]
Number of study_ids: 468176
Number of unique study_ids: 468176


In [33]:
# Final size of the merged static table, followed by a preview of its first rows.
# The preview has to be the last expression of the cell to be displayed; placed
# before the print its result was silently discarded.
print(static_master.shape)
static_master.head()

(530387, 35)
