# Preprocessing 

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [30]:
# MIMIC-IV ECG DATA
ecg_path = "../data/ecg_record_data.csv"

# MIMIC_IV HOSP
hosp_diagnosis_path = '../data/diagnoses_icd_data.csv'
hosp_admissions_path = '../data/admissions_data.csv'
hos_patient_path = '../data/patient_data.csv'


## Clean ECG Reports 
- flatten reports into 1 column, clean dtypes, and remove missing leads

In [31]:
df_ecg = pd.read_csv(ecg_path)
print(df_ecg.shape)
df_ecg.head()

  df_ecg = pd.read_csv(ecg_path)


(800035, 35)


Unnamed: 0,subject_id,study_id,cart_id,ecg_time,report_0,report_1,report_2,report_3,report_4,report_5,...,p_onset,p_end,qrs_onset,qrs_end,t_end,p_axis,qrs_axis,t_axis,file_name,path
0,10014354,43236346,6632385,2148-06-30 00:39:00,All 12 leads are missing,,,,,,...,29999,29999,0,29999,29999,0,0,32767,43236346,files/p1001/p10014354/s43236346/43236346
1,10093718,40158485,6632385,2194-07-17 18:01:00,All 12 leads are missing,,,,,,...,29999,29999,0,29999,29999,0,0,32767,40158485,files/p1009/p10093718/s40158485/40158485
2,10149624,46056367,6632385,2141-08-14 19:49:00,All 12 leads are missing,,,,,,...,29999,29999,0,29999,29999,0,0,32767,46056367,files/p1014/p10149624/s46056367/46056367
3,10170435,44602017,6632385,2178-11-14 18:40:00,All 12 leads are missing,,,,,,...,29999,29999,0,29999,29999,0,0,32767,44602017,files/p1017/p10170435/s44602017/44602017
4,10178472,45373918,6632385,2171-04-12 19:38:00,All 12 leads are missing,,,,,,...,29999,29999,0,29999,29999,0,0,32767,45373918,files/p1017/p10178472/s45373918/45373918


In [32]:
def clean_cols_types(df):
    """
    Normalize column types:
      - Convert columns containing 'date' or 'time' in their names to datetime.
      - Convert all other object columns to Pandas string dtype.
    """
    time_keywords = ("date", "time", "dod")

    for col in df.columns:
        col_lower = col.lower()

        if any(k in col_lower for k in time_keywords):
            df[col] = pd.to_datetime(df[col], errors="coerce")
            continue

        # Convert object columns to Pandas string
        if df[col].dtype == "object":
            df[col] = df[col].astype("string")

    return df

def flatten_columns(df, cols, output_col="flattened"):
    """
    Combine multiple report columns into one list column.
    """
    df[output_col] = df[cols].apply(
        lambda row: [s.strip() for s in row if pd.notna(s) and s.strip()],
        axis=1
    )
    return df.drop(columns=cols)

def preprocess_ecg_data(df):
    """
    Full pipeline to clean and flatten ECG report fields.
    Makes a copy of the input to avoid modifying the original.
    """
    report_cols = [col for col in df.columns if col.startswith("report_")]
    invalid_phrases = ["Uncertain rhythm: review", "All 12 leads are missing"]

    # Normalize column types
    df = clean_cols_types(df)

    # Flatten report columns into a single list column
    df = flatten_columns(df, report_cols, "full_report")

    # Remove rows containing invalid machine messages
    df = df[df["full_report"].apply(
        lambda lst: all(p not in lst for p in invalid_phrases)
    )]

    # Reset index after filtering
    df = df.reset_index(drop=True)

    return df

df_ecg_cleaned = preprocess_ecg_data(df_ecg)
df_ecg_cleaned.head()



Unnamed: 0,subject_id,study_id,cart_id,ecg_time,bandwidth,filtering,rr_interval,p_onset,p_end,qrs_onset,qrs_end,t_end,p_axis,qrs_axis,t_axis,file_name,path,full_report
0,18637389,42512653,6537058,2185-10-15 08:20:00,0.005-150 Hz,60 Hz notch Baseline filter,476,29999,29999,0,0,476,29999,29999,29999,42512653,files/p1863/p18637389/s42512653/42512653,[Atrial fibrillation with rapid ventricular re...
1,10002662,48509863,6044454,2183-02-22 20:11:00,0.005-150 Hz,60 Hz notch Baseline filter,1016,40,158,256,344,652,97,57,46,48509863,files/p1000/p10002662/s48509863/48509863,[Sinus bradycardia with borderline 1st degree ...
2,10014354,46738745,6245150,2148-06-25 00:17:00,0.005-150 Hz,60 Hz notch Baseline filter,937,40,174,256,348,656,55,-6,57,46738745,files/p1001/p10014354/s46738745/46738745,[Sinus rhythm with borderline 1st degree A-V b...
3,10020306,47789314,6398092,2116-10-08 12:43:00,0.005-150 Hz,60 Hz notch Baseline filter,631,40,168,256,352,624,44,23,100,47789314,files/p1002/p10020306/s47789314/47789314,[Sinus arrhythmia with PVCs with borderline 1s...
4,10020306,44589661,6049786,2127-01-13 09:04:00,0.005-150 Hz,60 Hz notch Baseline filter,810,40,172,256,344,638,58,20,12,44589661,files/p1002/p10020306/s44589661/44589661,[Sinus rhythm with borderline 1st degree A-V b...


## Cleaning Hospital Module
- Module contains admissions, patient info, and hospital diagnoses
- Cleaning rows, 

In [33]:
df_hosp_diagnosis = pd.read_csv(hosp_diagnosis_path)

def clean_diagnosis_data(df):
    df = clean_cols_types(df)

    # Sort to preserve order
    df = df.sort_values(["subject_id", "hadm_id", "seq_num"])

    # Group by subject_id and hadm_id and aggregate lists directly
    df_diag_agg = (
        df
        .groupby(["subject_id", "hadm_id"], as_index=False)
        .agg({
            "icd_code": list,      # list of ICD codes
            "long_title": list     # list of diagnosis names
        })
        .rename(columns={"icd_code": "icd_codes_diagnosis", "long_title": "diagnosis"})
    )
    return df_diag_agg

df_diagnosis_cleaned = clean_diagnosis_data(df_hosp_diagnosis)
df_diagnosis_cleaned.head()


Unnamed: 0,subject_id,hadm_id,icd_codes_diagnosis,diagnosis
0,10000032,22595853,"[5723, 78959, 5715, 07070, 496, 29680, 30981, ...","[Portal hypertension, Other ascites, Cirrhosis..."
1,10000032,22841357,"[07071, 78959, 2875, 2761, 496, 5715, V08, 3051]",[Unspecified viral hepatitis C with hepatic co...
2,10000032,25742920,"[07054, 78959, V462, 5715, 2767, 2761, 496, V0...",[Chronic hepatitis C without mention of hepati...
3,10000032,29079034,"[45829, 07044, 7994, 2761, 78959, 2767, 3051, ...","[Other iatrogenic hypotension, Chronic hepatit..."
4,10000117,22927623,"[R1310, R0989, K31819, K219, K449, F419, I341,...","[Dysphagia, unspecified, Other specified sympt..."


In [34]:
df_hosp_admissions = pd.read_csv(hosp_admissions_path)

def preprocess_admissions(df):
    df = df.drop(columns=['insurance', 'admission_location', 'marital_status', 'hospital_expire_flag', 'language', 'marital_status'])
    df = clean_cols_types(df)
    return df

df_hosp_admissions_cleaned = preprocess_admissions(df_hosp_admissions)
df_hosp_admissions_cleaned.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,discharge_location,race,edregtime,edouttime
0,10106244,26713233,2147-05-09 10:34:00,2147-05-12 13:43:00,NaT,DIRECT EMER.,,HOME,WHITE,NaT,NaT
1,13700703,20448599,2172-09-25 01:01:00,2172-10-03 13:25:00,NaT,OBSERVATION ADMIT,,HOME,WHITE,2172-09-24 17:38:00,2172-09-25 03:07:00
2,15443666,27961368,2168-12-30 23:30:00,2169-01-05 16:02:00,NaT,OBSERVATION ADMIT,,HOME HEALTH CARE,BLACK/AFRICAN AMERICAN,2168-12-30 11:19:00,2168-12-31 01:22:00
3,16299919,26977065,2193-05-15 08:37:00,2193-05-17 16:03:00,NaT,OBSERVATION ADMIT,,HOSPICE,BLACK/AFRICAN AMERICAN,2193-05-15 04:36:00,2193-05-15 14:27:00
4,14149715,24191358,2181-10-25 19:37:00,2181-10-29 14:38:00,NaT,OBSERVATION ADMIT,P00230,SKILLED NURSING FACILITY,WHITE,2181-10-25 08:48:00,2181-10-26 15:18:00


In [35]:
df_hosp_patient = pd.read_csv(hos_patient_path)

def preprocess_patient(df):
    df = df.drop(columns=['anchor_year', 'anchor_year_group'])
    df = clean_cols_types(df)
    return df

df_hosp_patient_cleaned = preprocess_patient(df_hosp_patient)
df_hosp_patient_cleaned.head()

Unnamed: 0,subject_id,gender,anchor_age,dod
0,11289691,F,18,NaT
1,11806971,F,18,NaT
2,12107404,F,18,NaT
3,12143996,F,18,NaT
4,13117076,F,18,NaT


## Merge dfs

In [36]:
df_hosp_patient = df_hosp_admissions_cleaned.merge(df_hosp_patient_cleaned, on="subject_id")
df_hosp_patient['death_time'] = df_hosp_patient['deathtime'].combine_first(df_hosp_patient['dod'])
df_hosp_patient.drop(columns=['dod', 'deathtime'], inplace=True)
df_hosp_patient.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,admission_type,admit_provider_id,discharge_location,race,edregtime,edouttime,gender,anchor_age,death_time
0,10106244,26713233,2147-05-09 10:34:00,2147-05-12 13:43:00,DIRECT EMER.,,HOME,WHITE,NaT,NaT,F,60,NaT
1,13700703,20448599,2172-09-25 01:01:00,2172-10-03 13:25:00,OBSERVATION ADMIT,,HOME,WHITE,2172-09-24 17:38:00,2172-09-25 03:07:00,F,56,NaT
2,15443666,27961368,2168-12-30 23:30:00,2169-01-05 16:02:00,OBSERVATION ADMIT,,HOME HEALTH CARE,BLACK/AFRICAN AMERICAN,2168-12-30 11:19:00,2168-12-31 01:22:00,F,75,2175-03-01
3,16299919,26977065,2193-05-15 08:37:00,2193-05-17 16:03:00,OBSERVATION ADMIT,,HOSPICE,BLACK/AFRICAN AMERICAN,2193-05-15 04:36:00,2193-05-15 14:27:00,F,91,2194-02-03
4,14149715,24191358,2181-10-25 19:37:00,2181-10-29 14:38:00,OBSERVATION ADMIT,P00230,SKILLED NURSING FACILITY,WHITE,2181-10-25 08:48:00,2181-10-26 15:18:00,F,64,2186-05-01


In [37]:
df_hosp_patient.shape

(395931, 13)

In [38]:
# missing values only because no diagnosis maybe in icu or ed?

df_hosp_patient_diag = df_hosp_patient.merge(df_diagnosis_cleaned, on=["subject_id", "hadm_id"], how="left")
df_hosp_patient_diag

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,admission_type,admit_provider_id,discharge_location,race,edregtime,edouttime,gender,anchor_age,death_time,icd_codes_diagnosis,diagnosis
0,10106244,26713233,2147-05-09 10:34:00,2147-05-12 13:43:00,DIRECT EMER.,,HOME,WHITE,NaT,NaT,F,60,NaT,"[I25110, I2542, I501, T82593A, J9811, I9589, Y...",[Atherosclerotic heart disease of native coron...
1,13700703,20448599,2172-09-25 01:01:00,2172-10-03 13:25:00,OBSERVATION ADMIT,,HOME,WHITE,2172-09-24 17:38:00,2172-09-25 03:07:00,F,56,NaT,"[K224, R1013, E039, G4733, G894, D72829, Y848,...","[Dyskinesia of esophagus, Epigastric pain, Hyp..."
2,15443666,27961368,2168-12-30 23:30:00,2169-01-05 16:02:00,OBSERVATION ADMIT,,HOME HEALTH CARE,BLACK/AFRICAN AMERICAN,2168-12-30 11:19:00,2168-12-31 01:22:00,F,75,2175-03-01,"[T82538A, E1140, I509, I482, I10, I714, K5900,...",[Leakage of other cardiac and vascular devices...
3,16299919,26977065,2193-05-15 08:37:00,2193-05-17 16:03:00,OBSERVATION ADMIT,,HOSPICE,BLACK/AFRICAN AMERICAN,2193-05-15 04:36:00,2193-05-15 14:27:00,F,91,2194-02-03,"[I120, N179, N185, E1121, F0390, Z794, Z87891,...",[Hypertensive chronic kidney disease with stag...
4,14149715,24191358,2181-10-25 19:37:00,2181-10-29 14:38:00,OBSERVATION ADMIT,P00230,SKILLED NURSING FACILITY,WHITE,2181-10-25 08:48:00,2181-10-26 15:18:00,F,64,2186-05-01,"[S12120A, I4891, W109XXA, Y92009, I10, E785, I...","[Other displaced dens fracture, initial encoun..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395926,16711441,22500229,2179-11-16 17:15:00,2179-11-18 12:45:00,OBSERVATION ADMIT,P99Z33,HOME HEALTH CARE,HISPANIC/LATINO - DOMINICAN,2179-11-16 12:06:00,2179-11-16 19:06:00,M,58,NaT,"[L02612, L03116, I10, E119, S90872A, E785, W57...","[Cutaneous abscess of left foot, Cellulitis of..."
395927,18766758,25415341,2131-12-08 18:55:00,2131-12-12 16:15:00,OBSERVATION ADMIT,P99Z33,HOME HEALTH CARE,WHITE,2131-12-08 14:28:00,2131-12-08 19:46:00,M,34,NaT,"[E11621, L97511, M216X1, L97421, L03115, Z9481...","[Type 2 diabetes mellitus with foot ulcer, Non..."
395928,11286186,23566382,2157-04-08 09:15:00,2157-04-09 17:00:00,DIRECT OBSERVATION,P99Z33,,WHITE,NaT,NaT,F,49,NaT,"[S92341A, S92351A, W19XXXA, Y929, G40802, K219...","[Displaced fracture of fourth metatarsal bone,..."
395929,16578860,26155863,2150-12-07 03:38:00,2150-12-08 16:11:00,DIRECT OBSERVATION,P99Z33,,HISPANIC/LATINO - DOMINICAN,NaT,NaT,M,64,NaT,"[M86171, I70261, I10, F17210, Z7902, Z95828, E...","[Other acute osteomyelitis, right ankle and fo..."


In [39]:
df_ecg_cleaned.head()

Unnamed: 0,subject_id,study_id,cart_id,ecg_time,bandwidth,filtering,rr_interval,p_onset,p_end,qrs_onset,qrs_end,t_end,p_axis,qrs_axis,t_axis,file_name,path,full_report
0,18637389,42512653,6537058,2185-10-15 08:20:00,0.005-150 Hz,60 Hz notch Baseline filter,476,29999,29999,0,0,476,29999,29999,29999,42512653,files/p1863/p18637389/s42512653/42512653,[Atrial fibrillation with rapid ventricular re...
1,10002662,48509863,6044454,2183-02-22 20:11:00,0.005-150 Hz,60 Hz notch Baseline filter,1016,40,158,256,344,652,97,57,46,48509863,files/p1000/p10002662/s48509863/48509863,[Sinus bradycardia with borderline 1st degree ...
2,10014354,46738745,6245150,2148-06-25 00:17:00,0.005-150 Hz,60 Hz notch Baseline filter,937,40,174,256,348,656,55,-6,57,46738745,files/p1001/p10014354/s46738745/46738745,[Sinus rhythm with borderline 1st degree A-V b...
3,10020306,47789314,6398092,2116-10-08 12:43:00,0.005-150 Hz,60 Hz notch Baseline filter,631,40,168,256,352,624,44,23,100,47789314,files/p1002/p10020306/s47789314/47789314,[Sinus arrhythmia with PVCs with borderline 1s...
4,10020306,44589661,6049786,2127-01-13 09:04:00,0.005-150 Hz,60 Hz notch Baseline filter,810,40,172,256,344,638,58,20,12,44589661,files/p1002/p10020306/s44589661/44589661,[Sinus rhythm with borderline 1st degree A-V b...


In [40]:
df_ecg_cleaned.shape

(799831, 18)

Unnamed: 0,subject_id,ecg_id,study_id,cart_id,ecg_time,bandwidth,filtering,rr_interval,p_onset,p_end,...,t_axis,file_name,path,full_report,hadm_id,admittime,dischtime,in_admission,time_dist,pre_or_post
0,10000032,0,44458630,6848296,2180-07-23 09:54:00,0.005-150 Hz,60 Hz notch Baseline filter,722,40,124,...,70,44458630,files/p1000/p10000032/s44458630/44458630,"[Sinus rhythm, Possible right atrial abnormali...",29079034.0,2180-07-23 12:35:00,2180-07-25 17:55:00,False,0 days 02:41:00,pre
1,10000032,1,44458630,6848296,2180-07-23 09:54:00,0.005-150 Hz,60 Hz notch Baseline filter,722,40,124,...,70,44458630,files/p1000/p10000032/s44458630/44458630,"[Sinus rhythm, Possible right atrial abnormali...",25742920.0,2180-08-05 23:44:00,2180-08-07 17:50:00,False,13 days 13:50:00,pre
2,10000032,2,44458630,6848296,2180-07-23 09:54:00,0.005-150 Hz,60 Hz notch Baseline filter,722,40,124,...,70,44458630,files/p1000/p10000032/s44458630/44458630,"[Sinus rhythm, Possible right atrial abnormali...",22595853.0,2180-05-06 22:23:00,2180-05-07 17:15:00,False,76 days 16:39:00,post
3,10000032,3,44458630,6848296,2180-07-23 09:54:00,0.005-150 Hz,60 Hz notch Baseline filter,722,40,124,...,70,44458630,files/p1000/p10000032/s44458630/44458630,"[Sinus rhythm, Possible right atrial abnormali...",22841357.0,2180-06-26 18:27:00,2180-06-27 18:49:00,False,25 days 15:05:00,post
4,10000032,4,40689238,6848296,2180-07-23 08:44:00,0.005-150 Hz,60 Hz notch Baseline filter,659,40,128,...,79,40689238,files/p1000/p10000032/s40689238/40689238,"[Sinus rhythm, Possible right atrial abnormali...",29079034.0,2180-07-23 12:35:00,2180-07-25 17:55:00,False,0 days 03:51:00,pre


In [60]:
df_ecg_linked[df_ecg_linked['subject_id'] == 10000032]

Unnamed: 0,subject_id,ecg_id,study_id,cart_id,ecg_time,bandwidth,filtering,rr_interval,p_onset,p_end,...,t_axis,file_name,path,full_report,hadm_id,admittime,dischtime,in_admission,time_dist,pre_or_post
0,10000032,0,44458630,6848296,2180-07-23 09:54:00,0.005-150 Hz,60 Hz notch Baseline filter,722,40,124,...,70,44458630,files/p1000/p10000032/s44458630/44458630,"[Sinus rhythm, Possible right atrial abnormali...",29079034.0,2180-07-23 12:35:00,2180-07-25 17:55:00,False,0 days 02:41:00,pre
1,10000032,1,44458630,6848296,2180-07-23 09:54:00,0.005-150 Hz,60 Hz notch Baseline filter,722,40,124,...,70,44458630,files/p1000/p10000032/s44458630/44458630,"[Sinus rhythm, Possible right atrial abnormali...",25742920.0,2180-08-05 23:44:00,2180-08-07 17:50:00,False,13 days 13:50:00,pre
2,10000032,2,44458630,6848296,2180-07-23 09:54:00,0.005-150 Hz,60 Hz notch Baseline filter,722,40,124,...,70,44458630,files/p1000/p10000032/s44458630/44458630,"[Sinus rhythm, Possible right atrial abnormali...",22595853.0,2180-05-06 22:23:00,2180-05-07 17:15:00,False,76 days 16:39:00,post
3,10000032,3,44458630,6848296,2180-07-23 09:54:00,0.005-150 Hz,60 Hz notch Baseline filter,722,40,124,...,70,44458630,files/p1000/p10000032/s44458630/44458630,"[Sinus rhythm, Possible right atrial abnormali...",22841357.0,2180-06-26 18:27:00,2180-06-27 18:49:00,False,25 days 15:05:00,post
4,10000032,4,40689238,6848296,2180-07-23 08:44:00,0.005-150 Hz,60 Hz notch Baseline filter,659,40,128,...,79,40689238,files/p1000/p10000032/s40689238/40689238,"[Sinus rhythm, Possible right atrial abnormali...",29079034.0,2180-07-23 12:35:00,2180-07-25 17:55:00,False,0 days 03:51:00,pre
5,10000032,5,40689238,6848296,2180-07-23 08:44:00,0.005-150 Hz,60 Hz notch Baseline filter,659,40,128,...,79,40689238,files/p1000/p10000032/s40689238/40689238,"[Sinus rhythm, Possible right atrial abnormali...",25742920.0,2180-08-05 23:44:00,2180-08-07 17:50:00,False,13 days 15:00:00,pre
6,10000032,6,40689238,6848296,2180-07-23 08:44:00,0.005-150 Hz,60 Hz notch Baseline filter,659,40,128,...,79,40689238,files/p1000/p10000032/s40689238/40689238,"[Sinus rhythm, Possible right atrial abnormali...",22595853.0,2180-05-06 22:23:00,2180-05-07 17:15:00,False,76 days 15:29:00,post
7,10000032,7,40689238,6848296,2180-07-23 08:44:00,0.005-150 Hz,60 Hz notch Baseline filter,659,40,128,...,79,40689238,files/p1000/p10000032/s40689238/40689238,"[Sinus rhythm, Possible right atrial abnormali...",22841357.0,2180-06-26 18:27:00,2180-06-27 18:49:00,False,25 days 13:55:00,post
8,10000032,8,49036311,6376932,2180-08-06 09:07:00,0.005-150 Hz,60 Hz notch Baseline filter,600,40,130,...,77,49036311,files/p1000/p10000032/s49036311/49036311,"[Sinus tachycardia, Normal ECG except for rate]",29079034.0,2180-07-23 12:35:00,2180-07-25 17:55:00,False,11 days 15:12:00,post
9,10000032,9,49036311,6376932,2180-08-06 09:07:00,0.005-150 Hz,60 Hz notch Baseline filter,600,40,130,...,77,49036311,files/p1000/p10000032/s49036311/49036311,"[Sinus tachycardia, Normal ECG except for rate]",25742920.0,2180-08-05 23:44:00,2180-08-07 17:50:00,True,0 days 00:00:00,during


In [54]:
df_ecg_cleaned[df_ecg_cleaned['subject_id'] == 10000032]

Unnamed: 0,subject_id,study_id,cart_id,ecg_time,bandwidth,filtering,rr_interval,p_onset,p_end,qrs_onset,qrs_end,t_end,p_axis,qrs_axis,t_axis,file_name,path,full_report
24095,10000032,44458630,6848296,2180-07-23 09:54:00,0.005-150 Hz,60 Hz notch Baseline filter,722,40,124,162,246,504,77,75,70,44458630,files/p1000/p10000032/s44458630/44458630,"[Sinus rhythm, Possible right atrial abnormali..."
34434,10000032,40689238,6848296,2180-07-23 08:44:00,0.005-150 Hz,60 Hz notch Baseline filter,659,40,128,170,258,518,81,77,79,40689238,files/p1000/p10000032/s40689238/40689238,"[Sinus rhythm, Possible right atrial abnormali..."
495178,10000032,49036311,6376932,2180-08-06 09:07:00,0.005-150 Hz,60 Hz notch Baseline filter,600,40,130,162,244,474,79,72,77,49036311,files/p1000/p10000032/s49036311/49036311,"[Sinus tachycardia, Normal ECG except for rate]"


In [55]:
df_hosp_patient_diag[df_hosp_patient_diag['subject_id'] == 10000032]

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,admission_type,admit_provider_id,discharge_location,race,edregtime,edouttime,gender,anchor_age,death_time,icd_codes_diagnosis,diagnosis
28814,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,EW EMER.,P06OTX,HOME,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,F,52,2180-09-09,"[45829, 07044, 7994, 2761, 78959, 2767, 3051, ...","[Other iatrogenic hypotension, Chronic hepatit..."
83885,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,EW EMER.,P19UTS,HOSPICE,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,F,52,2180-09-09,"[07054, 78959, V462, 5715, 2767, 2761, 496, V0...",[Chronic hepatitis C without mention of hepati...
209828,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,URGENT,P49AFC,HOME,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,F,52,2180-09-09,"[5723, 78959, 5715, 07070, 496, 29680, 30981, ...","[Portal hypertension, Other ascites, Cirrhosis..."
312930,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,EW EMER.,P784FA,HOME,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,F,52,2180-09-09,"[07071, 78959, 2875, 2761, 496, 5715, V08, 3051]",[Unspecified viral hepatitis C with hepatic co...


In [45]:
df_final.columns

Index(['subject_id', 'ecg_id', 'study_id', 'cart_id', 'ecg_time', 'bandwidth',
       'filtering', 'rr_interval', 'p_onset', 'p_end', 'qrs_onset', 'qrs_end',
       't_end', 'p_axis', 'qrs_axis', 't_axis', 'file_name', 'path',
       'full_report', 'hadm_id', 'admittime', 'dischtime', 'admission_type',
       'admit_provider_id', 'discharge_location', 'race', 'edregtime',
       'edouttime', 'gender', 'anchor_age', 'death_time',
       'icd_codes_diagnosis', 'diagnosis', 'in_admission', 'time_dist'],
      dtype='object')