In [1]:
import polars as pl
import os
import pandas as pd
import json

In [2]:
def encode_column_and_replace(df, col_name,encoding_dict_=None):

    if encoding_dict_ is None:
        out = df.select(pl.col(col_name).unique()).with_row_count()
        out = out.select([col_name,'row_nr'])
        encoding_dict = dict(out.iter_rows())
    else:
        encoding_dict = encoding_dict_
    df = df.with_columns(pl.col(col_name).map_dict(encoding_dict,return_dtype=pl.UInt64).alias(col_name+'_encoded'))
    return df,encoding_dict

def encode_two_columns(df, col_1,col_2):
    encoding_dict = dict(df.select([col_1,col_2]).unique().iter_rows())
    return encoding_dict



# Cleaning patient history sample

In [4]:
patient_history_encoding = {}
cols_to_encode = ['dx_name']
data = pl.read_csv('EPIC_EMR/patient_history.csv',dtypes={'diagnosis_code':pl.Utf8})
data = data.unique()

In [5]:
data.shape

(437721, 3)

In [6]:
data.head()

mrn,diagnosis_code,dx_name
str,str,str
"""0fef63bb63c4e4…","""218.0""","""Submucous leio…"
"""5a6b742b38dbc6…","""218.0""","""Submucous leio…"
"""13e76a6d564f80…","""238.2""","""Neoplasm of un…"
"""e10847d0e11f75…","""238.2""","""Neoplasm of un…"
"""e9e71c10b45052…","""238.2""","""Neoplasm of un…"


In [9]:
patient_history_encoding['dx_name___diagnosis_code'] = encode_two_columns(data,'dx_name','diagnosis_code')
for col in cols_to_encode:
    data, patient_history_encoding[col] = encode_column_and_replace(data,col)


In [12]:
data.head()

mrn,diagnosis_code,dx_name,dx_name_encoded
str,str,str,u64
"""0fef63bb63c4e4…","""218.0""","""Submucous leio…",47669
"""5a6b742b38dbc6…","""218.0""","""Submucous leio…",47669
"""13e76a6d564f80…","""238.2""","""Neoplasm of un…",11621
"""e10847d0e11f75…","""238.2""","""Neoplasm of un…",11621
"""e9e71c10b45052…","""238.2""","""Neoplasm of un…",11621


In [13]:
data.null_count()

mrn,diagnosis_code,dx_name,dx_name_encoded
u32,u32,u32,u32
0,107424,0,0


In [14]:
patient_history_encoding.keys()

dict_keys(['dx_name___diagnosis_code', 'dx_name'])

In [15]:
data = data.select([
    'mrn','dx_name_encoded'])
data.shape

(437721, 2)

In [12]:
data

mrn,dx_name_encoded
str,u64
"""a75b440dac3786…",9815
"""562502ca0db7f0…",9815
"""2bfcf800962488…",57567
"""86baf59d6a64fb…",57567
"""db2594a3d37bf9…",5364
"""4f7e1947de3828…",4663
"""04fb9499c613f6…",789
"""a552c79d1629d9…",20106
"""bb8a52ebd77529…",20106
"""85983642e609b3…",20106


In [16]:
data.write_csv('EPIC_EMR_cleaned/patient_history_cleaned.csv')

In [17]:
patient_history_encoding

{'dx_name___diagnosis_code': {'Acute maxillary sinusitis': '461.0',
  'Mitral valve prolapse': '424.0',
  'Nasal bone fx-closed': '802.0',
  'Celiac disease': '579.0',
  'Hydroureter': '593.5',
  'Supraventricular tachycardia (CMS-HCC)': '427.89',
  'Thrombophlebitis': '451.9',
  'Septic arthritis (CMS-HCC)': '711.00',
  'Meibomian gland disease': '374.89',
  'Inguinal hernia': '550.90',
  'Microprolactinoma (CMS-HCC)': '227.3',
  'S/P left inguinal hernia repair': 'V45.89',
  'Anemia due to antineoplastic chemotherapy': '285.3',
  'CHF exacerbation (CMS-HCC)': '428.0',
  'Bleeding duodenal ulcer': '532.40',
  'Dialysis patient (CMS-HCC)': 'V45.11',
  'Stem cells transplant status (CMS-HCC)': 'V42.82',
  'Contracture of left knee': '718.46',
  'Borborygmi': '787.5',
  'Eye lesion': '379.90',
  'Retained magnetic metal fragment': 'V90.11',
  'Fibroadenoma of breast': '217',
  'Acute hip pain': '719.45',
  'Routine history and physical examination of adult': 'V70.0',
  'Posterior disloca

In [18]:
len(patient_history_encoding['dx_name'])

59056

# Cleaning patient information
Here The columns ICU_ADMIN_FLAG, SEX, PRIMARY_ANES_TYPE_NM, PATIENT_CLASS_GROUP, DISCH_DISP_C, ASA_RATING_C are encoded

In [19]:
patient_information_encoding = {}
cols_to_encode = ['ICU_ADMIN_FLAG','SEX','PRIMARY_ANES_TYPE_NM','PATIENT_CLASS_GROUP'] # DISCH_DISP
data = pl.read_csv('EPIC_EMR/patient_information.csv',dtypes={})
data = data.unique()
data = data.with_columns(pl.col('DISCH_DISP_C').cast(pl.UInt64),pl.col('ASA_RATING_C').cast(pl.UInt64))

two_columns_encoding = [('DISCH_DISP','DISCH_DISP_C'),('ASA_RATING','ASA_RATING_C')]


In [18]:
data

LOG_ID,MRN,DISCH_DISP_C,DISCH_DISP,HOSP_ADMSN_TIME,HOSP_DISCH_TIME,LOS,ICU_ADMIN_FLAG,SURGERY_DATE,BIRTH_DATE,HEIGHT,WEIGHT,SEX,PRIMARY_ANES_TYPE_NM,ASA_RATING_C,ASA_RATING,PATIENT_CLASS_GROUP,PATIENT_CLASS_NM,PRIMARY_PROCEDURE_NM,IN_OR_DTTM,OUT_OR_DTTM,AN_START_DATETIME,AN_STOP_DATETIME
str,str,u64,str,str,str,f64,str,str,i64,str,f64,str,str,u64,str,str,str,str,str,str,str,str
"""8778901eeb9b61…","""2e34dcb0791b8c…",15,"""Home Routine""","""3/17/20 7:01""","""3/18/20 12:50""",1.0,"""No""","""3/17/20 0:00""",59,"""5' 6""",2640.0,"""Female""","""General""",2,"""Mild Systemic …","""Outpatient""","""Hospital Outpa…","""EGD (ESOPHAGOG…","""3/17/20 8:18""","""3/17/20 10:19""","""3/17/20 8:18""","""3/17/20 10:22"""
"""b1df0a3b9037bd…","""1a479b8bad2165…",6,"""Skilled Nursin…","""8/7/20 11:58""","""8/24/20 20:40""",17.0,"""Yes""","""8/7/20 0:00""",62,"""5' 7""",2880.0,"""Male""","""General""",3,"""Severe Systemi…","""Inpatient""","""Inpatient Admi…","""ANGIOGRAM, SPI…",,,,
"""987faad1df9faa…","""c3ce84c4f165f9…",100,"""Rehab Facility…","""1/11/20 6:07""","""1/20/20 13:56""",9.0,"""No""","""1/13/20 0:00""",80,"""5' 3""",3280.44,"""Female""","""General""",3,"""Severe Systemi…","""Inpatient""","""Inpatient Admi…","""LAMINECTOMY, T…","""1/13/20 7:26""","""1/13/20 13:25""","""1/13/20 7:26""","""1/13/20 13:31"""
"""9be3203bbac4d2…","""b167a720bf50bf…",15,"""Home Routine""","""3/1/19 5:20""","""3/4/19 19:00""",3.0,"""No""","""3/1/19 0:00""",42,"""5' 7""",2320.0,"""Female""","""General""",1,"""Healthy""","""Outpatient""","""Hospital Outpa…","""ABDOMINOPLASTY…","""3/1/19 7:20""","""3/1/19 13:18""","""3/1/19 7:18""","""3/1/19 13:29"""
"""f0d16bb8e59321…","""95d54ee9f5abfa…",15,"""Home Routine""","""12/14/18 7:16""","""12/16/18 16:05…",2.0,"""No""","""12/14/18 0:00""",43,"""5' 11""",3710.78,"""Male""","""General""",2,"""Mild Systemic …","""Outpatient""","""Hospital Outpa…","""URETHROPLASTY,…","""12/14/18 8:44""","""12/14/18 12:14…","""12/14/18 8:46""","""12/14/18 12:30…"
"""e7d71e174a13e2…","""3f39f7519080cc…",15,"""Home Routine""","""4/1/19 5:43""","""4/3/19 11:20""",2.0,"""Yes""","""4/1/19 0:00""",50,,2753.1,"""Male""","""General""",3,"""Severe Systemi…","""Inpatient""","""Hospital Inpat…","""CRANIECTOMY""","""4/1/19 7:12""","""4/1/19 12:43""","""4/1/19 7:13""","""4/1/19 12:50"""
"""32f1980e394417…","""0dcace1cd2a4d8…",15,"""Home Routine""","""6/22/19 8:10""","""6/23/19 11:15""",1.0,"""No""","""6/22/19 0:00""",79,,1968.27,"""Male""","""General""",3,"""Severe Systemi…","""Outpatient""","""Hospital Outpa…","""DECOMPRESSION,…","""6/22/19 10:36""","""6/22/19 13:05""","""6/22/19 10:36""","""6/22/19 13:12"""
"""e294deeebb209d…","""fd2eaa1d01abe9…",15,"""Home Routine""","""12/26/18 12:52…","""12/30/18 11:15…",4.0,"""No""","""12/26/18 0:00""",65,,2892.44,"""Male""","""General""",3,"""Severe Systemi…","""Inpatient""","""Hospital Inpat…","""LAMINOPLASTY, …","""12/26/18 16:04…","""12/26/18 20:43…","""12/26/18 16:04…","""12/26/18 20:49…"
"""a4bcc1f2c94953…","""4de7abdce9b27d…",15,"""Home Routine""","""4/6/19 8:51""","""4/6/19 13:23""",0.0,"""No""","""4/6/19 0:00""",37,,2537.94,"""Male""","""Monitored Anes…",2,"""Mild Systemic …","""Outpatient""","""Hospital Outpa…","""RELEASE, CONTR…","""4/6/19 10:38""","""4/6/19 12:09""","""4/6/19 10:38""","""4/6/19 12:13"""
"""0b90ab367bb7f2…","""173e0cc931b2e3…",15,"""Home Routine""","""9/28/19 5:02""","""9/30/19 11:15""",2.0,"""No""","""9/28/19 0:00""",67,"""5' 9""",3684.33,"""Female""","""Monitored Anes…",2,"""Mild Systemic …","""Inpatient""","""Hospital Inpat…","""ARTHROPLASTY, …","""9/28/19 7:12""","""9/28/19 10:14""","""9/28/19 7:12""","""9/28/19 10:17"""


In [20]:
for col_1,col_2 in two_columns_encoding:
    patient_information_encoding[col_1+'___'+col_2] = encode_two_columns(data,col_1,col_2)

for col in cols_to_encode:
    data, patient_information_encoding[col] = encode_column_and_replace(data,col)


In [21]:
data.null_count()

LOG_ID,MRN,DISCH_DISP_C,DISCH_DISP,HOSP_ADMSN_TIME,HOSP_DISCH_TIME,LOS,ICU_ADMIN_FLAG,SURGERY_DATE,BIRTH_DATE,HEIGHT,WEIGHT,SEX,PRIMARY_ANES_TYPE_NM,ASA_RATING_C,ASA_RATING,PATIENT_CLASS_GROUP,PATIENT_CLASS_NM,PRIMARY_PROCEDURE_NM,IN_OR_DTTM,OUT_OR_DTTM,AN_START_DATETIME,AN_STOP_DATETIME,ICU_ADMIN_FLAG_encoded,SEX_encoded,PRIMARY_ANES_TYPE_NM_encoded,PATIENT_CLASS_GROUP_encoded
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,7,7,0,14,14,0,0,0,12726,2363,0,36,6811,6811,0,0,6,6403,6469,7313,7327,0,0,0,0


In [22]:

data, patient_information_encoding['DISCH_DISP'] = encode_column_and_replace(data,'DISCH_DISP',patient_information_encoding['DISCH_DISP___DISCH_DISP_C'])
data, patient_information_encoding['ASA_RATING'] = encode_column_and_replace(data,'ASA_RATING',patient_information_encoding['ASA_RATING___ASA_RATING_C'])


In [22]:
data.null_count()

LOG_ID,MRN,DISCH_DISP_C,DISCH_DISP,HOSP_ADMSN_TIME,HOSP_DISCH_TIME,LOS,ICU_ADMIN_FLAG,SURGERY_DATE,BIRTH_DATE,HEIGHT,WEIGHT,SEX,PRIMARY_ANES_TYPE_NM,ASA_RATING_C,ASA_RATING,PATIENT_CLASS_GROUP,PATIENT_CLASS_NM,PRIMARY_PROCEDURE_NM,IN_OR_DTTM,OUT_OR_DTTM,AN_START_DATETIME,AN_STOP_DATETIME,ICU_ADMIN_FLAG_encoded,SEX_encoded,PRIMARY_ANES_TYPE_NM_encoded,PATIENT_CLASS_GROUP_encoded,DISCH_DISP_encoded,ASA_RATING_encoded
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,7,7,0,14,14,0,0,0,12726,2363,0,36,6811,6811,0,0,6,6403,6469,7313,7327,0,0,0,0,7,6811


In [23]:
data.head()

LOG_ID,MRN,DISCH_DISP_C,DISCH_DISP,HOSP_ADMSN_TIME,HOSP_DISCH_TIME,LOS,ICU_ADMIN_FLAG,SURGERY_DATE,BIRTH_DATE,HEIGHT,WEIGHT,SEX,PRIMARY_ANES_TYPE_NM,ASA_RATING_C,ASA_RATING,PATIENT_CLASS_GROUP,PATIENT_CLASS_NM,PRIMARY_PROCEDURE_NM,IN_OR_DTTM,OUT_OR_DTTM,AN_START_DATETIME,AN_STOP_DATETIME,ICU_ADMIN_FLAG_encoded,SEX_encoded,PRIMARY_ANES_TYPE_NM_encoded,PATIENT_CLASS_GROUP_encoded,DISCH_DISP_encoded,ASA_RATING_encoded
str,str,u64,str,str,str,f64,str,str,i64,str,f64,str,str,u64,str,str,str,str,str,str,str,str,u64,u64,u64,u64,u64,u64
"""8778901eeb9b61…","""2e34dcb0791b8c…",15,"""Home Routine""","""3/17/20 7:01""","""3/18/20 12:50""",1.0,"""No""","""3/17/20 0:00""",59,"""5' 6""",2640.0,"""Female""","""General""",2,"""Mild Systemic …","""Outpatient""","""Hospital Outpa…","""EGD (ESOPHAGOG…","""3/17/20 8:18""","""3/17/20 10:19""","""3/17/20 8:18""","""3/17/20 10:22""",0,0,11,1,15,2
"""b1df0a3b9037bd…","""1a479b8bad2165…",6,"""Skilled Nursin…","""8/7/20 11:58""","""8/24/20 20:40""",17.0,"""Yes""","""8/7/20 0:00""",62,"""5' 7""",2880.0,"""Male""","""General""",3,"""Severe Systemi…","""Inpatient""","""Inpatient Admi…","""ANGIOGRAM, SPI…",,,,,1,2,11,0,6,3
"""987faad1df9faa…","""c3ce84c4f165f9…",100,"""Rehab Facility…","""1/11/20 6:07""","""1/20/20 13:56""",9.0,"""No""","""1/13/20 0:00""",80,"""5' 3""",3280.44,"""Female""","""General""",3,"""Severe Systemi…","""Inpatient""","""Inpatient Admi…","""LAMINECTOMY, T…","""1/13/20 7:26""","""1/13/20 13:25""","""1/13/20 7:26""","""1/13/20 13:31""",0,0,11,0,100,3
"""9be3203bbac4d2…","""b167a720bf50bf…",15,"""Home Routine""","""3/1/19 5:20""","""3/4/19 19:00""",3.0,"""No""","""3/1/19 0:00""",42,"""5' 7""",2320.0,"""Female""","""General""",1,"""Healthy""","""Outpatient""","""Hospital Outpa…","""ABDOMINOPLASTY…","""3/1/19 7:20""","""3/1/19 13:18""","""3/1/19 7:18""","""3/1/19 13:29""",0,0,11,1,15,1
"""f0d16bb8e59321…","""95d54ee9f5abfa…",15,"""Home Routine""","""12/14/18 7:16""","""12/16/18 16:05…",2.0,"""No""","""12/14/18 0:00""",43,"""5' 11""",3710.78,"""Male""","""General""",2,"""Mild Systemic …","""Outpatient""","""Hospital Outpa…","""URETHROPLASTY,…","""12/14/18 8:44""","""12/14/18 12:14…","""12/14/18 8:46""","""12/14/18 12:30…",0,2,11,1,15,2


In [24]:
patient_information_encoding

{'DISCH_DISP___DISCH_DISP_C': {'Hospice Home': 22,
  'Board and Care': 103,
  None: None,
  "Cancer Ctr/Children's Hospital": 18,
  'Temporary Living': 108,
  'Jail/Prison': 10,
  'Federal Hospital': 11,
  'Other Healthcare Not Defined in this List': 70,
  'Independent Living': 106,
  'Acute Care Facility (this hospital)': 8,
  'Against Medical Advice': 13,
  'Recuperative Care': 105,
  'Home Healthcare Outside 3 Days': 21,
  'Psychiatric Facility (not this hospital)': 9,
  'Intermediate/Residential Care Facility': 5,
  'Designated Disaster Alternate Care Site': 69,
  'Hospice Facility': 16,
  'Shelter': 102,
  'Coroner': 23,
  'Skilled Nursing w Planned Readmit': 83,
  'Home Routine': 15,
  'Home Healthcare Outpatient Related': 109,
  'Home Health w Planned Readmit': 86,
  'Rehab Facility (this hospital)': 100,
  'Acute Care Facility (not this hospital)': 26,
  'Home Healthcare IP Admit Related': 20,
  'Expired': 3,
  'Psychiatric Facility (this hospital)': 19,
  'Sub-Acute Care Facil

The Height is fiven in feet'inch metric. We convert this into centimeters.

In [23]:
data = data.with_columns(pl.col('HEIGHT').str.split(by="'").map_elements(lambda x: float(x[0])*30.48  + float(x[1])*2.54).alias('HEIGHT_IN_CM'))

In [24]:
data.head()

LOG_ID,MRN,DISCH_DISP_C,DISCH_DISP,HOSP_ADMSN_TIME,HOSP_DISCH_TIME,LOS,ICU_ADMIN_FLAG,SURGERY_DATE,BIRTH_DATE,HEIGHT,WEIGHT,SEX,PRIMARY_ANES_TYPE_NM,ASA_RATING_C,ASA_RATING,PATIENT_CLASS_GROUP,PATIENT_CLASS_NM,PRIMARY_PROCEDURE_NM,IN_OR_DTTM,OUT_OR_DTTM,AN_START_DATETIME,AN_STOP_DATETIME,ICU_ADMIN_FLAG_encoded,SEX_encoded,PRIMARY_ANES_TYPE_NM_encoded,PATIENT_CLASS_GROUP_encoded,DISCH_DISP_encoded,ASA_RATING_encoded,HEIGHT_IN_CM
str,str,u64,str,str,str,f64,str,str,i64,str,f64,str,str,u64,str,str,str,str,str,str,str,str,u64,u64,u64,u64,u64,u64,f64
"""b1df0a3b9037bd…","""1a479b8bad2165…",6,"""Skilled Nursin…","""8/7/20 11:58""","""8/24/20 20:40""",17.0,"""Yes""","""8/7/20 0:00""",62,"""5' 7""",2880.0,"""Male""","""General""",3,"""Severe Systemi…","""Inpatient""","""Inpatient Admi…","""ANGIOGRAM, SPI…",,,,,1,2,5,0,6,3,170.18
"""3a6111f16e86a6…","""0631bc4823cb2d…",15,"""Home Routine""","""11/13/18 5:24""","""11/13/18 19:05…",0.0,"""No""","""11/13/18 0:00""",61,"""5' 7""",2828.94,"""Female""","""General""",3,"""Severe Systemi…","""Outpatient""","""Hospital Outpa…","""LOBECTOMY, THY…","""11/13/18 7:20""","""11/13/18 9:34""","""11/13/18 7:20""","""11/13/18 9:41""",0,1,5,1,15,3,170.18
"""6934cc3d1f5fb7…","""a6587bf8204e74…",15,"""Home Routine""","""2/1/19 4:52""","""2/2/19 10:12""",1.0,"""No""","""2/1/19 0:00""",73,"""5' 2""",2590.85,"""Female""","""General""",2,"""Mild Systemic …","""Outpatient""","""Hospital Outpa…","""COLPOCLEISIS""","""2/1/19 7:10""","""2/1/19 9:54""","""2/1/19 7:10""","""2/1/19 10:01""",0,1,5,1,15,2,157.48
"""cd9d02c27c53fa…","""124309aae78f4a…",15,"""Home Routine""","""7/17/19 5:14""","""7/17/19 10:20""",0.0,"""No""","""7/17/19 0:00""",22,"""6' 1""",3280.44,"""Male""","""General""",1,"""Healthy""","""Outpatient""","""Hospital Outpa…","""EXPLORATION, S…","""7/17/19 7:09""","""7/17/19 9:29""","""7/17/19 7:09""","""7/17/19 9:34""",0,2,5,1,15,1,185.42
"""4acd62ee4370cd…","""2be182dda74a3d…",15,"""Home Routine""","""7/27/19 5:20""","""7/27/19 9:14""",0.0,"""No""","""7/27/19 0:00""",29,,3559.11,"""Female""","""General""",2,"""Mild Systemic …","""Outpatient""","""Hospital Outpa…","""EXAM UNDER ANE…","""7/27/19 7:12""","""7/27/19 8:07""","""7/27/19 7:11""","""7/27/19 8:14""",0,1,5,1,15,2,


In [27]:
data.columns

['LOG_ID',
 'MRN',
 'DISCH_DISP_C',
 'DISCH_DISP',
 'HOSP_ADMSN_TIME',
 'HOSP_DISCH_TIME',
 'LOS',
 'ICU_ADMIN_FLAG',
 'SURGERY_DATE',
 'BIRTH_DATE',
 'HEIGHT',
 'WEIGHT',
 'SEX',
 'PRIMARY_ANES_TYPE_NM',
 'ASA_RATING_C',
 'ASA_RATING',
 'PATIENT_CLASS_GROUP',
 'PATIENT_CLASS_NM',
 'PRIMARY_PROCEDURE_NM',
 'IN_OR_DTTM',
 'OUT_OR_DTTM',
 'AN_START_DATETIME',
 'AN_STOP_DATETIME',
 'ICU_ADMIN_FLAG_encoded',
 'SEX_encoded',
 'PRIMARY_ANES_TYPE_NM_encoded',
 'PATIENT_CLASS_GROUP_encoded',
 'DISCH_DISP_encoded',
 'ASA_RATING_encoded',
 'HEIGHT_IN_CM']

We filter the columns that provide useful information for our analysis and models.

In [25]:
columns_needed=[
    'LOG_ID',
    'MRN',
 'HOSP_ADMSN_TIME',
 'HOSP_DISCH_TIME',
 'LOS',
 'SURGERY_DATE',
 'BIRTH_DATE',
 'WEIGHT',
 'IN_OR_DTTM',
 'OUT_OR_DTTM',
 'AN_START_DATETIME',
 'AN_STOP_DATETIME',
 'ICU_ADMIN_FLAG_encoded',
 'SEX_encoded',
 'PRIMARY_ANES_TYPE_NM_encoded',
 'PATIENT_CLASS_GROUP_encoded',
 'DISCH_DISP_encoded',
 'ASA_RATING_encoded',
 'HEIGHT_IN_CM']

In [29]:
len(columns_needed)

19

In [26]:
data = data.select(columns_needed)
print(data.shape)
data.write_csv('EPIC_EMR_cleaned/patient_information_cleaned.csv')

(64364, 19)


# Cleaning patient lab data

This is a very large csv file with the size of 3GB. This file contains lab samples of every patient. The column 'Abnormal Flag' is created by lab faculty based on all the other attributes of the patient, which means it can summarise all the other attributes. Thus, we can omit rest of the columns.

In [28]:
patient_lab_encoding = {}
cols_to_encode = ['Abnormal Flag'] # DISCH_DISP
data = pl.read_csv('EPIC_EMR/patient_labs.csv',dtypes={})
data = data.unique()


In [29]:
data.head()

LOG_ID,MRN,ENC_TYPE_NM,Lab Code,Lab Name,Observation Value,Measurement Units,Reference Range,Abnormal Flag,Collection Datetime
str,str,str,str,str,f64,str,str,str,str
"""0da0bc3be94234…","""1bb09d5761661c…","""Hospital Encou…","""11475-1""","""Microorganism …",9999999.0,"""Unknown""","""Unknown""","""N""","""2021-01-14 20:…"
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""1994-3""","""Calcium.ionize…",1.22,"""mmol/L""","""1.13-1.32""","""N""","""2020-03-10 10:…"
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""2028-9""","""Carbon dioxide…",25.0,"""mmol/L""","""21-31""","""N""","""2020-03-13 16:…"
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""2028-9""","""Carbon dioxide…",25.0,"""mmol/L""","""21-31""","""N""","""2020-03-16 05:…"
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""2951-2""","""Sodium""",140.0,"""mmol/L""","""136-145""","""N""","""2020-03-09 05:…"


In [30]:
for col in cols_to_encode:
    data, patient_lab_encoding[col] = encode_column_and_replace(data,col)

In [31]:
data.null_count()

LOG_ID,MRN,ENC_TYPE_NM,Lab Code,Lab Name,Observation Value,Measurement Units,Reference Range,Abnormal Flag,Collection Datetime,Abnormal Flag_encoded
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,89706,0,0,201566,0,0


In [32]:
data.head()

LOG_ID,MRN,ENC_TYPE_NM,Lab Code,Lab Name,Observation Value,Measurement Units,Reference Range,Abnormal Flag,Collection Datetime,Abnormal Flag_encoded
str,str,str,str,str,f64,str,str,str,str,u64
"""0da0bc3be94234…","""1bb09d5761661c…","""Hospital Encou…","""11475-1""","""Microorganism …",9999999.0,"""Unknown""","""Unknown""","""N""","""2021-01-14 20:…",4
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""1994-3""","""Calcium.ionize…",1.22,"""mmol/L""","""1.13-1.32""","""N""","""2020-03-10 10:…",4
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""2028-9""","""Carbon dioxide…",25.0,"""mmol/L""","""21-31""","""N""","""2020-03-13 16:…",4
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""2028-9""","""Carbon dioxide…",25.0,"""mmol/L""","""21-31""","""N""","""2020-03-16 05:…",4
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""2951-2""","""Sodium""",140.0,"""mmol/L""","""136-145""","""N""","""2020-03-09 05:…",4


In [33]:
columns_needed = ['Abnormal Flag_encoded','Collection Datetime','LOG_ID','MRN']
data = data.select(columns_needed)
print(data.shape)
data.write_csv('EPIC_EMR_cleaned/patient_labs_cleaned.csv')

(29071808, 4)


# Cleaning patient post op complications

This Dataset contains the post operative complications observed by the patients. All of the columns provide useful information for our analysis, which must be encoded.

In [46]:
patient_post_op_encoding = {}
cols_to_encode = ['CONTEXT_NAME','Element_abbr','SMRTDTA_ELEM_VALUE'] 
data = pl.read_csv('EPIC_EMR/patient_post_op_complications.csv',dtypes={})
data = data.unique()


In [47]:
data.head(35)

LOG_ID,MRN,Element_Name,CONTEXT_NAME,Element_abbr,SMRTDTA_ELEM_VALUE
str,str,str,str,str,str
"""f730e521222298…","""2c8c2d76b0ae34…","""AN AQI POST-OP…","""ENCOUNTER""","""AN Post-op Com…","""None"""
"""fc53c06ee2cfe4…","""b7a91e623d957d…","""AN AQI POST-OP…","""ENCOUNTER""","""AN Post-op Com…","""None"""
"""8a3336d8a4802d…","""aefa5d48bed866…","""AN AQI POST-OP…","""ENCOUNTER""","""AN Post-op Com…","""None"""
"""a99a2b466b5836…","""16c69fd7b03e8a…","""AN AQI POST-OP…","""ENCOUNTER""","""AN Post-op Com…","""None"""
"""e1a2b6d52dbd84…","""776fc234d01355…","""AN AQI POST-OP…","""ENCOUNTER""","""AN Post-op Com…","""None"""
"""377c737734f7d9…","""c7640a544a4a33…","""AN AQI POST-OP…","""ENCOUNTER""","""AN Post-op Com…","""None"""
"""8ad97f690f0f67…","""85ef3b9e8cf167…","""AN AQI POST-OP…","""ENCOUNTER""","""AN Post-op Com…","""None"""
"""114ae30b18afe4…","""6872bd9dbcdb59…","""AN AQI POST-OP…","""ENCOUNTER""","""AN Post-op Com…","""None"""
"""be60ff390f593d…","""9f0a03a6955657…","""AN AQI POST-OP…","""ENCOUNTER""","""AN Post-op Com…","""None"""
"""144a327a6ee213…","""4c57d4b70206de…","""AN AQI POST-OP…","""ENCOUNTER""","""AN Post-op Com…","""None"""


In [48]:
data = data.with_columns(pl.col('Element_abbr').str.replace(r"AN Post-op Complications",''))
data.head(35)

LOG_ID,MRN,Element_Name,CONTEXT_NAME,Element_abbr,SMRTDTA_ELEM_VALUE
str,str,str,str,str,str
"""f730e521222298…","""2c8c2d76b0ae34…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None"""
"""fc53c06ee2cfe4…","""b7a91e623d957d…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None"""
"""8a3336d8a4802d…","""aefa5d48bed866…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None"""
"""a99a2b466b5836…","""16c69fd7b03e8a…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None"""
"""e1a2b6d52dbd84…","""776fc234d01355…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None"""
"""377c737734f7d9…","""c7640a544a4a33…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None"""
"""8ad97f690f0f67…","""85ef3b9e8cf167…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None"""
"""114ae30b18afe4…","""6872bd9dbcdb59…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None"""
"""be60ff390f593d…","""9f0a03a6955657…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None"""
"""144a327a6ee213…","""4c57d4b70206de…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None"""


In [51]:
for col in cols_to_encode:
    data, patient_post_op_encoding[col] = encode_column_and_replace(data,col)


In [52]:
data.head()

LOG_ID,MRN,Element_Name,CONTEXT_NAME,Element_abbr,SMRTDTA_ELEM_VALUE,CONTEXT_NAME_encoded,Element_abbr_encoded,SMRTDTA_ELEM_VALUE_encoded
str,str,str,str,str,str,u64,u64,u64
"""f730e521222298…","""2c8c2d76b0ae34…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None""",0,0,409
"""fc53c06ee2cfe4…","""b7a91e623d957d…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None""",0,0,409
"""8a3336d8a4802d…","""aefa5d48bed866…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None""",0,0,409
"""a99a2b466b5836…","""16c69fd7b03e8a…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None""",0,0,409
"""e1a2b6d52dbd84…","""776fc234d01355…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None""",0,0,409


In [53]:
data.null_count()

LOG_ID,MRN,Element_Name,CONTEXT_NAME,Element_abbr,SMRTDTA_ELEM_VALUE,CONTEXT_NAME_encoded,Element_abbr_encoded,SMRTDTA_ELEM_VALUE_encoded
u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,483,0,0,0


In [54]:
columns_needed = ['SMRTDTA_ELEM_VALUE_encoded','Element_abbr_encoded','CONTEXT_NAME_encoded','LOG_ID','MRN']
data = data.select(columns_needed)
print(data.shape)
data.write_csv('EPIC_EMR_cleaned/patient_post_op_complication_cleaned.csv')

(84950, 5)


# Cleaning Patient Vist
This dataset contains the diagnosis information of all visits of patients

In [45]:
patient_visit_encoding = {}
cols_to_encode = ['dx_name'] 
data = pl.read_csv('EPIC_EMR/patient_visit.csv',dtypes={'diagnosis_code':pl.Utf8})
data = data.unique()

In [46]:
data.head()

LOG_ID,mrn,diagnosis_code,dx_name
str,str,str,str
"""fa149b1b36855b…","""ad8291a53a44f0…","""153.3""","""Malignant neop…"
"""c795ff491aa100…","""08cbba27fbc9ab…","""153.3""","""Malignant neop…"
"""3b76e243650147…","""2f74e339dfef02…","""153.3""","""Malignant neop…"
"""883661fd7d8695…","""b04f771d4c3590…","""160.0""","""Malignant neop…"
"""1d164b42259924…","""a07eea403d5462…","""188.0""","""Malignant neop…"


In [47]:

patient_visit_encoding['dx_name___diagnosis_code'] = encode_two_columns(data,'dx_name','diagnosis_code')
for col in cols_to_encode:
    data, patient_visit_encoding[col] = encode_column_and_replace(data,col)

In [48]:
data.head()

LOG_ID,mrn,diagnosis_code,dx_name,dx_name_encoded
str,str,str,str,u64
"""fa149b1b36855b…","""ad8291a53a44f0…","""153.3""","""Malignant neop…",2139
"""c795ff491aa100…","""08cbba27fbc9ab…","""153.3""","""Malignant neop…",2139
"""3b76e243650147…","""2f74e339dfef02…","""153.3""","""Malignant neop…",2139
"""883661fd7d8695…","""b04f771d4c3590…","""160.0""","""Malignant neop…",6812
"""1d164b42259924…","""a07eea403d5462…","""188.0""","""Malignant neop…",25220


In [49]:
data.null_count()

LOG_ID,mrn,diagnosis_code,dx_name,dx_name_encoded
u32,u32,u32,u32,u32
0,0,38138,0,0


In [50]:
data = data.select(pl.col('LOG_ID'),pl.col('mrn').alias('MRN'),pl.col('dx_name_encoded'))

In [51]:
data.shape

(131455, 3)

In [52]:
data.write_csv('EPIC_EMR_cleaned/patient_visit_cleaned.csv')

# Cleaning patient procedure events

In [54]:
patient_procedure_event_encoding = {}
cols_to_encode = ['EVENT_DISPLAY_NAME'] 
data = pl.read_csv('EPIC_EMR/patient_procedure events.csv')
data = data.unique()

In [55]:
data.head()

LOG_ID,MRN,EVENT_DISPLAY_NAME,EVENT_TIME,NOTE_TEXT
str,str,str,str,str
"""fb943d65e18f90…","""5b63201d6d1908…","""TEE Echo Place…","""9/27/19 7:50""",
"""1448853f211a1d…","""77bb0379da107d…","""Sign In""","""1/10/19 7:43""",
"""a51b5cb848f34f…","""482f098c5a1eb6…","""Quick Note""","""10/19/19 8:46""",
"""068b548cf0d919…","""68d58e4c2dcd06…","""Two Anti-Emeti…","""5/6/19 12:01""",
"""fb943d65e18f90…","""5b63201d6d1908…","""Quick Note""","""9/27/19 13:43""",


In [56]:
for col in cols_to_encode:
    data, patient_procedure_event_encoding[col] = encode_column_and_replace(data,col)

In [57]:
data.head()

LOG_ID,MRN,EVENT_DISPLAY_NAME,EVENT_TIME,NOTE_TEXT,EVENT_DISPLAY_NAME_encoded
str,str,str,str,str,u64
"""fb943d65e18f90…","""5b63201d6d1908…","""TEE Echo Place…","""9/27/19 7:50""",,50
"""1448853f211a1d…","""77bb0379da107d…","""Sign In""","""1/10/19 7:43""",,9
"""a51b5cb848f34f…","""482f098c5a1eb6…","""Quick Note""","""10/19/19 8:46""",,8
"""068b548cf0d919…","""68d58e4c2dcd06…","""Two Anti-Emeti…","""5/6/19 12:01""",,0
"""fb943d65e18f90…","""5b63201d6d1908…","""Quick Note""","""9/27/19 13:43""",,8


In [58]:
data = data.select(['LOG_ID','MRN','EVENT_TIME','EVENT_DISPLAY_NAME_encoded'])

In [59]:
data.write_csv('EPIC_EMR_cleaned/patient_procedure_events_cleaned.csv')

In [60]:
patient_procedure_event_encoding

{'EVENT_DISPLAY_NAME': {'Two Anti-Emetics Administered': 0,
  'Department check-in complete': 1,
  'Induction': 2,
  'Admin Review or Deletion': 3,
  'Start Visit': 4,
  'IABP Off': 5,
  'Re Warming': 6,
  'Mark Now': 7,
  'Quick Note': 8,
  'Sign In': 9,
  'Epidural to C-Section': 10,
  'Blood Patch Administered': 11,
  'CV Bypass Initiated': 12,
  'DHCA Started': 13,
  'Help Call': 14,
  'Renal Clamp On': 15,
  'Patient Moving': 16,
  'Aortic Clamp Off': 17,
  'Major Blood Loss': 18,
  'Start Bedside Data Collection': 19,
  'Renal Clamp Off': 20,
  'Visit Complete': 21,
  'Antibiotics Given': 22,
  'DHCA Ended': 23,
  'CPR': 24,
  'OB Face Time Stop': 25,
  'Case Delayed': 26,
  'IABP On': 27,
  'Carotid Clamp On': 28,
  'Out Of OR Recovery': 29,
  'Start Data Collection': 30,
  'Narcotic Balance': 31,
  'Aortic Clamp On': 32,
  'Start Rooming': 33,
  'IV Antibiotics': 34,
  'Mask Emergence': 35,
  'Aortic Cannula': 36,
  'OB Face Time Start': 37,
  'LMA Removed': 38,
  'Venous Cannu

Combine all the encoded data from each dataset and store it in json file, inorder to access the actual value of each encoded value

In [61]:
encoding_dict = {}
encoding_dict['patient_history'] = patient_history_encoding
encoding_dict['patient_information'] = patient_information_encoding
encoding_dict['patient_lab'] = patient_lab_encoding
encoding_dict['patient_post_op_complications'] = patient_post_op_encoding
encoding_dict['patient_visit'] = patient_visit_encoding
encoding_dict['patient_procedure_event'] = patient_procedure_event_encoding


In [62]:
with open ('EPIC_EMR_cleaned/encoding.json','w') as f:
    json.dump(encoding_dict,f)