## MIMIC-IV EventLog Curation
1. Import the CSV file into a pandas dataframe
2. Convert the processed dataframe to an event log structure with `pm4py`
3. Export the event log to an XES file with `pm4py`

In [1]:
# import required library
import os
import pm4py
import numpy as np
import pandas as pd
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.exporter.xes import exporter as xes_exporter

### Import CSV log into pandas dataframe
Please change the path for your CSV file below.

In [2]:
# Path to the event log CSV; both a gzip-compressed ".csv.gz" and a plain ".csv" work.
csv_file_path = "mimicel.csv.gz" # or "your_file_path_here.csv"

# compression='infer' lets pandas choose the decompressor from the file extension,
# so pointing csv_file_path at an uncompressed CSV no longer fails the way a
# hard-coded compression='gzip' would.
log_csv = pd.read_csv(csv_file_path, sep=',', compression='infer', header=0)

In [3]:
# Peek at the first 20 raw rows to check that the CSV was parsed as expected
log_csv.head(n=20)

Unnamed: 0,stay_id,subject_id,hadm_id,timestamps,activity,gender,race,arrival_transport,disposition,seq_num,...,chiefcomplaint,rhythm,name,gsn,ndc,etc_rn,etccode,etcdescription,med_rn,gsn_rn
0,30000012,11714491,21562392.0,2126-02-14 20:22:00,Vital sign check,,,,,,...,,,,,,,,,,
1,30000012,11714491,21562392.0,2126-02-14 20:22:00,Enter the ED,F,WHITE,AMBULANCE,,,...,,,,,,,,,,
2,30000012,11714491,21562392.0,2126-02-14 20:22:01,Triage in the ED,,,,,,...,CHANGE IN MENTAL STATUS,,,,,,,,,
3,30000012,11714491,21562392.0,2126-02-14 22:21:00,Medicine reconciliation,,,,,,...,,,rifaximin,66295.0,54868620000.0,1.0,5844.0,Rifamycins and Related Derivative Antibiotics,,
4,30000012,11714491,21562392.0,2126-02-14 22:21:00,Medicine reconciliation,,,,,,...,,,spironolactone,6818.0,16729020000.0,1.0,5658.0,"Diuretic - Aldosterone Receptor Antagonist, No...",,
5,30000012,11714491,21562392.0,2126-02-14 22:21:00,Medicine reconciliation,,,,,,...,,,gabapentin,21413.0,10135060000.0,1.0,6030.0,Anticonvulsant - GABA Analogs,,
6,30000012,11714491,21562392.0,2126-02-14 22:21:00,Medicine reconciliation,,,,,,...,,,spironolactone,6818.0,16729020000.0,2.0,6043.0,Aldosterone Receptor Antagonists,,
7,30000012,11714491,21562392.0,2126-02-14 22:21:00,Medicine reconciliation,,,,,,...,,,furosemide,8209.0,10544060000.0,1.0,250.0,Diuretic - Loop,,
8,30000012,11714491,21562392.0,2126-02-14 22:21:00,Medicine reconciliation,,,,,,...,,,"multivitamin,tx-minerals",2510.0,10267070000.0,1.0,704.0,Multivitamin and Mineral Combinations,,
9,30000012,11714491,21562392.0,2126-02-14 22:21:00,Medicine reconciliation,,,,,,...,,,pantoprazole,27462.0,13668010000.0,1.0,445.0,Gastric Acid Secretion Reducing Agents - Proto...,,


### Process the dataframe and convert it to `pm4py` event log structure
The default attributes in `pm4py` package:

- Case ID --> case:concept:name
- Activity --> concept:name
- Timestamps --> time:timestamp
- Case attributes -->  start with `case:`

In [4]:
# Map the raw column names onto the attribute keys used by pm4py
pm4py_column_names = {
    # core event log keys: case identifier, activity label, event timestamp
    'stay_id': 'case:concept:name',
    'activity': 'concept:name',
    'timestamps': 'time:timestamp',

    # existing columns promoted to case-level attributes (prefix `case:`)
    'subject_id': 'case:subject_id',
    'hadm_id': 'case:hadm_id',
    'acuity': 'case:acuity',
    'chiefcomplaint': 'case:chiefcomplaint',

    # additional case-level attributes
    'gender': 'case:gender',
    'race': 'case:race',
}

# The rename is applied to log_csv itself (in place), as later cells expect
log_csv.rename(columns=pm4py_column_names, inplace=True)

`pm4py` takes the value of each case attribute from the first row of the case. We therefore need to fill in the rows in which a case attribute is empty.

For example: `case:acuity`, `case:chiefcomplaint`

In [5]:
# Case attributes that are only recorded on some events of a case and therefore
# have to be propagated to every row of that case.
case_attribute_columns = ['case:acuity', 'case:chiefcomplaint', 'case:gender', 'case:race']

# Forward-fill, then backward-fill, strictly within each case so that values never
# leak from one case into another. The built-in grouped ffill/bfill handle all
# columns in one pass each and avoid calling a Python lambda once per case and
# per column, which is slow for a log with hundreds of thousands of cases.
log_csv[case_attribute_columns] = log_csv.groupby('case:concept:name')[case_attribute_columns].ffill()
log_csv[case_attribute_columns] = log_csv.groupby('case:concept:name')[case_attribute_columns].bfill()

`pm4py` has built-in functions for transforming the data type of the timestamp column in the dataframe.

Function `pm4py.objects.log.util.dataframe_utils.convert_timestamp_columns_in_df`

When using this function, make sure that the timestamp column has already been renamed to `time:timestamp` (not `timestamp`), because the following cell sorts the log on that column.

In [6]:
# Parse the event timestamp column into datetimes. Restricting the conversion to
# 'time:timestamp' stops pm4py from attempting to parse every other string column
# (drug names, chief complaints, ...) as a date.
# NOTE(review): assumes no other column relied on being auto-converted to datetime.
log_csv = dataframe_utils.convert_timestamp_columns_in_df(log_csv, timest_columns=['time:timestamp'])

# Order events chronologically. A stable sort (mergesort) keeps events that share
# the same timestamp in their original CSV order, so the resulting traces are
# deterministic across runs instead of depending on the default non-stable sort.
log_csv = log_csv.sort_values('time:timestamp', kind='mergesort')

In [7]:
# Inspect the 20 earliest events after the chronological sort
log_csv.head(n=20)

Unnamed: 0,case:concept:name,case:subject_id,case:hadm_id,time:timestamp,concept:name,case:gender,case:race,arrival_transport,disposition,seq_num,...,case:chiefcomplaint,rhythm,name,gsn,ndc,etc_rn,etccode,etcdescription,med_rn,gsn_rn
4214372,35341790,13238787,,2110-01-11 01:45:00+00:00,Enter the ED,M,WHITE,UNKNOWN,,,...,"Back pain, RESTLESSNESS",,,,,,,,,
4214373,35341790,13238787,,2110-01-11 01:45:01+00:00,Triage in the ED,M,WHITE,,,,...,"Back pain, RESTLESSNESS",,,,,,,,,
4214374,35341790,13238787,,2110-01-11 01:49:00+00:00,Vital sign check,M,WHITE,,,,...,"Back pain, RESTLESSNESS",,,,,,,,,
7130539,39042378,15350437,20383396.0,2110-01-11 03:43:00+00:00,Enter the ED,M,WHITE,AMBULANCE,,,...,"Diplopia, Transfer",,,,,,,,,
7130540,39042378,15350437,20383396.0,2110-01-11 03:43:01+00:00,Triage in the ED,M,WHITE,,,,...,"Diplopia, Transfer",,,,,,,,,
7130541,39042378,15350437,20383396.0,2110-01-11 03:45:00+00:00,Vital sign check,M,WHITE,,,,...,"Diplopia, Transfer",,,,,,,,,
4214375,35341790,13238787,,2110-01-11 04:02:00+00:00,Vital sign check,M,WHITE,,,,...,"Back pain, RESTLESSNESS",,,,,,,,,
4214376,35341790,13238787,,2110-01-11 05:21:00+00:00,Medicine dispensations,M,WHITE,,,,...,"Back pain, RESTLESSNESS",,Diazepam,3768.0,,,,,1.0,1.0
4214377,35341790,13238787,,2110-01-11 05:21:00+00:00,Medicine dispensations,M,WHITE,,,,...,"Back pain, RESTLESSNESS",,TraMADOL (Ultram),23139.0,,,,,2.0,1.0
7130542,39042378,15350437,20383396.0,2110-01-11 05:42:00+00:00,Vital sign check,M,WHITE,,,,...,"Diplopia, Transfer",,,,,,,,,


### Export event log data to XES file

By default, exporting to an XES file expects the case ID to be stored in the column `case:concept:name`. You can use parameters to specify a different column name.

In [8]:
# Number of leading (time-sorted) events used to choose the cases of the test log.
SAMPLE_EVENT_COUNT = 200000

# The log is sorted by time, so events of different cases are interleaved. Cutting
# it after a fixed number of rows would truncate every case that still has events
# after the cut-off. Instead, take the cases that occur within the first
# SAMPLE_EVENT_COUNT events and keep ALL of their events, so each exported trace
# is complete.
sample_case_ids = log_csv['case:concept:name'].head(SAMPLE_EVENT_COUNT).unique()
dataframe = log_csv[log_csv['case:concept:name'].isin(sample_case_ids)]

# Convert the sampled dataframe into a pm4py event log
event_log_selected = log_converter.apply(dataframe, variant=log_converter.Variants.TO_EVENT_LOG)

In [9]:
# Destination of the sample (test) export
xes_file_path = "mimicel-test.xes"

# Enable the COMPRESS option of the ETREE exporter variant.
# NOTE(review): with compression enabled the file on disk is presumably gzipped
# (e.g. "mimicel-test.xes.gz") - confirm against the installed pm4py version.
sample_export_parameters = {xes_exporter.Variants.ETREE.value.Parameters.COMPRESS: True}
xes_exporter.apply(
    event_log_selected,
    xes_file_path,
    parameters=sample_export_parameters,
)

exporting log, completed traces :: 100%|██████████| 13101/13101 [00:27<00:00, 480.04it/s]


In [10]:
# Convert the complete dataframe into a pm4py event log using the default case id
# column `case:concept:name`.
#
# To use a different case id column, pass it through the converter parameters instead:
#   parameters = {log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ID_KEY: 'case:stay_id'}
#   event_log = log_converter.apply(log_csv, parameters=parameters, variant=log_converter.Variants.TO_EVENT_LOG)
to_event_log_variant = log_converter.Variants.TO_EVENT_LOG
event_log = log_converter.apply(log_csv, variant=to_event_log_variant)

In [11]:
# Destination of the full export
xes_file_path = "mimicel.xes" # or "your_file_path_here.xes"

# Export the complete event log with the ETREE exporter's COMPRESS option enabled.
# NOTE(review): with compression enabled the file on disk is presumably gzipped
# (e.g. "mimicel.xes.gz") - confirm against the installed pm4py version.
full_export_parameters = {xes_exporter.Variants.ETREE.value.Parameters.COMPRESS: True}
xes_exporter.apply(
    event_log,
    xes_file_path,
    parameters=full_export_parameters,
)

exporting log, completed traces :: 100%|██████████| 425087/425087 [18:06<00:00, 391.27it/s]
