## MIMIC-IV EventLog Curation
1. import CSV into pandas dataframe
2. convert processed dataframe to event log structure by `pm4py`
3. export event log to XES file by `pm4py`

In [1]:
# import required library
import os
import pm4py
import numpy as np
import pandas as pd
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.exporter.xes import exporter as xes_exporter

### Import CSV log into pandas dataframe
Please change the path for your CSV file below.

In [2]:
# Path to the event log CSV; both gzip-compressed (.csv.gz) and plain (.csv) files are accepted.
csv_file_path = "mimicel.csv.gz" # or "your_file_path_here.csv"

# compression='infer' lets pandas pick the decompression from the file extension,
# so swapping in an uncompressed .csv path no longer fails on a forced gzip read.
log_csv = pd.read_csv(csv_file_path, sep=',', compression='infer', header=0)

In [3]:
# Preview the first rows of the raw dataframe as loaded from the CSV (original column names).
log_csv.head(20)

Unnamed: 0,stay_id,subject_id,hadm_id,timestamps,activity,seq_num,icd_code,icd_version,icd_title,temperature,...,chiefcomplaint,rhythm,name,gsn,ndc,etc_rn,etccode,etcdescription,med_rn,gsn_rn
0,31204893,14840073,27640291.0,2130-09-15 01:24:00,Vital sign check,,,,,98.0,...,,sr,,,,,,,,
1,31204893,14840073,27640291.0,2130-09-15 02:15:00,Vital sign check,,,,,98.5,...,,sb,,,,,,,,
2,31204893,14840073,27640291.0,2130-09-15 03:54:00,Vital sign check,,,,,98.5,...,,,,,,,,,,
3,31204893,14840073,27640291.0,2130-09-15 04:55:00,Vital sign check,,,,,98.4,...,,sb,,,,,,,,
4,31204893,14840073,27640291.0,2130-09-15 06:56:00,Vital sign check,,,,,97.8,...,,,,,,,,,,
5,31204893,14840073,27640291.0,2130-09-15 07:23:00,Vital sign check,,,,,,...,,,,,,,,,,
6,31204893,14840073,27640291.0,2130-09-15 07:56:34,Discharge from the ED + diagnosis 1,1.0,2859,9.0,ANEMIA NOS,,...,,,,,,,,,,
7,31204893,14840073,27640291.0,2130-09-15 08:57:00,Vital sign check,,,,,,...,,,,,,,,,,
8,31204893,14840073,27640291.0,2130-09-15 09:20:00,Vital sign check,,,,,97.5,...,,,,,,,,,,
9,31204893,14840073,27640291.0,2130-09-15 22:45:00,Vital sign check,,,,,97.5,...,,,,,,,,,,


### Process the dataframe and convert it to `pm4py` event log structure
The default attributes in `pm4py` package:

- Case ID --> case:concept:name
- Activity --> concept:name
- Timestamps --> time:timestamp
- Case attributes -->  start with `case:`

In [4]:
# Rename columns to the attribute names pm4py expects.
# Columns missing from the dataframe are ignored by `rename`, so the cell is safe to re-run.
log_csv = log_csv.rename(columns=
    {
        # Standardization for CaseID, activity and timestamp
        'stay_id': 'case:concept:name',
        'activity': 'concept:name',
        'timestamps': 'time:timestamp',

        # Standardization for Case attributes
        'subject_id': 'case:subject_id',
        'hadm_id': 'case:hadm_id',
        'acuity': 'case:acuity',
        # The trailing comma below was missing, which made the whole dict a SyntaxError.
        'chiefcomplaint': 'case:chiefcomplaint',

        # new case attributes
        'gender': 'case:gender',
        'race': 'case:race',
        'arrival_transport': 'case:arrival_transport',
        'disposition': 'case:disposition',
    })

`pm4py` takes the case attributes from the first row of each case. Thus, we need to fill in the rows where a case attribute is empty.

For example: `case:acuity`, `case:chiefcomplaint`

In [5]:
# Case attributes that are only present on some events of a stay.
case_attribute_columns = [
    'case:acuity',
    'case:chiefcomplaint',
    'case:gender',
    'case:race',
    'case:arrival_transport',
    'case:disposition',
]


def _fill_within_case(values):
    """Propagate known values forward, then backward, inside a single case."""
    return values.ffill().bfill()


# Fill every row of a case with that case's attribute value, one column at a time.
for column in case_attribute_columns:
    log_csv[column] = log_csv.groupby('case:concept:name')[column].transform(_fill_within_case)

`pm4py` has built-in functions for transforming the data type of the timestamp column in the dataframe.

Function `pm4py.objects.log.util.dataframe_utils.convert_timestamp_columns_in_df`

When using this function, make sure the timestamp column has already been renamed to `time:timestamp`, the standard `pm4py` attribute name that the sort below relies on.

In [6]:
# Let pm4py parse the timestamp column into datetime values.
log_csv = dataframe_utils.convert_timestamp_columns_in_df(log_csv)

# Order events chronologically. The default quicksort is not stable, so events that share
# a timestamp (e.g. several diagnoses recorded at the same discharge time) could be shuffled;
# mergesort is stable and keeps their original relative order.
log_csv = log_csv.sort_values('time:timestamp', kind='mergesort')

In [7]:
# Check the first 20 rows after renaming, filling case attributes and sorting by timestamp.
log_csv.head(20)

Unnamed: 0,case:concept:name,case:subject_id,case:hadm_id,time:timestamp,concept:name,seq_num,icd_code,icd_version,icd_title,temperature,...,case:chiefcomplaint,rhythm,name,gsn,ndc,etc_rn,etccode,etcdescription,med_rn,gsn_rn
3437461,35341790,13238787,,2110-01-11 01:45:00+00:00,Enter the ED,,,,,,...,"Back pain, RESTLESSNESS",,,,,,,,,
3437462,35341790,13238787,,2110-01-11 01:45:01+00:00,Triage in the ED,,,,,98.4,...,"Back pain, RESTLESSNESS",,,,,,,,,
3437463,35341790,13238787,,2110-01-11 01:49:00+00:00,Vital sign check,,,,,98.4,...,"Back pain, RESTLESSNESS",,,,,,,,,
3437464,35341790,13238787,,2110-01-11 02:05:00+00:00,Discharge from the ED + diagnosis 1,1.0,7242,9.0,LUMBAGO,,...,"Back pain, RESTLESSNESS",,,,,,,,,
3437465,35341790,13238787,,2110-01-11 02:05:00+00:00,Discharge from the ED + diagnosis 2,2.0,30000,9.0,ANXIETY STATE NOS,,...,"Back pain, RESTLESSNESS",,,,,,,,,
6518309,39042378,15350437,20383396.0,2110-01-11 03:43:00+00:00,Enter the ED,,,,,,...,"Diplopia, Transfer",,,,,,,,,
6518310,39042378,15350437,20383396.0,2110-01-11 03:43:01+00:00,Triage in the ED,,,,,97.1,...,"Diplopia, Transfer",,,,,,,,,
6518311,39042378,15350437,20383396.0,2110-01-11 03:45:00+00:00,Vital sign check,,,,,97.1,...,"Diplopia, Transfer",,,,,,,,,
3437466,35341790,13238787,,2110-01-11 04:02:00+00:00,Vital sign check,,,,,98.0,...,"Back pain, RESTLESSNESS",,,,,,,,,
3437467,35341790,13238787,,2110-01-11 05:21:00+00:00,Medicine dispensations,,,,,,...,"Back pain, RESTLESSNESS",,Diazepam,3768.0,,,,,1.0,1.0


### Export event log data to XES file

By default, exporting to an XES file requires the case ID column to use the default name `case:concept:name`, but you can use parameters to specify a different column name.

In [8]:
# Take the first 200,000 events of the time-sorted dataframe as a smaller sample for a test export.
dataframe = log_csv.iloc[:200000]

# Convert the sample into a pm4py event log.
to_event_log = log_converter.Variants.TO_EVENT_LOG
event_log_selected = log_converter.apply(dataframe, variant=to_event_log)

In [9]:
# Export the sample event log to an XES file with the exporter's COMPRESS option switched on.
xes_file_path = "mimicel-test.xes"
compress_option = xes_exporter.Variants.ETREE.value.Parameters.COMPRESS
xes_exporter.apply(
    event_log_selected,
    xes_file_path,
    parameters={compress_option: True},
)

exporting log, completed traces :: 100%|██████████| 13080/13080 [00:25<00:00, 508.36it/s]


In [10]:
# Convert the complete dataframe into a pm4py event log using the default case id
# column (`case:concept:name`).
#
# To use a different case id column, pass it via the converter parameters, e.g.:
#   parameters = {log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ID_KEY: 'case:stay_id'}
#   event_log = log_converter.apply(log_csv, parameters=parameters, variant=log_converter.Variants.TO_EVENT_LOG)
full_log_variant = log_converter.Variants.TO_EVENT_LOG
event_log = log_converter.apply(log_csv, variant=full_log_variant)

In [11]:
# Export the complete event log to an XES file with the exporter's COMPRESS option switched on.
xes_file_path = "mimicel.xes" # or "your_file_path_here.xes"
full_export_parameters = {xes_exporter.Variants.ETREE.value.Parameters.COMPRESS: True}
xes_exporter.apply(event_log, xes_file_path, parameters=full_export_parameters)

exporting log, completed traces :: 100%|██████████| 448972/448972 [17:40<00:00, 423.18it/s]
