In [None]:
import numpy as np
from psycopg2 import connect
import pandas as pd
import pm4py
import numpy as np
import pandasql as ps
from pm4py.objects.conversion.log import converter as log_converter
from scipy.stats import variation
from scipy import stats
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
from pm4py.visualization.dfg import visualizer as dfg_visualization
from pm4py.statistics.eventually_follows.log import get as efg_get
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import sklearn.preprocessing as sk
from scipy.stats import chi2_contingency
import math
import statistics
import pingouin as pg
import graphviz
from statsmodels.stats import multitest
from statsmodels.stats.contingency_tables import SquareTable as ST
import sys

In [None]:
# MIMIC-IV: load the transformed event data from CSV.
# NOTE: the Sepsis cell further down assigns the same `final_pm` name, so on a
# top-to-bottom run the Sepsis file replaces this one. Execute only the load
# cell of the dataset that is being analysed.
final_pm = pd.read_csv("Outputs/MIMIC_Transformed.csv")

In [None]:
# Sepsis: load the transformed event data from CSV.
# NOTE: this assigns the same `final_pm` name as the MIMIC-IV cell above and
# therefore overrides it when both cells are executed in order.
final_pm = pd.read_csv("Outputs/Sepsis_Transformed.csv")

In [None]:
# Distinct case identifiers found in the loaded data.
# NOTE(review): no later cell in this notebook reads this module-level list -
# confirm it is still needed.
hadms = list(final_pm["case:hadm_id"].unique())

In [None]:
# Build a pm4py event log from the DataFrame.
# NOTE(review): `parameters` is created but not passed to any call in this
# notebook - confirm whether it can be removed.
parameters = {log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ID_KEY: 'case:hadm_id'}
# Declare which columns hold the case id, the activity name and the timestamp.
event_log = pm4py.format_dataframe(final_pm, case_id='case:hadm_id', activity_key='concept:name', timestamp_key='time:timestamp')
log = pm4py.convert_to_event_log(event_log)

In [None]:
# Retrieve all process variants and remove variants occurring < 20 times,
# because their sample size is too small.
from pm4py.algo.filtering.log.variants import variants_filter

# Variants seen in fewer cases than this are discarded.
MIN_VARIANT_CASES = 20

variants = list(variants_filter.get_variants(log).keys())

# One comma-joined activity sequence per case, following the row order of final_pm.
var = final_pm.groupby('case:hadm_id')['concept:name'].apply(list).reset_index()
var["concept:name"] = var['concept:name'].apply(lambda x: ','.join(map(str, x)))
var = var.rename({"concept:name":"variant"}, axis=1)

# Attach the variant string of its case to every event row.
final_pm_var = final_pm.merge(var, how="left", on="case:hadm_id")

# After de-duplicating on the case id, the count per variant is the number of cases.
var_count = final_pm_var.drop_duplicates("case:hadm_id").groupby("variant").count()
to_drop = list(var_count.loc[var_count["case:hadm_id"] < MIN_VARIANT_CASES].reset_index()["variant"])

# Filter instead of calling list.remove(): remove() raises ValueError as soon as
# one rare variant string is missing from the pm4py keys.
# NOTE(review): depending on the pm4py version the keys are presumably either
# comma-joined strings or tuples of activity names; tuples are joined here so
# both forms compare against the strings in `to_drop` - confirm against the
# installed version.
rare_variants = set(to_drop)
variants = [
    v for v in variants
    if (','.join(map(str, v)) if isinstance(v, tuple) else v) not in rare_variants
]

In [None]:
def classify_attributes(proc_c):
    """Label every row of ``proc_c`` as static, semi-dynamic or dynamic.

    The frame is modified in place (a ``"class"`` column is added or
    overwritten) and also returned.

    Parameters
    ----------
    proc_c : pandas.DataFrame
        Must contain the columns ``"numberOfActivities"`` and
        ``"numberOfTraceOccurence (Mean)"``.

    Returns
    -------
    pandas.DataFrame
        The same object, with ``"class"`` set to

        * ``"static"``       - exactly one activity and a mean occurrence of 1,
        * ``"semi-dynamic"`` - more than one activity and a mean occurrence of 1,
        * ``"dynamic"``      - everything else (including rows with missing values,
          because comparisons against NaN are False).
    """
    n_activities = proc_c["numberOfActivities"]
    occurs_once = proc_c["numberOfTraceOccurence (Mean)"] == 1

    # Start from the fallback class and overwrite the two special cases;
    # unlike a row loop this also creates the column for an empty frame.
    proc_c["class"] = "dynamic"
    proc_c.loc[occurs_once & (n_activities == 1), "class"] = "static"
    proc_c.loc[occurs_once & (n_activities > 1), "class"] = "semi-dynamic"
    return proc_c

In [None]:
# Column that holds the activity name of an event. The helper functions and
# analysis cells below read this module-level name.
activity = "concept:name"
# Column that holds the case identifier of an event.
# NOTE(review): earlier cells spell out 'case:hadm_id' / 'concept:name'
# literally instead of using these two settings - keep them in sync.
case_id = "case:hadm_id"


In [None]:
# Columns that are excluded from the attribute classification.
# MIMIC variant of the list.
# NOTE: the Sepsis cell below assigns the same name; run only the cell that
# matches the dataset loaded into `final_pm`.
columns_to_drop = ['Unnamed: 0','ordercategoryname','category','time:timestamp', "Unnamed: 0.1", "event_time", "time_diff"]

In [None]:
# Columns that are excluded from the attribute classification.
# Sepsis variant of the list.
# NOTE: this overrides the MIMIC list defined in the cell above when both
# cells are executed in order.
columns_to_drop = ['Unnamed: 0','time:timestamp', "event_time", "time_diff", "Variant", "Variant index", "lifecycle:transition"]

In [None]:
#Classify event attributes, so that dynamic event attributes can be identified
final_pm_class = final_pm.drop(columns_to_drop, axis=1)

activities = final_pm_class[activity].unique()

# NOTE(review): `matrix` is not read by any later cell in this notebook.
matrix = pd.DataFrame(data=None, columns=activities)

# Every column that is neither the case id nor the activity is an attribute.
attribute_columns = [col for col in final_pm_class.columns if col not in (case_id, activity)]

# --- Which attributes are recorded for which activity ------------------------
# An attribute counts as present (1) for an activity when it is non-null in
# more than this share of the activity's events, otherwise 0.
t = 0.05
att_rows = []
for dep in activities:
    dep_data = final_pm_class.loc[final_pm_class[activity] == dep]
    row_num = len(dep_data)
    non_null = dep_data.drop(columns=activity).notnull().sum()
    row = (non_null > (row_num * t)).astype(int)
    row[activity] = dep
    att_rows.append(row)

# Build the frame once from the collected rows; DataFrame.append was removed
# in pandas 2.0 and growing a frame row by row is quadratic.
att_card = pd.DataFrame(att_rows).reindex(columns=final_pm_class.columns)
att_card = att_card.drop(case_id, axis=1)

# --- Per attribute: mean number of non-null occurrences within a trace -------
# Cases in which the attribute never occurs (count 0) are excluded from the mean.
number_trace_occurence = (
    final_pm_class.groupby(case_id).count()
    .drop(activity, axis=1)
    .replace(0, np.nan)
    .mean()
    .rename("numberOfTraceOccurence (Mean)")
)

# --- Per attribute: number of distinct activities that carry a value ---------
number_of_activities = pd.Series(
    {col: len(final_pm_class[[activity, col]].dropna()[activity].unique()) for col in attribute_columns},
    name="numberOfActivities",
    dtype="float64",
)

process_characteristics = pd.concat([number_of_activities, number_trace_occurence], axis=1)

# --- Per attribute: categorical vs. continuous -------------------------------
# Heuristic: fewer than 0.5 % distinct values among the non-null entries means
# categorical. An all-null column yields NaN here and falls through to
# "continuous".
attribute_type = {}
for col in attribute_columns:
    distinct_share = final_pm_class[col].nunique() / final_pm_class[col].count()
    attribute_type[col] = "categorical" if distinct_share < 0.005 else "continuous"
process_characteristics["type"] = pd.Series(attribute_type, dtype=object)

x = classify_attributes(process_characteristics)

x = x.reset_index()

# The former index holds attribute names; the column is nevertheless called
# "Activity" and later cells rely on that name.
x = x.rename({"index":"Activity"}, axis=1)

# .copy() so that adding "CV" does not write into a slice of `x`.
attribute_classes = x[["Activity", "class", "type"]].copy()

attribute_classes["CV"] = 0

is_dynamic = attribute_classes["class"] == "dynamic"

attribute_list_con = list(attribute_classes.loc[is_dynamic & (attribute_classes["type"] == "continuous"), "Activity"])

attribute_list_cat = list(attribute_classes.loc[is_dynamic & (attribute_classes["type"] == "categorical"), "Activity"])

In [None]:
# Discover the directly-follows graph of the event log. Later cells iterate it
# as a mapping from (activity, activity) pairs to a count.
dfg = dfg_discovery.apply(log)

In [None]:
# Relations observed this many times or fewer are considered too rare to test.
MAX_RARE_RELATION_COUNT = 30


def _drop_rare_relations(graph, max_count=MAX_RARE_RELATION_COUNT):
    """Delete, in place, every entry of ``graph`` whose count is <= ``max_count``."""
    # Collect the keys first: a dict must not change size while it is iterated.
    rare = [relation for relation, count in graph.items() if count <= max_count]
    for relation in rare:
        del graph[relation]


#remove small sample size relations (optional)
_drop_rare_relations(dfg)

efg_graph = efg_get.apply(log)

#remove small sample size relations (optional)
_drop_rare_relations(efg_graph)

In [None]:
def consecutive_hadms(df, act_1, act_2, activity_col=None, case_col=None):
    """Collect the event rows of every directly consecutive ``act_1 -> act_2`` pair.

    Two events count as directly consecutive when they belong to the same case
    and their index labels differ by exactly 1.
    NOTE(review): this assumes the index of ``df`` numbers the events
    consecutively in trace order - confirm for the frame that is passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Event data containing the activity and case id columns.
    act_1, act_2 : object
        Activity values of the first and the second event of a pair.
    activity_col, case_col : str, optional
        Column names to use. When omitted, the notebook-level ``activity`` and
        ``case_id`` settings are used.

    Returns
    -------
    pandas.DataFrame
        The matching rows in alternating order (first event, second event,
        first event, ...), keeping their original index labels. Empty (without
        columns) when no pair exists.
    """
    # Fall back to the notebook-level column settings when none are passed.
    activity_col = activity if activity_col is None else activity_col
    case_col = case_id if case_col is None else case_col

    df = df.loc[df[activity_col].isin([act_1, act_2])]
    rows_list = []
    # One pass over the groups instead of re-filtering the frame for every
    # case; sort=False keeps the cases in order of first appearance.
    for _, df_hadm in df.groupby(case_col, sort=False):
        pending_index = None
        pending_row = None
        for index, row in df_hadm.iterrows():
            current = row[activity_col]
            if current == act_2 and pending_row is not None and index - pending_index == 1:
                rows_list.append(pending_row)
                rows_list.append(row)
                # A matched event is not reused as the start of another pair.
                pending_row = None
            elif current == act_1:
                # Always track the most recent act_1. Keeping only the first
                # one misses the adjacent pair in sequences such as
                # act_1, act_1, act_2.
                pending_index, pending_row = index, row
            else:
                pending_row = None

    return pd.DataFrame(rows_list)

In [None]:
def eventually_follow_hadms(df, act_1, act_2, activity_col=None, case_col=None):
    """Collect the event rows of every ``act_1 ... act_2`` eventually-follows pair.

    Within each case the first open ``act_1`` event is paired with the next
    ``act_2`` event that comes after it; further ``act_1`` events seen while a
    pair is open are ignored. After a pair is closed the search starts again.

    Parameters
    ----------
    df : pandas.DataFrame
        Event data containing the activity and case id columns.
    act_1, act_2 : object
        Activity values of the first and the second event of a pair.
    activity_col, case_col : str, optional
        Column names to use. When omitted, the notebook-level ``activity`` and
        ``case_id`` settings are used.

    Returns
    -------
    pandas.DataFrame
        The matching rows in alternating order (first event, second event,
        first event, ...), keeping their original index labels. Empty (without
        columns) when no pair exists.
    """
    # Fall back to the notebook-level column settings when none are passed.
    activity_col = activity if activity_col is None else activity_col
    case_col = case_id if case_col is None else case_col

    df = df.loc[df[activity_col].isin([act_1, act_2])]
    rows_list = []
    # One pass over the groups instead of re-filtering the frame for every
    # case; sort=False keeps the cases in order of first appearance.
    for _, df_hadm in df.groupby(case_col, sort=False):
        first_row = None
        for _, row in df_hadm.iterrows():
            current = row[activity_col]
            if first_row is None:
                if current == act_1:
                    first_row = row
            elif current == act_2:
                rows_list.append(first_row)
                rows_list.append(row)
                first_row = None

    return pd.DataFrame(rows_list)

In [None]:
def stat_value_con(dep_1, dep_2, ea, df, activity_col=None):
    """Run a paired ``pg.wilcoxon`` test on attribute ``ea`` between two activities.

    The i-th ``dep_1`` row of ``df`` is paired with the i-th ``dep_2`` row;
    pairs with a missing value on either side are dropped.
    NOTE(review): this positional pairing assumes ``df`` lists the events as
    matched pairs - confirm against the caller.
    When ``dep_1 == dep_2`` both selections are identical, so every value is
    paired with itself.

    Parameters
    ----------
    dep_1, dep_2 : object
        Activity values of the first and second sample.
    ea : str
        Name of the attribute column to compare.
    df : pandas.DataFrame
        Event rows containing the activity column and ``ea``.
    activity_col : str, optional
        Activity column name. When omitted, the notebook-level ``activity``
        setting is used.

    Returns
    -------
    tuple
        ``(p, cles, rbc, n_pairs, mean_1, mean_2, std_1, std_2)``.
        Eight NaNs when fewer than 8 complete pairs exist.
        ``(1, 0, 0, 0, 0, 0, 0, 0)`` when the test itself fails, i.e. a p-value
        of 1 that can never pass a significance threshold.
    """
    # Fall back to the notebook-level column setting when none is passed.
    activity_col = activity if activity_col is None else activity_col

    # reset_index(drop=True) aligns both samples by position and does not
    # depend on the name of the incoming index.
    f1 = df.loc[df[activity_col] == dep_1, ea].reset_index(drop=True)
    f2 = df.loc[df[activity_col] == dep_2, ea].reset_index(drop=True)
    df_wo_na = pd.concat([f1, f2], axis=1, ignore_index=True).dropna()

    df1 = df_wo_na[0]
    df2 = df_wo_na[1]
    l1 = list(df1)
    l2 = list(df2)

    # Too few complete pairs for the test.
    if len(l1) < 8:
        return (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)
    try:
        # Run the test once and read all statistics from the same result;
        # .iloc[0] because the result frame is not indexed by integers.
        result = pg.wilcoxon(l1, l2)
        p = result["p-val"].iloc[0]
        cles = result["CLES"].iloc[0]
        rbc = result["RBC"].iloc[0]
        return (p, cles, rbc, len(l1), df1.mean(), df2.mean(), df1.std(), df2.std())
    except Exception:
        # Deliberate best effort: a failing test is reported as
        # "not significant" instead of aborting the whole analysis loop.
        # `Exception` rather than a bare except so KeyboardInterrupt still works.
        return (1, 0, 0, 0, 0, 0, 0, 0)

In [None]:
# Column layout of the result tables.
RESULT_COLUMNS = ['Act_1', 'Act_2', 'E_At', 'P', 'RBC', 'abs(RBC)', 'var', '#Patients',
                  'M1', 'M2', 'ST1', 'ST2', 'Directly']

# Results are collected as plain records and turned into DataFrames once at the
# end: DataFrame.append was removed in pandas 2.0 and row-wise growth is quadratic.
# NOTE(review): building from records keeps 'Directly' as a boolean column -
# confirm that readers of the exported CSV do not expect a numeric encoding.
con_all_records = []
df_con_records = []
for rel in dfg:
    # Extract the variants from the frame of directly consecutive pairs.
    consecutive_df = consecutive_hadms(final_pm_var, rel[0], rel[1])
    if consecutive_df.empty:
        # No pair rows means no "variant" column to read; skip the relation.
        continue
    variants = consecutive_df["variant"].unique()
    # Keep the continuous dynamic attributes that are present for both activities.
    att_list = att_card.loc[att_card[activity].isin([rel[0], rel[1]])].sum().to_frame().reset_index()
    att_list = att_list.rename({"index":"e_At", 0:"cardinality"}, axis=1)
    att_list = att_list.loc[(att_list["cardinality"] == 2) & (att_list["e_At"].isin(attribute_list_con))].reset_index()
    if att_list.empty:
        continue
    # Significance level divided by the number of attributes tested for this relation.
    alpha = 0.05 / len(att_list)
    for e_at in att_list["e_At"]:
        # None stands for the test over all variants together ('ALL');
        # afterwards every single variant is tested on its own.
        for variant in [None, *variants]:
            if variant is None:
                variant_label = 'ALL'
                df_var = consecutive_df
            else:
                variant_label = variant
                df_var = consecutive_df.loc[consecutive_df["variant"] == variant]
            p, cles, rbc, num_p, m1, m2, st1, st2 = stat_value_con(rel[0], rel[1], e_at, df_var)
            record = {'Act_1': rel[0], 'Act_2': rel[1], 'E_At': e_at, 'P': p, "RBC": rbc, 'abs(RBC)': abs(rbc), 'var' : variant_label, '#Patients' : num_p, 'M1':m1, 'M2':m2, 'ST1':st1, 'ST2':st2, 'Directly':True}
            con_all_records.append(record)
            # A NaN p-value compares False and is therefore never significant.
            if(p <= alpha):
                df_con_records.append(record)

con_All = pd.DataFrame(con_all_records, columns=RESULT_COLUMNS)
df_con = pd.DataFrame(df_con_records, columns=RESULT_COLUMNS)
       

In [None]:
# Column layout of the result tables.
RESULT_COLUMNS = ['Act_1', 'Act_2', 'E_At', 'P', 'RBC', 'abs(RBC)', 'var', '#Patients',
                  'M1', 'M2', 'ST1', 'ST2', 'Directly']

# Results are collected as plain records and added to con_All / df_con once at
# the end: DataFrame.append was removed in pandas 2.0 and row-wise growth is quadratic.
# NOTE: re-running this cell adds the same rows to con_All / df_con again.
efg_all_records = []
efg_sig_records = []
for rel in efg_graph:
    # Extract the variants from the frame of eventually-follows pairs.
    consecutive_df = eventually_follow_hadms(final_pm_var, rel[0], rel[1])
    if consecutive_df.empty:
        # No pair rows means no "variant" column to read; skip the relation.
        continue
    variants = consecutive_df["variant"].unique()
    # Keep the continuous dynamic attributes that are present for both activities.
    att_list = att_card.loc[att_card[activity].isin([rel[0], rel[1]])].sum().to_frame().reset_index()
    att_list = att_list.rename({"index":"e_At", 0:"cardinality"}, axis=1)
    att_list = att_list.loc[(att_list["cardinality"] == 2) & (att_list["e_At"].isin(attribute_list_con))].reset_index()
    if att_list.empty:
        continue
    # Significance level divided by the number of attributes tested for this relation.
    alpha = 0.05 / len(att_list)
    for e_at in att_list["e_At"]:
        # None stands for the test over all variants together ('ALL').
        for variant in [None, *variants]:
            if variant is None:
                variant_label = 'ALL'
                df_var = consecutive_df
            else:
                variant_label = variant
                df_var = consecutive_df.loc[consecutive_df["variant"] == variant]
            p, cles, rbc, num_p, m1, m2, st1, st2 = stat_value_con(rel[0], rel[1], e_at, df_var)
            record = {'Act_1': rel[0], 'Act_2': rel[1], 'E_At': e_at, 'P': p, "RBC": rbc, 'abs(RBC)': abs(rbc), 'var' : variant_label, '#Patients' : num_p, 'M1':m1, 'M2':m2, 'ST1':st1, 'ST2':st2, 'Directly':False}
            efg_all_records.append(record)
            if(p <= alpha):
                efg_sig_records.append(record)
            elif variant is None:
                # Single variants are only examined when the overall test is
                # significant.
                # NOTE(review): the directly-follows cell tests the variants
                # unconditionally - confirm which behaviour is intended.
                break

new_all = pd.DataFrame(efg_all_records, columns=RESULT_COLUMNS)
new_sig = pd.DataFrame(efg_sig_records, columns=RESULT_COLUMNS)
# Empty frames are left out of the concatenation; if nothing remains, the empty
# frame with the result columns is used so the "P" filter below still works.
con_All = pd.concat([frame for frame in (con_All, new_all) if not frame.empty] or [new_all], ignore_index=True)
df_con = pd.concat([frame for frame in (df_con, new_sig) if not frame.empty] or [new_sig], ignore_index=True)

# Drop tests that could not be run (too few pairs give a NaN p-value).
con_All = con_All.loc[~con_All["P"].isna()]

In [None]:
# Sepsis: write all test results, the significant results and the event data
# with its variant column to CSV.
# NOTE(review): unlike the MIMIC-IV export cell, no duplicates are dropped here
# before writing - confirm that this is intended.
# NOTE: run only the export cell that matches the loaded dataset; otherwise the
# same results are written under both dataset names.
con_All.to_csv("Outputs/Change_Detection_Sepsis_con_All.csv")
df_con.to_csv("Outputs/Change_Detection_Sepsis_df_con.csv")
final_pm_var.to_csv("Outputs/Sepsis_Transformed_Var.csv")

In [None]:
# MIMIC-IV: write all test results, the significant results and the event data
# with its variant column to CSV.
# NOTE: run only the export cell that matches the loaded dataset; otherwise the
# same results are written under both dataset names.
# Keep one row per (relation, attribute, variant); the first occurrence wins.
df_con = df_con.drop_duplicates(["Act_1", "Act_2", "E_At", "var"])
con_All = con_All.drop_duplicates(["Act_1", "Act_2", "E_At", "var"])
con_All.to_csv("Outputs/Change_Detection_MIMIC_con_All.csv")
df_con.to_csv("Outputs/Change_Detection_MIMIC_df_con.csv")
final_pm_var.to_csv("Outputs/MIMIC_Transformed_Var.csv")