In [17]:
import numpy as np
import pandas as pd
import math
import os
import sys
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from lime import lime_tabular
from lime import submodular_pick
from matplotlib import pyplot as plt

plt.style.use("seaborn")


# Check loops
def checkLoops(expName):
    data = ds.loc[:,['caseId', 'activity']]

    actCount = data.groupby(data.columns.tolist()).size().reset_index().\
                        rename(columns={0:'records'})
    print(actCount.describe())
    #print(actCount[:20])
    
def checkCorrelation():
    ds['Accept Claim'] = ds['Accept Claim'].astype(bool)
    print('Accept == label (1)', ds['Accept Claim'].equals(ds['label']))

    ds['Reject Claim'] = ds['Reject Claim'].astype(bool)
    print('Reject == label (0)', ds['Reject Claim'].equals(~ds['label']))
    
# Delete out loops, Input: trace (lista), Ouput: traceSemLoop (lista)
def dropLoopsInTraces(traceList):
    prev = object()
    traceList = [prev := v for v in traceList if prev != v]
    return traceList

# Transform the log into dataset of traces
def f_Traces(x):
    return pd.Series(dict(trace='%s' % ','.join(x['activity']),
                      nrEvents=x['activity'].count(),
                      target=int(x[target].mean())))

# Transform the log into dataset of traces considering resources
def f_TracesComplex(x):
    return pd.Series(dict(tracea='%s' % ','.join(x['activity']),
                          tracer='%s' % ','.join(x['Resource']),
                          nrEvents=x['activity'].count(),
                          target=int(x[target].mean())))

# Load event log data
def loadData(data, column_caseId, column_activity):   
    global ds, target, class_names, expName
    ds = pd.read_csv('data/' + data.split('_')[0] + '.csv', sep=',')
    target = 'label'
    class_names = [0, 1]
    ds = ds.rename(columns={column_caseId: "caseId", column_activity: "activity"})
    expName = data
    print("Data loaded...")

# Pre-process data: simple indexing, considering trace positions,  Input: dataset of traces (dataframe), Ouput: dataset of activity per position in trace (dataframe)
def simpleIndexingNoLoops():
    global ds
    ds = ds.groupby('caseId').apply(f_Traces)
    ds_new = pd.DataFrame(columns=['caseId', 'trace', 'nrEvents', target])
    for item in ds.iterrows():
        noduplicates = dropLoopsInTraces(item[1][0].split(','))
        ds_new.loc[len(ds_new)] = [int(item[0]), ','.join(noduplicates), len(noduplicates), int(item[1][2])]

    ds_new[['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'a10', 'a11', 'a12', 'a13']] = ds_new['trace'].str.split(',', expand=True)
    ds_new = ds_new.drop(['caseId', 'trace', 'nrEvents'], axis=1)
    ds_new.to_csv('results/' + expName + '_traces.csv')
    ds = ds_new.copy()
    print("Data preprocessed... SIMPLE-INDEXING ENCODING")

    
# Pre-process data: frequency indexing, considering frequency of occurrence of an activity in the trace
def frequencyIndexingWithLoops():
    global ds
    ds.loc[ds.activity =='Accept Claim','activity']='Decide claim'
    ds.loc[ds.activity =='Reject Claim','activity']='Decide claim'
    ds_new = pd.DataFrame()
    ds_new[['caseId', 'activity','label']] = ds[['caseId', 'activity','label']].copy()
    ds_new['val'] = 1
    ds_new = ds_new.groupby(['caseId', 'label', 'activity'])['val'].sum().unstack(fill_value=0)
    ds_new.reset_index(level = 'label', inplace=True)
    ds_new.reset_index(level = 'caseId', inplace=True)
    ds_new.drop(['caseId'], axis = 1, inplace=True)
    ds_new.to_csv('results/' + expName + '_frequencytraces.csv')
    ds = ds_new.copy()
    print("Data preprocessed... FREQUENCY-INDEXING ENCODING")

# Apply one-hot encoding and split data
def encodingAndSplitData():
    global X, y, X_train, X_test, y_train, y_test

    X = pd.get_dummies(ds.drop(columns=[target]))
    y = ds[target].astype(bool)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Set the random_state parameter to 42 for getting the same split
    print("Data splitted...")
    print('X_train shape: ', X_train.shape)
    print('y_train shape: ', y_train.shape)
    print('X_test shape: ', X_test.shape)
    print('y_test shape: ', y_test.shape)
    
# Train um RandomForestClassifier from ScikitLearn
def trainRFModel():
    global model
    model = RandomForestClassifier(random_state = np.random.seed(42))
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print('Model trained... RFC score: ', score)
          

# Apply Lime for an instance, Input: CaseId (int), Output: LimeExp (lime object)
def applyLimeIn(points, nrFeatures):
    explainer = lime_tabular.LimeTabularExplainer(
                                                    training_data=np.array(X_train),
                                                    feature_names=X_train.columns,
                                                    class_names=class_names,
                                                    mode='classification')

    exp_points = []
    exp_list = []

    for idx in points:
        exp = explainer.explain_instance(data_row=X_train.loc[idx], predict_fn=model.predict_proba, num_features=nrFeatures)
        exp_list = exp.as_list()
        exp_list.append(round(model.predict_proba([X_train.loc[idx]])[0, 0], 2))
        exp_list.append(round(model.predict_proba([X_train.loc[idx]])[0, 1], 2))
        exp_list.append(ds.loc[idx, target]) #exp_list.append(ds_new.loc[idx, target])
        exp_list.append(idx)
        exp_points.append(exp_list)
    fileName = "results/"+ expName + "_explanations.csv"
    with open(fileName, "w") as file:
        for row in exp_points:
            file.write("%s\n" % ';'.join(str(col) for col in row))
    print("Explanations file created... " + fileName)

    
# Apply SP-Lime
def applySPLimeIn(sampleSize, nrFeatures, nrExplanations):
    explainer = lime_tabular.LimeTabularExplainer(
                                                    training_data=np.array(X_train),
                                                    feature_names=X_train.columns,
                                                    class_names=class_names,
                                                    mode='classification')

    training_data=np.array(X_train)
    sp_obj = submodular_pick.SubmodularPick(explainer, data=training_data, predict_fn=model.predict_proba, sample_size=sampleSize, num_features=nrFeatures, num_exps_desired=nrExplanations)
    exp_points = sp_obj.sp_explanations[9].as_list()
    fileName = "results/"+ expName + "_explanationsSPLIME.csv"
    with open(fileName, "w") as file:
        for row in exp_points:
            file.write("%s\n" % ';'.join(str(col) for col in row))
    print("Explanations with SP-LIME file created... "+fileName)

# Plot results from file generated by the LIME module, Input: Number of figures per row (int), Experiment Name (str)
def plotLimeResults(plotsPerRow, expName):        
    i, j = 0, 0
    fileName = "results/"+ expName + "_explanations.csv"
    with open(fileName, "r") as file:
        exp_points = list(file)
    exp_points = [x.rstrip() for x in exp_points]
    exp_points = [list(x.split(';')) for x in exp_points]

    # Graph multiplot
    fig, axs = plt.subplots(nrows=math.ceil(len(exp_points) / plotsPerRow), ncols=plotsPerRow, constrained_layout=True, figsize=(20, 20))  # #squeeze=False, you can force the result to be a 2D-array, independant of the number or arrangement of the subplots
    fig.suptitle('Experiment: %s' % expName, fontsize=20)  # title for entire figure

    exp_list = []
    for exp_list in exp_points:
        exp_list = [x.strip('(') for x in exp_list]
        exp_list = [x.strip(')') for x in exp_list]
        exp_list = [x.split(', ') for x in exp_list]
        names = [x[0].strip("\'") for x in exp_list[:-4]]  # Y
        names = [n.rpartition('.0')[0] for n in names]
        vals = [round(float(x[1]), 3) for x in exp_list[:-4]]  # X
        pos = np.arange(len(exp_list) - 4) + .5
        prob0 = exp_list[len(exp_list) - 4][0]
        prob1 = exp_list[len(exp_list) - 3][0]
        y_target = exp_list[len(exp_list) - 2][0]
        idx = exp_list[len(exp_list) - 1][0]
        vals.reverse()
        names.reverse()
        colors = ['green' if x > 0 else 'red' for x in vals]
        axs[i][j].set_title('Case Id: %s\nProbability for class 0 = %s \nProbability for class 1 = %s\n Target/Y: %s' % (idx,
                                                                                                                      str(prob0),
                                                                                                                      str(prob1),
                                                                                                                      y_target))
        axs[i][j].barh(pos, vals, align='center', color=colors)
        axs[i][j].set_yticks(pos, names)
        j += 1
        if j % plotsPerRow == 0:
            i += 1
            j = 0

    plt.show()
    fig.savefig(str("results/" + expName + "_plotLime.jpg"), bbox_inches='tight')
    print('Figure saved...' + "results/" + expName + "_plotLime.jpg")

# plot LIME-SP results in a a global way ***not working yet***
def plotLimeSPResults(sp_obj):
    #Plot explanations
    [explainer.as_pyplot_figure(label=explainer.available_labels()[0]) for explainer in sp_obj.sp_explanations];
    # Make it into a dataframe SP-LIME
    W_pick=pd.DataFrame([dict(this.as_list(this.available_labels()[0])) for this in sp_obj.sp_explanations]).fillna(0)
    # Getting SP predictions
    W_pick['prediction'] = [this.available_labels()[0] for this in sp_obj.sp_explanations]
    W_pick.to_csv("results/"+ expName + "_expSPLime.csv")
    print('Results saved...' + "results/" + expName + "_resultsSPLime.jpg")
    
    #Making a dataframe of all the explanations of sampled points SIMPLE - LIME
    W=pd.DataFrame([dict(this.as_list(this.available_labels()[0])) for this in sp_obj.explanations]).fillna(0)
    W['prediction'] = [this.available_labels()[0] for this in sp_obj.explanations]
    W.to_csv("results/"+ expName + "_expLime.csv")
    print('Results saved...' + "results/" + expName + "_resultsLime.jpg")

In [None]:
expName = 'trainMaggiOriginal_artigoWithDuplicates3'
loadData(expName, 'Case ID', 'Activity')
#checkLoops(expName)

# Select type of modeling for traces in log
#simpleIndexingNoLoops()
frequencyIndexingWithLoops()
#checkCorrelation()

global ds
#ds = ds.drop(['Accept Claim', 'Reject Claim'], axis=1)

# dropping ALL duplicate values
ds.drop_duplicates(keep = "first", inplace = True)

# Train a ML model
encodingAndSplitData()
trainRFModel()

# Basic LIME 
# points=[704, 780, 1047, 1710, 2379, 2556, 2584, 2721]
# applyLimeIn(points, 17)
# plotLimeResults(2, expName)

# Not working complete yet
# SP-LIME
explainer = lime_tabular.LimeTabularExplainer(
                                                    training_data=np.array(X_train),
                                                    feature_names=X_train.columns,
                                                    class_names=class_names,
                                                    mode='classification')

training_data=np.array(X_train)
sp_obj = submodular_pick.SubmodularPick(explainer, data=training_data, predict_fn=model.predict_proba, num_features=20, num_exps_desired=10)

plotLimeSPResults(sp_obj)
#-------- ok ----------

In [19]:
X_train.columns

Index(['Archive', 'Contact Hospital', 'Create Questionnaire', 'Decide claim',
       'High Insurance Check', 'High Medical History', 'Low Insurance Check',
       'Low Medical History', 'Prepare Notification Content',
       'Receive Questionnaire Response', 'Register',
       'Send Notification by Phone', 'Send Notification by Post',
       'Send Notification by e-mail', 'Send Questionnaire',
       'Skip Questionnaire'],
      dtype='object', name='activity')

In [16]:
sp_obj.explanations[2].available_labels()

[0]

In [5]:
df = pd.DataFrame({})
for this_label in range(3):
    dfl=[]
    for i,exp in enumerate(sp_obj.sp_explanations):
        l=exp.as_list(label=this_label)
        l.append(("exp number",i))
        dfl.append(dict(l))
    dftest=pd.DataFrame(dfl)
    df=df.append(pd.DataFrame(dfl,index=[iris.target_names[this_label] for i in range(len(sp_obj.sp_explanations))]))
df

KeyError: 0