In [26]:
import logging
import json
from pathlib import Path
import os

# Only WARNING and above are emitted; each record carries timestamp, process id
# and source location.
logging.basicConfig(
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
    level=logging.WARNING,
)

# The project root is taken to be two directory levels above the current
# working directory, so the notebook must be started from its own folder.
PROJECT_ROOT = Path(os.getcwd()).parents[1].resolve()
DATA_PATH = os.path.join(PROJECT_ROOT, 'data')

MODEL_FILE = 'Model_DomesticDeclarations'

# Load the process model from JSON. The encoding is stated explicitly because
# JSON text is UTF-8, whereas open() would otherwise fall back to the platform
# default and could mis-decode non-ASCII labels.
with open(os.path.join(DATA_PATH, MODEL_FILE + '.json'), 'r', encoding='utf-8') as json_file:
    model_json = json.load(json_file)


In [27]:
import pandas as pd
from semconstmining.parsing.parser import get_elements_flat
from semconstmining.parsing.model_to_log import check_soundness, create_log
from semconstmining.parsing.conversion.bpmnjsonanalyzer import fromJSON
from semconstmining.parsing.conversion.jsontopetrinetconverter import JsonToPetriNetConverter

# Parse the raw model JSON; only the first two of the three returned values
# are used in this notebook.
f, l, _ = fromJSON(model_json)

# Flatten the model elements into a table indexed by the 'eid' column.
flat_elements = get_elements_flat(model_json, f, l)
elements = pd.DataFrame.from_records(flat_elements).set_index('eid')

# Convert the parsed model; the three results are handed unchanged to
# create_log (presumably Petri net, initial marking, final marking - TODO confirm).
converter = JsonToPetriNetConverter()
net, im, fm = converter.convert_from_parsed(f, l)

# Build the log that the constraint discovery below runs on.
model_log = create_log(net, im, fm, elements)

In [28]:
from semconstmining.declare.enums import Template
from semconstmining.declare.declare import Declare


# Constraint templates that are excluded from discovery.
ignored_templates = (
    Template.CHAIN_RESPONSE,
    Template.CHAIN_PRECEDENCE,
    Template.CHAIN_SUCCESSION,
    Template.CHOICE,
    Template.NOT_CHAIN_RESPONSE,
    Template.NOT_CHAIN_PRECEDENCE,
)

d4py = Declare()
d4py.config.CONSTRAINT_TYPES_TO_IGNORE = [template.templ_str for template in ignored_templates]

# Name every trace after its position in the log.
for trace_index, model_trace in enumerate(model_log):
    model_trace.attributes["concept:name"] = str(trace_index)

d4py.log = model_log
d4py.compute_frequent_itemsets(min_support=0.0, len_itemset=2, algorithm="apriori")
d4py.discovery(consider_vacuity=True, max_declare_cardinality=2, plain=True)
individual_res, associations = d4py.filter_discovery(min_support=0.99, plain=True)

# Keep the constraint strings, dropping those with an empty or "none" parameter list.
res = {
    const
    for const, _checker_results in individual_res.items()
    if not ("[]" in const or "[none]" in const)
}
res

{'Absence2[Declaration APPROVED by ADMINISTRATION] | |',
 'Absence2[Declaration APPROVED by BUDGET OWNER] | |',
 'Absence2[Declaration FINAL_APPROVED by SUPERVISOR] | |',
 'Absence2[Declaration REJECTED by ADMINISTRATION] | |',
 'Absence2[Declaration REJECTED by BUDGET OWNER] | |',
 'Absence2[Declaration REJECTED by EMPLOYEE] | |',
 'Absence2[Declaration REJECTED by SUPERVISOR] | |',
 'Absence2[Declaration SUBMITTED by EMPLOYEE] | |',
 'Absence2[Payment Handled] | |',
 'Absence2[Request Payment] | |',
 'Alternate Precedence[Declaration APPROVED by ADMINISTRATION, Declaration APPROVED by BUDGET OWNER] | | |',
 'Alternate Precedence[Declaration APPROVED by ADMINISTRATION, Declaration FINAL_APPROVED by SUPERVISOR] | | |',
 'Alternate Precedence[Declaration APPROVED by ADMINISTRATION, Declaration REJECTED by BUDGET OWNER] | | |',
 'Alternate Precedence[Declaration APPROVED by ADMINISTRATION, Declaration REJECTED by SUPERVISOR] | | |',
 'Alternate Precedence[Declaration APPROVED by ADMINIST

In [29]:
import pm4py
# Location and file name of the recorded event log (XES).
LOG_FILE = "DomesticDeclarations.xes"
LOG_PATH = os.path.join(PROJECT_ROOT, "data/logs")

# os.path.join already returns a str, so the path is passed on as is.
log_df = pm4py.read_xes(os.path.join(LOG_PATH, LOG_FILE))

parsing log, completed traces ::   0%|          | 0/10500 [00:00<?, ?it/s]

In [30]:
# Case duration in minutes: span between the earliest and the latest event
# timestamp of each case, broadcast to every event row of that case.
case_timestamps = log_df.groupby('case:concept:name')['time:timestamp']
case_span = case_timestamps.transform('max') - case_timestamps.transform('min')
log_df['duration'] = case_span.dt.total_seconds() / 60
log_df

Unnamed: 0,id,org:resource,concept:name,time:timestamp,org:role,case:id,case:concept:name,case:BudgetNumber,case:DeclarationNumber,case:Amount,duration
0,st_step 86794_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2017-01-09 08:49:50+00:00,EMPLOYEE,declaration 86791,declaration 86791,budget 86566,declaration number 86792,26.851205,4781.533333
1,st_step 86793_0,STAFF MEMBER,Declaration FINAL_APPROVED by SUPERVISOR,2017-01-09 10:27:48+00:00,SUPERVISOR,declaration 86791,declaration 86791,budget 86566,declaration number 86792,26.851205,4781.533333
2,dd_declaration 86791_19,SYSTEM,Request Payment,2017-01-10 08:34:44+00:00,UNDEFINED,declaration 86791,declaration 86791,budget 86566,declaration number 86792,26.851205,4781.533333
3,dd_declaration 86791_20,SYSTEM,Payment Handled,2017-01-12 16:31:22+00:00,UNDEFINED,declaration 86791,declaration 86791,budget 86566,declaration number 86792,26.851205,4781.533333
4,st_step 86798_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2017-01-09 09:26:14+00:00,EMPLOYEE,declaration 86795,declaration 86795,budget 86566,declaration number 86796,182.464172,91144.750000
...,...,...,...,...,...,...,...,...,...,...,...
56432,st_step 138363_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2018-12-29 16:50:14+00:00,EMPLOYEE,declaration 138359,declaration 138359,budget 86566,declaration number 138360,190.404576,17260.900000
56433,st_step 138361_0,STAFF MEMBER,Declaration APPROVED by ADMINISTRATION,2018-12-29 16:56:13+00:00,ADMINISTRATION,declaration 138359,declaration 138359,budget 86566,declaration number 138360,190.404576,17260.900000
56434,st_step 138362_0,STAFF MEMBER,Declaration FINAL_APPROVED by SUPERVISOR,2019-01-03 07:55:52+00:00,SUPERVISOR,declaration 138359,declaration 138359,budget 86566,declaration number 138360,190.404576,17260.900000
56435,dd_declaration 138359_19,SYSTEM,Request Payment,2019-01-08 07:20:28+00:00,UNDEFINED,declaration 138359,declaration 138359,budget 86566,declaration number 138360,190.404576,17260.900000


In [31]:
# Transform the event DataFrame into a PM4Py EventLog; the conformance-checking
# cell below assigns this object to the Declare instance's `log` attribute.
log = pm4py.convert_to_event_log(log_df)

In [32]:
from semconstmining.declare.parsers import parse_decl

# A fresh Declare instance (rebinding `d4py` from the discovery cell): this time
# the recorded log is checked against the constraints discovered from the model.
d4py = Declare()
d4py.log = log
# `res` holds the constraint strings kept by the discovery cell.
d4py.model = parse_decl(res)
# Judging by how the result is consumed in the next cell, this maps each case id
# to the collection of constraint strings that the case violates.
violations = d4py.conformance_checking(consider_vacuity=True)
violations

{'declaration 86791': {'Alternate Precedence[Declaration APPROVED by ADMINISTRATION, Declaration FINAL_APPROVED by SUPERVISOR] | | |',
  'Alternate Precedence[Declaration APPROVED by ADMINISTRATION, Payment Handled] | | |',
  'Alternate Precedence[Declaration APPROVED by ADMINISTRATION, Request Payment] | | |',
  'Exclusive Choice[Declaration APPROVED by ADMINISTRATION, Declaration REJECTED by ADMINISTRATION] | | |',
  'Exclusive Choice[Declaration REJECTED by ADMINISTRATION, Declaration APPROVED by ADMINISTRATION] | | |',
  'Precedence[Declaration APPROVED by ADMINISTRATION, Declaration FINAL_APPROVED by SUPERVISOR] | | |',
  'Precedence[Declaration APPROVED by ADMINISTRATION, Payment Handled] | | |',
  'Precedence[Declaration APPROVED by ADMINISTRATION, Request Payment] | | |',
  'Responded Existence[Declaration FINAL_APPROVED by SUPERVISOR, Declaration APPROVED by ADMINISTRATION] | | |',
  'Responded Existence[Payment Handled, Declaration APPROVED by ADMINISTRATION] | | |',
  'Respo

In [33]:
# Invert the per-case result: constraint string -> list of cases violating it.
# Cases without violations contribute nothing because their inner loop is empty.
violation_to_cases = {}
for case, case_violations in violations.items():
    for violation in case_violations:
        violation_to_cases.setdefault(violation, []).append(case)
violation_to_cases

{'Alternate Precedence[Declaration APPROVED by ADMINISTRATION, Payment Handled] | | |': ['declaration 86791',
  'declaration 86795',
  'declaration 86800',
  'declaration 86731',
  'declaration 86735',
  'declaration 86805',
  'declaration 86809',
  'declaration 86816',
  'declaration 86716',
  'declaration 86720',
  'declaration 86820',
  'declaration 86824',
  'declaration 86828',
  'declaration 86739',
  'declaration 86746',
  'declaration 86752',
  'declaration 86756',
  'declaration 86832',
  'declaration 86836',
  'declaration 86760',
  'declaration 86764',
  'declaration 86771',
  'declaration 86840',
  'declaration 86776',
  'declaration 86783',
  'declaration 86847',
  'declaration 86851',
  'declaration 86572',
  'declaration 86576',
  'declaration 86855',
  'declaration 86859',
  'declaration 86863',
  'declaration 86867',
  'declaration 86871',
  'declaration 86580',
  'declaration 86584',
  'declaration 86875',
  'declaration 86588',
  'declaration 86592',
  'declaration 8

In [66]:
# For each violation, compute the performance of the cases that violate it and compare it to the performance of the cases that do not violate the constraint.
def compute_performance_of_cases_with_violation(violation, violations_to_cases, log):
    """Return the duration of every case that violates ``violation``.

    :param violation: constraint string used as key into ``violations_to_cases``
    :param violations_to_cases: dict mapping constraint strings to case ids
    :param log: event DataFrame with 'case:concept:name' and 'duration' columns
    :return: dict mapping case id to the first non-null 'duration' of that case
    :raises KeyError: if ``violation`` is not a key of ``violations_to_cases``
    """
    violating_cases = violations_to_cases[violation]
    belongs_to_violating_case = log['case:concept:name'].isin(violating_cases)
    # One duration per case: the first one found among the case's event rows.
    durations = log.loc[belongs_to_violating_case].groupby('case:concept:name')['duration'].first()
    return durations.to_dict()

def compute_performance_of_cases_without_violation(activated_but_not_violated, log):
    """Return the duration of every case listed in ``activated_but_not_violated``.

    :param activated_but_not_violated: iterable of case ids to look up
    :param log: event DataFrame with 'case:concept:name' and 'duration' columns
    :return: dict mapping case id to the first non-null 'duration' of that case
    """
    is_selected_case = log['case:concept:name'].isin(activated_but_not_violated)
    # One duration per case: the first one found among the case's event rows.
    durations = log.loc[is_selected_case].groupby('case:concept:name')['duration'].first()
    return durations.to_dict()


def compute_cases_that_activate_constraint(constraint_string, original_params, log):
    """Return the ids of the cases in which the given constraint is activated.

    Which activity counts as the activation depends on substring matches against
    ``constraint_string`` (so e.g. "Alternate Precedence" and "Not Precedence"
    take the "Precedence" rule, "Exclusive Choice" takes the "Choice" rule):

    * "Precedence" with two parameters: the second parameter occurs in the case.
    * "Choice" with two parameters: either parameter occurs in the case.
    * anything else with at least one parameter: the first parameter occurs.
    * no parameters: never activated.

    :param constraint_string: constraint string, e.g. "Precedence[A, B] | | |"
    :param original_params: list of activity names taken from the constraint
    :param log: event DataFrame with 'case:concept:name' and 'concept:name' columns
    :return: pandas Index of the activated case ids
    """
    print(f"Original params: {original_params}")
    # Unique activity names per case.
    activities_per_case = log.groupby('case:concept:name')['concept:name'].unique()

    has_two_params = len(original_params) > 1

    def is_activated(activities):
        # Only the function's own arguments are consulted here; the previous
        # version read a notebook-level variable in the "Choice" branch.
        if has_two_params and "Precedence" in constraint_string:
            return original_params[1] in activities
        if has_two_params and "Choice" in constraint_string:
            return original_params[1] in activities or original_params[0] in activities
        if len(original_params) > 0:
            return original_params[0] in activities
        return False

    # astype(bool) keeps the mask usable for indexing even when the log is empty.
    activated = activities_per_case.apply(is_activated).astype(bool)
    return activities_per_case[activated].index


In [67]:
# Compute the average performance (case duration) of cases with and without
# violation per constraint and collect the results as records:
#   performance_records       - one summary record per constraint
#   performance_records_full  - one record per (constraint, case)
performance_records = []
performance_records_full = []
known_durations = {}
for violation, violating_cases in violation_to_cases.items():
    print(("-"*50))
    print(f"Violation: {violation}")
    # The activity names are the comma-separated text between the square brackets.
    orig_params = [x.strip() for x in violation.split("[")[1].split("]")[0].split(",")]
    activated_cases = compute_cases_that_activate_constraint(violation, orig_params, log_df)
    activated_but_not_violated = list(set(activated_cases) - set(violating_cases))
    print(f"Activated cases: {len(activated_cases)}")
    print(f"Cases with violation: {len(violating_cases)}")
    performance_with_violation = compute_performance_of_cases_with_violation(violation, violation_to_cases, log_df)
    known_durations |= performance_with_violation
    # Decide on the comparison group by looking at the set difference itself.
    # Comparing list lengths is not enough: violating cases are not necessarily
    # a subset of the activated cases, so a non-empty comparison group could be
    # missed and its cases would later be looked up without having been loaded.
    if activated_but_not_violated:
        print(f"Violation {violation} is not violated whenever it is activated")
        performance_without_violation = compute_performance_of_cases_without_violation(activated_but_not_violated, log_df)
        known_durations |= performance_without_violation
    else:
        # No comparison group: leave it empty so the guard below skips the
        # constraint instead of comparing the violating cases with themselves.
        performance_without_violation = {}
    if len(performance_with_violation) == 0 or len(performance_without_violation) == 0:
        print(f"Violation {violation} has no cases with or without violation")
        continue
    for case in violating_cases:
        performance_records_full.append({"violation": violation, "violated": True, "case": case, "performance": known_durations[case]})
    for case in activated_but_not_violated:
        performance_records_full.append({"violation": violation, "violated": False, "case": case, "performance": known_durations[case]})
    # Both dicts are non-empty here, so the divisions are safe.
    avg_with_violation = sum(performance_with_violation.values()) / len(performance_with_violation)
    avg_without_violation = sum(performance_without_violation.values()) / len(performance_without_violation)
    performance_records.append({"violation": violation,
                                "avg_performance_with_violation": avg_with_violation,
                                "avg_performance_without_violation": avg_without_violation,
                                "diff": avg_with_violation - avg_without_violation,
                                "cases_with_violation": len(performance_with_violation),
                                "cases_without_violation": len(performance_without_violation)})
    

--------------------------------------------------
Violation: Alternate Precedence[Declaration APPROVED by ADMINISTRATION, Payment Handled] | | |
Original params: ['Declaration APPROVED by ADMINISTRATION', 'Payment Handled']
Activated cases: 10044
Cases with violation: 2123
Violation Alternate Precedence[Declaration APPROVED by ADMINISTRATION, Payment Handled] | | | is not violated whenever it is activated
--------------------------------------------------
Violation: Precedence[Declaration APPROVED by ADMINISTRATION, Payment Handled] | | |
Original params: ['Declaration APPROVED by ADMINISTRATION', 'Payment Handled']
Activated cases: 10044
Cases with violation: 2123
Violation Precedence[Declaration APPROVED by ADMINISTRATION, Payment Handled] | | | is not violated whenever it is activated
--------------------------------------------------
Violation: Precedence[Declaration APPROVED by ADMINISTRATION, Request Payment] | | |
Original params: ['Declaration APPROVED by ADMINISTRATION', 'Req

In [68]:
# One summary row per constraint, built from the records collected by the loop
# in the previous cell.
performance_df = pd.DataFrame.from_records(performance_records)
performance_df

Unnamed: 0,violation,avg_performance_with_violation,avg_performance_without_violation,diff,cases_with_violation,cases_without_violation
0,Alternate Precedence[Declaration APPROVED by A...,15383.054074,17091.862612,-1708.808538,2123,7921
1,Precedence[Declaration APPROVED by ADMINISTRAT...,15383.054074,17091.862612,-1708.808538,2123,7921
2,Precedence[Declaration APPROVED by ADMINISTRAT...,15458.086422,16998.059172,-1539.972749,2126,7914
3,Alternate Precedence[Declaration APPROVED by A...,15458.086422,16998.059172,-1539.972749,2126,7914
4,"Responded Existence[Payment Handled, Declarati...",15383.054074,17091.862612,-1708.808538,2123,7921
...,...,...,...,...,...,...
194,"Response[Declaration REJECTED by SUPERVISOR, D...",26591.753333,27883.879891,-1292.126558,5,276
195,Not Response[Declaration APPROVED by BUDGET OW...,99258.158333,20227.045585,79031.112748,6,2801
196,Not Precedence[Declaration APPROVED by BUDGET ...,99258.158333,29698.569464,69559.588869,6,840
197,Not Precedence[Declaration REJECTED by BUDGET ...,63904.650000,27732.160595,36172.489405,1,280


In [69]:
# Keep only the constraints whose violating cases take longer on average
# ('diff' is the with-violation mean minus the without-violation mean).
longer_when_violated = performance_df['diff'].gt(0)
performance_df = performance_df.loc[longer_when_violated]
performance_df

Unnamed: 0,violation,avg_performance_with_violation,avg_performance_without_violation,diff,cases_with_violation,cases_without_violation
6,Exclusive Choice[Declaration APPROVED by ADMIN...,18615.015803,15799.983307,2815.032497,2973,7527
9,Exclusive Choice[Declaration REJECTED by ADMIN...,18615.015803,15799.983307,2815.032497,2973,7527
11,Alternate Succession[Declaration FINAL_APPROVE...,45976.595971,16477.125592,29499.470379,91,9981
12,Alternate Response[Declaration FINAL_APPROVED ...,46492.678277,16477.125592,30015.552685,89,9981
13,Not Response[Declaration FINAL_APPROVED by SUP...,49749.284483,16551.196604,33198.087879,58,10012
...,...,...,...,...,...,...
193,Responded Existence[Declaration REJECTED by SU...,32483.062500,27794.142118,4688.920382,4,277
195,Not Response[Declaration APPROVED by BUDGET OW...,99258.158333,20227.045585,79031.112748,6,2801
196,Not Precedence[Declaration APPROVED by BUDGET ...,99258.158333,29698.569464,69559.588869,6,840
197,Not Precedence[Declaration REJECTED by BUDGET ...,63904.650000,27732.160595,36172.489405,1,280


In [74]:
from scipy.stats import ttest_ind

performance_df_full = pd.DataFrame.from_records(performance_records_full)

# Per constraint, test whether the performance of violating and non-violating
# cases differs in a statistically significant way.
significance_records = []
for violation, violation_df in performance_df_full.groupby('violation'):
    is_violated = violation_df['violated']
    performance_violated = violation_df.loc[is_violated, 'performance']
    performance_not_violated = violation_df.loc[~is_violated, 'performance']
    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    t_test_result = ttest_ind(performance_violated, performance_not_violated, equal_var=False)
    significance_records.append({
        "violation": violation,
        "t_statistic": t_test_result.statistic,
        "p_value": t_test_result.pvalue,
    })

significance_df = pd.DataFrame.from_records(significance_records)
significance_df

Unnamed: 0,violation,t_statistic,p_value
0,Absence2[Declaration APPROVED by ADMINISTRATIO...,8.068548,4.817056e-14
1,Absence2[Declaration APPROVED by BUDGET OWNER]...,3.671020,3.110256e-03
2,Absence2[Declaration FINAL_APPROVED by SUPERVI...,3.178423,2.392335e-03
3,Absence2[Declaration REJECTED by ADMINISTRATIO...,2.477297,1.482597e-02
4,Absence2[Declaration REJECTED by BUDGET OWNER]...,,
...,...,...,...
194,"Response[Declaration REJECTED by SUPERVISOR, D...",-0.072442,9.456785e-01
195,"Response[Request Payment, Payment Handled] | | |",1.182465,3.585454e-01
196,Succession[Declaration FINAL_APPROVED by SUPER...,0.669966,5.081687e-01
197,Succession[Declaration FINAL_APPROVED by SUPER...,1.673638,1.036508e-01


In [75]:
# Drop every constraint whose performance difference is not statistically
# significant; rows with a missing p-value are dropped as well.
# NOTE(review): the threshold is applied per test, without any correction for
# the number of constraints tested - confirm this is intended.
is_significant = significance_df['p_value'].lt(0.05)
significance_df = significance_df.loc[is_significant]
significance_df

Unnamed: 0,violation,t_statistic,p_value
0,Absence2[Declaration APPROVED by ADMINISTRATIO...,8.068548,4.817056e-14
1,Absence2[Declaration APPROVED by BUDGET OWNER]...,3.671020,3.110256e-03
2,Absence2[Declaration FINAL_APPROVED by SUPERVI...,3.178423,2.392335e-03
3,Absence2[Declaration REJECTED by ADMINISTRATIO...,2.477297,1.482597e-02
5,Absence2[Declaration REJECTED by EMPLOYEE] | |,4.037500,8.369246e-05
...,...,...,...
176,Responded Existence[Declaration FINAL_APPROVED...,-2.974422,2.955894e-03
180,Responded Existence[Declaration REJECTED by SU...,-3.964199,1.070663e-04
182,"Responded Existence[Payment Handled, Declarati...",-3.125242,1.791526e-03
186,"Responded Existence[Request Payment, Declarati...",-2.811462,4.960478e-03
