In [40]:
import re
import json
import glob
import numpy as np
import pandas as pd

In [3]:
def CustomParser(data):
#     import json
    j1 = json.loads(data)
    return j1

In [5]:
def subprocess_exist(shapes):
    subprocess = False
    for shape in shapes:
        shape_stencil = shape['stencil']['id']
        if shape_stencil == 'Subprocess':
            subprocess = True
    return subprocess    

In [6]:
%%time

for f in glob.glob('../../thesis_data/data_with_meta/*_meta.csv'):
    with open(f) as csv:
        data = pd.read_csv(csv, converters={'Model JSON':CustomParser})
        data = data[data['Type'] == 'BPMN 2.0'] 
        for index, row in data.iterrows():
            model_json = row['Model JSON']
            shapes = model_json['childShapes']
            if subprocess_exist(shapes) and 'properties' in model_json.keys() and 'language' in model_json['properties'].keys() and model_json['properties']['language'] == 'English':
                file = './BPMAI2/'+row['Model ID']+'.json'
                model_json.update({'csv_id': f})
                with open(file, 'w') as new_file:
                    json.dump(row['Model JSON'], new_file)

CPU times: user 8min 23s, sys: 16.9 s, total: 8min 40s
Wall time: 8min 42s


In [2]:
def process_flow(shapes):
    shapes_id = {}
    follows = {}
    flow = {}
    tasks_subprocesses = ['Task', 'CollapsedSubprocess', 'Subprocess']
    shapes_unwanted = ['DataObject', 'ITSystem', 'TextAnnotation', 
                      ' Association_Undirected', 'Association_Unidirectional', 'MessageFlow']
    outputs = [shapes_id, follows, flow]

    for shape in shapes:
        shape_stencil = shape['stencil']['id']
        shape_ID = shape['resourceId']
        if shape_stencil in shapes_unwanted:
            continue
        shapes_id.update({shape_ID: shape_stencil})
        
        outgoingShapes = [s['resourceId'] for s in shape['outgoing']]
        if shape_ID not in follows.keys():
            follows[shape_ID] = outgoingShapes
    
        if shape_stencil in tasks_subprocesses:
            if not shape['properties']['name'] == "":
                flow[shape_ID] = shape['properties']['name'].replace('\n', ' ').replace('\r', '').replace('  ', ' ')
            else:
                flow[shape_ID] = 'Task or Subprocess'
        else:
            if 'name' in shape['properties'] and not shape['properties']['name'] == "":
                flow[shape_ID] = shape_stencil + " (" + shape['properties']['name'].replace('\n', ' ').replace('\r', '').replace('  ', ' ') + ")"
            else:
                flow[shape_ID] = shape_stencil

    return outputs
    

In [3]:
def process_subprocess_with_label(shapes):
    shapes_id = {}
    follows = {}
    flow = {}
    subprocess_name = []
    outputs = [shapes_id, follows, flow]
    for shape in shapes:
        shape_stencil = shape['stencil']['id']
        shape_ID = shape['resourceId']
        if shape_stencil == 'SequenceFlow':
            outgoingShapes = [s['resourceId'] for s in shape['outgoing']]
            if shape_ID not in follows.keys():
                follows[shape_ID] = outgoingShapes
            
        if shape_stencil in ['Pool', 'Lane']:
            results, names = process_subprocess_with_label(shape['childShapes'])
            for o, r in zip(outputs, results):
                o.update(r)
            subprocess_name += names
            
        if shape_stencil == 'Subprocess':
            results = process_flow(shape['childShapes'])
            if 'name' in shape['properties'] and not shape['properties']['name'] == "":
                for o, r in zip(outputs, results):
                    o.update(r)
                name = shape['properties']['name'].replace('\n', ' ').replace('\r', '').replace('  ', ' ')
                subprocess_name.append(name)
                
    return outputs, subprocess_name

In [4]:
def sort_process_flows(flow, directly_follows, shapes_wanted, gateways_count, temp_closing_count):
    process_flow = []

    for flow_object in flow:
        if flow_object in shapes_wanted and flow_object not in temp_closing_count.keys():
            process_flow.append(flow_object)
        
        if flow_object in gateways_count.keys() and flow_object not in temp_closing_count.keys():
            process_flow.append(flow_object)
            
        if flow_object in temp_closing_count.keys():
            count = temp_closing_count[flow_object]-1
            temp_closing_count.update({flow_object: count})
            if temp_closing_count[flow_object] == 0:
                process_flow.append(flow_object)
            else:
                continue
                    
        if flow_object in directly_follows.keys() and directly_follows[flow_object]: 
            flow = directly_follows[flow_object]
            result = sort_process_flows(flow, directly_follows, shapes_wanted, gateways_count, temp_closing_count)
            process_flow = process_flow + result
            
    return process_flow
    

In [5]:
subprocess = {}
file_id = {}
temp_training_data = {}
subprocess_names = {}

for i, file in enumerate(glob.glob('./BPMAI2/*.json')):
    file_id.update({i: file})
    with open(file, 'r') as f:
        model_json = json.load(f)
        (shapes_id, directly_follows, flows), subprocess_name = process_subprocess_with_label(model_json['childShapes'])
        
        stencils = set()
        tasks_subprocesses_id = set()
        start_events_id = set()
        end_events_id = set()
        int_events_id = set()
        gateways_count = {}
        closing_gateways_id = set()
        shapes_unwanted_id = set()
        tasks_subprocesses = ['Task', 'CollapsedSubprocess', 'Subprocess']
        gateways = ['Exclusive_Databased_Gateway', 'InclusiveGateway', 'ParallelGateway']
        shapes_unwanted = ['DataObject', 'ITSystem', 'TextAnnotation', 
                        'Association_Undirected', 'Association_Unidirectional', 'MessageFlow']

        for s in shapes_id.keys():
            stencils.add(shapes_id[s])
            if re.match('(Start.)', shapes_id[s]):
                start_events_id.add(s)
            if re.match('(End.)', shapes_id[s]):
                end_events_id.add(s)
            if re.match('(Intermediate.)', shapes_id[s]):
                int_events_id.add(s)
            if shapes_id[s] in gateways:
                gateways_count.update({s: len(directly_follows[s])})
                if len(directly_follows[s]) == 1:
                    closing_gateways_id.add(s)
            if shapes_id[s] in tasks_subprocesses:
                tasks_subprocesses_id.add(s)
            if shapes_id[s] in shapes_unwanted:
                shapes_unwanted_id.add(s)

        for f in directly_follows.copy():
            if f in shapes_unwanted_id:
                directly_follows.pop(f)
            if f in directly_follows.keys() and directly_follows[f]:
                for r in directly_follows[f]:
                    if r in shapes_unwanted_id:
                        directly_follows[f].remove(r)

        closing_gateways_tasks_count = {}
        for g in closing_gateways_id:
            count = 0
            for f in directly_follows.values():
                count += sum(1 if re.match(g, x) else 0 for x in f)
            closing_gateways_tasks_count[g] = count

        for t in tasks_subprocesses_id:
            count = 0
            for f in directly_follows.values():
                count += sum(1 if re.match(t, x) else 0 for x in f)
            if count > 1:
                closing_gateways_tasks_count[t] = count

        for e in end_events_id:
            count = 0
            for f in directly_follows.values():
                count += sum(1 if re.match(e, x) else 0 for x in f)
            if count > 1:
                closing_gateways_tasks_count[e] = count

        shapes_wanted = set.union(tasks_subprocesses_id, start_events_id, end_events_id, int_events_id)
        temp_closing_count = closing_gateways_tasks_count.copy()
        file_data = []
        for s in start_events_id:
            data = []
            label = []
            flow = directly_follows[s]
            result = sort_process_flows(flow, directly_follows, shapes_wanted, gateways_count, temp_closing_count)
            result.insert(0,s)
            result_copy = result.copy()    
            names = flows  

            for n, obj in enumerate(result_copy):
                if (obj in start_events_id) or (obj in int_events_id) or (obj in end_events_id):
                    if obj in names:
                        res = re.findall(r'\(.*?\)', names[obj])
                        if res:
                            names[obj] = res[0][1:-1]
                        else:
                            result.remove(obj)
                if obj in gateways_count:
                    result.remove(obj)

            for r in result:
                if r in names:
                    data.append(names[r])
            file_data.append([x.lower() for x in data])

        if len(file_data):
            temp_training_data.update({i: file_data})
            subprocess_names.update({i: [x.lower() for x in subprocess_name]})

    subprocess_train_dataset = {'document': temp_training_data, 'subprocess_names': subprocess_names,
                                'file_id': file_id}


In [21]:
from IPython.display import clear_output

doc = subprocess_train_dataset['document']
label = subprocess_train_dataset['subprocess_names']
subprocess_with_label = {}

for i, (d_v, l_v) in enumerate(zip(doc.values(), label.values())):
    for n, (d, l) in enumerate(zip(d_v, l_v)):
        clear_output(wait=True)
        print(i,'\n', l, '\n', d)
        need = input('need? ')
        if need == 'y':
            key = str(i)+'_'+str(n)
            subprocess_with_label.update({key: {l: d}})

132 
 assign inquiry 
 ['the inquiry is new', 'read e-mail information', 'read mail content', 'write content as body of notification', "read sender's e-mail address ", 'identify staff member from database', 'write name staff member as "sender" in notification', 'read mail subject', 'write subject as head of notification', 'notification created']


In [22]:
subprocess_with_label.keys()

dict_keys(['0_0', '0_1', '0_2', '1_0', '2_0', '2_1', '3_0', '5_0', '6_0', '7_0', '7_1', '7_2', '7_3', '7_4', '7_5', '11_0', '12_0', '13_0', '14_0', '15_0', '15_1', '17_0', '17_1', '18_0', '18_1', '19_0', '20_0', '25_0', '26_0', '31_0', '31_1', '32_0', '32_1', '33_0', '40_0', '41_0', '42_0', '43_0', '45_0', '45_1', '45_2', '49_0', '52_0', '54_0', '54_1', '54_2', '56_0', '56_1', '58_0', '62_0', '62_1', '62_2', '62_3', '62_4', '62_5', '62_6', '62_7', '63_0', '65_0', '67_0', '68_0', '69_0', '71_0', '71_1', '72_0', '73_0', '75_0', '75_1', '76_0', '77_0', '77_2', '78_0', '78_1', '80_0', '84_0', '84_1', '84_2', '88_0', '89_0', '91_0', '91_1', '96_0', '97_0', '98_0', '99_0', '99_1', '100_0', '101_0', '102_0', '104_0', '105_0', '107_0', '111_0', '111_1', '113_0', '113_1', '113_2', '115_0', '118_0', '118_1', '118_2', '119_0', '122_0', '122_1', '122_2', '122_3', '122_4', '123_0', '129_0', '130_0', '131_0', '132_0', '132_1'])

In [34]:
with open('new_labeled_dataset_subprocess.json', 'w') as f:
    json.dump(subprocess_with_label, f)

In [35]:
with open('new_labeled_dataset_subprocess.json', 'r') as f:
    json_data = json.load(f)

In [68]:
subprocesses = json_data.values()
labels = []
docs = []
for s in subprocesses:
    labels.append(list(s.keys())[0])
    docs.append(list(s.values())[0])

In [69]:
len(docs)

112

In [74]:
unique_docs, idx_start = np.unique(docs, return_index=True)
len(unique_docs)

  ar = np.asanyarray(ar)


97

In [72]:
unique_subprocesses = {}
for i, (l, d) in enumerate(zip(labels, docs)):
    if i in idx_start:
        unique_subprocesses.update({i: {l: d}})

In [82]:
subprocess_labels = []
subprocess_docs = []
for i, s in unique_subprocesses.items():
    clear_output(wait=True)
    s_k = list(s.keys())[0]
    s_v = list(s.values())[0]
    s_v = ', '.join(s_v)
    print(i,'\n', s_k, '\n', s_v)
    change_name = input('change name? ')
    if change_name != "":
        subprocess_labels.append(change_name)
    else:
        subprocess_labels.append(s_k)
    subprocess_docs.append(s_v)

111 
 assign inquiry 
 the inquiry is new, read e-mail information, read mail content, write content as body of notification, read sender's e-mail address , identify staff member from database, write name staff member as "sender" in notification, read mail subject, write subject as head of notification, notification created


In [94]:
x =  "enroll"
num_d = len(x.split(', '))
num_d

1

In [95]:
document = []
summary = []
for l, d in zip(subprocess_labels, subprocess_docs):
    num_d = len(d.split(', '))
    if num_d > 2:
        document.append(d)
        summary.append(l)
        
filtered_subprocesses = {'document': document, 'summary': summary}

In [97]:
len(document)

87

In [101]:
for i, (s, d) in enumerate(zip(summary, document)):
    clear_output(wait=True)
    print(i, "\n", s, "\n", d)
    input('check: ')

86 
 assign inquiry 
 the inquiry is new, read e-mail information, read mail content, write content as body of notification, read sender's e-mail address , identify staff member from database, write name staff member as "sender" in notification, read mail subject, write subject as head of notification, notification created


In [96]:
with open('final_new_labeled_dataset.json', 'w') as f:
    json.dump(filtered_subprocesses, f)

In [None]:
# year 1 year 2 - search and check - handle phd program

# delete 81 - essensvorbereitung
# handle num of tasks that are lower than 3 - delete?

In [108]:
num = 10000
file = '../../thesis_data/data_with_meta/{}_meta.csv'.format(num)

In [109]:
data = pd.read_csv(file)
data.head()

Unnamed: 0,Model ID,Organization ID,Model JSON,Description,Name,Type,Namespace
0,b3521d410a3c4912ad2bd9355af717a1,04f6934d4088427a84031c24b27f87d9,"{""resourceId"":""canvas"",""formats"":{},""ssextensi...",,Level 2 - Process Area: Human Resources,,http://www.signavio.com/stencilsets/processmap#
1,c830688c44104ba3b7c27c65b0e12dd7,04f6934d4088427a84031c24b27f87d9,"{""resourceId"":""canvas"",""formats"":{},""ssextensi...",,Employee Onboarding,,http://b3mn.org/stencilset/bpmn2.0#
2,f47c5ac96a0b4ce3ba20f170b83cd274,04f6934d4088427a84031c24b27f87d9,"{""resourceId"":""canvas"",""formats"":{""documentati...",,Receipt of Application,,http://b3mn.org/stencilset/bpmn2.0#
3,ff08bc48516547dd9362bd5fa057b2bb,04f6934d4088427a84031c24b27f87d9,"{""resourceId"":""canvas"",""formats"":{},""ssextensi...",,Verify applicant,,http://signavio.com/stencilsets/dmn-1.0#
4,1ab5bbbe4027423b95f225e6c5a41625,04fa7f859249401a94e7cea3e6a1f4aa,"{""resourceId"":""canvas"",""formats"":{""processgoal...",,Level 1 - Value Chain ACME AG,,http://www.signavio.com/stencilsets/processmap#


In [90]:
list(data.Name.values)

['As-is_V1',
 'Level 1 - Value Chain ACME AG',
 'Procurement of Work Equipment',
 'BPMNmodel_NEWv1',
 'Check quantity and quality',
 'As-is process ',
 'Employee Onboarding',
 'Receipt of Goods',
 'Level 2 - Process Area: Order Processing',
 'Group Assignment1_v1',
 'Simulation and Modelling',
 'Assignment 1 ',
 'Receipt of Application',
 'Level 2 - Process Area: Product Development',
 'Level  2 - Process Area: Human Resources',
 'BPMN_assignment2',
 'Procure parts',
 'Verify applicant',
 'Procurement of Work Equipment',
 'Level 2 - Process Area: Product Development',
 'Receipt of Application',
 'Level 1 - Value Chain ACME AG',
 'Verify applicant',
 'Level  2 - Process Area: Human Resources',
 'Procure parts',
 'Receipt of Goods',
 'Check quantity and quality',
 'Level 2 - Process Area: Order Processing',
 'Employee Onboarding',
 'ΔΙΕΚΠΕΡΑΙΩΣΗ ΑΙΤΗΣΗΣ ΧΟΡΗΓΗΣΗΣ ΑΔΕΙΑΣ',
 'Level 2 - Process Area: Product Development',
 'Level 2 - Process Area: Order Processing',
 'Level  2 - Process Are