In [1]:
import glob
import json
import re
import random 
import numpy as np

def load_JSON(path_to_json, extract_subprocess = False):
    # function that gets all labels, tasks, pools and lanes
    with open(path_to_json, 'r') as f:
        data = f.read()
        json_data = json.loads(data)
        if 'childShapes' not in json_data.keys():
            print('no elements in '+path_to_json)
            return {}
        elif extract_subprocess:
            subprocess = process_subprocess(json_data['childShapes']) 
            return subprocess
        elif pool_exist(path_to_json):
            (shapes_id, follows, lanes), pools = process_pools_and_lanes(json_data['childShapes'])
            return shapes_id, follows, lanes, pools
        else:
            shapes_id, follows, flow = process_flow(json_data['childShapes'])
            return shapes_id, follows, flow

        
def pool_exist(path_to_json):
    meta_file = path_to_json.replace('.json', '.meta.json')
    with open(meta_file, 'r') as f:
        meta_data = f.read()
        json_meta_data = json.loads(meta_data)
        if 'Pool' in json_meta_data['revision']['elementCounts'].keys():
            return True
        else:
            return False  

def process_flow(shapes):
    shapes_id = {}
    follows = {}
    flow = {}
    tasks_subprocesses = ['Task', 'CollapsedSubprocess', 'Subprocess']
    shapes_unwanted = ['DataObject', 'ITSystem', 'TextAnnotation', 
                      ' Association_Undirected', 'Association_Unidirectional', 'MessageFlow']
    outputs = [shapes_id, follows, flow]

    for shape in shapes:
        shape_stencil = shape['stencil']['id']
        shape_ID = shape['resourceId']
        if shape_stencil in shapes_unwanted:
            continue
        shapes_id.update({shape_ID: shape_stencil})
        
        outgoingShapes = [s['resourceId'] for s in shape['outgoing']]
        if shape_ID not in follows.keys():
            follows[shape_ID] = outgoingShapes
    
        if shape_stencil in tasks_subprocesses:
            if not shape['properties']['name'] == "":
                flow[shape_ID] = shape['properties']['name'].replace('\n', ' ').replace('\r', '').replace('  ', ' ')
            else:
                flow[shape_ID] = 'Task or Subprocess'
        else:
            if 'name' in shape['properties'] and not shape['properties']['name'] == "":
                flow[shape_ID] = shape_stencil + " (" + shape['properties']['name'].replace('\n', ' ').replace('\r', '').replace('  ', ' ') + ")"
            else:
                flow[shape_ID] = shape_stencil

    return outputs
    

def process_pools_and_lanes(shapes):
    shapes_id = {}
    follows = {}
    lanes = {}
    pools = {}
    tasks_subprocesses = ['Task', 'CollapsedSubprocess', 'Subprocess']
    shapes_unwanted = ['DataObject', 'ITSystem', 'TextAnnotation', 
                      ' Association_Undirected', 'Association_Unidirectional', 'MessageFlow']
    outputs = [shapes_id, follows, lanes]

    for shape in shapes:
        shape_stencil = shape['stencil']['id']
        shape_ID = shape['resourceId']
        if shape_stencil in shapes_unwanted:
            continue
        shapes_id.update({shape_ID: shape_stencil})
        outgoingShapes = [s['resourceId'] for s in shape['outgoing']]
        if shape_ID not in follows.keys():
            follows[shape_ID] = outgoingShapes
        
        if shape_stencil == 'Pool':
            if 'name' in shape['properties'] and not shape['properties']['name'] == "":
                pool = shape['properties']['name'].replace('\n', ' ').replace('\r', '').replace('  ', ' ')
            else:
                pool = shape_ID
            results = process_pools_and_lanes(shape['childShapes'])
            for r, o in zip(results[0], outputs):
                o.update(r)
            if len(results[0][2]): #lanes
                pools.update({pool: results[0][2]})
        
        if shape_stencil == 'Lane':
            if shape['childShapes'] != []:
                lane_labels = {}
                for childShape in shape['childShapes']:
                    c_stencil = childShape['stencil']['id']
                    c_shape_ID = childShape['resourceId']
                    if c_stencil == 'Lane':
                        if 'name' in shape['properties'] and not shape['properties']['name'] == "":
                            lane = shape['properties']['name'].replace('\n', ' ').replace('\r', '').replace('  ', ' ')
                        else:
                            lane = shape_ID
                        results = process_pools_and_lanes(shape['childShapes'])
                        for r, o in zip(results[0], outputs):
                            o.update(r)

                    else:
                        if c_stencil in shapes_unwanted:
                            continue
                        shapes_id.update({c_shape_ID: c_stencil})

                        outgoingShapes = [s['resourceId'] for s in childShape['outgoing']]
                        if c_shape_ID not in follows.keys():
                            follows[c_shape_ID] = outgoingShapes

                        if c_stencil in tasks_subprocesses:
                            if not childShape['properties']['name'] == "":
                                lane_labels[c_shape_ID] = childShape['properties']['name'].replace('\n', ' ').replace('\r', '').replace('  ', ' ')
                            else:
                                lane_labels[c_shape_ID] = 'Task or Subprocess'
                        else:
                            if 'name' in childShape['properties'] and not childShape['properties']['name'] == "":
                                lane_labels[c_shape_ID] = c_stencil + " (" + childShape['properties']['name'].replace('\n', ' ').replace('\r', '').replace('  ', ' ') + ")"
                            else:
                                lane_labels[c_shape_ID] = c_stencil

                        if 'name' in shape['properties'] and not shape['properties']['name'] == "":
                            lane = shape['properties']['name'].replace('\n', ' ').replace('\r', '').replace('  ', ' ')
                        else:
                            lane = shape_ID
                        lanes.update({lane: lane_labels})
                
    return outputs, pools


In [2]:
def hard_neg_triplet(directly_follows, opening_gateways, closing_tasks_gateways, tasks_subprocesses_id):
    
    all_triplets = []
    direct_parents = {}
    for k, v in directly_follows.items():
        for x in v:
            if x in direct_parents:
                direct_parents[x].append(k)
            else:
                direct_parents[x] = [k]
                
    # opening gateway triplets            
    for g in opening_gateways:
        triplet = []
        follows = directly_follows[g].copy()
        follows = [directly_follows[f][0] for f in follows if directly_follows[f][0] in tasks_subprocesses_id]
        follow_n = len(follows)
        parent_n = len(direct_parents[g])
        if follow_n > 1 and parent_n == 1:
            temp_parent = direct_parents[g][0]
            parent = direct_parents[temp_parent][0]
            if parent in tasks_subprocesses_id:
                res = random.sample(range(0, follow_n), 2)
                for idx in res:
                    triplet.append(follows[idx])
                triplet.insert(1, parent)
        if len(triplet) == 3:
            all_triplets.append(triplet)
            all_triplets.append(list(reversed(triplet)))
            
    # closing gateway triplets
    for c in closing_tasks_gateways:
        triplet = []
        parents = direct_parents[c].copy()
        parents = [direct_parents[p][0] for p in parents if direct_parents[p][0] in tasks_subprocesses_id]
        parent_n = len(parents)
        follow_n = len(directly_follows[c])
        if parent_n > 1 and follow_n == 1:
            res = random.sample(range(0, parent_n), 2)
            for idx in res:
                triplet.append(parents[idx])
            if c in tasks_subprocesses_id:
                triplet.insert(1, c)
            else:
                temp_follow = directly_follows[c][0]
                follow = directly_follows[temp_follow][0]
                if follow in tasks_subprocesses_id:
                    triplet.insert(1, follow)
        if len(triplet) == 3:
            all_triplets.append(triplet)
            all_triplets.append(list(reversed(triplet)))        

    return all_triplets

In [3]:
def neg_triplet(directly_follows, tasks_subprocesses_id):
    
    all_triplets = []
    for t in tasks_subprocesses_id:
        triplet = [t]
        if len(directly_follows[t]) == 1:
            temp_follow = directly_follows[t][0]
            follow = directly_follows[temp_follow][0]
            if follow in tasks_subprocesses_id:
                triplet.append(follow)
                not_follow_list = [x for x in tasks_subprocesses_id if x not in triplet]
                not_follow = random.sample(not_follow_list, 1)
                triplet.append(not_follow[0])
        if len(triplet) == 3:
            all_triplets.append(triplet)
            
    return all_triplets

In [4]:
def build_triplet_input(directly_follows, tasks_subprocesses_id, gateways_count, temp_closing_count):
    
    opening_gateways = [x for x in gateways_count if x not in temp_closing_count]
    closing_tasks_gateways = temp_closing_count.keys()
    neg_triplets = neg_triplet(directly_follows, tasks_subprocesses_id)
    hard_neg_triplets = hard_neg_triplet(directly_follows, opening_gateways, closing_tasks_gateways, tasks_subprocesses_id)
    
    return neg_triplets, hard_neg_triplets

In [54]:
filtered_id = []
filtered_data = []
min_task = 20
# max_task = 10
flows_extracted = []
models_skipped = []

for f in glob.glob('../thesis_data/bpmai/models/*.meta.json'):
    with open(f) as jsonFiles:
        data = json.load(jsonFiles)
        if data['model']['naturalLanguage'] == 'en' and not data['model']['modelName'].isdigit():
            filtered_id.append(data['model']['modelId'])
            filtered_data.append(data)

bpmn20 = [x for x in filtered_data if x['model']['modelingLanguage'] == 'bpmn20']
bpmn20_filtered = [x for x in bpmn20 if ('Task' in x['revision']['elementCounts'].keys()) and x['revision']['elementCounts']['Task'] >= min_task]
# bpmn20_filtered = [x for x in bpmn20 if ('Task' in x['revision']['elementCounts'].keys()) and x['revision']['elementCounts']['Task'] >= min_task and x['revision']['elementCounts']['Task'] <= max_task]

bpmn20_filtered_id = [x['model']['modelId'] for x in bpmn20_filtered]
bpmn20_filtered_id = np.unique(bpmn20_filtered_id)
print('num of bpmn20: ' + str(len(bpmn20_filtered_id)))


num of bpmn20: 412


In [55]:
temp_neg_triplets = []
temp_hard_neg_triplets = []
# bpmn20_filtered_id = ['69285564', '159373', '448828966', '1536606145']
for file_num in bpmn20_filtered_id:
    pools_and_lanes = False
    file = '../thesis_data/bpmai/models/' + file_num + '.json'
    try:
        results = load_JSON(file)
        if len(results) == 4:
            pools_and_lanes = True
            shapes_id = results[0]
            directly_follows = results[1]
            lanes = results[2]
            pools = results[3]
        else:
            shapes_id = results[0]
            directly_follows = results[1]
            flows = results[2]

        stencils = set()
        tasks_subprocesses_id = set()
        # pools_lanes_id = set()
        start_events_id = set()
        end_events_id = set()
        int_events_id = set()
        gateways_count = {}
        closing_gateways_id = set()
        shapes_unwanted_id = set()
        tasks_subprocesses = ['Task', 'CollapsedSubprocess', 'Subprocess']
        gateways = ['Exclusive_Databased_Gateway', 'InclusiveGateway', 'ParallelGateway']
        shapes_unwanted = ['DataObject', 'ITSystem', 'TextAnnotation', 
                        'Association_Undirected', 'Association_Unidirectional', 'MessageFlow']

        for s in shapes_id.keys():
            stencils.add(shapes_id[s])
            if re.match('(Start.)', shapes_id[s]):
                start_events_id.add(s)
            if re.match('(End.)', shapes_id[s]):
                end_events_id.add(s)
            if re.match('(Intermediate.)', shapes_id[s]):
                int_events_id.add(s)
            if shapes_id[s] in gateways:
                gateways_count.update({s: len(directly_follows[s])})
                if len(directly_follows[s]) == 1:
                    closing_gateways_id.add(s)
            if shapes_id[s] in tasks_subprocesses:
                tasks_subprocesses_id.add(s)
            if shapes_id[s] in shapes_unwanted:
                shapes_unwanted_id.add(s)

        for f in directly_follows.copy():
            if f in shapes_unwanted_id:
                directly_follows.pop(f)
            if f in directly_follows.keys() and directly_follows[f]:
                for r in directly_follows[f]:
                    if r in shapes_unwanted_id:
                        directly_follows[f].remove(r)

        closing_gateways_tasks_count = {}
        for g in closing_gateways_id:
            count = 0
            for f in directly_follows.values():
                count += sum(1 if re.match(g, x) else 0 for x in f)
            closing_gateways_tasks_count[g] = count

        for t in tasks_subprocesses_id:
            count = 0
            for f in directly_follows.values():
                count += sum(1 if re.match(t, x) else 0 for x in f)
            if count > 1:
                closing_gateways_tasks_count[t] = count

        for e in end_events_id:
            count = 0
            for f in directly_follows.values():
                count += sum(1 if re.match(e, x) else 0 for x in f)
            if count > 1:
                closing_gateways_tasks_count[e] = count

        shapes_wanted = set.union(tasks_subprocesses_id, start_events_id, end_events_id, int_events_id)
        temp_closing_count = closing_gateways_tasks_count.copy()
        neg_triplets, hard_neg_triplets = build_triplet_input(directly_follows, tasks_subprocesses_id, gateways_count, temp_closing_count)
        
        if pools_and_lanes:
            names = {}
            for x in lanes.values():
                names.update(x)
        else:
            names = flows    
            
        translated_neg_triplets = []
        translated_hard_neg_triplets = []
        for triplet in neg_triplets:
            temp_translated = [names[t] for t in triplet if names[t] != 'Task or Subprocess']
            if len(temp_translated) == 3:
#                 translated_neg_triplets.append(temp_translated)
                translated_neg_triplets += temp_translated
        for triplet in hard_neg_triplets:
            temp_translated = [names[t] for t in triplet if names[t] != 'Task or Subprocess']
            if len(temp_translated) == 3:
#                 translated_hard_neg_triplets.append(temp_translated)
                translated_hard_neg_triplets += temp_translated
            
        temp_neg_triplets.append(translated_neg_triplets)
        temp_hard_neg_triplets.append(translated_hard_neg_triplets)
        flows_extracted.append(file_num)

    except:
#         print('file skipped - error occurred')
        print(file_num)
        models_skipped.append(file_num)

        
negatives = []
for neg in temp_neg_triplets:
#     negatives.append(neg)
    negatives += neg

hard_negatives = []
for hard_neg in temp_hard_neg_triplets:
#     hard_negatives.append(hard_neg)
    hard_negatives += hard_neg

triplet_train_dataset = {'negatives': negatives, 'hard_negatives': hard_negatives,
                        'flows_extracted': flows_extracted, 'models_skipped': models_skipped}
with open('triplet_train_dataset.json', 'w') as f:
    json.dump(triplet_train_dataset, f)


1004323525
1043382837
106014777
1076439938
1082729770
1133610130
114014453
1142114864
1157592789
1164564371
1166520634
1171645114
117696398
1179587055
1181107040
1181823490
1187781216
1188579335
1212374173
122737759
1236805002
1259812245
1272203526
1273409172
1289094085
130159280
1306388197
1315574332
1353066507
1354540163
1362025090
1362164763
1366535375
1376985950
1391529931
1401890603
1409366538
1433732682
1436262813
1457860970
1480414975
1480990975
1481452533
1495069783
1508257835
1509505213
1516855746
1525489746
1526159253
1535435640
1563278367
1563800096
1567838130
158045461
1581586152
1588460268
1599317222
1607341144
1608879135
1616816944
1636636277
1663003736
1665508847
1667017327
1701441401
1701917025
170928774
1714864350
172446928
1726895112
1756146417
1756384480
1765832645
176876750
1793071752
1798276208
1818146728
1818800031
1823285702
1848181121
1851725690
1858275545
1866263532
1874517276
1888645264
189675240
1906315955
1924962263
1934621887
1952868524
1959138694
197080324

In [56]:
len(triplet_train_dataset['flows_extracted'])

222

In [57]:
triplet_train_dataset['flows_extracted']

['1020811925',
 '1024559312',
 '1024943999',
 '1040516350',
 '1050726999',
 '1052982385',
 '1061253189',
 '1067797812',
 '1072642347',
 '1088057928',
 '1092544771',
 '1099184284',
 '1112919494',
 '1116997047',
 '1128365841',
 '1141896355',
 '1148936892',
 '1158648175',
 '1167102290',
 '1170887240',
 '1172098360',
 '1173387553',
 '118119366',
 '1187137764',
 '1188157819',
 '1194292220',
 '1206572641',
 '1215129533',
 '1227426477',
 '125460031',
 '1257142300',
 '1288516525',
 '1309301951',
 '1310731655',
 '1311288946',
 '1313145225',
 '1325538128',
 '1338574476',
 '1340242365',
 '1352840228',
 '1371040592',
 '1389879293',
 '1399535837',
 '1400955521',
 '1406329570',
 '1407917684',
 '1408075999',
 '1408085869',
 '1408109366',
 '1427746723',
 '1456412935',
 '1457907538',
 '1472561477',
 '1472617140',
 '1474696354',
 '1477560333',
 '1481254907',
 '1503268773',
 '1508335739',
 '1514221448',
 '1521337671',
 '152936680',
 '1531134842',
 '1535035276',
 '1536606145',
 '1585843988',
 '1599984036'

In [58]:
print(len(triplet_train_dataset['negatives'])) # 4263
print(len(triplet_train_dataset['hard_negatives'])) # 2133

5913
5280


In [52]:
with open('./train_dataset.json', 'r') as f:
    data = json.load(f)

In [53]:
len(data['summary']) # 6396

2132

In [39]:
translated_triplets = []
for triplet in triplets_input_data:
#     temp_translated = [flows[t] for t in triplet]
    names = {}
    for x in lanes.values():
        names.update(x)
    temp_translated = [names[t] for t in triplet]
    translated_triplets.append(temp_translated)

translated_triplets

[['Apply bag tag to bag(s)',
  'Send regular checked in bag(s) to makeup area',
  'Place bag(s) on belf for weighing'],
 ['Register baggage details',
  'Generate baggage tag ID',
  'Acquire invoice for excess'],
 ['Advise wether bag(s) backed by self',
  'Deposit regular baggage',
  'Ask passenger if bag(s) packed by self'],
 ['Apply heavy sticker on bag(s)',
  'Print excess payment invoice',
  'Park bag(s) till payment received'],
 ['Park bag(s) till payment received',
  'Print excess payment invoice',
  'Apply heavy sticker on bag(s)']]

In [45]:
translated_triplets = []
for triplet in triplets_input_data:
#     temp_translated = [flows[t] for t in triplet]
    names = {}
    for x in lanes.values():
        names.update(x)
    temp_translated = [names[t] for t in triplet]
    translated_triplets += temp_translated

translated_triplets

['Apply bag tag to bag(s)',
 'Send regular checked in bag(s) to makeup area',
 'Place bag(s) on belf for weighing',
 'Register baggage details',
 'Generate baggage tag ID',
 'Acquire invoice for excess',
 'Advise wether bag(s) backed by self',
 'Deposit regular baggage',
 'Ask passenger if bag(s) packed by self',
 'Apply heavy sticker on bag(s)',
 'Print excess payment invoice',
 'Park bag(s) till payment received',
 'Park bag(s) till payment received',
 'Print excess payment invoice',
 'Apply heavy sticker on bag(s)']