In [97]:
import pandas as pd
import numpy as np
import os
import re

# startEvent, intermediateCatchEvent, task, exclusiveGateway -> name, sid
bpmn_path = "dataset-bpmn/bpmn-models"
reference_path = "dataset-bpmn/reference"

def bpmn_to_df(filename): 
    filepath = bpmn_path + "/" + filename
    tags = ["startEvent", "intermediateCatchEvent", "task", "exclusiveGateway"]
    names = []
    sids = []
    locations = []
    
    with open(filepath, 'r') as file: 
        for line in file:
            line = line.replace('"', '\\"')
            if re.match(r'[ \t]+<\w', line):
                line = re.sub(r'[ \t]+<', '', line)
                words = line.split("\" ")
                tag = words[0].split(" ")[0]
                if tag in tags:
                    if tag == "task": 
                        name = words[-2].replace('name=\\"', '')[:-1]
                        sid = words[1].replace('id=\\"', '')[:-1]
                    else:
                        name = words[-1].replace('name=\\"', '')[:-4]
                        sid = words[0].split(' ')[1].replace('id=\\"', '')[:-1]

                    if name != "":
                        names.append(name)
                        sids.append(sid)
                        locations.append(filename[:-5])
    
    df = pd.DataFrame({"Name": names, "SID": sids, "Location": locations})
    return df

def bpmn_full_df(): 
    filenames = [f for f in os.listdir(bpmn_path) if os.path.isfile(os.path.join(bpmn_path, f))]
    dfs = [bpmn_to_df(filename) for filename in filenames]
    df = pd.concat(dfs)
    return df
        
expr_sid = bpmn_full_df()
expr_sid.at[1,"Name"] = "Send online protocol"
expr_sid["Name"] = expr_sid["Name"].replace("&#10;", " ")

expr_sid.head()

Unnamed: 0,Name,SID,Location
0,Apply online,sid-B11703D1-08C3-416B-8947-76CE10A27978,Cologne
1,Send online protocol,sid-2A61C308-48B4-4C26-9CB3-D489A0BBC1E3,Cologne
2,Send documents by post,sid-4C4748D1-DBC6-4155-91C8-F71F3796A768,Cologne
3,Take aptitude test,sid-A3568080-F469-4D86-BB96-7A5C48342857,Cologne
4,Pay for aptitude test,sid-F6E821C8-4300-4AA0-B6D1-6BA5D15FA7DE,Cologne


In [100]:
# if results are incorrect, check if same sids appear in different locations
def reference_to_df(filename):
    filepath = reference_path + "/" + filename
    sid1s = []
    sid2s = []
    
    with open(filepath, 'r') as file: 
        for line in file:
            line = line.replace('"', '\\"')
            if re.match(r'[ \t]+<\w', line):
                line = re.sub(r'[ \t]+<', '', line)
                words = line.split("\" ")
                tag = words[0].split(" ")[0]
                if "entity" in tag:
                    if tag == "entity1": 
                        sid = re.search(r'sid[-\d\w]+', line)[0]
                        sid1s.append(sid)
                    elif tag == "entity2":
                        sid = re.search(r'sid[-\d\w]+', line)[0]
                        sid2s.append(sid)
    
    # don't append list if sid pairs shouldn't be added reversed
    df = pd.DataFrame({"SID1": sid1s + sid2s, "SID2": sid2s + sid1s})
    return df
    
def reference_full_df(): 
    filenames = [f for f in os.listdir(reference_path) if os.path.isfile(os.path.join(reference_path, f))]
    dfs = [reference_to_df(filename) for filename in filenames]
    df = pd.concat(dfs)
    return df

equiv_sid = reference_full_df()
equiv_sid.head()

Unnamed: 0,SID1,SID2
0,sid-50620B60-4F01-4DFD-8EB8-BBCAAECE6026,sid-02B39B7C-7C88-4BCC-AD34-FEED3A2D9E42
1,sid-8712EF38-01F5-4812-ACB7-8F0EF759F38C,sid-445A78A6-1CA7-40BF-B7BF-B8CCCAAEF703
2,sid-B41B5C8F-D1D5-4641-943B-DB396FD69D0B,sid-5108D3F6-5D84-4ECB-B826-1A54D3D17B20
3,sid-DF0BF153-C7B8-47A6-9436-2217B6A02FB9,sid-B296B105-D85C-47FE-A200-50F7C64E65CF
4,sid-4C4748D1-DBC6-4155-91C8-F71F3796A768,sid-234E99B4-0E97-4770-B2CF-AD0DACDF6888


In [101]:
def combine_dfs():
    if os.path.exists("expr_equiv.csv") and os.path.isfile("expr_equiv.csv"):  
        df = pd.read_csv("expr_equiv.csv")
    else: 
        df = equiv_sid.copy()

        lookup = dict(zip(expr_sid["SID"], expr_sid["Name"]))
        df["Name1"] = df["SID1"].map(lookup)
        df["Name2"] = df["SID2"].map(lookup)
        df["Equivalent"] = [True] * df.shape[0]

        df = df.drop(["SID1", "SID2"], axis=1)

        expressions = expr_sid["Name"].tolist()

        for expr1 in expressions:
            for expr2 in expressions:
                if not ((df[['Name1','Name2']].values == [expr1, expr2]).all(axis=1).any()):
                    row = pd.DataFrame({"Name1": [expr1], "Name2": [expr2], "Equivalent": [False]})
                    df = df.append(row)
    
    return df
    
expr_equiv = combine_dfs()
expr_equiv.to_csv("expr_equiv.csv", index=False)
expr_equiv.tail()

Unnamed: 0,Name1,Name2,Equivalent
0,Matriculate,Send german application,False
0,Matriculate,Fill in online form of application,False
0,Matriculate,Receive Commitment,False
0,Matriculate,Waiting for response,False
0,Matriculate,Matriculate,False


In [132]:
import nltk
#nltk.download('stopwords')
from gensim.models import Word2Vec
import scipy

# filter stopwords
df = expr_equiv.copy()
df = df.dropna()

def preprocess_text(s):
    s = s.lower().replace("&#10;", " ")
    s = re.sub('\s+', ' ', s)
    return s

stop_words = set(stopwords.words('english'))
df['Name1'] = df['Name1'].apply(lambda x: ' '.join([preprocess_text(word) for word in x.split() if word not in stop_words]))
df['Name2'] = df['Name2'].apply(lambda x: ' '.join([preprocess_text(word) for word in x.split() if word not in stop_words]))

print(df.tail(20))

#all_words = [word for sent in df['Name1'].tolist() for word in sent.split()] # flattened
all_words = [sent.split() for sent in df['Name1'].tolist()] # sublists for sentences
model = Word2Vec(all_words, min_count=2)

# add SIF
def cosine_word2vec():
    cos_emb = []
    for index,row in df.iterrows():
        v1 = np.mean([model[word] for word in row["Name1"].split()],axis=0)
        v2 = np.mean([model[word] for word in row["Name2"].split()],axis=0)
        cosine = scipy.spatial.distance.cosine(v1,v2)
        cos_emb.append(cosine)
    df["CosEmb"] = cos_emb

cosine_word2vec()
df.head()

# cross validation!
# filter forbidden words, POS tag, role of leopold, hypernym, hyponym, antonym
# word2vec + smooth inverse frequence (SIF) + cosine similarity, word2vec + LDA + Jense

# apply function to get column
#df['price'] = df['Symbol'].apply(getquotetoday)

         Name1                                              Name2  Equivalent
0  matriculate                                documents  received       False
0  matriculate                          send interview invitation       False
0  matriculate                                  conduct interview       False
0  matriculate                                       go interview       False
0  matriculate                               invited an interview       False
0  matriculate                                 precheck documents       False
0  matriculate                                          complete?       False
0  matriculate                                    send commitment       False
0  matriculate                          checking contentual match       False
0  matriculate            check university entrance qualification       False
0  matriculate                                          check dsh       False
0  matriculate                              checking completenes



Unnamed: 0,Name1,Name2,Equivalent,CosEmb
0,send letter rejection,send letter rejection,True,0.0
1,rejected,rejected,True,0.0
2,check documents,check documents,True,0.0
3,wait results,wait results,True,0.0
4,send documents post,send online protocol,True,0.258218


In [133]:
df.tail()
# same words must be equivalent!

Unnamed: 0,Name1,Name2,Equivalent,CosEmb
0,matriculate,send german application,False,0.879552
0,matriculate,fill online form application,False,0.874639
0,matriculate,receive commitment,False,0.969761
0,matriculate,waiting response,False,1.010466
0,matriculate,matriculate,False,0.0
