In [5]:
import pandas as pd
import numpy as np
import os
import re

model_path = "data/models"

def read_model(filename): 
    filepath = model_path + "/" + filename
    tags = ["startEvent", "intermediateCatchEvent", "task", "exclusiveGateway"]
    names = []
    sids = []
    locations = []
    
    with open(filepath, 'r') as file: 
        for line in file:
            line = line.replace('"', '\\"')
            if re.match(r'[ \t]+<\w', line):
                line = re.sub(r'[ \t]+<', '', line)
                words = line.split("\" ")
                tag = words[0].split(" ")[0]
                if tag in tags:
                    if tag == "task": 
                        name = words[-2].replace('name=\\"', '')[:-1]
                        sid = words[1].replace('id=\\"', '')[:-1]
                    else:
                        name = words[-1].replace('name=\\"', '')[:-4]
                        sid = words[0].split(' ')[1].replace('id=\\"', '')[:-1]

                    if name != "":
                        names.append(name)
                        sids.append(sid)
                        locations.append(filename[:-5])
    
    df = pd.DataFrame({"Name": names, "SID": sids, "Location": locations})
    return df

def read_all_models():
    filenames = [f for f in os.listdir(model_path) if os.path.isfile(os.path.join(model_path, f))]
    dfs = [read_model(filename) for filename in filenames]
    df = pd.concat(dfs)
    return df

models = read_all_models()
models.at[1,"Name"] = "Send online protocol" # weird characters
models.head()

Unnamed: 0,Name,SID,Location
0,Apply online,sid-B11703D1-08C3-416B-8947-76CE10A27978,Cologne
1,Send online protocol,sid-2A61C308-48B4-4C26-9CB3-D489A0BBC1E3,Cologne
2,Send documents by post,sid-4C4748D1-DBC6-4155-91C8-F71F3796A768,Cologne
3,Take aptitude test,sid-A3568080-F469-4D86-BB96-7A5C48342857,Cologne
4,Pay for aptitude test,sid-F6E821C8-4300-4AA0-B6D1-6BA5D15FA7DE,Cologne


In [7]:
reference_path = "data/goldstandard"

def read_reference(filename):
    filepath = reference_path + "/" + filename
    sid1s = []
    sid2s = []
    
    with open(filepath, 'r') as file: 
        for line in file:
            line = line.replace('"', '\\"')
            if re.match(r'[ \t]+<\w', line):
                line = re.sub(r'[ \t]+<', '', line)
                words = line.split("\" ")
                tag = words[0].split(" ")[0]
                if "entity" in tag:
                    if tag == "entity1": 
                        sid = re.search(r'sid[-\d\w]+', line)[0]
                        sid1s.append(sid)
                    elif tag == "entity2":
                        sid = re.search(r'sid[-\d\w]+', line)[0]
                        sid2s.append(sid)
    
    # don't append list if sid pairs shouldn't be added reversed
    df = pd.DataFrame({"SID1": sid1s + sid2s, "SID2": sid2s + sid1s})
    return df
    
def read_all_references(): 
    filenames = [f for f in os.listdir(reference_path) if os.path.isfile(os.path.join(reference_path, f))]
    dfs = [read_reference(filename) for filename in filenames]
    df = pd.concat(dfs)
    return df

references = read_all_references()
references.head()

Unnamed: 0,SID1,SID2
0,sid-50620B60-4F01-4DFD-8EB8-BBCAAECE6026,sid-02B39B7C-7C88-4BCC-AD34-FEED3A2D9E42
1,sid-8712EF38-01F5-4812-ACB7-8F0EF759F38C,sid-445A78A6-1CA7-40BF-B7BF-B8CCCAAEF703
2,sid-B41B5C8F-D1D5-4641-943B-DB396FD69D0B,sid-5108D3F6-5D84-4ECB-B826-1A54D3D17B20
3,sid-DF0BF153-C7B8-47A6-9436-2217B6A02FB9,sid-B296B105-D85C-47FE-A200-50F7C64E65CF
4,sid-4C4748D1-DBC6-4155-91C8-F71F3796A768,sid-234E99B4-0E97-4770-B2CF-AD0DACDF6888


In [26]:
def prepare_expressions():
    if os.path.exists("expressions.csv") and os.path.isfile("expressions.csv"):  
        df = pd.read_csv("expressions.csv")
    else: 
    sid_dict = pd.Series(models["Name"].values,index=models["SID"]).to_dict()
    df = references.copy()
    df.replace({"SID1": sid_dict}, inplace=True)
    df.replace({"SID2": sid_dict}, inplace=True)
    
    df.columns = ["Name1", "Name2"]
    
    # add expressions reversed
    reversed_cols = pd.DataFrame({"Name1": df["Name2"].tolist(), "Name2": df["Name1"].tolist()})
    df = df.append(reversed_cols)
    df["Equivalent"] = [True] * df.shape[0]
    
    expressions = models["Name"].tolist()

    for expr1 in expressions:
        for expr2 in expressions:
            if not ((df[['Name1','Name2']].values == [expr1, expr2]).all(axis=1).any()):
                row = pd.DataFrame({"Name1": [expr1], "Name2": [expr2], "Equivalent": [False]})
                df = df.append(row)
                
    return df
                
expressions = prepare_expressions()

In [29]:
expressions.to_csv("expressions.csv", index=False)
expressions.tail()

Unnamed: 0,Name1,Name2,Equivalent
0,Matriculate,Send german application,False
0,Matriculate,Fill in online form of application,False
0,Matriculate,Receive Commitment,False
0,Matriculate,Waiting for response,False
0,Matriculate,Matriculate,False


In [33]:
import nltk
nltk.download()
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    text = ' '.join([word.lower() for word in text.split() if word not in stop_words])
    return text

expressions['Name1'] = expressions['Name1'].apply(lambda x: preprocess_text(x))
expressions['Name2'] = expressions['Name2'].apply(lambda x: preprocess_text(x))
expressions.head()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


Unnamed: 0,Name1,Name2,Equivalent
0,send letter rejection,send letter rejection,True
1,rejected,rejected,True
2,check documents,check documents,True
3,wait results,wait results,True
4,send documents post,send online protocol,True


In [34]:
expressions.loc[expressions["Name1"] == expressions["Name2"], "Equivalent"] = True
expressions.tail()

Unnamed: 0,Name1,Name2,Equivalent
0,matriculate,send german application,False
0,matriculate,fill online form application,False
0,matriculate,receive commitment,False
0,matriculate,waiting response,False
0,matriculate,matriculate,True


In [36]:
from gensim.models import Word2Vec

all_words = [word for sent in df['Name1'].tolist() for word in sent.split()] 
all_sents = [sent.split() for sent in df['Name1'].tolist()]
word2vec = Word2Vec(all_sents, min_count=2)

ModuleNotFoundError: No module named 'gensim'

In [None]:
import scipy
def cosine_similarity(vec1, vec2): 
    cos_emb = []
    for index,row in df.iterrows():
        v1 = np.mean([model[word] for word in row["Name1"].split()],axis=0)
        v2 = np.mean([model[word] for word in row["Name2"].split()],axis=0)
        cosine = scipy.spatial.distance.cosine(v1,v2)
        cos_emb.append(cosine)
    df["CosEmb"] = cos_emb