In [1]:
import pandas as pd
import numpy as np
import os
import re

model_path = "data/models"

def read_model(filename): 
    filepath = model_path + "/" + filename
    tags = ["startEvent", "intermediateCatchEvent", "task", "exclusiveGateway"]
    names = []
    sids = []
    locations = []
    
    with open(filepath, 'r') as file: 
        for line in file:
            line = line.replace('"', '\\"')
            if re.match(r'[ \t]+<\w', line):
                line = re.sub(r'[ \t]+<', '', line)
                words = line.split("\" ")
                tag = words[0].split(" ")[0]
                if tag in tags:
                    if tag == "task": 
                        name = words[-2].replace('name=\\"', '')[:-1]
                        sid = words[1].replace('id=\\"', '')[:-1]
                    else:
                        name = words[-1].replace('name=\\"', '')[:-4]
                        sid = words[0].split(' ')[1].replace('id=\\"', '')[:-1]

                    if name != "":
                        names.append(name)
                        sids.append(sid)
                        locations.append(filename[:-5])
    
    df = pd.DataFrame({"Name": names, "SID": sids, "Location": locations})
    return df

def read_all_models():
    filenames = [f for f in os.listdir(model_path) if os.path.isfile(os.path.join(model_path, f))]
    dfs = [read_model(filename) for filename in filenames]
    df = pd.concat(dfs)
    return df

models = read_all_models()
models.at[1,"Name"] = "Send online protocol" # weird characters
models.head()

Unnamed: 0,Name,SID,Location
0,Apply online,sid-B11703D1-08C3-416B-8947-76CE10A27978,Cologne
1,Send online protocol,sid-2A61C308-48B4-4C26-9CB3-D489A0BBC1E3,Cologne
2,Send documents by post,sid-4C4748D1-DBC6-4155-91C8-F71F3796A768,Cologne
3,Take aptitude test,sid-A3568080-F469-4D86-BB96-7A5C48342857,Cologne
4,Pay for aptitude test,sid-F6E821C8-4300-4AA0-B6D1-6BA5D15FA7DE,Cologne


In [2]:
reference_path = "data/goldstandard"

def read_reference(filename):
    filepath = reference_path + "/" + filename
    sid1s = []
    sid2s = []
    
    with open(filepath, 'r') as file: 
        for line in file:
            line = line.replace('"', '\\"')
            if re.match(r'[ \t]+<\w', line):
                line = re.sub(r'[ \t]+<', '', line)
                words = line.split("\" ")
                tag = words[0].split(" ")[0]
                if "entity" in tag:
                    if tag == "entity1": 
                        sid = re.search(r'sid[-\d\w]+', line)[0]
                        sid1s.append(sid)
                    elif tag == "entity2":
                        sid = re.search(r'sid[-\d\w]+', line)[0]
                        sid2s.append(sid)
    
    # don't append list if sid pairs shouldn't be added reversed
    df = pd.DataFrame({"SID1": sid1s + sid2s, "SID2": sid2s + sid1s})
    return df
    
def read_all_references(): 
    filenames = [f for f in os.listdir(reference_path) if os.path.isfile(os.path.join(reference_path, f))]
    dfs = [read_reference(filename) for filename in filenames]
    df = pd.concat(dfs)
    return df

references = read_all_references()
references.head()

Unnamed: 0,SID1,SID2
0,sid-50620B60-4F01-4DFD-8EB8-BBCAAECE6026,sid-02B39B7C-7C88-4BCC-AD34-FEED3A2D9E42
1,sid-8712EF38-01F5-4812-ACB7-8F0EF759F38C,sid-445A78A6-1CA7-40BF-B7BF-B8CCCAAEF703
2,sid-B41B5C8F-D1D5-4641-943B-DB396FD69D0B,sid-5108D3F6-5D84-4ECB-B826-1A54D3D17B20
3,sid-DF0BF153-C7B8-47A6-9436-2217B6A02FB9,sid-B296B105-D85C-47FE-A200-50F7C64E65CF
4,sid-4C4748D1-DBC6-4155-91C8-F71F3796A768,sid-234E99B4-0E97-4770-B2CF-AD0DACDF6888


In [3]:
def prepare_expressions():
    if os.path.exists("expressions.csv") and os.path.isfile("expressions.csv"):  
        df = pd.read_csv("expressions.csv")
    else: 
        sid_dict = pd.Series(models["Name"].values,index=models["SID"]).to_dict()
        df = references.copy()
        df.replace({"SID1": sid_dict}, inplace=True)
        df.replace({"SID2": sid_dict}, inplace=True)

        df.columns = ["Name1", "Name2"]

        # add expressions reversed
        reversed_cols = pd.DataFrame({"Name1": df["Name2"].tolist(), "Name2": df["Name1"].tolist()})
        df = df.append(reversed_cols)
        df["Equivalent"] = [True] * df.shape[0]

        expressions = models["Name"].tolist()

        for expr1 in expressions:
            for expr2 in expressions:
                if not ((df[['Name1','Name2']].values == [expr1, expr2]).all(axis=1).any()):
                    row = pd.DataFrame({"Name1": [expr1], "Name2": [expr2], "Equivalent": [False]})
                    df = df.append(row)
                
    return df
                
expressions = prepare_expressions()

In [4]:
expressions.to_csv("expressions.csv", index=False)
expressions.tail()

Unnamed: 0,Name1,Name2,Equivalent
31044,Matriculate,Send german application,False
31045,Matriculate,Fill in online form of application,False
31046,Matriculate,Receive Commitment,False
31047,Matriculate,Waiting for response,False
31048,Matriculate,Matriculate,False


In [5]:
import nltk
nltk.download()
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer() 
def preprocess_text(text):
    text = ' '.join([lemmatizer.lemmatize(word.lower()) for word in text.split() if word not in stop_words])
    return text

expressions['Name1'] = expressions['Name1'].apply(lambda x: preprocess_text(x))
expressions['Name2'] = expressions['Name2'].apply(lambda x: preprocess_text(x))
expressions.head()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


Unnamed: 0,Name1,Name2,Equivalent
0,send letter rejection,send letter rejection,True
1,rejected,rejected,True
2,check document,check document,True
3,wait result,wait result,True
4,send document post,send online protocol,True


In [6]:
expressions.loc[expressions["Name1"] == expressions["Name2"], "Equivalent"] = True
expressions.tail()

Unnamed: 0,Name1,Name2,Equivalent
31044,matriculate,send german application,False
31045,matriculate,fill online form application,False
31046,matriculate,receive commitment,False
31047,matriculate,waiting response,False
31048,matriculate,matriculate,True


In [7]:
from gensim.models import Word2Vec

all_words = [word for sent in expressions['Name1'].tolist() for word in sent.split()] 
all_sents = [sent.split() for sent in expressions['Name1'].tolist()]
word2vec = Word2Vec(all_sents, min_count=2)

In [8]:
import scipy
def cosine_similarity(text1, text2, model):
    if model == "word2vec":
        v1 = np.mean([word2vec[word] for word in text1.split()],axis=0)
        v2 = np.mean([word2vec[word] for word in text2.split()],axis=0)
        cosine = scipy.spatial.distance.cosine(v1,v2)
    else:
        v1 = doc2vec.infer_vector(text1.split())
        v2 = doc2vec.infer_vector(text2.split())
        cosine = scipy.spatial.distance.cosine(v1, v2)
    return cosine

expressions['CosDistW'] = expressions.apply(lambda x: cosine_similarity(x["Name1"], x["Name2"], "word2vec"), axis=1)
expressions.tail()

  after removing the cwd from sys.path.
  """


Unnamed: 0,Name1,Name2,Equivalent,CosDistW
31044,matriculate,send german application,False,0.868707
31045,matriculate,fill online form application,False,0.783734
31046,matriculate,receive commitment,False,0.911773
31047,matriculate,waiting response,False,0.910031
31048,matriculate,matriculate,True,0.0


In [9]:
def word_movers_distance(sent1, sent2, model):
    if model == "word2vec":
        distance = word2vec.wmdistance(sent1.split(), sent2.split())
    else:
        v1 = doc2vec.infer_vector(sent1.split())
        v2 = doc2vec.infer_vector(sent2.split())
        distance = doc2vec.wmdistance(sent1, sent2)
    return distance

expressions['WordDistW'] = expressions.apply(lambda x: word_movers_distance(x["Name1"], x["Name2"], "word2vec"), axis=1)
expressions.tail()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Name1,Name2,Equivalent,CosDistW,WordDistW
31044,matriculate,send german application,False,0.868707,3.00591
31045,matriculate,fill online form application,False,0.783734,3.867533
31046,matriculate,receive commitment,False,0.911773,2.563744
31047,matriculate,waiting response,False,0.910031,3.195273
31048,matriculate,matriculate,True,0.0,0.0


In [10]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

list_of_sentences = [" ".join(sublist) for sublist in all_sents]
#sentences = [TaggedDocument(sent, [i]) for i, sent in enumerate(list_of_sentences)]
sentences = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(list_of_sentences)]

doc2vec = Doc2Vec(size=15, alpha=0.025, min_alpha=0.00025, min_count=1, dm =1)
doc2vec.build_vocab(sentences)

for epoch in range(10):
    doc2vec.train(sentences, total_examples=doc2vec.corpus_count, epochs=doc2vec.iter)
    # decrease the learning rate
    doc2vec.alpha -= 0.0002
    # fix the learning rate, no decay
    doc2vec.min_alpha = doc2vec.alpha

  if sys.path[0] == '':


In [11]:
expressions['CosDistD'] = expressions.apply(lambda x: cosine_similarity(x["Name1"], x["Name2"], "doc2vec"), axis=1)
expressions['WordDistD'] = expressions.apply(lambda x: word_movers_distance(x["Name1"], x["Name2"], "doc2vec"), axis=1)
expressions.head()

  import sys


Unnamed: 0,Name1,Name2,Equivalent,CosDistW,WordDistW,CosDistD,WordDistD
0,send letter rejection,send letter rejection,True,0.0,0.0,0.416109,0.0
1,rejected,rejected,True,0.0,0.0,0.05537,0.0
2,check document,check document,True,0.0,0.0,0.101537,0.0
3,wait result,wait result,True,0.0,0.0,0.480343,0.0
4,send document post,send online protocol,True,0.122018,1.74839,0.817252,0.0


In [12]:
expressions["Equivalent"] = expressions["Equivalent"].astype(int)
expressions = expressions.replace([np.inf, -np.inf], np.nan)
expressions = expressions.dropna()
expressions.head()

Unnamed: 0,Name1,Name2,Equivalent,CosDistW,WordDistW,CosDistD,WordDistD
0,send letter rejection,send letter rejection,1,0.0,0.0,0.416109,0.0
1,rejected,rejected,1,0.0,0.0,0.05537,0.0
2,check document,check document,1,0.0,0.0,0.101537,0.0
3,wait result,wait result,1,0.0,0.0,0.480343,0.0
4,send document post,send online protocol,1,0.122018,1.74839,0.817252,0.0


In [13]:
expressions["Equivalent"].value_counts()

0    28274
1     1015
Name: Equivalent, dtype: int64

In [14]:
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score

df = expressions.copy()
df_majority = df[df["Equivalent"] == 0]
df_minority = df[df["Equivalent"] == 1]

n_majority = expressions["Equivalent"].value_counts().max()
df_minority_upsampled = resample(df_minority, replace=True, n_samples=n_majority, random_state=42)
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

df_upsampled["Equivalent"].value_counts()

1    28274
0    28274
Name: Equivalent, dtype: int64

In [15]:
from sklearn.model_selection import train_test_split

important_cols = ["CosDistW", "WordDistW", "CosDistD", "WordDistD"]

X_train, X_test, y_train, y_test = train_test_split(df_upsampled[important_cols], df_upsampled["Equivalent"], test_size=0.2, random_state=42, shuffle=True)

In [17]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier()
scores = cross_val_score(decision_tree, X_train, y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
scores = cross_val_score(decision_tree, X_train, y_train, cv=5, scoring='f1_macro')

Accuracy: 1.00 (+/- 0.00)


In [19]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)
print("accuracy ", accuracy_score(y_test, y_pred))
print("precision ", precision_score(y_test, y_pred))
print("recall ", recall_score(y_test, y_pred))

accuracy  0.9993810786914236
precision  0.9987717143358484
recall  1.0


In [20]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier()
scores = cross_val_score(random_forest, X_train, y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
scores = cross_val_score(random_forest, X_train, y_train, cv=5, scoring='f1_macro')



Accuracy: 1.00 (+/- 0.00)




In [None]:
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
print("accuracy ", accuracy_score(y_test, y_pred))
print("precision ", precision_score(y_test, y_pred))
print("recall ", recall_score(y_test, y_pred))