This script compares various sentence similarity techniques

In [None]:
#setup
%pip install -U pip
%pip install -U spacy
%pip install -U gensim
%pip install pandas numpy scipy scikit-learn spacy nltk transformers torch gensim openpyxl python-Levenshtein sent2vec sentence-transformers
!python -m spacy download nl_core_news_lg
import nltk
# Download required NLTK data
nltk.download('punkt')

In [1]:
import pandas as pd
import numpy as np
import nltk
import spacy
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances
import torch
import Levenshtein
from sent2vec.vectorizer import Vectorizer
from sentence_transformers import SentenceTransformer
import os
if os.path.exists('text_private.py'):
    from text_private import sentences, references
elif os.path.exists('text_public.py'):
    from text_public import sentences, references

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

class Embedder:
    def __init__(self, method, model_path=None, suffix=''):
        self.method = method
        self.suffix = suffix
        if suffix!="":
            self.name = f"{method}_{suffix}"
        else:
            self.name = self.method
        self.model_path = model_path
        self.embedding_dict = {}
        self.comparison_type = 'similarity'
        self.nlp = spacy.load('nl_core_news_lg')
        self.initialise_model()

    def initialise_model(self):
        if self.method=='spacy':
            self.model = self.nlp
        elif self.method=='doc2vec':
            self.model = Doc2Vec.load(self.model_path)
        elif self.method=='sent2vec':
            self.model = Vectorizer(pretrained_weights=self.model_path)
        elif	self.method=='sentTF':
            self.model = SentenceTransformer(self.model_path)
        elif self.method == 'TF':
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
            self.model = AutoModel.from_pretrained(self.model_path)
        elif self.method == 'wmd':
            self.comparison_type = 'distance'
        elif self.method == 'ld':
            self.comparison_type = 'distance' 
            
    def generate_embedding(self,sentence):
        if self.method == 'spacy':
            tokens = self.model(sentence)
            vector = np.median([word.vector for word in tokens], axis=0)
        elif self.method == 'doc2vec':
            tokens = self.get_tokens(sentence)
            tagged_data = [TaggedDocument(words=tokens, tags=["sentence"])]
            vector = self.model.infer_vector(tagged_data[0].words)
        elif self.method == 'sent2vec':
            if type(sentence) == list:
                self.model.__init__(self.model_path)
                sentences = sentence
                self.model.run(sentences)
                vector = self.model.vectors
            else:
                self.model.__init__(self.model_path)
                sentences = [sentence]
                self.model.run(sentences)
                vector = self.model.vectors[0]
        elif self.method == 'sentTF':
            if type(sentence) == list:
                self.model.__init__(self.model_path)
                sentences = sentence
                vector = self.model.encode(sentences)
            else:
                self.model.__init__(self.model_path)
                sentences = [sentence]
                vector = self.model.encode(sentences)[0]
        elif self.method == 'TF':
            inputs = self.tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
            input_ids = inputs["input_ids"]
            attention_mask = inputs["attention_mask"]
            with torch.no_grad():
                outputs = self.model(input_ids,attention_mask=attention_mask)
                vector = outputs.last_hidden_state.mean(dim=1).numpy()[0]
        # elif self.method == 'TF':
        #     inputs = self.tokenizer(text=sentence, return_tensors="pt", padding=True, truncation=True)
        #     with torch.no_grad():
        #         outputs = self.model(**inputs, output_hidden_states=True)
        #     self.op = outputs
        #     last_hidden_state = outputs.last_hidden_state
        #     vector = last_hidden_state.mean(dim=1).numpy()[0]
        # vector = concatenate(vector, metrics)    
        return vector
    
    def get_embedding(self, sentence):
        if type(sentence) == str:
            if not self.is_exist(sentence):
                embedding = self.generate_embedding(sentence)
                self.embedding_dict[sentence] = embedding
            else:
                embedding = self.embedding_dict[sentence]
            return embedding
        elif type(sentence) == list:
            embeddings = self.generate_embedding(sentence)
            for sen,embedding in zip(sentence,embeddings):
                self.embedding_dict[sen] = embedding
    
    def is_exist(self,sentence):
        if sentence in self.embedding_dict.keys():
            return True
        else:
            return False
    
    def get_comparison(self,sentence1,sentence2):
        if self.method in ['spacy','doc2vec','sent2vec','sentTF', 'TF']:
            self.vec1 = self.get_embedding(sentence1)
            self.vec2 = self.get_embedding(sentence2)
            measure = cosine_similarity([self.vec1], [self.vec2])[0][0]
        elif self.method == 'wmd':
            tokens1 = self.nlp(sentence1)
            tokens2 = self.nlp(sentence2)
            measure = tokens1.similarity(tokens2)
        elif self.method=='jaccard':
            tokens1 = self.get_tokens(sentence1)
            tokens2 = self.get_tokens(sentence2)
            set1 = set(tokens1)
            set2 = set(tokens2)
            intersection = len(set1.intersection(set2))
            union = len(set1) + len(set2) - intersection
            measure = intersection / union
        elif self.method == 'ld':
            measure = Levenshtein.distance(sentence1, sentence2)
        return measure
    
    def get_tokens(self,sentence):
        doc = self.nlp(sentence)
        tokens = [token.text for token in doc]
        return tokens
    




In [3]:
print("initialising simple extractors")
spacy_extractor = Embedder(method='spacy',suffix='token_based')
doc2vec_extractor = Embedder(method='doc2vec', model_path='models/doc2vec_model')
# dbert_extractor = Embedder(method='sent2vec', model_path='distilbert-base-uncased', suffix='dbert')
dberttf_extractor = Embedder(method='TF', model_path='distilbert-base-uncased', suffix='dbert-tf')
# mlbert_extractor = Embedder(method='sent2vec', model_path='distilbert-base-multilingual-cased', suffix='mlbert')
mlbert_extractor = Embedder(method='TF', model_path='distilbert-base-multilingual-cased', suffix='mlbert')
glove_extractor = Embedder(method='sentTF', model_path='sentence-transformers/average_word_embeddings_glove.6B.300d', suffix='glove')
print("initialising transformer extractors")
bert_extractor = Embedder(method='TF', model_path='bert-base-uncased', suffix='bert')
robbert_extractor = Embedder(method='TF', model_path='pdelobelle/robbert-v2-dutch-base', suffix='robbert')
bertje_extractor = Embedder(method='TF', model_path='GroNLP/bert-base-dutch-cased',suffix='bertje')
print("initialising traditional extractors")
jaccard_extractor = Embedder(method='jaccard')
wmd_extractor = Embedder(method='wmd')
ld_extractor = Embedder(method='ld')
# print("pre populating embeddings for dbert")
# dbert_extractor.get_embedding(list(sentences.keys())+references)
# print("pre populating embeddings for mlbert")
# mlbert_extractor.get_embedding(list(sentences.keys())+references)


initialising simple extractors
initialising transformer extractors


Some weights of RobertaModel were not initialized from the model checkpoint at pdelobelle/robbert-v2-dutch-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


initialising traditional extractors


In [4]:

result_dict = {'sentence':[], 'reference':[], 'method':[], 'comparison_type':[], 'measure':[]}

chosen_extractors = [
                    spacy_extractor, 
                    doc2vec_extractor, 
                    # dbert_extractor, 
                    dberttf_extractor,
                    mlbert_extractor, 
                    glove_extractor, 
                    bert_extractor, 
                    robbert_extractor, 
                    bertje_extractor,
                    jaccard_extractor, 
                    wmd_extractor, 
                    ld_extractor,
                     ]
for extractor in chosen_extractors:
  print('Model:',extractor.name)
  for sentence1 in sentences.keys():
      for sentence2 in references:
            measure = extractor.get_comparison(sentence1,sentence2)
            result_dict['sentence'].append(sentence1)
            result_dict['reference'].append(sentence2)
            result_dict['method'].append(extractor.name)
            result_dict['comparison_type'].append(extractor.comparison_type)
            result_dict['measure'].append(measure)
        
result = pd.DataFrame(result_dict)
print(result)    
result_file = 'compare_sentence_similarity_methods_fast.xlsx'
result.to_excel(result_file)



Model: spacy_token_based
Model: doc2vec
Model: TF_dbert-tf
Model: TF_mlbert
Model: sentTF_glove
Model: TF_bert
Model: TF_robbert
Model: TF_bertje
Model: jaccard
Model: wmd
Model: ld
                                               sentence  \
0                            Meneer nam het medicijn in   
1                            Meneer nam het medicijn in   
2                            Meneer nam het medicijn in   
3                            Meneer nam het medicijn in   
4                            Meneer nam het medicijn in   
...                                                 ...   
1315   pijnklachten rug, paracetamol op standaard la...   
1316   pijnklachten rug, paracetamol op standaard la...   
1317   pijnklachten rug, paracetamol op standaard la...   
1318   pijnklachten rug, paracetamol op standaard la...   
1319   pijnklachten rug, paracetamol op standaard la...   

                                              reference             method  \
0                              

In [5]:


all_measures = list(result.method.unique())+['ensemble']
all_measure_dict = {i:{'correct_match_count':0, 'partially_correct_match_count':0,'incorrect_match_count':0, 'correct_matches':{},'partially_correct_matches':{},'incorrect_matches':{}} for i in all_measures}
N = 3
for sen in result.sentence.unique():
    for method in all_measures:
        for type2 in result.comparison_type.unique():
            if method == 'ensemble':
                score_df = result[(result.sentence == sen)]
                type1 = score_df.comparison_type.unique()[0]
            else:
                score_df = result[(result.sentence == sen) & (result.method == method)]
                type1 = score_df.comparison_type.unique()[0]
            if type1 != type2:
                pass
            if type1 == "similarity":
                best_matches = score_df.sort_values(['measure'], ascending=False)[:N]['reference'].to_list()
                best_scores = score_df.sort_values(['measure'], ascending=False)[:N]['measure'].to_list()
            if type1 == "distance":
                best_matches = score_df.sort_values(['measure'], ascending=True)[:N]['reference'].to_list()
                best_scores = score_df.sort_values(['measure'], ascending=True)[:N]['measure'].to_list()

            best_match_stat = [f'Measure:{score:.2f} {match}' for (match,score) in zip(best_matches, best_scores)]
            if any(set(best_matches).intersection(set(sentences[sen][:1]))):
                all_measure_dict[method]["correct_match_count"] += 1
                all_measure_dict[method]["correct_matches"].update({sen:best_match_stat})
            elif any(set(best_matches).intersection(set(sentences[sen]))):
                all_measure_dict[method]["partially_correct_match_count"] += 1
                all_measure_dict[method]["partially_correct_matches"].update({sen:best_match_stat})
            else:
                all_measure_dict[method]["incorrect_match_count"] += 1
                all_measure_dict[method]["incorrect_matches"].update({sen:best_match_stat})

for k,v in all_measure_dict.items():
    print(k)
    for k_,v_ in v.items():
        print(k_)
        if type(v_) == dict:
            for k__,v__ in v_.items():
                print('\t',k__,':',)
                if type(v__) == list:
                    for i in v__:
                        print('\t\t:',i)
        else:
            print('\t',k_,':',v_)
    print("************************")
    print()


spacy_token_based
correct_match_count
	 correct_match_count : 8
partially_correct_match_count
	 partially_correct_match_count : 2
incorrect_match_count
	 incorrect_match_count : 14
correct_matches
	 Meneer nam het medicijn in :
		: Measure:0.63 neemt medicatie in
		: Measure:0.44 verdoving met morfineachtige pijnstiller in hoge dosis
		: Measure:0.39 toedienen van pijnstiller
	 Meneer injecteerde het geneesmiddel zelf :
		: Measure:0.66 injectie van medicatie
		: Measure:0.60 laat voldoende pijnbeheersing zien zonder oraal pijnstillend middel
		: Measure:0.60 pijnverlichting door pijnstillend middel
	 kreeg morfine voor de verzorging met redelijk resultaat. :
		: Measure:0.68 pijnverlichting door pijnstillend middel
		: Measure:0.65 verdoving met morfineachtige pijnstiller in hoge dosis
		: Measure:0.64 laat voldoende pijnbeheersing zien zonder oraal pijnstillend middel
	 meneer geeft aan geen pijnklachten te hebben, daarom in overleg met dr. van de velden naproxen op zo nodig laten ze