This notebook compares various sentence similarity techniques.

In [1]:
import pandas as pd
import numpy as np
import os
# Load the evaluation data. The private text module takes precedence; the
# public one is only used when 'text_private.py' is not found.
# Judging by how later cells use them, `sentences` maps each input sentence to
# a list of acceptable reference labels (first entry = preferred label) and
# `references` is the collection of candidate labels -- TODO confirm against
# the text_* modules.
if os.path.exists('text_private.py'):
    from text_private import sentences, references
elif os.path.exists('text_public.py'):
    from text_public import sentences, references
# NOTE(review): there is no fallback branch. When neither file exists relative
# to the current working directory, `sentences` and `references` are never
# defined and the first cell that uses them fails with a NameError.
import dutch_text_analytics.textanalytics as ta

In [2]:
def _tf_embedder(model_path, suffix):
    """Build a `ta.Embedder` with method='TF' for the given model path and name suffix."""
    return ta.Embedder(method='TF', model_path=model_path, suffix=suffix)


# --- group 1: announced as "simple" extractors -------------------------------
print("initialising simple extractors")
spacy_extractor = ta.Embedder(method='spacy', suffix='token_based')
doc2vec_extractor = ta.Embedder(method='doc2vec', model_path='models/doc2vec_model')
dberttf_extractor = _tf_embedder('distilbert-base-uncased', 'dbert-tf')
mlbert_extractor = _tf_embedder('distilbert-base-multilingual-cased', 'mlbert')
glove_extractor = ta.Embedder(
    method='sentTF',
    model_path='sentence-transformers/average_word_embeddings_glove.6B.300d',
    suffix='glove',
)

# --- group 2: announced as "transformer" extractors --------------------------
print("initialising transformer extractors")
bert_extractor = _tf_embedder('bert-base-uncased', 'bert')
robbert_extractor = _tf_embedder('pdelobelle/robbert-v2-dutch-base', 'robbert')
bertje_extractor = _tf_embedder('GroNLP/bert-base-dutch-cased', 'bertje')
medrobertanl_extractor = _tf_embedder('CLTL/MedRoBERTa.nl', 'medrobertanl')

# --- group 3: announced as "traditional" extractors (no model path needed) ---
print("initialising traditional extractors")
jaccard_extractor = ta.Embedder(method='jaccard')
wmd_extractor = ta.Embedder(method='wmd')
ld_extractor = ta.Embedder(method='ld')


initialising simple extractors
initialising transformer extractors


Some weights of RobertaModel were not initialized from the model checkpoint at pdelobelle/robbert-v2-dutch-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at CLTL/MedRoBERTa.nl and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


initialising traditional extractors


In [3]:

# Long-format accumulator: one entry per (extractor, sentence, reference)
# triple. The five lists are always appended to together, so they stay aligned.
result_dict = {'sentence': [], 'reference': [], 'method': [], 'comparison_type': [], 'measure': []}

# Extractors to evaluate, in the order their results appear in the table.
chosen_extractors = [
    spacy_extractor,
    doc2vec_extractor,
    dberttf_extractor,
    mlbert_extractor,
    glove_extractor,
    bert_extractor,
    robbert_extractor,
    bertje_extractor,
    medrobertanl_extractor,
    jaccard_extractor,
    wmd_extractor,
    ld_extractor,
]

for extractor in chosen_extractors:
    print('Model:', extractor.name)
    # Compare every input sentence with every candidate reference.
    for sentence1 in sentences.keys():
        for sentence2 in references:
            measure = extractor.get_comparison(sentence1, sentence2)
            result_dict['sentence'].append(sentence1)
            result_dict['reference'].append(sentence2)
            result_dict['method'].append(extractor.name)
            # Recorded per row because later cells need to know whether a
            # higher or a lower measure means a better match.
            result_dict['comparison_type'].append(extractor.comparison_type)
            result_dict['measure'].append(measure)

result = pd.DataFrame(result_dict)
print(result)

result_file = 'outputs/compare_sentence_similarity_methods_fast.xlsx'
# to_excel does not create missing parent directories; make sure the output
# folder exists so the export also works on a fresh checkout.
os.makedirs(os.path.dirname(result_file), exist_ok=True)
result.to_excel(result_file)



Model: spacy_token_based
Model: doc2vec
Model: TF_dbert-tf
Model: TF_mlbert
Model: sentTF_glove
Model: TF_bert
Model: TF_robbert
Model: TF_bertje


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Model: TF_medrobertanl
Model: jaccard
Model: wmd
Model: ld
                                               sentence  \
0                            Meneer nam het medicijn in   
1                            Meneer nam het medicijn in   
2                            Meneer nam het medicijn in   
3                            Meneer nam het medicijn in   
4                            Meneer nam het medicijn in   
...                                                 ...   
1435   pijnklachten rug, paracetamol op standaard la...   
1436   pijnklachten rug, paracetamol op standaard la...   
1437   pijnklachten rug, paracetamol op standaard la...   
1438   pijnklachten rug, paracetamol op standaard la...   
1439   pijnklachten rug, paracetamol op standaard la...   

                                              reference             method  \
0                                injectie van medicatie  spacy_token_based   
1                                    neemt medicatie in  spacy_token_based  

In [4]:


# Every individual method plus a rank-aggregated 'ensemble' of all of them.
all_measures = list(result.method.unique()) + ['ensemble']
# Per method: three counters and, for each outcome, {sentence: [formatted top matches]}.
all_measure_dict = {
    name: {
        'correct_match_count': 0,
        'partially_correct_match_count': 0,
        'incorrect_match_count': 0,
        'correct_matches': {},
        'partially_correct_matches': {},
        'incorrect_matches': {},
    }
    for name in all_measures
}
# Number of best-ranked references inspected per sentence and method.
N = 2

_KNOWN_COMPARISON_TYPES = {"similarity", "distance"}


def _top_matches_single(score_df, top_n):
    """Return (references, measures) of the top_n best rows of one method.

    `score_df` holds the rows of a single method for a single sentence.
    Similarities are ranked highest-first, distances lowest-first.

    Raises:
        ValueError: if the method's comparison type is neither "similarity"
            nor "distance" (previously the results of the preceding iteration
            were silently reused in that case).
    """
    comparison_type = score_df.comparison_type.unique()[0]
    if comparison_type not in _KNOWN_COMPARISON_TYPES:
        raise ValueError(f"unsupported comparison_type: {comparison_type!r}")
    ascending = comparison_type == "distance"
    top = score_df.sort_values(['measure'], ascending=ascending)[:top_n]
    return top['reference'].to_list(), top['measure'].to_list()


def _top_matches_ensemble(sentence_df, top_n):
    """Return (references, mean ranks) of the top_n references across all methods.

    `sentence_df` holds the rows of every method for a single sentence. Raw
    measures are not comparable between methods (different scales, and for
    distances lower is better), so each method first ranks the references on
    its own (rank 1 = best) and the ranks are then averaged per reference.

    Raises:
        ValueError: if any row has an unsupported comparison type.
    """
    comparison_types = sentence_df.comparison_type
    unknown = set(comparison_types.unique()) - _KNOWN_COMPARISON_TYPES
    if unknown:
        raise ValueError(f"unsupported comparison_type(s): {sorted(unknown)}")
    measures = sentence_df['measure'].astype(float)
    # Orient every measure so that a smaller value is a better match:
    # distances stay as they are, similarities are negated.
    oriented = measures.where(comparison_types == "distance", -measures)
    per_method_rank = oriented.groupby(sentence_df['method']).rank(method='average')
    mean_rank = per_method_rank.groupby(sentence_df['reference'], sort=False).mean()
    # Stable sort: ties keep the order in which the references first appear.
    top = mean_rank.sort_values(kind='mergesort')[:top_n]
    return top.index.to_list(), top.to_list()


for sen in result.sentence.unique():
    sentence_df = result[result.sentence == sen]
    for method in all_measures:
        if method == 'ensemble':
            # NOTE: for the ensemble the reported "Measure" is the mean rank
            # over all methods (lower is better), not a raw score.
            best_matches, best_scores = _top_matches_ensemble(sentence_df, N)
        else:
            best_matches, best_scores = _top_matches_single(
                sentence_df[sentence_df.method == method], N
            )

        best_match_stat = [f'Measure:{score:.2f} {match}' for (match, score) in zip(best_matches, best_scores)]

        # Test the intersections for emptiness directly; any() would look at
        # the truthiness of the matched strings instead.
        matched = set(best_matches)
        if matched & set(sentences[sen][:1]):
            # The preferred (first) label is among the top N.
            outcome = 'correct'
        elif matched & set(sentences[sen]):
            # Only one of the alternative labels is among the top N.
            outcome = 'partially_correct'
        else:
            outcome = 'incorrect'
        all_measure_dict[method][f'{outcome}_match_count'] += 1
        all_measure_dict[method][f'{outcome}_matches'].update({sen: best_match_stat})

# Plain-text report: for every method, each counter followed by the sentences
# in each outcome bucket together with their formatted top matches.
for method_name, stats in all_measure_dict.items():
    print(method_name)
    for field, value in stats.items():
        print(field)
        if type(value) is not dict:
            # Scalar counter: print it on one indented line and move on.
            print('\t', field, ':', value)
            continue
        # Outcome bucket: one header line per sentence, then its matches.
        for sentence_text, match_lines in value.items():
            print('\t', sentence_text, ':')
            if type(match_lines) is list:
                for match_line in match_lines:
                    print('\t\t:', match_line)
    print("************************")
    print()


spacy_token_based
correct_match_count
	 correct_match_count : 4
partially_correct_match_count
	 partially_correct_match_count : 1
incorrect_match_count
	 incorrect_match_count : 7
correct_matches
	 Meneer nam het medicijn in :
		: Measure:0.63 neemt medicatie in
		: Measure:0.44 verdoving met morfineachtige pijnstiller in hoge dosis
	 Meneer injecteerde het geneesmiddel zelf :
		: Measure:0.66 injectie van medicatie
		: Measure:0.60 laat voldoende pijnbeheersing zien zonder oraal pijnstillend middel
	 kreeg morfine voor de verzorging met redelijk resultaat. :
		: Measure:0.68 pijnverlichting door pijnstillend middel
		: Measure:0.65 verdoving met morfineachtige pijnstiller in hoge dosis
	 meneer geeft aan geen pijnklachten te hebben, daarom in overleg met dr. van de velden naproxen op zo nodig laten zetten. :
		: Measure:0.72 laat voldoende pijnbeheersing zien zonder oraal pijnstillend middel
		: Measure:0.70 laat voldoende pijnbeheersing zien zonder orale pijnstillers
partially_correc

In [5]:
print("Summary:")

# One row per method, holding only the three outcome counters.
_count_columns = ('correct_match_count', 'partially_correct_match_count', 'incorrect_match_count')
summary_dict = {'method_name': []}
for _column in _count_columns:
    summary_dict[_column] = []

for method_name, stats in all_measure_dict.items():
    summary_dict['method_name'].append(method_name)
    for _column in _count_columns:
        if _column in stats:
            summary_dict[_column].append(stats[_column])

summary_df = pd.DataFrame(summary_dict)
print(summary_df)

# Persist the summary next to the detailed comparison results.
summary_file = 'outputs/compare_sentence_similarity_methods_fast_summary.xlsx'
summary_df.to_excel(summary_file)

Summary:
          method_name  correct_match_count  partially_correct_match_count  \
0   spacy_token_based                    4                              1   
1             doc2vec                    1                              1   
2         TF_dbert-tf                    1                              0   
3           TF_mlbert                    3                              1   
4        sentTF_glove                    3                              0   
5             TF_bert                    1                              0   
6          TF_robbert                    2                              1   
7           TF_bertje                    2                              0   
8     TF_medrobertanl                    1                              1   
9             jaccard                    3                              0   
10                wmd                    2                              0   
11                 ld                    4                         

In [6]:
# NOTE(review): the first cell imports the same package as
# `dutch_text_analytics.textanalytics` (alias `ta`); confirm whether
# `ta.Classifier` could be used here so the module is not loaded under a
# second name, and consider moving this import to the import cell.
from textanalytics import Classifier

# Classifier that picks one of the reference labels for a sentence.
mdeberta_classifier = Classifier(
    method='TF',
    classes=list(references),
    model_path="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
    suffix="mDeBERTa",
    multi_label=False,
)

# Ground truth vs. prediction, one entry per sentence.
analysis_dict = {'sentence': [], 'gt': [], 'prediction': []}
for sentence, gt_labels in sentences.items():
    # Only the first (preferred) ground-truth label is compared.
    gt_label = gt_labels[0]
    # Deliberately not named `result`: that name is the comparison DataFrame
    # built earlier, and rebinding it here would break re-running the
    # evaluation cell afterwards.
    classification = mdeberta_classifier.classify(sentence)
    # The predicted label is the one with the highest score.
    predicted_label = classification['labels'][np.argmax(classification['scores'])]
    analysis_dict['sentence'].append(sentence)
    analysis_dict['gt'].append(gt_label)
    analysis_dict['prediction'].append(predicted_label)
analysis_df = pd.DataFrame(analysis_dict)





In [7]:
# Display ground-truth ('gt') and predicted label side by side for every sentence.
analysis_df

Unnamed: 0,sentence,gt,prediction
0,Meneer nam het medicijn in,neemt medicatie in,neemt medicatie in
1,Meneer injecteerde het geneesmiddel zelf,injectie van medicatie,neemt medicatie in
2,wilde pijnstilling was te vroeg voor paracetam...,verzoek om pijnstillers,pijnverlichting door pijnstillend middel
3,"geeft aan meer last in de onderbuik te hebben,...",pijnverlichting door analgeticum,neemt medicatie in
4,kreeg morfine voor de verzorging met redelijk ...,verdoving met morfineachtige pijnstiller in ho...,neemt medicatie in
5,kreeg om 15.30 oxycodon na mobiliseren.,toedienen van analgeticum,neemt medicatie in
6,"meneer geeft aan geen pijnklachten te hebben, ...",laat voldoende pijnbeheersing zien zonder oraa...,verzoek om pijnstillers
7,meneer heeft in de middag rond 16.00u toename ...,toedienen van analgeticum,neemt medicatie in
8,mevrouw geeft aan iets meer pijnklachten te kr...,toedienen van analgeticum,neemt medicatie in
9,mevrouw geeft aann dat ze pijnklachten ervaart...,kan medicatie innemen,neemt medicatie in


: 