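This notebook compares several sentence-similarity techniques (averaged spaCy word vectors, Doc2Vec, BERT, RobBERT, spaCy document similarity and Jaccard overlap) on Dutch example sentences.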

In [7]:
# Setup: install the Python packages this notebook imports into the kernel's
# environment, then download the large Dutch spaCy pipeline (nl_core_news_lg).
# Restart the kernel after running this cell so the updated packages are picked up.
%pip install pandas numpy scikit-learn spacy nltk transformers torch gensim openpyxl
!python -m spacy download nl_core_news_lg

Note: you may need to restart the kernel to use updated packages.
Collecting nl-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/nl_core_news_lg-3.7.0/nl_core_news_lg-3.7.0-py3-none-any.whl (568.1 MB)
     ---------------------------------------- 0.0/568.1 MB ? eta -:--:--
     -------------------------------------- 0.0/568.1 MB 660.6 kB/s eta 0:14:20
     ---------------------------------------- 0.3/568.1 MB 3.1 MB/s eta 0:03:05
     ---------------------------------------- 1.2/568.1 MB 8.7 MB/s eta 0:01:06
     --------------------------------------- 2.1/568.1 MB 12.1 MB/s eta 0:00:47
     --------------------------------------- 4.0/568.1 MB 18.4 MB/s eta 0:00:31
      -------------------------------------- 7.6/568.1 MB 27.1 MB/s eta 0:00:21
      ------------------------------------- 11.7/568.1 MB 59.5 MB/s eta 0:00:10
     - ----------------------------------- 17.0/568.1 MB 108.8 MB/s eta 0:00:06
     - ----------------------------------

In [1]:
import pandas as pd
import numpy as np
import nltk
import spacy
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances
import torch

# Download the NLTK 'punkt' tokenizer data.
# NOTE(review): nothing in the visible code calls an NLTK tokenizer - confirm this is still needed.
nltk.download('punkt')

# Load the large Dutch spaCy pipeline; it is installed by the setup cell
# (`python -m spacy download nl_core_news_lg`).
nlp = spacy.load('nl_core_news_lg')

# Load a saved Doc2Vec model from the relative path models/doc2vec_model
doc2vec_model = Doc2Vec.load("models/doc2vec_model")

# Load the 'bert-base-uncased' BERT model and tokenizer.
# NOTE(review): this is an English checkpoint while the sample sentences are Dutch -
# confirm it is meant as a baseline.
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertModel.from_pretrained('bert-base-uncased')

# Load the Dutch RobBERT v2 model and tokenizer
tokenizer_robert = AutoTokenizer.from_pretrained("pdelobelle/robbert-v2-dutch-base")
model_robert = AutoModel.from_pretrained("pdelobelle/robbert-v2-dutch-base")

def preprocess_sentence(sentence):
    """Tokenize ``sentence`` with the module-level spaCy pipeline ``nlp``.

    Args:
        sentence: The raw sentence text.

    Returns:
        A list holding the text of every spaCy token, in document order.
    """
    return [tok.text for tok in nlp(sentence)]

def cosine_similarity_word_embeddings(sentence1, sentence2):
    """Cosine similarity between the mean spaCy word vectors of two sentences.

    Both sentences are parsed with the module-level pipeline ``nlp``; the
    vectors of all tokens in each sentence are averaged and the two averages
    are compared with scikit-learn's ``cosine_similarity``.

    Args:
        sentence1: First sentence (raw text).
        sentence2: Second sentence (raw text).

    Returns:
        The cosine similarity as a scalar. ``0.0`` is returned when either
        sentence produces no tokens, because there is nothing to average.
    """
    doc1 = nlp(sentence1)
    doc2 = nlp(sentence2)

    # An empty Doc would make np.mean return NaN and cosine_similarity raise,
    # so treat "no tokens" as "no similarity" explicitly.
    if len(doc1) == 0 or len(doc2) == 0:
        return 0.0

    vec1 = np.mean([token.vector for token in doc1], axis=0)
    vec2 = np.mean([token.vector for token in doc2], axis=0)
    return cosine_similarity([vec1], [vec2])[0][0]

# Calculate sentence embeddings using Doc2Vec
def get_doc2vec_embeddings(tokens, model):
    """Infer a Doc2Vec vector for one already-tokenized sentence.

    Args:
        tokens: The sentence as a list of token strings.
        model: An object exposing ``infer_vector(words)`` (here the loaded
            Doc2Vec model).

    Returns:
        Whatever ``model.infer_vector`` returns for ``tokens``.
    """
    # Inference only needs the words; wrapping them in a TaggedDocument with a
    # constant tag and reading ``.words`` back added nothing.
    return model.infer_vector(tokens)

def get_transformer_embeddings(sentence, model, tokenizer):
    """Mean-pool a transformer's last hidden state into one vector per input.

    Args:
        sentence: A sentence (or list of sentences) accepted by ``tokenizer``.
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable returning a mapping of tensors; it is called with
            ``return_tensors="pt", padding=True, truncation=True``.

    Returns:
        A NumPy array with one pooled embedding row per input sequence.
    """
    encoded = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        # Forward the whole encoding (not just input_ids) so the model receives
        # the attention mask and does not attend to padding positions.
        hidden = model(**encoded).last_hidden_state

        mask = encoded.get("attention_mask")
        if mask is None:
            # No mask available: every position counts, plain mean over tokens.
            pooled = hidden.mean(dim=1)
        else:
            # Average only over real tokens so padded positions in a batch do
            # not dilute the sentence embedding.
            weights = mask.unsqueeze(-1).to(hidden.dtype)
            token_counts = weights.sum(dim=1).clamp(min=1)
            pooled = (hidden * weights).sum(dim=1) / token_counts
    return pooled.numpy()

# NOTE: despite its name this is not Word Mover's Distance; it returns the
# spaCy Doc.similarity score of the two token sequences.
def wmdistance(tokens1, tokens2):
    """Return spaCy's ``Doc.similarity`` for two token lists.

    Each token list is joined with single spaces, parsed again with the
    module-level pipeline ``nlp``, and the two resulting documents are
    compared with ``Doc.similarity``.

    Args:
        tokens1: Token strings of the first sentence.
        tokens2: Token strings of the second sentence.

    Returns:
        The value of ``Doc.similarity`` for the two documents. This is a
        similarity score, not a distance - presumably higher means more
        alike; see the spaCy documentation for the exact definition.
    """
    first_doc = nlp(" ".join(tokens1))
    second_doc = nlp(" ".join(tokens2))
    return first_doc.similarity(second_doc)

# Calculate Jaccard Similarity
def jacc_similarity(tokens1, tokens2):
    """Jaccard similarity between the sets of tokens of two sentences.

    Args:
        tokens1: Iterable of hashable tokens for the first sentence.
        tokens2: Iterable of hashable tokens for the second sentence.

    Returns:
        ``|A & B| / |A | B|`` as a float in ``[0, 1]``, where ``A`` and ``B``
        are the de-duplicated token sets. Two empty inputs are treated as
        identical and yield ``1.0``.
    """
    set1 = set(tokens1)
    set2 = set(tokens2)
    union = set1 | set2

    # Both inputs empty: the ratio would be 0/0. By the usual convention two
    # empty sets are identical, so return 1.0 instead of dividing by zero.
    if not union:
        return 1.0

    return len(set1 & set2) / len(union)


# Sample sentences and the reference phrases each of them is compared against
sentences = ["Meneer nam het medicijn in", "Meneer injecteerde het geneesmiddel zelf"]
references = ["injectie van medicatie", "neemt medicatie in"]


def _sentence_features(text):
    """Compute the representations of ``text`` that do not depend on its pair partner.

    Returns a dict with the spaCy tokens ('tokens'), the Doc2Vec vector
    ('doc2vec') and the pooled BERT / RobBERT embeddings ('bert', 'robert').
    """
    tokens = preprocess_sentence(text)
    return {
        'tokens': tokens,
        'doc2vec': get_doc2vec_embeddings(tokens, doc2vec_model),
        'bert': get_transformer_embeddings(text, model_bert, tokenizer_bert),
        'robert': get_transformer_embeddings(text, model_robert, tokenizer_robert),
    }


# Tokenize and embed every text exactly once, instead of repeating the work for
# each (sentence, reference) pair inside the nested loop below.
# NOTE(review): Doc2Vec inference may not be exactly repeatable between calls;
# computing it once per text means all pairs involving a text share one vector.
sentence_features = [_sentence_features(text) for text in sentences]
reference_features = [_sentence_features(text) for text in references]

result_dict = {'sentence':[], 'reference':[], 'word_similarity':[], 'doc2vec_similarity':[], 'bert_similarity':[],'robert_similarity':[], 'wm_distance':[], 'jaccard_similarity':[]}
for sentence1, features1 in zip(sentences, sentence_features):
    for sentence2, features2 in zip(references, reference_features):
        # Cosine similarity of the averaged spaCy word vectors
        word_similarity = cosine_similarity_word_embeddings(sentence1, sentence2)

        # Cosine similarity of the Doc2Vec vectors
        doc2vec_similarity = cosine_similarity([features1['doc2vec']], [features2['doc2vec']])[0][0]

        # Cosine similarity of the pooled BERT embeddings
        bert_similarity = cosine_similarity(features1['bert'], features2['bert'])[0][0]

        # Cosine similarity of the pooled RobBERT embeddings
        robert_similarity = cosine_similarity(features1['robert'], features2['robert'])[0][0]

        # Score from wmdistance() (spaCy Doc.similarity of the re-joined tokens)
        wm_distance = wmdistance(features1['tokens'], features2['tokens'])

        # Jaccard similarity of the token sets
        jaccard_similarity = jacc_similarity(features1['tokens'], features2['tokens'])

        # One result row per (sentence, reference) pair
        result_dict['sentence'].append(sentence1)
        result_dict['reference'].append(sentence2)
        result_dict['word_similarity'].append(word_similarity)
        result_dict['doc2vec_similarity'].append(doc2vec_similarity)
        result_dict['bert_similarity'].append(bert_similarity)
        result_dict['robert_similarity'].append(robert_similarity)
        result_dict['wm_distance'].append(wm_distance)
        result_dict['jaccard_similarity'].append(jaccard_similarity)

# Generate the result file
result = pd.DataFrame(result_dict)
result.to_excel('compare_sentence_similarity_methods.xlsx')


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\20194661\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at pdelobelle/robbert-v2-dutch-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
