In [1]:
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('distiluse-base-multilingual-cased-v2')

In [2]:
import pandas as pd
df = pd.read_csv(r'/Users/arinatryaskova/Documents/quora_duplicate_questions.tsv', sep='\t')

In [3]:
df = df.drop(df[df.is_duplicate == 0].index)

In [14]:
corpus = df['question1'].to_numpy()[:10000]
questions2 = df['question2'].to_numpy()[:10000]

In [15]:
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

In [7]:
corpus_embeddings[0].shape

torch.Size([512])

# Usage of cosine similarity 

As a sentence similarity efficiency we use such approach: for each sentence in questions2 array, we build embedding with chosen model and then compare use approach to calculate the similarity between embedding of input string (from questions2 set) and embedding of "ideal" string (from corpus set). Then we calculate accurate metrics as the ratio of correct answers to entire length of corpus array.

In [16]:
# Query sentences:
queries = questions2

score_counter = 0

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = 1
for query, sentence in zip(queries, corpus):
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    cos_scores = cos_scores.cpu()
    
    top_results = torch.topk(cos_scores, k=top_k)

    for score, idx in zip(top_results[0], top_results[1]):
        if corpus[idx] == sentence:
            score_counter += 1
        
print(score_counter / len(corpus))

0.6245


# Usage of hnswlib algorithm

In [17]:
import hnswlib

In [24]:
embedding_size = 512    #Size of embeddings
top_k_hits = 1         #Output k hits

index = hnswlib.Index(space = 'cosine', dim = embedding_size)

print("Start creating HNSWLIB index")
index.init_index(max_elements = len(corpus_embeddings), ef_construction = 20, M = 2)

index.add_items(corpus_embeddings, list(range(len(corpus_embeddings))))

index.set_ef(50)  # ef should always be > top_k_hits

Start creating HNSWLIB index


In [25]:
queries = questions2

score_counter = 0

for query, sentence in zip(queries, corpus):
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    
    corpus_id, distance = index.knn_query(query_embedding, k=top_k_hits)
    
    if corpus[corpus_id] == sentence:
        score_counter += 1
        
print(score_counter / len(corpus))

0.1769


# Usage of Annoy algorithm

In [26]:
from simpleneighbors import SimpleNeighbors

In [28]:
annoy_index_questions1 = SimpleNeighbors(512, metric='dot')
for i, item in enumerate(corpus):
    emb = embedder.encode(item, convert_to_numpy=True)
    #emb = embed([item,]).numpy()[0]
    annoy_index_questions1.add_one(item, emb)
annoy_index_questions1.build()

In [30]:
score_counter = 0
for i, question_test in enumerate(questions2):
    
    ret = annoy_index_questions1.nearest(embedder.encode(question_test, convert_to_numpy=True))[0]
    if ret == corpus[i]:
        score_counter += 1
print(score_counter / len(corpus))

0.5094
