In [2]:
from sentence_transformers import SentenceTransformer, util
import torch

In [33]:
import os
import codecs
import re
import time
import numpy as np

## Loading the model
Load the `paraphrase-xlm-r-multilingual-v1` model and move it to the GPU if one is available.

In [4]:
# Instantiate the pretrained multilingual paraphrase model by name.
model = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')

# Move the model onto the GPU only when CUDA reports a usable device.
if torch.cuda.is_available():
    gpu = torch.device("cuda")
    model = model.to(gpu)

# Show which device the model ended up on.
print(model.device)

cpu


  return torch._C._cuda_getDeviceCount() > 0


## Example
Test the multilingual sentence embeddings by computing the cosine similarity between equivalent sentences in different languages (English and Serbian).

In [65]:
# Two sentence lists that are paired by position
sentences1 = [
    'The cat sits outside',
    'A man is playing guitar',
    'The new movie is awesome',
]

sentences2 = [
    'Мачка седи напољу',
    'Човек свира гитару',
    'Нови филм је сјајан',
]

# Compute the embeddings for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Compute the cosine similarities between the two sets of embeddings
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

# Output each positional pair together with its score (entry [i][i] of the score matrix)
for i, (first, second) in enumerate(zip(sentences1, sentences2)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(first, second, cosine_scores[i][i]))


The cat sits outside 		 Мачка седи напољу 		 Score: 0.9889
A man is playing guitar 		 Човек свира гитару 		 Score: 0.9719
The new movie is awesome 		 Нови филм је сјајан 		 Score: 0.8863


## Embed the documents

### Load the dataset

In [6]:
# Directory that holds the documents to index, one file per document.
dataset_path = '../acts'
# Maps file name -> cleaned document text.
dataset = {}
# Matches anything shaped like a markup tag, e.g. "<p>" or "</div>".
TAG_RE = re.compile(r'<[^>]+>')
# os.listdir() returns entries in arbitrary order; sorting keeps the corpus
# (and therefore the row order of the embeddings built from it) reproducible.
# NOTE: `file_path` is the bare file name, not a full path; the name is kept
# for compatibility with the original cell.
for file_path in sorted(os.listdir(dataset_path)):
    full_path = os.path.join(dataset_path, file_path)
    # Skip sub-directories (e.g. .ipynb_checkpoints): they cannot be read as text.
    if not os.path.isfile(full_path):
        continue
    with open(full_path, 'r', encoding='utf-8') as f:
        data = f.read()
    data = data.lower()  # lowercase
    data = TAG_RE.sub('', data)  # remove html tags
    data = " ".join(data.split())  # collapse runs of whitespace into single spaces
    dataset[file_path] = data

In [7]:
# Document texts only, in the same order as the entries of `dataset`
corpus = [text for text in dataset.values()]

### Embed the corpus

In [20]:
# Encode every document of the corpus into one tensor, showing a progress bar meanwhile
embeddings = model.encode(
    corpus,
    show_progress_bar=True,
    convert_to_tensor=True,
)

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

In [21]:
# Inspect the container type, the type of a single row, and the first row itself
first_embedding = embeddings[0]
for inspected in (embeddings, first_embedding):
    print(type(inspected))
print(first_embedding)

<class 'torch.Tensor'>
<class 'torch.Tensor'>
tensor([ 2.6268e-01,  2.1535e-01,  4.7530e-02,  3.2643e-01, -6.4580e-01,
         1.4841e-02,  6.6523e-02,  1.5537e-01,  1.1395e-01,  8.5271e-02,
         1.9012e-01,  8.8985e-03, -2.8907e-02,  8.4982e-03,  3.8316e-02,
        -2.5428e-01,  3.0941e-01, -2.1273e-02,  3.3076e-03, -1.1747e-01,
        -9.0544e-02, -3.3897e-01, -1.6136e-01,  9.3052e-02,  2.3413e-01,
         1.6467e-01,  2.9227e-01,  1.8580e-01, -3.2769e-01, -3.5479e-02,
         2.3625e-02, -1.3125e-01, -3.1069e-02,  4.2423e-02, -1.0064e-01,
         2.1015e-01,  8.4453e-02,  1.6454e-01, -1.2877e-01,  2.5345e-02,
         4.2720e-01,  5.6138e-02, -9.3906e-02,  3.4160e-01, -2.5491e-02,
        -7.9388e-02, -2.2092e-01, -2.3218e-01, -2.1752e-03, -1.7837e-02,
        -4.9334e-02,  1.9557e-01,  6.1125e-03,  4.2973e-02, -1.9483e-01,
         1.5243e-01,  9.1909e-02,  1.4664e-01,  1.9402e-01,  2.5061e-01,
        -1.6577e-01,  6.6146e-02, -5.8396e-02,  3.4775e-01,  2.5780e-01,
     

In [26]:
# Similarity of every document embedding to the embedding of the first sentence in `sentences2`
cosine_scores = util.pytorch_cos_sim(embeddings, embeddings2[0])
# Tensor.numpy() only works for CPU tensors, so copy the scores back to the CPU
# first; this is a no-op when the model already runs on the CPU.
cosine_scores.cpu().numpy()

array([[ 0.00607183],
       [ 0.03425825],
       [ 0.07264607],
       ...,
       [ 0.1150783 ],
       [-0.0054166 ],
       [ 0.04229683]], dtype=float32)

In [59]:
def calculate_similarity(X, model, query, top_k=10):
    """Rank the documents in `X` by cosine similarity to `query`.

    Args:
        X: Document embeddings, in a form accepted by `util.pytorch_cos_sim`
            (in this notebook: the tensor returned by
            `model.encode(corpus, convert_to_tensor=True)`).
        model: Object whose `encode(query, convert_to_tensor=True)` method
            embeds the query.
        query: The search text handed to `model.encode`. The similarity
            scores are flattened, so the returned indices map one-to-one onto
            documents only when `query` produces a single embedding.
        top_k: Maximum number of results to return; must not be negative.

    Returns:
        A tuple `(most_similar_doc_indices, cosine_similarities)`.
        `cosine_similarities` is a flat NumPy array of the similarity scores,
        and `most_similar_doc_indices` holds the positions of the (at most)
        `top_k` highest scores, best match first.

    Raises:
        ValueError: If `top_k` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative, got {}".format(top_k))

    # Embed the query with the same model that produced the document embeddings
    query_vec = model.encode(query, convert_to_tensor=True)
    # Compute the cosine similarity between query_vec and all the documents
    scores = util.pytorch_cos_sim(X, query_vec)
    # Tensor.numpy() fails for CUDA tensors (and for tensors that track
    # gradients), so detach and copy to the CPU before converting.
    cosine_similarities = scores.detach().cpu().numpy().flatten()
    # argsort is ascending: reverse it to get best-first, then keep the first top_k
    most_similar_doc_indices = np.argsort(cosine_similarities, axis=0)[::-1][:top_k]
    return (most_similar_doc_indices, cosine_similarities)

In [60]:
def show_similar_documents(df, cosine_similarities, similar_doc_indices):
    """Print a ranked listing of the documents picked out by `similar_doc_indices`.

    For every index, in the order given, two lines are printed followed by a
    blank line: the 1-based rank together with `cosine_similarities[index]`,
    then `df[index]`.
    """
    for rank, doc_index in enumerate(similar_doc_indices, start=1):
        similarity = cosine_similarities[doc_index]
        print('Top-{}, Similarity = {}'.format(rank, similarity))
        print('body: {}, '.format(df[doc_index]))
        print()

In [66]:
# Query text, wrapped in a list before being handed to calculate_similarity()
user_question = [u'порез казна камата']
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short durations; time.time() follows the wall clock, which can be
# adjusted while the search is running.
search_start = time.perf_counter()
sim_vecs, cosine_similarities = calculate_similarity(embeddings, model, user_question)
search_time = time.perf_counter() - search_start
print("search time: {:.2f} ms".format(search_time * 1000))
print()
# `sim_vecs` holds the indices of the best matches; they index into the list of file names
show_similar_documents(list(dataset.keys()), cosine_similarities, sim_vecs)

search time: 63.87 ms

Top-1, Similarity = 0.6095771789550781
body: 190.html, 

Top-2, Similarity = 0.5718044638633728
body: 450.html, 

Top-3, Similarity = 0.5671951770782471
body: 194.html, 

Top-4, Similarity = 0.5632493495941162
body: 698.html, 

Top-5, Similarity = 0.548108696937561
body: 448.html, 

Top-6, Similarity = 0.545495867729187
body: 502.html, 

Top-7, Similarity = 0.5452719926834106
body: 201.html, 

Top-8, Similarity = 0.5430032014846802
body: 199.html, 

Top-9, Similarity = 0.5397124886512756
body: 200.html, 

Top-10, Similarity = 0.5395081043243408
body: 282.html, 

