<a href="https://colab.research.google.com/github/anupamaray/nlp-for-hindi/blob/master/SemanticSim_SentTrans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

## Model Selection and Initialization

In [None]:
# Pretrained checkpoint to load. Models optimized for semantic textual
# similarity are listed at:
# https://docs.google.com/spreadsheets/d/14QplCdTCDwEmTqrn1LH4yrbKvdogK4oQvYO1K1aPR5M/edit#gid=0
# Other candidates (swap the name below to try them):
#   'paraphrase-distilroberta-base-v1'
#   'paraphrase-xlm-r-multilingual-v1'
MODEL_NAME = 'stsb-roberta-large'
model = SentenceTransformer(MODEL_NAME)

100%|██████████| 1.31G/1.31G [01:36<00:00, 13.6MB/s]  


## Calculate semantic similarity between two sentences

In [None]:
sentence1 = "I like Python because I can build AI applications"
sentence2 = "I like Python because I can do data analytics"

# Embed each sentence with its own encode() call, keeping tensors as output.
embedding1, embedding2 = (
    model.encode(text, convert_to_tensor=True)
    for text in (sentence1, sentence2)
)

# Cosine similarity of the pair; .item() below unwraps the single score.
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)

print(f"Sentence 1: {sentence1}")
print(f"Sentence 2: {sentence2}")
print(f"Similarity score: {cosine_scores.item()}")

Sentence 1: I like Python because I can build AI applications
Sentence 2: I like Python because I can do data analytics
Similarity score: 0.8188022375106812


## Calculate semantic similarity between two lists of sentences

In [None]:
sentences1 = [
    "I like Python because I can build AI applications",
    "The cat sits on the ground",
]
sentences2 = [
    "I like Python because I can do data analytics",
    "The cat walks on the sidewalk",
]

# Embed each list in a single call; one embedding per sentence.
embedding1 = model.encode(sentences1, convert_to_tensor=True)
embedding2 = model.encode(sentences2, convert_to_tensor=True)

# cosine_scores[i][j] compares sentences1[i] against sentences2[j].
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)

# Report every (sentences1, sentences2) pairing, one blank line between pairs.
for i, left in enumerate(sentences1):
    for j, right in enumerate(sentences2):
        print(f"Sentence 1: {left}")
        print(f"Sentence 2: {right}")
        print(f"Similarity Score: {cosine_scores[i][j].item()}")
        print()

Sentence 1: I like Python because I can build AI applications
Sentence 2: I like Python because I can do data analytics
Similarity Score: 0.8188023567199707

Sentence 1: I like Python because I can build AI applications
Sentence 2: The cat walks on the sidewalk
Similarity Score: -0.06005367636680603

Sentence 1: The cat sits on the ground
Sentence 2: I like Python because I can do data analytics
Similarity Score: 0.12721936404705048

Sentence 1: The cat sits on the ground
Sentence 2: The cat walks on the sidewalk
Similarity Score: 0.4131842255592346



## Retrieve Top K most similar sentences from a corpus given a sentence

In [None]:
corpus = [
    "I like Python because I can build AI applications",
    "I like Python because I can do data analytics",
    "The cat sits on the ground",
    "The cat walks on the sidewalk",
]

# Embed the whole corpus in one call; the tensor is reused by the top-k search below.
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

In [None]:
# Query sentence to look up in the corpus.
sentence = "I like Javascript because I can build web applications"

# Embed the query with the same model and tensor output as the corpus.
sentence_embedding = model.encode(
    sentence,
    convert_to_tensor=True,
)

In [None]:
# Number of results to return. Clamped to the corpus size so that asking for
# more matches than there are sentences does not raise.
top_k = min(2, len(corpus))

# Similarity of the query against every corpus sentence. pytorch_cos_sim
# returns a 2-D score matrix; row 0 is the (only) query.
cos_scores = util.pytorch_cos_sim(sentence_embedding, corpus_embeddings)[0]

# Pick the top_k highest scores with the tensor's own topk(), which returns
# them sorted in decreasing order. This stays on whatever device the
# embeddings live on; routing the tensor through np.argpartition fails when
# the scores are on a GPU, because NumPy cannot read CUDA tensors.
best = cos_scores.topk(k=top_k)
top_results = best.indices.tolist()  # plain ints, safe for list indexing
top_scores = best.values.tolist()    # plain floats for formatting

print("Sentence:", sentence, "\n")
print("Top", top_k, "most similar sentences in corpus:")
for idx, score in zip(top_results, top_scores):
    print(corpus[idx], "(Score: %.4f)" % score)

Sentence: I like Javascript because I can build web applications 

Top 2 most similar sentences in corpus:
I like Python because I can build AI applications (Score: 0.6253)
I like Python because I can do data analytics (Score: 0.5348)
