### Import Needed Modules

## We will measure sentence similarity in two ways: **BERT Model with Cosine Similarity** and **SpaCy**

## 1. **BERT Model**

In [None]:
# Install Transformers and sentence_transformers modules 
!pip install transformers sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

### Download BERT model

In [None]:
# Load the pretrained sentence-embedding model identified by name.
# NOTE(review): presumably downloaded on first use and cached afterwards — confirm.
# NOTE(review): the sentence-transformers docs appear to mark
# 'bert-base-nli-mean-tokens' as deprecated; verify and consider a newer model.
model = SentenceTransformer('bert-base-nli-mean-tokens')

### Create Sentences

In [None]:
# Example sentences used by both similarity approaches in this notebook.
# Sentences[0] and Sentences[2] read like paraphrases of each other, which makes
# them a useful sanity check for the similarity scores.
# NOTE(review): the misspellings "firend" and "inerviews" are left untouched
# because editing the text would change the computed similarities — confirm
# whether they are intentional.
Sentences = ["Three years later, the coffin was still full of Jello.",
            "The fish dreamed of escaping the fishbowl and into the toilet where he saw his firend go.",
            "The person box was packed with jelly many dozens of months later.",
            "Standing on one's head at job inerviews forms a lasting impression.",
            "It took him a month to finish the meal.",
            "He found a leprechaun in his walnut shell."]

### Apply model

In [None]:
# Encode all sentences with the model; later cells index the result per sentence.
# NOTE(review): presumably one fixed-length embedding vector per input sentence — confirm.
sentence_vecs = model.encode(Sentences)

### Get the cosine similarity of the first sentence with the other sentences

In [None]:
# Score the first sentence's embedding against each remaining embedding.
# The query is wrapped in a list because cosine_similarity expects 2-D input;
# the cell output is the resulting row of scores.
query_vec = sentence_vecs[0]
other_vecs = sentence_vecs[1:]
cosine_similarity([query_vec], other_vecs)

### Pairwise cosine similarity between all sentences

In [None]:
# Print the full pairwise cosine-similarity matrix, one labelled row per sentence.
#
# The matrix is computed with a single call (with only X given,
# cosine_similarity compares X against itself) instead of one call per row,
# and the header is derived from the number of embeddings so it stays correct
# if the sentence list grows or shrinks.
similarity_matrix = cosine_similarity(sentence_vecs)
labels = [f'Sentence {i+1}' for i in range(len(sentence_vecs))]

# Header row: same leading tab/space offset as before, then one label per column.
print('\t    ' + ' '.join(labels))
for label, row in zip(labels, similarity_matrix):
    print(f'{label} {row}')

-------------------------------------------

## 2. **SpaCy**

In [None]:
import pandas as pd
import spacy

In [None]:
# Load spaCy's large English pipeline by package name.
# NOTE(review): no install step for 'en_core_web_lg' is visible in this
# notebook; the package presumably has to be installed beforehand — confirm.
nlp = spacy.load('en_core_web_lg')

#### We will compute the similarity between the first sentence and every sentence

In [None]:
# Run the spaCy pipeline on the first sentence; the resulting object is the
# reference that every sentence is compared against in the similarity loop.
txt = nlp(Sentences[0])

In [None]:
# Run the spaCy pipeline on every sentence (the first one included),
# keeping the results in the same order as `Sentences`.
all_txt = list(map(nlp, Sentences))

In [None]:
# Score every processed sentence against the first one (`txt`) and collect the
# results in a single table.
#
# The DataFrame is built once, after all values are known, rather than being
# re-created on every loop iteration; this also means `sims_docs` is defined
# (as an empty table) even when there are no sentences.
sims = [doc.similarity(txt) for doc in all_txt]  # Similarity values

# Sentence ids are 1-based so they match the "Sentence N" labels printed in
# the BERT section of this notebook.
doc_id = [f'Sentence {n}' for n in range(1, len(all_txt) + 1)]

sents = list(Sentences)  # Sentence texts, same order as `all_txt`

# One row per sentence: its id, its text and its similarity to the first sentence.
sims_docs = pd.DataFrame({'doc_id': doc_id, 'sentence': sents, 'sims': sims})

In [None]:
# Sort the DataFrame by similarity, highest first; the result is a new frame
# and `sims_docs` keeps its original order.
sims_docs_sorted = sims_docs.sort_values(by='sims', ascending=False)

#### The similarity between the first sentence and all the sentences

In [None]:
# Bare last expression: displays the sorted similarity table as the cell output.
sims_docs_sorted