In [96]:
# Reference sentences that every similarity method in this notebook is queried against.
corpus = [
    "Jake Peralta is the best detective in New York.",
    "Oranges are my favorite fruit",
    "I'd like an apple",
    "An apple a day keeps the doctor away",
    "Obama speaks to the media in Illinois",
    "The president greets the press in Chicago",
    "50 new COVID-19 cases were reported in Singapore today",
    "3 theft cases were reported in Jurong West last week",
]

## TF-IDF method

In [None]:
!pip3 install sklearn

In [52]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [91]:
def find_sim(input_doc, corpus, top_n=3):
    """Rank the corpus documents by TF-IDF cosine similarity to ``input_doc``.

    The TF-IDF vocabulary is fitted on the corpus together with the query,
    with English stop words removed. The similarity row and the best-matching
    documents are printed, most similar first.

    Args:
        input_doc: Query sentence to compare against the corpus.
        corpus: List of document strings. It is never modified.
        top_n: Maximum number of matches to print and return (default 3).

    Returns:
        List of indices into ``corpus`` for the best matches, most similar
        first. Empty when ``corpus`` is empty.
    """
    if not corpus:
        # Nothing to compare the query against.
        return []

    # Work on a copy: appending to the caller's list and popping afterwards
    # leaves the query stuck in the corpus whenever vectorising raises.
    docs = list(corpus)
    docs.append(input_doc)

    vect = TfidfVectorizer(min_df=1, stop_words='english')
    tfidf = vect.fit_transform(docs)

    # The last row is the query; every preceding row is a corpus document.
    sim = cosine_similarity(tfidf[-1], tfidf[:-1])
    print(sim)

    # Negate so that argsort (ascending) yields the highest similarity first.
    top = np.argsort(-sim)[0][:top_n].tolist()
    for i in top:
        print(corpus[i])
    return top

In [97]:
# Query the TF-IDF ranker with a sentence that is not part of the corpus.
input_doc = "The detective solved two murder cases within the past week"
find_sim(input_doc=input_doc, corpus=corpus)

[[0.1374178  0.         0.         0.         0.         0.
  0.0916917  0.25224758]]
3 theft cases were reported in Jurong West last week
Jake Peralta is the best detective in New York.
50 new COVID-19 cases were reported in Singapore today


[7, 0, 6]

## Sentence-Transformers

In [12]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used by the encoding cell below. That cell raises
# NameError when this cell has not been executed first in the current kernel.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [13]:
input_doc = 'The detective solved two murder cases within the past week'

#Encoding:
# Encode the corpus plus the query *before* touching `corpus`, so that a failure
# here (e.g. `model` not defined yet) cannot leave a stray query in the list.
embeddings = model.encode(corpus + [input_doc])

# The query is still appended afterwards because the next cell removes it again
# with corpus.pop().
corpus.append(input_doc)
embeddings.shape

NameError: name 'model' is not defined

In [None]:
# The last embedding belongs to the query, the others to the corpus documents;
# row 0 of `result` holds the query's cosine similarity to each document.
result = cosine_similarity([embeddings[-1]], embeddings[:-1])

# Remove the temporary query from the corpus list. Guarded so that re-running
# this cell cannot pop a genuine corpus document.
if corpus and corpus[-1] == input_doc:
    corpus.pop()
result

In [None]:
# Rank the corpus documents once, best match first. A stable sort keeps tied
# scores in corpus order, so the first index matches np.argmax(result) and the
# three picks are always distinct documents.
ranked = np.argsort(-result[0], kind='stable')
result1, result2, result3 = (corpus[i] for i in ranked[:3])

In [None]:
# Corpus sentence with the highest similarity to the query.
result1

## Soft Cosine Measure

In [None]:
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess

In [None]:
import nltk
# Import and download stopwords from NLTK.
from nltk.corpus import stopwords
from nltk import download
nltk.download('stopwords')  # Download the NLTK stop-word corpus.
# English stop words; preprocess() filters tokens against this list.
stop_words = stopwords.words('english')

def preprocess(sentence, exclude=None):
    """Lower-case and tokenise ``sentence``, dropping stop words.

    Tokens are split on whitespace and stripped of leading/trailing
    punctuation, so "York." becomes "york" and can match stop words and
    embedding vocabularies. Punctuation inside a token ("covid-19", "i'd")
    is kept. Tokens that consist only of punctuation are dropped.

    Args:
        sentence: Raw text to tokenise.
        exclude: Optional iterable of words to remove. Defaults to the
            notebook-level ``stop_words`` list.

    Returns:
        List of remaining tokens, in their original order.
    """
    import string  # local import: only this helper needs it

    if exclude is None:
        exclude = stop_words
    exclude = frozenset(exclude)  # constant-time membership tests

    stripped = (tok.strip(string.punctuation) for tok in sentence.lower().split())
    return [tok for tok in stripped if tok and tok not in exclude]


In [None]:
# Tokenised, stop-word-filtered version of every corpus sentence, in corpus order.
documents = [preprocess(sentence) for sentence in corpus]


In [None]:
# Inspect the token lists produced by preprocess().
print(documents)

In [None]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

# Map every distinct token of the tokenised documents to an integer id.
dictionary = Dictionary(documents)

# One bag-of-words vector per document.
bow = [dictionary.doc2bow(tokens) for tokens in documents]

# Fit TF-IDF weights on the bag-of-words vectors, then re-weight each document.
tfidf = TfidfModel(bow)
out = [tfidf[vec] for vec in bow]
out[0]

In [None]:
import gensim.downloader as api
from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex

# Pre-trained word2vec vectors. They get their own name so that the
# SentenceTransformer stored in `model` earlier in the notebook is not
# overwritten by an object of a different kind.
word_vectors = api.load('word2vec-google-news-300')

# Term-to-term similarity matrix over `dictionary`, built from the word
# embeddings and the `tfidf` model.
termsim_index = WordEmbeddingSimilarityIndex(word_vectors)
termsim_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary, tfidf)

In [None]:
# Normalised inner product under the term-similarity matrix between the last
# vector in `out` and the one at index 6 (the "50 new COVID-19 cases ..."
# sentence in the corpus as defined at the top of the notebook).
# NOTE(review): out[-1] is presumably meant to be the query sentence, which only
# holds if `corpus` still had the query appended when `documents` was built — confirm.
similarity = termsim_matrix.inner_product(out[-1], out[6], normalized=(True, True))
print('similarity = %.4f' % similarity)
termsim_matrix

In [None]:
from gensim.similarities import SoftCosineSimilarity
def find_similarity(query, documents):
    """Score ``query`` against every document with Soft Cosine Similarity.

    ``query`` is a raw sentence and is tokenised with ``preprocess``;
    ``documents`` is an iterable of token lists. Both sides are turned into
    bag-of-words vectors with the notebook-level ``dictionary`` and compared
    through ``termsim_matrix``.

    Returns the result of indexing the similarity index with the query
    vector, i.e. the query's similarity to the documents in document order.
    """
    query_bow = dictionary.doc2bow(preprocess(query))
    document_bows = [dictionary.doc2bow(tokens) for tokens in documents]
    index = SoftCosineSimilarity(document_bows, termsim_matrix)
    return index[query_bow]

In [None]:
# Score an unseen sentence against the tokenised corpus with soft cosine similarity.
doc = "COVID is a hoax. Blame the chinese"

find_similarity(query=doc, documents=documents)

In [None]:
# Display the corpus list (presumably to check that no temporary query was left in it).
corpus