In [1]:
# Small toy corpus: eight short sentences that the similarity methods
# below are compared on.
corpus = [
    "This is a test sentence",
    "Oranges are my favorite fruit",
    "I'd like an apple",
    "An apple a day keeps the doctor away",
    "Obama speaks to the media in Illinois",
    "The president greets the press in Chicago",
    "50 new COVID-19 cases were reported in Singapore today",
    "3 theft cases were reported in Jurong West last week",
]

## TF-IDF method

In [2]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Query sentence for the TF-IDF experiment.
input_doc = 'COVID is a hoax. Blame the chinese'
# The query is appended to `corpus` so it is vectorised together with the
# other sentences. This mutates the shared list: re-running this cell appends
# the query again, and a later cell removes it with `corpus.pop()`.
corpus.append(input_doc)

In [4]:
# Vectorise every sentence (query included) with English stop words removed.
vect = TfidfVectorizer(stop_words='english', min_df=1)
tfidf = vect.fit_transform(corpus)

# Document-by-document similarity as the product of the TF-IDF matrix with
# its own transpose (a cosine similarity under the vectorizer's default
# L2 row normalisation).
pairwise_similarity = tfidf @ tfidf.T

# Densify, then blank out the diagonal so that a sentence can never be
# reported as its own nearest neighbour.
arr = pairwise_similarity.toarray()
np.fill_diagonal(arr, np.nan)

# Row of the query -> index of its highest non-NaN score -> matching sentence.
input_idx = corpus.index(input_doc)
result_idx = np.nanargmax(arr[input_idx, :])
corpus[result_idx]

'50 new COVID-19 cases were reported in Singapore today'

In [5]:
# Display the full similarity matrix; the diagonal is NaN because
# self-similarity was masked with np.fill_diagonal in the previous cell.
arr

array([[       nan, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        ,        nan, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        ,        nan, 0.25103029, 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.25103029,        nan, 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ,        nan,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
               nan, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ,        nan, 0.22920576, 0.13854187],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.22920576,        nan, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.13854187, 0.        ,        nan]])


In [6]:
# Remove the last element of `corpus` (the query appended for the TF-IDF
# experiment) so the list holds only the original sentences again.
# Not idempotent: running this cell a second time removes a real sentence.
corpus.pop()

'COVID is a hoax. Blame the chinese'

## Sentence-Transformers

In [7]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-embedding model identified by this name.
# NOTE(review): the name `model` is rebound to word vectors in the
# Soft Cosine section (`model = api.load(...)`); after that cell has run,
# re-running the `model.encode(corpus)` cell would use the wrong object.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [9]:
# Query sentence for the sentence-embedding experiment; appended to `corpus`
# so that it is encoded in the same call as the other sentences (it ends up
# as the last row of `embeddings`).
input_doc = 'The detective solved two murder cases within the past week'
corpus.append(input_doc)

# Encode every sentence in `corpus`, then show the shape of the result
# (one row per sentence).
embeddings = model.encode(corpus)
embeddings.shape

(9, 768)

In [10]:
# Cosine similarity of the query embedding (last row) against every other
# row; the result has shape (1, number of corpus sentences).
result = cosine_similarity([embeddings[-1]],embeddings[:-1])
# Drop the query from `corpus` again so that indices into `result` line up
# with `corpus`. Not idempotent: re-running this cell pops a real sentence.
corpus.pop() # remove new entry from corpus list
result

array([[ 0.01250362,  0.12978113, -0.00957555,  0.29170883,  0.29865646,
         0.29103833,  0.4675786 ,  0.71766376]], dtype=float32)

In [11]:
# Corpus indices ordered from most to least similar to the query.
# `result` has a single row, hence the [0].
ranked = np.argsort(-result)[0]

# Best, second-best and third-best matching sentences.
result1 = corpus[np.argmax(result)]
result2 = corpus[ranked[1]]
result3 = corpus[ranked[2]]

In [12]:
# Show the corpus sentence with the highest cosine score for the query.
result1

'3 theft cases were reported in Jurong West last week'

## Soft Cosine Measure

In [13]:
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess

In [14]:
import nltk
# Stop-word corpus reader from NLTK.
from nltk.corpus import stopwords
# NOTE(review): `download` is imported here, but the visible cells call
# `nltk.download(...)` instead of using this name.
from nltk import download
nltk.download('stopwords')  # Fetch the stop-word list (see the log below).
# English stop words, used by `preprocess` to filter tokens.
stop_words = stopwords.words('english')

def preprocess(sentence):
    """Tokenise a sentence into lower-cased, stop-word-free tokens.

    The sentence is lower-cased and split on whitespace. Each token then has
    leading and trailing punctuation removed, so that ``"hoax."`` becomes
    ``"hoax"`` and ``"the,"`` is recognised as a stop word. Punctuation
    inside a token is kept (``"i'd"``, ``"covid-19"``). Tokens that are
    empty after stripping, or that appear in the module-level ``stop_words``
    list, are dropped.

    Parameters
    ----------
    sentence : str
        Raw text of one document or query.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    import string  # local import keeps this cell self-contained

    stop_set = set(stop_words)  # set membership instead of a list scan per token
    tokens = []
    for raw_token in sentence.lower().split():
        # Strip punctuation at the edges only; inner characters are untouched.
        token = raw_token.strip(string.punctuation)
        if token and token not in stop_set:
            tokens.append(token)
    return tokens


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mingshuseah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Tokenise every corpus sentence; `documents[k]` is the token list of `corpus[k]`.
documents = [preprocess(sentence) for sentence in corpus]


In [16]:
# Inspect the token lists produced by `preprocess` for each corpus sentence.
print(documents)

[['test', 'sentence'], ['oranges', 'favorite', 'fruit'], ["i'd", 'like', 'apple'], ['apple', 'day', 'keeps', 'doctor', 'away'], ['obama', 'speaks', 'media', 'illinois'], ['president', 'greets', 'press', 'chicago'], ['50', 'new', 'covid-19', 'cases', 'reported', 'singapore', 'today'], ['3', 'theft', 'cases', 'reported', 'jurong', 'west', 'last', 'week']]


In [17]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

# Token <-> integer id mapping built from the tokenised corpus.
dictionary = Dictionary(documents)

# Bag-of-words vector for each document.
bow = [dictionary.doc2bow(tokens) for tokens in documents]

# TF-IDF model fitted on the bag-of-words corpus. Note that this rebinds
# `tfidf`, which the scikit-learn section used for its document-term matrix.
tfidf = TfidfModel(bow)

# TF-IDF weighted vector for each document; show the first as a sanity check.
out = [tfidf[vector] for vector in bow]
out[0]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[(0, 0.7071067811865476), (1, 0.7071067811865476)]

In [18]:
import gensim.downloader as api
# Load the pretrained word vectors with this name via gensim's downloader.
# NOTE(review): this rebinds `model`, which held the SentenceTransformer in
# the previous section; re-running the sentence-embedding cells after this
# one would call `.encode` on the wrong object.
model = api.load('word2vec-google-news-300')

from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
# Term-to-term similarity index backed by the word vectors loaded above.
termsim_index = WordEmbeddingSimilarityIndex(model)
# Term-similarity matrix built from that index, the corpus `dictionary` and
# the gensim `tfidf` model; used by the soft-cosine cells that follow.
termsim_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary, tfidf)

100%|███████████████████████████████████████████| 33/33 [00:15<00:00,  2.12it/s]


In [19]:
# Normalised inner product between the TF-IDF vectors of the last
# corpus document (`out[-1]`) and the document at index 6 (`out[6]`).
# `normalized=(True, True)` requests normalisation of both arguments.
similarity = termsim_matrix.inner_product(out[-1], out[6], normalized=(True, True))
print('similarity = %.4f' % similarity)
# Trailing expression: displays the repr of the matrix object itself.
termsim_matrix

similarity = 0.1667


<gensim.similarities.termsim.SparseTermSimilarityMatrix at 0x12eaf4ee0>

In [20]:
from gensim.similarities import SoftCosineSimilarity


def find_similarity(query,documents):
    """Score a raw query string against tokenised documents with soft cosine.

    Parameters
    ----------
    query : str
        Raw query text; it is tokenised with ``preprocess``.
    documents : list of list of str
        Already-tokenised documents (as produced by ``preprocess``).

    Returns
    -------
    The result of indexing a ``SoftCosineSimilarity`` index with the query's
    bag-of-words vector.

    Notes
    -----
    Relies on the module-level ``dictionary`` and ``termsim_matrix``. A new
    index is built on every call.

    NOTE(review): both the query and the documents are raw bag-of-words
    counts here, whereas the ``inner_product`` cell uses the TF-IDF weighted
    vectors in ``out``. Confirm which weighting is intended.
    """
    query_bow = dictionary.doc2bow(preprocess(query))
    corpus_bow = list(map(dictionary.doc2bow, documents))
    index = SoftCosineSimilarity(corpus_bow, termsim_matrix)
    return index[query_bow]

In [21]:
# Query for the soft-cosine search. `doc` is also the loop variable name used
# when the bag-of-words corpus was built, so this rebinds it.
doc = 'COVID is a hoax. Blame the chinese'

# NOTE(review): the recorded result is a scalar 0. Presumably none of the
# query's tokens are in `dictionary` (the corpus token is 'covid-19', not
# 'covid'), which would leave the query's bag-of-words empty. Confirm.
find_similarity(doc, documents)

array(0., dtype=float32)

In [22]:
# Final check: `corpus` should again contain only the original eight
# sentences, with every temporary query removed by the `corpus.pop()` calls.
corpus

['This is a test sentence',
 'Oranges are my favorite fruit',
 "I'd like an apple",
 'An apple a day keeps the doctor away',
 'Obama speaks to the media in Illinois',
 'The president greets the press in Chicago',
 '50 new COVID-19 cases were reported in Singapore today',
 '3 theft cases were reported in Jurong West last week']