In [1]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np

In [2]:
# Toy corpus: six short English sentences that the later cells tokenize and embed.
sentences = [
    "I ate dinner.",
    "We had a three-course meal.",
    "Brad came to dinner with us.",
    "He loves fish tacos.",
    "In the end, we all felt like we ate too much.",
    "We all agreed; it was a magnificent evening.",
]
print("Sentence : ", sentences)

Sentence :  ['I ate dinner.', 'We had a three-course meal.', 'Brad came to dinner with us.', 'He loves fish tacos.', 'In the end, we all felt like we ate too much.', 'We all agreed; it was a magnificent evening.']


In [3]:
# Tokenize each document: lower-case the sentence, then split it into tokens.
# NOTE(review): word_tokenize relies on NLTK tokenizer data being available;
# no nltk.download(...) call is visible in this notebook -- confirm on a fresh environment.
tokenized_sent = [word_tokenize(sentence.lower()) for sentence in sentences]

In [4]:
#Install Gensim (Optional)
#!python -m pip install -U gensim

In [5]:
!pip install gensim



In [6]:
# Wrap every tokenized sentence in a TaggedDocument whose single tag is the
# sentence's position in the corpus.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tagged_data = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(tokenized_sent)
]
tagged_data

[TaggedDocument(words=['i', 'ate', 'dinner', '.'], tags=[0]),
 TaggedDocument(words=['we', 'had', 'a', 'three-course', 'meal', '.'], tags=[1]),
 TaggedDocument(words=['brad', 'came', 'to', 'dinner', 'with', 'us', '.'], tags=[2]),
 TaggedDocument(words=['he', 'loves', 'fish', 'tacos', '.'], tags=[3]),
 TaggedDocument(words=['in', 'the', 'end', ',', 'we', 'all', 'felt', 'like', 'we', 'ate', 'too', 'much', '.'], tags=[4]),
 TaggedDocument(words=['we', 'all', 'agreed', ';', 'it', 'was', 'a', 'magnificent', 'evening', '.'], tags=[5])]

In [7]:
## Train doc2vec model
# vector_size = Dimensionality of the feature vectors.
# window      = The maximum distance between the current and predicted word within a sentence.
# min_count   = Ignores all words with total frequency lower than this.
# epochs      = Number of training passes over the corpus.
# (alpha, the initial learning rate, is not passed and keeps its default.)
model = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, epochs=100)

## Show model vocabulary
# Gensim 4.0 removed `wv.vocab` in favour of `wv.key_to_index`; use whichever
# attribute the installed version provides so the cell runs on both.
vocabulary = model.wv.key_to_index if hasattr(model.wv, "key_to_index") else model.wv.vocab
vocabulary

{'i': <gensim.models.keyedvectors.Vocab at 0x14aec1f71c0>,
 'ate': <gensim.models.keyedvectors.Vocab at 0x14aebbcbb20>,
 'dinner': <gensim.models.keyedvectors.Vocab at 0x14aebbcbcd0>,
 '.': <gensim.models.keyedvectors.Vocab at 0x14aebbcbf70>,
 'we': <gensim.models.keyedvectors.Vocab at 0x14aebbcbfa0>,
 'had': <gensim.models.keyedvectors.Vocab at 0x14aebbcbc40>,
 'a': <gensim.models.keyedvectors.Vocab at 0x14aed26b070>,
 'three-course': <gensim.models.keyedvectors.Vocab at 0x14aed26b0d0>,
 'meal': <gensim.models.keyedvectors.Vocab at 0x14aed26b130>,
 'brad': <gensim.models.keyedvectors.Vocab at 0x14aed26b190>,
 'came': <gensim.models.keyedvectors.Vocab at 0x14aed26b1f0>,
 'to': <gensim.models.keyedvectors.Vocab at 0x14aed26b250>,
 'with': <gensim.models.keyedvectors.Vocab at 0x14aed26b2b0>,
 'us': <gensim.models.keyedvectors.Vocab at 0x14aed26b310>,
 'he': <gensim.models.keyedvectors.Vocab at 0x14aed26b370>,
 'loves': <gensim.models.keyedvectors.Vocab at 0x14aed26b3d0>,
 'fish': <gensim

In [8]:
print("Similirity: ")
# Infer a vector for an unseen sentence, tokenized the same way as the training corpus.
# NOTE: `model` must still be the Doc2Vec model here; a later cell rebinds the
# name to a TF-Hub module, so this cell fails if re-run after that one.
test_doc = word_tokenize("I had pizza and pasta".lower())
test_doc_vector = model.infer_vector(test_doc)
# Gensim 4.0 renamed `docvecs` to `dv` (the old name is deprecated); use
# whichever attribute the installed version provides.
doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs
# Rank the training documents (by tag) by similarity to the inferred vector.
doc_vectors.most_similar(positive=[test_doc_vector])


Similirity: 


[(3, 0.6410925388336182),
 (0, 0.33706802129745483),
 (4, 0.254173219203949),
 (2, 0.20233546197414398),
 (5, 0.09914921224117279),
 (1, 0.051740117371082306)]

In [None]:
conda install pytorch torchvision cudatoolkit=10.2 -c pytorch

In [8]:
!pip install sentence-transformers



In [9]:
from sentence_transformers import SentenceTransformer

# Name of the Sentence-Transformers model to load.
SBERT_MODEL_NAME = 'bert-base-nli-mean-tokens'
sbert_model = SentenceTransformer(SBERT_MODEL_NAME)

In [10]:
# Embed the whole corpus with the SBERT model in a single encode() call.
sentence_embeddings = sbert_model.encode(sentences)

# Optional inspection of the first embedding (kept disabled):
#print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))
#print('Sample BERT embedding vector - note includes negative values', sentence_embeddings[0])


In [11]:
# Encode the query as a one-element batch and keep its single embedding.
query = "I had pizza and pasta"
query_batch = sbert_model.encode([query])
query_vec = query_batch[0]

In [12]:
def cosine(u, v):
    """Return the cosine similarity of vectors ``u`` and ``v``.

    Computed as the dot product of the two vectors divided by the product
    of their norms (``np.linalg.norm`` with its default settings).
    There is no guard for zero-length vectors: the denominator is 0 then.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    dot_product = np.dot(u, v)
    return dot_product / norm_product

In [13]:
# Score every corpus sentence against the query. The corpus embeddings were
# already computed in one batch (`sentence_embeddings`), so reuse them instead
# of re-encoding each sentence inside the loop.
# NOTE: `sentence_embeddings` must still hold the SBERT embeddings here; a later
# cell rebinds that name. Batch encoding may differ from per-sentence encoding
# in the last float digits.
for sent, sent_vec in zip(sentences, sentence_embeddings):
    sim = cosine(query_vec, sent_vec)
    print("Sentence = ", sent, "; similarity = ", sim)

Sentence =  I ate dinner. ; similarity =  0.71734613
Sentence =  We had a three-course meal. ; similarity =  0.637134
Sentence =  Brad came to dinner with us. ; similarity =  0.5897909
Sentence =  He loves fish tacos. ; similarity =  0.62239355
Sentence =  In the end, we all felt like we ate too much. ; similarity =  0.41980496
Sentence =  We all agreed; it was a magnificent evening. ; similarity =  0.18081596


In [14]:
# Install TF-Hub.
!pip install tensorflow-hub



In [14]:
import tensorflow as tf
import tensorflow_hub as hub

In [16]:
#"https://tfhub.dev/google/universal-sentence-encoder/1"
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Load from `module_url` so the URL that is loaded and the URL reported below
# cannot drift apart.
# NOTE: this rebinds `model`, which earlier cells use for the Doc2Vec model;
# re-running those cells after this one will fail.
model = hub.load(module_url)
print ("module %s loaded" % module_url)

INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 170.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 330.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 490.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 650.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 810.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 960.00MB
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder/4, Total size: 987.47MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.


module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [17]:
# Embed the corpus and the query with the TF-Hub module loaded as `model`.
query = "I had pizza and pasta"
sentence_embeddings = model(sentences)
# The query goes in as a one-element batch; keep its single embedding.
query_batch = model([query])
query_vec = query_batch[0]

In [18]:
# Score every corpus sentence against the query. `sentence_embeddings` was
# computed for the whole corpus in the previous cell (together with
# `query_vec`), so reuse it instead of calling the model once per sentence.
# Batch results may differ from per-sentence results in the last float digits.
for sent, sent_vec in zip(sentences, sentence_embeddings):
    sim = cosine(query_vec, sent_vec)
    print("Sentence = ", sent, "; similarity = ", sim)

Sentence =  I ate dinner. ; similarity =  0.4686642
Sentence =  We had a three-course meal. ; similarity =  0.35643065
Sentence =  Brad came to dinner with us. ; similarity =  0.20338944
Sentence =  He loves fish tacos. ; similarity =  0.16515438
Sentence =  In the end, we all felt like we ate too much. ; similarity =  0.14987424
Sentence =  We all agreed; it was a magnificent evening. ; similarity =  0.05843591
