In [None]:
#3/8/2024 myDeepLearningAI course: Open Source Models with Hugging Face
#Lesson 4: Sentence Similarity

#infra: Trainbox + VSCode 
#      env: tc2-venv created by Blake
#      confirmed Python 3.10.4
#      numpy 1.24.2, pandas 2.0.0, added scikit-learn 1.2.2, req scipy
#      [not any more pip 22.0.4], ipykernel 6.22.0, ipython 8.12.0
#
#for NNLM added
#      tensorflow 2.15.0, tensorflow_hub 0.16.1
#for HuggingFace added
#      tqdm-4.66.2 huggingface-hub-0.21.1 [safetensors-0.4.2 tokenizers-0.15.2] transformers-4.38.1
#need a lower version of `urllib3` -> downgrade it with $ pip install requests==2.27.1 -> urllib3 1.26.18
#needs pip 24.0
#for HuggingFace Sentence Embeddings added
#      sentence-transformers-2.5.1

#history
#3/8/2024 SENTENCE SIMILARITY
#      Started with L4: Sentence Embeddings
#      $delta0 myFix and myExperimentation

#$network


#References
#DeepLearningAI course
# refer to https://learn.deeplearning.ai/courses/open-source-models-hugging-face


In [22]:
import os
import sys
import numpy as np

from sentence_transformers import SentenceTransformer
from sentence_transformers import util

from sklearn.metrics import pairwise_distances_argmin as distances_argmin

In [2]:
#$network settings

# CA bundle override, kept from the original note: needed to fix the SSL error.
os.environ['REQUESTS_CA_BUNDLE'] = '../../myCreds/all_ca_certs.crt'
# #os.environ['CURL_CA_BUNDLE'] = '../../myCreds/all_ca_certs.pem'

proxy = 'http://devproxy01.chq.ei:8080' #'http://proxy-chq.gtm.chq.ei:8080'

# Export the same proxy under every spelling (lower/upper case, http/https)
# in a single update instead of four separate assignments.
os.environ.update({
    'http_proxy': proxy,
    'HTTP_PROXY': proxy,
    'https_proxy': proxy,
    'HTTPS_PROXY': proxy,
})

In [24]:
# Report the Python version two ways.
# NOTE(review): `!python -V` runs whichever `python` the shell resolves, which is
# not guaranteed to be the interpreter backing this kernel - confirm both agree.
!python -V
# Version string of the interpreter that is actually executing this kernel.
print(sys.version)

Python 3.10.4
3.10.4 (tags/v3.10.4:9d38120, Mar 23 2022, 23:13:41) [MSC v.1929 64 bit (AMD64)]


# Lesson 4: Sentence Embeddings

- In the classroom, the libraries are already installed for you.
- If you would like to run this code on your own machine, you can install the following:
``` 
    !pip install sentence-transformers
```

- Here is some code that suppresses warning messages.

In [3]:
# from transformers.utils import logging
# logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


### Build the `sentence embedding` pipeline using the 🤗 `sentence-transformers` library

In [5]:
# Checkpoint used for the sentence embeddings (the course default is all-MiniLM-L6-v2).
MODEL_NAME = "all-MiniLM-L12-v2"
model = SentenceTransformer(MODEL_NAME)

More info on [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2), the course default. Note that this notebook loads the `all-MiniLM-L12-v2` variant instead.

In [6]:
# First group of sentences to embed.
sentences1 = [
    'The cat sits outside',
    'A man is playing guitar',
    'The movies are awesome',
]

In [7]:
# Encode the first group; convert_to_tensor=True asks for a tensor result.
embeddings1 = model.encode(
    sentences1,
    convert_to_tensor=True,
)
embeddings1

tensor([[ 0.1733, -0.0505, -0.0035,  ..., -0.0032, -0.0040,  0.0507],
        [-0.0059, -0.0690, -0.0538,  ..., -0.0683,  0.0289, -0.0066],
        [-0.0910, -0.0705,  0.0082,  ..., -0.0132,  0.1260, -0.0543]])

In [9]:
# Second group of sentences, listed in the same topical order as sentences1.
sentences2 = [
    'The dog plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
]

# Alternative ordering kept for experimentation:
# sentences2 = ['A woman watches TV',
#               'The new movie is so great',
#               'The dog plays in the garden']

In [10]:
# Encode the second group the same way as the first.
embeddings2 = model.encode(
    sentences2,
    convert_to_tensor=True,
)
embeddings2

* Calculate the cosine similarity between two sentences as a measure of how similar they are to each other.

In [13]:
# Cosine similarity between the two embedding sets; the loop that follows
# reads entry [i][i] as the score for sentences1[i] vs sentences2[i].
cosine_scores = util.cos_sim(
    embeddings1,
    embeddings2,
)
print(cosine_scores)

In [15]:
#delta0: myValidate fails
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i],
                                                 sentences2[i],
                                                 cosine_scores[i][i]))

The cat sits outside 		 The dog plays in the garden 		 Score: 0.2733
A man is playing guitar 		 A woman watches TV 		 Score: -0.0123
The movies are awesome 		 The new movie is so great 		 Score: 0.6399


### myFix and myValidate
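delta0: instead of reading only the diagonal of the score matrix (which assumes `sentences1[i]` is paired with `sentences2[i]`), find each sentence's nearest neighbour with `argmin` over the pairwise distances.  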
Refer to prev DeepLearningAI course

In [16]:
# idx_doc_cosine = np.argmax(cosine_scores)
# idx_doc_cosine

tensor(8)

In [17]:
# For every sentence in sentences1, index of the closest sentence in sentences2.
# metric="cosine" keeps this nearest-neighbour lookup consistent with the
# cosine-similarity scores used elsewhere in this notebook (scikit-learn's
# default metric is Euclidean).
# The tensors are detached and copied to host memory as NumPy arrays first,
# because scikit-learn cannot consume tensors that live on a GPU.
idx_doc_distances = distances_argmin(
    embeddings1.detach().cpu().numpy(),
    embeddings2.detach().cpu().numpy(),
    metric="cosine",
)
idx_doc_distances

array([0, 0, 2], dtype=int64)

In [18]:
# Show each sentence from sentences1 next to its nearest neighbour in sentences2.
for i, query_sentence in enumerate(sentences1):
    nearest = idx_doc_distances[i]
    print(f"{query_sentence} \t\t {sentences2[nearest]} \t\t Min distance to: {nearest}")

The cat sits outside 		 The dog plays in the garden 		 Min distance to: 0
A man is playing guitar 		 The dog plays in the garden 		 Min distance to: 0
The movies are awesome 		 The new movie is so great 		 Min distance to: 2


In [19]:
# Reverse lookup: for every sentence in sentences2, index of the closest
# sentence in sentences1.
# metric="cosine" keeps this consistent with the cosine-similarity scores used
# elsewhere in this notebook (scikit-learn's default metric is Euclidean).
# The tensors are detached and copied to host memory as NumPy arrays first,
# because scikit-learn cannot consume tensors that live on a GPU.
idx_doc_distances2 = distances_argmin(
    embeddings2.detach().cpu().numpy(),
    embeddings1.detach().cpu().numpy(),
    metric="cosine",
)
idx_doc_distances2

array([0, 0, 2], dtype=int64)

In [20]:
# Show each sentence from sentences2 next to its nearest neighbour in sentences1.
for i, query_sentence in enumerate(sentences2):
    nearest = idx_doc_distances2[i]
    print(f"{query_sentence} \t\t {sentences1[nearest]} \t\t Min distance to: {nearest}")

The dog plays in the garden 		 The cat sits outside 		 Min distance to: 0
A woman watches TV 		 The cat sits outside 		 Min distance to: 0
The new movie is so great 		 The movies are awesome 		 Min distance to: 2
