# Generating Semantic Embeddings using Hugging Face's Sentence Transformers library (The Easy Way)

In [1]:
! pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-4.0.1-py3-none-any.whl.metadata (13 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.15.2-cp312-cp312-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting Pillow (from sentence-transformers)
  Using cached pillow-11.1.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.1 kB)
Collecting joblib>=1.2.0 (from scikit-learn->sentence-transformers)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence-transformers)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.0.1-py3-none-any.whl (340 kB)
Using cached pillow-11.1.0-cp312-cp312-macosx_11_0_arm64.whl (3.1 MB)
Using cached scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl (11.2 MB)
Using cached 

In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


# Representing Sentences into Dense vectors

## Here we are simply using the SentenceTransformers module
- The `SentenceTransformer` class provides access to pre-trained models for generating sentence embeddings
- The `SentenceTransformer` class is built on top of Hugging Face's Transformers library and PyTorch

### About the model Chosen here
- all-MiniLM-L6-v2 is a small, efficient BERT-like model
- L6 in the name implies it has 6 layers, and v2 implies this is version 2
- It has been optimized for speed and memory, making it a good choice for generating embeddings
- It maps sentences to 384-dimensional embeddings, i.e. numerical representations of the input sentences


In [6]:

# Instantiate the pre-trained sentence-embedding model, identified by its model name
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')


## Defining the document corpus
- The documents defined below serve as the corpus that the Retriever module would sift through


In [7]:

# Document corpus: three medical statements followed by five unrelated trivia facts
sentences = [
    # medical statements
    "Metformin is a medication used to treat type 2 diabetes.",
    "Common side effects include nausea, upset stomach, and diarrhea.",
    "Insulin helps control blood sugar levels.",
    # unrelated trivia
    "Octopuses have three hearts and blue blood.",
    "Bananas are berries, but strawberries are not.",
    "The Eiffel Tower can grow more than 6 inches in summer due to heat expansion.",
    "Honey never spoils and has been found edible in ancient Egyptian tombs.",
    "Sharks have been around longer than trees, existing for over 400 million years.",
]

## Encoding the document corpus
- The document corpus is encoded using the model held in the `model` variable
- This converts each document in the corpus into a vector representation, as determined by the model
- Although each output vector has 384 dimensions, we look at only the first 5 dimensions of each vector here

In [8]:
## Embed the corpus with the loaded model

# A single call encodes every sentence; the result is indexed as (sentence, dimension)
embeddings = model.encode(sentences)
print(f"embeddings shape -> (num_sentences, embedding_dimensions) : {embeddings.shape}")
# Preview only the leading five dimensions of each sentence vector
embeddings[:, 0:5]

embeddings shape -> (num_sentences, embedding_dimensions) : (8, 384)


array([[-0.06462408, -0.01414089, -0.04102236,  0.05251157, -0.07824506],
       [ 0.02023205, -0.05939261,  0.00395693,  0.02894642, -0.00524463],
       [-0.07433561,  0.1275397 , -0.06659397,  0.06253764, -0.00612355],
       [ 0.01835446,  0.01416124, -0.01720773,  0.04222286, -0.07529111],
       [ 0.016516  , -0.0464867 ,  0.00407454,  0.01632662,  0.01158659],
       [ 0.02896879,  0.00143153,  0.01369293,  0.04595578,  0.00489472],
       [-0.04607044,  0.0279206 ,  0.04649594, -0.00482514, -0.01890971],
       [-0.01410103,  0.02397047,  0.05505345,  0.05128846,  0.00436963]],
      dtype=float32)

## Output Embeddings Shape

In [24]:
## Each of the sentences has been transformed into a dense vector (one row of `embeddings`)
# The f-prefix is required so that {embeddings.shape} is interpolated rather than printed literally
print(f"Shape of dense vector representation of 8 sentences : {embeddings.shape}")
print()
## First 10 dimensions of the dense representation of sentences 1 and 2.
print(embeddings[0][:10])
print(embeddings[1][:10])

Shape of dense vector representation of 8 sentences : {embeddings.shape}

[-0.06462408 -0.01414089 -0.04102236  0.05251157 -0.07824506  0.00645232
  0.01479468  0.10869684  0.02307448 -0.05817275]
[ 0.02023205 -0.05939261  0.00395693  0.02894642 -0.00524463 -0.0390905
  0.04136479  0.08110133 -0.03313445 -0.04686553]


## Experimenting with Semantic Similarity
- Each sentence is now represented by a multi-dimensional embedding vector
- We check the similarity scores between the embedding vectors of pairs of sentences

In [25]:
## Similarity between the Metformin sentence (index 0) and the Insulin sentence (index 2)
print(sentences[0])
print(sentences[2])
# The dot product of the two embedding vectors is used as the similarity score.
# NOTE(review): this equals cosine similarity only if the vectors are unit-length — confirm the model normalizes its output.
print(f"similarity score : {embeddings[0].dot(embeddings[2])}")

Metformin is a medication used to treat type 2 diabetes.
Insulin helps control blood sugar levels.
similarity score : 0.3995707333087921


### Measuring the Semantic Similarities between embedded sentences

In [6]:
## Similarity between the Metformin sentence (index 0) and the Octopus sentence (index 3)
print(sentences[0])
print(sentences[3])
# The dot product of the two embedding vectors is used as the similarity score.
# NOTE(review): this equals cosine similarity only if the vectors are unit-length — confirm the model normalizes its output.
print(f"similarity score : {embeddings[0].dot(embeddings[3])}")

Metformin is a medication used to treat type 2 diabetes.
Octopuses have three hearts and blue blood.
similarity score : -0.016284015029668808


## Alternatively, use the cosine similarity metric from `sklearn.metrics.pairwise` to measure how similar the vectors are

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
# Cosine similarity between the first and second sentences.
# Each vector is passed as a one-row 2-D slice, so the result is a 1x1 matrix.
sim = cosine_similarity(embeddings[0:1], embeddings[1:2])
print(f"Similarity Score: {sim[0, 0]:.4f}")

Similarity Score: 0.0351


# (OPTIONAL) Generating Semantic Embedding using PyTorch (the hard way)

### Closer Look at the inner workings of SentenceTransformer Class

In [10]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np

## Fetching the model/tokenizer

### Tokenizer
- AutoTokenizer() helps you load the given tokenizer object for a specific model
- A tokenizer in LLMs breaks down raw text into smaller units called tokens (e.g., words, subwords, or characters), which the model can understand and process. 
- It also handles the reverse operation, converting generated tokens back into human-readable text.

### Model
- The `model` object holds a pre-trained neural network loaded from the Hugging Face Transformers library, based on the specified `model_name`.
- It provides the architecture and weights needed to assemble the model in memory and generate the embeddings.

In [11]:
## Load the MiniLM checkpoint and its matching tokenizer from the same model name

model_name = 'sentence-transformers/all-MiniLM-L6-v2'
# NOTE: `model` is rebound here, replacing the SentenceTransformer object created earlier in the notebook
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


### Tokenizer object
- Explains the model name
- Vocabulary size
- Special tokens
- Padding process

In [12]:
# Display the tokenizer's repr to inspect its configuration
tokenizer

BertTokenizerFast(name_or_path='sentence-transformers/all-MiniLM-L6-v2', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

### Model Object

- Describes the structure of the model loaded
- The model structure; in this case it's an encoder-only model
- It also describes the output dimensions of each layer
- Shows the Attention Mechanism - with the Key, Query and Value matrices


In [13]:
# Display the model's repr to inspect its layer structure
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [14]:
# Re-display the corpus that is tokenized in the following cell
sentences

['Metformin is a medication used to treat type 2 diabetes.',
 'Common side effects include nausea, upset stomach, and diarrhea.',
 'Insulin helps control blood sugar levels.',
 'Octopuses have three hearts and blue blood.',
 'Bananas are berries, but strawberries are not.',
 'The Eiffel Tower can grow more than 6 inches in summer due to heat expansion.',
 'Honey never spoils and has been found edible in ancient Egyptian tombs.',
 'Sharks have been around longer than trees, existing for over 400 million years.']

### Raw input -> Tokenized Input
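The tokenizer:
- Converts each sentence into token ids, pads the shorter sentences to the length of the longest one in the batch, and returns the result as PyTorch tensors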

In [15]:
# Tokenize the whole corpus as a single batch

inputs = tokenizer(
    sentences,
    padding=True,         # pad shorter sentences so every row has the same length
    truncation=True,      # cut off inputs that exceed the maximum length
    return_tensors="pt",  # return PyTorch tensors
)

# Token ids, one row per sentence
inputs["input_ids"]

tensor([[  101,  2777, 14192,  2378,  2003,  1037, 14667,  2109,  2000,  7438,
          2828,  1016, 14671,  1012,   102,     0,     0,     0,     0,     0],
        [  101,  2691,  2217,  3896,  2421, 19029,  1010,  6314,  4308,  1010,
          1998, 22939, 12171, 20192,  1012,   102,     0,     0,     0,     0],
        [  101, 22597,  7126,  2491,  2668,  5699,  3798,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101, 24318,  2229,  2031,  2093,  8072,  1998,  2630,  2668,  1012,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101, 26191,  2024, 22681,  1010,  2021, 13137, 20968,  2024,  2025,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1996,  1041, 13355,  2884,  3578,  2064,  4982,  2062,  2084,
          1020,  5282,  1999,  2621,  2349,  2000,  3684,  4935,  1012,   102],
        [  101,  6861,  2196, 27594,  2015,  1

In [15]:

## Forward pass of the tokenized batch through the loaded model

# Gradient tracking is switched off because this is inference only
with torch.no_grad():
    outputs = model(**inputs)

# Display the full model output
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-1.1995e-01, -3.3542e-01, -2.0211e-01,  ..., -2.8208e-01,
          -3.1256e-01, -1.3458e-01],
         [-8.2270e-01, -1.3833e-01,  1.6015e-01,  ..., -4.8873e-01,
          -9.3062e-01,  7.0992e-01],
         [-6.6834e-01, -5.9575e-03,  2.7987e-01,  ...,  5.8347e-01,
           5.6502e-01, -1.6965e-01],
         ...,
         [-6.4299e-01, -3.9824e-01,  2.2439e-02,  ..., -5.1957e-01,
           4.2425e-01, -1.9175e-01],
         [-6.3898e-01, -4.1930e-01,  4.4994e-02,  ..., -4.9156e-01,
           4.2146e-01, -2.5802e-01],
         [-6.2997e-01, -4.1032e-01,  5.0820e-02,  ..., -4.8530e-01,
           3.7536e-01, -2.7483e-01]],

        [[ 4.5147e-01,  4.4765e-02,  1.4672e-01,  ...,  6.1254e-02,
          -4.0575e-01, -1.1386e-01],
         [ 3.8334e-01, -3.6245e-01,  5.4244e-01,  ...,  4.6947e-01,
          -1.1225e+00,  8.0497e-01],
         [-8.2962e-01, -1.2266e-02,  2.8013e-01,  ...,  6.9566e-01,
          -1.

In [None]:
## Shape of the last hidden state returned by the model:
## one 384-dimensional vector per token position, for each of the 8 sentences padded to 20 tokens

outputs.last_hidden_state.shape

torch.Size([8, 20, 384])