In [5]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [43]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

sentences = [
    'The new book is awesome',
    'This recent novel is so good',
]
model = SentenceTransformer('Supabase/gte-small')



In [44]:
embeddings = model.encode(sentences)
print(cos_sim(embeddings[0], embeddings[1]))

tensor([[0.8366]])


In [45]:
embeddings1 = model.encode('The new book is awesome')
embeddings2 = model.encode('I like flamenco dancing')
cos_sim(embeddings1, embeddings2)

tensor([[0.7523]])

In [40]:
# Two lists of sentences
sentences1 = [
    "The cat sits outside",
    "A man is playing guitar",
    "The book is awesome",
]

sentences2 = [
    "The dog sits in the garden",
    "A woman is playing piano",
    "The food is awesome",
]

# Compute embedding for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Compute cosine-similarities
cosine_scores = cos_sim(embeddings1, embeddings2)

# Output the pairs with their score
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(
        sentences1[i], sentences2[i], cosine_scores[i][i]
    ))

The cat sits outside 		 The dog sits in the garden 		 Score: 0.4350
A man is playing guitar 		 A woman is playing piano 		 Score: 0.2508
The book is awesome 		 The food is awesome 		 Score: 0.4507


In [10]:
# Our sentences to encode
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog."
]

# Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

# Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-4.56867635e-01 -6.03740886e-02  2.77014356e-02 -1.49324030e-01
 -2.58537903e-02  3.99035126e-01 -1.55861266e-02  2.39103317e-01
  9.95153934e-02  1.49657920e-01 -3.45251858e-01 -4.33488786e-01
  6.84537232e-01  2.49792591e-01  3.92541915e-01  3.05619121e-01
 -2.38010854e-01  3.97296131e-01 -4.60436165e-01 -1.37540340e-01
  5.90817809e-01 -2.84304172e-01  1.05978139e-01 -5.92266202e-01
 -1.59350365e-01  4.13091660e-01 -1.64931849e-01 -7.34148696e-02
 -3.01011473e-01 -1.89854705e+00  2.36649923e-02 -5.51725984e-01
  7.99842477e-01 -4.33840714e-02 -2.60188192e-01 -1.74996063e-01
 -4.91537303e-01  4.09644276e-01 -1.80870801e-01  2.30171114e-01
  2.36194938e-01  2.71462679e-01  2.17982121e-02 -6.09191716e-01
 -2.04823956e-01 -5.56082964e-01 -6.08014047e-01  7.77903624e-05
 -8.24695081e-02 -2.05188334e-01 -7.09772184e-02 -4.21118766e-01
 -9.76334512e-02  8.62645656e-02  2.12224036e-01  1.12527385e-01
  2.59943

In [32]:
from sentence_transformers.util import semantic_search

docs = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey.",
]

docs_embeddings = model.encode(docs, convert_to_tensor=True)

query = "tell me about animal"
query_embedding = model.encode(query, convert_to_tensor=True)

hits = semantic_search(query_embedding, docs_embeddings, top_k=2)
hits

for hit in hits[0]:
    print(docs[hit['corpus_id']], "(Score: %.4f)" % hit['score'])

A man is riding a horse. (Score: 0.2982)
A cheetah is running behind its prey. (Score: 0.2692)


In [41]:
from transformers import PreTrainedTokenizerFast

def split_large_text(large_text, max_tokens):
    # Initialize the tokenizer with a pre-trained model
    tokenizer = PreTrainedTokenizerFast.from_pretrained("gpt2")

    # Encode the large text into tokens
    tokenized_text = tokenizer.encode(large_text)

    chunks = []
    current_chunk = []
    current_length = 0

    for token in tokenized_text:
        current_chunk.append(token)
        current_length += 1

        if current_length >= max_tokens:
            # Decode the tokens to text and strip trailing punctuation
            chunks.append(tokenizer.decode(current_chunk).rstrip(' .,;'))
            current_chunk = []
            current_length = 0

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(tokenizer.decode(current_chunk).rstrip(' .,;'))

    return chunks

# Example usage
large_text = "William Shakespeare was an English playwright, poet, and actor. He is widely regarded as the greatest writer in the English language and the world's pre-eminent dramatist."
max_tokens = 50
print(split_large_text(large_text, max_tokens))


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


["William Shakespeare was an English playwright, poet, and actor. He is widely regarded as the greatest writer in the English language and the world's pre-eminent dramatist"]


Why use tokens?

> By breaking words into smaller parts (tokens), LLMs can better handle new or unusual words by understanding their building blocks. It also helps the model grasp the nuances of language, such as different word forms and contextual meanings.

[source](https://kelvin.legal/understanding-large-language-models-words-versus-tokens/#:~:text=By%20breaking%20words%20into%20smaller,word%20forms%20and%20contextual%20meanings.)

In [21]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("gpt2")
sent = "If we split a text by number of characters, it is not obvious how many tokens these chunks will be."

# Tokenize and count tokens
encoded = tokenizer.encode(sent)
print("Number of tokens:", len(encoded))

# Decode tokens to see what they represent
tokens = [tokenizer.decode([token]) for token in encoded]
print("Tokens:", tokens)

# Decode and count words in the decoded text
decoded = tokenizer.decode(encoded)
print("Decoded text:", decoded)
print("Number of words in decoded text:", len(decoded.split()))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


Number of tokens: 22
Tokens: ['If', ' we', ' split', ' a', ' text', ' by', ' number', ' of', ' characters', ',', ' it', ' is', ' not', ' obvious', ' how', ' many', ' tokens', ' these', ' chunks', ' will', ' be', '.']
Decoded text: If we split a text by number of characters, it is not obvious how many tokens these chunks will be.
Number of words in decoded text: 20


In [19]:
doc = """If we split a text by number of characters, it is not obvious how many tokens these chunks will be.
And at the same time if we want to split a text into bigger possible chunks and keep these chunks under certain LLM tokens limit, we cannot operate by number of characters."""
split_large_text(doc, 10)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


['If we split a text by number of characters',
 ' it is not obvious how many tokens these chunks will',
 ' be.\nAnd at the same time if we',
 ' want to split a text into bigger possible chunks and',
 ' keep these chunks under certain LLM tokens limit',
 ' we cannot operate by number of characters']

In [42]:
doc = """William Shakespeare was an English playwright, poet, and actor. He is widely regarded as the greatest writer in the English language and the world's pre-eminent dramatist."""
split_large_text(doc, 10)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


['William Shakespeare was an English playwright, poet',
 ' and actor. He is widely regarded as the greatest',
 " writer in the English language and the world's pre",
 '-eminent dramatist']

In [27]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # Example of a different model

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]