## Embedding Techniques

### Azure OpenAI Embeddings

In [59]:
import os
from dotenv import load_dotenv
load_dotenv(override=True)

True

In [61]:
from langchain_openai import AzureOpenAIEmbeddings
api_version = "2024-12-01-preview" 

embeddings = AzureOpenAIEmbeddings(
    api_key = os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
    model = os.getenv("AZURE_OPENAI_EMBEDDINGS_MODEL"),
    api_version = api_version
)

embeddings

AzureOpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x7c29b7c52a10>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x7c29b8058350>, model='text-embedding-3-large', dimensions=None, deployment=None, openai_api_version='2024-12-01-preview', openai_api_base=None, openai_api_type='azure', openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=2048, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True, azure_endpoint='https://pfs-2-namit-resource.cognitiveservices.azure.com/', azure_ad_token=None, azure_ad_token_provider=None, azure_ad_async_token_provider=None, valida

In [62]:
text = "This is a tutorial on open ai embeddings."

In [63]:
query_result = embeddings.embed_query(text)

In [64]:
query_result

[-0.006408915389329195,
 0.014243429526686668,
 -0.015311058610677719,
 -0.03974092751741409,
 0.03461630642414093,
 -0.015398981049656868,
 0.008101421408355236,
 0.06415823847055435,
 0.0025528897531330585,
 0.014318792149424553,
 -0.009263253770768642,
 0.019719740375876427,
 0.008365188725292683,
 -0.01408014539629221,
 0.010519287548959255,
 0.012415899895131588,
 0.02204340323805809,
 0.029039515182375908,
 -0.01984534226357937,
 -0.028260773047804832,
 0.002168229315429926,
 0.024316824972629547,
 -0.047603704035282135,
 -0.021666591987013817,
 0.04350902885198593,
 -0.0037963639479130507,
 0.012629425153136253,
 0.03617379069328308,
 -0.011838124133646488,
 0.050919633358716965,
 0.004198295064270496,
 -0.030772842466831207,
 0.005212542600929737,
 -0.011260348372161388,
 -0.011379671283066273,
 0.017973851412534714,
 0.04627230763435364,
 0.03991677239537239,
 -0.03838441148400307,
 0.02341248095035553,
 0.029039515182375908,
 0.0036896008532494307,
 0.007592727895826101,
 -0.

In [65]:
embeddings_1024 = AzureOpenAIEmbeddings(
    api_key = os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
    model = os.getenv("AZURE_OPENAI_EMBEDDINGS_MODEL"),
    api_version = api_version,
    dimensions = 1024
)

In [66]:
embeddings_1024

AzureOpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x7c29b7b5d390>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x7c29b7c07250>, model='text-embedding-3-large', dimensions=1024, deployment=None, openai_api_version='2024-12-01-preview', openai_api_base=None, openai_api_type='azure', openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=2048, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True, azure_endpoint='https://pfs-2-namit-resource.cognitiveservices.azure.com/', azure_ad_token=None, azure_ad_token_provider=None, azure_ad_async_token_provider=None, valida

In [69]:
text = "This is a tutorial on open ai embeddings."
query_result = embeddings_1024.embed_query(text)
len(query_result)

1024

In [70]:
from langchain_community.document_loaders import TextLoader
loader = TextLoader("data/speech.txt")
documents = loader.load()
documents



[Document(metadata={'source': 'data/speech.txt'}, page_content='Down through the ages, a traditional form has evolved for this type of speech, which is: Some old fart, his best years behind him, who, over the course of his life, has made a series of dreadful mistakes (that would be me), gives heartfelt advice to a group of shining, energetic young people, with all of their best years ahead of them (that would be you).\nAnd I intend to respect that tradition.\nNow, one useful thing you can do with an old person, in addition to borrowing money from them, or asking them to do one of their old-time “dances,” so you can watch, while laughing, is ask: “Looking back, what do you regret?” And they’ll tell you. Sometimes, as you know, they’ll tell you even if you haven’t asked. Sometimes, even when you’ve specifically requested they not tell you, they’ll tell you.\nSo: What do I regret? Being poor from time to time? Not really. Working terrible jobs, like “knuckle-puller in a slaughterhouse?” (

In [73]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
speech_docs = text_splitter.split_documents(documents)
print(len(speech_docs))
print(speech_docs)

35
[Document(metadata={'source': 'data/speech.txt'}, page_content='Down through the ages, a traditional form has evolved for this type of speech, which is: Some old'), Document(metadata={'source': 'data/speech.txt'}, page_content='Some old fart, his best years behind him, who, over the course of his life, has made a series of'), Document(metadata={'source': 'data/speech.txt'}, page_content='series of dreadful mistakes (that would be me), gives heartfelt advice to a group of shining,'), Document(metadata={'source': 'data/speech.txt'}, page_content='shining, energetic young people, with all of their best years ahead of them (that would be you).'), Document(metadata={'source': 'data/speech.txt'}, page_content='And I intend to respect that tradition.'), Document(metadata={'source': 'data/speech.txt'}, page_content='Now, one useful thing you can do with an old person, in addition to borrowing money from them, or'), Document(metadata={'source': 'data/speech.txt'}, page_content='them, or aski

In [75]:
## Vector Embedding and Vector Store
from langchain_community.vectorstores import Chroma

db = Chroma.from_documents(speech_docs, embeddings_1024)
db

<langchain_community.vectorstores.chroma.Chroma at 0x7c296cf52490>

In [76]:
query = "It will be easier for us to create AI tools ourselves."
retrieved_results = db.similarity_search(query)
print(retrieved_results)

[Document(metadata={'source': 'data/speech.txt'}, page_content='shining, energetic young people, with all of their best years ahead of them (that would be you).'), Document(metadata={'source': 'data/speech.txt'}, page_content='Now, one useful thing you can do with an old person, in addition to borrowing money from them, or'), Document(metadata={'source': 'data/speech.txt'}, page_content='her Convocation Speech name will be “ELLEN.” ELLEN was small, shy. She wore these blue cat’s-eye'), Document(metadata={'source': 'data/speech.txt'}, page_content='So she came to our school and our neighborhood, and was mostly ignored, occasionally teased (“Your')]


### Ollama Embeddings

In [81]:
# from langchain_community.embeddings import OllamaEmbeddings
# embeddings=(
#     OllamaEmbeddings(model="gemma:2b") ##by default it uses "llama2" model
# )
# '''
# /tmp/ipykernel_329451/793947940.py:2: LangChainDeprecationWarning: The class `OllamaEmbeddings` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the `langchain-ollama package and should be used instead. To use it run `pip install -U `langchain-ollama` and import as `from `langchain_ollama import OllamaEmbeddings``.
#   OllamaEmbeddings(model="gemma:2b") ##by default it uses "llama2" model
# '''

In [82]:
from langchain_ollama import OllamaEmbeddings
embeddings = OllamaEmbeddings(
    model="gemma:2b",
)

In [83]:
embeddings

OllamaEmbeddings(model='gemma:2b', validate_model_on_init=False, base_url=None, client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)

In [86]:
text1 = "LangChain is the framework for building context-aware reasoning applications"
text2 = "LangGraph is a library for building stateful, multi-actor applications with LLMs"
r1 = embeddings.embed_documents([text, text2])
for vector in r1:
    print(str(vector)[:100])  # Show the first 100 characters of the vector


[0.0011620543, -0.0026224295, -0.0044009765, 0.022861933, 0.010003764, 0.036049027, 0.020056384, -0.
[0.00021138022, -0.0030938762, -0.0142226, 0.0152740255, -0.0150553165, 0.016580734, -0.004265862, -


In [87]:
## Using other embedding models
embeddings = OllamaEmbeddings(
    model="mxbai-embed-large",
)
embeddings

OllamaEmbeddings(model='mxbai-embed-large', validate_model_on_init=False, base_url=None, client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)

In [88]:
text = "This is a tutorial on ollama embeddings."
query_result = embeddings.embed_query(text)
query_result

[0.017782321,
 0.04749889,
 -0.013258273,
 -0.04921094,
 -0.008285059,
 -0.017269589,
 0.00028594097,
 -0.012673728,
 0.020080056,
 0.03880519,
 0.021583961,
 0.045047544,
 -0.0056706956,
 -0.030093204,
 -0.021934351,
 0.030551564,
 0.0042492957,
 -0.052492738,
 -0.02089723,
 -0.01745632,
 -0.02409741,
 0.045853555,
 -0.08186457,
 -0.024691848,
 -0.0056548584,
 0.025099643,
 0.057197332,
 -0.0027595775,
 0.054540124,
 0.0025790278,
 -0.0009151481,
 0.044078488,
 0.022126893,
 -0.040617723,
 -0.029718963,
 0.0019937707,
 0.017111873,
 -0.024009135,
 -0.038171224,
 -0.069987446,
 0.0022069975,
 0.023203667,
 -0.006082978,
 -0.062461186,
 -0.058349416,
 0.012818924,
 0.021526402,
 0.011441472,
 0.02768869,
 -0.047209226,
 -0.020302722,
 0.0036874786,
 -0.012230972,
 -0.00058618776,
 -0.024118535,
 0.0052916016,
 -0.012191864,
 -0.029037053,
 -0.026185023,
 0.041533608,
 0.039216258,
 0.018894207,
 0.02217144,
 -0.057051208,
 -0.021352665,
 -0.013536608,
 -0.025461715,
 -0.012518517,
 -0.0

### HuggingFace Embeddings

In [1]:
import os
from dotenv import load_dotenv
load_dotenv(override=True)

True

In [2]:
os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
text = "This is a tutorial on huggingface embeddings."

In [7]:
query_result = embeddings.embed_query(text)
query_result

[-0.03889043256640434,
 0.008742008358240128,
 0.06382442265748978,
 0.0033765980042517185,
 0.026593487709760666,
 0.024661635980010033,
 -0.05931262671947479,
 -0.09228713810443878,
 0.004443374462425709,
 0.0035933381877839565,
 0.024500906467437744,
 -0.03133570775389671,
 0.020446116104722023,
 0.023291965946555138,
 0.05185587331652641,
 0.025981543585658073,
 -0.011212974786758423,
 0.07309852540493011,
 0.013340459205210209,
 0.0758344754576683,
 -0.0331185907125473,
 0.0022929594852030277,
 0.05722101405262947,
 -0.09554317593574524,
 -0.0021646087989211082,
 -0.003871443448588252,
 -0.011653517372906208,
 0.04401404783129692,
 0.13203072547912598,
 -0.015292440541088581,
 -0.00405794708058238,
 -0.03116501122713089,
 0.01572895050048828,
 0.04171823710203171,
 -0.06291922181844711,
 0.08282333612442017,
 0.03192957490682602,
 0.0695149153470993,
 -0.09001610428094864,
 0.01428124587982893,
 -0.04467559978365898,
 0.04082360118627548,
 0.018578728660941124,
 0.0125842122361063

In [8]:
len(query_result)

384