# Embedding Techniques

Converting text into numerical values before storing into vector store db.

## Using OpenAI Embeddings

In [1]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x0000023209A1FA90>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000002320D9DF5B0>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [2]:
# Embedding a simple text
text = 'This is a simple text'

## DO NOT OVERUSE - THIS COSTS MONEY
# embedded_text = embeddings.embed_query(text)
# print(len(embedded_text))
# embedded_text

In [3]:
# Loading text document
from langchain_community.document_loaders import TextLoader

loader = TextLoader('speech.txt')
text_doc = loader.load()

# Splitting text document into chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
splitted_doc = splitter.split_documents(text_doc)

# Storing into VectorStoreDB - ChromaDB
from langchain_community.vectorstores import Chroma

## DO NOT OVERUSE - THIS COSTS MONEY
db = Chroma.from_documents(splitted_doc, embeddings)
db

<langchain_community.vectorstores.chroma.Chroma at 0x2320da0fbe0>

In [7]:
# Retrieve the results from the VectorStoreDB
query = "It will be all the easier for us to conduct ourselves as belligerents"

retrieved_results = db.similarity_search(query)
print(retrieved_results)

[Document(metadata={'source': 'speech.txt'}, page_content='It will be all the easier for us to conduct ourselves as belligerents in a high spirit of right and'), Document(metadata={'source': 'speech.txt'}, page_content='I feel confident, conduct our operations as belligerents without passion and ourselves observe with'), Document(metadata={'source': 'speech.txt'}, page_content='of mutual advantage between us—however hard it may be for them, for the time being, to believe that'), Document(metadata={'source': 'speech.txt'}, page_content='itself seeming to be in the balance. But the right is more precious than peace, and we shall fight')]


## Using Ollama Embeddings

In [1]:
from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(model='llama3.2:1b')
embeddings

OllamaEmbeddings(model='llama3.2:1b', base_url=None, client_kwargs={})

In [None]:
# Embedding list of text
text = [
       "Alpha is the first letter of Greek alphabet",
       "Beta is the second letter of Greek alphabet", 
    ]

embedded_docs = embeddings.embed_documents(text)

In [4]:
print(len(embedded_docs[0]))
embedded_docs[0]

2048


[-0.032884322,
 0.030431315,
 0.005606013,
 0.0014641136,
 0.050535094,
 -0.021277025,
 0.022304945,
 0.026426895,
 -0.029974615,
 0.038029183,
 -0.008246146,
 -0.024843676,
 -0.023381358,
 -0.0027180528,
 0.007406909,
 -0.0017564617,
 -0.018635076,
 0.01852663,
 0.043445285,
 0.019317297,
 -0.02735308,
 0.04372299,
 -0.019815521,
 0.0027475506,
 -0.04316906,
 -0.017766688,
 -0.0455275,
 0.029741036,
 -0.015329412,
 0.003322708,
 0.0077727824,
 0.010929614,
 -0.008743195,
 -0.019020122,
 0.0019707894,
 -0.04260786,
 0.033750743,
 -0.0043189004,
 0.0105831055,
 -0.008788394,
 0.015161221,
 -0.014548537,
 -0.015192574,
 0.029846376,
 -0.0035916942,
 0.0065053203,
 0.00021519943,
 -0.013859558,
 0.013087324,
 0.0120737245,
 -0.04748478,
 0.024978964,
 0.06123962,
 0.019909116,
 -0.01602146,
 0.04180204,
 0.0076696966,
 -0.021368727,
 -0.0079453755,
 0.0140593415,
 0.0080106985,
 -0.015736608,
 -0.042874265,
 -0.013255417,
 0.030147284,
 -0.012908207,
 0.0018688006,
 0.04312493,
 -0.001252

In [5]:
# Embedding single query
embeddings.embed_query("What is the second letter of Greek alphabet ")

[-0.013549699,
 -0.010950653,
 -0.0072738114,
 0.006838297,
 0.032437455,
 -0.019672913,
 0.020828776,
 -0.026913185,
 -0.0119044995,
 0.021355018,
 -0.00096875324,
 -0.047891747,
 -0.0072904723,
 0.021239154,
 -0.04251853,
 0.009848854,
 -0.01067947,
 -0.01313511,
 0.01751153,
 -0.043227382,
 0.01212421,
 0.0037700522,
 -0.015605705,
 0.041068118,
 -0.015324943,
 -0.012731382,
 -0.028863216,
 -0.007618788,
 -0.014977826,
 0.0389591,
 -0.040334508,
 -0.042932022,
 -0.0009561201,
 -0.03532254,
 -0.007279429,
 0.0066067525,
 -0.065841615,
 -0.0023054786,
 0.015570996,
 0.013618227,
 -0.0013782785,
 0.003981841,
 0.014273454,
 0.024145968,
 -0.010093806,
 0.033994805,
 -0.025240732,
 0.03522851,
 -0.007446207,
 0.017236885,
 0.00044322715,
 0.06315477,
 -0.006197816,
 -0.023292359,
 -0.010258144,
 0.01894911,
 -0.030608665,
 -0.017564047,
 -0.0049244473,
 0.01400665,
 -0.023366114,
 -0.02051697,
 -0.014753231,
 0.020002816,
 0.01645938,
 -0.014318374,
 -0.011934868,
 0.018856838,
 -0.0366

## Using HuggingFace Embeddings

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [9]:
# Embedding a query
query = "This is sample text"
embedded_docs = embeddings.embed_query(query)

In [11]:
print(len(embedded_docs))
embedded_docs

384


[-0.0036133350804448128,
 0.08470533043146133,
 -0.038935743272304535,
 0.01812080107629299,
 0.0657837763428688,
 0.025117814540863037,
 0.06537902355194092,
 0.058730706572532654,
 0.05446251481771469,
 0.00033412041375413537,
 0.027775784954428673,
 -0.008672214113175869,
 0.00047145565622486174,
 -0.05506840720772743,
 0.020526710897684097,
 0.043319784104824066,
 0.05890136957168579,
 -0.024940600618720055,
 -0.03512798994779587,
 -0.010982196778059006,
 0.05797865241765976,
 0.06433533132076263,
 0.04225873947143555,
 0.015137149021029472,
 0.011003808118402958,
 0.06278208643198013,
 -0.017231229692697525,
 0.07714583724737167,
 0.12189143151044846,
 -0.02414359338581562,
 -0.023110447451472282,
 0.09137076139450073,
 0.1575256586074829,
 0.016054177656769753,
 0.0545060969889164,
 0.01843559369444847,
 -0.063252754509449,
 0.05616050958633423,
 0.032800666987895966,
 0.04736696556210518,
 -0.004562672693282366,
 -0.07927948981523514,
 0.006678008008748293,
 0.019421707838773727

In [12]:
# Embedding list of text
text = [
       "Alpha is the first letter of Greek alphabet",
       "Beta is the second letter of Greek alphabet", 
    ]

embedded_docs = embeddings.embed_documents(text)

In [14]:
print(len(embedded_docs[0]))
embedded_docs

384


[[0.004709018860012293,
  -0.05832066386938095,
  -0.05513669177889824,
  -0.013069421984255314,
  -0.04050260782241821,
  -0.013990375213325024,
  0.08047975599765778,
  -0.02164486236870289,
  0.009532303549349308,
  0.05417705699801445,
  0.06955375522375107,
  0.10983368754386902,
  -0.03820323571562767,
  0.02504148706793785,
  -0.05174914374947548,
  -0.04984031990170479,
  -0.07185733318328857,
  -0.00449318066239357,
  -0.01851734146475792,
  -0.040730785578489304,
  -0.04226188361644745,
  -0.004491632338613272,
  -9.7976706456393e-05,
  0.013725693337619305,
  0.003098088316619396,
  0.07191695272922516,
  -0.013512326404452324,
  -0.047566551715135574,
  0.039082545787096024,
  -0.045140087604522705,
  -0.0015503885224461555,
  -0.04356252774596214,
  0.16599102318286896,
  0.019635219126939774,
  -0.054768308997154236,
  0.05165570601820946,
  -0.037061356008052826,
  -0.06599248200654984,
  -0.026455171406269073,
  0.045900844037532806,
  -0.03778329864144325,
  -0.0267349