## Install required packages

In [21]:
!pip install langchain langchain_chroma langchain_community sentence-transformers langchain_cohere unstructured langchain_huggingface -q

## Import necessary packages

In [22]:
import os
import shutil
import warnings
warnings.filterwarnings('ignore')

# langchain utilities
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.evaluation import load_evaluator
from langchain_core.prompts import PromptTemplate

# embedding models
from langchain_cohere import CohereEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

# vector store
from langchain_chroma import Chroma

# chat models
from langchain_cohere import ChatCohere

# api key from secrets
from google.colab import userdata
os.environ["COHERE_API_KEY"] = userdata.get('COHERE_API_KEY')

## Load data from source into documents

In [62]:
# load data from the source
def load_docs(directory):
    loader = DirectoryLoader(directory, glob="*.txt")
    docs = loader.load()
    return docs

documents = load_docs("_data")

len(documents)

1

In [63]:
# split the documents furthur down to chunks
def split_docs(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 50,
        length_function = len,
        add_start_index = True
    )

    chunks = text_splitter.split_documents(documents)

    for chunk in chunks:
        chunk_id = chunk.metadata["start_index"]
        chunk.metadata["id"] = f"{chunk.metadata['source']}_{chunk_id}"

    return chunks

chunks = split_docs(documents)

len(chunks)

1128

In [64]:
doc = chunks[1]
print(doc.page_content)
print(doc.metadata)

the air, descending slowly in wide circles down towards the earth. Even as he gazed his quick ears caught sounds in the woodlands below, on the west side of the River. He stiffened. There were cries, and among them, to his horror, he could distinguish the harsh voices of Orcs. Then suddenly with a deep-throated call a great horn blew, and the blasts of it smote the hills and echoed in the hollows, rising in a mighty shout above the roaring of the falls. "The horn of Boromir!" he cried. "He is in need!"
{'source': '_data/tt.txt', 'start_index': 948, 'id': '_data/tt.txt_948'}


In [57]:
DB_PATH = "chroma"

In [56]:
# clear out the database
if os.path.exists(DB_PATH):
    shutil.rmtree(DB_PATH)

## Create embeddings

In [None]:
embedding_function = CohereEmbeddings(model="embed-english-light-v3.0")

vector = embedding_function.embed_query("How are you?")

print(f"This is a {len(vector)} dimensional vector")

print(vector)

This is a 384 dimensional vector
[0.04888916, -0.026947021, 0.02331543, 0.021408081, -0.08123779, -0.0016756058, 0.0769043, 0.05114746, -0.07409668, 0.024551392, -0.009246826, -0.03540039, 0.00680542, 0.009170532, -0.029037476, 0.0019798279, 0.105773926, -0.06890869, -0.2331543, -0.046203613, -0.13098145, 0.00623703, 0.010093689, -0.02949524, 0.009902954, 0.019424438, -0.030380249, -0.0036945343, -0.0048942566, -0.04550171, -0.028015137, -0.052124023, -0.021972656, 0.022262573, 0.0047569275, -0.036895752, 0.004760742, 0.016601562, -0.006450653, -0.04373169, 0.02381897, -0.029815674, 0.014007568, -0.062042236, -0.039978027, 0.015617371, 0.010765076, 0.013801575, -0.012413025, -0.029022217, -0.08868408, -0.012649536, 0.032165527, 0.051361084, 0.03677368, 0.111816406, 0.06100464, 0.07122803, 0.022476196, 0.05706787, -0.061340332, 0.01852417, -0.07861328, 0.05722046, 0.031082153, 0.048706055, -0.060333252, 0.0070991516, -0.05090332, -0.07104492, -0.06951904, -0.021972656, 0.08215332, 0.026

## Save the embeddings into the vector database

In [61]:
def save_to_database(chunks):
    """
    convert chunks to embeddings and store them in the chroma db
    """

    # create the embedding function
    embedding_function = CohereEmbeddings(model="embed-english-light-v3.0")

    # load it into chroma
    db = Chroma.from_documents(chunks, embedding_function, persist_directory=DB_PATH)

    print(f"{len(chunks)} chunks are now stored in chroma at {DB_PATH}.")

save_to_database(chunks)

2286 chunks are now stored in chroma at chroma.


## Update the database with new embeddings

In [65]:
# saves as a sqlite3 file
def save_to_chroma(chunks):

    # create the open-source embedding function
    embedding_function = CohereEmbeddings(model="embed-english-light-v3.0")

    # load it into Chroma
    db = Chroma(embedding_function=embedding_function, persist_directory=DB_PATH)

    # check for existing documents
    existing_items = db.get(include=[])
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in the db: {len(existing_ids)}")

    # get new documents
    new_chunks = []
    for chunk in chunks:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)

    # get new document ids
    new_chunk_ids = []
    for chunk in new_chunks:
        new_chunk_ids.append(chunk.metadata["id"])

    # update the db with new documents
    print(f"Number of new documents to add in the db: {len(new_chunks)}")
    db.add_documents(new_chunks, ids=new_chunk_ids)

    print(f"{len(chunks)} chunks are now stored in chroma")

save_to_chroma(chunks)

Number of existing documents in the db: 2286
Number of new documents to add in the db: 1128
1128 chunks are now stored in chroma


## Use langchain's evaluator to compare embeddings

In [None]:
embedding_model = HuggingFaceEmbeddings()

# this constructor uses OpenAI embeddings by default
hf_evaluator = load_evaluator("embedding_distance", embeddings=embedding_model)

hf_evaluator.evaluate_strings(prediction="argentina", reference="messi")

{'score': 0.6373337369763273}

## Query the database and get the context

In [None]:
# search the db
def query_chroma(query):

    embedding_function = CohereEmbeddings(model="embed-english-light-v3.0")
    db = Chroma(persist_directory="chroma_vs", embedding_function=embedding_function)
    docs = db.similarity_search_with_relevance_scores(query, k=2)

    print(docs)

    if len(docs) == 0 or docs[0][1] < 0.10:
        raise ValueError("No matching documents found.")

    context = "\n\n----\n\n".join([doc.page_content for doc, score in docs])
    print(context)

    return context

# query the db
query = "who is the king of gandor?"
context = query_chroma(query)

context

[(Document(metadata={'source': 'data/rotk.txt', 'start_index': 515665}, page_content='now he awaits you. You shall eat and drink with him. When you are ready I will lead you to him." "The King?" said Sam. "What king, and who is he?" "The King of Gondor and Lord of the Western Lands," said Gandalf "and he has taken back all his ancient realm. He will ride soon to his crowning, but he waits for you." "What shall we wear?" said Sam; for all he could see was the old and tattered clothes that they had journeyed in, lying folded on the ground beside their beds. "The clothes that you wore on your way to Mordor," said Gandalf.'), 0.2246058833317378), (Document(metadata={'source': 'data/rotk.txt', 'start_index': 17199}, page_content='a paved passage, long and empty, and as they went Gandalf spoke softly to Pippin. "Be careful of your words, Master Peregrin! This is no time for hobbit pertness. Théoden is a kindly old man. Denethor is of another sort, proud and subtle, a man of far greater linea

'now he awaits you. You shall eat and drink with him. When you are ready I will lead you to him." "The King?" said Sam. "What king, and who is he?" "The King of Gondor and Lord of the Western Lands," said Gandalf "and he has taken back all his ancient realm. He will ride soon to his crowning, but he waits for you." "What shall we wear?" said Sam; for all he could see was the old and tattered clothes that they had journeyed in, lying folded on the ground beside their beds. "The clothes that you wore on your way to Mordor," said Gandalf.\n\n----\n\na paved passage, long and empty, and as they went Gandalf spoke softly to Pippin. "Be careful of your words, Master Peregrin! This is no time for hobbit pertness. Théoden is a kindly old man. Denethor is of another sort, proud and subtle, a man of far greater lineage and power, though he is not called a king. But he will speak most to you, and question you much, since you can tell him of his son Boromir. He loved him greatly: too much perhaps;

In [None]:
embedding_function = CohereEmbeddings(model="embed-english-light-v3.0")
db = Chroma(persist_directory="chroma_vs", embedding_function=embedding_function)
docs = db.similarity_search_with_relevance_scores(query, k=1)

docs[0][1]

0.2246058833317378

In [None]:
docs[0][0].metadata

{'source': 'data/rotk.txt', 'start_index': 515665}

In [None]:
docs[0][0].page_content

'now he awaits you. You shall eat and drink with him. When you are ready I will lead you to him." "The King?" said Sam. "What king, and who is he?" "The King of Gondor and Lord of the Western Lands," said Gandalf "and he has taken back all his ancient realm. He will ride soon to his crowning, but he waits for you." "What shall we wear?" said Sam; for all he could see was the old and tattered clothes that they had journeyed in, lying folded on the ground beside their beds. "The clothes that you wore on your way to Mordor," said Gandalf.'

## Create a prompt

In [None]:
PROMPT_TEMPLATE = """
Answer the question based on the following context:

{context}

---

The question is: {query}
"""

In [None]:
prompt_template = PromptTemplate.from_template(PROMPT_TEMPLATE)

prompt = prompt_template.format(context=context, query=query)

prompt

'\nAnswer the question based on the following context:\n\nnow he awaits you. You shall eat and drink with him. When you are ready I will lead you to him." "The King?" said Sam. "What king, and who is he?" "The King of Gondor and Lord of the Western Lands," said Gandalf "and he has taken back all his ancient realm. He will ride soon to his crowning, but he waits for you." "What shall we wear?" said Sam; for all he could see was the old and tattered clothes that they had journeyed in, lying folded on the ground beside their beds. "The clothes that you wore on your way to Mordor," said Gandalf.\n\n----\n\na paved passage, long and empty, and as they went Gandalf spoke softly to Pippin. "Be careful of your words, Master Peregrin! This is no time for hobbit pertness. Théoden is a kindly old man. Denethor is of another sort, proud and subtle, a man of far greater lineage and power, though he is not called a king. But he will speak most to you, and question you much, since you can tell him of

## Use a chat model like cohere and pass in the prompt

In [None]:
# chat model
cohere_chat_model = ChatCohere()

In [None]:
ai_msg = cohere_chat_model.invoke(prompt)

In [None]:
ai_msg.content

'The King of Gondor is mentioned in the provided text as "The King of Gondor and Lord of the Western Lands." This title suggests that the king in question rules over the realm of Gondor and holds dominion over the Western Lands. \n\nLater in the text, it is also mentioned that this king is awaiting a character named Sam, indicating that the king might be someone familiar to Sam and the other characters in the conversation. However, the specific name or identity of the King of Gondor is not directly provided in the given context.'

In [None]:
ai_msg

AIMessage(content='The King of Gondor is mentioned in the provided text as "The King of Gondor and Lord of the Western Lands." This title suggests that the king in question rules over the realm of Gondor and holds dominion over the Western Lands. \n\nLater in the text, it is also mentioned that this king is awaiting a character named Sam, indicating that the king might be someone familiar to Sam and the other characters in the conversation. However, the specific name or identity of the King of Gondor is not directly provided in the given context.', additional_kwargs={'documents': None, 'citations': None, 'search_results': None, 'search_queries': None, 'is_search_required': None, 'generation_id': '99dfbdf7-eba5-45e6-ae2d-6f229c3b75d3', 'token_count': {'input_tokens': 352, 'output_tokens': 109}}, response_metadata={'documents': None, 'citations': None, 'search_results': None, 'search_queries': None, 'is_search_required': None, 'generation_id': '99dfbdf7-eba5-45e6-ae2d-6f229c3b75d3', 'tok