In [None]:
# Install or upgrade (-U) the LangChain packages, tiktoken and chromadb
# into the environment of the running kernel.
%pip install -U langchain-nomic langchain_community tiktoken langchain-openai chromadb langchain

In [123]:
! nomic login <<token>>

zsh:1: parse error near `>>'


In [124]:
# Optional: LangSmith tracing configuration
import os
from dotenv import load_dotenv

# Load variables from the .env file; override=True lets them replace values
# that are already present in the process environment.
load_dotenv(override=True)

# Switch LangChain tracing on and point it at the LangSmith endpoint.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    }
)

## Document Loading

In [125]:
# useful for printing the documents beautifully
def pretty_print_docs(docs):
    """Print each document's text under a numbered heading.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.

    Documents are numbered from 1 and separated by a rule of 100 dashes.
    Nothing is returned; the text is written to standard output.
    """
    separator = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(separator.join(sections))

In [7]:
# from langchain_community.document_loaders import PyPDFLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# loader = PyPDFLoader("../data/Robinson Advisory.pdf")
# pages = loader.load()
# for page in pages:
#     # just change the page_content to remove the newlines inserted instead of spaces
#     page.page_content = page.page_content.replace('\n', ' ')

In [None]:
# Remove the LlamaIndex Chroma integration package (-y skips the confirmation prompt).
# NOTE(review): the reason is not stated in this notebook — presumably a dependency
# conflict with the chromadb version used here; confirm.
%pip uninstall llama-index-vector-stores-chroma -y

In [None]:
# Install `unstructured` together with its PDF extras.
# NOTE(review): presumably required by DirectoryLoader to parse the PDF under
# ../data/robinson — confirm.
%pip install unstructured
%pip install "unstructured[pdf]"

In [126]:
from langchain_community.document_loaders import DirectoryLoader
# Load the documents found under ../data/robinson, showing a progress bar while loading.
loader = DirectoryLoader('../data/robinson', show_progress=True)
# `documents` is the input to the splitting step further down.
documents = loader.load()

100%|██████████| 1/1 [00:00<00:00,  3.26it/s]


In [127]:
# Print the text of every loaded document, numbered and separated by dashed rules.
pretty_print_docs(documents)

Document 1:

ADVISORY SERVICES AGREEMENT

This Advisory Services Agreement is entered into as of June 15th, 2023 (the “Effective Date”), by and between Cloud Investments Ltd., ID 51-426526-3, an Israeli company (the "Company"), and Mr. Jack Robinson, Passport Number 780055578, Israel, Email: jackrobinson@gmail.com ("Advisor").

residing

at 1 Rabin st, Tel Aviv,

Whereas,

Advisor has expertise and/or knowledge and/or relationships, which are relevant to the Company’s business and the Company has asked Advisor to provide it with certain Advisory services, as described in this Agreement; and

Whereas,

Advisor has agreed to provide the Company with such services, subject to the terms set forth in this Agreement.

NOW THEREFORE THE PARTIES AGREE AS FOLLOWS:

1. Services:

1.1

Advisor shall provide to the Company, as an independent contractor, software development services, and / or any other services as agreed by the parties from time to time (the “Services”). Advisor shall not appoint 

## Splitting

In [None]:
# Install the package that provides RecursiveCharacterTextSplitter (imported in the next cell).
%pip install langchain_text_splitters

In [128]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks of at most 1024 characters, with
# consecutive chunks sharing up to 200 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024, chunk_overlap=200
)
doc_splits = text_splitter.split_documents(documents)
# Show how many chunks were produced, then display the chunks themselves.
print(len(doc_splits))
doc_splits

20


[Document(page_content='ADVISORY SERVICES AGREEMENT\n\nThis Advisory Services Agreement is entered into as of June 15th, 2023 (the “Effective Date”), by and between Cloud Investments Ltd., ID 51-426526-3, an Israeli company (the "Company"), and Mr. Jack Robinson, Passport Number 780055578, Israel, Email: jackrobinson@gmail.com ("Advisor").\n\nresiding\n\nat 1 Rabin st, Tel Aviv,\n\nWhereas,\n\nAdvisor has expertise and/or knowledge and/or relationships, which are relevant to the Company’s business and the Company has asked Advisor to provide it with certain Advisory services, as described in this Agreement; and\n\nWhereas,\n\nAdvisor has agreed to provide the Company with such services, subject to the terms set forth in this Agreement.\n\nNOW THEREFORE THE PARTIES AGREE AS FOLLOWS:\n\n1. Services:\n\n1.1', metadata={'source': '../data/robinson/Robinson Advisory.pdf'}),
 Document(page_content='Whereas,\n\nAdvisor has agreed to provide the Company with such services, subject to the terms

In [90]:
# Replaced in favor of recursive text splitter

# from langchain_text_splitters import CharacterTextSplitter

# text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
#     chunk_size=1024, chunk_overlap=200
# )
# doc_splits = text_splitter.split_documents(documents)
# print(len(doc_splits))
# doc_splits

4


[Document(page_content='ADVISORY SERVICES AGREEMENT\n\nThis Advisory Services Agreement is entered into as of June 15th, 2023 (the “Effective Date”), by and between Cloud Investments Ltd., ID 51-426526-3, an Israeli company (the "Company"), and Mr. Jack Robinson, Passport Number 780055578, Israel, Email: jackrobinson@gmail.com ("Advisor").\n\nresiding\n\nat 1 Rabin st, Tel Aviv,\n\nWhereas,\n\nAdvisor has expertise and/or knowledge and/or relationships, which are relevant to the Company’s business and the Company has asked Advisor to provide it with certain Advisory services, as described in this Agreement; and\n\nWhereas,\n\nAdvisor has agreed to provide the Company with such services, subject to the terms set forth in this Agreement.\n\nNOW THEREFORE THE PARTIES AGREE AS FOLLOWS:\n\n1. Services:\n\n1.1\n\nAdvisor shall provide to the Company, as an independent contractor, software development services, and / or any other services as agreed by the parties from time to time (the “Servi

## Getting the number of characters for each chunk
#### Only to be used for RecursiveCharacterTextSplitter

In [129]:
# Report the character length of every chunk so it can be checked against the
# splitter's chunk_size.
for chunk in doc_splits:
    n_chars = len(chunk.page_content)
    print("The chunk has %s characters" % n_chars)

The chunk has 771 characters
The chunk has 753 characters
The chunk has 618 characters
The chunk has 812 characters
The chunk has 965 characters
The chunk has 583 characters
The chunk has 836 characters
The chunk has 922 characters
The chunk has 774 characters
The chunk has 791 characters
The chunk has 550 characters
The chunk has 1007 characters
The chunk has 906 characters
The chunk has 880 characters
The chunk has 728 characters
The chunk has 728 characters
The chunk has 1003 characters
The chunk has 541 characters
The chunk has 711 characters
The chunk has 863 characters


## Getting the number of tokens for each document

##### Only to be used for CharacterTextSplitter.from_tiktoken_encoder() text splitter

In [91]:
# import tiktoken

# encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
# for d in doc_splits:
#     print("The document is %s tokens" % len(encoding.encode(d.page_content)))

The document is 966 tokens
The document is 984 tokens
The document is 900 tokens
The document is 466 tokens


## Index (Vector Store)

In [130]:
from langchain_community.vectorstores.chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_nomic import NomicEmbeddings
from langchain_nomic.embeddings import NomicEmbeddings

In [142]:
import uuid

# Derive a deterministic id for every chunk from its text (UUIDv5), so that
# identical chunks always map to the same id.
ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in doc_splits]

# Keep the first chunk seen for each id. A dict preserves insertion order, so
# unique_ids[i] is guaranteed to be the id of unique_docs[i]. (Building the id
# list with list(set(ids)) yields an arbitrary order and pairs ids with the
# wrong chunks.)
docs_by_id = {}
for doc_id, doc in zip(ids, doc_splits):
    docs_by_id.setdefault(doc_id, doc)
unique_ids = list(docs_by_id)
unique_docs = list(docs_by_id.values())
print(unique_ids)

# Embed the de-duplicated chunks with the Nomic embedding model and store them,
# under their content-derived ids, in a Chroma collection persisted on disk.
db = Chroma.from_documents(
    unique_docs,
    embedding=NomicEmbeddings(model="nomic-embed-text-v1"),
    ids=unique_ids,
    persist_directory='../data/chroma-dbb',
)
db.persist()

['d6f8e0d5-59c1-552e-b5e9-5e49bb58ec18', '39e5ae5f-ca9d-5fea-b6e3-111e719cbf6f', 'f69bd7c2-783f-58f3-b1ec-d306da5428ed', '15294549-03e5-5d79-ad41-c35811e88430', '2f405de2-33ad-5f49-a25d-aff899f67d66', '8eaa82d1-9b71-5c33-b4f9-bb059dfcebe5', 'e5e8d9ab-5bae-5850-ab96-86ccea1e4274', '71443022-874d-51d9-93b5-833c851c22dc', '28ab26f8-4778-5473-936f-9a9c41dc2716', '431653c3-c626-52c2-ae73-fc5c033f17fa', '1f38d2fe-c1c4-576b-b0d5-73a4dedd4728', '6e19583f-0292-5b33-a912-f03b7eb69389', 'ddf2d06d-a5ca-5e83-8d18-6f2d6fd2970f', 'c3eaaf3c-53c2-5148-b9bc-073e485e4a09', '3ad5a47d-e82e-52c6-a7d5-c646602ec21c', '5e358cf8-9e00-541f-a771-43ee0b3c0033', '3d8beabe-b983-5093-8a8b-cef2712a03a6', '17ab2a08-c73c-59b4-94f5-e257acbefcf1', '3e15e8a4-b871-5707-8409-50b7bc771807']


In [143]:
# Inspect what was stored in the collection (ids, metadatas, ...).
db.get()

{'ids': ['15294549-03e5-5d79-ad41-c35811e88430',
  '17ab2a08-c73c-59b4-94f5-e257acbefcf1',
  '1f38d2fe-c1c4-576b-b0d5-73a4dedd4728',
  '28ab26f8-4778-5473-936f-9a9c41dc2716',
  '2f405de2-33ad-5f49-a25d-aff899f67d66',
  '39e5ae5f-ca9d-5fea-b6e3-111e719cbf6f',
  '3ad5a47d-e82e-52c6-a7d5-c646602ec21c',
  '3d8beabe-b983-5093-8a8b-cef2712a03a6',
  '3e15e8a4-b871-5707-8409-50b7bc771807',
  '431653c3-c626-52c2-ae73-fc5c033f17fa',
  '5e358cf8-9e00-541f-a771-43ee0b3c0033',
  '6e19583f-0292-5b33-a912-f03b7eb69389',
  '71443022-874d-51d9-93b5-833c851c22dc',
  '8eaa82d1-9b71-5c33-b4f9-bb059dfcebe5',
  'c3eaaf3c-53c2-5148-b9bc-073e485e4a09',
  'd6f8e0d5-59c1-552e-b5e9-5e49bb58ec18',
  'ddf2d06d-a5ca-5e83-8d18-6f2d6fd2970f',
  'e5e8d9ab-5bae-5850-ab96-86ccea1e4274',
  'f69bd7c2-783f-58f3-b1ec-d306da5428ed'],
 'embeddings': None,
 'metadatas': [{'source': '../data/robinson/Robinson Advisory.pdf'},
  {'source': '../data/robinson/Robinson Advisory.pdf'},
  {'source': '../data/robinson/Robinson Advisory

In [132]:
# Expose the vector store as a retriever; k=4 asks for up to 4 chunks per query.
retriever = db.as_retriever(search_kwargs={"k": 4})

In [86]:
# # Add to vectorDB
# vectorstore = Chroma.from_documents(
#     documents=doc_splits,
#     collection_name="rag-chromaa",
#     embedding=NomicEmbeddings(model="nomic-embed-text-v1"),
# )
# retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

In [133]:
# Check retrieval on its own before using the retriever inside the chain.
# `invoke` is the Runnable entry point (the same one the chain uses);
# `get_relevant_documents` is deprecated in langchain-core.
retriever.invoke('What are the compensation to the Advisor under the Agreement?')

[Document(page_content='- 3-\n\nconstitute salary payments, and 40% of such payments shall constitute payment by the Company for all other Advisor statutory rights and benefits as employee of the Company throughout the Term. Advisor further consents that the Company may offset any amounts due to him under this Section from any amounts payable to Advisor under this Agreement. Advisor shall indemnify the Company for any loss or expenses incurred by the Company if it were determined that an alleged employer/employee relationship existed between the Advisor and the Company.', metadata={'source': '../data/robinson/Robinson Advisory.pdf'}),
 Document(page_content="8. Relationship of the Parties; Indemnification: The sole relationship between the Company and the Advisor shall be that of independent contractors. Advisor shall not be deemed to be, nor treated by the Company as, an employee of the Company. Advisor shall not receive nor be entitled to overtime pay, insurance, paid vacation, sever

## RAG Chain

In [134]:
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate

# Prompt: tells the model to answer only from the supplied context and to
# reference the context it used. {context} and {question} are filled in by
# the chain below.
template = """Answer the question (by giving reference to the context you used) based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Local LLM: chat model accessed through Ollama, selected by its model tag.
# NOTE(review): assumes an Ollama server with this model available is running
# locally — confirm.
ollama_llm = "mistral:instruct"
model_local = ChatOllama(model=ollama_llm)

# Chain: the input question is sent to the retriever (its result becomes
# {context}) and is also passed through unchanged as {question}; the filled
# prompt goes to the model and the reply is parsed into a plain string.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model_local
    | StrOutputParser()
)

In [135]:
# Ask about the payments due to the Advisor; the chain returns the answer as a string.
chain.invoke("What are the payments to the Advisor under the Agreement?")

' The payments to the Advisor under the Agreement consist of hourly fees at a rate of USD 9 per Billable Hour, limited to a maximum of USD 1,500 per month, and a Workspace Expense of USD 100 per month, as stated in Section 6.1 and 6.2 of the documents provided. However, if it is determined that the Advisor is an employee of the Company, payments to the Advisor shall be reduced so that 60% of such payments shall constitute salary payments, and the remaining 40% shall constitute payment for all other Advisor statutory rights and benefits as an employee throughout the Term (Section 3). The Advisor is also solely responsible for any income taxes or assessments on compensation received under the Agreement, and any expenses and costs related to their performance of obligations under the Agreement (Section 6.4).'

In [118]:
# Ask whether the Agreement imposes a non-compete obligation on the Advisor.
chain.invoke("Is there a non-compete obligation to the Advisor?")

' Yes, based on the context provided, there is a non-compete obligation to the Advisor as stated in Document 6, under the section headed "Non-Compete." During the term of engagement with the Company and for a period of 12 months thereafter, the Advisor shall not be involved in any business that competes with the Company\'s Business and shall not solicit or hire any employee or service provider of the Company without the prior written consent of the Company.'

In [120]:
# Ask whether meal time counts towards what the Advisor may charge.
chain.invoke('Can the Advisor charge for meal time?')

' Based on the context provided, the Advisor is not entitled to be reimbursed for meal time as it is explicitly stated that "Billable Hour: Net time devoted to the provisioning of the Services, without calculating meals, travels or any other overhead time borne by the Advisor." Therefore, the answer is no, the Advisor cannot charge for meal time according to this agreement.'

In [121]:
# Ask for a single factual detail: the street the Advisor lives on.
chain.invoke('In which street does the Advisor live?')

' The Advisor lives on Rabin street in Tel Aviv.'

In [122]:
# Ask whether the Advisor is entitled to social benefits.
chain.invoke('Is the Advisor entitled to social benefits?')

' According to the context provided, the Advisor is not entitled to social benefits from the Company as stated in the first document under section "Relationship of the Parties; Indemnification". Additionally, the second document under section "6.4" states that the Advisor shall be solely responsible for any income taxes or other assessments and all expenses and costs related to compensation received hereunder, which further implies that social benefits are not included in the compensation package for the Advisor.'