# Chroma DB

# Setup

# Run from a virtualenv!
```
python3 -m virtualenv myenv 
source myenv/bin/activate 
# pip install notebook
jupyter notebook
```

In [None]:
# Only needed if the pip installs in the following cells do not work correctly
# !pip freeze | grep llama | xargs pip uninstall -y
# !pip install -U llama-index llama-index-llms-azure-openai llama-index-embeddings-azure-openai --force-reinstall --no-cache-dir

In [None]:
# Pydantic >=2 causes compatibility issues with llama-index - that's why it's downgraded to 1.10.10
%pip install llama-index llama-index-llms-openai langchain chardet lark sentence-transformers chromadb pydantic==1.10.10 --quiet

In [25]:
import sys
import os
import json
from getpass import getpass
import psutil
import pprint
from pprint import pprint as prettyprint
import textwrap

In [2]:
# True when the parent process command line mentions "jupyter-notebook".
_parent = psutil.Process().parent()  # None when no parent process is known
IN_NOTEBOOK = _parent is not None and any(
    "jupyter-notebook" in arg for arg in _parent.cmdline()
)

if IN_NOTEBOOK:
    # Prompt without echoing the input, check that it parses as JSON, and
    # cache the normalized JSON in the environment.
    os.environ['CREDS'] = json.dumps(json.loads(getpass("Secrets (JSON string): ")))

# Read the secrets back from the environment in both modes, so CREDS is also
# defined when this code is not launched from a classic notebook server.
if 'CREDS' not in os.environ:
    raise RuntimeError(
        "No credentials available: set the CREDS environment variable to a "
        "JSON string, or run this cell from a Jupyter notebook to be prompted."
    )
CREDS = json.loads(os.environ['CREDS'])

Secrets (JSON string):  ········


In [3]:
# Expose the OpenAI credential from the loaded secrets as OPENAI_API_KEY for this process.
os.environ["OPENAI_API_KEY"] = CREDS['OpenAI']['v2']['credential'] 

## Download Data

In [None]:
!mkdir -p 'data/paul_graham/'
!curl 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' --output 'data/paul_graham/paul_graham_essay.txt'

# 1. Hello World

In [None]:
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer

# Settings for the hello-world example.
CHROMA_DATA_PATH = "chroma_data/"  # directory handed to the persistent client
EMBED_MODEL = "all-MiniLM-L6-v2"  # model name given to the embedding function
COLLECTION_NAME = "demo_docs"  # name of the demo collection

# Chroma client rooted at CHROMA_DATA_PATH.
client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

In [None]:
# Embedding function backed by the sentence-transformers model named in EMBED_MODEL.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL
)

# The client persists to disk, so the collection may already exist from an
# earlier run; create_collection would raise in that case. Fetch it if it is
# there and create it (with cosine distance) otherwise, so the cell can be
# re-executed after a kernel restart.
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},
)

In [None]:
# Sample corpus: each entry pairs a genre label with one sentence of text.
# Keeping the pair together makes it impossible for the two lists to drift
# out of step.
samples = [
    ("technology", "The latest iPhone model comes with impressive features and a powerful camera."),
    ("travel", "Exploring the beautiful beaches and vibrant culture of Bali is a dream for many travelers."),
    ("science", "Einstein's theory of relativity revolutionized our understanding of space and time."),
    ("food", "Traditional Italian pizza is famous for its thin crust, fresh ingredients, and wood-fired ovens."),
    ("history", "The American Revolution had a profound impact on the birth of the United States as a nation."),
    ("fitness", "Regular exercise and a balanced diet are essential for maintaining good physical health."),
    ("art", "Leonardo da Vinci's Mona Lisa is considered one of the most iconic paintings in art history."),
    ("climate change", "Climate change poses a significant threat to the planet's ecosystems and biodiversity."),
    ("business", "Startup companies often face challenges in securing funding and scaling their operations."),
    ("music", "Beethoven's Symphony No. 9 is celebrated for its powerful choral finale, 'Ode to Joy.'"),
]

documents = [text for _genre, text in samples]
genres = [genre for genre, _text in samples]

# Store the texts under ids "id0", "id1", ... with the genre as metadata.
collection.add(
    documents=documents,
    ids=[f"id{position}" for position, _text in enumerate(documents)],
    metadatas=[{"genre": genre} for genre in genres],
)

In [None]:
# Ask the collection for the single closest match (n_results=1) to a
# free-text query.
query_results = collection.query(
	query_texts=["Find me some delicious food!"],
	n_results=1,
)

# Show which fields the result contains.
query_results.keys()

In [None]:
# Print the four result fields of interest, one per line, in a fixed order.
for field in ("documents", "ids", "distances", "metadatas"):
    print(query_results[field])

# 2. Langchain and Chroma DB

In [4]:
from langchain.document_loaders import TextLoader, DirectoryLoader

# NOTE(review): no visible cell creates or downloads ./data/hotels/london
# (only data/paul_graham/ is fetched earlier), so the directory has to be in
# place before this cell is run.
text_loader_kwargs={'autodetect_encoding': True}  # forwarded to TextLoader via loader_kwargs
# Load the directory's files with TextLoader, showing a progress bar.
loader = DirectoryLoader('./data/hotels/london', show_progress=True, 
    loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
docs = loader.load()

100%|█████████████████████████████████████████| 104/104 [00:13<00:00,  7.91it/s]


In [5]:
from langchain.text_splitter import CharacterTextSplitter

# Split the loaded documents on newlines. The length function always returns
# 1 and chunk_size is 1, presumably so that no two newline-separated pieces
# are merged and each one becomes its own chunk - TODO confirm against the
# splitter's merge logic.
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 1,
    chunk_overlap  = 0,
    length_function = lambda x: 1, # hack: constant length; len is the usual choice
    is_separator_regex = False
)
split_docs = text_splitter.split_documents(docs)
# Display the number of chunks produced.
len(split_docs) 

7324

In [6]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

# Directory name handed to Chroma as persist_directory in the following cells.
persist_directory = 'vector_store'
# OpenAI embeddings with the library defaults; the recorded output of this
# cell shows a deprecation warning.
embedding = OpenAIEmbeddings()

  warn_deprecated(


In [7]:
# First init db - but only when nothing has been persisted yet.
# Calling from_documents against an already populated persist_directory would
# embed every chunk again and add the results to the existing store, so a
# re-run of the notebook reuses the store on disk instead. Delete the
# directory to force a rebuild.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding
    )
else:
    vectordb = Chroma.from_documents(
        documents=split_docs,
        embedding=embedding,
        persist_directory=persist_directory
    )

In [None]:
# Load persisted db: open the store at persist_directory with the same
# embedding function instead of building it from split_docs.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding
)

In [8]:
# Number of entries in the store's collection (reaches into the private
# `_collection` attribute).
print(vectordb._collection.count())

7324


In [49]:
def myPrettyPrinter(obj):
    #print(json.dumps(obj, indent=2))
    #prettyprint(obj)
    myText = pprint.pformat(obj)
    myText = myText.encode('ascii', 'ignore').decode('unicode_escape') # unescape 
    for x in myText.split('\n'):
      print(textwrap.fill(x, width=80, initial_indent='', subsequent_indent='          '))
      print()

In [50]:
# Fetch the 3 documents most similar to the query text and display them.
query_docs = vectordb.similarity_search('politeness of staff', k=3)
myPrettyPrinter(query_docs)

[Document(page_content="Jun 21 2004     Very Rude Staff My husband and I stayed
          here during our second visit to London. The location and the area
          surrounding the hotel were not bad. I don't mind hotels as long as the
          room is clean, the linens fresh and the staff friendly. Unfortunately,
          the biggest complaint we had was with the staff. They are very
          unfriendly and not helpful at all. Stay away from the breakfast staff
          especially! They are extremely rude and the breakfast was HORRIBLE.
          The man in charge was extremely nasty!!", metadata={'source':
          'data/hotels/london/uk_england_london_bayswater_inn'}),

 Document(page_content="Jul 14 2007     Please stay away!       We would like
          your future customers to be aware of this terrible hotel and stay
          away. During our stay they never greeted us, only gave a suspicious
          look. When we asked their help to unlock the dodgy room door, he
     

In [12]:
import langchain 
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Global debug switch: the recorded output of this cell shows the resulting
# chain / prompt / LLM trace.
langchain.debug = True

# Metadata fields the query constructor is told about. The description is
# built from adjacent literals; a backslash continuation inside a single
# literal would embed the next line's indentation in the prompt text.
metadata_field_info = [
    AttributeInfo(
        name="source",
        description=(
            "All sources starts with 'data/hotels/london/uk_england_london_' "
            "then goes hotel chain, constant 'london_' and location."
        ),
        type="string",
    )
]

document_content_description = "Customer reviews for hotels"
llm = OpenAI(temperature=0.1) # low temperature to make model more factual
# by default 'text-davinci-003' is used
# NOTE(review): the default model depends on the installed langchain version - confirm.

# The result limit is configured through search_kwargs, which the retriever
# passes to the vector store search. A `k` keyword given to
# get_relevant_documents is not forwarded by this retriever.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    search_kwargs={"k": 5},
    verbose=True
)

question = "breakfast in Travelodge Farringdon"
mydocs = retriever.get_relevant_documents(question)

[32;1m[1;3m[chain/start][0m [1m[1:retriever:Retriever > 2:chain:query_constructor] Entering Chain run with input:
[0m{
  "query": "breakfast in Travelodge Farringdon"
}
[32;1m[1;3m[chain/start][0m [1m[1:retriever:Retriever > 2:chain:query_constructor > 3:prompt:FewShotPromptTemplate] Entering Prompt run with input:
[0m{
  "query": "breakfast in Travelodge Farringdon"
}
[36;1m[1;3m[chain/end][0m [1m[1:retriever:Retriever > 2:chain:query_constructor > 3:prompt:FewShotPromptTemplate] [0ms] Exiting Prompt run with output:
[0m[outputs]
[32;1m[1;3m[llm/start][0m [1m[1:retriever:Retriever > 2:chain:query_constructor > 4:llm:OpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Your goal is to structure the user's query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON object formatted in the following schema:\n\n```json\n{\n    \"query\": string \\ text string to compare to docume

In [51]:
# Display the documents returned by the self-query retriever.
myPrettyPrinter(mydocs)

[Document(page_content="Jul 6 2007      Loved it even though it was small
          My favorite thing about this hotel was the breakfast, oh my-- soooo
          good. Bacon, bacon, bacon. I still think about the hot breakfast
          buffet, two years after I stayed here. The staff was very nice, the
          room was clean. I wasn't thrilled about the location, but it was my
          first visit to London, so I'm sure that added to the issues with
          getting around. I had reserved a double, but I got there too early so
          I just took the single they were offering. It was super small, but
          clean and the bathroom looked new. I reccommend this place.",
          metadata={'source':
          'data/hotels/london/uk_england_london_travelodge_london_farringdon'}),

 Document(page_content='Feb 19 2005     four nights at the Travelodge   We
          arrived at 8 a.m. and when we checked in at the front desk we were
          told they do not store your luggage. So