# Chroma Quickstart

## 1. Load API Key with .env

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# Your .env file can contain connection env vars for Chroma, but this notebook
# example uses ephemeral local storage, so this step technically isn't necessary.
load_dotenv()

## 2. Initialize Chroma client

Next, initialize your Chroma client. This example uses an ephemeral local client, so no API key is required.

In [None]:
import os
import chromadb

# Create a Chroma client with default settings (no connection arguments are passed).
client = chromadb.Client()

## 3. Prepare language model for vector encoder

We use a small sentence-transformers language model to create 384-dimensional embeddings. You can swap in other models for generating embeddings.

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence encoder that produces every embedding in this notebook.
embedder = SentenceTransformer(model_name_or_path='sentence-transformers/all-MiniLM-L6-v2')

# Encode one sample sentence to find out how long the model's vectors are.
embeddings = embedder.encode(sentences="Example sentences.")
dimension = len(embeddings)

## 4. Create a Chroma collection

This creates a collection named "quickstart" that performs similarity search with your vectors.

In [None]:
collection_name = "quickstart"

# get_or_create_collection returns the existing collection when this cell is
# re-run in the same kernel, whereas create_collection raises if the name is
# already taken. On a fresh kernel the result is the same: a new collection.
collection = client.get_or_create_collection(name=collection_name)

## 5. Generate vector values from wikipedia text

We retrieve a Wikipedia-based dataset with Hugging Face's datasets library. Note that this dataset contains Cohere's vectors, but we're generating our own in this notebook.

In [None]:
from itertools import islice

from datasets import load_dataset
import torch
import cohere
from tqdm.auto import tqdm

# NOTE(review): torch and cohere are not referenced in this cell; the imports
# are kept as-is in case other parts of the notebook rely on them.

# Number of documents to embed. Increase to use more of the dataset, or set to
# None to stream the entire dataset.
max_docs = 1000
docs_stream = load_dataset("Cohere/wikipedia-22-12-simple-embeddings", split="train", streaming=True)

# Parallel lists: index i of each list describes the same document.
documents = []
embeddings = []
metadatas = []
ids = []

# islice stops the stream after max_docs records, and wrapping the iterable in
# tqdm lets the progress bar close itself once the loop finishes.
for doc in tqdm(islice(docs_stream, max_docs), total=max_docs):
    # Title and body are embedded together; build the text once and reuse it.
    text = doc["title"] + " " + doc["text"]
    documents.append(text)
    embeddings.append(embedder.encode(text).tolist())
    metadatas.append({
        "title": doc["title"],
        "text": doc["text"],
        "url": doc["url"],
    })
    # Ids are converted to strings before being handed to the collection.
    ids.append(str(doc["id"]))


## 6. Upsert vectors

Now that you’ve created your collection and the vector embeddings of your wikipedia data, you can upsert these vectors into your collection.

In [None]:
# Chroma limits how many records a single add() call may carry, so insert in
# fixed-size batches; this keeps the cell working when max_docs is raised.
ADD_BATCH_SIZE = 1000

for start in range(0, len(ids), ADD_BATCH_SIZE):
    stop = start + ADD_BATCH_SIZE
    collection.add(
        documents=documents[start:stop],
        embeddings=embeddings[start:stop],
        metadatas=metadatas[start:stop],
        ids=ids[start:stop],
    )

## 7. Check that the vectors were inserted into the collection

## 8. Run a similarity search

In [None]:
# Sample question used to try out the search helper defined in this cell.
query = "What's the greatest human advancement of all time?"

def search(query: str) -> str:
    """Return the stored document that best matches ``query``.

    The query is embedded with the module-level ``embedder`` and looked up in
    the module-level Chroma ``collection``.

    Args:
        query: Free-text search query.

    Returns:
        The text of the top-ranked document, or an empty string when the
        collection returns no documents for the query.
    """
    query_results = collection.query(
        query_embeddings=[embedder.encode(query).tolist()],
        # Only the best hit is returned, so don't fetch more than one result.
        n_results=1,
        include=["documents"],
    )
    # "documents" holds one list of hits per query embedding; it may be
    # missing, None, or contain an empty list when nothing matched.
    matches = query_results.get("documents") or []
    if matches and matches[0]:
        return matches[0][0]
    # Always return text: the Gradio interface renders this output as "text".
    return ""

# Run the sample query; as the cell's last expression, its result is displayed.
search(query)

## 9. Deploy an app to port forward and share publicly

In [None]:
import gradio as gr

# Close any Gradio servers left running by earlier executions of this cell so
# the port is free again when the cell is re-run.
gr.close_all()

# Wrap the search helper in a minimal web UI: one text box in, plain text out.
iface = gr.Interface(
    fn=search,
    inputs=gr.Textbox(lines=2, placeholder="Type your query here..."),
    outputs="text",
    title="Search Wikipedia text with a Chroma Collection",
    description="This is a simple wikipedia search engine powered by transformers and chroma",
)

# Run the Gradio app on localhost:5000 or whichever port you specified
iface.launch(server_port=5000, inline=False, quiet=True)

# PORT_FORWARD_URL is optional; without this check the message would end in "None".
port_forward_url = os.environ.get("PORT_FORWARD_URL")
if port_forward_url:
    print(f"See your app deployed publically with the port you're securely forwarding: {port_forward_url}")
else:
    print("PORT_FORWARD_URL is not set; the app is only reachable locally at http://localhost:5000")

## 10. Clean up

When you no longer need the collection, call `delete_collection` and specify the name to remove it.

In [None]:
# Remove the quickstart collection. The Chroma client exposes this operation
# as delete_collection(name=...); it has no drop_collection method.
client.delete_collection(name=collection_name)