# chroma

> Put email summaries into a vector store

In [1]:
#| default_exp chroma

In [2]:
#| export
import json
from pathlib import Path
from typing import Any, Dict, List, Optional

from langchain.embeddings import VertexAIEmbeddings
from langchain.schema import Document
from langchain.vectorstores import Chroma

from classifier.schema import get_embedder

In [3]:
# Locate the summaries file. The path is relative to the notebook's working
# directory, so the messages print the resolved location to make a wrong cwd obvious.
data_dir = Path('../data')
assert data_dir.exists(), f"data directory not found: {data_dir.resolve()}"
summary_path = data_dir / 'summaries.json'
assert summary_path.exists(), f"summaries file not found: {summary_path.resolve()}"

In [4]:
# JSON is UTF-8 by specification; pin the encoding so the load does not
# depend on the platform's default text encoding.
with summary_path.open('r', encoding='utf-8') as f:
    summaries = json.load(f)

In [5]:
# Peek at one record to see the schema that concat_email_summaries reads:
# a 'label' entry plus a 'summaries' dict of text chunks.
summaries['0']

{'label': 'Order Processing',
 'summaries': {'0': ' The email is requesting a drop ship order for Ohio State University.\nThe PO number is 7004014842, the account number is 2150126632, and the store number is 16422.\nThe drug name is EPIDIOLEX 100MG/ML SOL 100ML, the order quantity is 5, and the prescriber names are LUCRETIA LONG and PHILIP CLAYTON JONAS.\nThe prescriber NPIs or DEAs are ML0822634 and FJ142'}}

In [6]:
#| export
def concat_email_summaries(
    summaries: Dict[str, Dict[str, Any]]
    ) -> List[Document]:
    """Collapse each email's summary chunks into a single `Document`.

    Args:
        summaries: Mapping of email index (a string that parses as an int)
            to a record holding a 'label' entry and a 'summaries' dict
            whose values are the summary text chunks for that email.

    Returns:
        One `Document` per record, in the mapping's iteration order.
        `page_content` is every chunk stripped of surrounding whitespace
        and terminated by a newline, concatenated in dict order.
        `metadata` carries the integer 'idx' and the record's 'label'
        (None when the record has no 'label' entry).

    Raises:
        KeyError: if a record has no 'summaries' entry.
        ValueError: if an index key cannot be converted with `int()`.
    """
    documents = []
    for idx, record in summaries.items():
        # A single join replaces repeated `+` concatenation; each chunk
        # still ends with a newline, so the output text is unchanged.
        page_content = "".join(
            f"{chunk.strip()}\n" for chunk in record['summaries'].values()
        )
        documents.append(
            Document(
                page_content=page_content,
                metadata={
                    'idx': int(idx),
                    'label': record.get('label')
                }
            )
        )
    return documents

In [7]:
# Build one Document per email record; these are what gets stored in Chroma below.
summary_documents = concat_email_summaries(summaries)

In [8]:
# Spot-check the first Document: chunk text in page_content, idx/label in metadata.
summary_documents[0]

Document(page_content='The email is requesting a drop ship order for Ohio State University.\nThe PO number is 7004014842, the account number is 2150126632, and the store number is 16422.\nThe drug name is EPIDIOLEX 100MG/ML SOL 100ML, the order quantity is 5, and the prescriber names are LUCRETIA LONG and PHILIP CLAYTON JONAS.\nThe prescriber NPIs or DEAs are ML0822634 and FJ142\n', metadata={'idx': 0, 'label': 'Order Processing'})

## Build the Chroma vector store from these documents

In [9]:
# NOTE(review): `embedder` is not referenced by any later cell shown here;
# get_or_make_chroma calls get_embedder() itself. Possibly a leftover —
# confirm nothing else relies on it before removing.
embedder = get_embedder()

In [16]:
#| export
def get_or_make_chroma(
        data_dir: Path,
        documents: Optional[List[Document]] = None,
        overwrite: bool = False) -> Chroma:
    """Load the Chroma store persisted under `data_dir / 'chroma'`, or build it.

    Args:
        data_dir: Directory that holds (or will hold) the 'chroma'
            sub-directory. Missing directories are created.
        documents: Documents to embed when a store has to be built.
            Only required when no persisted store exists or when
            `overwrite` is True.
        overwrite: When True, discard the persisted collection and rebuild
            it from `documents` instead of loading it.

    Returns:
        A `Chroma` vector store backed by the persist directory.

    Raises:
        ValueError: if a store must be built but `documents` is None. This
            is checked before anything is deleted, so a bad call cannot
            wipe an existing store.
    """
    chroma_dir = data_dir / 'chroma'
    # parents/exist_ok: tolerate a missing data_dir and avoid the
    # check-then-create race of `exists()` followed by `mkdir()`.
    chroma_dir.mkdir(parents=True, exist_ok=True)
    persist_directory = str(chroma_dir.resolve())
    # A *.sqlite3 file in the directory is taken as "a store was persisted here".
    has_store = any(chroma_dir.glob("*.sqlite3"))

    if has_store and not overwrite:
        return Chroma(
            persist_directory=persist_directory,
            embedding_function=get_embedder()
        )

    if documents is None:
        raise ValueError("documents cannot be None")

    embedding_function = get_embedder()
    if has_store:
        # Rebuilding on top of the persisted collection would append to it,
        # leaving every document duplicated after each overwrite. Drop the
        # old collection first so the new store holds exactly `documents`.
        Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_function
        ).delete_collection()
    return Chroma.from_documents(
        documents,
        embedding_function,
        persist_directory=persist_directory
    )

In [13]:
# overwrite=True bypasses the load-existing branch and builds the store from
# summary_documents, so every run of this cell embeds all documents again.
chroma = get_or_make_chroma(data_dir, summary_documents, overwrite=True)

In [14]:
# Smoke test: a drop-ship style query should surface drop-ship emails near the top.
chroma.similarity_search("Help I need a drop ship")

[Document(page_content='The email is requesting a drop ship order for the following:\n\n- Client Name: Rosedale Infectious Diseases, PLLC\n- PO ID: 7004000449\n- Account #: 2150129609\n- Store #: 16405\n- NDC: 49702024015\n- Drug Name: CABENUVA 600-900MG INJ SUSP KIT\n- Order Quantity: 1\n- Prescriber Name: ASHLEY DAY SCOTT\n- Prescriber NPI\n', metadata={'idx': 16, 'label': 'Order Processing'}),
 Document(page_content='The email is requesting a drop ship order for Ohio State University.\nThe PO number is 7004014842, the account number is 2150126632, and the store number is 16422.\nThe drug name is EPIDIOLEX 100MG/ML SOL 100ML, the order quantity is 5, and the prescriber names are LUCRETIA LONG and PHILIP CLAYTON JONAS.\nThe prescriber NPIs or DEAs are ML0822634 and FJ142\n', metadata={'idx': 0, 'label': 'Order Processing'}),
 Document(page_content='The customer, Tara May from Tower Health, is not receiving the drop ship invoices for their orders with their daily packet of invoices. Th

In [17]:
#| hide
# nbdev convention: writes the `#| export` cells out to the library module
# (presumably the one named by `#| default_exp` at the top — see nbdev docs).
import nbdev; nbdev.nbdev_export()