# chroma

> Put email summaries into a vector store

In [1]:
#| default_exp chroma

In [2]:
#| export
import json
import shutil
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd

from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import VertexAIEmbeddings
from langchain.schema import Document
from langchain.vectorstores import Chroma

from classifier.load import TrainingInstance, get_idx, get_training_instances
from classifier.schema import get_embedder

In [3]:
# Locate the data directory and the summaries CSV inside it, and stop
# immediately if either one is missing.
data_dir = Path('../data')
summary_path = data_dir.joinpath('summaries.csv')
assert data_dir.exists()
assert summary_path.exists()

In [4]:
# Read the summaries CSV into a DataFrame.
summaries = pd.read_csv(summary_path)

In [5]:
# Preview the first five summaries.
summaries.head(5)

Unnamed: 0,summary
0,The customer received an invoice from Cardina...
1,The customer received an email from the State...
2,The customer would like to place a new order ...
3,"The customer, State of Florida Next Gen, upda..."
4,The customer is inquiring about an order plac...


Add metadata, create documents

In [6]:
# get_idx() yields a pair that is unpacked as (train_idx, test_idx); both are
# used further down as row labels to split the merged frame.
train_idx, test_idx = get_idx()

In [7]:
# Materialise the instances into a list so they can be indexed and iterated
# more than once (presumably get_training_instances() is lazy — confirm).
training_instances = list(get_training_instances())

In [8]:
# Inspect the first instance to see which fields are available.
training_instances[0]

TrainingInstance(idx=20775, label='Billing / Invoice', email_subject='Invoices 1 of 1 for 2057199110 , TEXAS INSTITUTE FOR SURGERY', email_body='Dear Valued Customer,  Your Cardinal Health invoice is attached to this email as a PDF file.  If you have any questions, please contact our Customer Service department at the phone number listed on the invoice.  Thank you, Cardinal Health  _________________________________________________  This message is for the designated recipient only and may contain privileged, proprietary or otherwise private information. If you have received it in error, please notify the sender immediately and delete the original. Any other use of the email by you is prohibited.  Dansk - Deutsch - Espanol - Francais - Italiano - Japanese - Nederlands - Norsk - Portuguese - Chinese Svenska: http://www.cardinalhealth.com/en/support/terms-and-conditions-english.html', metadata={'BU': 'PD', 'case_number': 3553288, 'ACCOUNT_BUSINESS_UNIT__C': 'a1G4z00000H6C4aEAF', 'received

In [9]:
#| export
def merge_summaries_with_instances(
        summaries: pd.DataFrame,
        instances: List["TrainingInstance"]
        ) -> pd.DataFrame:
    """Attach each summary to the fields of its training instance.

    Row ``i`` of ``summaries`` is paired with ``instances[i]``; the pairing is
    positional, so the two inputs must be in the same order and have the same
    length.

    Args:
        summaries: Frame holding one row per summary.
        instances: Training instances, each exposing ``to_series()``; the
            resulting series must contain an ``idx`` field.

    Returns:
        A frame with the columns of ``summaries`` followed by the instance
        fields, indexed by the instances' ``idx`` values.

    Raises:
        ValueError: If ``summaries`` and ``instances`` differ in length.
        KeyError: If the instance fields do not include ``idx``.
    """
    instances_frame = pd.DataFrame([i.to_series() for i in instances])
    if len(summaries) != len(instances_frame):
        # concat(axis=1) would otherwise pad the shorter side with NaN and
        # hand back rows that look valid but belong to no instance.
        raise ValueError(
            f"summaries has {len(summaries)} rows but {len(instances_frame)} "
            "instances were given; they must match one-to-one"
        )
    # concat(axis=1) aligns on index labels, not position. Dropping both
    # indexes makes the pairing positional even when `summaries` arrives with
    # a filtered or otherwise non-default index.
    return pd.concat(
        [
            summaries.reset_index(drop=True),
            instances_frame.reset_index(drop=True)
        ],
        axis=1).set_index('idx')

In [10]:
# Pair every summary with its training instance's fields (indexed by idx)
# and show the first two rows of the result.
chroma_document_frame = merge_summaries_with_instances(
    summaries=summaries,
    instances=training_instances,
)
chroma_document_frame.head(n=2)

Unnamed: 0_level_0,summary,BU,case_number,ACCOUNT_BUSINESS_UNIT__C,received_at,sfdc_subcategory,predicted_category,predicted_subcategory,record_type,probability,Accuracy_upd,Bin,label,email_subject,email_body
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20775,The customer received an invoice from Cardina...,PD,3553288,a1G4z00000H6C4aEAF,2023-10-13T12:37:20,Billing Statements,Billing / Invoice,,1,0.474032,Correct,4,Billing / Invoice,"Invoices 1 of 1 for 2057199110 , TEXAS INSTITU...","Dear Valued Customer, Your Cardinal Health in..."
46774,The customer received an email from the State...,PD,3658829,,2023-11-29T20:25:47,Account updates,Billing / Invoice,,1,0.566661,Incorrect,5,Account/Inquiry,Invoice status from State of Florida Next Gen,External Email â€“ Please use caution before o...


In [11]:
# Split the merged frame into train and test rows by looking up the idx
# labels obtained from get_idx().
train_documents = chroma_document_frame.loc[train_idx, :]
test_documents = chroma_document_frame.loc[test_idx, :]

# Make our chroma db from these documents

In [12]:
# Turn the training rows into Documents: `summary` becomes the page content,
# while `idx` and `label` are carried along as metadata.
chroma_documents = DataFrameLoader(
    train_documents.reset_index()[['summary', 'idx', 'label']],
    page_content_column='summary',
).load()
len(chroma_documents)

900

In [13]:
# Check the first Document: page content plus idx/label metadata.
chroma_documents[0]

Document(page_content=' The customer received an invoice from Cardinal Health for invoice number 1 of 1 for account number 2057199110, but they have questions about the invoice.', metadata={'idx': 20775, 'label': 'Billing / Invoice'})

In [14]:
# NOTE(review): `embedder` is not referenced by any later cell shown here;
# get_or_make_chroma() calls get_embedder() itself.
embedder = get_embedder()

In [15]:
#| export
def get_or_make_chroma(
        data_dir: Path,
        documents: Optional[List[Document]] = None,
        overwrite: bool = False):
    """Open the Chroma store persisted under ``data_dir / 'chroma'``, or build it.

    A store is considered to exist when the directory contains at least one
    ``*.sqlite3`` file.

    Args:
        data_dir: Directory that holds (or will hold) the ``chroma``
            sub-directory. Missing directories are created.
        documents: Documents to embed when a new store has to be built.
            Ignored when an existing store is reused.
        overwrite: When True, discard an existing store and rebuild it from
            ``documents``. When False, an existing store is returned as is.

    Returns:
        A ``Chroma`` instance backed by the persist directory.

    Raises:
        ValueError: If a store has to be built (none exists, or ``overwrite``
            is True) and ``documents`` is None. Nothing is deleted in that
            case.
    """
    chroma_dir = data_dir / 'chroma'
    # parents/exist_ok: tolerate a missing data_dir and concurrent creation.
    chroma_dir.mkdir(parents=True, exist_ok=True)
    embedding_function = get_embedder()
    persist_directory = str(chroma_dir.resolve())
    store_exists = any(chroma_dir.glob("*.sqlite3"))

    if store_exists and not overwrite:
        return Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_function
        )

    # Validate before touching the disk so that a bad call cannot wipe an
    # existing store and then fail.
    if documents is None:
        raise ValueError("documents cannot be None")

    if store_exists:
        # The persist directory may hold sub-directories next to the sqlite
        # file; Path.unlink() cannot remove those, so handle both kinds.
        for entry in chroma_dir.iterdir():
            if entry.is_dir() and not entry.is_symlink():
                shutil.rmtree(entry)
            else:
                entry.unlink()

    return Chroma.from_documents(
        documents,
        embedding_function,
        persist_directory=persist_directory
    )

In [17]:
# overwrite=True discards any persisted store and rebuilds it from
# chroma_documents each time this cell runs, instead of reusing it.
chroma = get_or_make_chroma(data_dir, chroma_documents, overwrite=True)

In [18]:
# Smoke test: query the store with free text and inspect the returned Documents.
chroma.similarity_search("Help I need a drop ship")

[Document(page_content=' **Customer:** Melissa Green, Dropship Coordinator\n\n**Issue:** Received a PO for item 5872742 via Weblink, but the vendor does not dropship as per their notes.\n\n**Details:**\n- PO Number: Not provided\n- Item Number: 5872742\n- Vendor: Not specified\n\n**Request:** Melissa needs confirmation from the vendor regarding their dropshipping policy for the specified item.', metadata={'idx': 7566, 'label': 'Order Processing'}),
 Document(page_content=' The customer sent a Drop Ship order for Cardinal account number 2150122506, containing 2 items to order.\nThey attached the Drop Ship order to the email.\nIf there are any questions, the customer can be contacted at 1-877-805-9483.', metadata={'idx': 14583, 'label': 'Order Processing'}),
 Document(page_content=' The customer sent a drop ship order for Cardinal account number 2150411502, containing 1 item to order.\nThey have attached the drop ship order to the email.\nIf there are any questions, the customer can be c

In [19]:
#| hide
# Run nbdev's export step for this project.
import nbdev; nbdev.nbdev_export()