# chroma

> Put email summaries into a vector store

In [1]:
#| default_exp chroma

In [1]:
#| export
import json
import shutil
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional

import chromadb
import pandas as pd
from google.cloud import storage
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import VertexAIEmbeddings
from langchain.schema import Document
from langchain.vectorstores import Chroma
from tqdm import tqdm

from classifier.schema import get_embedder, get_storage_client, WRITE_PREFIX
from classifier.load import get_training_instances, get_idx, TrainingInstance, \
    PROJECT_BUCKET, get_train_test_idx, LABEL_COLUMN, get_document_batches, write_idx

In [2]:
# Locate the data directory and the summaries CSV; both must already exist.
data_dir = Path('../data')
summary_path = data_dir.joinpath('summaries.csv')
for required_path in (data_dir, summary_path):
    assert required_path.exists()

In [3]:
summaries = pd.read_csv(summary_path)

In [4]:
summaries.head(5)

Unnamed: 0,summary
0,The customer received an invoice from Cardina...
1,The customer received an email from the State...
2,The customer would like to place a new order ...
3,"The customer, State of Florida Next Gen, upda..."
4,The customer is inquiring about an order plac...


Add metadata, create documents

In [5]:
train_idx, test_idx = get_idx()

In [6]:
training_instances = list(get_training_instances())

In [7]:
training_instances[0]

TrainingInstance(idx=20775, label='Billing / Invoice', email_subject='Invoices 1 of 1 for 2057199110 , TEXAS INSTITUTE FOR SURGERY', email_body='Dear Valued Customer,  Your Cardinal Health invoice is attached to this email as a PDF file.  If you have any questions, please contact our Customer Service department at the phone number listed on the invoice.  Thank you, Cardinal Health  _________________________________________________  This message is for the designated recipient only and may contain privileged, proprietary or otherwise private information. If you have received it in error, please notify the sender immediately and delete the original. Any other use of the email by you is prohibited.  Dansk - Deutsch - Espanol - Francais - Italiano - Japanese - Nederlands - Norsk - Portuguese - Chinese Svenska: http://www.cardinalhealth.com/en/support/terms-and-conditions-english.html', metadata={'BU': 'PD', 'case_number': 3553288, 'ACCOUNT_BUSINESS_UNIT__C': 'a1G4z00000H6C4aEAF', 'received

In [8]:
#| export
def merge_summaries_with_instances(
        summaries: pd.DataFrame, 
        instances: "List[TrainingInstance]"
        ) -> pd.DataFrame:
    """Attach each summary to its training instance and index the result by `idx`.

    Pairing is positional: row `i` of `summaries` goes with `instances[i]`.

    Args:
        summaries: Frame with one row per instance, in the same order as
            `instances`. Its own index is ignored.
        instances: Objects exposing `to_series()`; the resulting rows must
            contain an `idx` field, which becomes the index of the output.

    Returns:
        A frame holding the columns of `summaries` followed by the fields of
        each instance, indexed by `idx`.

    Raises:
        ValueError: If `summaries` and `instances` differ in length, since a
            positional pairing would then be ambiguous.
    """
    if len(summaries) != len(instances):
        raise ValueError(
            f"summaries has {len(summaries)} rows but "
            f"{len(instances)} instances were given"
        )
    instances_frame = pd.DataFrame([i.to_series() for i in instances])
    # pd.concat(axis=1) aligns on index labels, not on position. Resetting both
    # indexes to 0..n-1 keeps rows paired even when `summaries` arrives with a
    # filtered/shuffled index or the instance series carry names.
    return pd.concat(
        [
            summaries.reset_index(drop=True),
            instances_frame.reset_index(drop=True),
        ],
        axis=1).set_index('idx')

In [9]:
# Pair every summary with its training instance; the result is indexed by `idx`.
chroma_document_frame = merge_summaries_with_instances(
    summaries=summaries, instances=training_instances)
chroma_document_frame.head(2)

Unnamed: 0_level_0,summary,BU,case_number,ACCOUNT_BUSINESS_UNIT__C,received_at,sfdc_subcategory,predicted_category,predicted_subcategory,record_type,probability,Accuracy_upd,Bin,label,email_subject,email_body
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20775,The customer received an invoice from Cardina...,PD,3553288,a1G4z00000H6C4aEAF,2023-10-13T12:37:20,Billing Statements,Billing / Invoice,,1,0.474032,Correct,4,Billing / Invoice,"Invoices 1 of 1 for 2057199110 , TEXAS INSTITU...","Dear Valued Customer, Your Cardinal Health in..."
46774,The customer received an email from the State...,PD,3658829,,2023-11-29T20:25:47,Account updates,Billing / Invoice,,1,0.566661,Incorrect,5,Account/Inquiry,Invoice status from State of Florida Next Gen,External Email â€“ Please use caution before o...


In [10]:
# Select the train and test rows of the merged frame by their `idx` labels.
train_documents, test_documents = (
    chroma_document_frame.loc[split_idx, :] for split_idx in (train_idx, test_idx)
)

# Make our chroma db from these documents

In [11]:
# Keep the summary text plus the two fields stored as metadata on each document.
train_summary_frame = train_documents.reset_index(drop=False)[['summary', 'idx', 'label']]
summary_loader = DataFrameLoader(train_summary_frame, page_content_column='summary')
chroma_documents = summary_loader.load()
len(chroma_documents)

900

In [12]:
chroma_documents[0]

Document(page_content=' The customer received an invoice from Cardinal Health for invoice number 1 of 1 for account number 2057199110, but they have questions about the invoice.', metadata={'idx': 20775, 'label': 'Billing / Invoice'})

In [13]:
embedder = get_embedder()

In [14]:
#| export
def get_or_make_chroma(
        data_dir: Path, 
        documents: Optional[List[Document]] = None,
        overwrite: bool = False):
    """Open the Chroma store persisted under `data_dir / 'chroma'`, or build it.

    A store is considered to exist when the directory contains a `*.sqlite3`
    file. An existing store is reopened as-is unless `overwrite` is true; in
    every other case a new store is built from `documents`.

    Args:
        data_dir: Directory under which the `chroma` folder lives (it is
            created if missing).
        documents: Documents to embed when a store has to be built. May be
            omitted only when an existing store is being reopened.
        overwrite: When true, discard any existing store and rebuild it.

    Returns:
        A langchain `Chroma` vector store backed by the persist directory.

    Raises:
        ValueError: If a store must be built but `documents` is None. This is
            checked before anything is deleted, so a bad call never destroys
            an existing store.
    """
    chroma_dir = data_dir / 'chroma'
    # parents/exist_ok: tolerate a missing data_dir and avoid a check-then-create race.
    chroma_dir.mkdir(parents=True, exist_ok=True)
    embedding_function = get_embedder()
    persist_directory = str(chroma_dir.resolve())
    store_exists = any(chroma_dir.glob("*.sqlite3"))
    if store_exists and not overwrite:
        return Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_function
        )
    # Validate first: from here on we either delete data or build a new store.
    if documents is None:
        raise ValueError("documents cannot be None")
    if store_exists:
        for entry in chroma_dir.glob("*"):
            # Path.unlink() cannot remove directories, so handle them separately.
            if entry.is_dir() and not entry.is_symlink():
                shutil.rmtree(entry)
            else:
                entry.unlink()
    return Chroma.from_documents(
        documents,
        embedding_function,
        persist_directory=persist_directory
    )

In [15]:
chroma = get_or_make_chroma(data_dir, chroma_documents, overwrite=False)

In [16]:
chroma.similarity_search("Help I need a drop ship")

[Document(page_content=' **Customer:** Melissa Green, Dropship Coordinator\n\n**Issue:** Received a PO for item 5872742 via Weblink, but the vendor does not dropship as per their notes.\n\n**Details:**\n- PO Number: Not provided\n- Item Number: 5872742\n- Vendor: Not specified\n\n**Request:** Melissa needs confirmation from the vendor regarding their dropshipping policy for the specified item.', metadata={'idx': 7566, 'label': 'Order Processing'}),
 Document(page_content=' The customer sent a Drop Ship order for Cardinal account number 2150122506, containing 2 items to order.\nThey attached the Drop Ship order to the email.\nIf there are any questions, the customer can be contacted at 1-877-805-9483.', metadata={'idx': 14583, 'label': 'Order Processing'}),
 Document(page_content=' The customer sent a drop ship order for Cardinal account number 2150411502, containing 1 item to order.\nThey have attached the drop ship order to the email.\nIf there are any questions, the customer can be c

## Make Chroma DB from the 10k batch-processed docs

In [17]:
#| export
def read_json_lines_from_gcs(
        blob_name: str,
        bucket_name: str = PROJECT_BUCKET) -> Iterator[Any]:
    """Lazily yield one parsed JSON value per line of a JSON-lines blob.

    This is a generator: the blob is not opened until iteration starts, and
    lines are decoded one at a time as they are consumed.

    Args:
        blob_name: Name of the blob inside the bucket.
        bucket_name: Bucket to read from; defaults to `PROJECT_BUCKET`.

    Yields:
        The value produced by `json.loads` for each non-blank line, in file
        order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    client = get_storage_client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    with blob.open('r') as f:
        # Iterate the handle directly rather than readlines(), so the whole
        # blob is never held in memory as a list of lines.
        for line in f:
            # A blank line (e.g. trailing newline padding) is not valid JSON.
            if not line.strip():
                continue
            yield json.loads(line)

In [18]:
# Blob name of the batch summarization output; it is read from the default bucket.
batch_result_file_uri = "JDB_experiments/summarization/prediction-model-2023-12-18T15:10:57.834767Z/000000000000.jsonl"

# Materialize the generator so the records can be indexed and counted below.
batch_result = [record for record in read_json_lines_from_gcs(batch_result_file_uri)]

In [19]:
batch_result[0].keys()

dict_keys(['instance', 'predictions', 'status'])

In [20]:
len(batch_result[0].get("predictions"))

1

In [21]:
# One summary string per batch record; records without a usable prediction
# yield "". The `or` fallbacks also cover a present-but-empty/null
# `predictions` list and a null `content`, which a plain .get() default
# would let through to [0] / .strip() and crash on.
batch_result_summaries = [
    ((r.get('predictions') or [{}])[0].get("content") or "").strip()
    for r in batch_result
]

In [22]:
batch_result_metadata = list(read_json_lines_from_gcs(
    "JDB_experiments/summarization_metadata.jsonl"
))

In [23]:
len(batch_result_summaries) == len(batch_result_metadata)

True

In [24]:
len([r for r in batch_result_summaries if len(r) == 0])

725

In [25]:
# Build the metadata frame, then attach the summaries row-for-row.
summary_column = pd.Series(batch_result_summaries)
batch_result_dataframe = pd.DataFrame.from_records(batch_result_metadata)
batch_result_dataframe.loc[:, 'summary'] = summary_column

In [26]:
# Add label
# Load the source cases from GCS; a missing subject is replaced by "N/A".
training_data_uri = f"gs://{PROJECT_BUCKET}/Last50KCases_withSubjectAndBody.xlsx"
training_data = pd.read_excel(training_data_uri)
training_data.loc[:, 'email_subject'] = training_data['email_subject'].fillna("N/A")

In [27]:
# Each record's `idx` is used as a row label into the source data to fetch its label.
batch_result_case_idx = batch_result_dataframe['idx']
batch_result_labels = training_data.loc[batch_result_case_idx, LABEL_COLUMN]
# Assign as a plain list so values are placed by position, not by index label.
batch_result_dataframe.loc[:, 'label'] = batch_result_labels.tolist()

In [28]:
batch_result_dataframe.head(2)

Unnamed: 0,BU,case_number,ACCOUNT_BUSINESS_UNIT__C,received_at,sfdc_subcategory,predicted_category,predicted_subcategory,record_type,probability,Accuracy_upd,Bin,idx,summary,label
0,PD,3665915,,2023-12-01T22:43:59,Inquiry,Order Processing,Order Entry,1,0.676467,Incorrect,6,48942,"The email is from Max Daugherty, Vice Presiden...",Order Processing
1,PD,3622254,,2023-11-13T16:16:55,Account updates,Account/Inquiry,,1,0.764455,Correct,7,37242,"The customer, Nicholas Brand from Slavins Hanc...",Account/Inquiry


In [29]:
train_8k, test_2k = get_train_test_idx(
    batch_result_dataframe,
    batch_result_dataframe.shape[0],
    0.8,
    label_column='label'
)

In [30]:
train_8k.shape[0], test_2k.shape[0]

(8000, 2000)

In [31]:
# Drop rows whose summary is empty before building the index sets and documents.
train_8k = train_8k.loc[train_8k['summary'].str.len().gt(0)]
test_2k = test_2k.loc[test_2k['summary'].str.len().gt(0)]

len(train_8k), len(test_2k)

(7418, 1857)

In [32]:
write_idx(
    train_8k.set_index('idx').index,
    test_2k.set_index('idx').index,
    prefix=WRITE_PREFIX + "/summarization_idx"
)

In [85]:
# Directory used below as the path of the persistent Chroma client.
chroma_10k_path = data_dir / 'chroma_10k'
# exist_ok replaces the check-then-create pattern; parents covers a missing data_dir.
chroma_10k_path.mkdir(parents=True, exist_ok=True)

In [86]:
# Summary text becomes the page content; `idx` and `label` are the remaining columns.
chroma_10k_frame = train_8k[['idx', 'label', 'summary']]
chroma_10k_loader = DataFrameLoader(chroma_10k_frame, page_content_column='summary')
chroma_10k_documents = chroma_10k_loader.load()
len(chroma_10k_documents)

7418

Embed

In [88]:
embedder = get_embedder()

In [100]:
# Embed the summaries in batches of 5, keeping the vectors in document order.
chroma_10k_documents_embedded = []

# The context manager closes the progress bar even if an embedding call raises.
# `doc_batch` keeps its name on purpose: a later cell inspects the last batch.
with tqdm(total=len(chroma_10k_documents), ncols=80, leave=False) as pbar:
    for doc_batch in get_document_batches(iter(chroma_10k_documents), 5):
        doc_batch_embedded = embedder.embed_documents([d.page_content for d in doc_batch])
        chroma_10k_documents_embedded.extend(doc_batch_embedded)
        pbar.update(len(doc_batch))

                                                                                

In [112]:
# Write embeddings to GCS
# The vectors are stored as a single JSON document under the write prefix.
embeddings_blob = get_storage_client().bucket(PROJECT_BUCKET).blob(
    WRITE_PREFIX + "/summarization_embeddings.json")
with embeddings_blob.open('w') as f:
    json.dump(chroma_10k_documents_embedded, f)

Add to chroma

In [104]:
# Should take about 15 minutes if you're building it fresh
chroma_10k_client = chromadb.PersistentClient(path=str(chroma_10k_path.resolve()))

In [105]:
collection_10k = chroma_10k_client.get_or_create_collection(name="emails")

In [109]:
len(chroma_10k_documents_embedded), len(doc_batch)

(7418, 3)

In [110]:
# Build the parallel lists first: text, metadata and a string id per document.
# The precomputed embeddings are passed in the same document order.
chroma_10k_texts = [d.page_content for d in chroma_10k_documents]
chroma_10k_metadatas = [d.metadata for d in chroma_10k_documents]
chroma_10k_ids = [str(d.metadata.get('idx')) for d in chroma_10k_documents]
collection_10k.add(
    ids=chroma_10k_ids,
    documents=chroma_10k_texts,
    embeddings=chroma_10k_documents_embedded,
    metadatas=chroma_10k_metadatas,
)

In [118]:
langchain_10k_chroma = Chroma(
    collection_name="emails",
    client=chroma_10k_client, 
    embedding_function=embedder)

## Export

In [119]:
#| hide
import nbdev; nbdev.nbdev_export()