# chroma

> Put email summaries into a vector store

In [35]:
#| default_exp chroma

In [7]:
#| export
from typing import List, Dict, Any, Iterable
from pathlib import Path
import json
import pandas as pd

import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings import VertexAIEmbeddings
from langchain.schema import Document
from langchain.document_loaders import DataFrameLoader
from google.cloud import storage
from tqdm import tqdm

from classifier.schema import get_embedder, get_storage_client, WRITE_PREFIX
from classifier.load import get_emails_from_frame, get_idx, Email, \
    PROJECT_BUCKET, get_train_test_idx, LABEL_COLUMN, get_batches, write_idx, \
    get_raw_emails_tejas_case_numbers

In [2]:
## Load summaries - Local
# data_dir = Path('../data')
# assert data_dir.exists()
# summary_path = data_dir / 'summaries.csv'
# assert summary_path.exists()

In [5]:
# Read the pre-computed summaries from GCS. The CSV's unnamed first column is
# renamed to `idx` so the frame can be joined to the emails on that key.
summaries_path = f'gs://{PROJECT_BUCKET}/{WRITE_PREFIX}/tejas/summaries.csv'
summaries = pd.read_csv(summaries_path).rename(columns={"Unnamed: 0": "idx"})
summaries.head(5)

Unnamed: 0,idx,summary
0,31716,Pavlina Georgieva (Logistics Coordinator) sen...
1,35200,**Summary**\n\nA customer reached out to Card...
2,462,**Subject**: Invoice 7322207358 - State of Fl...
3,3705,**Subject**: ACTION REQUIRED | Additional Inf...
4,25300,**Subject**: Paid - Invoice 7328757492\n\n**C...


Add metadata, create documents

In [6]:
# Index labels of the train/test split stored under the tejas prefix; they are
# used further down to slice the merged frame with `.loc`.
train_idx, test_idx = get_idx(prefix=f"{WRITE_PREFIX}/tejas")

In [12]:
# Materialise every Email produced from the raw tejas frame into a list so it
# can be iterated more than once.
raw_emails_tejas = get_raw_emails_tejas_case_numbers()
emails = [
    *get_emails_from_frame(raw_emails_tejas, index_prefix=f'{WRITE_PREFIX}/tejas')
]

In [22]:
# One row per email, built from each Email's Series representation.
emails_frame = pd.DataFrame(
    [email.to_series() for email in emails]
)
emails_frame.head(2)

Unnamed: 0,BU,case_number,ACCOUNT_BUSINESS_UNIT__C,received_at,sfdc_subcategory,predicted_category,predicted_subcategory,record_type,probability,Accuracy_upd,Bin,idx,label,email_subject,email_body
0,PD,3598350,,2023-11-01T19:40:57,Drop Ship Order,Order Processing,Drop Ship Order,1,0.576672,Correct,5,31716,Order Processing,Equashield latest - FW: EQ II Catalog 2023 - C...,External Email â€“ Please use caution before o...
1,PD,3613116,,2023-11-08T17:27:04,Account balance,Billing / Invoice,,1,0.496874,Correct,4,35200,Billing / Invoice,Auto-Reply. We Have Received Your Request,"To whom it may concern, Your request has been..."


In [27]:
# Keep only emails that have a summary (inner join on `idx`), then index the
# result by `idx` so rows can be selected by label.
chroma_document_frame = pd.merge(
    emails_frame, summaries, how='inner', on='idx'
).set_index('idx')
chroma_document_frame.head(2)

Unnamed: 0_level_0,BU,case_number,ACCOUNT_BUSINESS_UNIT__C,received_at,sfdc_subcategory,predicted_category,predicted_subcategory,record_type,probability,Accuracy_upd,Bin,label,email_subject,email_body,summary
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
31716,PD,3598350,,2023-11-01T19:40:57,Drop Ship Order,Order Processing,Drop Ship Order,1,0.576672,Correct,5,Order Processing,Equashield latest - FW: EQ II Catalog 2023 - C...,External Email â€“ Please use caution before o...,Pavlina Georgieva (Logistics Coordinator) sen...
35200,PD,3613116,,2023-11-08T17:27:04,Account balance,Billing / Invoice,,1,0.496874,Correct,4,Billing / Invoice,Auto-Reply. We Have Received Your Request,"To whom it may concern, Your request has been...",**Summary**\n\nA customer reached out to Card...


In [28]:
# (rows, columns) of the merged emails + summaries frame.
chroma_document_frame.shape

(3000, 15)

In [29]:
# Select the rows of each split by label; the frame is indexed by `idx`.
train_documents = chroma_document_frame.loc[train_idx, :]
test_documents = chroma_document_frame.loc[test_idx, :]

## Make our Chroma DB from these documents

In [30]:
# Turn the training rows into LangChain Documents: `summary` becomes the page
# content, while `idx` and `label` are carried along as metadata.
chroma_documents = DataFrameLoader(
    train_documents.reset_index().loc[:, ['summary', 'idx', 'label']],
    page_content_column='summary',
).load()
len(chroma_documents)

2400

In [31]:
# Peek at the first Document to check its page_content and metadata.
chroma_documents[0]

Document(page_content=' Pavlina Georgieva (Logistics Coordinator) sent the 2023 EQ II Catalog from Cardinal Health to Thomas Everitt (Customer Service Representative).', metadata={'idx': 31716, 'label': 'Order Processing'})

In [32]:
# Embedder object provided by classifier.schema.get_embedder.
embedder = get_embedder()

In [33]:
#| export
def get_or_make_chroma(
        data_dir: Path,
        documents: List[Document] = None,
        overwrite: bool = False):
    """Open the Chroma store persisted under ``data_dir / 'chroma'``, or build it.

    Args:
        data_dir: Directory that contains (or will contain) the ``chroma``
            persist directory.
        documents: Documents to embed when a store has to be built. Required
            unless an existing store is being reused.
        overwrite: When True and a store already exists, its files are removed
            and the store is rebuilt from ``documents``.

    Returns:
        A LangChain ``Chroma`` instance backed by the persist directory.

    Raises:
        ValueError: If a store must be built but ``documents`` is None. This is
            checked before anything on disk is touched, so an existing store
            is never wiped by a call that cannot rebuild it.
    """
    import shutil  # local import: only needed when wiping an existing store

    chroma_dir = data_dir / 'chroma'
    # A store is considered present when the directory holds a sqlite file.
    store_exists = chroma_dir.is_dir() and any(chroma_dir.glob("*.sqlite3"))

    if (overwrite or not store_exists) and documents is None:
        raise ValueError("documents cannot be None")

    chroma_dir.mkdir(parents=True, exist_ok=True)
    embedding_function = get_embedder()
    persist_directory = str(chroma_dir.resolve())

    if store_exists and not overwrite:
        return Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_function
        )

    if store_exists:
        # Path.unlink() cannot remove directories, and the persist directory
        # may hold sub-directories next to the sqlite file, so handle both.
        for entry in list(chroma_dir.iterdir()):
            if entry.is_dir() and not entry.is_symlink():
                shutil.rmtree(entry)
            else:
                entry.unlink()

    return Chroma.from_documents(
        documents,
        embedding_function,
        persist_directory=persist_directory
    )

In [35]:
# Local working directories. `data_dir` must already exist; the tejas
# sub-directory is created on demand. `exist_ok=True` replaces the separate
# exists() check, so re-running the cell is safe and there is no
# check-then-create race.
data_dir = Path("../data")
assert data_dir.exists()
tejas_dir = data_dir / "tejas"
tejas_dir.mkdir(exist_ok=True)

In [36]:
# Takes a bit
# With overwrite=False a store already persisted under tejas_dir/chroma is
# reused; otherwise it is built from chroma_documents.
chroma = get_or_make_chroma(tejas_dir, chroma_documents, overwrite=False)

In [37]:
# Smoke test: retrieve the stored summaries most similar to a free-text query.
chroma.similarity_search("Help I need a drop ship")

[Document(page_content=' **Customer:** \n- Name: N/A\n- Account Number: 24-2057190625 465086\n- Request: Drop ship order\n\n**Order Details:**\n- NDC: 677683-0630-68\n- CIN: 2905347\n- Item: Calcium Gluconate 2.5% Gel\n- Quantity: 2\n- PO: 1MEDCAR11022023\n\n**Action Required:**\n- Process the drop ship order', metadata={'idx': 32233, 'label': 'Order Processing'}),
 Document(page_content=' **Customer:** Robin Passmore, Supply at Maxor, (806)324-5410 EX: 5405\n\n**Request:** Robin wants to place a drop ship order for several accounts for the same item.\n\n**Item Details:**\n- ECONOMY EMRG. SPILL KIT\n- ITEM #3678257\n- QTY #1\n\n**Accounts:**\n- 2057201907\n- 2052016170\n- 2052014129\n-', metadata={'idx': 44254, 'label': 'Order Processing'}),
 Document(page_content=' **Subject**: Cardinal Drop Ship order - Acct 2150123024\n\n**Customer**: Cardinal (Acct # 2150123024)\n\n**Issue**: Drop Ship order with 1 item\n\n**Action Required**: Contact Wellpartner Operations at 1-877-805-9483 for an

## Make chroma DB from the 10k batch processed docs

In [20]:
#| export
def read_json_lines_from_gcs(
        blob_name: str,
        bucket_name: str = PROJECT_BUCKET) -> Iterable[Any]:
    """Lazily yield one parsed JSON value per line of a JSON-lines blob in GCS.

    Args:
        blob_name: Object name of the blob inside the bucket.
        bucket_name: Bucket to read from; defaults to ``PROJECT_BUCKET``.

    Yields:
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    client = get_storage_client()
    blob = client.bucket(bucket_name).blob(blob_name)
    with blob.open('r') as f:
        # Iterate the reader directly instead of calling readlines(), so the
        # blob is consumed line by line rather than loaded into memory first.
        for line in f:
            # Tolerate blank lines (e.g. a stray trailing newline).
            if not line.strip():
                continue
            yield json.loads(line)

In [21]:
# Blob name (inside the default PROJECT_BUCKET) of the batch prediction output.
batch_result_file_uri = "JDB_experiments/summarization/prediction-model-2023-12-18T15:10:57.834767Z/000000000000.jsonl"

# Materialise every parsed line so the records can be indexed and re-iterated.
batch_result = list(read_json_lines_from_gcs(batch_result_file_uri))

In [22]:
# Top-level keys of one batch record.
batch_result[0].keys()

dict_keys(['instance', 'predictions', 'status'])

In [23]:
# Number of predictions attached to the first record.
len(batch_result[0].get("predictions"))

1

In [24]:
# One summary string per batch record, taken from the first prediction's
# "content". The `or` fallbacks make a record with a missing, null or empty
# "predictions" list, or a null "content", produce "" instead of raising.
batch_result_summaries = [
    ((r.get('predictions') or [{}])[0].get("content") or "").strip()
    for r in batch_result
]

In [25]:
# Metadata records, read from a separate JSON-lines blob. Their count is
# compared with the number of summaries in the next cell.
batch_result_metadata = list(read_json_lines_from_gcs(
    "JDB_experiments/summarization_metadata.jsonl"
))

In [26]:
# The summaries and metadata are combined positionally below, so the two lists
# must have the same length.
len(batch_result_summaries) == len(batch_result_metadata)

True

In [27]:
# Count the records whose summary came back empty.
sum(len(summary) == 0 for summary in batch_result_summaries)

725

In [28]:
# Build the metadata frame and attach the summaries. Wrapping the list in a
# Series aligns it on the frame's index.
batch_result_dataframe = (
    pd.DataFrame.from_records(batch_result_metadata)
    .assign(summary=pd.Series(batch_result_summaries))
)

In [29]:
# Add label
# Load the full case export and replace missing subjects with "N/A".
training_data = pd.read_excel(
    f"gs://{PROJECT_BUCKET}/Last50KCases_withSubjectAndBody.xlsx"
).assign(
    email_subject=lambda frame: frame.email_subject.fillna("N/A")
)

In [30]:
# Look up each record's label in the training data by its `idx`, then attach
# the labels positionally (as a plain list, so no index alignment happens).
batch_result_labels = training_data.loc[batch_result_dataframe['idx'], LABEL_COLUMN]
batch_result_dataframe['label'] = batch_result_labels.to_list()

In [31]:
# Preview the combined metadata + summary + label frame.
batch_result_dataframe.head(2)

Unnamed: 0,BU,case_number,ACCOUNT_BUSINESS_UNIT__C,received_at,sfdc_subcategory,predicted_category,predicted_subcategory,record_type,probability,Accuracy_upd,Bin,idx,summary,label
0,PD,3665915,,2023-12-01T22:43:59,Inquiry,Order Processing,Order Entry,1,0.676467,Incorrect,6,48942,"The email is from Max Daugherty, Vice Presiden...",Order Processing
1,PD,3622254,,2023-11-13T16:16:55,Account updates,Account/Inquiry,,1,0.764455,Correct,7,37242,"The customer, Nicholas Brand from Slavins Hanc...",Account/Inquiry


In [34]:
# Show the full text of the first five summaries.
batch_result_dataframe.summary.head(5).values.tolist()

['The email is from Max Daugherty, Vice President of Clinical Operations at Partners Pharmacy. He is requesting a same-day order for account number 2057192443 in Stafford, TX. The confirmation number for the order is 1053194086.\n\nThe customer is requesting that the order be removed from credit hold, processed, and delivered to the pharmacy the same day.',
 'The customer, Nicholas Brand from Slavins Hancock Pharmacy, is requesting a return for the following items due to overstock:\n\n- LOSARTAN/HCT TAB 50-12.5 (NDC: 00093-7367-98)\n- HYDROCORT CRE 1% (NDC: 00168-0015-31)\n- MOMETASONE CRE 0.1% (NDC',
 'The customer, Twila Traweek, needs to place an item on a dropship order with overnight shipping for Monday or Tuesday. The order should be charged to account number 2052011851. Twila requests confirmation when the order is placed.',
 'The customer, Leslie K. Keller, is inquiring about the best dating available for CIN 5476262 Gamastan 2ml.',
 '- Tina, a customer with account number 2057

In [29]:
# Split all rows into train and test frames.
# NOTE(review): the meaning of the positional arguments (number of rows to use,
# train fraction) is inferred from the 8000/2000 result shown below; confirm
# against get_train_test_idx.
train_8k, test_2k = get_train_test_idx(
    batch_result_dataframe,
    batch_result_dataframe.shape[0],
    0.8,
    label_column='label'
)

In [30]:
# Row counts of the two splits.
train_8k.shape[0], test_2k.shape[0]

(8000, 2000)

In [31]:
# Drop rows whose summary is empty.
train_8k = train_8k[train_8k['summary'].str.len().gt(0)]
test_2k = test_2k[test_2k['summary'].str.len().gt(0)]

len(train_8k), len(test_2k)

(7418, 1857)

In [32]:
# Persist the `idx` values of the filtered train/test rows under the
# summarization prefix.
write_idx(
    train_8k.set_index('idx').index,
    test_2k.set_index('idx').index,
    prefix=f"{WRITE_PREFIX}/summarization_idx",
)

In [85]:
# Persist directory for the 10k store. `exist_ok=True` replaces the separate
# exists() check, so re-running the cell is safe.
chroma_10k_path = data_dir / 'chroma_10k'
chroma_10k_path.mkdir(exist_ok=True)

In [86]:
# Turn the training rows into LangChain Documents: `summary` becomes the page
# content, while `idx` and `label` are carried along as metadata.
chroma_10k_documents = DataFrameLoader(
    train_8k.loc[:, ['idx', 'label', 'summary']],
    page_content_column='summary',
).load()
len(chroma_10k_documents)

7418

Embed

In [88]:
# Embedder used for the manual batch embedding below and passed to the
# LangChain Chroma wrapper at the end of the notebook.
embedder = get_embedder()

In [100]:
# Embed the document texts in batches of 5, collecting one vector per document
# in the same order as `chroma_10k_documents`.
chroma_10k_documents_embedded = []

# Using tqdm as a context manager closes the progress bar even when an
# embedding call raises part-way through.
with tqdm(total=len(chroma_10k_documents), ncols=80, leave=False) as pbar:
    for doc_batch in get_batches(iter(chroma_10k_documents), 5):
        doc_batch_embedded = embedder.embed_documents([d.page_content for d in doc_batch])
        chroma_10k_documents_embedded.extend(doc_batch_embedded)
        pbar.update(len(doc_batch))

# The vectors are paired with the documents by position later on, so fail fast
# if any are missing.
assert len(chroma_10k_documents_embedded) == len(chroma_10k_documents)

                                                                                

In [112]:
# Write embeddings to GCS
# All vectors are serialised as a single JSON document.
with (
    get_storage_client()
    .bucket(PROJECT_BUCKET)
    .blob(WRITE_PREFIX + "/summarization_embeddings.json")
    .open('w')
) as f:
    json.dump(chroma_10k_documents_embedded, f)

Add to chroma

In [104]:
# Should take about 15 minutes if you're building it fresh
# Persistent chromadb client rooted at the chroma_10k directory.
chroma_10k_client = chromadb.PersistentClient(path=str(chroma_10k_path.resolve()))

In [105]:
# Reuse the "emails" collection when it already exists, otherwise create it.
collection_10k = chroma_10k_client.get_or_create_collection(name="emails")

In [109]:
# Sanity check: total number of embeddings and the size of the final batch.
# `doc_batch` is the variable left over from the embedding loop above.
len(chroma_10k_documents_embedded), len(doc_batch)

(7418, 3)

In [110]:
# Insert the pre-computed embeddings in slices rather than in one call:
# chromadb enforces a maximum number of records per add() and raises when a
# single call exceeds it. The slices are aligned by position, so documents,
# embeddings, metadata and ids stay matched.
ADD_BATCH_SIZE = 1000
for start in range(0, len(chroma_10k_documents), ADD_BATCH_SIZE):
    stop = start + ADD_BATCH_SIZE
    batch_docs = chroma_10k_documents[start:stop]
    collection_10k.add(
        documents=[d.page_content for d in batch_docs],
        embeddings=chroma_10k_documents_embedded[start:stop],
        metadatas=[d.metadata for d in batch_docs],
        ids=[str(d.metadata.get('idx')) for d in batch_docs],
    )

In [118]:
# LangChain Chroma wrapper over the "emails" collection of the persistent
# client, constructed with the same embedder that produced the stored vectors.
langchain_10k_chroma = Chroma(
    collection_name="emails",
    client=chroma_10k_client, 
    embedding_function=embedder)

## Export

In [38]:
#| hide
# Run nbdev's export step for this project.
import nbdev; nbdev.nbdev_export()