# process

> Process our emails, remove boilerplate, split email chains

In [1]:
#| default_exp process

In [2]:
#| export
from typing import Dict, Any, Tuple, Iterable, List

from classifier.schema import predict, get_storage_client, \
    get_model, DEFAULT_PREDICT_PARAMS, quota_handler
from classifier.load import get_emails_from_frame, get_raw_emails_tejas_case_numbers, \
    Email, PROJECT_BUCKET, WRITE_PREFIX, get_idx, get_batches

from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.llms import VertexAI
from langchain.schema.runnable import RunnableSequence
from tqdm.auto import tqdm
from google.cloud.aiplatform import BatchPredictionJob

In [3]:
import json
from pathlib import Path

import pandas as pd
from itertools import chain

In [4]:
raw_emails_tejas = get_raw_emails_tejas_case_numbers()

In [5]:
training_instance_loader = get_emails_from_frame(
    raw_emails_tejas,
    'train',
    index_prefix=f'{WRITE_PREFIX}/tejas'
)

In [6]:
sample_training_instances = next(get_batches(training_instance_loader))
len(sample_training_instances)

32

In [7]:
#| export
EMAIL_SUBJECT_PREFIX = "--EMAIL SUBJECT--"
EMAIL_BODY_PREFIX = "--EMAIL BODY--"
PREFIX_LEN = len(EMAIL_SUBJECT_PREFIX + EMAIL_BODY_PREFIX) + len("\n"*4)


def email_to_document(
        email: Email
        ) -> Document:
    metadata = email.metadata.copy()
    metadata['idx'] = email.idx
    metadata['label'] = email.label
    return Document(
        page_content="\n".join([
            EMAIL_SUBJECT_PREFIX,
            email.email_subject,
            EMAIL_BODY_PREFIX,
            email.email_body]),
        metadata=metadata
    )

## Split Email Chains

In [8]:
#| export
SPLIT_CHAIN_PROMPT_TEMPLATE = """The following is text from an email chain.
If there is more than one email in the chain, return the positions in the text where each email starts.
If there is only a single email in the chain, return [0].
Return a list of positions in the text as integers.
---EMAIL---
{email}
---END EMAIL---
Positions:"""


SPLIT_CHAIN_PROMPT = PromptTemplate.from_template(SPLIT_CHAIN_PROMPT_TEMPLATE)

In [9]:
llm = VertexAI()

In [10]:
split_chain = SPLIT_CHAIN_PROMPT | llm

In [11]:
sample_training_instances[21].idx

48166

In [12]:
sample_training_instances[21].email_body

"Hi,\n\n\nWe will update with the A/C no shorlty for this Ship to.\nWould you be able to process the PO with the Ship to Location ?\n\nST ELIAS SPECIALTY HOSPITAL\n4800 CORDOVA ST\nANCHORAGE, AK 99503\n\n\nThank you,\nPreethi P\nBuyer â€“ Order Confirmation & Open Order,\nProvidence Resource, Engineering and Hospitality\nO 949-381-4500 Option 2\npreethi.panchabakesan@providence.org\n\n\n\n\n\n------------------------------------------------------------------------------------------------\nTo: provpoconfirmation@providence.org, ProvPOConfirmation@provsjh.org\nFrom: pharma-customerservice@cardinalhealth.com\nDate: 2023-11-29 16:36:29\nSubject: RE: RE: Document Purchase Order P0180000002700\n\n\nHello Providence,\n\nGood day.\n\nMay we please have the Ship to account number for this request?\n\nLooking forward to your response.\n\nRegards,\n[A logo with red lines  Description automatically generated]    Nico Atendido\nSr Rep, Customer Service Ops | Pharma\nGlobal Business Services\nPharma

In [13]:
split_document_example = sample_training_instances[21]

In [14]:
example_split_document = email_to_document(split_document_example)
example_split_answer = split_chain.invoke({"email": example_split_document.page_content})
example_split_answer

' [0, 1000]'

In [15]:
import ast

example_split_answer_positions = ast.literal_eval(example_split_answer)

In [16]:
example_split_emails = []

for start_pos, end_pos in zip(example_split_answer_positions, example_split_answer_positions[1:] + [None]):
    example_split_emails.append(example_split_document.page_content[start_pos: end_pos])

len(example_split_emails)

2

In [17]:
for e in example_split_emails:
    print("-- START EMAIL --")
    print(e)
    print("-- END EMAIL --\n")

-- START EMAIL --
--EMAIL SUBJECT--
RE:RITM10279783 RE: Document Purchase Order P0180000002700
--EMAIL BODY--
Hi,


We will update with the A/C no shorlty for this Ship to.
Would you be able to process the PO with the Ship to Location ?

ST ELIAS SPECIALTY HOSPITAL
4800 CORDOVA ST
ANCHORAGE, AK 99503


Thank you,
Preethi P
Buyer â€“ Order Confirmation & Open Order,
Providence Resource, Engineering and Hospitality
O 949-381-4500 Option 2
preethi.panchabakesan@providence.org





------------------------------------------------------------------------------------------------
To: provpoconfirmation@providence.org, ProvPOConfirmation@provsjh.org
From: pharma-customerservice@cardinalhealth.com
Date: 2023-11-29 16:36:29
Subject: RE: RE: Document Purchase Order P0180000002700


Hello Providence,

Good day.

May we please have the Ship to account number for this request?

Looking forward to your response.

Regards,
[A logo with red lines  Description automatically generated]    Nico Atendido
S

## Summarize emails

In [18]:
pd.Series([len(d.email_body + d.email_subject) for d in sample_training_instances]).describe()

count      32.000000
mean     1396.937500
std       656.691558
min       400.000000
25%      1009.500000
50%      1115.500000
75%      1746.750000
max      2964.000000
dtype: float64

Define summarization prompt

In [19]:
#| export
# TODO: Ignore sender, receiver information
# TODO: Summarize most recent email, ignore rest
# TODO: Remove boilerplate
SUMMARIZE_PROMPT_PREFIX = """Summarize the following email chain. 
Include information a customer service representative might find useful. 
Denote the business function the involved parties may perform.
Focus on the most recent email.
Do not include any boilerplate content in your summary.
Only use information present in the email.
Think through your summary step-by-step.
EMAIL: """

SUMMARIZE_PROMPT_STR = SUMMARIZE_PROMPT_PREFIX + "{context}\nSummary:"

SUMMARIZE_PROMPT = PromptTemplate.from_template(SUMMARIZE_PROMPT_STR)

BISON_MAXIMUM_INPUT_TOKENS = 8192
CONTEXT_TOKEN_LIMIT = BISON_MAXIMUM_INPUT_TOKENS - len(SUMMARIZE_PROMPT_PREFIX)

In [20]:
CONTEXT_TOKEN_LIMIT

7838

### Summarize individually

In [21]:
model = VertexAI()

In [22]:
#| export
def get_summary_chain() -> RunnableSequence:
    return SUMMARIZE_PROMPT | VertexAI()

In [23]:
chain = get_summary_chain()

In [24]:
type(chain)

langchain_core.runnables.base.RunnableSequence

In [25]:
example_batch = [email_to_document(i) for i in sample_training_instances]

In [53]:
#| export
@quota_handler
def get_documents_summaries(
    documents: List[Document], 
    chain: RunnableSequence
    ) -> List[str]:
    return chain.batch([{'context': d.page_content} for d in documents])

In [47]:
example_batch_summaries = get_documents_summaries(example_batch[:5], chain)
len(example_batch_summaries)

5

In [48]:
for doc, summary, instance in zip(
    example_batch, 
    example_batch_summaries, 
    sample_training_instances[:len(example_batch)]
    ):
    print("-- DOCUMENT --\n", doc.page_content, "\n")
    print("-- LABEL --\n", instance.label, "\n")
    print("-- SUMMARY --\n", summary.strip(), "\n")

-- DOCUMENT --
 --EMAIL SUBJECT--
Equashield latest - FW: EQ II Catalog 2023 - Cardinal Health.xlsx
--EMAIL BODY--
External Email â€“ Please use caution before opening attachments or clicking links  Let us know if you need anything else.    Regards,  Thomas Everitt Customer Service Representative Office    +1 516 684 8200 / Ext: 220 Mobile  +1 516 398 97 25 Fax          +1 516 684 8202 www.equashield.com<http://www.equashield.com/> [cid:image001.png@01DA0CD9.861376B0] [cid:image002.png@01DA0CD9.861376B0]  From: Pavlina Georgieva <pavlina@equashield.com> Sent: Wednesday, November 1, 2023 3:39 PM To: Thomas Everitt <Thomas.e@equashield.com> Subject: EQ II Catalog 2023 - Cardinal Health.xlsx      Regards,  Pavlina Georgieva Logistics Coordinator Office    +1 516 684 8200 / Ext: 202 Fax          +1 516 684 8202 www.equashield.com<http://www.equashield.com/> [cid:image001.png@01DA0CD9.861376B0] [cid:image002.png@01DA0CD9.861376B0]   ________________________________ Confidentiality Notice: T

Get summaries for our cohort of emails.

In [54]:
#| export
def get_summaries(
        instances: Iterable[Email], 
        chain: RunnableSequence,
        batch_size: int = 5) -> Iterable[List[str]]:
    for instance_batch in get_batches(instances, batch_size):
        instance_batch_documents = [email_to_document(i) for i in instance_batch]
        yield get_documents_summaries(instance_batch_documents, chain)

In [50]:
train_idx, test_idx = get_idx(prefix=f"{WRITE_PREFIX}/tejas")
train_idx.shape
total_instances = train_idx.shape[0] + test_idx.shape[0]

In [51]:
total_instances

3000

In [56]:
instance_loader = get_emails_from_frame(
    raw_emails_tejas,
    index_prefix=f'{WRITE_PREFIX}/tejas')

summaries = []

pbar = tqdm(total=total_instances, ncols=80, leave=True)

for s in get_summaries(instance_loader, chain):
    summaries.extend(s)
    pbar.update(len(s))

pbar.close()

len(summaries) == total_instances

  0%|                                                  | 0/3000 [00:00<?, ?it/s]

True

In [57]:
len(summaries)

3000

In [63]:
# write summaries
pd.DataFrame(
    summaries, 
    columns=["summary"], 
    index=pd.concat([train_idx, test_idx], axis=0).values
).to_csv(
    f'gs://{PROJECT_BUCKET}/{WRITE_PREFIX}/tejas/summaries.csv', 
    index=True)

## Batch Prediction

### Prepare prompts for batch

In [12]:
documents = list(map(make_document_from_instance, sample_training_instances))
documents[0]

Document(page_content='--EMAIL SUBJECT--\nPO 70026263\n--EMAIL BODY--\nExternal Email â€“ Please use caution before opening attachments or clicking links\n\nHello\n\nThe attached PO was sent a few days ago.  Please confirm it is in your system.\n\n\nThank you\nJoe\n\n\nJoe Liesse\nOperations Coordinator\nSupply Chain Management â€“ P2P\nPhone: 507-266-5551\nEmail: liesse.joseph@mayo.edu<mailto:liesse.joseph@mayo.edu>\n________________________\nMayo Clinic\n200 First Street SW\nRochester, MN 55905', metadata={'BU': 'PD', 'case_number': 3665915, 'ACCOUNT_BUSINESS_UNIT__C': nan, 'received_at': '2023-12-01T22:43:59', 'sfdc_subcategory': 'Inquiry', 'predicted_category': 'Order Processing', 'predicted_subcategory': 'Order Entry', 'record_type': 1, 'probability': 0.6764667, 'Accuracy_upd': 'Incorrect', 'Bin': 6, 'idx': 48942})

Prepare prompts

In [None]:
#| export
def prepare_summarization_prompt(document: Document) -> Tuple[Dict[str, str], Dict[str, Any]]:
    prompt = {'prompt': SUMMARIZE_PROMPT.format(context=document.page_content)}
    return prompt, document.metadata

In [14]:
prompt_and_metadata = prepare_summarization_prompt(documents[0])
prompt_and_metadata

({'prompt': 'You are a customer service representative.\nSummarize the following email, try to preserve as much information as is necessary to diagnose and solve the customers issue detailed in the email.\nThink through your summary step-by-step.\nOnly use information present in the email.\nEMAIL:\n--EMAIL SUBJECT--\nPO 70026263\n--EMAIL BODY--\nExternal Email â€“ Please use caution before opening attachments or clicking links\n\nHello\n\nThe attached PO was sent a few days ago.  Please confirm it is in your system.\n\n\nThank you\nJoe\n\n\nJoe Liesse\nOperations Coordinator\nSupply Chain Management â€“ P2P\nPhone: 507-266-5551\nEmail: liesse.joseph@mayo.edu<mailto:liesse.joseph@mayo.edu>\n________________________\nMayo Clinic\n200 First Street SW\nRochester, MN 55905\nSummary:'},
 {'BU': 'PD',
  'case_number': 3665915,
  'ACCOUNT_BUSINESS_UNIT__C': nan,
  'received_at': '2023-12-01T22:43:59',
  'sfdc_subcategory': 'Inquiry',
  'predicted_category': 'Order Processing',
  'predicted_sub

Make a summarization request

In [15]:
sample_prompt = prompt_and_metadata[0]['prompt']

In [16]:
predict(sample_prompt)

 The customer, Joe Liesse from Mayo Clinic, is inquiring about a purchase order (PO) numbered 70026263 that was sent a few days ago. He wants confirmation that the PO has been received and is in the recipient's system.

In [28]:
#| export
SUMMARIZATION_PROMPT_FILE_NAME = "summarization_prompts.jsonl"
SUMMARIZATION_METADATA_FILE_NAME = "summarization_metadata.jsonl"


def prepare_batch_summarization_files(
        loader: Iterable[Email],
        bucket_name: str = PROJECT_BUCKET,
        use_pbar: bool = False,
        pbar_size: int = 10000,
        prefix: str = WRITE_PREFIX):
    client = get_storage_client()
    bucket = client.bucket(bucket_name=bucket_name)
    prompt_blob_name = f"{prefix}/{SUMMARIZATION_PROMPT_FILE_NAME}"
    metadata_blob_name = f"{prefix}/{SUMMARIZATION_METADATA_FILE_NAME}"
    prompt_blob = bucket.blob(blob_name=prompt_blob_name)
    metadata_blob = bucket.blob(blob_name=metadata_blob_name)
    if use_pbar:
        pbar = tqdm(total=pbar_size, ncols=80, leave=False)
    with metadata_blob.open("w") as metadata_f:
        with prompt_blob.open("w") as prompt_f:
            for instance in loader:
                document = email_to_document(instance)
                prompt, metadata = prepare_summarization_prompt(document)
                # Write prompt to JSONL file in GCS, write metadat to similar file
                json.dump(prompt, prompt_f)
                prompt_f.write("\n")
                json.dump(metadata, metadata_f)
                metadata_f.write("\n")
                if use_pbar:
                    pbar.update(1)
    if use_pbar:
        pbar.close()

In [26]:
# prepare_batch_summarization_files(
#     loader=get_emails_from_frame(),
#     use_pbar=True)

  0%|                                                 | 0/10000 [00:00<?, ?it/s]

### Invoke batch prediction

In [35]:
#| export
SUMMARIZATION_RESULT_PREFIX = "summarization"


def summarize_prompts(
        file_prefix: str = WRITE_PREFIX,
        file_name: str = SUMMARIZATION_PROMPT_FILE_NAME,
        bucket_name: str = PROJECT_BUCKET,
        params: Dict[str, Any] = DEFAULT_PREDICT_PARAMS
        ) -> BatchPredictionJob:
    dataset = f"gs://{bucket_name}/{file_prefix}/{file_name}"
    destination_url_prefix = f"gs://{bucket_name}/{file_prefix}/{SUMMARIZATION_RESULT_PREFIX}"
    model = get_model()
    return model.batch_predict(
        dataset=dataset,
        destination_uri_prefix=destination_url_prefix,
        # Optional:
        model_parameters=params)

In [None]:
# batch_job = summarize_prompts()

In [39]:
#| export
def load_batch_prediction_results():
    pass

## Export

In [65]:
#| hide
import nbdev; nbdev.nbdev_export()