# process

> Process our emails, remove boilerplate, split email chains

In [1]:
#| default_exp process

In [1]:
#| export
import json
from typing import List, Dict, Any, Tuple, Iterable
from itertools import chain, islice

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.llms import VertexAI
from tqdm.auto import tqdm
from google.cloud.aiplatform import BatchPredictionJob

from classifier.schema import batch_predict, predict, get_storage_client, \
    get_model, DEFAULT_PREDICT_PARAMS
from classifier.load import get_training_instances, TrainingInstance, \
    PROJECT_BUCKET, WRITE_PREFIX, get_idx, get_document_batches

In [2]:
import json
from pathlib import Path

import pandas as pd

## Summarize emails

In [5]:
# Take the first batch yielded by the loader as a small working sample.
sample_training_instances = next(get_document_batches(get_training_instances()))
len(sample_training_instances)

32

In [6]:
# Summary statistics of the combined body + subject length (in characters) over the sample.
pd.Series([len(d.email_body + d.email_subject) for d in sample_training_instances]).describe()

count      32.000000
mean     1350.812500
std      1262.578683
min       125.000000
25%       766.000000
50%      1007.000000
75%      1516.000000
max      6949.000000
dtype: float64

Define summarization prompt

In [7]:
#| export
# Instruction header that precedes every email; the email text is appended after "EMAIL:".
SUMMARIZE_PROMPT_PREFIX = """You are a customer service representative.
Summarize the following email, try to preserve as much information as is necessary to diagnose and solve the customers issue detailed in the email.
Think through your summary step-by-step.
Only use information present in the email.
EMAIL:
"""

# Full template text: header, then the email ({context}), then a "Summary:" cue.
SUMMARIZE_PROMPT_STR = SUMMARIZE_PROMPT_PREFIX + "{context}\nSummary:"

SUMMARIZE_PROMPT = PromptTemplate.from_template(SUMMARIZE_PROMPT_STR)

# Budget left for the email once the header is accounted for.
# NOTE(review): this subtracts a *character* count (len of the header) from a
# *token* limit and does not count the "\nSummary:" suffix — confirm which unit
# consumers of CONTEXT_TOKEN_LIMIT expect.
BISON_MAXIMUM_INPUT_TOKENS = 8192  # presumably the model's input limit, in tokens — TODO confirm
CONTEXT_TOKEN_LIMIT = BISON_MAXIMUM_INPUT_TOKENS - len(SUMMARIZE_PROMPT_PREFIX)

In [8]:
CONTEXT_TOKEN_LIMIT

7910

In [9]:
#| export
EMAIL_SUBJECT_PREFIX = "--EMAIL SUBJECT--"
EMAIL_BODY_PREFIX = "--EMAIL BODY--"
# Character overhead added around an email's subject and body by the two markers.
# NOTE(review): four newlines are counted here, whereas make_document_from_instance
# joins its four parts with only three — confirm whether the extra one is intended.
PREFIX_LEN = len(EMAIL_SUBJECT_PREFIX) + len(EMAIL_BODY_PREFIX) + 4


def make_document_from_instance(
        instance: TrainingInstance
        ) -> Document:
    """Wrap a training instance in a ``Document`` ready for prompting.

    The page content is four newline-separated parts: the subject marker, the
    email subject, the body marker and the email body. The metadata is a
    shallow copy of ``instance.metadata`` with the instance's ``idx`` added
    under the ``'idx'`` key, so the instance's own dict is not modified.
    """
    doc_metadata = instance.metadata.copy()
    doc_metadata['idx'] = instance.idx
    parts = (
        EMAIL_SUBJECT_PREFIX,
        instance.email_subject,
        EMAIL_BODY_PREFIX,
        instance.email_body,
    )
    return Document(page_content="\n".join(parts), metadata=doc_metadata)

### Handle emails larger than the prompt limit

In [10]:
##| export
# def split_training_instance_for_summary(
#     training_instance: TrainingInstance,
#     character_limit: int = CONTEXT_TOKEN_LIMIT
#     ) -> List[Document]:
#     subject_len = len(training_instance.email_subject)
#     body_len = len(training_instance.email_body)
#     if (subject_len + body_len + PREFIX_LEN) > character_limit:
#         body_limit = character_limit - subject_len - PREFIX_LEN
#         body_splitter = RecursiveCharacterTextSplitter(
#             chunk_size=body_limit)
#         body_texts = body_splitter.split_text(training_instance.email_body)
#     else:
#         body_texts = [training_instance.email_body]
#     metadata = training_instance.metadata
#     metadata['idx'] = training_instance.idx
#     metadata['label'] = training_instance.label
#     # Gather split instances as documents
#     split_instances = []
#     for i, body in enumerate(body_texts):
#         i_metadata = metadata.copy()
#         i_metadata['idx_chunk'] = i
#         i_document = make_document_from_email(
#             body,
#             subject=training_instance.email_subject,
#             metadata=i_metadata
#         )
#         split_instances.append(i_document)
#     return split_instances


# def split_training_instances(instances: List[TrainingInstance]) -> List[Document]:
#     return list(chain.from_iterable(map(split_training_instance_for_summary, instances)))

In [11]:
# sample_split_instances = split_training_instances(sample_training_instances)
# [len(d.page_content) for d in sample_split_instances]

In [12]:
# sample_split_instances[0]

In [13]:
# # Doc 1 was split into 2 pieces
# sample_split_instances[1].metadata['idx'] == \
#     sample_split_instances[2].metadata['idx']

### Format Prompts

In [14]:
#| export
def prepare_summarization_prompt(document: Document) -> Tuple[Dict[str, str], Dict[str, Any]]:
    """Render the summarization prompt for one document.

    Returns a pair: a ``{'prompt': <rendered text>}`` dict built by filling
    ``SUMMARIZE_PROMPT`` with the document's page content, and the document's
    metadata dict (the same object, not a copy).
    """
    rendered = SUMMARIZE_PROMPT.format(context=document.page_content)
    return {'prompt': rendered}, document.metadata

### Summarize individually

In [15]:
# Combined size of both index sets; used further down as the progress-bar total.
train_idx, test_idx = get_idx()
total_instances = train_idx.shape[0] + test_idx.shape[0]
total_instances

1000

In [16]:
model = VertexAI()

In [17]:
# Pipe the prompt template into the model.
# NOTE(review): this rebinds `chain`, shadowing the `itertools.chain` imported in
# the first code cell; anything run afterwards that expects itertools.chain will break.
chain = SUMMARIZE_PROMPT | model

In [18]:
example_batch = [make_document_from_instance(i) for i in sample_training_instances]

In [20]:
chain.batch([{'context': d.page_content} for d in example_batch[:5]])

[' The customer received an invoice from Cardinal Health for invoice number 1 of 1 for account number 2057199110, but they have questions about the invoice.',
 ' The customer received an email from the State of Florida Next Gen regarding an invoice with the following details:\n\n- Invoice Date: 28th November 2023\n- Invoice Number: 7340827424\n- Invoice Amount: $11,523.00 USD\n\nThe email informs the customer that the State of Florida Next Gen has transitioned to using the SAP Business Network for invoice status visibility and tracking. The customer is required to activate their account on the SAP Business Network to view and track invoices.\n\nThe email provides a link for the customer to register for a free',
 ' The customer would like to place a new order for their patient, FS.\n- Account number: 2057189687\n- PO number: PIC23100401\n- Patient: FS\n- Medication: Soliris - 300 mg/30ml\n- CIN: 400-3091\n- Quantity: 4 Vials\n- NPI: 1619288172\n- Requested delivery date: 10/09/2023\n\nT

In [21]:
summaries = []

# Summarize every training instance, five per `get_document_batches` call
# (second argument, presumably the batch size). The `with` block guarantees the
# progress bar is closed even when a request raises part-way through the run.
with tqdm(total=total_instances, ncols=80, leave=False) as pbar:
    for instance_batch in get_document_batches(get_training_instances(), 5):
        documents = [make_document_from_instance(i) for i in instance_batch]
        summary = chain.batch([{'context': d.page_content} for d in documents])
        summaries.extend(summary)
        pbar.update(len(instance_batch))

  0%|                                                  | 0/1000 [00:00<?, ?it/s]

In [24]:
# Output directory, relative to the notebook's working directory; fail early if it is missing.
data_dir = Path("../data")
assert data_dir.exists()

In [29]:
# Write the summaries as a single-column CSV.
# NOTE(review): rows carry no instance identifier (index=False, only the "summary"
# column), so they can be matched back to instances only by position in loader order.
pd.Series(summaries, name="summary").to_csv(data_dir / 'summaries.csv', index=False)

### Prepare prompts for batch

In [12]:
# Convert the sample batch to documents and inspect the first one.
# NOTE(review): the saved output of this cell shows a real sender's name, phone
# number and email address — clear or redact outputs before sharing the notebook.
documents = list(map(make_document_from_instance, sample_training_instances))
documents[0]

Document(page_content='--EMAIL SUBJECT--\nPO 70026263\n--EMAIL BODY--\nExternal Email â€“ Please use caution before opening attachments or clicking links\n\nHello\n\nThe attached PO was sent a few days ago.  Please confirm it is in your system.\n\n\nThank you\nJoe\n\n\nJoe Liesse\nOperations Coordinator\nSupply Chain Management â€“ P2P\nPhone: 507-266-5551\nEmail: liesse.joseph@mayo.edu<mailto:liesse.joseph@mayo.edu>\n________________________\nMayo Clinic\n200 First Street SW\nRochester, MN 55905', metadata={'BU': 'PD', 'case_number': 3665915, 'ACCOUNT_BUSINESS_UNIT__C': nan, 'received_at': '2023-12-01T22:43:59', 'sfdc_subcategory': 'Inquiry', 'predicted_category': 'Order Processing', 'predicted_subcategory': 'Order Entry', 'record_type': 1, 'probability': 0.6764667, 'Accuracy_upd': 'Incorrect', 'Bin': 6, 'idx': 48942})

Prepare prompts

In [14]:
prompt_and_metadata = prepare_summarization_prompt(documents[0])
prompt_and_metadata

({'prompt': 'You are a customer service representative.\nSummarize the following email, try to preserve as much information as is necessary to diagnose and solve the customers issue detailed in the email.\nThink through your summary step-by-step.\nOnly use information present in the email.\nEMAIL:\n--EMAIL SUBJECT--\nPO 70026263\n--EMAIL BODY--\nExternal Email â€“ Please use caution before opening attachments or clicking links\n\nHello\n\nThe attached PO was sent a few days ago.  Please confirm it is in your system.\n\n\nThank you\nJoe\n\n\nJoe Liesse\nOperations Coordinator\nSupply Chain Management â€“ P2P\nPhone: 507-266-5551\nEmail: liesse.joseph@mayo.edu<mailto:liesse.joseph@mayo.edu>\n________________________\nMayo Clinic\n200 First Street SW\nRochester, MN 55905\nSummary:'},
 {'BU': 'PD',
  'case_number': 3665915,
  'ACCOUNT_BUSINESS_UNIT__C': nan,
  'received_at': '2023-12-01T22:43:59',
  'sfdc_subcategory': 'Inquiry',
  'predicted_category': 'Order Processing',
  'predicted_sub

Make a summarization request

In [15]:
sample_prompt = prompt_and_metadata[0]['prompt']

In [16]:
predict(sample_prompt)

 The customer, Joe Liesse from Mayo Clinic, is inquiring about a purchase order (PO) numbered 70026263 that was sent a few days ago. He wants confirmation that the PO has been received and is in the recipient's system.

In [28]:
#| export
SUMMARIZATION_PROMPT_FILE_NAME = "summarization_prompts.jsonl"
SUMMARIZATION_METADATA_FILE_NAME = "summarization_metadata.jsonl"


def prepare_batch_summarization_files(
        loader: Iterable[TrainingInstance],
        bucket_name: str = PROJECT_BUCKET,
        use_pbar: bool = False,
        pbar_size: int = 10000,
        prefix: str = WRITE_PREFIX) -> None:
    """Write summarization prompts and their metadata to GCS as paired JSONL files.

    For every instance from ``loader`` one JSON line goes to
    ``{prefix}/summarization_prompts.jsonl`` (the ``{'prompt': ...}`` dict) and
    one to ``{prefix}/summarization_metadata.jsonl`` (the metadata dict), so
    line *i* of both files describes the same instance.

    Args:
        loader: Instances to convert; consumed once, in order.
        bucket_name: Bucket that receives both files.
        use_pbar: Show a tqdm progress bar while writing.
        pbar_size: Total shown on the progress bar; it is not derived from
            ``loader``.
        prefix: Blob-name prefix shared by both files.
    """
    client = get_storage_client()
    bucket = client.bucket(bucket_name=bucket_name)
    prompt_blob = bucket.blob(
        blob_name=f"{prefix}/{SUMMARIZATION_PROMPT_FILE_NAME}")
    metadata_blob = bucket.blob(
        blob_name=f"{prefix}/{SUMMARIZATION_METADATA_FILE_NAME}")
    # One `with` owns the progress bar and both writers, so all three are closed
    # even if an instance fails mid-stream. `disable=` turns the bar into a
    # no-op when it is not wanted, which removes the per-iteration branching.
    with tqdm(total=pbar_size, ncols=80, leave=False, disable=not use_pbar) as pbar, \
            metadata_blob.open("w") as metadata_f, \
            prompt_blob.open("w") as prompt_f:
        for instance in loader:
            document = make_document_from_instance(instance)
            prompt, metadata = prepare_summarization_prompt(document)
            # Write the prompt to the JSONL file in GCS and the metadata to its
            # sibling file, one line each.
            # NOTE(review): json.dump serialises float NaN metadata values as
            # the bare token NaN, which Python reads back but strict JSON
            # parsers reject — confirm the downstream reader tolerates it.
            json.dump(prompt, prompt_f)
            prompt_f.write("\n")
            json.dump(metadata, metadata_f)
            metadata_f.write("\n")
            pbar.update(1)

In [26]:
prepare_batch_summarization_files(
    loader=get_training_instances(),
    use_pbar=True)

  0%|                                                 | 0/10000 [00:00<?, ?it/s]

## Invoke batch prediction

In [35]:
#| export
SUMMARIZATION_RESULT_PREFIX = "summarization"


def summarize_prompts(
        file_prefix: str = WRITE_PREFIX,
        file_name: str = SUMMARIZATION_PROMPT_FILE_NAME,
        bucket_name: str = PROJECT_BUCKET,
        params: Dict[str, Any] = DEFAULT_PREDICT_PARAMS
        ) -> BatchPredictionJob:
    """Start a batch prediction job over a prompt file stored in the bucket.

    The input is ``gs://{bucket_name}/{file_prefix}/{file_name}`` and results
    are written under ``gs://{bucket_name}/{file_prefix}/summarization``.
    ``params`` is handed to the model unchanged as ``model_parameters``; the
    default is the shared ``DEFAULT_PREDICT_PARAMS`` object, not a copy.

    Returns whatever ``get_model().batch_predict`` returns.
    """
    gcs_root = f"gs://{bucket_name}/{file_prefix}"
    job_kwargs = {
        'dataset': f"{gcs_root}/{file_name}",
        'destination_uri_prefix': f"{gcs_root}/{SUMMARIZATION_RESULT_PREFIX}",
        # Optional:
        'model_parameters': params,
    }
    return get_model().batch_predict(**job_kwargs)

In [None]:
# batch_job = summarize_prompts()

In [39]:
#| export
def load_batch_prediction_results():
    """Placeholder, not implemented yet.

    Judging by its name it is meant to load batch prediction output; at
    present it takes no arguments, does nothing and returns ``None``.
    """
    pass

## Export

In [28]:
#| hide
import nbdev; nbdev.nbdev_export()