# process

> Process our emails, remove boilerplate, split email chains

In [1]:
#| default_exp process

In [27]:
#| export
from typing import List, Dict
from itertools import chain, islice

from classifier.schema import batch_predict, predict
from classifier.load import get_training_instances, TrainingInstance

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from tqdm.auto import tqdm

In [28]:
import json
from pathlib import Path

## Filter text chain using map-reduce

In [4]:
# Take the first 20 training instances as a small working sample and
# confirm how many were actually loaded.
sample_training_instances = list(islice(get_training_instances(), 20))
len(sample_training_instances)

20

In [5]:
# Combined character length (body + subject) of each sampled email.
[len(d.email_body + d.email_subject) for d in sample_training_instances]

[515,
 9884,
 611,
 616,
 1006,
 549,
 640,
 185,
 1760,
 737,
 826,
 2256,
 818,
 1709,
 1030,
 925,
 515,
 326,
 1012,
 7330]

Define summarization prompt

In [6]:
#| export
# Instruction header placed in front of the email text in every
# summarization prompt.
SUMMARIZE_PROMPT_PREFIX = (
    "You are a customer service representative.\n"
    "Summarize the following email, try to preserve as much information as is necessary to diagnose and solve the customers issue detailed in the email.\n"
    "Think through your summary step-by-step.\n"
    "Only use information present in the email.\n"
    "EMAIL:\n"
)

# Full template text: header, the `{context}` slot for the email, then the
# cue after which the model writes its answer.
SUMMARIZE_PROMPT_STR = f"{SUMMARIZE_PROMPT_PREFIX}{{context}}\nSummary:"

SUMMARIZE_PROMPT = PromptTemplate.from_template(SUMMARIZE_PROMPT_STR)

BISON_MAXIMUM_INPUT_TOKENS = 8192
# Budget left for the email once the instruction header is accounted for.
# NOTE(review): this subtracts a *character* count from a *token* limit, and
# the result is later used as a character limit when splitting emails. It also
# ignores the "\nSummary:" suffix of the template. Confirm this approximation
# is intended before relying on the name.
CONTEXT_TOKEN_LIMIT = BISON_MAXIMUM_INPUT_TOKENS - len(SUMMARIZE_PROMPT_PREFIX)

In [7]:
# Show the computed context budget.
CONTEXT_TOKEN_LIMIT

7910

In [55]:
#| export
# Marker lines that introduce the subject and the body inside a document.
EMAIL_SUBJECT_PREFIX = "--EMAIL SUBJECT--"
EMAIL_BODY_PREFIX = "--EMAIL BODY--"

# Characters of fixed overhead reserved per document: both markers plus four
# newline characters.
# NOTE(review): make_document_from_email joins four parts, which inserts three
# newlines, so this reserves one character more than is actually used.
PREFIX_LEN = len(EMAIL_SUBJECT_PREFIX) + len(EMAIL_BODY_PREFIX) + 4


def make_document_from_email(
        body: str,
        subject: str,
        metadata: Dict[str, str]
        ) -> Document:
    """Wrap an email's subject and body in a single `Document`.

    The page content is four newline-separated parts, in order: the subject
    marker, the subject, the body marker, and the body.

    Args:
        body: Email body text (or one chunk of it).
        subject: Email subject line.
        metadata: Stored on the returned document as-is (not copied).

    Returns:
        A `Document` carrying the assembled text and the given metadata.
    """
    sections = (EMAIL_SUBJECT_PREFIX, subject, EMAIL_BODY_PREFIX, body)
    content = "\n".join(sections)
    return Document(page_content=content, metadata=metadata)


def split_training_instance_for_summary(
    training_instance: TrainingInstance,
    character_limit: int = CONTEXT_TOKEN_LIMIT
    ) -> List[Document]:
    """Turn one training instance into one or more summarization documents.

    If subject + body + marker overhead fits within `character_limit`, a
    single document is produced. Otherwise the body is split with a
    `RecursiveCharacterTextSplitter` and one document is produced per chunk,
    each repeating the full subject.

    Every document's metadata is a copy of the instance metadata extended
    with `idx`, `label` and `idx_chunk` (the chunk's position, starting at 0).
    The instance's own metadata is left untouched.

    Args:
        training_instance: Source email; its `email_subject`, `email_body`,
            `metadata`, `idx` and `label` attributes are read.
        character_limit: Maximum number of characters allowed per document.

    Returns:
        The documents for this instance, in chunk order.
    """
    subject = training_instance.email_subject
    body = training_instance.email_body
    if (len(subject) + len(body) + PREFIX_LEN) > character_limit:
        # Room left for body text once the subject and markers are reserved.
        # NOTE(review): this can be <= 0 when the subject alone exceeds the
        # limit; how the splitter reacts to that is not visible here.
        body_limit = character_limit - len(subject) - PREFIX_LEN
        body_splitter = RecursiveCharacterTextSplitter(
            chunk_size=body_limit)
        body_texts = body_splitter.split_text(body)
    else:
        body_texts = [body]
    # Copy before tagging: writing 'idx' / 'label' straight into
    # training_instance.metadata would silently modify the caller's object.
    # The copy is whatever `.copy()` gives for the metadata container, so
    # nested values (if any) are presumably still shared.
    metadata = training_instance.metadata.copy()
    metadata['idx'] = training_instance.idx
    metadata['label'] = training_instance.label
    # Gather split instances as documents
    split_instances = []
    for i, body_text in enumerate(body_texts):
        # Each chunk gets its own metadata so 'idx_chunk' is not shared.
        i_metadata = metadata.copy()
        i_metadata['idx_chunk'] = i
        i_document = make_document_from_email(
            body_text,
            subject=subject,
            metadata=i_metadata
        )
        split_instances.append(i_document)
    return split_instances


def split_training_instances(instances: List[TrainingInstance]) -> List[Document]:
    """Split every instance and return all resulting documents as one flat list.

    Documents keep the order of `instances`, and within an instance the order
    returned by `split_training_instance_for_summary` (called with its default
    character limit).
    """
    return [
        document
        for instance in instances
        for document in split_training_instance_for_summary(instance)
    ]

In [56]:
# Split the sample and show the character length of each resulting document.
sample_split_instances = split_training_instances(sample_training_instances)
[len(d.page_content) for d in sample_split_instances]

[549,
 7907,
 2241,
 645,
 650,
 1040,
 583,
 674,
 219,
 1794,
 771,
 860,
 2290,
 852,
 1743,
 1064,
 959,
 549,
 360,
 1046,
 7364]

In [57]:
# Inspect the first split document (page content and metadata).
sample_split_instances[0]

Document(page_content='--EMAIL SUBJECT--\nPO# 7004014842 || Walgreens Store 16422 || Ohio State University\n--EMAIL BODY--\nExternal Email â€“ Please use caution before opening attachments or clicking links  Cardinal Ordering Team,  Please place the drop ship order(s) listed below for:  Client Name Ohio State University PO ID 7004014842 Account # 2150126632 Store # 16422 NDC 70127010010 Drug Name EPIDIOLEX 100MG/ML SOL 100ML Order Quantity 5 Prescriber Name LUCRETIA LONG, PHILIP CLAYTON JONAS Prescriber NPI or DEA ML0822634, FJ1422132  Thanks & Regards, Bhavesh Lalwani', metadata={'BU': 'SPD', 'case_number': 3469839, 'ACCOUNT_BUSINESS_UNIT__C': nan, 'received_at': '2023-09-11T13:22:32', 'sfdc_subcategory': 'Order Entry', 'predicted_category': 'Order Processing', 'predicted_subcategory': nan, 'record_type': 2, 'probability': 0.8768061, 'Accuracy_upd': 'Correct', 'Bin': 8, 'idx': 0, 'label': 'Order Processing', 'idx_chunk': 0})

In [58]:
# Doc 1 was split into 2 pieces: the documents at positions 1 and 2 carry
# the same source 'idx' in their metadata.
sample_split_instances[1].metadata['idx'] == \
    sample_split_instances[2].metadata['idx']

True

Make a summarization request

In [59]:
# Render the summarization prompt for the first split document and display it.
sample_prompt = SUMMARIZE_PROMPT.format(
    context=sample_split_instances[0].page_content)
sample_prompt

'You are a customer service representative.\nSummarize the following email, try to preserve as much information as is necessary to diagnose and solve the customers issue detailed in the email.\nThink through your summary step-by-step.\nOnly use information present in the email.\nEMAIL:\n--EMAIL SUBJECT--\nPO# 7004014842 || Walgreens Store 16422 || Ohio State University\n--EMAIL BODY--\nExternal Email â€“ Please use caution before opening attachments or clicking links  Cardinal Ordering Team,  Please place the drop ship order(s) listed below for:  Client Name Ohio State University PO ID 7004014842 Account # 2150126632 Store # 16422 NDC 70127010010 Drug Name EPIDIOLEX 100MG/ML SOL 100ML Order Quantity 5 Prescriber Name LUCRETIA LONG, PHILIP CLAYTON JONAS Prescriber NPI or DEA ML0822634, FJ1422132  Thanks & Regards, Bhavesh Lalwani\nSummary:'

In [60]:
#| export
def get_email_document_summary(document: Document) -> str:
    """Return the model's summary text for one email document.

    The document's page content is substituted into `SUMMARIZE_PROMPT`, the
    rendered prompt is passed to `predict`, and the `.text` of the response
    is returned.
    """
    rendered_prompt = SUMMARIZE_PROMPT.format(context=document.page_content)
    # NOTE(review): the sample summary shown in the notebook ends mid-value,
    # which suggests `predict` caps the response length -- confirm.
    return predict(rendered_prompt).text

In [61]:
# Summarize the first split document and display the returned text.
example_summarization = get_email_document_summary(sample_split_instances[0])
example_summarization

' The email is requesting a drop ship order for Ohio State University.\nThe PO number is 7004014842, the account number is 2150126632, and the store number is 16422.\nThe drug name is EPIDIOLEX 100MG/ML SOL 100ML, the order quantity is 5, and the prescriber names are LUCRETIA LONG and PHILIP CLAYTON JONAS.\nThe prescriber NPIs or DEAs are ML0822634 and FJ142'

In [62]:
# Summarize every sample document, grouping chunk summaries by source email:
# summaries[idx] = {'label': ..., 'summaries': {idx_chunk: summary_text}}
summaries = {}

for document in tqdm(sample_split_instances, ncols=80, leave=False):
    doc_meta = document.metadata
    # Summarize first, so a failed request leaves no empty entry behind.
    chunk_summary = get_email_document_summary(document)
    # The first chunk seen for an email creates its entry; later chunks reuse it.
    email_entry = summaries.setdefault(
        doc_meta.get('idx'),
        {'label': doc_meta.get('label'), 'summaries': {}},
    )
    email_entry['summaries'][doc_meta.get('idx_chunk')] = chunk_summary

  0%|                                                    | 0/21 [00:00<?, ?it/s]

In [63]:
# Summaries collected for the source email with idx 0.
summaries[0]

{'label': 'Order Processing',
 'summaries': {0: ' The email is requesting a drop ship order for Ohio State University.\nThe PO number is 7004014842, the account number is 2150126632, and the store number is 16422.\nThe drug name is EPIDIOLEX 100MG/ML SOL 100ML, the order quantity is 5, and the prescriber names are LUCRETIA LONG and PHILIP CLAYTON JONAS.\nThe prescriber NPIs or DEAs are ML0822634 and FJ142'}}

In [64]:
# Persist the collected summaries as indented JSON in the data directory,
# which must already exist.
# Note: json.dump writes the integer 'idx' / 'idx_chunk' dictionary keys as
# strings, so they come back as strings when the file is reloaded.
data_dir = Path("../data")
assert data_dir.exists()
summaries_path = data_dir / 'summaries.json'
with summaries_path.open('w+') as f:
    json.dump(summaries, f, indent=4)

## Prepare batch prediction

>  TODO

In [65]:
#| hide
# Run nbdev's export step (kept as the last cell of the notebook).
import nbdev; nbdev.nbdev_export()