# Split processing

> Let's process documents differently depending on whether we're predicting a label or embedding them into a vector store.

In [1]:
#| default_exp experiments.split_processing

In [81]:
#| export
from typing import Dict, List
from pathlib import Path
import pandas as pd
from tqdm import tqdm

from classifier.schema import get_embedder, get_model, quota_handler, WRITE_PREFIX
from classifier.load import Email, get_batches, get_idx, get_emails_from_frame, \
    get_raw_emails_tejas_case_numbers, get_possible_labels

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableSequence
from langchain.llms import VertexAI

## Process

Load training emails

In [13]:
#| export
TEJAS_PREFIX = f"{WRITE_PREFIX}/tejas"

In [9]:
all_emails = get_raw_emails_tejas_case_numbers()
all_emails.shape[0]

3200

In [12]:
train_idx, test_idx = get_idx(prefix=TEJAS_PREFIX)

### Prepare Training Documents

> For inclusion in a vector store for few-shot prompting.

In [18]:
training_emails = list(get_emails_from_frame(
    all_emails,
    'train',
    index_prefix=TEJAS_PREFIX
))
len(training_emails)

2400

In [22]:
training_emails[0]

Email(idx=31716, label='Order Processing', email_subject='Equashield latest - FW: EQ II Catalog 2023 - Cardinal Health.xlsx', email_body='External Email â€“ Please use caution before opening attachments or clicking links  Let us know if you need anything else.    Regards,  Thomas Everitt Customer Service Representative Office    +1 516 684 8200 / Ext: 220 Mobile  +1 516 398 97 25 Fax          +1 516 684 8202 www.equashield.com<http://www.equashield.com/> [cid:image001.png@01DA0CD9.861376B0] [cid:image002.png@01DA0CD9.861376B0]  From: Pavlina Georgieva <pavlina@equashield.com> Sent: Wednesday, November 1, 2023 3:39 PM To: Thomas Everitt <Thomas.e@equashield.com> Subject: EQ II Catalog 2023 - Cardinal Health.xlsx      Regards,  Pavlina Georgieva Logistics Coordinator Office    +1 516 684 8200 / Ext: 202 Fax          +1 516 684 8202 www.equashield.com<http://www.equashield.com/> [cid:image001.png@01DA0CD9.861376B0] [cid:image002.png@01DA0CD9.861376B0]   ________________________________ Co

Load descriptions

In [51]:
# Locate the data directory relative to this notebook and fail fast if it is missing.
data_dir = Path("..") / ".." / "data"
assert data_dir.exists()

# Spreadsheet holding one column per label with its sub-categories listed below it.
descriptions_path = data_dir / 'labels.xlsx'
assert descriptions_path.exists()

# Trim stray whitespace from every text cell; non-text cells (e.g. NaN) pass through untouched.
descriptions = (
    pd.read_excel(descriptions_path)
    .map(lambda cell: cell.strip() if isinstance(cell, str) else cell)
)
descriptions.head()

Unnamed: 0,CATEGORY,Account/Inquiry,Order Processing,Delivery,Order Discrepancy,Returns,Billing / Invoice,Credits,Pricing,Product Quality,Product Inquiry,Claims,General Inquiry,IT Customer Technology Issue,Program / Promotions
0,SUBCATEGORY,New Account Set Up,Order Entry,Refused Delivery,Keying Error,Customer order error,Freight charge error,Billed Wrong Account,Product Pricing,Adverse Event,General Product inquiry,,Sales,,
1,,Licensing,Will Call,Deliver/ Carrier issue,Wrong Item Shipped,Damage,,Credit Inquiry,Pricing Issue,Defective Item,Product Availability,,,,
2,,Account updates,Emergency Order,Missing/ mis-routed tote,Miss Label,Expired product,,credit not issued / error,,Safety Issue,Restricted item,,,,
3,,,Adjust Order,,Overage,Flu Return,,Pass Thru Credit,,Suspicious Product,,,,,
4,,,Shipping Service Change,,Partial Carton,Overstock,,,,,,,,,


In [52]:
labels = get_possible_labels()

In [53]:
labels

['Order Processing',
 'Product Inquiry',
 'Account/Inquiry',
 'General Inquiry',
 'Returns',
 'Billing / Invoice',
 'Delivery',
 'Credits',
 'Order Discrepancy',
 'Pricing',
 'Program / Promotions']

In [54]:
#| export
def make_description_from_row(row: pd.Series) -> str:
    """Render the non-null entries of a row as a bulleted label description.

    Args:
        row: One row of sub-category values for a single label. Missing
            entries (NaN/None) are ignored.

    Returns:
        ``""`` when the row has no non-null entries; otherwise the header
        ``"Issues including:"`` followed by one ``"- <value>"`` line per entry,
        with surrounding whitespace stripped from each value.
    """
    present = row.dropna()
    if present.empty:
        return ""
    # Coerce through str() so a non-text cell (e.g. a number) does not raise on .strip().
    bullets = [f"- {str(value).strip()}" for value in present.values]
    return "Issues including:\n" + "\n".join(bullets)

In [55]:
descriptions_dict = descriptions.T.iloc[1:, :].apply(make_description_from_row, axis=1).to_dict()
descriptions_dict

{'Account/Inquiry': 'Issues including:\n- New Account Set Up\n- Licensing\n- Account updates',
 'Order Processing': 'Issues including:\n- Order Entry\n- Will Call\n- Emergency Order\n- Adjust Order\n- Shipping Service Change\n- Cancel Order\n- Allocation override',
 'Delivery': 'Issues including:\n- Refused Delivery\n- Deliver/ Carrier issue\n- Missing/ mis-routed tote',
 'Order Discrepancy': 'Issues including:\n- Keying Error\n- Wrong Item Shipped\n- Miss Label\n- Overage\n- Partial Carton\n- Product Excursions\n- Shipped Short Dated product\n- Shortage',
 'Returns': 'Issues including:\n- Customer order error\n- Damage\n- Expired product\n- Flu Return\n- Overstock\n- Product Description Not Clear\n- Recall\n- Shipping Label/Rtn\n- Short Dated product\n- Unauthorized Return',
 'Billing / Invoice': 'Issues including:\n- Freight charge error',
 'Credits': 'Issues including:\n- Billed Wrong Account\n- Credit Inquiry\n- credit not issued / error\n- Pass Thru Credit',
 'Pricing': 'Issues in

In [56]:
# Every label the classifier can emit should have an entry in the descriptions mapping.
all(label in descriptions_dict for label in labels)

True

Define prompt

In [66]:
#| export
# Prompt for the training split: the model is shown an email together with its
# known label and that label's description, and is asked to summarize the email
# and explain the labeling.
# Placeholders: {email}, {label}, {label_description}.
# The backslash ending the first line is a line continuation inside the string
# literal, so the first two source lines render as a single line of prompt text.
TRAIN_PROMPT_TEMPLATE = """Here is an email chain sent to customer service \
and how customer service labeled it for handling, including a description of the label. 
Summarize the email and explain why the email was labeled the way it was.
Do not include any boilerplate content in your summary.
Only use information present in the email.
Think through your explanation step-by-step.
-- EMAIL --
{email}
-- LABEL --
{label}
-- LABEL DESCRIPTION --
{label_description}
-- SUMMARY AND EXPLANATION --
"""

# PromptTemplate built from the raw string above; its input variables are the placeholders.
TRAIN_PROMPT = PromptTemplate.from_template(TRAIN_PROMPT_TEMPLATE)

In [67]:
llm = VertexAI()

In [101]:
train_processing_chain = TRAIN_PROMPT | llm

In [69]:
#| export
def format_email_for_train_summary(
        email: Email,
        descriptions: Dict[str, str]
        ) -> Dict[str, str]:
    """Build the variable mapping that fills ``TRAIN_PROMPT_TEMPLATE``.

    Args:
        email: Labeled email; its ``email_subject``, ``email_body`` and
            ``label`` attributes are read.
        descriptions: Mapping from label name to that label's description.

    Returns:
        A dict with keys ``'email'`` (subject and body combined into one
        string), ``'label'`` and ``'label_description'``. When the label has
        no entry in ``descriptions`` the description is ``""``.
    """
    return {
        'email': f'SUBJECT: {email.email_subject}\nBODY: {email.email_body}',
        'label': email.label,
        # Fall back to "" so an unknown label never renders as the literal
        # text "None" in the prompt and the return type stays Dict[str, str].
        'label_description': descriptions.get(email.label, ""),
    }

In [74]:
example_train_prompt = TRAIN_PROMPT.format(**format_email_for_train_summary(training_emails[0], descriptions_dict))
print(example_train_prompt)

Here is an email chain sent to customer service and how customer service labeled it for handling, including a description of the label. 
Summarize the email and explain why the email was labeled the way it was.
Do not include any boilerplate content in your summary.
Only use information present in the email.
Think through your explanation step-by-step.
-- EMAIL --
SUBJECT: Equashield latest - FW: EQ II Catalog 2023 - Cardinal Health.xlsx
BODY: External Email â€“ Please use caution before opening attachments or clicking links  Let us know if you need anything else.    Regards,  Thomas Everitt Customer Service Representative Office    +1 516 684 8200 / Ext: 220 Mobile  +1 516 398 97 25 Fax          +1 516 684 8202 www.equashield.com<http://www.equashield.com/> [cid:image001.png@01DA0CD9.861376B0] [cid:image002.png@01DA0CD9.861376B0]  From: Pavlina Georgieva <pavlina@equashield.com> Sent: Wednesday, November 1, 2023 3:39 PM To: Thomas Everitt <Thomas.e@equashield.com> Subject: EQ II Catal

In [75]:
llm(example_train_prompt)

' The email chain is about sharing a catalog for the year 2023.\n\nThe email was labeled as "Order Processing" because it contains information about a catalog, which is related to order processing. The catalog is likely used by customers to place orders, and the email is providing an updated version of the catalog to the customer service representative.'

### Prepare test documents

In [76]:
test_emails = list(get_emails_from_frame(
    all_emails,
    'test',
    index_prefix=TEJAS_PREFIX
))
len(test_emails)

600

In [72]:
#| export
# Prompt for the test split: the model sees only the email (no label) and is
# asked for a summary with action points, with identifying details and
# boilerplate left out.
# Placeholders: {subject}, {body}.
TEST_PROMPT_TEMPLATE = """Here is an email sent to our customer service department.
Summarize it, identifying points of action for customer service if there are any.
Do not include any names, company names, addresses or other identifying information.
Remove boilerplate. 
-- EMAIL SUBJECT --
{subject}
-- EMAIL BODY --
{body}
-- SUMMARY --
"""

# PromptTemplate built from the raw string above; its input variables are the placeholders.
TEST_PROMPT = PromptTemplate.from_template(TEST_PROMPT_TEMPLATE)

In [73]:
#| export
def format_email_for_test_summary(
        email: Email,
        ) -> Dict[str, str]:
    """Map an email onto the variables expected by ``TEST_PROMPT_TEMPLATE``.

    Args:
        email: Email whose ``email_subject`` and ``email_body`` are read.

    Returns:
        A dict with the keys ``'subject'`` and ``'body'``.
    """
    prompt_vars = dict(
        subject=email.email_subject,
        body=email.email_body,
    )
    return prompt_vars

In [78]:
example_test_prompt = TEST_PROMPT.format(**format_email_for_test_summary(test_emails[0]))
print(example_test_prompt)

Here is an email sent to our customer service department.
Summarize it, identifying points of action for customer service if there are any.
Do not include any names, company names, addresses or other identifying information.
Remove boilerplate. 
-- EMAIL SUBJECT --
Need signature AC account
-- EMAIL BODY --
Good afternoon,  We have received an order from customer 2057194105. They sent unsigned MRA 3901356789. Can you please reach out to the customer and let them know they have 48 hours to send a signed MRA or we will send back the case for no credit.  Thanks,   [cid:image001.png@01D9F52F.5AD273F0]  Tom Coppedge Returns Lead | Warehouse Operations 2840 Elm Pont Industrial Drive St. Charles, MO. 63301    _________________________________________________  This message is for the designated recipient only and may contain privileged, proprietary or otherwise private information. If you have received it in error, please notify the sender immediately and delete the original. Any other use of 

In [79]:
llm(example_test_prompt)

' Customer 2057194105 sent an unsigned MRA 3901356789 with their order.\n\n**Action:** Customer service should reach out to the customer and inform them that they have 48 hours to send a signed MRA or the case will be returned for no credit.'

### Process everything

In [None]:
# NOTE(review): scratch cell with no execution count. `batch` is called without
# any inputs here, which presumably raises under Restart & Run All. Confirm,
# then either remove this cell or pass a list of prompts.
llm.batch()

In [107]:
#| export
@quota_handler
def batch_predict(prompts: List[Dict[str, str]], chain: RunnableSequence) -> List[str]:
    """Run ``chain`` over a batch of prompt-variable dicts in one ``batch`` call.

    The function is wrapped by ``quota_handler`` (imported from
    ``classifier.schema``); presumably that handles API quota errors, but
    confirm against its definition.

    Args:
        prompts: One dict of template variables per input.
        chain: Runnable sequence to invoke on the whole batch.

    Returns:
        Whatever ``chain.batch`` returns for ``prompts``.
    """
    responses = chain.batch(prompts)
    return responses

In [110]:
# Number of emails sent to the model per `batch_predict` call.
TRAIN_BATCH_SIZE = 5

train_summaries = []

# The context manager closes the progress bar however the loop ends.
with tqdm(total=len(training_emails), ncols=80, leave=True) as pbar:
    try:
        for batch in get_batches(training_emails, TRAIN_BATCH_SIZE):
            batch_prompts = [format_email_for_train_summary(e, descriptions_dict) for e in batch]
            train_summaries.extend(batch_predict(batch_prompts, train_processing_chain))
            pbar.update(len(batch))
    except KeyboardInterrupt:
        # Manual stop of a long run: keep the summaries gathered so far.
        tqdm.write(f"Interrupted after {len(train_summaries)} of {len(training_emails)} summaries.")
    except Exception as exc:
        # Still best-effort (partial results are kept), but report why the run
        # stopped instead of silently hiding the failure.
        tqdm.write(f"Stopped after {len(train_summaries)} of {len(training_emails)} summaries: {exc!r}")

# True only when every training email received a summary.
len(train_summaries) == len(training_emails)

 62%|████████████████████████▏              | 1485/2400 [45:11<27:48,  1.82s/it]

## Export

In [None]:
import nbdev; nbdev.nbdev_export()