# load

> Load documents from GCS

In [1]:
#| default_exp load

In [22]:
#| export
import math
from typing import Dict, Any, Iterable, List, Tuple
import pandas as pd
import json

from pydantic import BaseModel
from langchain.schema import Document
from langchain.document_loaders.base import BaseLoader
from sklearn.model_selection import train_test_split
from google.cloud import storage

from classifier.schema import PROJECT_ID, PROJECT_BUCKET, WRITE_PREFIX

Load the data from GCS

In [2]:
training_data = pd.read_excel(
    f"gs://{PROJECT_BUCKET}/Last50KCases_withSubjectAndBody.xlsx")
training_data.loc[:, 'email_subject'] = training_data.email_subject.fillna("N/A")

In [3]:
training_data[
    (training_data.email_subject.isna()) |
    (training_data.email_body.isna())]

Unnamed: 0,BU,case_number,ACCOUNT_BUSINESS_UNIT__C,received_at,sfdc_category,sfdc_subcategory,predicted_category,predicted_subcategory,record_type,probability,Accuracy_upd,Bin,email_subject,email_body


In [4]:
#| export
LABEL_COLUMN = "sfdc_category"

In [5]:
training_data[LABEL_COLUMN].value_counts()

sfdc_category
Order Processing        23011
Account/Inquiry         12128
General Inquiry          5645
Order Discrepancy        3924
Returns                  3806
Product Inquiry          3034
Billing / Invoice        2262
Delivery                 1880
Credits                  1066
Pricing                   241
Program / Promotions      240
Name: count, dtype: int64

In [6]:
training_data.iloc[0]

BU                                                                        SPD
case_number                                                           3469839
ACCOUNT_BUSINESS_UNIT__C                                                  NaN
received_at                                               2023-09-11T13:22:32
sfdc_category                                                Order Processing
sfdc_subcategory                                                  Order Entry
predicted_category                                           Order Processing
predicted_subcategory                                                     NaN
record_type                                                                 2
probability                                                          0.876806
Accuracy_upd                                                          Correct
Bin                                                                         8
email_subject               PO# 7004014842 || Walgreens Store 16

In [7]:
#| export
def get_possible_labels() -> List[str]:
    return pd.read_excel(
        f"gs://{PROJECT_BUCKET}/Last50KCases_withSubjectAndBody.xlsx"
        ).sfdc_category.unique().tolist()

In [186]:
#| export
class TrainingInstance(BaseModel):
    idx: int
    label: str
    email_subject: str
    email_body: str
    metadata: Dict[str, Any]

    def to_series(self) -> pd.Series:
        data = self.metadata.copy()
        data['idx'] = self.idx
        data['label'] = self.label
        data['email_subject'] = self.email_subject
        data['email_body'] = self.email_body
        return pd.Series(data)


def training_instance_from_row(idx: int, row: pd.Series):
    metadata = row.drop(
        [
            'sfdc_category', 
            'email_subject',
            'email_body'
        ]).to_dict()
    return TrainingInstance(
        idx=idx,
        label=row.sfdc_category,
        email_subject=str(row.email_subject),
        email_body=str(row.email_body),
        metadata=metadata
    )

In [187]:
example_training_instance = training_instance_from_row(0, training_data.iloc[0])

## Get a "training" and "test" sample, remove outliers

### Outlier removal
We will remove outliers by subject + body length

In [95]:
#| export
# Our prompt to summarize takes up some amount of prompt space. This is a rough limit
EMAIL_SIZE_LIMIT = 7800


def email_small_enough(subject: str, body: str, limit: int = EMAIL_SIZE_LIMIT) -> bool:
    if not isinstance(subject, str):
        subject = str(subject)
    if not isinstance(body, str):
        body = str(body)
    return (len(subject) + len(body)) < limit

In [96]:
size_mask = training_data.apply(
    lambda row: email_small_enough(
        row.email_subject,
        row.email_body
    ), axis=1)

In [97]:
training_data_included = training_data[size_mask]
training_data_included.shape[0]

56096

### Train and test set
We will sample by label. Limit to 5,000 emails total. 90% train, 10% test.

In [160]:
#| export
INCLUSION_COUNT = 1000


def get_train_test_idx(
        data: pd.DataFrame,
        inclusion_count: int = INCLUSION_COUNT, 
        train_proportion: int = 0.9,
        label_column: str = LABEL_COLUMN,
        random_state: int = 42):
    train_count = int(round(inclusion_count * train_proportion))
    test_count = inclusion_count - train_count
    train, test = train_test_split(
        data,
        test_size=test_count, 
        train_size=train_count,
        random_state=random_state,
        stratify=data[label_column])
    input_data = pd.concat([train, test], axis=0)
    return train_test_split(
        input_data,
        test_size=1-train_proportion,
        train_size=train_proportion,
        random_state=random_state,
        stratify=input_data[label_column]
    )

In [161]:
train, test = get_train_test_idx(training_data_included)

In [162]:
train.shape[0] + test.shape[0]

1000

In [163]:
train[LABEL_COLUMN].value_counts().sort_index(), test[LABEL_COLUMN].value_counts().sort_index()

(sfdc_category
 Account/Inquiry         193
 Billing / Invoice        35
 Credits                  14
 Delivery                 29
 General Inquiry          87
 Order Discrepancy        62
 Order Processing        364
 Pricing                   4
 Product Inquiry          48
 Program / Promotions      4
 Returns                  60
 Name: count, dtype: int64,
 sfdc_category
 Account/Inquiry      21
 Billing / Invoice     4
 Credits               2
 Delivery              3
 General Inquiry      10
 Order Discrepancy     7
 Order Processing     41
 Product Inquiry       5
 Returns               7
 Name: count, dtype: int64)

In [164]:
#| export
TRAIN_IDX_NAME = "train_idx.csv"
TEST_IDX_NAME = "test_idx.csv"


def write_idx(
        train_idx: pd.Index, 
        test_idx: pd.Index, 
        bucket_name: str = PROJECT_BUCKET,
        prefix: str = WRITE_PREFIX):
    
    train_idx.to_series().to_csv(f"gs://{bucket_name}/{prefix}/{TRAIN_IDX_NAME}", index=False)
    test_idx.to_series().to_csv(f"gs://{bucket_name}/{prefix}/{TEST_IDX_NAME}", index=False)

In [165]:
write_idx(
    train.index,
    test.index
)

In [166]:
#| export
def get_idx(
        bucket_name: str = PROJECT_BUCKET,
        prefix: str = WRITE_PREFIX) -> Tuple[pd.Series, pd.Series]:
    return pd.read_csv(f'gs://{bucket_name}/{prefix}/{TRAIN_IDX_NAME}').iloc[:, 0], \
        pd.read_csv(f'gs://{bucket_name}/{prefix}/{TEST_IDX_NAME}').iloc[:, 0]

In [167]:
train_idx, test_idx = get_idx()
train_idx.head()

0    20775
1    46774
2    15159
3     2756
4    26009
Name: 0, dtype: int64

In [168]:
full_idx = pd.concat(
    [train_idx, test_idx],
    axis=0,
    ignore_index=True
)
training_data_included.loc[full_idx, :].head(2)

Unnamed: 0,BU,case_number,ACCOUNT_BUSINESS_UNIT__C,received_at,sfdc_category,sfdc_subcategory,predicted_category,predicted_subcategory,record_type,probability,Accuracy_upd,Bin,email_subject,email_body
20775,PD,3553288,a1G4z00000H6C4aEAF,2023-10-13T12:37:20,Billing / Invoice,Billing Statements,Billing / Invoice,,1,0.474032,Correct,4,"Invoices 1 of 1 for 2057199110 , TEXAS INSTITU...","Dear Valued Customer, Your Cardinal Health in..."
46774,PD,3658829,,2023-11-29T20:25:47,Account/Inquiry,Account updates,Billing / Invoice,,1,0.566661,Incorrect,5,Invoice status from State of Florida Next Gen,External Email â€“ Please use caution before o...


In [169]:
#| export
def get_training_instances(
        bucket_name: str = PROJECT_BUCKET
) -> Iterable[TrainingInstance]:
    data = pd.read_excel(
        f"gs://{bucket_name}/Last50KCases_withSubjectAndBody.xlsx")
    # Load train and test idx
    train_idx, test_idx = get_idx(bucket_name=bucket_name)
    full_idx = pd.concat([train_idx, test_idx], axis=0, ignore_index=True)
    data = data.loc[full_idx, :]
    for idx, row in data.iterrows():
        yield training_instance_from_row(idx, row)

In [170]:
# max length
instance_loader = get_training_instances()

In [178]:
#| export
def get_document_batches(loader: Iterable[TrainingInstance], batch_size: int = 32) -> Iterable[List[TrainingInstance]]:
    "Get a batch of documents of size `batch_size` from a BaseLoader with `.lazy_load` implemented."
    batch = []
    for instance in loader:
        batch.append(instance)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    yield batch

In [179]:
next(instance_loader)

TrainingInstance(idx=15159, label='Order Processing', email_subject='New soliris order', email_body='External Email â€“ Please use caution before opening attachments or clicking links  Hello, Iâ€™d like to place a new order please  Account #2057189687 PO# PIC23100401 Patient: FS Soliris -  300 mg/30ml      CIN:  400-3091 Quantity: 4 Vials NPI:1619288172 TX:10/09/2023        Lena Oâ€™Brien, CPHT Oncology Pharmacy Technician Tri-City Medical Center | Outpatient Infusion Center 3617 Vista Way, Oceanside, CA 92056 P: 442.266.2809 or 442.266.2804 | F: 760.721.8736  [cid:image003.png@01D8952D.AC5BE480]  CONFIDENTIALITY NOTICE  This message and any included attachments are from the Tri-City Healthcare District and are intended only for the addressee.  The information contained in this message is confidential and may constitute non-public information under international, federal, or state securities laws and is intended only for the use of the addressee.  Unauthorized forwarding, printing, cop

In [180]:
next(instance_loader)

TrainingInstance(idx=2756, label='Account/Inquiry', email_subject='Paid - Invoice 7322186957 - to State of Florida Next Gen (ANID: AN01722330651) - Notification from Ariba Network', email_body='External Email â€“ Please use caution before opening attachments or clicking links   Your customer State of Florida Next Gen updated your invoice on SAP Business Network.  You can view the invoice in your online Outbox (ANID: AN11096391905).  Country  US Customer  State of Florida Next Gen Invoice number  7322186957 Invoice Status  Paid Description:   Header Level Exceptions: Accounting Verification Exception Please accept or edit the accounting information---   If you have any questions, contact your customer.   Download the SAP Business Network Supplier app to your mobile device and manage customer orders on the go.  Ariba, Inc., 3420 Hillview Ave, Bldg3, Palo Alto, CA 94304, USA SAP Business Network Privacy Statement Ariba Data Policy Support If a customer-specific privacy statement applies t

In [181]:
training_data_included.loc[train_idx, :].head(5)

Unnamed: 0,BU,case_number,ACCOUNT_BUSINESS_UNIT__C,received_at,sfdc_category,sfdc_subcategory,predicted_category,predicted_subcategory,record_type,probability,Accuracy_upd,Bin,email_subject,email_body
20775,PD,3553288,a1G4z00000H6C4aEAF,2023-10-13T12:37:20,Billing / Invoice,Billing Statements,Billing / Invoice,,1,0.474032,Correct,4,"Invoices 1 of 1 for 2057199110 , TEXAS INSTITU...","Dear Valued Customer, Your Cardinal Health in..."
46774,PD,3658829,,2023-11-29T20:25:47,Account/Inquiry,Account updates,Billing / Invoice,,1,0.566661,Incorrect,5,Invoice status from State of Florida Next Gen,External Email â€“ Please use caution before o...
15159,SPD,3530234,,2023-10-04T15:34:31,Order Processing,Order Entry,Order Processing,,2,0.738666,Correct,7,New soliris order,External Email â€“ Please use caution before o...
2756,PD,3479534,,2023-09-13T20:53:32,Account/Inquiry,Account updates,Account/Inquiry,,1,0.98752,Correct,9,Paid - Invoice 7322186957 - to State of Florid...,External Email â€“ Please use caution before o...
26009,SPD,3575015,,2023-10-23T19:21:39,General Inquiry,Drop-Ship Issue,Delivery,,2,0.486582,Incorrect,4,DROPSHIP VYVGART-160644502-TRACKING CENTRAL,ACCOUNT NUMBER 2150137661 ORDER NUMBER 10543...


In [182]:
large_docs = training_data[~size_mask]

In [183]:
large_docs.shape

(1141, 14)

In [184]:
sample_large_doc = large_docs.iloc[0]
display("--SUBJECT--\n", sample_large_doc.email_subject)
display("--BODY START--\n", sample_large_doc.email_body[:1500]), print("--BODY END--\n", sample_large_doc.email_body[-500:])

'--SUBJECT--\n'

'RE: Drop Ship Invoices'

'--BODY START--\n'

'Hi, Tara.  Thank you for your patience. I have sent a follow up email last week to the dropship billing team but have not heard back. Usually for Dropship invoices, it does not come the same day the order arrives. It arrives days/weeks (depending on the vendor)  later than the order.  Can you kindly provide the confirmation number/s of the orders?  Let me loop in customer service to assist.  Hi, @Pharma-CustomerService<mailto:Pharma-CustomerService@cardinalhealth.com>.  Kindly assist Tara with her issue. They have not been receiving their invoices for dropship orders  SAP Ship To Name 2052014745 PHOENIXVILLE HOSPITAL CO LLC 2057194735 PHOENIXVILLE HSP SYS PHARM 2150232475 PHOENIXVILLE HSP SYS PHARM  Let me know if you have any questions.  Thank you,   [cid:image002.png@01D9E48E.FD2FFD90]     Winona Fejer Sr. Specialist, Account Operations, Customer Success Pharmaceutical Distribution AcctOpsE@cardinalhealth.com<mailto:AcctOpsE@cardinalhealth.com>     From: May, Tara <TaraM.May@towerhe

--BODY END--
 __________________________________________  This message is for the designated recipient only and may contain privileged, proprietary or otherwise private information. If you have received it in error, please notify the sender immediately and delete the original. Any other use of the email by you is prohibited.  Dansk - Deutsch - Espanol - Francais - Italiano - Japanese - Nederlands - Norsk - Portuguese - Chinese Svenska: http://www.cardinalhealth.com/en/support/terms-and-conditions-english.html


(None, None)

## Export

In [190]:
#| hide
import nbdev; nbdev.nbdev_export()