# load

> Load documents from GCS

In [1]:
#| default_exp load

In [51]:
#| export
from typing import Dict, Any, Iterable, List
import pandas as pd

from pydantic import BaseModel
from langchain.schema import Document
from langchain.document_loaders.base import BaseLoader

from classifier.schema import PROJECT_ID, PROJECT_BUCKET

Load the training data (the `Last50KCases_withSubjectAndBody.xlsx` workbook) from the project's GCS bucket.

In [4]:
# Read the labelled case export (an Excel workbook) from the project bucket.
training_data_uri = f"gs://{PROJECT_BUCKET}/Last50KCases_withSubjectAndBody.xlsx"
training_data = pd.read_excel(training_data_uri)

In [5]:
#| export
# Name of the spreadsheet column whose value is used as each instance's label.
label_column = "sfdc_category"

In [6]:
# Class balance: number of cases per category label.
training_data["sfdc_category"].value_counts()

sfdc_category
Order Processing        23011
Account/Inquiry         12128
General Inquiry          5645
Order Discrepancy        3924
Returns                  3806
Product Inquiry          3034
Billing / Invoice        2262
Delivery                 1880
Credits                  1066
Pricing                   241
Program / Promotions      240
Name: count, dtype: int64

In [8]:
# Inspect the first row to see which columns the export provides.
training_data.iloc[0]

BU                                                                        SPD
case_number                                                           3469839
ACCOUNT_BUSINESS_UNIT__C                                                  NaN
received_at                                               2023-09-11T13:22:32
sfdc_category                                                Order Processing
sfdc_subcategory                                                  Order Entry
predicted_category                                           Order Processing
predicted_subcategory                                                     NaN
record_type                                                                 2
probability                                                          0.876806
Accuracy_upd                                                          Correct
Bin                                                                         8
email_subject               PO# 7004014842 || Walgreens Store 16

In [52]:
#| export
def get_possible_labels() -> List[str]:
    """Return the distinct category labels present in the training spreadsheet.

    The workbook is read from the project bucket on every call, so callers
    that need the labels repeatedly should cache the result.

    Returns:
        The unique non-missing values of the `sfdc_category` column, in order
        of first appearance.
    """
    cases = pd.read_excel(
        f"gs://{PROJECT_BUCKET}/Last50KCases_withSubjectAndBody.xlsx"
    )
    # `unique()` keeps NaN, so drop rows without a category first; otherwise a
    # float NaN would be returned as if it were a label.
    return cases["sfdc_category"].dropna().unique().tolist()

In [41]:
#| export
class TrainingInstance(BaseModel):
    # One labelled email case, as built by `training_instance_from_row`.
    idx: int  # identifier supplied by the caller (the DataFrame index label in `get_training_instances`)
    label: str  # value of the row's `sfdc_category` column
    email_subject: str  # value of the row's `email_subject` column
    email_body: str  # value of the row's `email_body` column
    metadata: Dict[str, Any]  # all remaining columns of the row, as a plain dict


def _text_or_empty(value: Any) -> str:
    """Return `value` as text, mapping a missing cell (NaN/None) to ""."""
    # pandas represents an empty spreadsheet cell as float NaN, and a
    # numeric-only cell as a number; neither is a proper `str`.
    return "" if pd.isna(value) else str(value)


def training_instance_from_row(idx: int, row: pd.Series) -> TrainingInstance:
    """Build a `TrainingInstance` from one row of the training spreadsheet.

    Args:
        idx: Identifier stored on the instance (the row's index label when
            called from `get_training_instances`).
        row: A row containing at least the `sfdc_category`, `email_subject`
            and `email_body` columns. Every other column is copied into
            `metadata`.

    Returns:
        The populated `TrainingInstance`. A missing subject or body is stored
        as an empty string rather than being passed through as NaN.
    """
    metadata = row.drop(
        [
            'sfdc_category',
            'email_subject',
            'email_body'
        ]).to_dict()
    return TrainingInstance(
        idx=idx,
        label=row.sfdc_category,
        email_subject=_text_or_empty(row.email_subject),
        email_body=_text_or_empty(row.email_body),
        metadata=metadata
    )

In [44]:
# Build a single instance from the first row as a worked example.
example_training_instance = training_instance_from_row(0, training_data.iloc[0])

In [45]:
#| export
def get_training_instances() -> Iterable[TrainingInstance]:
    """Lazily yield one `TrainingInstance` per row of the training spreadsheet.

    This is a generator: the workbook is read from the project bucket when the
    first item is requested, and it is read again for every new generator.
    Each instance's `idx` is the row's index label in the loaded DataFrame.
    """
    cases = pd.read_excel(
        f"gs://{PROJECT_BUCKET}/Last50KCases_withSubjectAndBody.xlsx"
    )
    yield from (
        training_instance_from_row(row_label, case)
        for row_label, case in cases.iterrows()
    )

In [46]:
# Sanity check: pull the first instance from the generator
# (note that this re-reads the whole spreadsheet from the bucket).
next(get_training_instances())

TrainingInstance(idx=0, label='Order Processing', email_subject='PO# 7004014842 || Walgreens Store 16422 || Ohio State University', email_body='External Email â€“ Please use caution before opening attachments or clicking links  Cardinal Ordering Team,  Please place the drop ship order(s) listed below for:  Client Name Ohio State University PO ID 7004014842 Account # 2150126632 Store # 16422 NDC 70127010010 Drug Name EPIDIOLEX 100MG/ML SOL 100ML Order Quantity 5 Prescriber Name LUCRETIA LONG, PHILIP CLAYTON JONAS Prescriber NPI or DEA ML0822634, FJ1422132  Thanks & Regards, Bhavesh Lalwani', metadata={'BU': 'SPD', 'case_number': 3469839, 'ACCOUNT_BUSINESS_UNIT__C': nan, 'received_at': '2023-09-11T13:22:32', 'sfdc_subcategory': 'Order Entry', 'predicted_category': 'Order Processing', 'predicted_subcategory': nan, 'record_type': 2, 'probability': 0.8768061, 'Accuracy_upd': 'Correct', 'Bin': 8})

In [47]:
# Select the cases whose email body is longer than 30,000 characters.
body_lengths = training_data["email_body"].str.len()
large_docs = training_data.loc[body_lengths > 30000]

In [48]:
# (rows, columns) of the oversized-email subset.
large_docs.shape

(147, 14)

In [49]:
# Show the subject plus the first and last 500 characters of one oversized email.
sample_large_doc = large_docs.iloc[0]
print("--SUBJECT--\n", sample_large_doc.email_subject)
# Separate statements: joining the two prints with a comma builds a tuple,
# which the notebook then echoes as a stray `(None, None)` output.
print("--BODY START--\n", sample_large_doc.email_body[:500])
print("--BODY END--\n", sample_large_doc.email_body[-500:])

--SUBJECT--
 09.12.23 Oncaspar and Asparlas Tracking
--BODY START--
 CAH PO# Sales Document Ship To Customer Number Ship To Customer Name Address Street Address City Address Region Address City Postal Code Material Description QTY TRACKING # Ship Date Service Type Description NOTES 1.61E+08 1051125329 2057190317 ANNANDROBERT H LURIE CHLDRN WAC 225 EAST CHICAGO AVE CHICAGO IL 60611-2991 ASPARLAS SF 750U/ML 5ML SPD DSHP 2 703867100356 9/7/2023 12:00:00 AM FedEx Priority Overnight Delivered  1.61E+08 1051115361 2052044098 KAISER ONCOLOGY PHARMACY 598 1600 EUREKA RD 
--BODY END--
  UNIVERSITY HOSP SUNY HSC WAC 750 E ADAMS ST SYRACUSE NY 13210-2306 ASPARLAS SF 750U/ML 5ML SPD DSHP 1 703867124592 9/11/2023 12:00:00 AM FedEx Standard Overnight At FedEx destination facility  1.61E+08 1051373221 2150204117 UK HOSP PHARM MC OP 340B 800 ROSE ST ROOM HC201 LEXINGTON KY 40536 ASPARLAS SF 750U/ML 5ML SPD DSHP 1 703867124684 9/11/2023 12:00:00 AM FedEx Standard Overnight Departed FedEx location  1.61

(None, None)

In [54]:
#| hide
# Run nbdev's export step for this project (presumably writes the `#| export`
# cells above to the module named by `default_exp` — confirm against nbdev docs).
import nbdev; nbdev.nbdev_export()