# 10k retrieval filtering, 2-step prediction

> Sample 10k emails (8,000 train / 2,000 test) and build a two-step prediction chain

In [1]:
#| default_exp experiments.retrieval_filtering

In [35]:
#| export
from typing import Dict, List
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics, model_selection

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableSequence
from langchain.llms import VertexAI
from langchain.vectorstores import Chroma
from langchain.document_loaders import DataFrameLoader

from classifier.schema import get_embedder, get_model, quota_handler, WRITE_PREFIX, PROJECT_BUCKET
from classifier.load import Email, get_batches, get_idx, get_emails_from_frame, \
    get_possible_labels, get_raw_emails, email_small_enough, write_idx
from classifier.chroma import get_or_make_chroma
from classifier.predict import make_prediction_prompt, get_predictions, write_predictions
from classifier.experiments.split_processing import \
    format_email_for_train_summary, \
    format_email_for_test_summary, \
    make_description_from_row, batch_predict, \
    TRAIN_PROMPT, TEST_PROMPT

# Name of this experiment; also used below as the local sub-directory name.
EXPERIMENT_PREFIX = "retrieval_filtering"
# Prefix for this experiment's artefacts, nested under the shared WRITE_PREFIX.
EXPERIMENT_WRITE_PREFIX = f"{WRITE_PREFIX}/{EXPERIMENT_PREFIX}"

In [29]:
# Local data directory, resolved relative to the current working directory.
data_dir = Path("../../data")
assert data_dir.exists(), f"data directory not found: {data_dir.resolve()}"
# Per-experiment directory under the data directory.
experiment_dir = data_dir / EXPERIMENT_PREFIX
# exist_ok=True makes the cell safe to re-run without a separate exists() check.
experiment_dir.mkdir(exist_ok=True)

## Sample

In [6]:
# Load the raw emails; used below as a DataFrame with email_subject / email_body columns.
all_emails = get_raw_emails()

In [11]:
# Flag, row by row, the emails whose subject and body pass the size check.
size_mask = all_emails.apply(
    lambda email: email_small_enough(email.email_subject, email.email_body),
    axis="columns",
)
# How many emails pass (True) and fail (False) the check.
size_mask.value_counts()

True     56096
False     1141
Name: count, dtype: int64

Filter on size

In [14]:
# Keep only the emails that passed the size check.
all_emails_filtered = all_emails.loc[size_mask]
# Number of rows remaining after the filter.
len(all_emails_filtered)

56096

In [15]:
# Preview the first two rows.
# NOTE(review): the rendered output shows raw email subject/body text — clear it
# before sharing the notebook if that data is sensitive.
all_emails_filtered.head(2)

Unnamed: 0,BU,case_number,ACCOUNT_BUSINESS_UNIT__C,received_at,sfdc_category,sfdc_subcategory,predicted_category,predicted_subcategory,record_type,probability,Accuracy_upd,Bin,email_subject,email_body
0,SPD,3469839,,2023-09-11T13:22:32,Order Processing,Order Entry,Order Processing,,2,0.876806,Correct,8,PO# 7004014842 || Walgreens Store 16422 || Ohi...,External Email â€“ Please use caution before o...
2,PD,3469842,a1G4z00000H4uvREAR,2023-09-11T13:22:43,Order Processing,Order Entry,General Inquiry,,1,0.838036,Incorrect,8,Purchase Order #65398,External Email â€“ Please use caution before o...


Train Test Split

In [17]:
# Draw a stratified sample of 8,000 training and 2,000 test indices from the
# size-filtered emails, preserving the sfdc_category proportions.
# The seed is fixed so the same split is reproduced after a kernel restart;
# without it every run draws a different sample.
SPLIT_SEED = 42
train_idx, test_idx = model_selection.train_test_split(
    all_emails_filtered.index.values,
    test_size=2000,
    train_size=8000,
    random_state=SPLIT_SEED,
    stratify=all_emails_filtered.sfdc_category)

In [24]:
# Per-category counts for the train indices and then the test indices, shown
# together so the class proportions of the two splits can be compared.
tuple(
    all_emails_filtered.loc[split_idx, 'sfdc_category'].value_counts().sort_index()
    for split_idx in (train_idx, test_idx)
)

(sfdc_category
 Account/Inquiry         1712
 Billing / Invoice        316
 Credits                  128
 Delivery                 258
 General Inquiry          770
 Order Discrepancy        552
 Order Processing        3235
 Pricing                   33
 Product Inquiry          426
 Program / Promotions      34
 Returns                  536
 Name: count, dtype: int64,
 sfdc_category
 Account/Inquiry         428
 Billing / Invoice        79
 Credits                  32
 Delivery                 65
 General Inquiry         193
 Order Discrepancy       138
 Order Processing        809
 Pricing                   8
 Product Inquiry         106
 Program / Promotions      8
 Returns                 134
 Name: count, dtype: int64)

In [25]:
# Save the train/test indices under the experiment prefix; the cells below pass
# the same prefix to get_emails_from_frame(index_prefix=...) to load each split.
write_idx(
    pd.Index(train_idx),
    pd.Index(test_idx),
    prefix=EXPERIMENT_WRITE_PREFIX
)

## Process

In [39]:
# Materialise the 'train' split as a list: get_emails_from_frame receives the
# raw email frame, the split name and the prefix the indices were written under.
training_emails = list(get_emails_from_frame(
    get_raw_emails(),
    'train',
    index_prefix=EXPERIMENT_WRITE_PREFIX
))
# Number of training emails loaded.
len(training_emails)

8000

Load label descriptions

In [30]:
# Spreadsheet holding the label descriptions; fail fast if it is missing.
descriptions_path = data_dir / 'labels.xlsx'
assert descriptions_path.exists()

In [33]:
# Read the label spreadsheet and strip surrounding whitespace from every string
# cell; non-string cells are passed through unchanged.
descriptions = pd.read_excel(descriptions_path).map(
    lambda cell: cell.strip() if isinstance(cell, str) else cell
)
# Transpose, drop the first transposed row, and turn each remaining row into a
# description via make_description_from_row, keyed by the row's index label.
descriptions_dict = (
    descriptions.T
    .iloc[1:]
    .apply(make_description_from_row, axis="columns")
    .to_dict()
)

Process training data

In [36]:
# Chain for training summaries: TRAIN_PROMPT piped into a VertexAI LLM built with no arguments.
train_processing_chain = TRAIN_PROMPT | VertexAI()

In [43]:
# Disabled (commented out): this loop builds the training summaries that the
# next cell saves to split_train_summaries.csv and that are reloaded with
# pd.read_csv further down. Uncomment only to regenerate them.
# NOTE(review): the bare `except: pass` below hides any failure — if this is
# re-enabled, check the final length comparison before persisting anything.
# # Takes about 2.25 hours
# train_summaries = []

# pbar = tqdm(total=len(training_emails), ncols=80, leave=True)

# try:
#     for batch in get_batches(training_emails, 5):
#         batch_prompts = [format_email_for_train_summary(e, descriptions_dict) for e in batch]
#         train_summaries.extend(batch_predict(batch_prompts, train_processing_chain))
#         pbar.update(len(batch))
# except:
#     pass
# finally:
#     pbar.close()

# len(train_summaries) == len(training_emails)

In [44]:
# Disabled (commented out) along with the summarisation loop above: builds a
# frame of (summary, idx, label) and writes it to the CSV the next cell reads.
# train_summary_df = pd.DataFrame(
#     zip(
#         train_summaries,
#         [t.idx for t in training_emails],
#         [t.label for t in training_emails]
#     ),
#     columns=['summary', 'idx', 'label']
# )
# train_summary_df.to_csv(
#     f'gs://{PROJECT_BUCKET}/{EXPERIMENT_WRITE_PREFIX}/split_train_summaries.csv', 
#     index=False)

In [45]:
# Load the training summaries from the CSV that the commented-out cell above writes.
train_summary_df = pd.read_csv(f'gs://{PROJECT_BUCKET}/{EXPERIMENT_WRITE_PREFIX}/split_train_summaries.csv')

In [46]:
# Quick look at the loaded training summaries (columns: summary, idx, label).
train_summary_df.head(2)

Unnamed: 0,summary,idx,label
0,**Summary**\nThe email is about a new purchas...,40220,Order Processing
1,"**Summary:**\nBarbara Conley, an OS&D Analyst...",19920,Credits


Process test data

In [47]:
# Materialise the 'test' split as a list, using the same raw frame and index
# prefix as the training split.
test_emails = list(get_emails_from_frame(
    get_raw_emails(),
    'test',
    index_prefix=EXPERIMENT_WRITE_PREFIX
))
# Number of test emails loaded.
len(test_emails)

2000

In [49]:
# Chain for test summaries: TEST_PROMPT piped into a VertexAI LLM built with no arguments.
test_processing_chain = TEST_PROMPT | VertexAI()

In [50]:
# Takes about ~15 minutes
test_summaries = []

pbar = tqdm(total=len(test_emails), ncols=80, leave=True)

try:
    for batch in get_batches(test_emails, 5):
        batch_prompts = [format_email_for_test_summary(e) for e in batch]
        test_summaries.extend(batch_predict(batch_prompts, test_processing_chain))
        pbar.update(len(batch))
except:
    pass
finally:
    pbar.close()

len(test_summaries) == len(test_emails)

  2%|▉                                        | 45/2000 [01:04<44:20,  1.36s/it]

In [None]:
# zip() stops at its shortest input, so an incomplete summarisation run would
# otherwise be written out as a short file with no warning.
assert len(test_summaries) == len(test_emails), (
    f"expected {len(test_emails)} summaries, got {len(test_summaries)}; "
    "re-run the summarisation cell before saving"
)
# Pair summaries with email idx and label by position (assumes batch_predict
# returns one summary per prompt, in order — TODO confirm).
test_summary_df = pd.DataFrame(
    zip(
        test_summaries,
        [t.idx for t in test_emails],
        [t.label for t in test_emails]
    ),
    columns=['summary', 'idx', 'label']
)
# Save the test summaries to the experiment's location in the project bucket.
test_summary_df.to_csv(
    f'gs://{PROJECT_BUCKET}/{EXPERIMENT_WRITE_PREFIX}/split_test_summaries.csv',
    index=False)

In [None]:
# Reload the test summaries from the CSV written in the previous cell.
test_summary_df = pd.read_csv(f'gs://{PROJECT_BUCKET}/{EXPERIMENT_WRITE_PREFIX}/split_test_summaries.csv')

In [None]:
# Quick look at the reloaded test summaries.
test_summary_df.head(2)

## Vectorstore

## Prediction (2-step)

## Evaluation

## Export

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()