## Applies a transformers model to classify emails
Recommended setup:
- AWS SageMaker Studio
- Image: PyTorch 2.0.0 Python 3.10 GPU Optimized
- Instance Type: g4dn.xlarge

In [4]:
%%capture
!pip install transformers[torch] pandas tqdm

In [16]:
from transformers import pipeline, AutoTokenizer
from transformers.pipelines.pt_utils import KeyDataset
from datasets import Dataset
import pandas as pd
from numpy import nan
import configparser

In [17]:
# Constants
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so fail here instead of with an opaque KeyError below.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini could not be read from the working directory")
ENCODING = config['global']['ENCODING']  # Encoding used for both the CSV read and write
checkpoint = config['classify_emails']['HF_CLASSIFIER_NAME']
in_path = config['make_pairs']['OUT_FILE']  # Input is the OUT_FILE of the [make_pairs] section
out_path = config['classify_emails']['OUT_FILE']

max_length = 512  # Maximum number of tokens kept per example when truncating
batch_size = 32  # Number of examples to batch in pipeline

# `model_max_length` is the tokenizer attribute that truncation=True truncates to;
# a plain `max_length` keyword to from_pretrained() does not set that limit.
# Padding and truncation themselves are requested per call by the pipeline.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=max_length)

In [18]:
def load_classifier(checkpoint):
    """
    Build a text-classification pipeline for the given checkpoint.

    Uses the module-level `tokenizer` and `batch_size`, and enables padding and
    truncation for every call. The pipeline is placed on the first CUDA device
    when one is available and on the CPU otherwise.
    """
    import torch  # Local import: torch is not imported at the top of the notebook

    # Device index 0 is the first GPU; -1 tells transformers pipelines to use the CPU.
    device = 0 if torch.cuda.is_available() else -1
    return pipeline("text-classification", model=checkpoint, tokenizer=tokenizer, padding=True, truncation=True,
                   device=device, batch_size=batch_size)


def extract(dataset, classifier):
    """
    Run the classifier over the "text" column of the dataset.

    Returns a list holding the 'label' entry of each prediction, in the order
    the classifier yields them.
    """
    text_column = KeyDataset(dataset, "text")
    return [prediction['label'] for prediction in classifier(text_column)]

def from_csv(in_path, out_path):
    """
    Label every question/answer email pair in a CSV file.

    Reads `in_path` (first column as index), turns each row's 'question' and
    'answer' values into one combined prompt, stores the predicted label in a
    new 'label' column and writes the resulting frame to `out_path`.
    """
    model = load_classifier(checkpoint)
    pairs = pd.read_csv(in_path, encoding=ENCODING, index_col=0)

    # Build one prompt per row
    prompts = []
    for question, answer in zip(pairs['question'], pairs['answer']):
        prompts.append(f">>> Question:\n{question}\n\n>>> Answer:\n{answer}")

    # Classify all prompts and attach the labels
    pairs['label'] = extract(Dataset.from_dict({"text": prompts}), model)

    # Persist the labelled pairs
    pairs.to_csv(out_path, encoding=ENCODING)

In [19]:
# Classify the pairs at in_path and write the labelled CSV to out_path
from_csv(in_path=in_path, out_path=out_path)