## Trains a transformers model to extract the relevant content of emails
Recommended setup:
- AWS SageMaker Studio
- Image: PyTorch 2.0.0 Python 3.10 GPU Optimized
- Instance Type: g4dn.xlarge

In [None]:
%%capture
!pip install transformers[torch] datasets evaluate huggingface_hub sentencepiece seqeval accelerate ipywidgets

### Load models and data

In [None]:
import csv
import ast
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
import configparser

In [None]:
# Constants
# Notebook settings are read from config.ini in the working directory.
# configparser's read() silently skips files it cannot find, so a missing
# config.ini only shows up as a KeyError on the lookups below.
config = configparser.ConfigParser()
config.read('config.ini')
# Text encoding used when opening the annotation CSV in read_csv().
ENCODING = config['global']['ENCODING']

# Base checkpoint that both extractor models are fine-tuned from.
model_checkpoint = "roberta-base"
# Token length of each window produced by tokenize().
max_length = 512
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Load annotated data from file

training_data_file = config['extract_contents']['TRAINING_ANNOTATION_FILE']
# Zero-based CSV column indexes read by read_csv():
#   'from'     - row kind marker: '1' is treated as a question, '2' as an answer
#   'body'     - the email text
#   'question' - annotated span list used for question rows
#   'answer'   - annotated span list used for answer rows
cols = {
    'from': 9,
    'body': 4,
    'question': 13,
    'answer': 3,
}

# Parallel lists filled by read_csv(): "text" holds email bodies and "target"
# holds the matching {"start", "end"} character span for each body.
q_docs = {"text": [], "target": []}
a_docs = {"text": [], "target": []}

def read_csv():
    """
    Load the annotated emails from the training CSV into q_docs / a_docs.

    Reads `training_data_file` (opened with `ENCODING`), skips the header row
    and any empty rows, and routes each remaining row by its 'from' column:
    '1' goes to `q_docs` (span taken from the 'question' column), '2' goes to
    `a_docs` (span taken from the 'answer' column); any other value is skipped.

    For every kept row the email body is appended to "text" and a
    {"start", "end"} dict to "target". Only the first annotated span is used.
    Rows whose span cell is blank, or holds an empty list, get the
    {"start": 0, "end": 0} placeholder instead of raising IndexError.

    Returns nothing; the module-level `q_docs` and `a_docs` are extended in place.
    """
    # NOTE(review): the csv docs recommend opening with newline=''. It is not
    # changed here because it alters how line breaks embedded in the bodies are
    # read, and the span offsets index into that text - confirm which form the
    # annotations assume before switching.
    with open(training_data_file, 'r', encoding=ENCODING) as csvfile:
        datareader = csv.reader(csvfile)
        next(datareader, None)  # skip header row (tolerates an empty file)

        for row in datareader:
            if not row:
                continue

            sender = row[cols['from']]
            if sender == '1':
                spans, docs = row[cols['question']], q_docs
            elif sender == '2':
                spans, docs = row[cols['answer']], a_docs
            else:
                continue

            # The cell holds a Python-literal list of span dicts; literal_eval
            # only accepts literals, so it cannot execute code from the file.
            parsed = ast.literal_eval(spans) if spans else []
            if parsed:
                first = parsed[0]
                target = {"start": first["start"], "end": first["end"]}
            else:
                target = {"start": 0, "end": 0}

            # Append both only after parsing succeeded so the two lists can
            # never end up with different lengths.
            docs["text"].append(row[cols['body']])
            docs["target"].append(target)

def make_dataset_split(docs, test_size=0.15, valid_size=0.15, seed=42):
    """
    Creates a dataset with train/test/valid split

    Args:
        docs: column dict (e.g. {"text": [...], "target": [...]}) passed to
            Dataset.from_dict.
        test_size: fraction of all examples placed in the 'test' split.
        valid_size: fraction of all examples placed in the 'valid' split.
        seed: seed for both shuffles. The fixed default makes the split
            reproducible, so calling this again on the same docs (as the
            inference cell does) returns the same 'test' examples that were
            held out during training instead of a fresh random draw that may
            contain training examples. Pass None for a non-deterministic split.

    Returns:
        DatasetDict with 'train', 'test' and 'valid' splits.
    """
    dataset = Dataset.from_dict(docs)
    holdout_size = test_size + valid_size

    # First carve off the combined test+valid portion, then divide it so that
    # 'test' ends up with test_size of the whole dataset.
    train_testvalid = dataset.train_test_split(test_size=holdout_size, seed=seed)
    test_valid = train_testvalid['test'].train_test_split(
        test_size=test_size / holdout_size,
        seed=seed,
    )

    return DatasetDict({
        'train': train_testvalid['train'],
        'test': test_valid['test'],
        'valid': test_valid['train'],
    })

# Fill q_docs and a_docs from the annotation file.
read_csv()

In [None]:
# Set up the data as expected for the training function

# Token classes: "I-E" marks a token inside the span to extract, "O" marks
# every other token. A label's id is its position in this list.
label_names = ["I-E", "O"]

# Name -> id and id -> name lookups derived from the single list above.
label2id = {name: index for index, name in enumerate(label_names)}
id2label = dict(enumerate(label_names))

def tokenize(examples):
    """
    Tokenizes raw text with the notebook-level tokenizer.

    Every output is truncated/padded to `max_length` tokens. Overflowing
    tokens are returned as additional windows (stride 128), and the character
    offset mapping of every token is included in the result.
    """
    options = {
        "truncation": True,
        "is_split_into_words": False,
        "return_overflowing_tokens": True,
        "return_offsets_mapping": True,
        "max_length": max_length,
        "stride": 128,
        "padding": "max_length",
    }
    return tokenizer(examples, **options)

def preprocess(examples):
    """
    Tokenizes a batch of examples and builds the per-token training labels.

    Args:
        examples: batch with a "text" list and a parallel "target" list of
            {"start", "end"} character spans.

    Returns:
        The output of tokenize(), without "overflow_to_sample_mapping" and
        "offset_mapping", plus:
          - "labels": one list per tokenized window; -100 for tokens whose
            offsets are (0, 0) (special tokens), label2id["I-E"] for tokens
            touching the target span, label2id["O"] otherwise.
          - "sample_ids": for each window, the index (within the batch) of the
            example it was cut from.

    A target of {"start": 0, "end": 0} is the "no span" placeholder written by
    read_csv(); such examples are labelled entirely "O". Without that check the
    widened comparison below would label the first token(s) of every
    unannotated email as in-span.
    """
    tokenized_inputs = tokenize(examples["text"])

    overflow_to_sample_mapping = tokenized_inputs.pop("overflow_to_sample_mapping")
    offsets_mapping = tokenized_inputs.pop("offset_mapping")
    targets = examples["target"]

    in_span_id = label2id["I-E"]
    outside_id = label2id["O"]

    labels = []
    sample_ids = []
    for sample_id, offset_mapping in zip(overflow_to_sample_mapping, offsets_mapping):
        sample_ids.append(sample_id)
        target = targets[sample_id]
        start_char = target["start"]
        # The span end is widened by one character and both comparisons are
        # inclusive, so tokens that merely touch the span edges count as in-span.
        # NOTE(review): expected_ans() slices text[start:end], i.e. treats "end"
        # as exclusive - confirm this extra tolerance is intended.
        end_char = target["end"] + 1
        has_span = not (target["start"] == 0 and target["end"] == 0)

        label = []
        for token_start, token_end in offset_mapping:
            if token_start == 0 and token_end == 0:
                # ignore special token
                label.append(-100)
            elif has_span and token_end >= start_char and token_start <= end_char:
                # in span
                label.append(in_span_id)
            else:
                # outside of span
                label.append(outside_id)

        labels.append(label)

    tokenized_inputs["labels"] = labels
    tokenized_inputs["sample_ids"] = sample_ids
    return tokenized_inputs

def tokenize_dataset(dataset):
    """
    Applies preprocess() to `dataset` in batches and returns the result.

    The raw "text" and "target" columns are removed from the output.
    """
    raw_columns = ["text", "target"]
    return dataset.map(preprocess, batched=True, remove_columns=raw_columns)

In [None]:
# Batch collator for token classification; handed to the Trainer in train_model().
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

### Train the models

In [None]:
import evaluate
import numpy as np
from huggingface_hub import notebook_login
from transformers import TrainingArguments, Trainer, AutoModelForTokenClassification

In [None]:
# Set the metrics for training
# The seqeval metric is loaded once here and used by compute_metrics() below.

metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    """
    Scores a batch of evaluation predictions with the seqeval metric.

    Args:
        eval_preds: (logits, labels) pair; the predicted class of each token
            is the argmax of its logits over the last axis.

    Returns:
        dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Positions labelled -100 (special tokens) are dropped from both the
    # references and the predictions; the remaining ids become label names.
    references = [
        [label_names[label_id] for label_id in label_row if label_id != -100]
        for label_row in labels
    ]

    hypotheses = []
    for predicted_row, label_row in zip(predictions, labels):
        kept = [
            label_names[predicted_id]
            for predicted_id, label_id in zip(predicted_row, label_row)
            if label_id != -100
        ]
        hypotheses.append(kept)

    scores = metric.compute(predictions=hypotheses, references=references)

    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

In [None]:
# Log in to the Hugging Face Hub so the trained models can be saved there
# (train_model() sets push_to_hub=True and calls trainer.push_to_hub()).

from huggingface_hub import interpreter_login

interpreter_login()

In [None]:
def train_model(dataset, hf_name):
    """
    Fine-tunes a token-classification model and pushes it to the Hub.

    Args:
        dataset: tokenized splits; 'train' is used for training and 'valid'
            for evaluation.
        hf_name: output directory / model name handed to TrainingArguments.

    A fresh model is loaded from `model_checkpoint` with the two-label head
    described by id2label / label2id. The trainer is configured with
    load_best_model_at_end=True and metric_for_best_model="f1", and the result
    is pushed with trainer.push_to_hub().
    """
    print(f"Training model {hf_name}")

    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=2,
        id2label=id2label,
        label2id=label2id,
    )

    hyperparameters = {
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "learning_rate": 2e-5,
        "num_train_epochs": 4,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "weight_decay": 0.01,
        "logging_steps": 25,
        "eval_steps": 25,
        "load_best_model_at_end": True,
        "metric_for_best_model": "f1",
        "push_to_hub": True,
    }
    training_args = TrainingArguments(output_dir=hf_name, **hyperparameters)

    hf_trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["valid"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    hf_trainer.train()
    hf_trainer.push_to_hub()
    # Drop the local reference to the model before returning.
    del model

In [None]:
# Train the answer extractor
# Split the answer documents, tokenize/label them, then fine-tune and push the
# model under the name configured as HF_ANSWER_MODEL_NAME.

dataset_a = make_dataset_split(a_docs)
tokenized_dataset_a = tokenize_dataset(dataset_a)
train_model(tokenized_dataset_a, config['extract_contents']['HF_ANSWER_MODEL_NAME'])
# Drop the references to the datasets once training is done.
del dataset_a
del tokenized_dataset_a

In [None]:
# Train the question extractor
# Split the question documents, tokenize/label them, then fine-tune and push
# the model under the name configured as HF_QUESTION_MODEL_NAME.

dataset_q = make_dataset_split(q_docs)
tokenized_dataset_q = tokenize_dataset(dataset_q)
train_model(tokenized_dataset_q, config['extract_contents']['HF_QUESTION_MODEL_NAME'])
# Drop the references to the datasets once training is done.
del dataset_q
del tokenized_dataset_q

### Inference
You can test inference with your models below. Call `compare(dataset, model, split, i)` to print the expected (annotated) span next to the span predicted by `model` for the example at index `i` of the given split (`train`, `test` or `valid`) of `dataset`.

In [None]:
from transformers import pipeline

# Model names of the two fine-tuned extractors (the same config keys the
# training cells pass to train_model()).
q_checkpoint = config['extract_contents']['HF_QUESTION_MODEL_NAME']
a_checkpoint = config['extract_contents']['HF_ANSWER_MODEL_NAME']

# NOTE: this rebinds the notebook-level `max_length` and `tokenizer` that the
# training cells also use.
max_length = 512
# The tokenizer loaded from the question checkpoint is shared by both pipelines.
# NOTE(review): presumably fine because both models are fine-tuned from the
# same base checkpoint - confirm.
tokenizer = AutoTokenizer.from_pretrained(q_checkpoint, max_length=max_length, stride = 128, return_overflowing_tokens=True)
a_model = pipeline("ner", model=a_checkpoint, tokenizer=tokenizer, aggregation_strategy="simple", stride = 128)
q_model = pipeline("ner", model=q_checkpoint, tokenizer=tokenizer, aggregation_strategy="simple", stride = 128)
# Rebuild the (untokenized) splits so compare() can look up raw text and targets.
# NOTE(review): make_dataset_split is called again here; unless it splits
# deterministically, these 'test' splits can differ from the ones held out
# during training and may include training examples - confirm.
dataset_a = make_dataset_split(a_docs)
dataset_q = make_dataset_split(q_docs)

In [None]:
def expected_ans(dataset, split, i):
    """
    Returns the annotated span of example `i` in `dataset[split]`.

    The span is the slice of the example's text from target["start"] up to,
    but not including, target["end"].
    """
    examples = dataset[split]
    span = examples['target'][i]
    text = examples['text'][i]
    return text[span['start']:span['end']]

In [None]:
def predicted_ans(dataset, model, split, i):
    """
    Returns the span `model` predicts for example `i` in `dataset[split]`.

    `model` is called on the example text and must return a list of tags with
    'score', 'start' and 'end'. Tags scoring at least 0.9 are kept; if none
    reach that score, the single best-scoring tag is used instead. The result
    is the text from the smallest kept start to the largest kept end, or ''
    when the model returns no tags at all.
    """
    text = dataset[split]['text'][i]
    tags = model(text)

    confident = [tag for tag in tags if tag['score'] >= .9]
    if not confident and len(tags) > 0:
        # Nothing is confident enough: fall back to the best available tag.
        confident = [max(tags, key=lambda tag: tag['score'])]
    if not confident:
        return ''

    first_char = min(tag['start'] for tag in confident)
    last_char = max(tag['end'] for tag in confident)
    return text[first_char:last_char]

In [None]:
def compare(dataset, model, split, i):
    """
    Prints the expected and the predicted span for example `i` of
    `dataset[split]`, separated by blank lines.
    """
    expected = expected_ans(dataset, split, i)
    print(f"Expected: {expected}")
    print('\n')

    # The model is only run after the expected span has been printed.
    predicted = predicted_ans(dataset, model, split, i)
    print(f"Actual: {predicted}")
    print('\n\n')

In [None]:
# Question extractor: expected vs. predicted span for test example 1.
compare(dataset_q, q_model, 'test', 1)

In [None]:
# Answer extractor: expected vs. predicted span for test example 0.
compare(dataset_a, a_model, 'test', 0)