# This notebook provides example code for how a model was trained to identify potential Animal Exposures (and Other Pathogen Exposures) using SetFit

In [2]:
import sys
import os
import socket
import operator
import glob
import collections
import time
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from datasets import load_dataset
from datasets import Dataset
from sentence_transformers.losses import CosineSimilarityLoss
from sentence_transformers import SentenceTransformer
import evaluate

from setfit import SetFitModel, SetFitTrainer, sample_dataset

In [3]:
# Put the parent directory and its data folder at the front of the module
# search path; "../data" ends up first, followed by "..".
sys.path[:0] = [r"../data", r".."]

# The star imports presumably supply the *_animal_examples_masked lists that
# are referenced further down in this notebook.
from animal_exposure_sentences import *
from other_exposure_sentences import *

In [7]:
# Identifier handed to SetFitModel.from_pretrained further down
sentence_model_dir = "sentence-transformers/paraphrase-mpnet-base-v2"

print(sentence_model_dir)

sentence-transformers/paraphrase-mpnet-base-v2


# From here on, the code follows the sample code from the SetFit repository

In [8]:
# Set up a dataset with one group of examples per class.
# As in the other annotation work, the class-id convention is:
#   0 - negated (denied) exposure
#   1 - affirmed exposure
#   2 - no mention of exposure
# label_names is ordered so that label_names[class_id] is the display name.
label_names = ['DENIED_OR_NEGATED', 'AFFIRMED', 'NO_MENTION']

# One example list per class, taken from the *_animal_examples_masked variables
affirmed_examples, negated_examples, no_exposure_examples = (
    affirmed_animal_examples_masked,
    negated_animal_examples_masked,
    no_exposure_animal_examples_masked,
)

In [9]:
# Report how many examples each class contributes, one line per class
print(
    f"len(affirmed_examples): {len(affirmed_examples)}",
    f"len(negated_examples): {len(negated_examples)}",
    f"len(no_exposure_examples): {len(no_exposure_examples)}",
    sep="\n",
)

len(affirmed_examples): 5
len(negated_examples): 5
len(no_exposure_examples): 5


In [10]:
# Build one integer class id per example (1 = affirmed, 0 = negated, 2 = no mention)
affirmed_example_labels = [1 for _ in affirmed_examples]
negated_example_labels = [0 for _ in negated_examples]
no_exposure_example_labels = [2 for _ in no_exposure_examples]

# Concatenate texts and labels in the same class order so that X[i] pairs with y[i]
X = affirmed_examples + negated_examples + no_exposure_examples
y = affirmed_example_labels + negated_example_labels + no_exposure_example_labels

print(affirmed_example_labels)

[1, 1, 1, 1, 1]


In [11]:
# Stratified train/validation split with scikit-learn.
# NOTE(review): the original note here was cut off ("In the paper, less training
# data was used to ..."); the split used in the paper may differ - confirm.
TRAINING_DATA_SPLIT = 0.8

X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    train_size=TRAINING_DATA_SPLIT,
    test_size=1.0 - TRAINING_DATA_SPLIT,
    stratify=y,  # keep the class proportions the same in both splits
    random_state=777,  # fixed seed so the split is repeatable
)

In [12]:
# Wrap each split in a HuggingFace Dataset with "text" and "label" columns
dataset_dict_train = {"text": X_train, "label": y_train}
dataset_dict_validation = {"text": X_validation, "label": y_validation}

train_dataset = Dataset.from_dict(dataset_dict_train)
eval_dataset = Dataset.from_dict(dataset_dict_validation)

In [13]:
# Load the "accuracy" metric through the evaluate library.
# NOTE(review): accuracy_metric is not referenced again in the visible cells -
# confirm whether it is still needed.
accuracy_metric = evaluate.load("accuracy")

# Show the concrete class of the loaded metric object
print(type(accuracy_metric))

<class 'evaluate_modules.metrics.evaluate-metric--accuracy.f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14.accuracy.Accuracy'>


In [14]:
# Contrastive-learning settings. Both are kept small for example purposes;
# the values used in the paper are higher.
NUM_EPOCHS = 1       # alternative: NUM_EPOCHS = 4
NUM_ITERATIONS = 1   # alternative: NUM_ITERATIONS = 4

# Load the SetFit model. With local_files_only=True the files are expected to
# be available locally already.
model = SetFitModel.from_pretrained(sentence_model_dir, local_files_only=True)

# Inspect the base of the model and its tokens. The tokenizer is reached via
# the first module of the sentence-transformer body, following the Sentence
# Transformer (SBERT) documentation on adding special tokens.
model_body = model.model_body
word_embedding_model = model_body._first_module()
tokenizer = word_embedding_model.tokenizer

print(tokenizer.special_tokens_map)

# Create the trainer
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss_class=CosineSimilarityLoss,
    batch_size=16,  # alternative: batch_size=32
    # Number of text pairs to generate for contrastive learning
    num_iterations=NUM_ITERATIONS,
    # Number of epochs to use for contrastive learning
    num_epochs=NUM_EPOCHS,
    # Map dataset columns to the text/label names expected by the trainer
    column_mapping={"text": "text", "label": "label"},
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
  trainer = SetFitTrainer(
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}


Map:   0%|          | 0/12 [00:00<?, ? examples/s]

In [15]:
# Train the model on the training dataset (evaluation is done in the next cell)
trainer.train()

***** Running training *****
  Num unique pairs = 24
  Batch size = 16
  Num epochs = 1


Step,Training Loss
1,0.1287


In [16]:
# Evaluate the trained model with the trainer and show the resulting metrics
metrics = trainer.evaluate()
print(metrics)

***** Running evaluation *****


{'accuracy': 0.6666666666666666}


In [17]:
# Run the few-shot model on the validation texts and time the inference; the
# predictions feed the classification report in the next cell.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time, whereas time.time() is wall-clock time and can jump if the
# system clock is adjusted.
start_time = time.perf_counter()

validation_pred = model(X_validation)

end_time = time.perf_counter()

print(f"Total validation model inference time: [{end_time - start_time:.1f}] seconds")

Total validation model inference time: [0.2] seconds


In [18]:
# Per-class precision/recall/F1 of the few-shot model on the validation split
print('Validation of few-shot model on Validation Set')
print(classification_report(
    y_validation,
    validation_pred,
    # Pin the label ids so target_names always lines up with the classes, even
    # when a class is absent from both y_validation and validation_pred
    # (without this, the report raises on a length mismatch).
    labels=list(range(len(label_names))),
    target_names=label_names,
    digits=3,
    # A class that is never predicted has undefined precision; report it as 0
    # explicitly instead of relying on the warn-and-use-0 default.
    zero_division=0,
))

Validation of few-shot model on Validation Set
                   precision    recall  f1-score   support

DENIED_OR_NEGATED      0.000     0.000     0.000         1
         AFFIRMED      0.500     1.000     0.667         1
       NO_MENTION      1.000     1.000     1.000         1

         accuracy                          0.667         3
        macro avg      0.500     0.667     0.556         3
     weighted avg      0.500     0.667     0.556         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [19]:
# Finally, try the model on a few sentences that are very different from the training data
experiment_texts = [
    "Patient has been in contact with wild animals",
    "Patient denies any contact with wild animals",
    "Patient worked on amphibian vehicles during his service",
]

preds = model(experiment_texts)

print(type(preds))
print(preds.shape)

# Pair each input text with its prediction, in order
for text, pred in zip(experiment_texts, preds):
    print(f'Prediction [{pred.item()}] for text: [{text}]')

<class 'torch.Tensor'>
torch.Size([3])
Prediction [1] for text: [Patient has been in contact with wild animals]
Prediction [0] for text: [Patient denies any contact with wild animals]
Prediction [1] for text: [Patient worked on amphibian vehicles during his service]
