In [None]:
!pip install setfit
import torch
import random
import numpy as np
from sentence_transformers.losses import CosineSimilarityLoss
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from setfit import SetFitTrainer,SetFitModel
import pyarrow as pa
import pandas as pd
from datasets import Dataset
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, StratifiedKFold
from statistics import mean 
from sklearn.metrics import classification_report
from sklearn import metrics

# Seed Python, NumPy and PyTorch RNGs with the same fixed value so runs are repeatable.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
device

'cuda:0'

## DATASET

In [None]:
# Load the labelled reports and discard the two columns that are not used below.
df = pd.read_excel("PE_combined.xlsx")
df = df.drop(columns=['Unnamed: 0','Accession Number'])

# Replace every whitespace run, and every literal backslash-n sequence, with one space.
df['doc'] = df['doc'].replace(r'\s+|\\n', ' ', regex=True)

# Raw annotation -> class label. Both "Probable" annotations share class '1';
# 'Indeterminate' and 'Non-diagnostic' are marked so they can be filtered out.
label_map = {
    'Definitive PE NEG': '0',
    'Definitive PE POS': '2',
    'Probable PE NEG': '1',
    'Probable PE POS': '1',
    'Indeterminate': 'inconclusive',
    'Non-diagnostic': 'inconclusive',
}
df['target'] = df['target'].replace(label_map)
df = df[df['target'] != 'inconclusive']

# Stratified 80/20 split with a fixed random_state so the partition is repeatable.
train_df,test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['target'],
    random_state=15,
)
# train_df , eval_df = train_test_split(train_df, test_size=0.25,stratify=train_df['target'],random_state=20)

In [None]:
print(len(train_df))
print(len(test_df)) 
# print(len(eval_df))

121
31


In [None]:
train_df

Unnamed: 0,doc,target
55,1. No evidence of pulmonary embolism. 2. Multi...,0
70,1. No pulmonary embolism. Enlargement of the c...,0
106,1.No definite evidence of pulmonary embolism. ...,1
74,1. No evidence of pulmonary embolism. 2. No fo...,0
150,Suboptimal bolus timing for evaluation of the ...,1
...,...,...
33,1. No pulmonary embolism. 2. Findings of CHF i...,0
24,1. No pulmonary embolism. Enlargement of the c...,0
110,1. No pulmonary embolism. 2. Mildly enlarged m...,0
65,No evidence of central pulmonary embolism. The...,1


## MODEL

In [None]:
def model_init(params):
    """Build a fresh SetFit model on the paraphrase-mpnet-base-v2 backbone.

    Args:
        params: Optional mapping of extra keyword arguments forwarded to
            ``SetFitModel.from_pretrained``. Any falsy value (``None``, ``{}``)
            means "no extra arguments".

    Returns:
        Whatever ``SetFitModel.from_pretrained`` returns for this checkpoint.
    """
    extra_kwargs = params if params else {}
    return SetFitModel.from_pretrained(
        "sentence-transformers/paraphrase-mpnet-base-v2",
        **extra_kwargs,
    )

def model_init_roberta(params):
    """Build a fresh SetFit model on the all-roberta-large-v1 backbone.

    The checkpoint is referenced by its fully qualified Hub id (with the
    ``sentence-transformers/`` organisation prefix), matching ``model_init``,
    instead of relying on the bare model name being resolved implicitly.

    Args:
        params: Optional mapping of extra keyword arguments forwarded to
            ``SetFitModel.from_pretrained``. Any falsy value (``None``, ``{}``)
            means "no extra arguments".

    Returns:
        Whatever ``SetFitModel.from_pretrained`` returns for this checkpoint.
    """
    params = params or {}
    return SetFitModel.from_pretrained("sentence-transformers/all-roberta-large-v1", **params)


    

def compute_metrics(labels,pred):
    """Score predictions against ground-truth labels.

    Args:
        labels: Ground-truth class labels.
        pred: Predicted class labels, aligned with ``labels``.

    Returns:
        dict with keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
        Precision, recall and F1 are macro-averaged over the classes.
    """
    def macro(scorer):
        # All three class-wise metrics share the same macro-averaged call shape.
        return scorer(y_true=labels, y_pred=pred, average='macro')

    overall_accuracy = accuracy_score(y_true=labels, y_pred=pred)
    macro_recall = macro(recall_score)
    macro_precision = macro(precision_score)
    macro_f1 = macro(f1_score)
    return {
        "accuracy": overall_accuracy,
        "precision": macro_precision,
        "recall": macro_recall,
        "f1": macro_f1,
    }



# Hyperparameter Search

In [None]:
# Grid search over (num_iterations, batch_size) using 5-fold stratified
# cross-validation on the training split only; test_df is never touched here.
# Folds are taken in row order (StratifiedKFold is created without shuffle).
skf = StratifiedKFold(n_splits=5)
iterations_test = [10,15,20,25]
batch_test = [4,8,16]

# One dict per configuration: the hyperparameters that were tried plus the
# fold-averaged accuracy / precision / recall / f1 they achieved.
hyperparameter_search_results = []
for num_iterations in iterations_test:
    for batch_size in batch_test:
        list_of_results = []
        for train_index, test_index in skf.split(train_df['doc'], train_df['target']):
            # The training fold goes through the trainer as a `datasets.Dataset`;
            # the held-out fold stays a DataFrame because it is scored manually below.
            train_dataset = Dataset(pa.Table.from_pandas(train_df.iloc[train_index]))
            eval_dataset = train_df.iloc[test_index]
            trainer = SetFitTrainer(
                model_init=model_init,
                train_dataset=train_dataset,
                loss_class=CosineSimilarityLoss,
                batch_size=batch_size,
                num_iterations=num_iterations, # The number of text pairs to generate for contrastive learning
                num_epochs=1, # The number of epochs to use for contrastive learning
                column_mapping={"doc": "text", "target": "label"},
                seed=123,
            )
            trainer.train()

            preds = trainer.model.predict(list(eval_dataset['doc']))
            truth = list(eval_dataset['target'])
            results = compute_metrics(truth,preds)
            list_of_results.append(results)

        # Keep the configuration next to its scores so each entry of the result
        # list can be traced back to the hyperparameters that produced it.
        avg_results = {"num_iterations": num_iterations, "batch_size": batch_size}
        # Take the metric names from the collected fold results rather than from
        # whatever `results` happened to hold after the last fold.
        for key in list_of_results[0].keys():
            avg_results[key] = mean([d[key] for d in list_of_results])
        hyperparameter_search_results.append(avg_results)

In [None]:
hyperparameter_search_results

[{'accuracy': 0.8888888888888888,
  'precision': 0.8730769230769231,
  'recall': 0.8444444444444444,
  'f1': 0.854615596963423},
 {'accuracy': 0.9,
  'precision': 0.8897435897435897,
  'recall': 0.8666666666666666,
  'f1': 0.8750917874396135},
 {'accuracy': 0.8888888888888888,
  'precision': 0.8912698412698412,
  'recall': 0.8388888888888889,
  'f1': 0.850613154960981},
 {'accuracy': 0.8888888888888888,
  'precision': 0.8777777777777778,
  'recall': 0.861111111111111,
  'f1': 0.8672187715665977},
 {'accuracy': 0.9,
  'precision': 0.8897435897435897,
  'recall': 0.8666666666666666,
  'f1': 0.8750917874396135},
 {'accuracy': 0.9,
  'precision': 0.8897435897435897,
  'recall': 0.8666666666666666,
  'f1': 0.8750917874396135},
 {'accuracy': 0.9111111111111111,
  'precision': 0.9126984126984127,
  'recall': 0.8722222222222222,
  'f1': 0.8843127886606147},
 {'accuracy': 0.9,
  'precision': 0.8897435897435897,
  'recall': 0.8666666666666666,
  'f1': 0.8750917874396135},
 {'accuracy': 0.9,
  'p

# Test

In [None]:
# Wrap the complete training split (all rows of train_df) for the final fit.
train_table = pa.Table.from_pandas(train_df)
train_dataset = Dataset(train_table)
# eval_dataset = Dataset(pa.Table.from_pandas(eval_df))


In [None]:
# eval_dataset

In [None]:
# Final fit on the full training split.
# NOTE(review): the cross-validation search uses `model_init` (mpnet) while this
# run uses `model_init_roberta` — confirm batch_size=16 / num_iterations=25 were
# meant to carry over to this backbone.
final_trainer_kwargs = dict(
    model_init=model_init_roberta,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset,
    loss_class=CosineSimilarityLoss,
    batch_size=16,
    num_iterations=25,  # The number of text pairs to generate for contrastive learning
    num_epochs=1,  # The number of epochs to use for contrastive learning
    column_mapping={"doc": "text", "target": "label"},
    seed=123,
)
trainer = SetFitTrainer(**final_trainer_kwargs)
trainer.train()



config.json not found in HuggingFace Hub


Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/191 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.84k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/650 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/328 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
***** Running training *****
  Num examples = 6050
  Num epochs = 1
  Total optimization steps = 379
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/379 [00:00<?, ?it/s]

In [None]:
# Redundant with the import in the setup cell; kept so this cell is self-contained.
from sklearn.metrics import classification_report

# Score the final model on the held-out test split: summary metrics first,
# then the per-class breakdown.
truth = test_df['target'].tolist()
preds = trainer.model.predict(test_df['doc'].tolist())
results = compute_metrics(truth,preds)
print(results)
print(classification_report(truth,preds))

{'accuracy': 0.9032258064516129, 'precision': 0.8507936507936508, 'recall': 0.8222222222222223, 'f1': 0.8342941611234295}
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        20
           1       0.80      0.67      0.73         6
           2       0.80      0.80      0.80         5

    accuracy                           0.90        31
   macro avg       0.85      0.82      0.83        31
weighted avg       0.90      0.90      0.90        31

