In [None]:
# Checkpoint name used for both the tokenizer and the token-classification model.
# Alternative checkpoint that was tried here: "allenai/longformer-base-4096".
model = "dmis-lab/biobert-base-cased-v1.1"

In [None]:
from datasets import load_dataset

# Load the dataset identified as 'Stardrums/pico-breast-cancer'; the result is indexed
# by split name ("train", "validation", "test") in the cells below.
dataset = load_dataset('Stardrums/pico-breast-cancer')

In [None]:
# List the split names available in the loaded dataset.
dataset.keys()

In [None]:
# Display the validation split (used later as the Trainer's eval_dataset).
dataset["validation"]

In [None]:
# Tag names of the `ner_tags` feature; an integer tag id indexes into this list to get
# its string name (see `label_names[...]` lookups further down).
label_names = dataset["train"].features["ner_tags"].feature.names

## Data Preprocessing

- BERT expects its inputs as `input_ids`, `token_type_ids` and `attention_mask`
- The labels also need to be adjusted, because BERT's subword tokenization can split one word into several tokens

In [None]:
from transformers import AutoTokenizer
# `model` is still the checkpoint-name string here (it is rebound to the loaded network
# in the fine-tuning section), so this cell must run before that one.
tokenizer = AutoTokenizer.from_pretrained(model)

### Let's see why we need to adjust the labels

- We will process the tokens using tokenizer object

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of pre-split words without touching the labels.

    Args:
        examples: batch mapping whose "tokens" entry holds one list of words per sample.

    Returns:
        Whatever the module-level `tokenizer` returns for these words, with every
        sample padded/truncated to 512 tokens.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "is_split_into_words": True,
        "max_length": 512,
    }
    return tokenizer(examples["tokens"], **encode_options)

In [None]:
# Tokenize every split in batches. No columns are removed, so the original word-level
# `ner_tags` stay next to the new `input_ids` for the length comparison below.
tokenized_datasets_ = dataset.map(tokenize_function, batched=True)

In [None]:
# tokenized_datasets_['train'][0]['input_ids'][:20]

In [None]:
# tokenized_datasets_['train'][0]['ner_tags'][:20]

In [None]:
# Does the number of sub-word tokens equal the number of word-level tags for the first
# training sample?
len(tokenized_datasets_['train'][0]['input_ids']) == len(tokenized_datasets_['train'][0]['ner_tags'])

- We can see that the length of `input_ids` does not match the length of `ner_tags`, which is why we need to adjust the labels to line up with the tokenized output

<hr/>

- We will use the argument `truncation=True` (to truncate texts that are longer than the maximum size allowed by the model), as the data contains a sequence with length > 512

In [None]:
#Get the values for input_ids, attention_mask, adjusted labels
def tokenize_adjust_labels(all_samples_per_split):
  """Tokenize a batch of pre-split words and align the word-level tags to sub-word tokens.

  Args:
    all_samples_per_split: batch mapping with "tokens" (one list of words per sample)
      and "ner_tags" (one list of integer tag ids per sample, one id per word).

  Returns:
    The tokenizer output for the batch with an extra "labels" entry: one list per
    sample, as long as that sample's token sequence. Tokens with no source word
    (word id None, i.e. special tokens) get -100 so the loss ignores them; every
    other token, including continuation pieces of a split word, gets the tag of
    the word it came from.
  """
  # Called directly (rather than via batch_encode_plus) to match tokenize_function.
  tokenized_samples = tokenizer(
      all_samples_per_split["tokens"],
      is_split_into_words=True,
      truncation=True,
      max_length=512,
  )

  total_adjusted_labels = []

  for k in range(len(tokenized_samples["input_ids"])):
    existing_label_ids = all_samples_per_split["ner_tags"][k]
    # Index the tags by the tokenizer's own word id instead of a running counter:
    # a counter drifts (shifting every later label) if a word yields no tokens.
    adjusted_label_ids = [
        -100 if word_idx is None else existing_label_ids[word_idx]
        for word_idx in tokenized_samples.word_ids(batch_index=k)
    ]
    total_adjusted_labels.append(adjusted_label_ids)

  #add adjusted labels to the tokenized samples
  tokenized_samples["labels"] = total_adjusted_labels
  return tokenized_samples

# Apply the tokenize-and-align step to every split in batches; the raw word-level
# columns are dropped so only the tokenizer outputs and the aligned `labels` remain.
tokenized_dataset = dataset.map(tokenize_adjust_labels, batched=True, remove_columns=['tokens', 'ner_tags'])

- We will now have all the required fields for training, 'input_ids', 'token_type_ids', 'attention_mask', 'labels'

In [None]:
# Display the processed splits and their columns.
tokenized_dataset

In [None]:
# Peek at the first two processed training samples.
tokenized_dataset['train'][:2]

- As we can see, different samples have different lengths, so we need to
pad the tokens (and labels) of each batch to the same length

- https://huggingface.co/docs/transformers/main/main_classes/data_collator#transformers.DataCollatorForTokenClassification

In [None]:
from transformers import DataCollatorForTokenClassification
# Collator handed to the Trainer below to pad the variable-length samples of a batch.
# NOTE(review): `max_length` presumably only takes effect when padding to a fixed
# length is requested -- confirm against the collator documentation linked above.
data_collator = DataCollatorForTokenClassification(tokenizer, max_length=512)

In [None]:
# Show the collator's configuration.
data_collator

## Fine Tuning

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForTokenClassification, AdamW

In [None]:
#check if gpu is present
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# device

- We will fine-tune the checkpoint selected at the top of the notebook (`dmis-lab/biobert-base-cased-v1.1`)
- We need to specify the number of labels present in the dataset

In [None]:
# `model` holds the checkpoint name (a str) until this cell rebinds it to the loaded
# network. Keep the name under its own variable so that re-running this cell does not
# pass an already-loaded model object to `from_pretrained`.
if isinstance(model, str):
    model_checkpoint = model

# Store the dataset's tag names in the model config so that predictions (for example
# from `pipeline`) are reported with real tag names instead of generic LABEL_<n>.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)
# model.to(device)

- Create a function to generate metrics
- We will use `seqeval` metrics, commonly used for token classification

In [None]:
# !pip install seqeval -q

In [None]:
import numpy as np
from datasets import load_metric
# seqeval scorer shared by compute_metrics and the test-set evaluation below.
# NOTE(review): `load_metric` is reportedly deprecated/removed in recent `datasets`
# releases in favour of the separate `evaluate` package -- confirm for the installed version.
metric = load_metric("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: pair `(predictions, labels)`; `predictions` holds per-token class scores
           (argmax is taken over axis 2) and `labels` the gold tag ids, with -100 at
           positions that must be ignored.

    Returns:
        dict with the overall "precision", "recall", "f1" and "accuracy" reported by
        the module-level `metric`.
    """
    logits, gold_ids = p

    # Most likely class index for each token.
    predicted_ids = np.argmax(logits, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Drop positions marked -100 (special tokens), then map ids to tag names.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_names[pred] for pred, _ in kept])
        true_labels.append([label_names[gold] for _, gold in kept])

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    for key in ("precision", "recall", "f1", "accuracy"):
        summary[key] = scores["overall_" + key]
    return summary

In [None]:
# Sanity check of the metric: score one sample's gold tags against themselves
# (expected to give perfect scores).
example = dataset["train"][1]
labels = [label_names[tag_id] for tag_id in example["ner_tags"]]
metric.compute(predictions=[labels], references=[labels])

- Fine Tuning using Trainer API

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for the run. The same batch size is used for training and evaluation.
batch_size = 16
# Log at every optimisation step; the commented-out expression is the once-per-epoch alternative.
logging_steps = 1#len(tokenized_dataset['train']) // batch_size
epochs = 20

# Checkpoints/outputs go to ./results/; evaluation is requested once per epoch.
# NOTE(review): newer transformers releases reportedly rename `evaluation_strategy`
# to `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results/",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps)


In [None]:
# Wire everything together: the loaded model, the training arguments, the aligned
# train/validation splits, the padding collator and the seqeval-based metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [None]:
# Fine-tune the model with the settings in `training_args` (long-running cell).
trainer.train()

In [None]:
# Score the fine-tuned model on the Trainer's eval_dataset (the validation split).
trainer.evaluate()

To get the precision/recall/f1 computed for each category for test set, we can apply the same function as before on the result of the `predict` method:

In [None]:
# Run the fine-tuned model on the test split; keep the raw scores and the gold tag ids.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
# Most likely class index for each token.
predictions = np.argmax(predictions, axis=2)

# Keep only positions with a real label (-100 marks ignored tokens such as special
# tokens) and translate tag ids to tag names.
true_predictions = [
    [
        label_names[pred_id]
        for (pred_id, gold_id) in zip(pred_row, gold_row)
        if gold_id != -100
    ]
    for pred_row, gold_row in zip(predictions, labels)
]
true_labels = [
    [
        label_names[gold_id]
        for (pred_id, gold_id) in zip(pred_row, gold_row)
        if gold_id != -100
    ]
    for pred_row, gold_row in zip(predictions, labels)
]

# Full seqeval report (per-category as well as overall scores).
results = metric.compute(predictions=true_predictions, references=true_labels)
results

## Observations

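- f1 score for LOC and PER is >85% and ORG has <75%
- Overall f1 score is ~83%
- We can improve the accuracy by training the model for a larger number of epochs
- **TODO:** LOC, PER and ORG do not look like categories of the PICO dataset used in this notebook, so the figures above may be carried over from a different run; re-derive them from the `results` computed on the test set.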

In [None]:
from transformers import pipeline

In [None]:
# Wrap the fine-tuned model and its tokenizer in a token-classification pipeline so
# raw text can be tagged directly.
pipe = pipeline("token-classification", model=trainer.model, tokenizer = tokenizer)


In [None]:
# NOTE(review): called without any inputs, so there appears to be nothing for it to
# move to the device -- confirm whether this cell is needed.
pipe.ensure_tensor_on_device()

In [None]:
text = "Objectives: Primary objectives: to compare radial artery occlusion rate (RAO) after cardiac catheterization between catecholamine-chitosan pad (InnoSEAL) and pneumatic compression device (PCD) and to compare difference in hemostasis time and radial monitoring termination time between two arms. Secondary objectives: to compare radial site bleeding and ease of use of two methods by cath-lab technicians. Background: Hemostatic pads may be an effective alternative to PCD with lesser chance of access site complications with advantage of shortened compression time. Methods: Patients (N = 606) undergoing trans-radial, diagnostic or interventional procedures were randomized to either InnoSEAL arm or PCD. RAO was assessed using US Duplex; performed 6-24 hr posthemostatic device removal. Time to hemostasis was recorded as per defined protocols. Ease of use among cath-lab technicians was assessed through 5 point Likert scale. Results: Data of 597 patients was analyzed (299 InnoSEAL, 298 PCD). RAO rate was 8.5% in InnoSEAL and 9.4% in PCD arm (p value >.05). The pooled median hemostasis time and time to termination of radial monitoring was 42 versus 225 min and 50 versus 240 min in InnoSEAL and PCD arms, respectively (p value: <.01). There was no difference in Grade I/II hematoma (InnoSEAL: 1.3% vs. PCD: 3.4%). InnoSEAL was marginally acceptable compared to PCD by technicians. Conclusion: Hemostasis time is significantly shorter in InnoSEAL arm with reasonable acceptability to its usage among cath lab staff. RAO and bleeding complications are comparable between the arms. Based on our findings, it seems feasible to include Chitosan based hemostasis pad routinely in cath lab."

# Tag the example abstract defined above and print the pipeline's raw output.
print(pipe.predict(text))