In [None]:
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch] -U
!pip install seqeval

In [None]:
import utils
from utils import compute_metrics
import datasets
from datasets import load_from_disk
from transformers import DistilBertTokenizerFast, DataCollatorForTokenClassification, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
import json


In [None]:
# Tokenizer and token-classification model, both initialised from the same
# pretrained checkpoint.
MODEL_CHECKPOINT = "distilbert-base-cased"
# Size of the classification head; must equal the number of entries in the
# label_to_id mapping defined further down (ids 0..24).
NUM_LABELS = 25

tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
)

In [None]:
# Load the train and test splits previously saved with `save_to_disk`.
# NOTE(review): the paths live on Google Drive, so Drive presumably has to be
# mounted at /content/drive before this cell runs - confirm.
train_dataset = load_from_disk(
    '/content/drive/MyDrive/Colab Notebooks/underwriteme-data/combined_train_dataset'
)
test_dataset = load_from_disk(
    '/content/drive/MyDrive/Colab Notebooks/underwriteme-data/test_dataset'
)

# Show the schema and row count of each split (train first, then test).
for loaded_split in (train_dataset, test_dataset):
    print(loaded_split)

Dataset({
    features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 27454
})
Dataset({
    features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 6223
})


In [None]:
# NER tag vocabulary in id order: a label's position in this list is its
# integer id. 'O' comes first, followed by a B-/I- pair per entity type.
NER_LABELS = [
    'O',
    'B-Investigation', 'I-Investigation',
    'B-Treatment', 'I-Treatment',
    'B-Condition', 'I-Condition',
    'B-Behaviour', 'I-Behaviour',
    'B-Date', 'I-Date',
    'B-Unit', 'I-Unit',
    'B-Sign', 'I-Sign',
    'B-Symptom', 'I-Symptom',
    'B-Drug', 'I-Drug',
    'B-LabResult', 'I-LabResult',
    'B-HealthcareProvider', 'I-HealthcareProvider',
    'B-LabTest', 'I-LabTest',
]

# Forward mapping: NER label -> integer id.
label_to_id = {label: index for index, label in enumerate(NER_LABELS)}

# Reverse mapping: integer id -> NER label.
id_to_label = dict(enumerate(NER_LABELS))

In [None]:
# Training configuration; checkpoints and logs are written under "test-ner2".
# NOTE(review): the training log recorded below this notebook's train cell
# lists 5 epochs, while num_train_epochs is 3 here - the saved output appears
# to come from a different configuration; confirm which one is intended.
args = TrainingArguments(
    output_dir="test-ner2",
    evaluation_strategy="epoch",  # run evaluation at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Collator that batches the tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer)

# seqeval metric object. It is not passed to the Trainer in this cell; the
# Trainer receives `compute_metrics` imported from `utils` instead.
metric = datasets.load_metric("seqeval")

# The test split is used as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  metric = datasets.load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Run fine-tuning with the Trainer configured above. As the last expression of
# the cell, the returned TrainOutput is displayed as the cell result.
# NOTE(review): the recorded output below reports 5 epochs, whereas the
# TrainingArguments cell sets num_train_epochs=3 - the output looks stale.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,2.5222,0.981448,0.541667,0.657444,0.593966,0.758797
2,0.6594,0.287756,0.752183,0.840118,0.793722,0.945016
3,0.2328,0.114534,0.912801,0.948407,0.930264,0.98246
4,0.1606,0.066146,0.974733,0.984239,0.979463,0.991464
5,0.1069,0.055316,0.97896,0.98662,0.982775,0.992379


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=4290, training_loss=0.6399908239191229, metrics={'train_runtime': 694.8297, 'train_samples_per_second': 197.559, 'train_steps_per_second': 6.174, 'total_flos': 972931510765212.0, 'train_loss': 0.6399908239191229, 'epoch': 5.0})

In [None]:
## Output directories for the fine-tuned artifacts
MODEL_DIR = "ner_model2"
TOKENIZER_DIR = "ner_tokenizer2"

## Save model
model.save_pretrained(MODEL_DIR)

## Save tokenizer (last expression: the written file paths are displayed)
# NOTE(review): the recorded output below lists 'ner_tokenizer_final/...'
# paths, which do not match TOKENIZER_DIR - the saved output looks stale.
tokenizer.save_pretrained(TOKENIZER_DIR)

('ner_tokenizer_final/tokenizer_config.json',
 'ner_tokenizer_final/special_tokens_map.json',
 'ner_tokenizer_final/vocab.txt',
 'ner_tokenizer_final/added_tokens.json',
 'ner_tokenizer_final/tokenizer.json')

In [None]:
# Write label-id into config file
# Patch the saved model's config.json so it carries the label mappings.
# Files are opened with context managers so the handles are closed (and the
# write is flushed to disk) deterministically instead of whenever the file
# objects happen to be garbage-collected.
config_path = "ner_model2/config.json"

with open(config_path, encoding="utf-8") as config_file:
    config = json.load(config_file)

# json.dump serialises the integer keys of id_to_label as strings, as JSON
# object keys must be strings.
config["id2label"] = id_to_label
config["label2id"] = label_to_id

with open(config_path, "w", encoding="utf-8") as config_file:
    json.dump(config, config_file)