# BioBert Fine-Tuning for Disease
In this notebook we will be fine-tuning a BioBERT base model for disease NER on biomedical texts. 

In [1]:
!pip install transformers==3.1.0
!pip install seqeval
!pip install tensorflow

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 19.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 4.7 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 26.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 30.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [2]:
!git clone https://github.com/dmis-lab/biobert-pytorch.git
%cd biobert-pytorch
!./download.sh

Cloning into 'biobert-pytorch'...
remote: Enumerating objects: 236, done.[K
remote: Counting objects: 100% (236/236), done.[K
remote: Compressing objects: 100% (205/205), done.[K
remote: Total 236 (delta 59), reused 173 (delta 18), pack-reused 0[K
Receiving objects: 100% (236/236), 1.92 MiB | 10.44 MiB/s, done.
Resolving deltas: 100% (59/59), done.
/content/biobert-pytorch
BIOBERT_DATA not set; downloading to default path ('data').
--2021-11-30 03:57:51--  https://docs.google.com/uc?export=download&confirm=IT5D&id=1cGqvAm9IZ_86C4Mj7Zf-w9CFilYVDl8j
Resolving docs.google.com (docs.google.com)... 108.177.121.138, 108.177.121.139, 108.177.121.102, ...
Connecting to docs.google.com (docs.google.com)|108.177.121.138|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-0c-9g-docs.googleusercontent.com/docs/securesc/nfedaqvs775iq04kq7vptvunu7rtcs2d/450ocig5tranh43fm43i6na4nqj5mkuo/1638244650000/13799006341648886493/14960654872265816510Z/1cG

In [3]:
!mkdir datasets/NER/NCB-BC5CDR-disease
!cat datasets/NER/NCBI-disease/devel.tsv datasets/NER/BC5CDR-disease/devel.tsv > datasets/NER/NCB-BC5CDR-disease/devel.tsv
!cat datasets/NER/NCBI-disease/train.tsv datasets/NER/BC5CDR-disease/train.tsv > datasets/NER/NCB-BC5CDR-disease/train.tsv
!cat datasets/NER/NCBI-disease/test.tsv datasets/NER/BC5CDR-disease/test.tsv > datasets/NER/NCB-BC5CDR-disease/test.tsv
!cat datasets/NER/NCBI-disease/train_dev.tsv datasets/NER/BC5CDR-disease/train_dev.tsv > datasets/NER/NCB-BC5CDR-disease/train_dev.tsv

In [4]:
!rm -rf datasets/NER/NCBI-disease
!mv datasets/NER/NCB-BC5CDR-disease datasets/NER/NCBI-disease

In [5]:
%cd named-entity-recognition/
!./preprocess.sh

/content/biobert-pytorch/named-entity-recognition
*****  NCBI-disease  Preprocessing Start *****
Replacing Done
Downloading: 100% 29.0/29.0 [00:00<00:00, 23.0kB/s]
Downloading: 100% 570/570 [00:00<00:00, 354kB/s]
Downloading: 100% 208k/208k [00:00<00:00, 2.00MB/s]
Downloading: 100% 426k/426k [00:00<00:00, 2.51MB/s]
*****  NCBI-disease  Preprocessing Done *****
*****  BC5CDR-disease  Preprocessing Start *****
Replacing Done
*****  BC5CDR-disease  Preprocessing Done *****
*****  BC5CDR-chem  Preprocessing Start *****
Replacing Done
*****  BC5CDR-chem  Preprocessing Done *****
*****  BC4CHEMD  Preprocessing Start *****
Replacing Done
*****  BC4CHEMD  Preprocessing Done *****
*****  JNLPBA  Preprocessing Start *****
Replacing Done
*****  JNLPBA  Preprocessing Done *****
*****  BC2GM  Preprocessing Start *****
Replacing Done
*****  BC2GM  Preprocessing Done *****
*****  linnaeus  Preprocessing Start *****
Replacing Done
*****  linnaeus  Preprocessing Done *****
*****  s800  Preprocessing St

In [6]:
!python run_ner.py \
  --data_dir ../datasets/NER/NCBI-disease/ \
  --labels ../datasets/NER/NCBI-disease/labels.txt \
  --model_name_or_path dmis-lab/biobert-base-cased-v1.1 \
  --output_dir output/NCBI-disease \
  --max_seq_length 128 \
  --num_train_epochs 3 \
  --per_device_train_batch_size 32 \
  --save_steps 1000 \
  --seed 1 \
  --do_train \
  --do_eval \
  --do_predict \
  --overwrite_output_dir

11/30/2021 04:23:11 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=True,
do_train=True,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=output/NCBI-disease/runs/Nov30_04-23-11

In [19]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline 

#tokenizer = AutoTokenizer.from_pretrained("output/NCBI-disease/")
model = AutoModelForTokenClassification.from_pretrained("output/NCBI-disease/")


In [20]:
input_text = "Impact of COVID-19 Pandemic in a Pediatric and Congenital Cardiovascular Surgery Program in Brazil"
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
ner_results = nlp(input_text)
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(input_text)))
print(ner_results)

NameError: ignored

In [21]:
!ls output/NCBI-disease

checkpoint-1000  config.json  pytorch_model.bin  runs  training_args.bin


In [22]:
!cat output/NCBI-disease/config.json


{
  "_name_or_path": "dmis-lab/biobert-base-cased-v1.1",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "B-bio",
    "1": "I-bio",
    "2": "O"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-bio": 0,
    "I-bio": 1,
    "O": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}
