<a href="https://colab.research.google.com/github/areias/slm-finetunig/blob/main/medical_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Domain specific NER and finetuing


## Medical


* https://huggingface.co/datasets/ncbi_disease



* https://github.com/JHnlp/BioCreative-V-CDR-Corpus  - BC5CDR corpus consists of 1500 PubMed articles with 4409 annotated chemicals, 5818 diseases and 3116 chemical-disease interactions.

* https://paperswithcode.com/dataset/radgraph



* https://huggingface.co/datasets/tner/bionlp2004

 Dataset Card for "tner/bionlp2004"
Dataset Summary

BioNLP2004 NER dataset formatted in a part of TNER project. BioNLP2004 dataset contains training and test only, so we randomly sample a half size of test instances from the training set to create validation set.

    Entity Types: DNA, protein, cell_type, cell_line, RNA



## Other


https://huggingface.co/datasets/sofc_materials_articles




In [1]:
! pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
label2id = {
    "O": 0,
    "B-DNA": 1,
    "I-DNA": 2,
    "B-protein": 3,
    "I-protein": 4,
    "B-cell_type": 5,
    "I-cell_type": 6,
    "B-cell_line": 7,
    "I-cell_line": 8,
    "B-RNA": 9,
    "I-RNA": 10
}

In [3]:
from datasets import load_dataset

dataset = load_dataset("tner/bionlp2004")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
dataset.keys()

dict_keys(['train', 'validation', 'test'])

In [5]:
dataset['train'][0]

{'tokens': ['Since',
  'HUVECs',
  'released',
  'superoxide',
  'anions',
  'in',
  'response',
  'to',
  'TNF',
  ',',
  'and',
  'H2O2',
  'induces',
  'VCAM-1',
  ',',
  'PDTC',
  'may',
  'act',
  'as',
  'a',
  'radical',
  'scavenger',
  '.'],
 'tags': [0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [6]:
# join sentences
# replace tags
# structure into input/ output
# Swap keys and values using dictionary comprehension
swapped_dict = {v: k for k, v in label2id.items()}

# Print the swapped dictionary
print(swapped_dict)


{0: 'O', 1: 'B-DNA', 2: 'I-DNA', 3: 'B-protein', 4: 'I-protein', 5: 'B-cell_type', 6: 'I-cell_type', 7: 'B-cell_line', 8: 'I-cell_line', 9: 'B-RNA', 10: 'I-RNA'}


In [7]:
def label_tokens(entry):
    entry['ner_labels'] = [swapped_dict[x] for x in entry['tags']]
    return entry


In [8]:
dataset['train'] = dataset["train"].map(label_tokens)
dataset['test'] = dataset["test"].map(label_tokens)


In [9]:
def tokens_to_sentence(entry):
    entry['sentence'] = ' '.join(entry['tokens'])
    return entry

dataset['train'] = dataset["train"].map(tokens_to_sentence)
dataset['test'] = dataset["test"].map(tokens_to_sentence)


In [10]:
def extract_entities(entry):
    entities = {'DNA': [], 'protein': [], 'cell_type': [], 'cell_line':[], 'RNA': []}
    current_entity = {"type": None, "words": []}
    for word, label in zip(entry['tokens'], entry['ner_labels']):
        if label.startswith('B-'):
            entity_type = label.split('-')[1]
            if current_entity["type"] == entity_type:
                entities[entity_type].append(' '.join(current_entity["words"]))
                current_entity["words"] = [word]
            else:
                if current_entity["type"] is not None:
                    entities[current_entity["type"]].append(' '.join(current_entity["words"]))
                current_entity = {"type": entity_type, "words": [word]}
        elif label.startswith('I-'):
            if current_entity["type"] is not None:
                current_entity["words"].append(word)
        else:
            if current_entity["type"] is not None:
                entities[current_entity["type"]].append(' '.join(current_entity["words"]))
            current_entity = {"type": None, "words": []}
    if current_entity["type"] is not None:
        entities[current_entity["type"]].append(' '.join(current_entity["words"]))

    entry['entities'] = entities
    return entry


In [11]:
dataset['train'] = dataset["train"].map(extract_entities)
dataset['test'] = dataset["test"].map(extract_entities)


In [12]:
dataset['train'][0]['entities']

{'DNA': [],
 'RNA': [],
 'cell_line': ['HUVECs'],
 'cell_type': [],
 'protein': ['VCAM-1']}

In [13]:
def eval_formatting_func(entry):
    #tokenizer add bos token
    text = (f"[INST] You are an NLP expert tasked with Bio-entity Entity Extraction. "
        f"Identify entities of the type DNA, RNA, cell_line, cell_type, and protein in the following sentence: '{entry['sentence']}'\n"
        "Your answer must be in the form of a dict {'DNA':['DNA entity 1', 'DNA entity 2', '...'], 'RNA': [], 'cell_line': [], 'cell_type': [],"
        "'protein': []}\n"
        "Take care, your answer is only valid if it follows the correct format! [/INST]")

    return text

In [14]:
def formatting_func(entry):
    #tokenizer add bos token
    text = (f"[INST] You are an NLP expert tasked with Bio-entity Entity Extraction. "
        f"Identify entities of the type DNA, RNA, cell_line, cell_type, and protein in the following sentence: '{entry['sentence']}'\n"
        "Your answer must be in the form of a dict {'DNA':['DNA entity 1', 'DNA entity 2', '...'], 'RNA': [], 'cell_line': [], 'cell_type': [],"
        "'protein': []}\n"
        "Take care, your answer is only valid if it follows the correct format! [/INST]\n"
        f"{entry['entities']}</s>")
    return text

In [15]:
print(formatting_func(dataset['train'][9]))

[INST] You are an NLP expert tasked with Bio-entity Entity Extraction. Identify entities of the type DNA, RNA, cell_line, cell_type, and protein in the following sentence: 'By employing a derivative E box that binds ZEB but not E2A , we have shown that the repressor is active in B cells and the IgH enhancer is silenced in the absence of binding competition by bHLH proteins .'
Your answer must be in the form of a dict {'DNA':['DNA entity 1', 'DNA entity 2', '...'], 'RNA': [], 'cell_line': [], 'cell_type': [],'protein': []}
Take care, your answer is only valid if it follows the correct format! [/INST]
{'DNA': ['E box', 'IgH enhancer'], 'RNA': [], 'cell_line': [], 'cell_type': ['B cells'], 'protein': ['ZEB', 'E2A', 'repressor', 'bHLH proteins']}</s>


In [16]:
len(dataset['train']), len(dataset['test']), len(dataset['validation'])

(16619, 3856, 1927)

In [17]:
def count_entities(dataset):
    # Initialize counters
    entity_counts = {
        'DNA': 0,
        'RNA': 0,
        'cell_line': 0,
        'cell_type': 0,
        'protein': 0
    }

    # Count entities
    for entry in dataset:
        for entity, values in entry['entities'].items():
            if values:
                entity_counts[entity] += len(values)

    # Print counts
    for entity, count in entity_counts.items():
        print(f"{entity}: {count}")


In [18]:
count_entities(dataset['train'])

DNA: 8273
RNA: 820
cell_line: 3325
cell_type: 6090
protein: 27240


In [19]:
count_entities(dataset['test'])

DNA: 1056
RNA: 118
cell_line: 500
cell_type: 1921
protein: 5067


In [48]:
train_sample = dataset["train"].shuffle(seed=42).select(range(1000))
test_sample = dataset["test"].shuffle(seed=42).select(range(100))


In [23]:
test_sample

Dataset({
    features: ['tokens', 'tags', 'ner_labels', 'sentence', 'entities'],
    num_rows: 2
})

In [21]:
count_entities(train_sample)

DNA: 506
RNA: 48
cell_line: 193
cell_type: 326
protein: 1609


In [22]:
count_entities(test_sample)

DNA: 2
RNA: 0
cell_line: 0
cell_type: 1
protein: 0


## Load model


In [21]:
! pip install -q peft
! pip install -q git+https://github.com/huggingface/accelerate.git
! pip install -q bitsandbytes
! pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [25]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

In [26]:
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [27]:
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [28]:
# Init an eval tokenizer that doesn't add padding or eos token
eval_tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    add_bos_token=True,
)

In [40]:
idx=45
eval_prompt = eval_formatting_func(test_sample[idx])
print(eval_prompt)

[INST] You are an NLP expert tasked with Bio-entity Entity Extraction. Identify entities of the type DNA, RNA, cell_line, cell_type, and protein in the following sentence: 'All clinical specimens from patients with lymphatic leukemia have some measurable level of glucocorticoid receptors ; therefore , the resistance seen in vivo can not be explained by the lack of receptors .'
Your answer must be in the form of a dict {'DNA':['DNA entity 1', 'DNA entity 2', '...'], 'RNA': [], 'cell_line': [], 'cell_type': [],'protein': []}
Take care, your answer is only valid if it follows the correct format! [/INST]


In [41]:
ground_truth = test_sample[idx]['entities']
ground_truth

{'DNA': [],
 'RNA': [],
 'cell_line': [],
 'cell_type': [],
 'protein': ['glucocorticoid receptors']}

In [40]:
def get_prediction(entry):

    eval_prompt = eval_formatting_func(entry)

    model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

    model.eval()
    with torch.no_grad():
        response  = eval_tokenizer.decode(model.generate(**model_input,
                        max_new_tokens=256, repetition_penalty=1.15,
                        pad_token_id=eval_tokenizer.eos_token_id)[0], skip_special_tokens=True)
        response = response.replace(eval_prompt, "")
    return response

In [50]:
from tqdm import tqdm

predictions = []
for sample in tqdm(test_sample):
    response = get_prediction(sample)
    predictions.append({"sentence":sample['sentence'],
                       "entities":sample['entities'],
                        "base_response": response})

100%|██████████| 100/100 [17:46<00:00, 10.67s/it]


In [54]:
cd ..


/


In [57]:
ls

[0m[01;36mbin[0m@                        [01;34metc[0m/     [01;36mlibx32[0m@                   [01;34mproc[0m/  [30;42mtmp[0m/
[01;34mboot[0m/                       [01;34mhome[0m/    [01;34mmedia[0m/                    [01;34mroot[0m/  [01;34mtools[0m/
[01;34mcontent[0m/                    [01;34mkaggle[0m/  med-ner-predictions.json  [01;34mrun[0m/   [01;34musr[0m/
cuda-keyring_1.0-1_all.deb  [01;36mlib[0m@     [01;34mmnt[0m/                      [01;36msbin[0m@  [01;34mvar[0m/
[01;34mdatalab[0m/                    [01;36mlib32[0m@   NGC-DL-CONTAINER-LICENSE  [01;34msrv[0m/
[01;34mdev[0m/                        [01;36mlib64[0m@   [01;34mopt[0m/                      [01;34msys[0m/


In [51]:
len(predictions)

100

In [56]:
import json

# Save the list of items to a JSON file
with open("../med-ner-predictions.json", 'w') as f:
    json.dump(predictions, f, indent=4)

In [87]:
from tqdm import tqdm

for idx in tqdm(range(len(test_sample[:2]))):

    eval_prompt = eval_formatting_func(test_sample[idx])

    model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

    model.eval()
    with torch.no_grad():
        response  = eval_tokenizer.decode(model.generate(**model_input,
                        max_new_tokens=256, repetition_penalty=1.15,
                        pad_token_id=eval_tokenizer.eos_token_id)[0], skip_special_tokens=True)
        response = response.replace(eval_prompt, "")

    response_dict = parse_response(response)

    test_sample[idx].update({'base_model': {'response': response,
                                 'response_dict': response_dict,
                                 'metrics': []}})

  0%|          | 0/5 [00:09<?, ?it/s]


KeyboardInterrupt: 

In [84]:
test_sample[0].update({'base_model': {'response': response,
                                 'response_dict': response_dict,
                                 'metrics': []}})

In [85]:
test_sample[0]

{'tokens': ['Immunoglobulin',
  '(',
  'Ig',
  ')',
  '-kappa',
  'promoters',
  'from',
  'humans',
  'and',
  'mice',
  'share',
  'conserved',
  'sequences',
  '.'],
 'tags': [1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 2, 0],
 'ner_labels': ['B-DNA',
  'I-DNA',
  'I-DNA',
  'I-DNA',
  'I-DNA',
  'I-DNA',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-DNA',
  'I-DNA',
  'O'],
 'sentence': 'Immunoglobulin ( Ig ) -kappa promoters from humans and mice share conserved sequences .',
 'entities': {'DNA': ['Immunoglobulin ( Ig ) -kappa promoters',
   'conserved sequences'],
  'RNA': [],
  'cell_line': [],
  'cell_type': [],
  'protein': []}}

In [77]:
[x['base_model']['response'] for x in test_sample[:3]]

TypeError: string indices must be integers

In [73]:
import re

def parse_response(sentence):
    # Find the text between curly braces
    match = re.search(r'\{[^{}]+\}', sentence)  # Changed 'response' to 'sentence'
    # If a match is found, evaluate it as a dictionary using eval

    if match:
        entities_str = match.group(0)
        try:
            entities_dict = eval(entities_str)
            # List of keys to check
            expected_keys = ['DNA', 'RNA', 'cell_line', 'cell_type', 'protein']
            # Check each key and add it if missing
            for key in expected_keys:
                if key not in entities_dict:
                    entities_dict[key] = []  # Add key with an empty list as value
            return entities_dict
        except:
            return {'DNA':[], 'RNA':[], 'cell_line':[], 'cell_type':[], 'protein': []}
    else:
        # If no match is found, return an empty dictionary
        return {'DNA':[], 'RNA':[], 'cell_line':[], 'cell_type':[], 'protein': []}


In [64]:
response_dict = parse_response(response)
response_dict

{'DNA': [], 'RNA': [], 'cell_line': [], 'cell_type': [], 'protein': []}

In [61]:
def precision(actual, predicted):
    actual_lower = [word.lower() for word in actual]
    predicted_lower = [word.lower() for word in predicted]

    if not actual_lower and not predicted_lower:
        return 1.0  # Both lists are empty, so precision is 1 (correct prediction)

    true_positives = sum(1 for p in predicted_lower if p in actual_lower)
    predicted_positives = len(predicted_lower)
    if predicted_positives == 0:
        return 0  # Handle case where there are no predicted positives to avoid division by zero
    return true_positives / predicted_positives

In [71]:
def recall(actual, predicted):
    actual_lower = [word.lower() for word in actual]
    predicted_lower = [word.lower() for word in predicted]

    if not actual_lower and not predicted_lower:
        return 1.0  # Both lists are empty, so recall is 1 (correct prediction)

    true_positives = sum(1 for p in predicted_lower if p in actual_lower)
    actual_positives = len(actual_lower)
    if actual_positives == 0:
        return 0  # Handle case where there are no actual positives to avoid division by zero
    return true_positives / actual_positives

In [72]:
def f1_score(actual, predicted):
    prec = precision(actual, predicted)
    rec = recall(actual, predicted)
    if prec + rec == 0:
        return 0  # Handle case where precision + recall is zero to avoid division by zero
    return 2 * (prec * rec) / (prec + rec)