<a href="https://colab.research.google.com/github/areias/slm-finetunig/blob/main/medical_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Domain specific NER and finetuing


## Medical


* https://huggingface.co/datasets/ncbi_disease



* https://github.com/JHnlp/BioCreative-V-CDR-Corpus  - BC5CDR corpus consists of 1500 PubMed articles with 4409 annotated chemicals, 5818 diseases and 3116 chemical-disease interactions.

* https://paperswithcode.com/dataset/radgraph



* https://huggingface.co/datasets/tner/bionlp2004

 Dataset Card for "tner/bionlp2004"
Dataset Summary

BioNLP2004 NER dataset formatted in a part of TNER project. BioNLP2004 dataset contains training and test only, so we randomly sample a half size of test instances from the training set to create validation set.

    Entity Types: DNA, protein, cell_type, cell_line, RNA



## Other


https://huggingface.co/datasets/sofc_materials_articles




In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
label2id = {
    "O": 0,
    "B-DNA": 1,
    "I-DNA": 2,
    "B-protein": 3,
    "I-protein": 4,
    "B-cell_type": 5,
    "I-cell_type": 6,
    "B-cell_line": 7,
    "I-cell_line": 8,
    "B-RNA": 9,
    "I-RNA": 10
}

In [None]:
from datasets import load_dataset

dataset = load_dataset("tner/bionlp2004")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
dataset.keys()

dict_keys(['train', 'validation', 'test'])

In [None]:
dataset['train'][0]

{'tokens': ['Since',
  'HUVECs',
  'released',
  'superoxide',
  'anions',
  'in',
  'response',
  'to',
  'TNF',
  ',',
  'and',
  'H2O2',
  'induces',
  'VCAM-1',
  ',',
  'PDTC',
  'may',
  'act',
  'as',
  'a',
  'radical',
  'scavenger',
  '.'],
 'tags': [0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [None]:
# join sentences
# replace tags
# structure into input/ output
# Swap keys and values using dictionary comprehension
swapped_dict = {v: k for k, v in label2id.items()}

# Print the swapped dictionary
print(swapped_dict)


{0: 'O', 1: 'B-DNA', 2: 'I-DNA', 3: 'B-protein', 4: 'I-protein', 5: 'B-cell_type', 6: 'I-cell_type', 7: 'B-cell_line', 8: 'I-cell_line', 9: 'B-RNA', 10: 'I-RNA'}


In [None]:
def label_tokens(entry):
    entry['ner_labels'] = [swapped_dict[x] for x in entry['tags']]
    return entry


In [None]:
dataset['train'] = dataset["train"].map(label_tokens)
dataset['test'] = dataset["test"].map(label_tokens)


In [None]:
def tokens_to_sentence(entry):
    entry['sentence'] = ' '.join(entry['tokens'])
    return entry

dataset['train'] = dataset["train"].map(tokens_to_sentence)
dataset['test'] = dataset["test"].map(tokens_to_sentence)


In [None]:
def extract_entities(entry):
    entities = {'DNA': [], 'protein': [], 'cell_type': [], 'cell_line':[], 'RNA': []}
    current_entity = {"type": None, "words": []}
    for word, label in zip(entry['tokens'], entry['ner_labels']):
        if label.startswith('B-'):
            entity_type = label.split('-')[1]
            if current_entity["type"] == entity_type:
                entities[entity_type].append(' '.join(current_entity["words"]))
                current_entity["words"] = [word]
            else:
                if current_entity["type"] is not None:
                    entities[current_entity["type"]].append(' '.join(current_entity["words"]))
                current_entity = {"type": entity_type, "words": [word]}
        elif label.startswith('I-'):
            if current_entity["type"] is not None:
                current_entity["words"].append(word)
        else:
            if current_entity["type"] is not None:
                entities[current_entity["type"]].append(' '.join(current_entity["words"]))
            current_entity = {"type": None, "words": []}
    if current_entity["type"] is not None:
        entities[current_entity["type"]].append(' '.join(current_entity["words"]))

    entry['entities'] = entities
    return entry


In [None]:
dataset['train'] = dataset["train"].map(extract_entities)
dataset['test'] = dataset["test"].map(extract_entities)


In [None]:
dataset['train'][0]['entities']

{'DNA': [],
 'RNA': [],
 'cell_line': ['HUVECs'],
 'cell_type': [],
 'protein': ['VCAM-1']}

In [7]:
def eval_formatting_func(entry):
    #tokenizer add bos token
    text = (f"[INST] You are an NLP expert tasked with Bio-entity Entity Extraction. "
        f"Identify entities of the type DNA, RNA, cell_line, cell_type, and protein in the following sentence: '{entry['sentence']}'\n"
        "Your answer must be in the form of a dict {'DNA':['DNA entity 1', 'DNA entity 2', '...'], 'RNA': [], 'cell_line': [], 'cell_type': [],"
        "'protein': []}\n"
        "Take care, your answer is only valid if it follows the correct format! [/INST]")

    return text

In [8]:
def formatting_func(entry):
    #tokenizer add bos token
    text = (f"[INST] You are an NLP expert tasked with Bio-entity Entity Extraction. "
        f"Identify entities of the type DNA, RNA, cell_line, cell_type, and protein in the following sentence: '{entry['sentence']}'\n"
        "Your answer must be in the form of a dict {'DNA':['DNA entity 1', 'DNA entity 2', '...'], 'RNA': [], 'cell_line': [], 'cell_type': [],"
        "'protein': []}\n"
        "Take care, your answer is only valid if it follows the correct format! [/INST]\n"
        f"{entry['entities']}</s>")
    return text

In [None]:
print(formatting_func(dataset['train'][9]))

[INST] You are an NLP expert tasked with Bio-entity Entity Extraction. Identify entities of the type DNA, RNA, cell_line, cell_type, and protein in the following sentence: 'By employing a derivative E box that binds ZEB but not E2A , we have shown that the repressor is active in B cells and the IgH enhancer is silenced in the absence of binding competition by bHLH proteins .'
Your answer must be in the form of a dict {'DNA':['DNA entity 1', 'DNA entity 2', '...'], 'RNA': [], 'cell_line': [], 'cell_type': [],'protein': []}
Take care, your answer is only valid if it follows the correct format! [/INST]
{'DNA': ['E box', 'IgH enhancer'], 'RNA': [], 'cell_line': [], 'cell_type': ['B cells'], 'protein': ['ZEB', 'E2A', 'repressor', 'bHLH proteins']}</s>


In [None]:
len(dataset['train']), len(dataset['test']), len(dataset['validation'])

(16619, 3856, 1927)

In [None]:
def count_entities(dataset):
    # Initialize counters
    entity_counts = {
        'DNA': 0,
        'RNA': 0,
        'cell_line': 0,
        'cell_type': 0,
        'protein': 0
    }

    # Count entities
    for entry in dataset:
        for entity, values in entry['entities'].items():
            if values:
                entity_counts[entity] += len(values)

    # Print counts
    for entity, count in entity_counts.items():
        print(f"{entity}: {count}")


In [None]:
count_entities(dataset['train'])

DNA: 8273
RNA: 820
cell_line: 3325
cell_type: 6090
protein: 27240


In [None]:
count_entities(dataset['test'])

DNA: 1056
RNA: 118
cell_line: 500
cell_type: 1921
protein: 5067


In [None]:
train_sample = dataset["train"].shuffle(seed=42).select(range(1000))
test_sample = dataset["test"].shuffle(seed=42).select(range(100))


In [None]:
test_sample

Dataset({
    features: ['tokens', 'tags', 'ner_labels', 'sentence', 'entities'],
    num_rows: 100
})

In [None]:
count_entities(train_sample)

DNA: 506
RNA: 48
cell_line: 193
cell_type: 326
protein: 1609


In [None]:
count_entities(test_sample)

DNA: 31
RNA: 1
cell_line: 8
cell_type: 56
protein: 120


In [None]:
dataset['test'][0]

{'tokens': ['Number',
  'of',
  'glucocorticoid',
  'receptors',
  'in',
  'lymphocytes',
  'and',
  'their',
  'sensitivity',
  'to',
  'hormone',
  'action',
  '.'],
 'tags': [0, 0, 3, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0],
 'ner_labels': ['O',
  'O',
  'B-protein',
  'I-protein',
  'O',
  'B-cell_type',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 'sentence': 'Number of glucocorticoid receptors in lymphocytes and their sensitivity to hormone action .',
 'entities': {'DNA': [],
  'RNA': [],
  'cell_line': [],
  'cell_type': ['lymphocytes'],
  'protein': ['glucocorticoid receptors']}}

In [None]:
# Initialize counters for each category
count_rna = 0
count_cell_line = 0
count_cell_type = 0


# Iterate through the list
balanced_sample= []
for entry in dataset['test']:
    # Check if 'RNA' is not empty and we haven't collected 50 RNA examples yet
    if entry['entities']['RNA'] and count_rna < 50:
        balanced_sample.append(entry)
        count_rna += 1
        pass
    # Check if 'cell_line' is not empty and we haven't collected 50 cell line examples yet
    elif entry['entities']['cell_line'] and count_cell_line < 50:
        balanced_sample.append(entry)
        count_cell_line += 1
        pass
    # Check if 'cell_type' is not empty and we haven't collected 50 cell type examples yet
    elif entry['entities']['cell_type'] and count_cell_type < 50:
        balanced_sample.append(entry)
        count_cell_type += 1

    # Break the loop if we have found 50 examples for each category
    if count_rna == 50 and count_cell_line == 50 and count_cell_type == 50:
        break


In [None]:
count_entities(balanced_sample)

DNA: 60
RNA: 54
cell_line: 86
cell_type: 97
protein: 154


## Load model


In [2]:
! pip install -q peft
! pip install -q git+https://github.com/huggingface/accelerate.git
! pip install -q bitsandbytes
! pip install -q transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/190.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/280.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

In [4]:
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [5]:
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [6]:
# Init an eval tokenizer that doesn't add padding or eos token
eval_tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    add_bos_token=True,
)

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [None]:
idx=45
eval_prompt = eval_formatting_func(test_sample[idx])
print(eval_prompt)

[INST] You are an NLP expert tasked with Bio-entity Entity Extraction. Identify entities of the type DNA, RNA, cell_line, cell_type, and protein in the following sentence: 'All clinical specimens from patients with lymphatic leukemia have some measurable level of glucocorticoid receptors ; therefore , the resistance seen in vivo can not be explained by the lack of receptors .'
Your answer must be in the form of a dict {'DNA':['DNA entity 1', 'DNA entity 2', '...'], 'RNA': [], 'cell_line': [], 'cell_type': [],'protein': []}
Take care, your answer is only valid if it follows the correct format! [/INST]


In [None]:
ground_truth = test_sample[idx]['entities']
ground_truth

{'DNA': [],
 'RNA': [],
 'cell_line': [],
 'cell_type': [],
 'protein': ['glucocorticoid receptors']}

In [None]:
def get_prediction(entry):

    eval_prompt = eval_formatting_func(entry)

    model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

    model.eval()
    with torch.no_grad():
        response  = eval_tokenizer.decode(model.generate(**model_input,
                        max_new_tokens=256, repetition_penalty=1.15,
                        pad_token_id=eval_tokenizer.eos_token_id)[0], skip_special_tokens=True)
        response = response.replace(eval_prompt, "")
    return response

In [None]:
from tqdm import tqdm

predictions = []
for sample in tqdm(balanced_sample):
    response = get_prediction(sample)
    predictions.append({"sentence":sample['sentence'],
                       "entities":sample['entities'],
                        "base_response": response})

100%|██████████| 150/150 [32:20<00:00, 12.94s/it]


In [None]:
%cd /content/drive/My Drive

/content/drive/My Drive


In [None]:
ls mistral-finetune/data

med-ner-predictions.json        nyt10m_test.csv             test_sample.json
nyt10m_finetuning-balanced.csv  rel-ext-train.jsonl         train.jsonl
nyt10m_finetuning.csv           test_sample_basepreds.json  train_sample.json


In [None]:
len(predictions)

100

In [None]:

# Save the list of items to a JSON file
with open("mistral-finetune/data/med-ner-predictions_balanced.json", 'w') as f:
    json.dump(predictions, f, indent=4)

In [None]:
import json

# Save the list of items to a JSON file
with open("mistral-finetune/data/med-ner-predictions.json", 'w') as f:
    json.dump(predictions, f, indent=4)

In [10]:
import re

def parse_response(sentence):
    # Find the text between curly braces
    match = re.search(r'\{[^{}]+\}', sentence)  # Changed 'response' to 'sentence'
    # If a match is found, evaluate it as a dictionary using eval

    if match:
        entities_str = match.group(0)
        try:
            entities_dict = eval(entities_str)
            # List of keys to check
            expected_keys = ['DNA', 'RNA', 'cell_line', 'cell_type', 'protein']
            # Check each key and add it if missing
            for key in expected_keys:
                if key not in entities_dict:
                    entities_dict[key] = []  # Add key with an empty list as value
            return entities_dict
        except:
            return {'DNA':[], 'RNA':[], 'cell_line':[], 'cell_type':[], 'protein': []}
    else:
        # If no match is found, return an empty dictionary
        return {'DNA':[], 'RNA':[], 'cell_line':[], 'cell_type':[], 'protein': []}


In [7]:
for sample in tqdm(predictions):
    response_dict = parse_response(sample['base_response'])
    sample['base_response_dict'] = response_dict


NameError: name 'tqdm' is not defined

In [None]:
predictions[0]

{'sentence': 'Number of glucocorticoid receptors in lymphocytes and their sensitivity to hormone action .',
 'entities': {'DNA': [],
  'RNA': [],
  'cell_line': [],
  'cell_type': ['lymphocytes'],
  'protein': ['glucocorticoid receptors']},
 'base_response': " Based on the given sentence, there are no explicit mentions of DNA, RNA, cell\\_line, cell\\_type, or protein entities. Therefore, all lists should be empty for this input.\n\n```python\n{'DNA':[], 'RNA':[], 'cell_line':[], 'cell_type':[], 'protein':[]}\n```"}

In [4]:
def precision(actual, predicted):
    actual_lower = [word.lower() for word in actual]
    predicted_lower = [word.lower() for word in predicted]

    if not actual_lower and not predicted_lower:
        return 1.0  # Both lists are empty, so precision is 1 (correct prediction)

    true_positives = sum(1 for p in predicted_lower if p in actual_lower)
    predicted_positives = len(predicted_lower)
    if predicted_positives == 0:
        return 0  # Handle case where there are no predicted positives to avoid division by zero
    return true_positives / predicted_positives

In [5]:
def recall(actual, predicted):
    actual_lower = [word.lower() for word in actual]
    predicted_lower = [word.lower() for word in predicted]

    if not actual_lower and not predicted_lower:
        return 1.0  # Both lists are empty, so recall is 1 (correct prediction)

    true_positives = sum(1 for p in predicted_lower if p in actual_lower)
    actual_positives = len(actual_lower)
    if actual_positives == 0:
        return 0  # Handle case where there are no actual positives to avoid division by zero
    return true_positives / actual_positives

In [6]:
def f1_score(actual, predicted):
    prec = precision(actual, predicted)
    rec = recall(actual, predicted)
    if prec + rec == 0:
        return 0  # Handle case where precision + recall is zero to avoid division by zero
    return 2 * (prec * rec) / (prec + rec)

In [1]:
for idx, entry in enumerate(predictions):
    scores = {}
    for label in entry['entities'].keys():
        p = precision(entry['entities'][label], entry['base_response_dict'][label])
        r = recall(entry['entities'][label],  entry['base_response_dict'][label])
        f = f1_score(entry['entities'][label], entry['base_response_dict'][label])
        scores[label] =(p,r,f)
    predictions[idx]['base_scores'] = scores

NameError: name 'predictions' is not defined

In [None]:
predictions[0]

{'sentence': 'Number of glucocorticoid receptors in lymphocytes and their sensitivity to hormone action .',
 'entities': {'DNA': [],
  'RNA': [],
  'cell_line': [],
  'cell_type': ['lymphocytes'],
  'protein': ['glucocorticoid receptors']},
 'base_response': " Based on the given sentence, there are no explicit mentions of DNA, RNA, cell\\_line, cell\\_type, or protein entities. Therefore, all lists should be empty for this input.\n\n```python\n{'DNA':[], 'RNA':[], 'cell_line':[], 'cell_type':[], 'protein':[]}\n```",
 'base_response_dict': {'DNA': [],
  'RNA': [],
  'cell_line': [],
  'cell_type': [],
  'protein': []},
 'base_scores': {'DNA': (1.0, 1.0, 1.0),
  'RNA': (1.0, 1.0, 1.0),
  'cell_line': (1.0, 1.0, 1.0),
  'cell_type': (0, 0.0, 0),
  'protein': (0, 0.0, 0)}}

In [12]:
def calculate_average_metrics(data):
    # Initialize variables to store sum of precision, recall, and f1_score for each entity type
    sum_precision = {'DNA': 0, 'RNA': 0, 'cell_line': 0, 'cell_type': 0, 'protein': 0}
    sum_recall = {'DNA': 0, 'RNA': 0, 'cell_line': 0, 'cell_type': 0, 'protein': 0}
    sum_f1_score = {'DNA': 0, 'RNA': 0, 'cell_line': 0, 'cell_type': 0, 'protein': 0}
    num_instances = len(data)

    # Iterate over the list of dictionaries
    for entry in data:
        metrics = entry['base_scores']
        for entity_type, (precision, recall, f1_score) in metrics.items():
            sum_precision[entity_type] += precision
            sum_recall[entity_type] += recall
            sum_f1_score[entity_type] += f1_score

    # Calculate average precision, recall, and f1_score for each entity type
    avg_precision = {entity_type: sum_precision[entity_type] / num_instances for entity_type in sum_precision}
    avg_recall = {entity_type: sum_recall[entity_type] / num_instances for entity_type in sum_recall}
    avg_f1_score = {entity_type: sum_f1_score[entity_type] / num_instances for entity_type in sum_f1_score}

    return avg_precision, avg_recall, avg_f1_score

In [None]:
calculate_average_metrics(predictions)

({'DNA': 0.6605555555555557,
  'RNA': 0.7866666666666666,
  'cell_line': 0.685,
  'cell_type': 0.48444444444444446,
  'protein': 0.516},
 {'DNA': 0.6473333333333333,
  'RNA': 0.7788888888888889,
  'cell_line': 0.6665079365079365,
  'cell_type': 0.5033333333333333,
  'protein': 0.47855555555555557},
 {'DNA': 0.6511111111111111,
  'RNA': 0.7811111111111111,
  'cell_line': 0.6723232323232323,
  'cell_type': 0.4866666666666667,
  'protein': 0.484162393162393})

In [None]:
count_entities(balanced_sample)

DNA: 60
RNA: 54
cell_line: 86
cell_type: 97
protein: 154


## load finetuned model

In [17]:
!ls drive/MyDrive/mistral-finetune/med-ner/runs/mistral7b-r32alpha16


checkpoint-100	checkpoint-200	checkpoint-275	checkpoint-375	checkpoint-475
checkpoint-125	checkpoint-225	checkpoint-300	checkpoint-400	checkpoint-50
checkpoint-150	checkpoint-25	checkpoint-325	checkpoint-425	checkpoint-500
checkpoint-175	checkpoint-250	checkpoint-350	checkpoint-450	checkpoint-75


In [18]:

from peft import PeftModel

ft_model = PeftModel.from_pretrained(model, "drive/MyDrive/mistral-finetune/med-ner/runs/mistral7b-r32alpha16/checkpoint-500")

In [21]:
!ls drive/MyDrive/mistral-finetune/data/

med-ner-predictions_balanced.json  nyt10m_test.csv	       train.jsonl
med-ner-predictions.json	   rel-ext-train.jsonl	       train_sample.json
nyt10m_finetuning-balanced.csv	   test_sample_basepreds.json
nyt10m_finetuning.csv		   test_sample.json


In [22]:
import json
with open("drive/MyDrive/mistral-finetune/data/med-ner-predictions_balanced.json", "r") as json_file:
    data = json.load(json_file)

In [23]:
data[0]

{'sentence': 'Number of glucocorticoid receptors in lymphocytes and their sensitivity to hormone action .',
 'entities': {'DNA': [],
  'RNA': [],
  'cell_line': [],
  'cell_type': ['lymphocytes'],
  'protein': ['glucocorticoid receptors']},
 'base_response': " Based on the given sentence, there are no explicit mentions of DNA, RNA, cell\\_line, cell\\_type, or protein entities. Therefore, all lists should be empty for this input.\n\n```python\n{'DNA':[], 'RNA':[], 'cell_line':[], 'cell_type':[], 'protein':[]}\n```"}

In [24]:
def get_prediction(entry):

    eval_prompt = eval_formatting_func(entry)

    model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

    model.eval()
    with torch.no_grad():
        response  = eval_tokenizer.decode(ft_model.generate(**model_input,
                        max_new_tokens=256, repetition_penalty=1.15,
                        pad_token_id=eval_tokenizer.eos_token_id)[0], skip_special_tokens=True)
        response = response.replace(eval_prompt, "")
    return response

In [26]:
from tqdm import tqdm

for idx, sample in tqdm(enumerate(data)):
    response = get_prediction(sample)
    data[idx].update({"ft_response": response})

150it [22:02,  8.82s/it]


In [27]:
data[0]

{'sentence': 'Number of glucocorticoid receptors in lymphocytes and their sensitivity to hormone action .',
 'entities': {'DNA': [],
  'RNA': [],
  'cell_line': [],
  'cell_type': ['lymphocytes'],
  'protein': ['glucocorticoid receptors']},
 'base_response': " Based on the given sentence, there are no explicit mentions of DNA, RNA, cell\\_line, cell\\_type, or protein entities. Therefore, all lists should be empty for this input.\n\n```python\n{'DNA':[], 'RNA':[], 'cell_line':[], 'cell_type':[], 'protein':[]}\n```",
 'ft_response': "\n{'DNA': [], 'RNA': [], 'cell_line': [], 'cell_type': ['lymphocytes'], 'protein': ['glucocorticoid receptors']}"}

In [None]:
import json

# Save the list of items to a JSON file
with open("drive/MyDrive/mistral-finetune/data/med-ner-predictions_balanced_ft.json", 'w') as f:
    json.dump(data, f, indent=4)

In [8]:
import json

# Save the list of items to a JSON file
with open("drive/MyDrive/mistral-finetune/data/med-ner-predictions_balanced_ft.json", 'r') as f:
    data= json.load(f)

In [11]:
from tqdm import tqdm
for sample in tqdm(data):
    response_dict = parse_response(sample['ft_response'])
    sample['ft_response_dict'] = response_dict


100%|██████████| 150/150 [00:00<00:00, 10965.69it/s]


In [12]:
data[0]

{'sentence': 'Number of glucocorticoid receptors in lymphocytes and their sensitivity to hormone action .',
 'entities': {'DNA': [],
  'RNA': [],
  'cell_line': [],
  'cell_type': ['lymphocytes'],
  'protein': ['glucocorticoid receptors']},
 'base_response': " Based on the given sentence, there are no explicit mentions of DNA, RNA, cell\\_line, cell\\_type, or protein entities. Therefore, all lists should be empty for this input.\n\n```python\n{'DNA':[], 'RNA':[], 'cell_line':[], 'cell_type':[], 'protein':[]}\n```",
 'ft_response': "\n{'DNA': [], 'RNA': [], 'cell_line': [], 'cell_type': ['lymphocytes'], 'protein': ['glucocorticoid receptors']}",
 'ft_response_dict': {'DNA': [],
  'RNA': [],
  'cell_line': [],
  'cell_type': ['lymphocytes'],
  'protein': ['glucocorticoid receptors']}}

In [13]:
for idx, entry in enumerate(data):
    scores = {}
    for label in entry['entities'].keys():
        p = precision(entry['entities'][label], entry['ft_response_dict'][label])
        r = recall(entry['entities'][label],  entry['ft_response_dict'][label])
        f = f1_score(entry['entities'][label], entry['ft_response_dict'][label])
        scores[label] =(p,r,f)
    data[idx]['ft_scores'] = scores

In [14]:
def calculate_average_metrics(data):
    # Initialize variables to store sum of precision, recall, and f1_score for each entity type
    sum_precision = {'DNA': 0, 'RNA': 0, 'cell_line': 0, 'cell_type': 0, 'protein': 0}
    sum_recall = {'DNA': 0, 'RNA': 0, 'cell_line': 0, 'cell_type': 0, 'protein': 0}
    sum_f1_score = {'DNA': 0, 'RNA': 0, 'cell_line': 0, 'cell_type': 0, 'protein': 0}
    num_instances = len(data)

    # Iterate over the list of dictionaries
    for entry in data:
        metrics = entry['ft_scores']
        for entity_type, (precision, recall, f1_score) in metrics.items():
            sum_precision[entity_type] += precision
            sum_recall[entity_type] += recall
            sum_f1_score[entity_type] += f1_score

    # Calculate average precision, recall, and f1_score for each entity type
    avg_precision = {entity_type: sum_precision[entity_type] / num_instances for entity_type in sum_precision}
    avg_recall = {entity_type: sum_recall[entity_type] / num_instances for entity_type in sum_recall}
    avg_f1_score = {entity_type: sum_f1_score[entity_type] / num_instances for entity_type in sum_f1_score}

    return avg_precision, avg_recall, avg_f1_score

In [15]:
calculate_average_metrics(data)

({'DNA': 0.6755555555555556,
  'RNA': 0.8133333333333334,
  'cell_line': 0.885,
  'cell_type': 0.72,
  'protein': 0.7644444444444444},
 {'DNA': 0.6614444444444445,
  'RNA': 0.8133333333333334,
  'cell_line': 0.8671428571428572,
  'cell_type': 0.7133333333333334,
  'protein': 0.6974999999999999},
 {'DNA': 0.6658803418803417,
  'RNA': 0.8133333333333334,
  'cell_line': 0.8690707070707071,
  'cell_type': 0.7100000000000001,
  'protein': 0.7127472527472529})

In [16]:
"""compared to before ft

({'DNA': 0.6605555555555557,
  'RNA': 0.7866666666666666,
  'cell_line': 0.685,
  'cell_type': 0.48444444444444446,
  'protein': 0.516},
 {'DNA': 0.6473333333333333,
  'RNA': 0.7788888888888889,
  'cell_line': 0.6665079365079365,
  'cell_type': 0.5033333333333333,
  'protein': 0.47855555555555557},
 {'DNA': 0.6511111111111111,
  'RNA': 0.7811111111111111,
  'cell_line': 0.6723232323232323,
  'cell_type': 0.4866666666666667,
  'protein': 0.484162393162393})

"""

"compared to before ft\n\n({'DNA': 0.6605555555555557,\n  'RNA': 0.7866666666666666,\n  'cell_line': 0.685,\n  'cell_type': 0.48444444444444446,\n  'protein': 0.516},\n {'DNA': 0.6473333333333333,\n  'RNA': 0.7788888888888889,\n  'cell_line': 0.6665079365079365,\n  'cell_type': 0.5033333333333333,\n  'protein': 0.47855555555555557},\n {'DNA': 0.6511111111111111,\n  'RNA': 0.7811111111111111,\n  'cell_line': 0.6723232323232323,\n  'cell_type': 0.4866666666666667,\n  'protein': 0.484162393162393})\n  "