## **Step 1: Installations**

In [None]:
!pip install transformers datasets huggingface_hub

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

## **Step 2: Imports**

In [None]:
from datasets import load_dataset
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

## **Step 3: Data Exploration**

In [None]:
# The ncbi/ncbi_disease repository ships a custom loading script. Opting in
# explicitly avoids the interactive [y/N] prompt, which would otherwise block
# a non-interactive "Restart & Run All".
dataset = load_dataset("ncbi/ncbi_disease", trust_remote_code=True)

README.md:   0%|          | 0.00/9.70k [00:00<?, ?B/s]

ncbi_disease.py:   0%|          | 0.00/5.83k [00:00<?, ?B/s]

The repository for ncbi/ncbi_disease contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/ncbi/ncbi_disease.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/284k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/51.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5433 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/924 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/941 [00:00<?, ? examples/s]

In [None]:
# Pull the three predefined splits out of the loaded dataset in one go
train_data, dev_data, test_data = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

# Show the first training record to illustrate the id / tokens / ner_tags layout
print(train_data[0])

{'id': '0', 'tokens': ['Identification', 'of', 'APC2', ',', 'a', 'homologue', 'of', 'the', 'adenomatous', 'polyposis', 'coli', 'tumour', 'suppressor', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0]}


In [None]:
# Render the test split as a DataFrame for a quick visual inspection
pd.DataFrame.from_dict(test_data)

Unnamed: 0,id,tokens,ner_tags
0,0,"[Clustering, of, missense, mutations, in, the,...","[0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 2, 2, ..."
1,1,"[Ataxia, -, telangiectasia, (, A, -, T, ), is,...","[1, 2, 2, 0, 1, 2, 2, 0, 0, 0, 1, 2, 2, 2, 2, ..."
2,2,"[The, risk, of, cancer, ,, especially, lymphoi...","[0, 0, 0, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 2, ..."
3,3,"[By, analysing, tumour, DNA, from, patients, w...","[0, 0, 1, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 0, 1, ..."
4,4,"[In, marked, contrast, to, the, ATM, mutation,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, ..."
...,...,...,...
936,936,"[In, an, attempt, to, resolve, this, issue, ,,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
937,937,"[These, reagents, detect, a, 220, -, kD, prote...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
938,938,"[Immunohistochemical, staining, of, human, bre...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
939,939,"[Conversely, ,, BRCA1, expression, was, reduce...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Report how many examples each split contains
print(f"train_size = {len(train_data)}, dev_size = {len(dev_data)}, test_size = {len(test_data)}")

train_size = 5433, dev_size = 924, test_size = 941


## **Step 4: Model Building**

- **Meaning of Ner_tags labels**
   * 0: No disease mentioned &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;-> O
   * 1: First token of a disease mention&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;-> B-Disease
   * 2: Subsequent tokens of a disease mention &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;-> I-Disease

- **Tasks**
  * Zero-shot NER evaluation on a biomedical dataset, using biomedical LLMs as well as general-purpose models
  * Zero-shot NER with self-improving models: UMLS integration, adaptive consistency / self-consistency, and self-evaluation using beam search

### **Task 1: Zero-Shot Learning Evaluation**

### **a) Models**
- GLiNER
- Universal-ner

## **Model 1: GLiNER**

In [None]:
!pip install gliner

Collecting gliner
  Downloading gliner-0.2.13-py3-none-any.whl.metadata (7.3 kB)
Collecting onnxruntime (from gliner)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime->gliner)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime->gliner)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading gliner-0.2.13-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m102.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### **Imports**

In [None]:
from gliner import GLiNER
import time
from rich.console import Console

### **Model**

In [None]:
# Load the pretrained GLiNER base checkpoint by its Hugging Face Hub id
# (the files are fetched on first use, as the progress output below shows)
model = GLiNER.from_pretrained("urchade/gliner_base")

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/4.78k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/792M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



### **Experimental Playground**

In [None]:
# Entity types GLiNER is asked to extract; this dataset only annotates diseases
labels = ["Disease"]

In [None]:
# Rebuild the raw sentence of the first test example from its tokens
" ".join(test_data[0]['tokens'])

'Clustering of missense mutations in the ataxia - telangiectasia gene in a sporadic T - cell leukaemia .'

In [None]:
# Run GLiNER on the third test sentence without an explicit threshold;
# the result is stored in `entities` but not displayed by this cell
entities = model.predict_entities(' '.join(test_data[2]['tokens']), labels)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Predict on the first test sentence with threshold=0.2 so that
# lower-scoring spans are returned as well
entities = model.predict_entities(' '.join(test_data[0]['tokens']), labels, threshold=0.2)

# Raw prediction dicts first, then a compact "text => label" listing
print(entities)
for entity in entities:
    print(entity["text"], "=>", entity["label"])

[{'start': 40, 'end': 63, 'text': 'ataxia - telangiectasia', 'label': 'Disease', 'score': 0.20026715099811554}, {'start': 83, 'end': 101, 'text': 'T - cell leukaemia', 'label': 'Disease', 'score': 0.8327548503875732}]
ataxia - telangiectasia => Disease
T - cell leukaemia => Disease


In [None]:
# Token list of the third test sentence
print(test_data[2]['tokens'])

['The', 'risk', 'of', 'cancer', ',', 'especially', 'lymphoid', 'neoplasias', ',', 'is', 'substantially', 'elevated', 'in', 'A', '-', 'T', 'patients', 'and', 'has', 'long', 'been', 'associated', 'with', 'chromosomal', 'instability', '.']


In [None]:
# Gold tag ids of the first test sentence, for comparison with the predictions
test_data[0]['ner_tags']

[0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 2, 2, 2, 2, 0]

In [None]:
# Same sentence with threshold=0.3
model.predict_entities(' '.join(test_data[0]['tokens']), labels=["Disease"], threshold=0.3)

[{'start': 83,
  'end': 101,
  'text': 'T - cell leukaemia',
  'label': 'Disease',
  'score': 0.8327548503875732}]

In [None]:
# Same sentence with threshold=0.2, the value used for the baseline run below
model.predict_entities(' '.join(test_data[0]['tokens']), labels=["Disease"], threshold=0.2)

[{'start': 40,
  'end': 63,
  'text': 'ataxia - telangiectasia',
  'label': 'Disease',
  'score': 0.20026715099811554},
 {'start': 83,
  'end': 101,
  'text': 'T - cell leukaemia',
  'label': 'Disease',
  'score': 0.8327548503875732}]

### **GLiNER baseline evaluation**

In [None]:
def map_entities_to_labels(tokens, entities):
    """Convert predicted entity spans into one numeric tag per token.

    Entities are located through their ``"start"``/``"end"`` character
    offsets, which refer to the text ``' '.join(tokens)``. Looking up the
    entity's first word with ``tokens.index`` instead would always hit the
    first occurrence of that word and mislabel entities that appear later
    in a sentence containing a repeated word.

    Args:
        tokens: List of token strings; the model input is expected to be
            ``' '.join(tokens)``.
        entities: Iterable of prediction dicts. Each needs a ``"text"`` key;
            ``"start"`` and ``"end"`` character offsets are used when both
            are present, otherwise the entity words are matched against
            ``tokens``.

    Returns:
        List of ints, one per token: 0 (outside), 1 (first token of a
        disease mention), 2 (subsequent token of a disease mention).
        Entities that cannot be aligned to any token are ignored.
    """
    labels = [0] * len(tokens)  # Initialize all labels to 0 (non-entity)

    # Character span [begin, stop) of every token inside ' '.join(tokens)
    token_spans = []
    cursor = 0
    for token in tokens:
        token_spans.append((cursor, cursor + len(token)))
        cursor += len(token) + 1  # +1 for the joining space

    for entity in entities:
        if "start" in entity and "end" in entity:
            # Every token overlapping the predicted character range
            covered = [
                idx
                for idx, (begin, stop) in enumerate(token_spans)
                if begin < entity["end"] and stop > entity["start"]
            ]
        else:
            # No offsets available: take the first contiguous token match
            words = entity["text"].split()
            covered = []
            if words:
                for first in range(len(tokens) - len(words) + 1):
                    if tokens[first:first + len(words)] == words:
                        covered = list(range(first, first + len(words)))
                        break

        if not covered:
            continue  # entity could not be aligned with the token list

        labels[covered[0]] = 1  # B-Disease
        for idx in covered[1:]:
            labels[idx] = 2  # I-Disease
    return labels

In [None]:
predicted_ner_tags = []

for example in test_data:
    # GLiNER works on raw text, so the sentence is rebuilt from the tokens
    tokens = example['tokens']
    text = ' '.join(tokens)

    # zero-shot prediction for the single "Disease" label
    entities = model.predict_entities(text, labels=["Disease"], threshold=0.2)

    # store the prediction in the dataset's per-token tag format
    predicted_ner_tags.append(map_entities_to_labels(tokens, entities))

In [None]:
# Tabular view of the test split; `df` itself stays free of predictions
df = pd.DataFrame.from_dict(test_data)

# Separate frame carrying the GLiNER predictions as an additional column
gliner_df = df.assign(gliner_ner_tags_pred=predicted_ner_tags)

In [None]:
# Persist the test split together with the GLiNER predictions
gliner_df.to_csv('GLiNER_Predictions.csv', index=False)

In [None]:
def calculate_f1_score(true_labels, pred_labels):
    """Compute an F1 score from gold and predicted tag sequences.

    For each aligned pair of sequences, the entities returned by
    ``extract_entities`` are compared as sets; true positives, false
    positives and false negatives are pooled over all pairs before
    precision, recall and F1 are derived.

    NOTE(review): ``extract_entities`` is not defined in the visible part of
    this notebook; it presumably returns hashable entity spans for one tag
    sequence -- confirm it is defined before this cell runs.

    Args:
        true_labels: Iterable of gold tag sequences.
        pred_labels: Iterable of predicted tag sequences, aligned with
            ``true_labels``.

    Returns:
        The F1 score, or 0 when precision + recall is not positive.
    """
    hits = spurious = missed = 0

    for gold_seq, guess_seq in zip(true_labels, pred_labels):
        gold = set(extract_entities(gold_seq))
        guess = set(extract_entities(guess_seq))

        hits += len(gold & guess)
        spurious += len(guess - gold)
        missed += len(gold - guess)

    # Guard each ratio against an empty denominator
    predicted_total = hits + spurious
    gold_total = hits + missed
    precision = hits / predicted_total if predicted_total > 0 else 0
    recall = hits / gold_total if gold_total > 0 else 0

    if (precision + recall) > 0:
        return 2 * (precision * recall) / (precision + recall)
    return 0

In [None]:
# Gold and predicted tag sequences as plain Python lists, one entry per sentence
true_labels = gliner_df['ner_tags'].tolist()
pred_labels = gliner_df['gliner_ner_tags_pred'].tolist()

# F1 of the GLiNER predictions against the gold tags ("Disease" is the only entity type)
f1_disease = calculate_f1_score(true_labels, pred_labels)

print(f"F1 Score for Disease Entity: {f1_disease:.4f}")

F1 Score for Disease Entity: 0.5237


In [None]:
# Writes the test-split DataFrame (without the prediction column) to CSV.
# NOTE(review): despite the file name, `df` holds the whole test split, not a subset.
df.to_csv('NCBI_subset.csv', index=False)