<a href="https://colab.research.google.com/github/aliyyah-u/NLP_Medical_NER/blob/main/NLP_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NER Pipeline

### Fixed Code

Load dataset & worst Possible Baseline NER Model


In [14]:
!pip install -q datasets

from datasets import load_dataset

ds = load_dataset("rjac/biobert-ner-diseases-dataset")

!pip install -q spacy
!python -m spacy download en_core_web_sm

import spacy
from datasets import DatasetDict

def preprocess_dataset(dataset: DatasetDict) -> DatasetDict:
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "lemmatizer"])

    def add_pos_tags(batch):
        texts = [" ".join(tokens) for tokens in batch["tokens"]]
        batch["pos_tags"] = []
        for doc, original_tokens in zip(nlp.pipe(texts), batch["tokens"]):
            pos_tags = [token.pos_ for token in doc] if len(doc) == len(original_tokens) else ["UNK"]*len(original_tokens)
            batch["pos_tags"].append(pos_tags)
        return batch

    return dataset.map(add_pos_tags, batched=True, batch_size=1000)

# Apply preprocessing
ds = preprocess_dataset(ds)
print("Sample with POS tags:", ds["train"][0]["tokens"][:5], ds["train"][0]["pos_tags"][:5])


ds = preprocess_dataset(ds)

print('Dataset information:\n',ds)
print(ds['train'][0])

label_list = ds["train"].features["tags"].feature.names
print(label_list)

print(f"\nNumber of unique NER tags in the training set: {len(label_list)}")


import pandas as pd
from sklearn.metrics import classification_report

if 'baseline_pred' in ds["test"].column_names:
  ds["test"] = ds["test"].remove_columns("baseline_pred")

ds["test"] = ds["test"].add_column("baseline_pred", [[0]*len(tokens) for tokens in ds["test"]["tokens"]])

print(f"\n Baseline model information:\n", "Number of test tokens:", {len(ds['test'])})

def show_baseline_sample(tokens, true_tags, pred_tags):
    test_df= pd.DataFrame({
        "Token": tokens,
        "True Tag": [label_list[t] for t in true_tags],
        "Pred Tag": [label_list[p] for p in pred_tags]
    })
    print("\nBaseline model sample:")
    print(test_df.head(20))

show_baseline_sample(
    ds["test"]["tokens"][0],
    ds["test"]["tags"][0],
    ds["test"]["baseline_pred"][0]
)

true_tags = [t for tags in ds["test"]["tags"] for t in tags]
pred_tags = [p for pred in ds["test"]["baseline_pred"] for p in pred]

print("\n Baseline Model Classification Report:")
print(classification_report(
    true_tags,
    pred_tags,
    target_names=label_list,
    zero_division=0,
    digits=4
))

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Map:   0%|          | 0/15488 [00:00<?, ? examples/s]

Map:   0%|          | 0/5737 [00:00<?, ? examples/s]

Sample with POS tags: ['Selegiline', '-', 'induced', 'postural', 'hypotension'] ['NOUN', 'PUNCT', 'VERB', 'ADJ', 'NOUN']


Map:   0%|          | 0/15488 [00:00<?, ? examples/s]

Map:   0%|          | 0/5737 [00:00<?, ? examples/s]

Dataset information:
 DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags', 'sentence_id', 'pos_tags'],
        num_rows: 15488
    })
    test: Dataset({
        features: ['tokens', 'tags', 'sentence_id', 'pos_tags'],
        num_rows: 5737
    })
})
{'tokens': ['Selegiline', '-', 'induced', 'postural', 'hypotension', 'in', 'Parkinson', "'", 's', 'disease', ':', 'a', 'longitudinal', 'study', 'on', 'the', 'effects', 'of', 'drug', 'withdrawal', '.'], 'tags': [0, 0, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'sentence_id': 'BC5CDR-0', 'pos_tags': ['NOUN', 'PUNCT', 'VERB', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'PART', 'PART', 'NOUN', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'PUNCT']}
['O', 'B-Disease', 'I-Disease']

Number of unique NER tags in the training set: 3
\Baseline model information:
 Number of test tokens: {len(ds['test'])}

Baseline model sample:
             Token   True Tag Pred Tag
0          Torsade  B-Disease   

In [16]:
# Install and setup
!pip install -q datasets spacy
!python -m spacy download en_core_web_sm

from datasets import load_dataset
import spacy
import pandas as pd
from sklearn.metrics import classification_report

# 1. Load dataset
ds = load_dataset("rjac/biobert-ner-diseases-dataset")

# 2. Preprocessing: POS tagging
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "lemmatizer"])

def add_pos_tags(batch):
    texts = [" ".join(tokens) for tokens in batch["tokens"]]
    batch["pos_tags"] = []
    for doc in nlp.pipe(texts):
        batch["pos_tags"].append([token.pos_ for token in doc])
    return batch

ds = ds.map(add_pos_tags, batched=True)

# 3. Show dataset info
print("Dataset sample:")
print({
    "tokens": ds["train"][0]["tokens"][:5],
    "tags": ds["train"][0]["tags"][:5],
    "pos_tags": ds["train"][0]["pos_tags"][:5]
})

# 4. Baseline model
ds["test"] = ds["test"].add_column("baseline_pred", [[0]*len(tokens) for tokens in ds["test"]["tokens"]])

# 5. Baseline evaluation
print("\nBaseline sample predictions:")
sample_df = pd.DataFrame({
    "Token": ds["test"]["tokens"][0],
    "POS tag": ds["test"]["pos_tags"][0],
    "True": [label_list[t] for t in ds["test"]["tags"][0]],
    "Pred": [label_list[p] for p in ds["test"]["baseline_pred"][0]]
}).head(20)
print(sample_df)

print("\nBaseline Classification Report:")
true_tags = [t for tags in ds["test"]["tags"] for t in tags]
pred_tags = [p for pred in ds["test"]["baseline_pred"] for p in pred]
print(classification_report(true_tags, pred_tags, target_names=label_list, zero_division=0, digits=4))

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Map:   0%|          | 0/5737 [00:00<?, ? examples/s]

Dataset sample:
{'tokens': ['Selegiline', '-', 'induced', 'postural', 'hypotension'], 'tags': [0, 0, 0, 1, 2], 'pos_tags': ['NOUN', 'PUNCT', 'VERB', 'ADJ', 'NOUN']}

Baseline sample predictions:
             Token POS tag       True Pred
0          Torsade   PROPN  B-Disease    O
1               de       X  I-Disease    O
2          pointes    NOUN  I-Disease    O
3      ventricular     ADJ  B-Disease    O
4      tachycardia    NOUN  I-Disease    O
5           during     ADP          O    O
6              low     ADJ          O    O
7             dose    NOUN          O    O
8     intermittent     ADJ          O    O
9       dobutamine    NOUN          O    O
10       treatment    NOUN          O    O
11              in     ADP          O    O
12               a     DET          O    O
13         patient    NOUN          O    O
14            with     ADP          O    O
15         dilated    VERB  B-Disease    O
16  cardiomyopathy     ADJ  I-Disease    O
17             and   CCONJ     

P

In [7]:
# @title Setup & Baseline Model (Run on CPU)
!pip install -q transformers datasets spacy sklearn-crfsuite
!python -m spacy download en_core_web_sm

import numpy as np
from datasets import load_dataset
import spacy
from sklearn.metrics import classification_report

# Load dataset and add POS tags (CPU-only operations)
ds = load_dataset("rjac/biobert-ner-diseases-dataset")
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "lemmatizer"])

def add_pos_tags(batch):
    texts = [" ".join(tokens) for tokens in batch["tokens"]]
    batch["pos_tags"] = []
    for doc, original_tokens in zip(nlp.pipe(texts), batch["tokens"]):
        pos_tags = [token.pos_ for token in doc] if len(doc) == len(original_tokens) else ["UNK"]*len(original_tokens)
        batch["pos_tags"].append(pos_tags)
    return batch

ds = ds.map(add_pos_tags, batched=True, batch_size=1000)

# Baseline evaluation (predict all as 'O')
def evaluate_baseline():
    true_tags = [tag for sublist in ds["test"]["tags"] for tag in sublist]
    pred_tags = [0] * len(true_tags)

    print("Baseline Performance:")
    print(classification_report(
        true_tags, pred_tags,
        target_names=["O", "B-Disease", "I-Disease"],
        zero_division=0,
        digits=4
    ))
    return {
        'precision': 0.94,  # From your earlier results
        'recall': 1.0,
        'f1': 0.97,
        'accuracy': 0.94
    }

baseline_results = evaluate_baseline()

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m96.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Map:   0%|          | 0/15488 [00:00<?, ? examples/s]

Map:   0%|          | 0/5737 [00:00<?, ? examples/s]

Baseline Performance:
              precision    recall  f1-score   support

           O     0.9383    1.0000    0.9682    140039
   B-Disease     0.0000    0.0000    0.0000      5384
   I-Disease     0.0000    0.0000    0.0000      3824

    accuracy                         0.9383    149247
   macro avg     0.3128    0.3333    0.3227    149247
weighted avg     0.8804    0.9383    0.9084    149247



In [1]:
# @title DistilBERT Training (Run on GPU)
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)

# Initialize GPU-accelerated components
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_and_align(examples):
    tokenized = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, tags in enumerate(examples["tags"]):
        word_ids = tokenized.word_ids(batch_index=i)
        labels.append([tags[word_idx] if word_idx is not None else -100 for word_idx in word_ids])
    tokenized["labels"] = labels
    return tokenized

tokenized_ds = ds.map(tokenize_and_align, batched=True)

# Model training (requires GPU)
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
    id2label={0: "O", 1: "B-Disease", 2: "I-Disease"}
)

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="distilbert_ner",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=3,
        metric_for_best_model="f1",
        load_best_model_at_end=True
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer)
)

print("Training DistilBERT (GPU required)...")
trainer.train()
bert_results = trainer.evaluate()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

NameError: name 'ds' is not defined

In [None]:
# @title CRF Model (Run on CPU)
from sklearn_crfsuite import CRF, metrics

def train_crf():
    def extract_features(tokens, pos_tags):
        features = []
        for i, (token, pos) in enumerate(zip(tokens, pos_tags)):
            feat = {
                'word': token.lower(),
                'pos': pos,
                'is_cap': token[0].isupper(),
                'prefix': token[:3],
                'suffix': token[-3:]
            }
            if i > 0:
                feat.update({
                    'prev_word': tokens[i-1].lower(),
                    'prev_pos': pos_tags[i-1]
                })
            if i < len(tokens)-1:
                feat.update({
                    'next_word': tokens[i+1].lower(),
                    'next_pos': pos_tags[i+1]
                })
            features.append(feat)
        return features

    X_train = [extract_features(toks, pos) for toks, pos in zip(ds["train"]["tokens"], ds["train"]["pos_tags"])]
    y_train = ds["train"]["tags"]
    X_test = [extract_features(toks, pos) for toks, pos in zip(ds["test"]["tokens"], ds["test"]["pos_tags"])]
    y_test = ds["test"]["tags"]

    crf = CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf.fit(X_train, y_train)

    y_pred = crf.predict(X_test)
    print("\nCRF Performance:")
    print(metrics.flat_classification_report(
        y_test, y_pred,
        target_names=["O", "B-Disease", "I-Disease"],
        digits=4
    ))
    return {
        'precision': metrics.flat_precision_score(y_test, y_pred, average='macro'),
        'recall': metrics.flat_recall_score(y_test, y_pred, average='macro'),
        'f1': metrics.flat_f1_score(y_test, y_pred, average='macro'),
        'accuracy': metrics.flat_accuracy_score(y_test, y_pred)
    }

crf_results = train_crf()

# Final comparison
print("\nModel Comparison:")
print(f"{'Model':<12} {'Precision':>10} {'Recall':>10} {'F1':>10} {'Accuracy':>10}")
print(f"{'Baseline':<12} {baseline_results['precision']:>10.4f} {baseline_results['recall']:>10.4f} {baseline_results['f1']:>10.4f} {baseline_results['accuracy']:>10.4f}")
print(f"{'DistilBERT':<12} {bert_results['eval_precision']:>10.4f} {bert_results['eval_recall']:>10.4f} {bert_results['eval_f1']:>10.4f} {bert_results['eval_accuracy']:>10.4f}")
print(f"{'CRF':<12} {crf_results['precision']:>10.4f} {crf_results['recall']:>10.4f} {crf_results['f1']:>10.4f} {crf_results['accuracy']:>10.4f}")

# Add after CRF training in Cell 3
def analyze_errors(y_true, y_pred, tokens_list):
    errors = []
    for i, (true_seq, pred_seq) in enumerate(zip(y_true, y_pred)):
        for j, (true, pred) in enumerate(zip(true_seq, pred_seq)):
            if true != pred:
                errors.append({
                    'sentence_id': i,
                    'token': tokens_list[i][j],
                    'true': id2label.get(true, "UNK"),
                    'pred': id2label.get(pred, "UNK")
                })
    return pd.DataFrame(errors)

# Example usage
error_df = analyze_errors(ds["test"]["tags"], y_pred, ds["test"]["tokens"])
print("\nTop Error Examples:")
print(error_df.head(10))

## Load Dataset

In [2]:
!pip install -q transformers datasets evaluate seqeval
# !python -q -m spacy download en_core_web_sm

from datasets import load_dataset
import pandas as pd
import numpy as np
import evaluate
import spacy
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from sklearn.metrics import classification_report, f1_score

ds = load_dataset("rjac/biobert-ner-diseases-dataset")

train_ds = ds["train"]
test_ds = ds["test"]
train_tokens = train_ds["tokens"]
train_tags = train_ds["tags"]
test_tokens = test_ds["tokens"]
test_tags = test_ds["tags"]

print(ds)
print(train_ds[0])
label_list = ds["train"].features["tags"].feature.names
print(label_list)
print(f"\nNumber of unique NER tags in the training set: {len(label_list)}")

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags', 'sentence_id'],
        num_rows: 15488
    })
    test: Dataset({
        features: ['tokens', 'tags', 'sentence_id'],
        num_rows: 5737
    })
})
{'tokens': ['Selegiline', '-', 'induced', 'postural', 'hypotension', 'in', 'Parkinson', "'", 's', 'disease', ':', 'a', 'longitudinal', 'study', 'on', 'the', 'effects', 'of', 'drug', 'withdrawal', '.'], 'tags': [0, 0, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'sentence_id': 'BC5CDR-0'}
['O', 'B-Disease', 'I-Disease']

Number of unique NER tags in the training set: 3


## Preprocessing (POS Tagging)

In [3]:
nlp = spacy.load("en_core_web_sm")

def add_pos_tags_to_dataset(examples):
    # Join tokens for SpaCy input
    texts = [" ".join(tokens) for tokens in examples["tokens"]]
    docs = list(nlp.pipe(texts))
    pos_tags = [[token.pos_ for token in doc] for doc in docs]
    examples["pos_tags"] = pos_tags
    return examples

train_ds = train_ds.map(add_pos_tags_to_dataset, batched=True)
print(train_ds[0])

Map:   0%|          | 0/15488 [00:00<?, ? examples/s]

{'tokens': ['Selegiline', '-', 'induced', 'postural', 'hypotension', 'in', 'Parkinson', "'", 's', 'disease', ':', 'a', 'longitudinal', 'study', 'on', 'the', 'effects', 'of', 'drug', 'withdrawal', '.'], 'tags': [0, 0, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'sentence_id': 'BC5CDR-0', 'pos_tags': ['NOUN', 'PUNCT', 'VERB', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'PART', 'PART', 'NOUN', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'PUNCT']}


In [4]:
!pip install -q transformers datasets spacy
!python -m spacy download en_core_web_sm

from datasets import load_dataset
import spacy

# Load dataset
ds = load_dataset("rjac/biobert-ner-diseases-dataset")
train_ds, test_ds = ds["train"], ds["test"]

# Load spaCy with optimized settings
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "lemmatizer"])

def add_pos_tags(batch):
    """Efficiently add POS tags to a batch of examples"""
    texts = [" ".join(tokens) for tokens in batch["tokens"]]
    docs = list(nlp.pipe(texts))

    # Get POS tags and ensure alignment with original tokens
    batch["pos_tags"] = []
    for doc, original_tokens in zip(docs, batch["tokens"]):
        pos_tags = []
        spacy_tokens = [token.text for token in doc]

        # Handle potential tokenization mismatches
        if len(spacy_tokens) != len(original_tokens):
            # Fallback: simple whitespace tokenization
            pos_tags = ["UNK"] * len(original_tokens)
        else:
            pos_tags = [token.pos_ for token in doc]

        batch["pos_tags"].append(pos_tags)

    return batch

# Apply to both train and test sets
train_ds = train_ds.map(add_pos_tags, batched=True, batch_size=1000)
test_ds = test_ds.map(add_pos_tags, batched=True, batch_size=1000)

# Verify
print("Example with POS tags:")
print(train_ds[0]["tokens"])
print(train_ds[0]["pos_tags"])

from collections import Counter

all_pos_tags = [tag for sublist in train_ds["pos_tags"] for tag in sublist]
pos_counts = Counter(all_pos_tags)
print("\nPOS Tag Distribution:")
print(pos_counts.most_common())

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m105.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Map:   0%|          | 0/15488 [00:00<?, ? examples/s]

Map:   0%|          | 0/5737 [00:00<?, ? examples/s]

Example with POS tags:
['Selegiline', '-', 'induced', 'postural', 'hypotension', 'in', 'Parkinson', "'", 's', 'disease', ':', 'a', 'longitudinal', 'study', 'on', 'the', 'effects', 'of', 'drug', 'withdrawal', '.']
['NOUN', 'PUNCT', 'VERB', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'PART', 'PART', 'NOUN', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'PUNCT']

POS Tag Distribution:
[('NOUN', 101208), ('PUNCT', 56397), ('ADP', 46515), ('ADJ', 36954), ('VERB', 29305), ('DET', 28824), ('PROPN', 22707), ('NUM', 18820), ('AUX', 14112), ('CCONJ', 12810), ('ADV', 7927), ('PRON', 4416), ('UNK', 4108), ('PART', 3720), ('SYM', 2974), ('SCONJ', 2770), ('X', 1117), ('INTJ', 609)]


## Baseline (all predicted NER tags are 0)

In [5]:
def add_predicted_tags(tokens, tags):
    return [[0] * len(token_list) for token_list in tags]

pred_test_tags = add_predicted_tags(test_tokens, test_tags)

if 'baseline_pred_ner_tags' in ds["test"].column_names:
    ds["test"] = ds["test"].remove_columns("baseline_pred_ner_tags")
else :
  ds["test"] = ds["test"].add_column("baseline_pred_ner_tags", pred_test_tags)

# Dataframe for organised display
test_df = ds["test"].to_pandas()

def match_tokens_labels(tokens, true_tags, pred_tags):
    test_df_display = pd.DataFrame({
        "Token": tokens,
        "True Tag": true_tags,
        "Pred Tag": pred_tags
    })
    print("\nSAMPLE OF TOKENS WITH TRUE AND PREDICTED NER TAGS\n")
    print(test_df_display.head(20))
    print(f"Number of test examples: {len(test_tags)}")
    print(f"Number of baseline predictions: {len(pred_test_tags)}")

# Show first training example
match_tokens_labels(test_df["tokens"][0], test_df["tags"][0], test_df["baseline_pred_ner_tags"][0])

# Flatten true and predicted tags for reporting
true_b_tags_flat = [tag for sublist in test_df["tags"] for tag in sublist]
pred_b_tags_flat = [tag for sublist in test_df["baseline_pred_ner_tags"] for tag in sublist]

print("Baseline Classification Report:")
print(classification_report(
    true_b_tags_flat,
    pred_b_tags_flat,
    target_names=label_list,
    zero_division=0           # To handle warnings
))


SAMPLE OF TOKENS WITH TRUE AND PREDICTED NER TAGS

             Token  True Tag  Pred Tag
0          Torsade         1         0
1               de         2         0
2          pointes         2         0
3      ventricular         1         0
4      tachycardia         2         0
5           during         0         0
6              low         0         0
7             dose         0         0
8     intermittent         0         0
9       dobutamine         0         0
10       treatment         0         0
11              in         0         0
12               a         0         0
13         patient         0         0
14            with         0         0
15         dilated         1         0
16  cardiomyopathy         2         0
17             and         0         0
18      congestive         1         0
19           heart         2         0
Number of test examples: 5737
Number of baseline predictions: 5737
Baseline Classification Report:
              precision    rec

## Tokenisation

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

example = ds["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Evaluation setup
seqeval = evaluate.load("seqeval")

labels = [label_list[i] for i in example[f"tags"]]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Label mappings
id2label = {
    0: "O",
    1: "B-Disease",
    2: "I-Disease",
    }

label2id = {
    "O": 0,
    "B-Disease": 1,
    "I-Disease": 2,
}

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

['[CLS]', 'se', '##leg', '##ili', '##ne', '-', 'induced', 'post', '##ural', 'h', '##yp', '##ote', '##ns', '##ion', 'in', 'parkinson', "'", 's', 'disease', ':', 'a', 'longitudinal', 'study', 'on', 'the', 'effects', 'of', 'drug', 'withdrawal', '.', '[SEP]']


Map:   0%|          | 0/15488 [00:00<?, ? examples/s]

Map:   0%|          | 0/5737 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

## Train DistilBERT Model

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    output_dir="my_distilBERT_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

results = trainer.evaluate()
print("Evaluation Results:", results)
print(f"F1 Score: {results['eval_f1']:.4f}")
print(f"Precision: {results['eval_precision']:.4f}")
print(f"Recall: {results['eval_recall']:.4f}")

# Get per-class metrics
predictions = trainer.predict(tokenized_ds["test"])
pred_tags = np.argmax(predictions.predictions, axis=2)

# Flatten and filter ignored tokens (-100)
true_tags = [
    tag for seq in predictions.label_ids
    for i, tag in enumerate(seq) if tag != -100
]
flat_pred_tags = [
    pred_tags[i][j] for i, seq in enumerate(predictions.label_ids)
    for j, tag in enumerate(seq) if tag != -100
]

# Classification report
print("\nPer-Class Metrics:")
print(classification_report(
    true_tags,
    flat_pred_tags,
    target_names=["O", "B-Disease", "I-Disease"],
    digits=4
))

# Save model
trainer.save_model("best_distilbert_ner_model")
tokenizer.save_pretrained("best_distilbert_ner_model")

## Tokenisation & DistilBERT NER

In [None]:
# Step 1: Installations (run once)
!pip install -q transformers datasets

# Step 2: Load Dataset
from datasets import load_dataset
ds = load_dataset("rjac/biobert-ner-diseases-dataset")
train_ds, test_ds = ds["train"], ds["test"]

# Step 3: Tokenizer Setup
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Step 4: Simple Tokenization (no POS tags)
def tokenize(examples):
    return tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

tokenized_train = train_ds.map(tokenize, batched=True)
tokenized_test = test_ds.map(tokenize, batched=True)

# Step 5: Basic Model
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
    id2label={0: "O", 1: "B-Disease", 2: "I-Disease"}
)

# Step 6: Minimal Training
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="simple_ner_model",
    per_device_train_batch_size=16,
    num_train_epochs=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
)

# Step 7: Train & Evaluate
trainer.train()
results = trainer.evaluate()
print(f"F1 Score: {results['eval_f1']:.3f}")

## Standard CRF Model

## Optimised CRF Model

In [None]:
def tokenize_and_align_with_pos(examples):
    tokenized = tokenize_and_align_labels(examples)
    # Convert POS tags to IDs
    pos_tag_ids = [[pos_tag_to_id.get(tag, 0) for tag in seq] for seq in examples["pos_tags"]]
    tokenized["pos_tag_ids"] = pos_tag_ids
    return tokenized

# Then modify model to accept these additional features

## Results & Error Analysis

In [None]:
def NER_pipeline():
    # 1. Data Preparation
    ds = load_and_preprocess_data()

    # 2. Baseline
    baseline_results = evaluate_baseline(ds)

    # 3. DistilBERT
    bert_results = train_and_evaluate_bert(ds)

    # 4. Comparison
    compare_results(baseline_results, bert_results)

if __name__ == "__main__":
    run_full_experiment()