<a href="https://colab.research.google.com/github/aliyyah-u/NLP_Medical_NER/blob/main/NLP_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NER PIPELINE

## LOAD DATASET & TOKENISATION

In [None]:
!pip install transformers datasets evaluate seqeval

from datasets import load_dataset
from huggingface_hub import login
login()
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
import evaluate

# Load the disease-NER dataset and keep a handle on each split.
ds = load_dataset("rjac/biobert-ner-diseases-dataset")
train_ds, test_ds = ds["train"], ds["test"]

# Inspect the dataset, one training record and the tag vocabulary.
label_list = train_ds.features["tags"].feature.names
print(ds)
print(train_ds[0])
print(label_list)
print(label_list)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Show the sub-word pieces produced for the first training sentence.
example = train_ds[0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach per-token labels.

    Each word's tag is given to the first sub-token of that word; special
    tokens and the remaining sub-tokens of a word receive -100. The result is
    stored under the "labels" key of the tokenizer output, which is returned.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, word_ids):
        # One label per sub-token, following the rule described above.
        aligned = []
        last_seen = None
        for wid in word_ids:
            starts_word = wid is not None and wid != last_seen
            aligned.append(word_tags[wid] if starts_word else -100)
            last_seen = wid
        return aligned

    tokenized_inputs["labels"] = [
        _align(word_tags, tokenized_inputs.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["tags"])
    ]
    return tokenized_inputs

# Tokenize every split in batches, attaching the aligned labels.
tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)

# Collator built around the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Evaluation setup
seqeval = evaluate.load("seqeval")

# Tag names of the first training example.
labels = list(map(label_list.__getitem__, example["tags"]))

def compute_metrics(p):
    """Score predictions with seqeval and return the overall metrics.

    ``p`` unpacks into (scores, label ids). The predicted id is the arg-max of
    the scores along axis 2. Positions whose label id is -100 are dropped from
    both predictions and references before scoring.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

# Label mappings
# id -> tag name, in id order; the reverse lookup is derived from it.
id2label = dict(enumerate(("O", "B-Disease", "I-Disease")))

label2id = {name: idx for idx, name in id2label.items()}

## PREPROCESS DATASET (POS TAGGING)

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

def add_pos_tags_to_dataset(examples):
    """Attach one spaCy coarse POS tag per original token to a batch.

    The sentences are already tokenised, so each one is wrapped in a ``Doc``
    built from its existing words instead of being joined into a string and
    re-tokenised. Re-tokenising can split or merge tokens (for example around
    hyphens or punctuation), which would give ``pos_tags`` a different length
    from ``tokens`` and misalign it with the per-token tags.

    Args:
        examples: Batch mapping with a "tokens" column (a list of word lists).

    Returns:
        The same mapping with a "pos_tags" column added, one tag per token.
    """
    from spacy.tokens import Doc  # local import keeps this cell self-contained

    # Pre-tokenised Docs: spaCy keeps exactly these word boundaries.
    docs = [Doc(nlp.vocab, words=list(words)) for words in examples["tokens"]]

    # nlp.pipe accepts Doc objects (spaCy >= 3.2) and runs the pipeline
    # components on them without calling the tokenizer again.
    examples["pos_tags"] = [[token.pos_ for token in doc] for doc in nlp.pipe(docs)]
    return examples

train_ds = train_ds.map(add_pos_tags_to_dataset, batched=True)
print(train_ds[0])

## BASELINES

In [None]:
# CRF

import nltk
import sklearn_crfsuite
import eli5

# The CoNLL-2002 corpus is not bundled with NLTK; fetch it first so the corpus
# reader below does not raise LookupError on a fresh runtime.
nltk.download('conll2002', quiet=True)

# NOTE(review): these are the Spanish CoNLL-2002 files, not the medical dataset
# loaded elsewhere in this notebook — confirm this is the intended CRF corpus.
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))
train_sents[0]

def word2features(sent, i):
    """Build the CRF feature dict for position ``i`` of ``sent``.

    Each item of ``sent`` exposes the word at index 0 and its POS tag at
    index 1. Features describe the current token and, when present, its left
    and right neighbours; a missing neighbour is flagged with ``BOS`` / ``EOS``.
    """
    token, tag = sent[i][0], sent[i][1]

    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]

    return feats


def sent2features(sent):
    """Return the feature dict of every position in ``sent``, in order."""
    return [word2features(sent, pos) for pos, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the label (third field) of each (token, postag, label) triple."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first field) of each (token, postag, label) triple."""
    words = []
    for token, _postag, _label in sent:
        words.append(token)
    return words

# Featurise both splits and pull out the matching label sequences.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

X_train[0][1]

# Settings shared by the three CRF fits below.
shared_crf_params = {'algorithm': 'lbfgs', 'c2': 0.1, 'max_iterations': 20}

# Fit 1: c1=0.1, only transitions seen in the data.
crf = sklearn_crfsuite.CRF(c1=0.1, all_possible_transitions=False, **shared_crf_params)
crf.fit(X_train, y_train)

eli5.show_weights(crf, top=30)

# Fit 2: same settings with c1 raised to 200.
crf = sklearn_crfsuite.CRF(c1=200, all_possible_transitions=False, **shared_crf_params)
crf.fit(X_train, y_train)
eli5.show_weights(crf, top=30)

# Fit 3: c1 back to 0.1, with all_possible_transitions switched on.
crf = sklearn_crfsuite.CRF(c1=0.1, all_possible_transitions=True, **shared_crf_params)
crf.fit(X_train, y_train)

eli5.show_weights(crf, top=5, show=['transition_features'])

eli5.show_weights(crf, top=10, targets=['O', 'B-ORG', 'I-ORG'])

# Raw string: '\.' in a plain literal is an invalid escape sequence
# (DeprecationWarning, and SyntaxWarning from Python 3.12). The regex itself is
# unchanged: features whose name starts with "word.is".
eli5.show_weights(crf, top=10, feature_re=r'^word\.is',
                  horizontal_layout=False, show=['targets'])




In [None]:
# BiLSTM-CRF



In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Load the DistilBERT checkpoint for token classification. num_labels=3 is
# hard-coded and is expected to match the size of id2label / label2id.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

from transformers import pipeline

# Preprocessing & baseline (all predicted tags are labelled as 'Other')

In [None]:
!pip install datasets
!pip install -U datasets huggingface_hub

from datasets import load_dataset
from huggingface_hub import login
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

login()

# Load the dataset
dataset = load_dataset("parsa-mhmdi/Medical_NER")
print('\nDATASET FEATURES:\n', dataset)

# Show a dataset sample
print('\nA DATASET SAMPLE:')
print(dataset["train"][0]["tokens"])    # Text is already tokenised
print(dataset["train"][0]["ner_tags"])  # NER tags (already in BIO format)

# Split dataset for testing.
# A fixed seed keeps the 90/10 split, and every number computed from it,
# identical across notebook runs.
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
print('\nTHE SPLIT DATASET FEATURES:\n', dataset)

# Check column types
ner_feature = dataset["train"].features
print('\nDATA TYPES:\n', ner_feature)

# Convert dataset contents into lists for processing
train_tokens = dataset["train"]["tokens"]
train_tags = dataset["train"]["ner_tags"]

# View samples
print("\nSAMPLE TRAINING TOKENS:")
print(train_tokens[0])
print("\nSAMPLE TRAINING TAGS:")
print(train_tags[0])

# See all unique tag values
train_unique_tags = {tag for sublist in train_tags for tag in sublist}
print("\nALL UNIQUE NER TAGS IN TRAINING SET:")
print(sorted(train_unique_tags))
print(f"\nNumber of unique NER tags in the training set: {len(train_unique_tags)}")

# Function to generate baseline predicted tags
def add_predicted_tags(tokens, tags):
    """Return an all-'Other' prediction list with the same shape as ``tags``.

    ``tokens`` is accepted but not used: one 'Other' is emitted for every
    entry of each inner sequence of ``tags``.
    """
    baseline = []
    for sentence_tags in tags:
        baseline.append(['Other'] * len(sentence_tags))
    return baseline

# Start from a train split without a stale 'pred_ner_tags' column, so this
# cell can be re-run.
train_split = dataset["train"]
if 'pred_ner_tags' in train_split.column_names:
    train_split = train_split.remove_columns("pred_ner_tags")

# Build the baseline predictions and store them as a new column.
predicted_train_tags = add_predicted_tags(train_tokens, train_tags)
dataset["train"] = train_split.add_column("pred_ner_tags", predicted_train_tags)

# Pandas view of the train split, used for display below.
df = dataset["train"].to_pandas()

# Baseline model (all predicted tags as 'Other')
def match_tokens_labels(tokens, true_tags, pred_tags):
    df_display = pd.DataFrame({
        "Token": tokens,
        "True Tag": true_tags,
        "Pred Tag": pred_tags
    })
    print("\nSAMPLE OF TOKENS WITH TRUE AND PREDICTED NER TAGS\n")
    print(df_display.head(20))

# Show first training example
match_tokens_labels(df["tokens"][0], df["ner_tags"][0], df["pred_ner_tags"][0])

# Flatten true and predicted tags
true_tags = [tag for sublist in df["ner_tags"] for tag in sublist]
pred_tags = [tag for sublist in df["pred_ner_tags"] for tag in sublist]

# The baseline predicts only 'Other', so precision is undefined for every
# other class. zero_division=0 reports those scores as 0 (the same value the
# default produces) without an UndefinedMetricWarning per class.
print(classification_report(true_tags, pred_tags, zero_division=0))
print(f1_score(true_tags, pred_tags, average='macro', zero_division=0))

## DistilBERT (Model 2)

In [None]:
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets
!pip install -U huggingface_hub

In [None]:
import pandas as pd
from datasets import load_dataset

In [None]:
from huggingface_hub import login
import numpy as np

login()

data = load_dataset("parsa-mhmdi/Medical_NER")
data

In [None]:
data['train'].features

In [None]:
data['train'][0]['ner_tags']

In [None]:
# Step 1: Extract all unique NER tag strings from the training set
all_tag_strings = [tag for example in data['train']['ner_tags'] for tag in example]
unique_tags = sorted(set(all_tag_strings))

# Step 2: Create mappings
label_to_id = {label: idx for idx, label in enumerate(unique_tags)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

print("Label to ID mapping:", label_to_id)

In [None]:
# Example: Convert one sample's ner_tags to integer IDs
string_tags = data['train'][0]['ner_tags']
tag_ids = [label_to_id[tag] for tag in string_tags]

print(tag_ids)

In [None]:
# Extract all unique NER tag strings
all_tags = [tag for example in data['train']['ner_tags'] for tag in example]
unique_tags = sorted(set(all_tags))

# Create mapping from label to ID
label_to_id = {label: idx for idx, label in enumerate(unique_tags)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

def align_labels_with_tokens(labels, word_ids, label_map=None):
    """Expand word-level string tags to one integer label per sub-token.

    Args:
        labels: Tag string for each word of the sentence.
        word_ids: Word index for each sub-token, ``None`` for special tokens.
        label_map: Tag-string -> id mapping. Defaults to the module-level
            ``label_to_id``.

    Returns:
        One id per entry of ``word_ids``: -100 for special tokens, the word's
        own id for its first sub-token, and for later sub-tokens of a word
        tagged ``B-X`` the id of ``I-X`` (the ``B-X`` id is kept when the tag
        set contains no ``I-X``).
    """
    if label_map is None:
        label_map = label_to_id

    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            new_labels.append(-100)
            continue

        label = labels[word_id]
        label_id = label_map[label]
        if word_id != current_word:
            current_word = word_id
        elif label.startswith("B-"):
            # Only the leading "B-" is rewritten, so tag names that contain
            # "B-" further in are left intact; .get() avoids a KeyError when
            # the tag set has no matching I- tag.
            label_id = label_map.get("I-" + label[2:], label_id)
        new_labels.append(label_id)
    return new_labels

In [None]:
pd.DataFrame(data['train'][:])[['tokens', 'ner_tags']].iloc[0]

## model building

clean

In [None]:
!pip install -U transformers accelerate datasets huggingface_hub

import pandas as pd
import numpy as np
from datasets import load_dataset
from huggingface_hub import login
from transformers import AutoTokenizer

login()

data = load_dataset("parsa-mhmdi/Medical_NER")

In [None]:
print(data)
print(data['train'].features)
print(data['train'][0]['tokens'])
print(data['train'][0]['ner_tags'])  # These are strings (e.g. "B-CHEMICAL")

# Step 1: Extract unique string labels and build mappings
all_tag_strings = [tag for example in data['train']['ner_tags'] for tag in example]
unique_tags = sorted(set(all_tag_strings))

label_to_id = {label: idx for idx, label in enumerate(unique_tags)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

print("Label to ID mapping:", label_to_id)

# Step 2: Define label alignment function
def align_labels_with_tokens(labels, word_ids):
    """Map word-level string tags to one id per sub-token via ``label_to_id``.

    Special tokens (word id ``None``) get -100. The first sub-token of a word
    gets the word's own id. Later sub-tokens of a word tagged ``B-...`` get
    the id of the corresponding ``I-...`` tag when one exists in
    ``label_to_id``, otherwise the word's own id.
    """
    aligned = []
    last_word = None
    for wid in word_ids:
        if wid is None:
            aligned.append(-100)
            continue

        tag = labels[wid]
        tag_id = label_to_id[tag]
        is_continuation = wid == last_word
        if is_continuation and tag.startswith("B-"):
            tag_id = label_to_id.get(tag.replace("B-", "I-"), tag_id)
        last_word = wid
        aligned.append(tag_id)
    return aligned

# Step 3: Load tokenizer
model_checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
assert tokenizer.is_fast  # must be fast tokenizer

# Step 4: Tokenize and align labels
def tokenize_and_align(example):
    """Tokenize one pre-split example and store its aligned label ids under 'labels'."""
    encoding = tokenizer(example['tokens'], truncation=True, is_split_into_words=True)
    encoding['labels'] = align_labels_with_tokens(example['ner_tags'], encoding.word_ids())
    return encoding

# Apply to dataset
data = data.map(tokenize_and_align)

# Preview sample tokenized entry
print(data['train'][0])

In [None]:
pd.DataFrame(data['train'][:])[['tokens','ner_tags']].iloc[0]

In [None]:
def add_numeric_tags(example):
    """Add 'ner_tags_num': the ``label_to_id`` ids of the example's string tags."""
    numeric = list(map(label_to_id.__getitem__, example['ner_tags']))
    example['ner_tags_num'] = numeric
    return example

data = data.map(add_numeric_tags)

print(data['train'][0]['ner_tags'])      # Original string labels
print(data['train'][0]['ner_tags_num'])  # Integer-mapped labels

pd.DataFrame(data['train'][:])[['tokens','ner_tags_num']].iloc[0]

In [None]:
tags = data['train'].features['ner_tags']

# 'ner_tags' still holds plain strings at this point (the cast to ClassLabel
# happens in a later cell), so its inner feature has no `.names`. Fall back to
# the sorted tag list built earlier instead of raising AttributeError.
{idx: tag for idx, tag in enumerate(getattr(tags.feature, 'names', unique_tags))}

In [None]:
print(data['train'].features)

In [None]:
# Step 1: Extract all unique NER tag strings
all_tag_strings = [tag for example in data['train']['ner_tags'] for tag in example]
unique_tags = sorted(set(all_tag_strings))

# Step 2: Create mappings
label_to_id = {label: idx for idx, label in enumerate(unique_tags)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

In [None]:
from datasets import ClassLabel, Sequence

# Define the class label with your tag list (must be sorted to match IDs if already mapped)
class_label = ClassLabel(names=unique_tags)

# Cast the string-based 'ner_tags' column to ClassLabel (automatically maps strings to ints)
data = data.cast_column("ner_tags", Sequence(class_label))

In [None]:
# 1. Check type of ner_tags feature
print(data['train'].features['ner_tags'])

In [None]:
# 2. Print one example row: should now show integer IDs
print(data['train'][0]['ner_tags'])  # -> list of ints, e.g. [0, 1, 1, 2, ...]

In [None]:
tags = data['train'].features['ner_tags'].feature

index2tag = {idx:tag for idx, tag in enumerate(tags.names)}
tag2index = {tag:idx for idx, tag in enumerate(tags.names)}

In [None]:
# `tags` was bound to `features['ner_tags'].feature` in the cell above, i.e. it is
# already the inner ClassLabel: it exposes `.names` and has no `.feature`.
tags.names

In [None]:
pd.DataFrame(data['train'][:])[['tokens', 'ner_tags']].iloc[0]

In [None]:
index2tag

In [None]:
tag2index

In [None]:
tags.int2str(3)

In [None]:
def create_tag_names(batch):
  """Return {'ner_tags_str': [...]}: each id in batch['ner_tags'] passed through tags.int2str."""
  names = []
  for idx in batch['ner_tags']:
    names.append(tags.int2str(idx))
  return {'ner_tags_str': names}

data = data.map(create_tag_names)

print(data)
print(pd.DataFrame(data['train'][:])[['tokens', 'ner_tags', 'ner_tags_str']].iloc[0])

## model building

In [None]:
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
tokenizer.is_fast

In [None]:
inputs = data['train'][0]['tokens']
inputs = tokenizer(inputs, is_split_into_words=True)
inputs.tokens()

In [None]:
data['train'][0]['tokens']

In [None]:
inputs.word_ids()

In [None]:
def align_labels_with_tokens(labels, word_ids, id2tag=None, tag2id=None):
  """Expand word-level integer labels to one label per sub-token.

  The B- -> I- conversion for continuation sub-tokens is done by tag *name*.
  The previous `label % 2 == 1 -> label + 1` shortcut is only valid when ids
  are laid out as O, B-X, I-X, B-Y, I-Y, ...; the ids used in this notebook
  come from an alphabetically sorted tag list, where that shortcut maps a
  label to an unrelated neighbour and can produce an id one past the last
  class.

  Args:
    labels: Integer label for each word of the sentence.
    word_ids: Word index for each sub-token, ``None`` for special tokens.
    id2tag: id -> tag-name mapping. Defaults to the module-level ``index2tag``.
    tag2id: tag-name -> id mapping. Defaults to the module-level ``tag2index``.

  Returns:
    One label per entry of ``word_ids``: -100 for special tokens, the word's
    label for its first sub-token, and for later sub-tokens of a ``B-X`` word
    the id of ``I-X`` (the ``B-X`` id is kept if there is no ``I-X`` tag).
  """
  if id2tag is None:
    id2tag = index2tag
  if tag2id is None:
    tag2id = tag2index

  new_labels = []
  current_word = None
  for word_id in word_ids:
    if word_id is None:
      new_labels.append(-100)
    elif word_id != current_word:
      # First sub-token of a word keeps the word's own label.
      new_labels.append(labels[word_id])
    else:
      label = labels[word_id]
      tag = id2tag[label]
      if tag.startswith("B-"):
        label = tag2id.get("I-" + tag[2:], label)
      new_labels.append(label)
    current_word = word_id

  return new_labels

In [None]:
labels = data['train'][0]['ner_tags']
word_ids = inputs.word_ids()
print(labels, word_ids)

In [None]:
align_labels_with_tokens(labels, word_ids)

In [None]:
def tokenize_and_align_labels(examples):
  """Tokenize a batch of pre-split sentences and add aligned 'labels'.

  Every sentence's 'ner_tags' are expanded to sub-token level with
  align_labels_with_tokens, using the word ids of the matching batch row.
  """
  tokenized_inputs = tokenizer(examples['tokens'], is_split_into_words=True, truncation=True)

  tokenized_inputs['labels'] = [
      align_labels_with_tokens(sentence_labels, tokenized_inputs.word_ids(row))
      for row, sentence_labels in enumerate(examples['ner_tags'])
  ]

  return tokenized_inputs

In [None]:
tokenized_datasets = data.map(tokenize_and_align_labels, batched=True, remove_columns=data['train'].column_names)

In [None]:
tokenized_datasets

## data collation

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch

## metrics

In [None]:
!pip install seqeval
!pip install evaluate

import evaluate
metric = evaluate.load('seqeval')

In [None]:
ner_feature = data['train'].features['ner_tags']
ner_feature

In [None]:
label_names = ner_feature.feature.names
label_names

In [None]:
labels = data['train'][0]['ner_tags']
labels = [label_names[i] for i in labels]
labels

In [None]:
predictions = labels.copy()
predictions[2] = "Other"

metric.compute(predictions=[predictions], references=[labels])

In [None]:
import numpy as np

def compute_metrics(eval_preds):
  """Compute seqeval metrics for token-classification predictions.

  Args:
    eval_preds: (logits, labels) pair. NOTE(review): logits are assumed to be
      shaped (batch, seq_len, num_labels) — confirm against the Trainer output.

  Returns:
    The dict returned by ``metric.compute``.
  """
  logits, labels = eval_preds

  # Arg-max over the last (class) axis gives one predicted id per token;
  # axis=1 would pick a position along the sequence instead.
  predictions = np.argmax(logits, axis=-1)

  true_labels = [[label_names[l] for l in label if l != -100] for label in labels]

  # Look up the *predicted* id `p`; `l` is only used to skip ignored positions.
  # Using `l` here would make predictions identical to the references.
  true_predictions = [[label_names[p] for p, l in zip(prediction, label) if l != -100]
                      for prediction, label in zip(predictions, labels)]

  all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

  return all_metrics

## model training

In [None]:
id2label = {i:label for i, label in enumerate(label_names)}
label2id = {label:i for i, label in enumerate(label_names)}

In [None]:
print(id2label)

In [None]:
!pip install -U transformers

In [None]:
from transformers import AutoModelForTokenClassification

# Pass both directions of the label mapping (id2label / label2id are built from
# label_names a few cells above) so the model config can translate tag names to
# ids as well as ids to names.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, id2label=id2label, label2id=label2id
)

In [None]:
model.config.num_labels

86

In [None]:
from transformers import TrainingArguments

# Training configuration: 3 epochs at learning rate 2e-5 with weight decay 0.01,
# saving once per epoch. Per-epoch evaluation is currently commented out.
args = TrainingArguments("distilbert-finetuned-ner",
                        # evaluation_strategy = "epoch",
                         save_strategy="epoch",
                         learning_rate = 2e-5,
                         num_train_epochs=3,
                         weight_decay=0.01)

In [None]:
# from sklearn.model_selection import train_test_split
# from datasets import Dataset

# # Convert the dataset to a pandas DataFrame
# df = pd.DataFrame(data['train'])

# # Split the dataset into train and validation
# train_df, val_df = train_test_split(df, test_size=0.1)  # 90% train, 10% validation

# # Convert DataFrames back to Hugging Face Dataset format
# train_dataset = Dataset.from_pandas(train_df)
# val_dataset = Dataset.from_pandas(val_df)

# # Define training arguments with evaluation strategy
# from transformers import TrainingArguments

# args = TrainingArguments(
#     output_dir="distilbert-finetuned-ner",
#     evaluation_strategy="epoch",  # Evaluate every epoch
#     save_strategy="epoch",       # Save model every epoch
#     learning_rate=2e-5,          # Learning rate
#     num_train_epochs=3,          # Number of training epochs
#     weight_decay=0.01            # Weight decay
# )

# from transformers import Trainer

# trainer = Trainer(
#     model=model,  # Pre-trained or fine-tuned model
#     args=args,     # Training arguments
#     train_dataset=train_dataset,  # Training data
#     eval_dataset=val_dataset,    # Validation data
#     compute_metrics=compute_metrics  # Metrics function
# )

# trainer.train()  # Start training