<a href="https://colab.research.google.com/github/aliyyah-u/NLP_Medical_NER/blob/main/NLP_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing & baseline

In [None]:
!pip install datasets
!pip install -U datasets huggingface_hub

from datasets import load_dataset
from huggingface_hub import login
import pandas as pd

login()

# Load the dataset
dataset = load_dataset("parsa-mhmdi/Medical_NER")
print('\nDATASET FEATURES:\n', dataset)

# Show a dataset sample
print('\nA DATASET SAMPLE:')
print(dataset["train"][0]["tokens"])    # Text is already tokenised
print(dataset["train"][0]["ner_tags"])  # NER tags (already in BIO format)

# Hold out 10% of the original train split for testing.
# The seed is fixed so the same examples land in train/test on every run;
# without it the split (and every metric computed below) changes per run.
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
print('\nTHE SPLIT DATASET FEATURES:\n', dataset)

# Check column types
ner_feature = dataset["train"].features
print('\nDATA TYPES:\n', ner_feature)

# Pull the token and tag columns out of the training split for processing
train_tokens = dataset["train"]["tokens"]
train_tags = dataset["train"]["ner_tags"]

# View samples
print("\nSAMPLE TRAINING TOKENS:")
print(train_tokens[0])
print("\nSAMPLE TRAINING TAGS:")
print(train_tags[0])

# Collect every distinct tag value that occurs in the training sentences
train_unique_tags = {tag for sublist in train_tags for tag in sublist}
print("\nALL UNIQUE NER TAGS IN TRAINING SET:")
print(sorted(train_unique_tags))

# Function to generate baseline predicted tags
def add_predicted_tags(tokens, tags):
    """Return baseline predictions: one 'Other' label for every gold tag.

    Args:
        tokens: Tokenised sentences. Not read by this function.
        tags: Gold tag sequences, one sequence per sentence.

    Returns:
        A list with one inner list per entry of ``tags``; each inner list
        holds 'Other' repeated as many times as that entry has tags.
    """
    baseline = []
    for sentence_tags in tags:
        # Every position gets the same constant label.
        baseline.append(['Other'] * len(sentence_tags))
    return baseline

# Build the baseline predictions first; this step does not touch the dataset.
predicted_train_tags = add_predicted_tags(train_tokens, train_tags)

# Drop any existing 'pred_ner_tags' column before attaching the fresh one.
if 'pred_ner_tags' in dataset["train"].column_names:
    dataset["train"] = dataset["train"].remove_columns("pred_ner_tags")
dataset["train"] = dataset["train"].add_column("pred_ner_tags", predicted_train_tags)

# Pandas view of the training split, used for tabular display
df = dataset["train"].to_pandas()

def match_tokens_labels(tokens, true_tags, pred_tags):
    """Print tokens next to their true and predicted tags as a table.

    Args:
        tokens: Tokens of one sentence.
        true_tags: Gold tags, aligned with ``tokens``.
        pred_tags: Predicted tags, aligned with ``tokens``.

    Only the first 20 rows are printed. Returns None.
    """
    columns = {"Token": tokens, "True Tag": true_tags, "Pred Tag": pred_tags}
    preview = pd.DataFrame(columns).head(20)
    print("\nSAMPLE OF TOKENS WITH TRUE AND PREDICTED NER TAGS\n")
    print(preview)

# Show the first training example: its tokens, gold tags and baseline
# predictions, read from row label 0 of the pandas view `df`.
match_tokens_labels(df["tokens"][0], df["ner_tags"][0], df["pred_ner_tags"][0])

# Preprocessing

## Imports & load dataset

In [None]:
!pip install datasets
!pip install -U huggingface_hub datasets

from datasets import load_dataset
from huggingface_hub import login
!pip install -U huggingface_hub
login()
import pandas as pd

# Load the dataset
dataset = load_dataset("parsa-mhmdi/Medical_NER")

## Explore Dataset

In [None]:
# Overview of the loaded dataset object (header, newline, space, then its text form)
print(f"\nDATASET FEATURES:\n {dataset}")

# Peek at the first training record
print('\nA DATASET SAMPLE:')
print(dataset["train"][0]["tokens"])  # shows the text is already tokenized
print(dataset["train"][0]["ner_tags"])  # shows the tags are already in BIO format

## Extract Test Set & convert dataset to suitable format

In [None]:
# Hold out 10% of the original train split for testing.
# The seed is fixed so the same examples land in train/test on every run;
# without it the split (and every metric computed below) changes per run.
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
print('\nTHE SPLIT DATASET FEATURES:' + '\n', dataset)

# See type
ner_feature = dataset["train"].features
print('\nDATA TYPES:' + '\n', ner_feature)

# Convert dataset contents into lists for further processing
df = dataset["train"].to_pandas()
train_tokens = df["tokens"].tolist()
train_tags = df["ner_tags"].tolist()

# View samples
print("\nSAMPLE TRAINING TOKENS:")
print(train_tokens[0])
print("\nSAMPLE TRAINING TAGS:")
print(train_tags[0])

## List the unique NER tags in the training set

In [None]:
# Gather every distinct tag that appears anywhere in the training sentences
train_unique_tags = {tag for sentence_tags in train_tags for tag in sentence_tags}

# Header and the sorted tag list go on separate lines
print("\nALL UNIQUE NER TAGS IN TRAINING SET:", sorted(train_unique_tags), sep="\n")

# Baseline (Model 1) - all predicted tags as 'Other'

## Baseline model (all predicted tags as 'Other')

In [None]:
# Function to add predicted tags (just assigns "Other" for baseline)
def add_predicted_tags(tokens, tags):
    """Return baseline predictions: one 'Other' label for every gold tag.

    ``tokens`` is not read; the shape of the result follows ``tags`` alone
    (one inner list per sentence, as long as that sentence's tag list).
    """
    return [['Other'] * sentence_length for sentence_length in map(len, tags)]

# Compute the all-'Other' baseline predictions for the training sentences
predicted_train_tags = add_predicted_tags(train_tokens, train_tags)

# Replace the 'pred_ner_tags' column: drop an existing one, then add the new values
if 'pred_ner_tags' in dataset["train"].column_names:
    dataset["train"] = dataset["train"].remove_columns("pred_ner_tags")
dataset["train"] = dataset["train"].add_column("pred_ner_tags", predicted_train_tags)

# Updated function to display tokens, true labels, and predicted labels in a table format
def match_tokens_labels(tokens, true_tags, pred_tags):
    """Print a table pairing each token with its true and predicted tag.

    The three sequences become the columns "Token", "True Tag" and
    "Pred Tag"; at most the first 20 rows are shown. Returns None.
    """
    headers = ("Token", "True Tag", "Pred Tag")
    table = pd.DataFrame(dict(zip(headers, (tokens, true_tags, pred_tags))))
    # The title already ends in a newline, so sep="\n" leaves one blank line
    # between the title and the table.
    print("\nSAMPLE OF TOKENS WITH TRUE AND PREDICTED NER TAGS\n", table.head(20), sep="\n")

# Display the first training sentence: its tokens, gold NER tags and the
# baseline "Other" predictions, passed positionally in that order.
match_tokens_labels(train_tokens[0], train_tags[0], predicted_train_tags[0])

In [None]:
def add_predicted_tags(tokens, tags):
    """Build the baseline prediction: 'Other' for every tag position.

    Args:
        tokens: Tokenised sentences. Not read by this function.
        tags: Gold tag sequences, one per sentence.

    Returns:
        One list per sentence containing 'Other' repeated to that
        sentence's tag count.
    """
    other_rows = []
    for row in tags:
        other_rows.append(['Other'] * len(row))
    return other_rows

# Compute the all-'Other' baseline predictions for the training sentences
predicted_train_tags = add_predicted_tags(train_tokens, train_tags)

# Drop the column left behind by an earlier run of this cell, if any, so the
# cell can be re-executed; adding a column whose name already exists is
# expected to be rejected by `add_column`.
if "predicted_ner_tags" in dataset["train"].column_names:
    dataset["train"] = dataset["train"].remove_columns("predicted_ner_tags")

# Now, add predicted_ner_tags to the original dataset
dataset["train"] = dataset["train"].add_column("predicted_ner_tags", predicted_train_tags)

# Print a sample of tokens and their true and predicted NER tags
def match_tokens_labels(tokens, true_labels, predicted_labels):
    """Print tokens, true labels and predicted labels as three aligned rows.

    Each column is as wide as the longest of its token, true label and
    predicted label, plus one space of padding, so the three rows line up
    position by position.

    Args:
        tokens: Token strings of one sentence.
        true_labels: Gold label strings, aligned with ``tokens``.
        predicted_labels: Predicted label strings, aligned with ``tokens``.

    Iteration stops at the shortest of the three sequences. Returns None.
    """
    token_cells = []
    true_cells = []
    predicted_cells = []
    for word, true_label, predicted_label in zip(tokens, true_labels, predicted_labels):
        # Widest entry of this column, plus one space to separate columns.
        width = max(len(word), len(true_label), len(predicted_label)) + 1
        token_cells.append(word.ljust(width))
        true_cells.append(true_label.ljust(width))
        predicted_cells.append(predicted_label.ljust(width))

    # The header gets its own print call: passing the token row as a second
    # argument to the same print would prefix it with print's separator
    # space and shift it one column right of the two label rows.
    print('\nSAMPLE OF TOKENS WITH TRUE AND PREDICTED NER TAGS')
    print("".join(token_cells))
    print("".join(true_cells))
    print("".join(predicted_cells))

# Print the first training sentence as three aligned rows: tokens, gold NER
# tags and the baseline "Other" predictions, passed positionally in that order.
match_tokens_labels(train_tokens[0], train_tags[0], predicted_train_tags[0])

from sklearn.metrics import accuracy_score, classification_report

# Collapse the per-sentence tag lists into two flat label sequences
flat_true = [label for sentence in train_tags for label in sentence]
flat_pred = [label for sentence in predicted_train_tags for label in sentence]

# Token-level accuracy of the all-'Other' baseline
accuracy = accuracy_score(flat_true, flat_pred)
print(f"\nBaseline Accuracy: {accuracy:.4f}")

# Per-class report restricted to the gold labels other than "Other";
# zero_division=0 is passed because the baseline never predicts those labels.
labels = sorted({label for label in flat_true if label != "Other"})
report = classification_report(flat_true, flat_pred, labels=labels, zero_division=0)
print("\nBaseline Classification Report (excluding 'Other'):\n", report, sep="\n")

# DistilBERT (Model 2)