<a href="https://colab.research.google.com/github/aliyyah-u/NLP_Medical_NER/blob/main/NLP_coursework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
__author__ = "Aliyyah U."
__version__ = "IN3045 City St George's, University of London, Spring 2025"
# Ideally, we would reuse trusted libraries to perform NLP. Some commonly recommended ones are: nltk, spacy, torchtext, and Hugging Face's transformers and datasets.

# CLEAN

## BASELINE MODEL

In [None]:
# Import necessary libraries
from datasets import load_dataset
from huggingface_hub import login

# Log in to Huggingface Hub (make sure you have valid credentials)
!pip install -U huggingface_hub
login()

# Load the dataset
dataset = load_dataset("parsa-mhmdi/Medical_NER")
print('\nDATASET FEATURES:' + '\n', dataset)

# See dataset samples
print('\nA DATASET SAMPLE:')
print(dataset["train"][0]["tokens"])  # Can see that dataset is already tokenized
print(dataset["train"][0]["ner_tags"])  # Can see that tags are already in BIO format

# Split dataset for testing (90% train / 10% test). A fixed seed keeps the
# split identical across runs so the reported metrics are reproducible
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
print('\nTHE SPLIT DATASET FEATURES:' + '\n', dataset)

# See type
ner_feature = dataset["train"].features
print('\nDATA TYPES:' + '\n', ner_feature)

# Convert dataset contents into lists for further processing
df = dataset["train"].to_pandas()
train_tokens = df["tokens"].tolist()
train_tags = df["ner_tags"].tolist()

# See all unique tag names
train_unique_tags = set(tag for sublist in train_tags for tag in sublist)
print("\nALL UNIQUE NER TAGS IN TRAINING SET:")
print(sorted(train_unique_tags))

# View samples
print("\nSAMPLE TRAINING TOKENS:")
print(train_tokens[0])
print("\nSAMPLE TRAINING TAGS:")
print(train_tags[0])

# BASELINE: Function tags each token with "Other"
def add_predicted_tags(tokens, tags):
    """Build the constant baseline prediction: every position is tagged "Other".

    Args:
        tokens: Per-sentence token lists. Not used by the computation; kept so
            the signature mirrors a real tagger.
        tags: Per-sentence tag lists. Only their lengths are read, to size
            each predicted sentence.

    Returns:
        A list with one list per entry of ``tags``, each containing "Other"
        repeated ``len(entry)`` times.
    """
    predicted_tags = []
    for sentence_tags in tags:
        predicted_tags.append(['Other'] * len(sentence_tags))
    return predicted_tags

# Add predicted NER tags to the dataset
predicted_train_tags = add_predicted_tags(train_tokens, train_tags)
# Now, add predicted_ner_tags to the original dataset
dataset["train"] = dataset["train"].add_column("predicted_ner_tags", predicted_train_tags)

# Print a sample of tokens and their true and predicted NER tags
def match_tokens_labels(tokens, true_labels, predicted_labels):
    """Print tokens, true tags and predicted tags as three aligned rows.

    Each column is padded to the longest of its three entries plus one
    separating space, so a token sits directly above its two tags.

    Args:
        tokens: Token strings of one sentence.
        true_labels: Gold tag string for each token.
        predicted_labels: Predicted tag string for each token.

    Returns:
        None. The three rows are written to stdout; iteration stops at the
        shortest of the three sequences.
    """
    token_cells = []
    true_cells = []
    predicted_cells = []
    for column in zip(tokens, true_labels, predicted_labels):
        # Column width: widest entry plus one space between columns
        width = max(map(len, column)) + 1
        word, true_label, predicted_label = column
        token_cells.append(word.ljust(width))
        true_cells.append(true_label.ljust(width))
        predicted_cells.append(predicted_label.ljust(width))

    print('\nSAMPLE OF TOKENS WITH TRUE AND PREDICTED NER TAGS' + '\n', "".join(token_cells))
    print("".join(true_cells))
    print("".join(predicted_cells))

# Match tokens, true NER tags, and predicted "Other" labels for the first sample
match_tokens_labels(train_tokens[0], train_tags[0], predicted_train_tags[0])

from sklearn.metrics import accuracy_score

# Flatten the per-sentence tag lists into one token-level list each, because
# the metrics below compare two flat label sequences of equal length
flat_true = [tag for sent in train_tags for tag in sent]
flat_pred = [tag for sent in predicted_train_tags for tag in sent]

# Calculate token-level accuracy. Every prediction is "Other" (see
# add_predicted_tags), so this is the share of training tokens whose true tag is "Other"
accuracy = accuracy_score(flat_true, flat_pred)
print(f"\nBaseline Accuracy: {accuracy:.4f}")

from sklearn.metrics import classification_report

# Compute the report over the entity tags only (everything except "Other").
# The baseline never predicts any of these tags, so zero_division=0 makes the
# undefined precision values show up as 0 instead of raising warnings
labels = sorted(set(flat_true) - {"Other"})
report = classification_report(flat_true, flat_pred, labels=labels, zero_division=0)
print("\nBaseline Classification Report (excluding 'Other'):\n")
print(report)

# Messy

## Data Preprocessing

In [None]:
#load dataset
from datasets import load_dataset
!pip install -U huggingface_hub
from huggingface_hub import login
login()
dataset = load_dataset("parsa-mhmdi/Medical_NER")
print('\nDATASET FEATURES:'+'\n',dataset)

#see dataset samples
print('\nA DATASET SAMPLE:')
print(dataset["train"][0]["tokens"]) #can see that dataset is already tokenised
print(dataset["train"][0]["ner_tags"]) #can see that tags already in BIO format

# Split dataset for testing (90% train / 10% test). A fixed seed keeps the
# split identical across runs so the reported results are reproducible
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
print('\nTHE SPLIT DATASET FEATURES:'+'\n', dataset)

#see type
ner_feature = dataset["train"].features
print('\nDATA TYPES:'+'\n',ner_feature)

#convert dataset contents into lists for further processing
df = dataset["train"].to_pandas()
train_tokens = df["tokens"].tolist()
train_tags = df["ner_tags"].tolist()

# See all unique tag names
train_unique_tags = set(tag for sublist in train_tags for tag in sublist)
print("\nALL UNIQUE NER TAGS IN TRAINING SET:")
print(sorted(train_unique_tags))

#view samples
print("\nSAMPLE TRAINING TOKENS:")
print(train_tokens[:2])
print("\nSAMPLE TRAINING TAGS:")
print(train_tags[:2])

#print sample of tokens with correct ner_tags
words = dataset["train"][0]["tokens"]
labels = dataset["train"][0]["ner_tags"]

def match_tokens_labels(tokens, labels):
    """Print a sentence's tokens and their NER tags as two aligned rows.

    Each column is padded to the longer of the token and its tag plus one
    separating space, so every tag sits directly below its token.

    Args:
        tokens: Token strings of one sentence.
        labels: Tag string for each token.

    Returns:
        None. The two rows are written to stdout; iteration stops at the
        shorter of the two sequences.
    """
    token_cells = []
    label_cells = []
    # Iterate over the `tokens` argument; the earlier version read the global
    # `words` here, ignoring whatever sentence the caller passed in
    for word, label in zip(tokens, labels):
        width = max(len(word), len(label)) + 1
        token_cells.append(word.ljust(width))
        label_cells.append(label.ljust(width))
    print('\nSAMPLE OF TOKENS WITH CORRECT MATCHING NER TAGS'+'\n', "".join(token_cells))
    print("".join(label_cells))

match_tokens_labels(words, labels)

In [None]:
from collections import defaultdict

# Bucket every entity token under its NER tag ("Other" tokens are skipped)
ner_groups = defaultdict(list)

for sentence_tokens, sentence_tags in zip(train_tokens, train_tags):
    for word, label in zip(sentence_tokens, sentence_tags):
        if label == 'Other':
            continue
        ner_groups[label].append(word)

# Show each tag, in alphabetical order, with its distinct tokens sorted
for label in sorted(ner_groups):
    unique_words = sorted(set(ner_groups[label]))
    print(f"\nNER Tag: {label}")
    print(f"Tokens: {', '.join(unique_words)}")

In [None]:
import pandas as pd
from collections import defaultdict

# Bucket every entity token under its NER tag ("Other" tokens are skipped)
ner_groups = defaultdict(list)

for sentence_tokens, sentence_tags in zip(train_tokens, train_tags):
    for word, label in zip(sentence_tokens, sentence_tags):
        if label == 'Other':
            continue
        ner_groups[label].append(word)

# One row per tag: the tag name and its distinct tokens, sorted and comma-joined
tag_column = []
token_column = []
for label, words_for_label in ner_groups.items():
    tag_column.append(label)
    token_column.append(', '.join(sorted(set(words_for_label))))

data = {'NER_Tag': tag_column, 'Tokens': token_column}
mydf = pd.DataFrame(data)

# Display the DataFrame
print(mydf)

## complete preprocessing code

In [None]:
from datasets import load_dataset
from huggingface_hub import login
login()
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import spacy

# Load dataset
dataset = load_dataset("parsa-mhmdi/Medical_NER")
print('\nDATASET FEATURES:\n', dataset)

# See dataset samples
print('\nA DATASET SAMPLE:')
print(dataset["train"][0]["tokens"])  # Tokenized dataset
print(dataset["train"][0]["ner_tags"])  # BIO-formatted tags

# Split dataset for testing (90% train / 10% test). A fixed seed keeps the
# split identical across runs so the reported results are reproducible
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
print('\nTHE SPLIT DATASET FEATURES:\n', dataset)

# Convert dataset contents into lists for further processing
df = dataset["train"].to_pandas()
train_tokens = df["tokens"].tolist()
train_tags = df["ner_tags"].tolist()

# See all unique tag names
train_unique_tags = set(tag for sublist in train_tags for tag in sublist)
print("\nALL UNIQUE NER TAGS IN TRAINING SET:")
print(sorted(train_unique_tags))

# View samples of tokens and tags
print("\nSAMPLE TRAINING TOKENS:", train_tokens[:2])
print("\nSAMPLE TRAINING TAGS:", train_tags[:2])

# Label Encoding for ner_tags
label_encoder = LabelEncoder()
label_encoder.fit(sorted(train_unique_tags))
encoded_tags = [label_encoder.transform(tags) for tags in train_tags]
df['encoded_ner_tags'] = encoded_tags
print(df[['tokens', 'ner_tags', 'encoded_ner_tags']].head())

# Optionally, print the unique encoded tags
print("Unique encoded tags:", label_encoder.classes_)

# Function to match tokens and their labels
def match_tokens_labels(tokens, labels):
    """Print a sentence's tokens and their NER tags as two aligned rows.

    Args:
        tokens: Token strings of one sentence.
        labels: Tag string for each token.

    Returns:
        None. Output goes to stdout; pairs beyond the shorter sequence are
        ignored.
    """
    # (token, tag, column width) where the width is the longer entry plus one space
    columns = [
        (word, label, max(len(word), len(label)) + 1)
        for word, label in zip(tokens, labels)
    ]
    token_row = "".join(word.ljust(width) for word, _, width in columns)
    label_row = "".join(label.ljust(width) for _, label, width in columns)
    print('\nSAMPLE OF TOKENS WITH CORRECT MATCHING NER TAGS\n', token_row)
    print(label_row)

# Display a sample of tokens with labels
words = dataset["train"][0]["tokens"]
labels = dataset["train"][0]["ner_tags"]
match_tokens_labels(words, labels)

#POS TAGGING!
# Initialise the spaCy model for POS tagging
nlp = spacy.load("en_core_web_sm")

# Function to get POS tags and process the tokens
def get_pos_tags(tokens):
    """Run spaCy on a sentence and return its per-token annotations.

    The tokens are joined with single spaces and passed through the
    module-level ``nlp`` pipeline, so the result follows spaCy's own
    tokenisation of that string rather than the input token list.

    Args:
        tokens: Token strings of one sentence.

    Returns:
        A list of ``(text, pos_, lemma_, tag_, dep_)`` tuples, one per spaCy
        token.
    """
    parsed = nlp(" ".join(tokens))

    annotations = []
    for spacy_token in parsed:
        annotations.append(
            (
                spacy_token.text,
                spacy_token.pos_,
                spacy_token.lemma_,
                spacy_token.tag_,
                spacy_token.dep_,
            )
        )
    return annotations

# Example: Process the first training sample
tokens_example = train_tokens[0]
pos_tags_example = get_pos_tags(tokens_example)

# Print the POS tags for the first sample
print("\nPOS Tags for the first sample:")
for token, pos, lemma, tag, dep in pos_tags_example:
    print(f"Token: {token}, POS: {pos}, Lemma: {lemma}, Tag: {tag}, Dependency: {dep}")

In [None]:
!pip install datasets
from datasets import load_dataset
from huggingface_hub import login
login()
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import spacy

# Load dataset
dataset = load_dataset("parsa-mhmdi/Medical_NER")
print('\nDATASET FEATURES:\n', dataset)

# See dataset samples
print('\nA DATASET SAMPLE:')
print(dataset["train"][0]["tokens"])  # Tokenized dataset
print(dataset["train"][0]["ner_tags"])  # BIO-formatted tags

# Split dataset for testing (90% train / 10% test). A fixed seed keeps the
# split identical across runs so the reported results are reproducible
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
print('\nTHE SPLIT DATASET FEATURES:\n', dataset)

# Convert dataset contents into lists for further processing
df = dataset["train"].to_pandas()
train_tokens = df["tokens"].tolist()
train_tags = df["ner_tags"].tolist()

# See all unique tag names
train_unique_tags = set(tag for sublist in train_tags for tag in sublist)
print("\nALL UNIQUE NER TAGS IN TRAINING SET:")
print(sorted(train_unique_tags))

# View samples of tokens and tags
print("\nSAMPLE TRAINING TOKENS:", train_tokens[:2])
print("\nSAMPLE TRAINING TAGS:", train_tags[:2])

# Label Encoding for ner_tags
label_encoder = LabelEncoder()
label_encoder.fit(sorted(train_unique_tags))
encoded_tags = [label_encoder.transform(tags) for tags in train_tags]
df['encoded_ner_tags'] = encoded_tags
print(df[['tokens', 'ner_tags', 'encoded_ner_tags']].head())

# Optionally, print the unique encoded tags
print("Unique encoded tags:", label_encoder.classes_)

# Function to match tokens and their labels
def match_tokens_labels(tokens, labels):
    """Print tokens above their NER tags, padded so the columns line up.

    Args:
        tokens: Token strings of one sentence.
        labels: Tag string for each token.

    Returns:
        None. Two rows are written to stdout; iteration stops at the shorter
        of the two sequences.
    """
    token_cells, label_cells = [], []
    for word, label in zip(tokens, labels):
        # Pad both entries to the longer one, plus a single separating space
        width = 1 + max(len(word), len(label))
        token_cells.append(f"{word:<{width}}")
        label_cells.append(f"{label:<{width}}")
    print('\nSAMPLE OF TOKENS WITH CORRECT MATCHING NER TAGS\n', "".join(token_cells))
    print("".join(label_cells))

# Display a sample of tokens with labels
words = dataset["train"][0]["tokens"]
labels = dataset["train"][0]["ner_tags"]
match_tokens_labels(words, labels)

#POS TAGGING!
# Initialise the spaCy model for POS tagging
nlp = spacy.load("en_core_web_sm")

# Function to get POS tags and process the tokens
def get_pos_tags_aligned(tokens):
    """Return one spaCy POS tag (``token.pos_``) per input token.

    The tokens are joined with single spaces and run through the module-level
    ``nlp`` pipeline. spaCy may split one dataset token into several of its
    own tokens, so the two token streams are aligned by character offset
    instead of being walked in lockstep: each input token receives the POS tag
    of the spaCy token that starts at the same character position. A split
    token therefore takes the tag of its first piece, and one split no longer
    pushes every following token onto the "X" fallback.

    Args:
        tokens: Token strings of one sentence.

    Returns:
        A list of POS tag strings with the same length as ``tokens``. A token
        gets the fallback tag "X" when it is empty or when no spaCy token
        starts at its character offset.
    """
    doc = nlp(" ".join(tokens))

    # Start offset (within the joined string) of every spaCy token -> its POS tag
    pos_by_offset = {spacy_token.idx: spacy_token.pos_ for spacy_token in doc}

    aligned_tags = []
    offset = 0
    for tok in tokens:
        if tok:
            aligned_tags.append(pos_by_offset.get(offset, "X"))
        else:
            # An empty token has no text for spaCy to tag
            aligned_tags.append("X")
        # Skip past this token and the single space that joined it to the next
        offset += len(tok) + 1

    return aligned_tags

# Example: Process the first training sample with the aligned tagger defined
# in this cell (it returns exactly one POS tag per dataset token)
tokens_example = train_tokens[0]
pos_tags_example = get_pos_tags_aligned(tokens_example)

# Print each dataset token next to its aligned POS tag
print("\nPOS Tags for the first sample:")
for token, pos in zip(tokens_example, pos_tags_example):
    print(f"Token: {token}, POS: {pos}")

from sklearn.metrics import classification_report
import numpy as np

# === Create a mapping from encoded tag IDs to tag names ===
label_encoder = LabelEncoder()
all_tags = df["ner_tags"].explode().unique()
label_encoder.fit(sorted(all_tags))
label_map = dict(enumerate(label_encoder.classes_))

# === Rule-based NER function using POS tags ===
def rule_based_ner(tokens, entity_label="B-MED", outside_label="Other"):
    """Tag tokens with a naive POS rule: nouns are entities, the rest are not.

    Args:
        tokens: Token strings of one sentence.
        entity_label: Tag assigned to tokens whose POS is PROPN or NOUN.
            NOTE(review): "B-MED" is kept from the original code; confirm it
            is an actual tag of the dataset, otherwise it can never match a
            true label.
        outside_label: Tag assigned to every other token. Defaults to
            "Other", the non-entity tag the rest of this notebook filters on;
            the previous hard-coded "O" could never match it.

    Returns:
        A list of tag strings, one per POS tag returned by
        ``get_pos_tags_aligned(tokens)``.
    """
    noun_pos = {"PROPN", "NOUN"}
    return [
        entity_label if pos in noun_pos else outside_label
        for pos in get_pos_tags_aligned(tokens)
    ]

# === Evaluate rule-based NER on (up to) the first 50 training samples ===
true_labels_flat = []
pred_labels_flat = []

# Cap at the number of available samples so a smaller split cannot raise IndexError
num_eval_samples = min(50, len(train_tokens))

for i in range(num_eval_samples):
    tokens = train_tokens[i]
    # Decode the integer-encoded gold tags back to their string names
    true_ids = encoded_tags[i]
    true_tags = label_encoder.inverse_transform(true_ids)

    pred_tags = rule_based_ner(tokens)

    true_labels_flat.extend(true_tags)
    pred_labels_flat.extend(pred_tags)

# === Classification report ===
# zero_division=0 matches the baseline report: labels that are never predicted
# (or never occur) score 0 instead of raising warnings
print(f"\nRule-Based Baseline NER Results (first {num_eval_samples} samples):")
print(classification_report(true_labels_flat, pred_labels_flat, zero_division=0))