<a href="https://colab.research.google.com/github/aliyyah-u/NLP_Medical_NER/blob/main/NLP_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing & baseline

In [33]:
!pip install datasets
!pip install -U datasets huggingface_hub

from datasets import load_dataset
from huggingface_hub import login
import pandas as pd

# Authenticate with the Hugging Face Hub before downloading
login()

# Download the Medical NER corpus and show which splits/columns it contains
dataset = load_dataset("parsa-mhmdi/Medical_NER")
print('\nDATASET FEATURES:\n', dataset)

# First training record: its tokens (text is already tokenised), then its
# NER tags (already in BIO format), one list per line
print('\nA DATASET SAMPLE:')
print(*(dataset["train"][0][column] for column in ("tokens", "ner_tags")), sep="\n")

# Hold out 10% of the examples for testing. The split shuffles at random, so the
# seed is pinned: without it every fresh run produces a different train/test
# partition and the samples and metrics shown below cannot be reproduced.
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
print('\nTHE SPLIT DATASET FEATURES:\n', dataset)

# Check column types of the training split
ner_feature = dataset["train"].features
print('\nDATA TYPES:\n', ner_feature)

# Pull the token and tag columns out of the training split for processing
train_tokens, train_tags = dataset["train"]["tokens"], dataset["train"]["ner_tags"]

# Peek at the first training example (tokens, then its tags)
print("\nSAMPLE TRAINING TOKENS:", train_tokens[0], sep="\n")
print("\nSAMPLE TRAINING TAGS:", train_tags[0], sep="\n")

# Gather every distinct tag value that occurs in any training sentence
train_unique_tags = {tag for sentence_tags in train_tags for tag in sentence_tags}
print("\nALL UNIQUE NER TAGS IN TRAINING SET:", sorted(train_unique_tags), sep="\n")

def add_predicted_tags(tokens, tags):
    """Return baseline predictions: the label 'Other' for every position.

    Args:
        tokens: Per-sentence token sequences. Not read by this function.
        tags: Per-sentence tag sequences; only the length of each one is used.

    Returns:
        A list with one inner list per entry of ``tags``, each containing
        ``len(entry)`` copies of the string 'Other'.
    """
    baseline = []
    for sentence_tags in tags:
        baseline.append(['Other'] * len(sentence_tags))
    return baseline

# Drop a 'pred_ner_tags' column left over from an earlier execution, if there is one,
# before the column is added again below
if 'pred_ner_tags' in dataset["train"].column_names:
    dataset["train"] = dataset["train"].remove_columns("pred_ner_tags")

# Build the baseline predictions (one 'Other' per gold tag) and store them on the
# training split as the 'pred_ner_tags' column
predicted_train_tags = add_predicted_tags(train_tokens, train_tags)
dataset["train"] = dataset["train"].add_column("pred_ner_tags", predicted_train_tags)

# Convert the training split (now including 'pred_ner_tags') to a pandas DataFrame
# for organised display
df = dataset["train"].to_pandas()

def match_tokens_labels(tokens, true_tags, pred_tags):
    """Print a table lining up each token with its true and predicted tag.

    Args:
        tokens: Tokens of one sentence.
        true_tags: Gold tag for each token.
        pred_tags: Predicted tag for each token.

    Prints a heading followed by at most the first 20 rows of a three-column
    table ("Token", "True Tag", "Pred Tag"). Returns None.
    """
    columns = {"Token": tokens, "True Tag": true_tags, "Pred Tag": pred_tags}
    table = pd.DataFrame(columns)
    print("\nSAMPLE OF TOKENS WITH TRUE AND PREDICTED NER TAGS\n")
    # Only the leading rows are shown so long sentences do not flood the output
    print(table.head(20))

# Display tokens, true tags and baseline predictions for the row labelled 0
match_tokens_labels(
    df.loc[0, "tokens"],
    df.loc[0, "ner_tags"],
    df.loc[0, "pred_ner_tags"],
)



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…


DATASET FEATURES:
 DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 100
    })
})

A DATASET SAMPLE:
['angiotensin-converting', 'enzyme', '2', 'ace2', 'as', 'a', 'sars-cov-2', 'receptor', 'molecular', 'mechanisms', 'and', 'potential', 'therapeutic', 'target', 'sars-cov-2', 'has', 'been', 'sequenced', '[3]', '.', 'a', 'phylogenetic', 'analysis', '[3,', '4]', 'found', 'a', 'bat', 'origin', 'for', 'the', 'sars-cov-2', 'there', 'is', 'a', 'diversity', 'of', 'possible', 'intermediate', 'hosts', 'for', 'sars-cov-2', 'including', 'pangolins', 'but', 'not', 'mice', 'and', 'rats', '[5]', '.', 'there', 'are', 'many', 'similarities', 'of', 'sars-cov-2', 'with', 'the', 'original', 'sars', 'cov', 'using', 'computer', 'modeling,', 'xu', 'et', 'al', '[6]', 'found', 'that', 'the', 'spike', 'proteins', 'of', 'sars-cov-2', 'and', 'sars', 'cov', 'have', 'almost', 'identical', '3-d', 'structures', 'in', 'the', 'receptor-binding', 'domain', 'that', 'maintains'

Flattening the indices:   0%|          | 0/90 [00:00<?, ? examples/s]


SAMPLE OF TOKENS WITH TRUE AND PREDICTED NER TAGS

       Token       True Tag Pred Tag
0       mers  B-CORONAVIRUS    Other
1         at          Other    Other
2      34.4%          Other    Other
3      april         B-DATE    Other
4       2012         I-DATE    Other
5   november         B-DATE    Other
6       2019         I-DATE    Other
7          (          Other    Other
8      table          Other    Other
9         1)          Other    Other
10         .          Other    Other
11       the          Other    Other
12    median          Other    Other
13      ages     B-CHEMICAL    Other
14       for          Other    Other
15       the          Other    Other
16  patients        B-GROUP    Other
17        of          Other    Other
18  covid-19  B-CORONAVIRUS    Other
19      sars  B-CORONAVIRUS    Other


# Preprocessing

## Imports & load dataset

In [28]:
!pip install datasets
!pip install -U huggingface_hub datasets

from datasets import load_dataset
from huggingface_hub import login
!pip install -U huggingface_hub
login()
import pandas as pd

# Load the dataset
dataset = load_dataset("parsa-mhmdi/Medical_NER")



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Explore Dataset

In [29]:
# Overview of the splits and columns in the loaded dataset
print('\nDATASET FEATURES:\n', dataset)

# First training record: tokens (dataset is already tokenized), then its
# tags (already in BIO format), one list per line
print('\nA DATASET SAMPLE:')
print(*(dataset["train"][0][field] for field in ("tokens", "ner_tags")), sep="\n")


DATASET FEATURES:
 DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 100
    })
})

A DATASET SAMPLE:
['angiotensin-converting', 'enzyme', '2', 'ace2', 'as', 'a', 'sars-cov-2', 'receptor', 'molecular', 'mechanisms', 'and', 'potential', 'therapeutic', 'target', 'sars-cov-2', 'has', 'been', 'sequenced', '[3]', '.', 'a', 'phylogenetic', 'analysis', '[3,', '4]', 'found', 'a', 'bat', 'origin', 'for', 'the', 'sars-cov-2', 'there', 'is', 'a', 'diversity', 'of', 'possible', 'intermediate', 'hosts', 'for', 'sars-cov-2', 'including', 'pangolins', 'but', 'not', 'mice', 'and', 'rats', '[5]', '.', 'there', 'are', 'many', 'similarities', 'of', 'sars-cov-2', 'with', 'the', 'original', 'sars', 'cov', 'using', 'computer', 'modeling,', 'xu', 'et', 'al', '[6]', 'found', 'that', 'the', 'spike', 'proteins', 'of', 'sars-cov-2', 'and', 'sars', 'cov', 'have', 'almost', 'identical', '3-d', 'structures', 'in', 'the', 'receptor-binding', 'domain', 'that', 'maintains'

## Extract Test Set & convert dataset to suitable format

In [30]:
# Hold out 10% of the examples as a test set. The seed pins the random shuffle so
# that Restart & Run All reproduces the same train/test partition (and therefore
# the same samples and metrics downstream) instead of a new one on every run.
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
print('\nTHE SPLIT DATASET FEATURES:' + '\n', dataset)

# Column feature types of the training split
ner_feature = dataset["train"].features
print('\nDATA TYPES:' + '\n', ner_feature)

# Convert the training split into per-sentence lists for further processing.
# Going through pandas means each list element is a NumPy array of strings
# rather than a plain Python list.
df = dataset["train"].to_pandas()
train_tokens = df["tokens"].tolist()
train_tags = df["ner_tags"].tolist()

# View samples
print("\nSAMPLE TRAINING TOKENS:")
print(train_tokens[0])
print("\nSAMPLE TRAINING TAGS:")
print(train_tags[0])


THE SPLIT DATASET FEATURES:
 DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 90
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 10
    })
})

DATA TYPES:
 {'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

SAMPLE TRAINING TOKENS:
['the' '95th' 'percentile' 'estimate' 'of' 'the' 'incubation' 'period'
 'we' 'found' 'that' 'the' 'length' 'of' 'quarantine' 'should' 'be' 'at'
 'least' '14' 'days' 'and' 'we' 'stress' 'that' 'the' '17-24-day' 'time'
 'delay' 'from' 'illness' 'onset' 'to' 'death' 'must' 'be' 'addressed'
 'when' 'estimating' 'covid-19' 'case' 'fatality' 'risk' 'this' 'study'
 'was' 'made' 'possible' 'only' 'through' 'open' 'sharing' 'of' 'case'
 'data' 'from' 'china' 'and' 'other' 'countries' 'where' 'cases' 'were'
 'diagnosed.' 'continued' 'communication' 'of' 'dates' 'and' 'other

## See actual tag distribution

In [31]:
# Gather every distinct tag name that occurs in any training sentence
train_unique_tags = {tag for sentence_tags in train_tags for tag in sentence_tags}
print("\nALL UNIQUE NER TAGS IN TRAINING SET:", sorted(train_unique_tags), sep="\n")


ALL UNIQUE NER TAGS IN TRAINING SET:
['B-ANATOMICAL_STRUCTURE', 'B-BODY_PART_ORGAN_OR_ORGAN_COMPONENT', 'B-BODY_SUBSTANCE', 'B-CARDINAL', 'B-CELL', 'B-CELL_COMPONENT', 'B-CELL_FUNCTION', 'B-CELL_OR_MOLECULAR_DYSFUNCTION', 'B-CHEMICAL', 'B-CORONAVIRUS', 'B-DAILY_OR_RECREATIONAL_ACTIVITY', 'B-DATE', 'B-DIAGNOSTIC_PROCEDURE', 'B-DISEASE_OR_SYNDROME', 'B-EDUCATIONAL_ACTIVITY', 'B-EUKARYOTE', 'B-EVOLUTION', 'B-FAC', 'B-FOOD', 'B-GENE_OR_GENOME', 'B-GOVERNMENTAL_OR_REGULATORY_ACTIVITY', 'B-GPE', 'B-GROUP', 'B-INDIVIDUAL_BEHAVIOR', 'B-INJURY_OR_POISONING', 'B-LABORATORY_OR_TEST_RESULT', 'B-LABORATORY_PROCEDURE', 'B-LOC', 'B-MOLECULAR_FUNCTION', 'B-MONEY', 'B-NORP', 'B-ORDINAL', 'B-ORG', 'B-ORGANISM', 'B-ORGAN_OR_TISSUE_FUNCTION', 'B-PERCENT', 'B-PERSON', 'B-PHYSICAL_SCIENCE', 'B-PRODUCT', 'B-QUANTITY', 'B-RESEARCH_ACTIVITY', 'B-SIGN_OR_SYMPTOM', 'B-SOCIAL_BEHAVIOR', 'B-SUBSTRATE', 'B-THERAPEUTIC_OR_PREVENTIVE_PROCEDURE', 'B-TIME', 'B-TISSUE', 'B-VIRAL_PROTEIN', 'B-VIRUS', 'B-WILDLIFE', 'I-CA

# Baseline (Model 1) - all predicted tags as 'Other'

## Baseline model (all predicted tags as 'Other')

In [27]:
def add_predicted_tags(tokens, tags):
    """Produce the baseline prediction: 'Other' for every tagged position.

    Args:
        tokens: Per-sentence token sequences. Not read by this function.
        tags: Per-sentence tag sequences; only the length of each one is used.

    Returns:
        A list of lists; the i-th inner list holds ``len(tags[i])`` copies of
        the string 'Other'.
    """
    sentence_lengths = (len(sentence_tags) for sentence_tags in tags)
    return [['Other'] * count for count in sentence_lengths]

# Drop a 'pred_ner_tags' column left over from an earlier execution, if there is one,
# before the column is added again below
if 'pred_ner_tags' in dataset["train"].column_names:
    dataset["train"] = dataset["train"].remove_columns("pred_ner_tags")

# Build the baseline predictions (one 'Other' per gold tag) and store them on the
# training split as the 'pred_ner_tags' column
predicted_train_tags = add_predicted_tags(train_tokens, train_tags)
dataset["train"] = dataset["train"].add_column("pred_ner_tags", predicted_train_tags)

def match_tokens_labels(tokens, true_tags, pred_tags):
    """Print tokens next to their true and predicted tags in table form.

    Args:
        tokens: Tokens of one sentence.
        true_tags: Gold tag for each token.
        pred_tags: Predicted tag for each token.

    Prints a heading and then at most the first 20 rows of a table with the
    columns "Token", "True Tag" and "Pred Tag". Returns None.
    """
    table = pd.DataFrame(
        {"Token": tokens, "True Tag": true_tags, "Pred Tag": pred_tags}
    )
    preview = table.head(20)  # cap the display so long sentences stay readable
    print("\nSAMPLE OF TOKENS WITH TRUE AND PREDICTED NER TAGS\n")
    print(preview)

# Print the aligned table (tokens, true NER tags, baseline "Other" predictions)
# for the first training sample
match_tokens_labels(train_tokens[0], train_tags[0], predicted_train_tags[0])


SAMPLE OF TOKENS WITH TRUE AND PREDICTED NER TAGS

          Token               True Tag Pred Tag
0      midnight                 I-TIME    Other
1            of                  Other    Other
2            18                 B-DATE    Other
3      february       B-GENE_OR_GENOME    Other
4          2020       I-GENE_OR_GENOME    Other
5           the                  Other    Other
6         novel                  Other    Other
7   coronavirus          B-CORONAVIRUS    Other
8     pneumonia  B-DISEASE_OR_SYNDROME    Other
9      covid-19          B-CORONAVIRUS    Other
10          had                  Other    Other
11       spread                  Other    Other
12         from                  Other    Other
13        hubei             B-CHEMICAL    Other
14           to                  Other    Other
15           34                  Other    Other
16    provinces                  Other    Other
17           in                  Other    Other
18        china                 B-CE

In [23]:
def add_predicted_tags(tokens, tags):
    """Build the all-'Other' baseline labelling.

    Args:
        tokens: Per-sentence token sequences. Not read by this function.
        tags: Per-sentence tag sequences; only the length of each one is used.

    Returns:
        A list containing, for every entry of ``tags``, a list of
        ``len(entry)`` copies of the string 'Other'.
    """
    return list(map(lambda sentence_tags: ['Other'] * len(sentence_tags), tags))

# Build the baseline predictions (one 'Other' per gold tag)
predicted_train_tags = add_predicted_tags(train_tokens, train_tags)

# Attach them to the training split as 'predicted_ner_tags'. A column left over
# from a previous execution of this cell is dropped first: adding a column whose
# name already exists is rejected, so without this guard the cell could only be
# run once per kernel session.
if "predicted_ner_tags" in dataset["train"].column_names:
    dataset["train"] = dataset["train"].remove_columns("predicted_ner_tags")
dataset["train"] = dataset["train"].add_column("predicted_ner_tags", predicted_train_tags)

def match_tokens_labels(tokens, true_labels, predicted_labels):
    """Print tokens, true labels and predicted labels as three aligned text rows.

    Each (token, true label, predicted label) triple occupies one column whose
    width is the longest of the three strings plus one separating space, so the
    three printed rows line up position by position.

    Args:
        tokens: Token strings of one sentence.
        true_labels: Gold label string for each token.
        predicted_labels: Predicted label string for each token.

    The three sequences are walked with ``zip``, so output stops at the
    shortest one. Prints a heading followed by the three rows; returns None.
    """
    token_cells, true_cells, predicted_cells = [], [], []
    for word, true_label, predicted_label in zip(tokens, true_labels, predicted_labels):
        # Shared column width: widest entry plus one space of padding
        width = max(len(word), len(true_label), len(predicted_label)) + 1
        token_cells.append(word.ljust(width))
        true_cells.append(true_label.ljust(width))
        predicted_cells.append(predicted_label.ljust(width))

    # The heading is printed on its own. Passing the token row as a second
    # argument to the same print() call put print's separator space in front of
    # it, shifting the tokens one column to the right of their labels.
    print('\nSAMPLE OF TOKENS WITH TRUE AND PREDICTED NER TAGS')
    print("".join(token_cells))
    print("".join(true_cells))
    print("".join(predicted_cells))

from sklearn.metrics import accuracy_score, classification_report

# Show tokens, true NER tags and the predicted "Other" labels for the first sample
match_tokens_labels(train_tokens[0], train_tags[0], predicted_train_tags[0])

# Collapse the per-sentence sequences into two parallel flat lists of labels
flat_true = [label for sentence in train_tags for label in sentence]
flat_pred = [label for sentence in predicted_train_tags for label in sentence]

# Share of positions where the baseline label equals the true label
accuracy = accuracy_score(flat_true, flat_pred)
print("\nBaseline Accuracy: {:.4f}".format(accuracy))

# Per-class report over every true label except "Other"; zero_division=0 reports
# 0 instead of warning for classes that are never predicted
labels = sorted(label for label in set(flat_true) if label != "Other")
report = classification_report(flat_true, flat_pred, labels=labels, zero_division=0)
print("\nBaseline Classification Report (excluding 'Other'):\n", report, sep="\n")

Flattening the indices:   0%|          | 0/90 [00:00<?, ? examples/s]


SAMPLE OF TOKENS WITH TRUE AND PREDICTED NER TAGS
 midnight of    18     february         2020             the   novel coronavirus   pneumonia             covid-19      had   spread from  hubei      to    34    provinces in    china  and   another 25    countries, resulting in    75     199        confirmed        cases            with  2009   deaths                (table 1)    [7]   .     at    present, the   number of    cases is    increasing rapidly in    china  and   even  around the   world, which is    a     big   threat to    public     health     thirty-one provinces of    china  have  initiated a          level-1    public     health     response. the   aim   of    this  article is    to    provide a     timely review of    the   characteristics on    31    december         2019             wuhan municipal health commission reported a     number of    unknown pneumonia             cases related to    huanan     seafood    wholesale  market     27    cases were  hospitalized,

# DistilBERT (Model 2)