<a href="https://colab.research.google.com/github/WaryFriend456/NLP/blob/main/nlplab_p7_22BD1A660W_04_04_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 7.	Develop a Named Entity Recognition system: Use a dataset with annotated entities (e.g., persons, organizations).
## i.	Implement an NER model.
## ii.	Evaluate the model's performance.

In [1]:
%pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [2]:
import spacy
import random
from spacy.training import Example
from datasets import load_dataset
from sklearn.metrics import classification_report
from spacy.tokens import DocBin
from spacy.training.iob_utils import offsets_to_biluo_tags

In [3]:
conll = load_dataset("conll2003")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [4]:
ner_label_map = conll["train"].features["ner_tags"].feature.int2str

In [5]:
def convert_conll_to_spacy(conll_dataset):
    spacy_data = []

    for entry in conll_dataset:
        text = " ".join(entry["tokens"])
        entities = []
        start = 0

        for token, label in zip(entry["tokens"], entry["ner_tags"]):
            token_start = text.find(token, start)
            token_end = token_start + len(token)

            if label != 0:
                entity_label = ner_label_map(label)
                entities.append((token_start, token_end, entity_label))

            start = token_end

        spacy_data.append((text, {"entities": entities}))

    return spacy_data

In [6]:
train_data_spacy = convert_conll_to_spacy(conll["train"])
test_data_spacy = convert_conll_to_spacy(conll["test"])

In [7]:
def train_ner(train_data, n_iter=3, model="en_core_web_sm"):
    nlp = spacy.load(model)
    print("Loaded model")
    ner = nlp.get_pipe("ner")


    for _, annotations in train_data:
        for ent in annotations["entities"]:
            ner.add_label(ent[2])

    unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

    optimizer = nlp.resume_training()

    with nlp.disable_pipes(*unaffected_pipes):
        for i in range(n_iter):
            random.shuffle(train_data)
            losses = {}

            for text, annotations in train_data:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)

                nlp.update([example], drop=0.3, losses=losses)

            print(f"Iteration {i+1}: Losses {losses}")

    return nlp

In [8]:
train_data_spacy = train_data_spacy[:2000]
nlp_custom = train_ner(train_data_spacy, n_iter=3)

Loaded model
Iteration 1: Losses {'ner': np.float32(3004.0825)}
Iteration 2: Losses {'ner': np.float32(1628.968)}
Iteration 3: Losses {'ner': np.float32(1087.7428)}


In [9]:
def evaluate_model(nlp, test_data):
    y_true, y_pred = [], []

    for text, annotations in test_data:
        doc = nlp(text)
        true_tags = offsets_to_biluo_tags(doc, annotations["entities"])
        pred_tags = [token.ent_iob_ + ("-" + token.ent_type_ if token.ent_type_ else "")
                     for token in doc]
        y_true.extend(true_tags)
        y_pred.extend(pred_tags)

    print("NER Model Evaluation Report:")
    print(classification_report(y_true, y_pred))

In [10]:
evaluate_model(nlp_custom, test_data_spacy)

NER Model Evaluation Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

     B-B-LOC       0.00      0.20      0.00        10
    B-B-MISC       0.02      0.22      0.03        49
     B-B-ORG       0.00      0.00      0.00        20
     B-B-PER       0.00      0.88      0.01         8
     B-I-LOC       0.00      0.00      0.00         0
    B-I-MISC       0.00      0.00      0.00         3
     B-I-ORG       0.00      0.00      0.00         1
     B-I-PER       0.01      0.71      0.02        17
     I-B-LOC       0.00      0.00      0.00         9
    I-B-MISC       0.14      0.16      0.15        51
     I-B-ORG       0.00      0.00      0.00        19
     I-B-PER       0.12      0.60      0.19        10
    I-I-MISC       0.00      0.00      0.00         3
     I-I-ORG       0.00      0.00      0.00         1
     I-I-PER       0.10      0.05      0.07        19
     L-B-LOC       0.00      0.00      0.00        10
    L-B-MISC       0.00      0.00      0.00        49
     L-B-ORG       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
# Mapping entity labels to their meanings
entity_meanings = {
    "B-PER": "Person",
    "I-PER": "Person",
    "B-ORG": "Organization",
    "I-ORG": "Organization",
    "B-LOC": "Location",
    "I-LOC": "Location",
    "B-MISC": "Miscellaneous",
    "I-MISC": "Miscellaneous"
}

def test_finetuned_ner(nlp_finetuned, text):
    """Test the fine-tuned NER model and provide entity descriptions."""
    doc = nlp_finetuned(text)
    print("Entities found:\n")

    for ent in doc.ents:
        entity_label = ent.label_
        description = entity_meanings.get(entity_label, "Unknown")
        print(f"{ent.text} -> {entity_label} ({description})")

sample_text = "Google was founded by Larry Page and Sergey Brin in 1998 at Stanford University."
test_finetuned_ner(nlp_custom, sample_text)


Entities found:

Larry -> B-PER (Person)
Page -> I-PER (Person)
Sergey -> B-PER (Person)
Brin -> I-PER (Person)
Stanford -> B-ORG (Organization)
University -> I-ORG (Organization)
