Named Entity Recognition (NER) using spaCy

In [1]:
!pip install spacy scikit-learn
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m83.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import spacy
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score


Load the pre-trained spaCy NER model (`en_core_web_sm`)

In [3]:

# Load the "en_core_web_sm" pipeline by name; `nlp` is called on each text
# below to produce the documents whose tokens and entities are inspected.
nlp = spacy.load("en_core_web_sm")


In [4]:
# Sample sentences to run through the NER pipeline.
# Order matters: `ground_truth` lists its label dicts in this same order.
texts = [
    "Narendra Modi addressed the UN in New York on Monday.",
    "Apple was founded by Steve Jobs in California in 1976.",
    "Barack Obama visited India in January.",
    "Google announced a new product in San Francisco on Tuesday."
]


In [5]:
# Hand-written gold labels: one dict per sentence, in the same order as `texts`.
# Each dict maps a token's surface text to its entity label; tokens that are
# not listed are looked up with a default of "O" by the evaluation loop.
# Limitation: because keys are token texts, a word that occurs twice in one
# sentence can only carry a single label.
ground_truth = [
    # "Narendra Modi addressed the UN in New York on Monday."
    {"Narendra": "PERSON", "Modi": "PERSON", "UN": "ORG", "New": "GPE", "York": "GPE", "Monday": "DATE"},
    # "Apple was founded by Steve Jobs in California in 1976."
    {"Apple": "ORG", "Steve": "PERSON", "Jobs": "PERSON", "California": "GPE", "1976": "DATE"},
    # "Barack Obama visited India in January."
    {"Barack": "PERSON", "Obama": "PERSON", "India": "GPE", "January": "DATE"},
    # "Google announced a new product in San Francisco on Tuesday."
    {"Google": "ORG", "San": "GPE", "Francisco": "GPE", "Tuesday": "DATE"},
]


In [6]:
# Two parallel, initially empty lists of token-level labels:
# one entry per token is appended to each by the evaluation loop.
predicted_labels, true_labels = [], []


In [7]:
# Build the token-level gold and predicted label sequences used by the metrics.
# zip() would silently drop unmatched items, so fail loudly on a length mismatch.
assert len(texts) == len(ground_truth), "each text needs exactly one ground-truth dict"

# Start from empty lists in this cell so that re-running it rebuilds the labels
# instead of appending a second copy to whatever a previous run collected.
true_labels = []
predicted_labels = []

for text, gt in zip(texts, ground_truth):
    doc = nlp(text)
    for token in doc:
        # Gold label: looked up by the token's surface text; tokens that are
        # absent from the ground-truth dict count as outside any entity ("O").
        true_labels.append(gt.get(token.text, "O"))

        # Predicted label: `ent_type_` is the label of the entity that covers
        # this exact token position, or "" when the token is outside every
        # entity. Matching by position avoids mislabelling a token merely
        # because its text also appears inside some entity elsewhere in the
        # sentence, and avoids rescanning doc.ents for every token.
        predicted_labels.append(token.ent_type_ or "O")


In [9]:
# Dump the flat list of predicted labels (one per token, all sentences concatenated).
print(predicted_labels)

['PERSON', 'PERSON', 'O', 'O', 'ORG', 'O', 'GPE', 'GPE', 'O', 'DATE', 'O', 'ORG', 'O', 'O', 'O', 'PERSON', 'PERSON', 'O', 'GPE', 'O', 'DATE', 'O', 'PERSON', 'PERSON', 'O', 'GPE', 'O', 'DATE', 'O', 'ORG', 'O', 'O', 'O', 'O', 'O', 'GPE', 'GPE', 'O', 'DATE', 'O']


In [8]:
# Token-level summary scores. Every token is scored, including those labelled
# "O", so the numbers reflect non-entity tokens as well as entity tokens.
# average="weighted" averages the per-class scores weighted by class support;
# zero_division=0 reports 0 for a class whose score would divide by zero.
weighted_kwargs = {"average": "weighted", "zero_division": 0}
summary_metrics = (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)

print("Accuracy:", accuracy_score(true_labels, predicted_labels))
for heading, scorer in summary_metrics:
    print(heading, scorer(true_labels, predicted_labels, **weighted_kwargs))


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [10]:
# Per-label breakdown (precision, recall, F1, support) for the same token-level
# label sequences; zero_division=0 reports 0 where a score is undefined.
print(classification_report(true_labels, predicted_labels, zero_division=0))


              precision    recall  f1-score   support

        DATE       1.00      1.00      1.00         4
         GPE       1.00      1.00      1.00         6
           O       1.00      1.00      1.00        21
         ORG       1.00      1.00      1.00         3
      PERSON       1.00      1.00      1.00         6

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40



In [11]:
# List the entity spans spaCy finds in each sentence as "<span text> -> <label>".
# map() is lazy, so each sentence is parsed right before its results are printed.
for text, doc in zip(texts, map(nlp, texts)):
    print("\nText:", text)
    for ent in doc.ents:
        print(ent.text, "->", ent.label_)



Text: Narendra Modi addressed the UN in New York on Monday.
Narendra Modi -> PERSON
UN -> ORG
New York -> GPE
Monday -> DATE

Text: Apple was founded by Steve Jobs in California in 1976.
Apple -> ORG
Steve Jobs -> PERSON
California -> GPE
1976 -> DATE

Text: Barack Obama visited India in January.
Barack Obama -> PERSON
India -> GPE
January -> DATE

Text: Google announced a new product in San Francisco on Tuesday.
Google -> ORG
San Francisco -> GPE
Tuesday -> DATE
