 # NER
AI Solution Architect | CTO and Co-founder at Treeleaf/Anydone

## NER with nltk

In [2]:
import nltk
from nltk.chunk import ne_chunk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("maxent_ne_chunker_tab")
nltk.download("words")

[nltk_data] Downloading package punkt to /Users/ashokpant/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ashokpant/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /Users/ashokpant/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to /Users/ashokpant/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [4]:
text = "Elon Musk is the CEO of Tesla and SpaceX, and he was born in South Africa."

# Tokenize and Part-of-Speech (POS) tagging
words = word_tokenize(text)
pos_tags = pos_tag(words)

tree = ne_chunk(pos_tags)

print("Words", words)
print("Pos", pos_tags)
print("Tree", tree)
for subtree in tree:
    if hasattr(subtree, "label"):
        entity_name = " ".join([token for token, pos in subtree.leaves()])
        entity_type = subtree.label()
        print(f"{entity_name} - {entity_type}")
# Elon - PERSON
# Musk - ORGANIZATION
# CEO of Tesla - ORGANIZATION
# SpaceX - ORGANIZATION
# South Africa - GPE

Words ['Elon', 'Musk', 'is', 'the', 'CEO', 'of', 'Tesla', 'and', 'SpaceX', ',', 'and', 'he', 'was', 'born', 'in', 'South', 'Africa', '.']
Pos [('Elon', 'NNP'), ('Musk', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('CEO', 'NN'), ('of', 'IN'), ('Tesla', 'NNP'), ('and', 'CC'), ('SpaceX', 'NNP'), (',', ','), ('and', 'CC'), ('he', 'PRP'), ('was', 'VBD'), ('born', 'VBN'), ('in', 'IN'), ('South', 'NNP'), ('Africa', 'NNP'), ('.', '.')]
Tree (S
  (PERSON Elon/NNP)
  (ORGANIZATION Musk/NNP)
  is/VBZ
  the/DT
  (ORGANIZATION CEO/NN of/IN Tesla/NNP)
  and/CC
  (ORGANIZATION SpaceX/NNP)
  ,/,
  and/CC
  he/PRP
  was/VBD
  born/VBN
  in/IN
  (GPE South/NNP Africa/NNP)
  ./.)
Elon - PERSON
Musk - ORGANIZATION
CEO of Tesla - ORGANIZATION
SpaceX - ORGANIZATION
South Africa - GPE


## NER with Spacy

In [5]:
!pip install -q spacy
!python -m spacy download en_core_web_sm --quiet

[33mDEPRECATION: Loading egg at /Users/ashokpant/miniconda3/envs/ml/lib/python3.12/site-packages/grpcio-1.69.0-py3.12-macosx-11.0-arm64.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /Users/ashokpant/miniconda3/envs/ml/lib/python3.12/site-packages/google_resumable_media-2.7.2-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /Users/ashokpant/miniconda3/envs/ml/lib/python3.12/site-packages/Deprecated-1.2.15-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/1

In [13]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "Elon Musk is the CEO of Tesla and SpaceX, and he was born in South Africa."
text = "Nabin Gyawali is smart and CEO of Extra Pvt. Ltd.. He lives in Kathmandu, Nepal"
doc = nlp(text)

print("Entities, their labels, and explanations:")
for ent in doc.ents:
    print(f"{ent.text} - {ent.label_} ({spacy.explain(ent.label_)})")
#
# Entities, their labels, and explanations:
# Elon Musk - PERSON (People, including fictional)
# Tesla - ORG (Companies, agencies, institutions, etc.)
# SpaceX - PERSON (People, including fictional)
# South Africa - GPE (Countries, cities, states)

Entities, their labels, and explanations:
Nabin Gyawali - PERSON (People, including fictional)
Extra Pvt. Ltd - ORG (Companies, agencies, institutions, etc.)
Kathmandu - GPE (Countries, cities, states)
Nepal - GPE (Countries, cities, states)
