NLTK Named Entity Extraction

This basic demonstration shows how to use the NLTK library to extract named entities. It is somewhat more complicated than spaCy because you have to do the preprocessing (tokenization and part-of-speech tagging) yourself.


In [2]:
# In this cell we set up the tokenization and part-of-speech (POS) tagging provided by NLTK.
# From these we will be able to assign named entity tags to the (word, POS tag) tuples.

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Sample text used by every cell in this notebook: four short sentences that
# mention companies, a percentage and several money amounts.
ex = """Samsung ships 73 percent more phones, makes much less money. 
The issue is not just a deceptive portrayal of percentages of growth. 
Samsung's calendar Q2 mobile revenues from its IM segment were reported to be 22.67 Trillion KRW ($20 billion). 
Apple's net sales for the same quarter were more than double that: $53.265 billion. 
"""


def preprocess(sent):
    """Tokenize text into words and attach a part-of-speech tag to each.

    The text is first split with ``nltk.word_tokenize`` and the tokens are
    then passed to ``nltk.pos_tag``; that tagged result is returned (the
    notebook output shows it as a list of ``(word, tag)`` tuples).
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

# Tokenize and POS-tag the sample text. The bare `sent` on the last line makes
# the notebook display the resulting (word, tag) tuples.
sent = preprocess(ex)
sent

[('Samsung', 'NNP'),
 ('ships', 'VBZ'),
 ('73', 'CD'),
 ('percent', 'NN'),
 ('more', 'JJR'),
 ('phones', 'NNS'),
 (',', ','),
 ('makes', 'VBZ'),
 ('much', 'RB'),
 ('less', 'JJR'),
 ('money', 'NN'),
 ('.', '.'),
 ('The', 'DT'),
 ('issue', 'NN'),
 ('is', 'VBZ'),
 ('not', 'RB'),
 ('just', 'RB'),
 ('a', 'DT'),
 ('deceptive', 'JJ'),
 ('portrayal', 'NN'),
 ('of', 'IN'),
 ('percentages', 'NNS'),
 ('of', 'IN'),
 ('growth', 'NN'),
 ('.', '.'),
 ('Samsung', 'NNP'),
 ("'s", 'POS'),
 ('calendar', 'NN'),
 ('Q2', 'NNP'),
 ('mobile', 'NN'),
 ('revenues', 'NNS'),
 ('from', 'IN'),
 ('its', 'PRP$'),
 ('IM', 'NNP'),
 ('segment', 'NN'),
 ('were', 'VBD'),
 ('reported', 'VBN'),
 ('to', 'TO'),
 ('be', 'VB'),
 ('22.67', 'CD'),
 ('Trillion', 'NNP'),
 ('KRW', 'NNP'),
 ('(', '('),
 ('$', '$'),
 ('20', 'CD'),
 ('billion', 'CD'),
 (')', ')'),
 ('.', '.'),
 ('Apple', 'NNP'),
 ("'s", 'POS'),
 ('net', 'JJ'),
 ('sales', 'NNS'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('same', 'JJ'),
 ('quarter', 'NN'),
 ('were', 'VBD'),
 ('m

In [3]:
# Here we define a chunk grammar and use it to extract chunks from the POS-tagged tokens.
# The NP rule matches noun phrases made of an optional determiner (<DT>?), any number of
# adjectives (<JJ>*) and a single noun tagged NN (<NN>). Plural (NNS) and proper (NNP) nouns
# do not match this rule, which is why words such as "phones" and "Samsung" stay unchunked.

pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  Samsung/NNP
  ships/VBZ
  73/CD
  (NP percent/NN)
  more/JJR
  phones/NNS
  ,/,
  makes/VBZ
  much/RB
  less/JJR
  (NP money/NN)
  ./.
  (NP The/DT issue/NN)
  is/VBZ
  not/RB
  just/RB
  (NP a/DT deceptive/JJ portrayal/NN)
  of/IN
  percentages/NNS
  of/IN
  (NP growth/NN)
  ./.
  Samsung/NNP
  's/POS
  (NP calendar/NN)
  Q2/NNP
  (NP mobile/NN)
  revenues/NNS
  from/IN
  its/PRP$
  IM/NNP
  (NP segment/NN)
  were/VBD
  reported/VBN
  to/TO
  be/VB
  22.67/CD
  Trillion/NNP
  KRW/NNP
  (/(
  $/$
  20/CD
  billion/CD
  )/)
  ./.
  Apple/NNP
  's/POS
  net/JJ
  sales/NNS
  for/IN
  (NP the/DT same/JJ quarter/NN)
  were/VBD
  more/JJR
  than/IN
  double/JJ
  that/IN
  :/:
  $/$
  53.265/CD
  billion/CD
  ./.)


In [4]:
# Here we convert the chunk tree into CoNLL-style (word, POS tag, IOB tag) triples.
# The IOB tag says where a token sits relative to a chunk: B-NP marks the first token of
# an NP chunk, I-NP a token inside (continuing) that chunk, and O a token outside any chunk.
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('Samsung', 'NNP', 'O'),
 ('ships', 'VBZ', 'O'),
 ('73', 'CD', 'O'),
 ('percent', 'NN', 'B-NP'),
 ('more', 'JJR', 'O'),
 ('phones', 'NNS', 'O'),
 (',', ',', 'O'),
 ('makes', 'VBZ', 'O'),
 ('much', 'RB', 'O'),
 ('less', 'JJR', 'O'),
 ('money', 'NN', 'B-NP'),
 ('.', '.', 'O'),
 ('The', 'DT', 'B-NP'),
 ('issue', 'NN', 'I-NP'),
 ('is', 'VBZ', 'O'),
 ('not', 'RB', 'O'),
 ('just', 'RB', 'O'),
 ('a', 'DT', 'B-NP'),
 ('deceptive', 'JJ', 'I-NP'),
 ('portrayal', 'NN', 'I-NP'),
 ('of', 'IN', 'O'),
 ('percentages', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('growth', 'NN', 'B-NP'),
 ('.', '.', 'O'),
 ('Samsung', 'NNP', 'O'),
 ("'s", 'POS', 'O'),
 ('calendar', 'NN', 'B-NP'),
 ('Q2', 'NNP', 'O'),
 ('mobile', 'NN', 'B-NP'),
 ('revenues', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('its', 'PRP$', 'O'),
 ('IM', 'NNP', 'O'),
 ('segment', 'NN', 'B-NP'),
 ('were', 'VBD', 'O'),
 ('reported', 'VBN', 'O'),
 ('to', 'TO', 'O'),
 ('be', 'VB', 'O'),
 ('22.67', 'CD', 'O'),
 ('Trillion', 'NNP', 'O'),
 ('KRW', 'NNP', 'O'),
 ('('

In [5]:
# Here we run named entity extraction over the text.
# The text is tokenized, then part-of-speech tagged, and finally named entities are identified.
# Notice that words like Samsung and Apple are now wrapped in an extra entity label.
# In this output Samsung is labelled GPE (geo-political entity) once and PERSON once, and
# Apple is labelled PERSON: not right, but they are at least recognized as something special.

ne_tree =  nltk.ne_chunk(pos_tag(word_tokenize(ex)))
print(ne_tree)

(S
  (GPE Samsung/NNP)
  ships/VBZ
  73/CD
  percent/NN
  more/JJR
  phones/NNS
  ,/,
  makes/VBZ
  much/RB
  less/JJR
  money/NN
  ./.
  The/DT
  issue/NN
  is/VBZ
  not/RB
  just/RB
  a/DT
  deceptive/JJ
  portrayal/NN
  of/IN
  percentages/NNS
  of/IN
  growth/NN
  ./.
  (PERSON Samsung/NNP)
  's/POS
  calendar/NN
  Q2/NNP
  mobile/NN
  revenues/NNS
  from/IN
  its/PRP$
  IM/NNP
  segment/NN
  were/VBD
  reported/VBN
  to/TO
  be/VB
  22.67/CD
  Trillion/NNP
  KRW/NNP
  (/(
  $/$
  20/CD
  billion/CD
  )/)
  ./.
  (PERSON Apple/NNP)
  's/POS
  net/JJ
  sales/NNS
  for/IN
  the/DT
  same/JJ
  quarter/NN
  were/VBD
  more/JJR
  than/IN
  double/JJ
  that/IN
  :/:
  $/$
  53.265/CD
  billion/CD
  ./.)


spaCy - Next we will try a more sophisticated named entity extraction library.

In [6]:
# The models that you can install for spaCy include named entity recognition,
# although you should check what is available if you are working with a language other than English.
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the installed en_core_web_sm model; `nlp` is the pipeline applied to the text in the cells below.
nlp = en_core_web_sm.load()

In [7]:
# Here we apply spaCy to the text and then look at the `ents` property of the resulting document object.
# It holds the entities that were found; for each one we print its text and its label.
# Notice anything off with the returned entities? (Hint: look at what the newline characters are labelled as.)
doc = nlp(ex)
pprint([(X.text, X.label_) for X in doc.ents])

[('Samsung', 'ORG'),
 ('73 percent', 'PERCENT'),
 ('\n', 'GPE'),
 ('\n', 'GPE'),
 ('Samsung', 'ORG'),
 ('IM', 'ORG'),
 ('22.67 Trillion KRW', 'MONEY'),
 ('$20 billion', 'MONEY'),
 ('Apple', 'ORG'),
 ('the same quarter', 'DATE'),
 ('$53.265 billion', 'MONEY'),
 ('\n', 'GPE')]


In [8]:
# You can also use displaCy to display a marked-up version of the text with the entity tags applied.
# style='ent' selects the entity view and jupyter=True asks for rendering inside the notebook.
displacy.render(nlp(ex), jupyter=True, style='ent')