In [5]:
import nltk
import spacy

# Make sure the Reuters corpus is present in the local nltk_data directory;
# the call reports (and returns True) when the package is already up to date.
nltk.download("reuters")

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/alextanhongpin/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

### Named-entity recognition

In [7]:
# Load spaCy's small English model, then list its (name, component) stages
# in processing order, one per line.
nlp = spacy.load("en_core_web_sm")
print("\n".join(str(stage) for stage in nlp.pipeline))

('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x12f9d2c70>)
('tagger', <spacy.pipeline.tagger.Tagger object at 0x12fb9d3b0>)
('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x12f9e66a0>)
('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x12fba00c0>)
('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x12fb56440>)
('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x12f9d4d60>)


In [9]:
# Short Reuters-style passage used to try out the statistical NER component.
text = """Hughes Tool Co Chairman W.A. Kistler said its merger with
Baker International Corp was still under consideration.
We hope to come soon to a mutual agreement, Kistler said.
The directors of Baker filed a law suit in Texas to force Hughes
to complete the merger."""

doc = nlp(text)

# One (entity text, entity label) pair per output line.
print("\n".join(str((ent.text, ent.label_)) for ent in doc.ents))

('Hughes Tool Co', 'ORG')
('W.A. Kistler', 'PERSON')
('Baker International Corp', 'ORG')
('Kistler', 'ORG')
('Baker', 'PERSON')
('Texas', 'GPE')
('Hughes', 'ORG')


In [11]:
from spacy import displacy

# Visualise the entities found in `doc` using displaCy's "ent" style.
displacy.render(doc, style="ent")

### Blueprint: Using Rule-Based Named-Entity Recognition

In [30]:
from spacy.pipeline import EntityRuler

# Department names that the rules below accept as government departments.
departments = ["Justice", "Transportation"]

# Token patterns for the entity ruler; every match is labelled "GOV".
patterns = [
    {
        # "[U.S.] Department of <department>" -- the department token must
        # already carry the ORG entity type, and "U.S." is optional.
        "label": "GOV",
        "pattern": [
            {"TEXT": "U.S.", "OP": "?"},
            {"TEXT": "Department"},
            {"TEXT": "of"},
            {"TEXT": {"IN": departments}, "ENT_TYPE": "ORG"},
        ],
    },
    {
        # "[U.S.] <department> Department", e.g. "Justice Department".
        "label": "GOV",
        "pattern": [
            {"TEXT": "U.S.", "OP": "?"},
            {"TEXT": {"IN": departments}, "ENT_TYPE": "ORG"},
            {"TEXT": "Department"},
        ],
    },
    {
        # Exact phrase "Securities and Exchange Commission". TEXT matching is
        # literal, so the spelling here must match the text being processed.
        "label": "GOV",
        "pattern": [
            {"TEXT": "Securities"},
            {"TEXT": "and"},
            {"TEXT": "Exchange"},
            {"TEXT": "Commission"},
        ],
    },
]

In [42]:
# Drop a ruler left over from an earlier run so this cell can be re-executed.
if nlp.has_pipe("entity_ruler"):
    nlp.remove_pipe("entity_ruler")

# Add a fresh ruler with overwrite_ents enabled, then load the GOV patterns.
entity_ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
entity_ruler.add_patterns(patterns)

In [43]:
# Sample text with several department-style names plus "Sales Department",
# which the text itself states is not a government organisation.
text = """Justice Department is an alias for the U.S. Department of Justice.
Department of Transportation and the Securities and Exchange Commission
are government organisations, but the Sales Department is not."""

doc = nlp(text)
# Render the resulting entities; jupyter=True is passed explicitly here.
displacy.render(doc, style="ent", jupyter=True)

### Blueprint: Normalizing Named Entities

In [44]:
text = "Baker International's shares climbed on the New York Stock Exchange."

doc = nlp(text)

# For each entity, show the texts of its tokens together with its label,
# one entity per line.
print(
    "\n".join(
        str(([token.text for token in ent], ent.label_)) for ent in doc.ents
    )
)

(['Baker', 'International', "'s"], 'ORG')
(['the', 'New', 'York', 'Stock', 'Exchange'], 'ORG')


In [47]:
from spacy import Language
from spacy.tokens import Span


@Language.component("norm_entities")
def norm_entities(doc):
    """Trim a leading determiner and a trailing particle from every entity.

    Registered as the ``norm_entities`` pipeline component. Each span in
    ``doc.ents`` loses its first token when that token's ``pos_`` is ``"DET"``
    (e.g. "the") and its last token when that token's ``pos_`` is ``"PART"``
    (e.g. "'s"). Entities left with no tokens after trimming are dropped;
    entities that need no trimming are kept as the original span objects.

    Args:
        doc: The document whose ``ents`` are normalised.

    Returns:
        The same ``doc``, with ``doc.ents`` replaced by the trimmed spans.
    """
    ents = []
    for ent in doc.ents:
        start, end = ent.start, ent.end
        if doc[start].pos_ == "DET":  # leading article
            start += 1
        # Only inspect the last token if something is left after the first trim.
        if start < end and doc[end - 1].pos_ == "PART":  # trailing particle like 's
            end -= 1
        if start >= end:
            # Trimming consumed the whole entity; never emit an empty span.
            continue
        if (start, end) != (ent.start, ent.end):
            ent = Span(doc, start, end, label=ent.label)
        ents.append(ent)
    doc.ents = tuple(ents)
    return doc

In [49]:
# Remove a previously added component first so this cell can be re-run;
# adding a second component under the same name is rejected by add_pipe.
if nlp.has_pipe("norm_entities"):
    nlp.remove_pipe("norm_entities")
nlp.add_pipe("norm_entities")
doc = nlp(text)

# Token texts and label per entity, after normalisation.
print(*[([t.text for t in e], e.label_) for e in doc.ents], sep="\n")

(['Baker', 'International'], 'ORG')
(['New', 'York', 'Stock', 'Exchange'], 'ORG')


**Merging Entity Tokens**

In [51]:
from spacy.pipeline import merge_entities

# Re-runnable setup: drop an existing merge_entities component before adding
# it again. The result of remove_pipe is deliberately not bound to `_`, which
# IPython uses for the last cell output.
if nlp.has_pipe("merge_entities"):
    nlp.remove_pipe("merge_entities")
nlp.add_pipe("merge_entities")

doc = nlp(text)
# Print (text, entity type) for every token that carries an entity type.
print(*[(t.text, t.ent_type_) for t in doc if t.ent_type_ != ""])

('Baker International', 'ORG') ('New York Stock Exchange', 'ORG')
