In [1]:
import spacy

# Load the small English pipeline once; the cells below all reuse this `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [2]:
nlp.pipeline  # (name, component object) pairs for every step of the loaded pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f7bdd6dc440>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f7bdd6dc2f0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f7bdd576c50>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f7bdd47e460>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f7bdd481370>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f7bdd4bc050>)]

In [3]:
nlp.pipe_names  # component names only, without the component objects

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [4]:
# Company written as "Tesla Inc", target written as lowercase "twitter".
# In the recorded output only "Tesla Inc" and "$45 billion" are recognised.
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# One line per recognised entity: text | label | human-readable label description.
for ent in doc.ents:
    print(
        ent.text,
        " | ",
        ent.label_,
        " | ",
        spacy.explain(ent.label_),
    )

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


In [13]:
from spacy import displacy

# Called without jupyter=True: in this session the call handed back the raw HTML
# markup as a string (see the output below) rather than drawing the highlights.
displacy.render(doc, style="ent")

'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Tesla Inc\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n is going to acquire twitter for \n<mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    $45 billion\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">MONEY</span>\n</mark>\n</div>'

In [12]:
from spacy import displacy

# jupyter=True explicitly enables notebook mode so the entity highlights are
# displayed inline instead of being returned as an HTML string.
displacy.render(doc, style="ent", jupyter=True)

In [14]:
# Same sentence, but with the target written as "Twitter Inc".
# In the recorded output it is now picked up as an ORG as well.
doc = nlp("Tesla Inc is going to acquire Twitter Inc for $45 billion")

# One line per recognised entity: text | label | human-readable label description.
for ent in doc.ents:
    print(
        ent.text,
        " | ",
        ent.label_,
        " | ",
        spacy.explain(ent.label_),
    )

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Twitter Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


In [15]:
from spacy import displacy

# Visualise the entities of the "Twitter Inc" sentence inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [17]:
nlp.pipe_labels['ner']  # entity labels known to this pipeline's 'ner' component

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [18]:
# "Bloomberg" appears twice: once inside a person's name, once as the bare company name.
# In the recorded output the bare company name is labelled GPE rather than ORG.
doc = nlp("Michael Bloomberg founded Bloomberg in 1982")

# One line per recognised entity: text | label | human-readable label description.
for ent in doc.ents:
    print(
        ent.text,
        " | ",
        ent.label_,
        " | ",
        spacy.explain(ent.label_),
    )

Michael Bloomberg  |  PERSON  |  People, including fictional
Bloomberg  |  GPE  |  Countries, cities, states
1982  |  DATE  |  Absolute or relative dates or periods


In [19]:
# Same sentence with the company written as "Bloomberg Inc".
# In the recorded output it is now labelled ORG.
doc = nlp("Michael Bloomberg founded Bloomberg Inc in 1982")

# One line per recognised entity: text | label | human-readable label description.
for ent in doc.ents:
    print(
        ent.text,
        " | ",
        ent.label_,
        " | ",
        spacy.explain(ent.label_),
    )

Michael Bloomberg  |  PERSON  |  People, including fictional
Bloomberg Inc  |  ORG  |  Companies, agencies, institutions, etc.
1982  |  DATE  |  Absolute or relative dates or periods


In [20]:
from spacy import displacy

# Visualise the entities of the "Bloomberg Inc" sentence inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [21]:
doc[2]  # integer index -> the single token at position 2 ("founded" in this sentence)

founded

In [26]:
doc[2:6]  # slice -> tokens 2 through 5; the end index is excluded

founded Bloomberg Inc in

In [25]:
 type(doc[2:6])  # slicing a Doc yields a Span object, not a plain list of tokens

spacy.tokens.span.Span

## Adding Custom entity annotations

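> `doc.set_ents` takes a `default` argument that controls how entity annotation is set for tokens outside of any provided spans. With `default="unmodified"`, those tokens keep whatever entity annotation they already have.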

In [27]:
# Bare company names, no "Inc" suffix.
# In the recorded output "Tesla" is missed; only "Twitter" and "$45 billion" are recognised.
doc = nlp("Tesla is going to acquire Twitter for $45 billion")

# One line per recognised entity: text | label.
for ent in doc.ents:
    print(
        ent.text,
        " | ",
        ent.label_,
    )

Twitter  |  ORG
$45 billion  |  MONEY


In [29]:
from spacy.tokens import Span

# Span(doc, start, end) uses token offsets with an exclusive end:
# tokens [0, 1) are "Tesla" and tokens [5, 6) are "Twitter" in the sentence above.
# "Twitter" was already tagged ORG by the model; "Tesla" was not.
s1 = Span(doc, 0, 1, label="ORG")
s2 = Span(doc, 5, 6, label="ORG")

# default="unmodified" -> tokens outside s1/s2 keep their current entity annotation,
# so the model's "$45 billion" MONEY entity is preserved. Without it, set_ents
# defaults to "outside", which would mark every other token as not an entity.
doc.set_ents([s1, s2], default="unmodified")

In [30]:
# List the entities again after set_ents: the manually added "Tesla" span now
# appears alongside the entities the model had already found.
for ent in doc.ents:
    print(
        ent.text,
        " | ",
        ent.label_,
    )

Tesla  |  ORG
Twitter  |  ORG
$45 billion  |  MONEY


In [31]:
from spacy import displacy

# Visualise the final entity set, including the manually added "Tesla" span.
displacy.render(doc, style="ent", jupyter=True)