In [1]:
import spacy

In [None]:
!pip install en_core_web_sm

In [2]:
nlp= spacy.blank("en")

doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

for token in doc:
    print(token)

Captain
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [3]:
nlp.pipe_names

[]

In [6]:
# !pip install en_core_web_sm. After install this we are using/loading this package now in spacy.

nlp= spacy.load("en_core_web_sm")

> These are all the inbuild features that we are getting when we are loading a pre-defined trained pipeline(**en_core_web_sm**).

In [7]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [9]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f4795480fa0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f4795480f30>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f479531b850>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f4795251460>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f4795258410>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f479531b6d0>)]

>'pos' -> Part of Speech, 'lemma' is the base word.

> Part of Speech is coming from the 'tagger' component of the pipeline.

> Lemma is icmung from the 'lemmatizer' component of the pipeline.

In [10]:
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

for token in doc:
    print(token, " | ", token.pos_, " | ", token.lemma_) # 'pos' -> Part of Speech, 'lemma' is the base word.

Captain  |  PROPN  |  Captain
america  |  PROPN  |  america
ate  |  VERB  |  eat
100  |  NUM  |  100
$  |  NUM  |  $
of  |  ADP  |  of
samosa  |  NOUN  |  samosa
.  |  PUNCT  |  .
Then  |  ADV  |  then
he  |  PRON  |  he
said  |  VERB  |  say
I  |  PRON  |  I
can  |  AUX  |  can
do  |  VERB  |  do
this  |  PRON  |  this
all  |  DET  |  all
day  |  NOUN  |  day
.  |  PUNCT  |  .


# Named Entity Recognition(NER)

In [17]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
for ent in doc.ents:
    print(ent.text, " | ", ent.label_, " | ", spacy.explain(ent.label_))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


## Display it in a fancier way

In [16]:
from spacy import displacy

displacy.render(doc, style="ent")

'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Tesla Inc\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n is going to acquire twitter for \n<mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    $45 billion\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">MONEY</span>\n</mark>\n</div>'

# Blank Pipeline with Custom Components


> If we want custom components in a blank pipeline, then we have to follow the below code. Since I don't want the in-build components that comes with loading a pre-build/trained pipeline.

> The problem this is solving is that someone wants to use only one of the components and not all of them.


In [19]:
source_nlp_pipeline = spacy.load("en_core_web_sm")

nlp= spacy.blank("en")

nlp.add_pipe("ner", source=source_nlp_pipeline) # From the 'en_core_web_sm' pipeline, add only 'ner' component here.
nlp.pipe_names


['ner']

In [20]:
nlp.pipeline

[('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f4794fa8d50>)]

In [22]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
for ent in doc.ents:
    print(ent.text, " | ", ent.label_, " | ", spacy.explain(ent.label_))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit
