In [1]:
import spacy

In [5]:
# Same starting point as in the previous presentation: a blank English
# pipeline, which has no components besides the built-in tokenizer.
nlp = spacy.blank("en")

# Calling the pipeline on a string returns a Doc that we can iterate over.
doc = nlp("I like to study and learn computer science. My favorite subject is quantum AI")

# List the tokens, one per line.
for tok in doc:
    print(tok.text)

I
like
to
study
and
learn
computer
science
.
My
favorite
subject
is
quantum
AI


As we said in the previous presentation, we get a tokenizer without having to add it: it comes with the pipeline.

In [6]:
# A blank pipeline has no components yet: the list of pipe names is empty
nlp.pipe_names

[]

In [7]:
# For many languages, you can download a pre-trained pipeline

In [9]:
# Download the small pre-trained English pipeline.
# spacy.cli.download installs the package with the interpreter that runs this
# kernel, whereas a bare `!python` may resolve to a different Python on PATH
# and install the model into the wrong environment.
spacy.cli.download("en_core_web_sm")

Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


You should consider upgrading via the 'C:\Users\JULIA\anaconda3\envs\nlp\python.exe -m pip install --upgrade pip' command.


In [10]:
# Load the pre-trained English pipeline that was downloaded above
nlp = spacy.load("en_core_web_sm")

If you want to use a language other than English, you can visit: https://spacy.io/usage/training (the pre-trained pipelines available for each language are listed at https://spacy.io/models).

In [12]:
# Unlike the blank pipeline, the pre-trained pipeline comes with several components
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [13]:
# nlp.pipeline pairs each component name with the component object itself
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x27fee30ec40>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x27f9a18be80>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x27f99dd6f90>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x27f9a36bec0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x27f9a3a2780>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x27f99ef70b0>)]

In [18]:
# Run the pre-trained pipeline on the same sentences as before.
doc = nlp("I like to study and learn computer science. My favorite subject is quantum AI")

# Show every token next to its part-of-speech tag and its lemma.
for tok in doc:
    columns = (tok, "|", tok.pos_, "|", tok.lemma_)
    print(*columns)

I | PRON | I
like | VERB | like
to | PART | to
study | VERB | study
and | CCONJ | and
learn | VERB | learn
computer | NOUN | computer
science | NOUN | science
. | PUNCT | .
My | PRON | my
favorite | ADJ | favorite
subject | NOUN | subject
is | AUX | be
quantum | ADJ | quantum
AI | PROPN | AI


- `pos_` means part of speech; it lets us distinguish nouns, verbs, proper nouns, common nouns, etc. For example, you can see that "study" is tagged as a VERB.
- `lemma_` shows the base form of a word; for example, the lemma of "is" is "be".

In [27]:
# Let's try "ner", which stands for named entity recognition.
doc = nlp("Sony, Microsoft and Nintendo are waging a fierce war to determine the future of the video game market which weighs $195.65 billion")

# doc.ents holds the entities found in the text ("ent" is short for entity).
for entity in doc.ents:
    row = (entity.text, "|", entity.label_)
    print(*row)

Sony | ORG
Microsoft | ORG
Nintendo | ORG
$195.65 billion | MONEY


You can see that it detected the three organizations and the monetary amount mentioned in the text.

In [29]:
# spacy.explain() gives a short description of what a label means.
doc = nlp("Sony, Microsoft and Nintendo are waging a fierce war to determine the future of the video game market which weighs $195.65 billion")

# Print each entity with its label and the explanation of that label.
for entity in doc.ents:
    label = entity.label_
    print(entity.text, "|", label, "|", spacy.explain(label))

Sony | ORG | Companies, agencies, institutions, etc.
Microsoft | ORG | Companies, agencies, institutions, etc.
Nintendo | ORG | Companies, agencies, institutions, etc.
$195.65 billion | MONEY | Monetary values, including unit


In [32]:
# displaCy gives a better visualization; style='ent' selects its entity visualizer

from spacy import displacy

displacy.render(doc,style='ent')

# Custom components

Let's start from a blank pipeline and add a custom component to it, like we did in the last presentation.

In [38]:
# Load the pre-trained English pipeline; here it only serves as the source
# of the component we want to reuse.
base_nlp = spacy.load("en_core_web_sm")

# Create a blank English pipeline. "en" is the language code already used for
# the first blank pipeline of this notebook; the previous "eng" spelling only
# resolved to English through spaCy's fallback language-code matching.
nlp = spacy.blank("en")

# Add the trained "ner" component from the English pipeline to the blank one.
nlp.add_pipe("ner", source=base_nlp)

<spacy.pipeline.ner.EntityRecognizer at 0x27fa64926d0>

In [39]:
# The blank pipeline now contains only the "ner" component added above
nlp.pipe_names

['ner']