# spaCy Tokenization

In [50]:
# Uncomment to install spaCy from inside the notebook
# (`%pip install spacy` is the variant that targets the running kernel's environment).
#!pip install spacy

In [3]:
import spacy

# spacy.blank("en") builds an English pipeline that holds only a tokenizer;
# no trained components are attached to it.
nlp = spacy.blank("en")

# Calling the pipeline on raw text yields a Doc, which iterates token by token.
sentence = "Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate."
doc = nlp(sentence)

for token in doc:
    print(token.text)

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate
.


In [6]:
# A Doc supports integer indexing; position 0 is the first token.
print(doc[0])

Dr.


In [7]:
# Keep a handle on the second token and print its raw text via `.text`.
token = doc[1]
print(token.text)

Strange


In [8]:
# List every attribute and method name exposed by the token object.
print(dir(token))

['_', '__bytes__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', 'ancestors', 'check_flag', 'children', 'cluster', 'conjuncts', 'dep', 'dep_', 'doc', 'ent_id', 'ent_id_', 'ent_iob', 'ent_iob_', 'ent_kb_id', 'ent_kb_id_', 'ent_type', 'ent_type_', 'get_extension', 'has_dep', 'has_extension', 'has_head', 'has_morph', 'has_vector', 'head', 'i', 'idx', 'iob_strings', 'is_alpha', 'is_ancestor', 'is_ascii', 'is_bracket', 'is_currency', 'is_digit', 'is_left_punct', 'is_lower', 'is_oov', 'is_punct', 'is_quote', 'is_right_punct', 'is_sent_end', 'is_sent_start', 'is_space', 'is_stop', 'is_title', 'is_upper', 'lang', 'lang_', 'left_edge', 'lefts', 'lemma', 'lemma_', 'lex', 'lex_id', 'like_email', 'like

In [9]:
# Class of the pipeline object created by spacy.blank("en").
print(type(nlp))

<class 'spacy.lang.en.English'>


In [10]:
# Class of the object returned by calling the pipeline on text.
print(type(doc))

<class 'spacy.tokens.doc.Doc'>


In [11]:
# Class of a single element of the Doc.
print(type(token))

<class 'spacy.tokens.token.Token'>


In [12]:
nlp.pipe_names # empty list: the blank pipeline has no components

[]

# Token attributes

In [13]:
# Tokenize a new sentence and grab its first token for attribute inspection.
doc = nlp("Tony gave two $ to Peter.")
token0 = doc[0]
token0

Tony

In [14]:
# is_alpha: whether the token text consists of alphabetic characters.
token0.is_alpha

True

In [15]:
# like_num: whether the token resembles a number.
token0.like_num

False

In [16]:
# Third token of the sentence.
token2 = doc[2]
token2

two

In [17]:
# like_num is also True for a spelled-out number such as "two".
token2.like_num

True

In [18]:
# Fourth token of the sentence: the "$" symbol.
token3 = doc[3]
token3

$

In [19]:
# A currency symbol on its own is not number-like.
token3.like_num

False

In [20]:
# is_currency flags currency symbols such as "$".
token3.is_currency

True

In [23]:
# One row per token: its position in the Doc followed by a few lexical flags.
for token in doc:
    fields = (
        token, "==>",
        "index:", token.i,
        ",     is_alpha:", token.is_alpha,
        "     is_punct:", token.is_punct,
        "     like_num:", token.like_num,
        "    is_currency:", token.is_currency,
    )
    # print() joins the fields with single spaces.
    print(*fields)

Tony ==> index: 0 ,     is_alpha: True      is_punct: False      like_num: False     is_currency: False
gave ==> index: 1 ,     is_alpha: True      is_punct: False      like_num: False     is_currency: False
two ==> index: 2 ,     is_alpha: True      is_punct: False      like_num: True     is_currency: False
$ ==> index: 3 ,     is_alpha: False      is_punct: False      like_num: False     is_currency: True
to ==> index: 4 ,     is_alpha: True      is_punct: False      like_num: False     is_currency: False
Peter ==> index: 5 ,     is_alpha: True      is_punct: False      like_num: False     is_currency: False
. ==> index: 6 ,     is_alpha: False      is_punct: True      like_num: False     is_currency: False


# Extracting email addresses from text

In [38]:
doc2 = nlp("Virat   5 June, 1882    virat@kohli.com")
# Keep the text of every token that spaCy flags as looking like an email address.
emails = [token.text for token in doc2 if token.like_email]
emails

['virat@kohli.com']

# Blank nlp pipeline

In [24]:
import spacy

# Fresh blank English pipeline: tokenizer only.
nlp = spacy.blank("en")

doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

# Print each token on its own line.
print(*doc, sep="\n")

Captain
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [25]:
nlp.pipe_names

# An empty list means the pipeline has no components. A pipeline always starts with a tokenizer, which is not listed here.

[]

# Download trained pipeline
- To download a trained pipeline, run a command such as:

python -m spacy download en_core_web_sm

This downloads the small (`sm`) pipeline for the English language.

In [42]:
# Uncomment to download the small English pipeline from inside the notebook.
# !python -m spacy download en_core_web_sm

In [30]:
# Load the pretrained small English pipeline and list the names of its components.
nlp = spacy.load("en_core_web_sm") # pretrained model
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [31]:
# (name, component object) pairs for each stage of the loaded pipeline.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1a27cc1ffa0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1a27cc1f8e0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1a205c88a50>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1a205af3380>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1a205a86600>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1a205c885f0>)]

In [32]:
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

# One row per token: text, human-readable part-of-speech description, lemma.
for token in doc:
    pos_description = spacy.explain(token.pos_)
    print(f"{token.text}  |  {pos_description}  |  {token.lemma_}")

Captain  |  proper noun  |  Captain
america  |  proper noun  |  america
ate  |  verb  |  eat
100  |  numeral  |  100
$  |  numeral  |  $
of  |  adposition  |  of
samosa  |  proper noun  |  samosa
.  |  punctuation  |  .
Then  |  adverb  |  then
he  |  pronoun  |  he
said  |  verb  |  say
I  |  pronoun  |  I
can  |  auxiliary  |  can
do  |  verb  |  do
this  |  pronoun  |  this
all  |  determiner  |  all
day  |  noun  |  day
.  |  punctuation  |  .


# Adding a component to a blank pipeline

In [40]:
# Load the trained pipeline that will donate a component.
source_nlp = spacy.load("en_core_web_sm")

# Start from an empty pipeline and add only the NER component, taken from source_nlp.
blank_nlp = spacy.blank("en")
blank_nlp.add_pipe("ner", source=source_nlp) # `source=` takes the component from source_nlp instead of creating a new one
blank_nlp.pipe_names # blank_nlp now lists only the NER component

['ner']

# Named Entity Recognition

In [35]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# For every recognized entity, show its text, its label, and what that label means.
for entity in doc.ents:
    label = entity.label_
    print(entity.text, label, spacy.explain(label), sep="  |  ")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


In [36]:
from spacy import displacy

# Render the document with its named entities visualized (style="ent").
displacy.render(doc, style="ent")

In [37]:
doc1 = nlp("Bloomberg founded a data company Bloomberg")

# The same word can receive different labels depending on where it appears in the sentence.
for entity in doc1.ents:
    print(f"{entity.text}  |  {entity.label_}  | ")

Bloomberg  |  PERSON  | 
Bloomberg  |  ORG  | 


In [43]:
# Load the pretrained pipeline again and list its components.
nlp = spacy.load("en_core_web_sm")
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [44]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# "twitter" is not recognized as a company here, possibly because it is
# written in lowercase and is not followed by "Inc".
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep="  |  ")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


In [45]:
doc = nlp("Tesla Inc is going to acquire Twitter for $45 billion")

# Capitalized "Twitter" is now picked up, but as a PRODUCT rather than an ORG,
# possibly because "Inc" is missing after it.
for ent in doc.ents:
    print(f"{ent.text}  |  {ent.label_}  |  {spacy.explain(ent.label_)}")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Twitter  |  PRODUCT  |  Objects, vehicles, foods, etc. (not services)
$45 billion  |  MONEY  |  Monetary values, including unit


In [46]:
doc = nlp("Tesla Inc is going to acquire Twitter Inc for $45 billion")

# With the "Inc" suffix, "Twitter Inc" is recognized as a company (ORG).
for ent in doc.ents:
    label_meaning = spacy.explain(ent.label_)
    print(ent.text, ent.label_, label_meaning, sep="  |  ")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Twitter Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


In [47]:
from spacy import displacy

# Visualize the entities of the most recently processed document.
displacy.render(doc, style="ent")

# List all the entity labels provided by the pretrained model ("en_core_web_sm")

In [49]:
# Labels registered for the pipeline's 'ner' component.
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [53]:
doc = nlp("Michael Bloomberg founded Bloomberg in 1982")

# The second "Bloomberg" is wrongly recognized as a PERSON, although it is a company.
for ent in doc.ents:
    print(f"{ent.text} | {ent.label_} | {spacy.explain(ent.label_)}")

Michael Bloomberg | PERSON | People, including fictional
Bloomberg | PERSON | People, including fictional
1982 | DATE | Absolute or relative dates or periods


In [54]:
doc = nlp("Michael Bloomberg founded Bloomberg Inc in 1982")

# With the "Inc" suffix, "Bloomberg Inc" is now recognized as a company (ORG).
for ent in doc.ents:
    label = ent.label_
    print(ent.text, label, spacy.explain(label), sep=" | ")

Michael Bloomberg | PERSON | People, including fictional
Bloomberg Inc | ORG | Companies, agencies, institutions, etc.
1982 | DATE | Absolute or relative dates or periods


# spaCy's pretrained NER is not perfect

# We can also try a Hugging Face model (BERT base NER)
Ref Link - https://huggingface.co/dslim/bert-base-NER?text=Michael+Bloomberg+founded+Bloomberg+in+1982

# But it also has limitations

# So we may want to define custom entities

In [55]:
doc = nlp("Tesla is going to acquire Twitter for $45 billion")

# Show each recognized entity together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_, sep="  |  ")

Tesla  |  ORG
Twitter  |  PRODUCT
$45 billion  |  MONEY


In [58]:
doc = nlp("tesla is going to acquire Twitter for $45 billion")

# Same sentence with a lowercase "tesla": list the entities and their labels.
for ent in doc.ents:
    print(f"{ent.text}  |  {ent.label_}")

tesla  |  ORG
Twitter  |  PRODUCT
$45 billion  |  MONEY


In [60]:
# Indexing a Doc with an integer yields a single Token.
print(doc[0]) # individual token

print(type(doc[0])) # the Token class

tesla
<class 'spacy.tokens.token.Token'>


In [63]:
# Slicing a Doc yields a Span (a spaCy class) covering the selected run of tokens.

s = doc[0:3]
print(s)
print(type(s))

tesla is going
<class 'spacy.tokens.span.Span'>


In [69]:
from spacy.tokens import Span

# Span(doc, start, end, label=...) covers tokens from index `start` up to, but not including, `end`.
s1 = Span(doc, 0, 1, label="ORG") # span 1: "tesla", explicitly labelled ORG
s2 = Span(doc, 5, 6, label="ORG") # span 2: "Twitter", explicitly labelled ORG

# Set these spans as entities; default="unmodified" leaves the other entities as they were.
doc.set_ents([s1, s2], default="unmodified")

In [70]:
# Display the document itself.
doc

tesla is going to acquire Twitter for $45 billion

In [71]:
# List the entities again to confirm the manually assigned ORG labels on
# "tesla" and "Twitter" (the stray "s" before `for` was a SyntaxError).
for ent in doc.ents:
    print(ent.text, " | ", ent.label_)

tesla  |  ORG
Twitter  |  ORG
$45 billion  |  MONEY
