In [None]:
import spacy
# Load the 'en_core_web_sm' pipeline; the resulting `nlp` object is what the
# cells below call to turn raw text into Doc objects.
nlp = spacy.load('en_core_web_sm')

In [None]:
# write a function to display basic entity info:
def show_ents(doc):
  """Print one line per named entity in `doc`.

  Each line has the form ``<text> - <label> - <explanation>``, where the
  explanation is whatever ``spacy.explain`` returns for the label (printed
  as ``None`` when it returns nothing). If `doc.ents` is empty, a single
  'No named entities found.' line is printed instead.
  """
  # Guard clause: nothing to list.
  if not doc.ents:
    print('No named entities found.')
    return
  for ent in doc.ents:
    print(f'{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}')

In [None]:
# The sentence uses lower-cased place names and the word "May" twice
# (once as a verb, once as a month) to see what the model picks up.
doc = nlp(u'May i go to washington,dc next May to see the washington monument?')
show_ents(doc)

washington - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
washington - GPE - Countries, cities, states


In [None]:
# New example text; its entities are inspected in the next cell.
doc = nlp(u'can i please borrow 500 dollars from you to buy some microsoft stock?')

In [None]:
# For every entity print its text, its token span, its character span and its label.
for ent in doc.ents:
  print(
    ent.text,
    ent.start, ent.end,            # token offsets
    ent.start_char, ent.end_char,  # character offsets
    ent.label_,
  )

500 dollars 4 6 20 31 MONEY


In [None]:
# In the output below "Tesla" is not recognised as an entity; the next
# section adds it by hand.
doc = nlp(u'Tesla to go build a U.K. factory for $6 million')
show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


# **Adding a Named Entity to a span**

In [None]:
from spacy.tokens import Span

# Look up the hash value of the ORG entity label in the pipeline's vocabulary
ORG = nlp.vocab.strings[u'ORG']

# Build a Span over tokens [0, 1), i.e. the first token ("Tesla"), labelled ORG
new_ent = Span(doc, 0, 1, label=ORG)

# Rebuild the entity sequence with the new span appended and assign it back
doc.ents = [*doc.ents, new_ent]

In [None]:
# List the entities again: the manually added "Tesla" span now appears
# alongside the ones found by the model.
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [None]:
# Adjacent string literals are joined with nothing in between, so the first
# literal must end with a space; otherwise the text reads "cleaner.If" and
# the two sentences run together.
doc = nlp(u'Our company plants to introduce a new vacuum cleaner. '
          u'If successful, the vacuum cleaner will be our first product.')
show_ents(doc)

first - ORDINAL - "first", "second", etc.


In [None]:
# Import PhraseMatcher and create a matcher object that shares the pipeline's vocab:
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)


In [None]:
# Create the desired phrase patterns.
# nlp.make_doc runs only the tokenizer. The matcher was created with its
# default settings (matching on token text), so the tagger/parser/NER
# results that nlp(text) would also compute are not needed here.
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [None]:
# Register the patterns under the key 'newproduct'.
# spaCy v3 signature: add(key, list_of_docs). The older
# add(key, None, *docs) form is the v2 calling convention.
matcher.add('newproduct', phrase_patterns)

# Apply the matcher to our Doc object; each match is a
# (match_id, start_token, end_token) tuple.
matches = matcher(doc)
matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [None]:
# Turn every match into a PRODUCT-labelled Span and add those spans to the
# document's named entities.
from spacy.tokens import Span

PROD = doc.vocab.strings[u'PRODUCT']

# Each match is (match_id, start, end); only the token offsets are needed.
new_ents = [Span(doc, start, end, label=PROD) for _, start, end in matches]
doc.ents = [*doc.ents, *new_ents]

In [None]:
# Both "vacuum cleaner" occurrences should now be listed as PRODUCT entities.
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
first - ORDINAL - "first", "second", etc.


In [None]:
# A sentence with two monetary amounts, used for counting entities by label below.
doc = nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars.')
show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [None]:
# Count the entities labelled MONEY without building an intermediate list.
sum(1 for ent in doc.ents if ent.label_ == 'MONEY')

2

In [None]:
# Quick function to remove ents formed on whitespace:
def remove_whitespace_entities(doc):
  """Drop entities whose text consists only of whitespace.

  `doc.ents` is replaced by a list holding, in their original order, the
  entities for which ``ent.text.isspace()`` is false. The same `doc`
  object is returned.
  """
  kept = []
  for ent in doc.ents:
    if ent.text.isspace():
      continue  # whitespace-only entity: skip it
    kept.append(ent)
  doc.ents = kept
  return doc

In [None]:
# insert this into the pipeline AFTER the ner component:
from spacy.language import Language

# Register the function defined in the previous cell under a component name
# instead of defining it a second time. Language.component accepts the
# callable through `func` when it is not used as a decorator.
Language.component('remove_whitespace_entities', func=remove_whitespace_entities)

# add_pipe raises if a component with this name is already in the pipeline,
# so guard the call to keep this cell safe to re-run.
if 'remove_whitespace_entities' not in nlp.pipe_names:
  nlp.add_pipe('remove_whitespace_entities', after='ner')

In [None]:
# Same sentence as before, but with a newline after the comma. The pipeline
# now includes the clean-up component, and only the two MONEY entities are listed.
doc = nlp(u'Originally priced at $29.50,\nthe sweater was marked down to five dollars.')
show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


# **Visualizing NER with spaCy**

In [None]:
import spacy
# Rebind `nlp` to a newly loaded pipeline, so this section does not use the
# pipeline object that had the custom component added to it above.
nlp = spacy.load('en_core_web_sm')
from spacy import displacy

In [None]:
# Adjacent string literals are concatenated verbatim, so the first literal
# has to end with a space; otherwise the rendered text reads "million.By".
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
          u'By contrast, Sony only sold 8 thousand Walkman music players.')
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Render the entities one sentence at a time: each sentence's text is run
# through the pipeline again and visualised on its own.
for sent in doc.sents:
  displacy.render(nlp(sent.text), style='ent', jupyter=True)

# **Sentence Segmentation**

In [None]:
import spacy
# Rebind `nlp` to a newly loaded pipeline for the sentence segmentation section.
nlp = spacy.load('en_core_web_sm')

In [None]:
# From spaCy basics: iterate over doc.sents to get one Span per sentence.
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

for sent in doc.sents:
  print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [None]:
# Indexing the Doc with an integer gives a single token (here the second one).
print(doc[1])

is


In [None]:
# doc.sents is a generator, so it cannot be indexed the way the Doc can.
# The error is the point of this cell: catch and display it so that
# "Restart & Run All" continues with the cells below.
try:
  print(doc.sents[1])
except TypeError as err:
  print(f'TypeError: {err}')

TypeError: '_cython_3_1_1.generator' object is not subscriptable

In [None]:
# Materialise the sentence generator into a list so it can be indexed.
doc_sents = list(doc.sents)
doc_sents

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [None]:
# How to access individual sentences: index into the list built above.
print(doc_sents[1])

This is another sentence.


In [None]:
# Token offsets (start, end) of the second sentence within the Doc.
print(doc_sents[1].start, doc_sents[1].end)

6 11


# **Adding Rules**

In [None]:
# Show, for every token, whether it is marked as the start of a sentence.
doc2 = nlp(u'This is a sentence. This is a sentence. This is a sentence.')

for token in doc2:
  print(token.is_sent_start, f' {token.text}')

True  This
False  is
False  a
False  sentence
False  .
True  This
False  is
False  a
False  sentence
False  .
True  This
False  is
False  a
False  sentence
False  .


In [None]:
# spaCy's default behavior: in the output below the quotation is kept as one
# sentence, i.e. no split happens at the semicolon.
doc3 = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

for sent in doc3.sents:
  print(sent)

"Management is doing the right things; leadership is doing the right things."
-Peter Drucker


In [None]:
# add a new rule to the pipeline
from spacy.language import Language

@Language.component('set_custom_boundaries')
def set_custom_boundaries(doc):
  """Mark the token after every ';' as the start of a new sentence.

  The `doc` is modified in place and returned.
  """
  # Skip the last token: it has no following token to mark, and
  # doc[token.i + 1] would be out of range.
  for token in doc[:-1]:
    if token.text == ';':
      doc[token.i + 1].is_sent_start = True
  return doc

# add_pipe raises if a component with this name is already in the pipeline,
# so guard the call to keep this cell safe to re-run.
if 'set_custom_boundaries' not in nlp.pipe_names:
  nlp.add_pipe('set_custom_boundaries', before='parser')
nlp.pipe_names

['tok2vec',
 'tagger',
 'set_custom_boundaries',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [None]:
# Same text as doc3, processed again now that the custom boundary component
# is part of the pipeline: the quotation is split after the semicolon.
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

for sent in doc4.sents:
  print(sent)

"Management is doing the right things;
leadership is doing the right things."
-Peter Drucker
