In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
# create a show entity function
def show_ent(doc):
    """Print each named entity of ``doc`` as ``text - label - explanation``.

    The explanation comes from ``spacy.explain`` applied to the entity label.
    When ``doc.ents`` is empty, 'No Entities Found' is printed instead.
    """
    # Guard clause: nothing to list.
    if not doc.ents:
        print('No Entities Found')
        return

    for entity in doc.ents:
        label = entity.label_
        # str() keeps a missing explanation (None) printable as 'None'.
        print(' - '.join((entity.text, label, str(spacy.explain(label)))))
        

In [3]:
doc = nlp(u'Hi, how are you?')
doc1 = nlp(u"May i go to Washington D.C. next May to see the Washington Monument?")
doc2 = nlp(u"Can i please have 500 dollars of Microsoft stock")
doc3 = nlp(u"Tesla to build a U.K. factory for $6 Million.")

In [4]:
show_ent(doc)

show_ent(doc1)

show_ent(doc2)

# Note: in doc3 the model does not recognize "Tesla" as an ORG entity
show_ent(doc3)

No Entities Found
Washington D.C. - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.
500 dollars - MONEY - Monetary values, including unit
Microsoft - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 Million - MONEY - Monetary values, including unit


___
## Adding a Named Entity to a Span
Normally we would have spaCy build a library of named entities by training it on several samples of text.<br>In this case, we only want to add one value:

In [5]:
# Add a custom entity to an existing Doc
from spacy.tokens import Span

# Get the hash value of the ORG entity label from the Doc's vocab
ORG = doc3.vocab.strings[u'ORG']  

# Create a Span for the new entity - 0 and 1 are the start and end token
# positions in the Doc, so the Span covers only the first token ("Tesla")
new_ent = Span(doc3, 0, 1, label=ORG)

# Add the new entity to the entities already present on the Doc
doc3.ents = list(doc3.ents) + [new_ent]


In [6]:
# It would now show Tesla as a new entity
show_ent(doc3)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 Million - MONEY - Monetary values, including unit


___
## Adding Named Entities to All Matching Spans
What if we want to tag *all* occurrences of a phrase, such as "vacuum cleaner"? In this section we show how to use the PhraseMatcher to find every matching span in the Doc and turn each one into a named entity:

In [7]:
# Build a two-sentence Doc. The trailing space after the first sentence matters:
# adjacent string literals are joined as-is, and without it the text would
# read "cleaner.If successful".
doc = nlp(u'Our company plans to introduce a new vacuum cleaner. '
          u'If successful, the vacuum cleaner will be our first product.')

show_ent(doc)

first - ORDINAL - "first", "second", etc.


In [8]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [9]:
# Create the desired phrase patterns.
# nlp.make_doc only tokenizes the text, which is what the PhraseMatcher
# compares by default; nlp(text) would needlessly run the whole pipeline
# (tagger, parser, ner) on every pattern.
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_pattern = [nlp.make_doc(text) for text in phrase_list]

In [10]:
# Register the patterns on the matcher under the name 'newproduct':
matcher.add('newproduct', None, *phrase_pattern)

# Apply the matcher to our Doc object:
found_matches = matcher(doc)

# See what matches occur; each match is a 3-tuple whose second and third
# items are the start and end token positions of the matched span:
found_matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [11]:
# Turn every match into a Span labelled PRODUCT, then register those Spans
# as named entities alongside the ones the Doc already has:
from spacy.tokens import Span

PROD = doc.vocab.strings[u"PRODUCT"]

new_ent = [Span(doc, start, end, label=PROD) for _, start, end in found_matches]

doc.ents = [*doc.ents, *new_ent]

In [12]:
show_ent(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
first - ORDINAL - "first", "second", etc.


In [13]:
# Build a Doc that contains two monetary amounts, used below to filter and count entities by label
doc1 = nlp(u"i originally paid 1000 dollars instead of $10.00 for the charger.")

In [14]:
# List only the entities whose label is MONEY
[ent for ent in doc1.ents if ent.label_ == 'MONEY']

[1000 dollars, 10.00]

In [15]:
# Count the MONEY entities without building an intermediate list
sum(1 for ent in doc1.ents if ent.label_ == 'MONEY')

2

## Visualization NER

In [16]:
from spacy import displacy

In [17]:
doc = nlp(u'Over the last quarter Steve Jobs from Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, Kenichiro from Sony sold only 7 thousand Walkman music players.')

In [18]:
# Render the whole Doc at once: all sentences appear together and are not separated
displacy.render(doc, style = 'ent', jupyter=True)

In [19]:
# Render each sentence separately.
# sent.as_doc() copies the sentence, with the entities already predicted for
# `doc`, into its own Doc. Calling nlp(sent.text) instead would re-run the
# whole pipeline per sentence and could predict different entities.
for sent in doc.sents:
    displacy.render(sent.as_doc(), style = 'ent', jupyter = True)

In [20]:
# displacy options that restrict rendering to the PRODUCT and ORG entity types
options = {'ents':['PRODUCT','ORG']}

In [21]:
displacy.render(doc, style = 'ent', jupyter=True, options = options)

## SENTENCE SEGMENTATION

In [91]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [23]:
# From Spacy Basics:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [92]:
# SPACY'S DEFAULT BEHAVIOR
# By default the text is split before the hyphen (after the closing quote),
# and, as the output shows, again after "-Peter"; it is not split at the semicolon
doc = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc.sents:
    print(sent)

"Management is doing things right; leadership is doing the right things."
-Peter
Drucker


## Add new Segmentation Rule

In [93]:
def set_custom_boundaries(doc):
    """Pipeline component that starts a new sentence after every semicolon.

    For each ';' token, the token that follows it is flagged with
    ``is_sent_start = True``. The same ``doc`` is returned so the component
    can be chained in the pipeline.
    """
    # Skip the final token: it has no successor to flag, and indexing
    # doc[token.i + 1] for a trailing ';' would raise IndexError.
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i + 1].is_sent_start = True
    return doc

# Register the component ahead of the parser. The membership check keeps this
# cell re-runnable: add_pipe raises ValueError when a component with the same
# name is already in the pipeline.
if 'set_custom_boundaries' not in nlp.pipe_names:
    nlp.add_pipe(set_custom_boundaries, before = 'parser')
nlp.pipe_names


['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [94]:
# Re-run the Doc object creation:
doc1 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')


In [95]:

for sent in doc1.sents:
    print(sent)

"Management is doing things right;
leadership is doing the right things."
-Peter
Drucker


## Change Segmentation Rule Altogether

In [77]:
nlp = spacy.load('en_core_web_sm')  # reset to the original

mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

# SPACY DEFAULT BEHAVIOR:
doc = nlp(mystring)

for sent in doc.sents:
    print(sent)

This is a sentence.
This is another.


This is a 
third sentence.


In [78]:
# It segments at periods by default
# Change the segmentation rule
from spacy.pipeline import SentenceSegmenter 

In [79]:
# define a new function for new segmentation rule (on a new line \n)
def split_on_newline(doc):
    """Yield consecutive slices of ``doc``, cutting after newline tokens.

    A token whose text starts with a newline closes the current slice; the
    slice is emitted when the next token is reached, and that token opens
    the following slice. Whatever remains is yielded at the end.
    """
    span_start = 0
    break_pending = False

    for tok in doc:
        if not break_pending:
            # Only tokens seen while no break is pending can request one.
            break_pending = tok.text.startswith('\n')
            continue
        # The previous token requested a break: emit everything before `tok`.
        yield doc[span_start:tok.i]
        span_start, break_pending = tok.i, False

    # Trailing group of tokens (the whole doc if no newline was seen).
    yield doc[span_start:]
            

In [80]:
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newline)

In [81]:
nlp.add_pipe(sbd)

In [82]:
mystring1 = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."
doc = nlp(mystring1)

In [83]:
for sent in doc.sents:
    print(sent)

This is a sentence. This is another.


This is a 

third sentence.
