In [24]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [25]:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [26]:
# doc.sents is a generator: it yields the sentences one at a time instead of holding them all in memory
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [27]:
# a generator cannot be indexed directly; to grab a sentence by index, turn it into a list first
list(doc.sents)[0]

This is the first sentence.

In [28]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [29]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

In [30]:
doc.text

'"Management is doing the right things; leadership is doing the right things." - Peter Drucker'

In [31]:
for sent in doc.sents:
    print(sent)
    print('\n')

"


Management is doing the right things; leadership is doing the right things.


"


- Peter Drucker




In [32]:
# ADD A SEGMENTATION RULE

In [33]:
# 1. Define a custom pipeline component
def set_custom_boundaries(doc):
    """Flag the token that follows each semicolon as the start of a sentence.

    The final token is left out of the scan because no token follows it.
    The tokens of ``doc`` are flagged in place and the same ``doc`` object
    is returned.
    """
    # Positions of the tokens sitting immediately after a ';'.
    after_semicolon = (tok.i + 1 for tok in doc[:-1] if tok.text == ';')
    for position in after_semicolon:
        doc[position].is_sent_start = True
    return doc

In [34]:
# add the custom component to the nlp pipeline, placed before the parser
nlp.add_pipe(set_custom_boundaries, before='parser')

In [35]:
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [36]:
# the whole document except the last token (the slice the custom component iterates over)
doc[:-1]

"Management is doing the right things; leadership is doing the right things." - Peter

In [37]:
# process the same text again so the updated pipeline (with the custom component) is applied
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

In [39]:
# test it
for sent in doc4.sents:
    print(sent)

"
Management is doing the right things;
leadership is doing the right things.
"
- Peter Drucker


In [38]:
# CHANGE SEGMENTATION RULE

In [40]:
# reload the model to reset the pipeline (this discards the custom boundary component)
nlp = spacy.load('en_core_web_sm')

In [41]:
mystring = u"This is a sentence. This is another. \n\nThis is a \nthird sentence."

In [42]:
print(mystring)

This is a sentence. This is another. 

This is a 
third sentence.


In [43]:
doc = nlp(mystring)

In [44]:
for sentence in doc.sents:
    print(sentence)

This is a sentence.
This is another. 


This is a 
third sentence.


In [45]:
from spacy.pipeline import SentenceSegmenter

In [46]:
# yield hands a value back to the caller and pauses the function;
# execution resumes right after the yield when the next value is requested
def split_on_newlines(doc):
    """Split ``doc`` into sentence spans, breaking after newline tokens.

    A token counts as a newline token when its text starts with ``'\\n'``.
    Each run of newline tokens stays attached to the end of the sentence that
    precedes it; the next non-newline token opens a new sentence.

    Yields:
        Slices of ``doc`` (``doc[start:end]``), one per sentence, in order.
        Nothing is yielded for an empty ``doc``.
    """
    start = 0
    seen_newline = False

    for word in doc:
        is_newline = word.text.startswith('\n')
        if seen_newline and not is_newline:
            # First regular token after a newline run: close the previous sentence.
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif is_newline:
            seen_newline = True

    # Emit the trailing sentence, but never an empty span (e.g. for an empty doc).
    if start < len(doc):
        yield doc[start:]

In [47]:
# creating new sentence segmenter
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)

In [48]:
# adding it as a new pipe
nlp.add_pipe(sbd)

In [49]:
doc = nlp(mystring)

In [50]:
# test that our new segmenter (the split_on_newlines strategy) works
for sentence in doc.sents:
    print(sentence)

This is a sentence. This is another. 


This is a 

third sentence.
