In [25]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
doc = nlp(u"This is the first sentence. This is another sentence. This is the last sentence")

In [3]:
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence


In [4]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

In [5]:
doc.text

'"Management is doing the right things; leadership is doing the right things." - Peter Drucker'

In [6]:
# Show each sentence followed by two blank lines so the boundaries stand out.
# One print call: the sentence, a '\n' argument, the '\n' separator between
# them, and print's own trailing newline.
for sentence in doc.sents:
    print(sentence, '\n', sep='\n')

"Management is doing the right things; leadership is doing the right things."


- Peter Drucker




In [26]:
# Add a segmentation rule: start a new sentence after every semicolon
from spacy.language import Language

In [27]:
@Language.component("component")
def set_custom_boundaries(doc):
    """Mark the token that follows each semicolon as the start of a sentence.

    Args:
        doc: The document being processed; its tokens are modified in place.

    Returns:
        The same ``doc`` object that was passed in.
    """
    # The final token has no successor, so only positions 0 .. len(doc) - 2
    # are examined.
    for position in range(len(doc) - 1):
        if doc[position].text == ';':
            doc[position + 1].is_sent_start = True
    return doc

In [28]:
# Insert the custom boundary rule ahead of the parser so the sentence starts
# it sets are already in place when the parser runs.
# The membership check keeps this cell re-runnable: adding a component whose
# name is already in the pipeline raises a ValueError.
if "component" not in nlp.pipe_names:
    nlp.add_pipe("component", before='parser')

<function __main__.set_custom_boundaries(doc)>

In [29]:
nlp.pipe_names

['tok2vec',
 'tagger',
 'component',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [30]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

In [31]:
for sent in doc.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things."
- Peter Drucker


In [32]:
# Change the segmentation rule: split sentences on newline characters instead of relying on the parser

In [33]:
nlp = spacy.load('en_core_web_sm')

In [34]:
# Sample text mixing sentence-final periods, a blank line, and a single
# newline in the middle of a sentence. Adjacent literals are concatenated, so
# the value is one string.
mystring = (
    "This is a sentence. This is another.\n\n"
    "This is a \n"
    "third sentence."
)

In [35]:
print(mystring)

This is a sentence. This is another.

This is a 
third sentence.


In [36]:
doc = nlp(mystring)

In [37]:
for sentence in doc.sents:
    print(sentence)

This is a sentence.
This is another.


This is a 
third sentence.


In [44]:
from spacy.pipeline import Sentencizer

In [45]:
# Characters handed to the sentencizer as its `punct_chars` setting; only a
# single newline is listed.
# NOTE(review): the printed result further down shows no break at the blank
# line ("\n\n") between "another." and "This is a" - confirm that is intended.
punct_marks = ["\n"]

In [46]:
config = {"punct_chars":punct_marks}

In [49]:
nlp1 = spacy.load('en_core_web_sm', exclude=['parser'])

In [50]:
# Add the sentencizer with the custom `punct_chars` config, placed before the
# attribute ruler.
# The membership check keeps this cell re-runnable: adding a component whose
# name is already in the pipeline raises a ValueError.
if 'sentencizer' not in nlp1.pipe_names:
    nlp1.add_pipe('sentencizer', config=config, before='attribute_ruler')

<spacy.pipeline.sentencizer.Sentencizer at 0x7f957f14f6c0>

In [51]:
doc2 = nlp1(mystring)

In [52]:
for sent in doc2.sents:
    print(sent)

This is a sentence. This is another.

This is a 

third sentence.
