In [97]:
import spacy

In [98]:
nlp = spacy.load('en_core_web_sm')

In [99]:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [100]:
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [101]:
doc.sents

<generator at 0x221b15464a0>

In [102]:
doc[0]

This

In [103]:
list(doc.sents)

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [104]:
list(doc.sents)[0]

This is the first sentence.

In [105]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [106]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things."-Peter Drucker')

In [107]:
doc.text

'"Management is doing the right things; leadership is doing the right things."-Peter Drucker'

In [108]:
for sent in doc.sents:
    print(sent)
    print('\n')

"Management is doing the right things; leadership is doing the right things.


"-Peter


Drucker




In [109]:
# Add a segmentation rule

In [110]:
def set_custom_boundaries(doc):
    """Print each token of ``doc`` followed by its index, one value per line.

    Exploratory helper: it only inspects the tokens and returns ``None``.
    """
    for tok in doc:
        # Token first, then its position ``tok.i``, each on its own line.
        print(tok, tok.i, sep='\n')

In [111]:
set_custom_boundaries(doc)

"
0
Management
1
is
2
doing
3
the
4
right
5
things
6
;
7
leadership
8
is
9
doing
10
the
11
right
12
things
13
.
14
"-Peter
15
Drucker
16


In [112]:
from spacy.language import Language

In [113]:
@Language.component('set_custom_boundaries')
def set_custom_boundaries(doc):
    """Mark the token following every ``;`` as the start of a new sentence.

    Args:
        doc: The document being processed by the pipeline.

    Returns:
        The same ``doc``, with ``is_sent_start`` set to ``True`` on each token
        that directly follows a semicolon.
    """
    # The last token is left out of the scan because nothing follows it.
    semicolon_positions = [tok.i for tok in doc[:-1] if tok.text == ';']
    for position in semicolon_positions:
        doc[position + 1].is_sent_start = True
    return doc

In [114]:
doc[:-1]

"Management is doing the right things; leadership is doing the right things."-Peter

In [115]:
# Insert the registered 'set_custom_boundaries' component ahead of the parser,
# so the sentence starts it sets are already on the doc when the parser runs.
nlp.add_pipe(factory_name="set_custom_boundaries",before='parser')
# Display the resulting component order to confirm the placement.
nlp.pipe_names

['tok2vec',
 'tagger',
 'set_custom_boundaries',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [116]:
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things."-Peter Drucker')

In [117]:
for sent in doc4.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things.
"-Peter
Drucker


In [118]:
# Change segmentation rules

In [119]:
nlp = spacy.load('en_core_web_sm')

In [120]:
mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

In [121]:
mystring = u"This is a sentence.This is another.\n\nThis is a \nThird sentence."

In [122]:
print(mystring)

This is a sentence.This is another.

This is a 
Third sentence.


In [123]:
doc = nlp(mystring)

In [124]:
for sentence in doc.sents:
    print(sentence)

This is a sentence.
This is another.


This is a 
Third sentence.


In [125]:
from spacy.pipeline import Sentencizer

In [126]:
@Language.component('split_on_newlines')
def split_on_newlines(doc):
    """Segment ``doc`` into sentences at newline tokens only.

    A token whose text starts with a newline closes the current sentence; the
    token right after it opens the next one. Every other token (except the
    first, which is left untouched) is explicitly marked as *not* starting a
    sentence, so the resulting sentences are exactly the newline-delimited
    spans.

    A spaCy pipeline component has to return the ``Doc`` it received, so the
    boundaries are recorded on ``Token.is_sent_start`` instead of being
    yielded as spans.

    Args:
        doc: The document being processed by the pipeline. It must not have
            been parsed yet, so add this component before the parser.

    Returns:
        The same ``doc`` with sentence starts set.
    """
    seen_newline = False

    for word in doc:
        if seen_newline:
            # Previous token was a newline: this token opens a new sentence.
            word.is_sent_start = True
            seen_newline = False
        else:
            if word.i > 0:
                word.is_sent_start = False
            seen_newline = word.text.startswith('\n')

    return doc

In [127]:
nlp.add_pipe('split_on_newlines',before='parser')
nlp.pipe_names

['tok2vec',
 'tagger',
 'split_on_newlines',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [None]:
# Changing The Rules
from spacy.pipeline import Sentencizer
# Load a separate pipeline for this experiment; `nlp1` was never defined before
# this cell, and a fresh model avoids clashing with components already added to
# the shared `nlp`.
nlp1 = spacy.load('en_core_web_sm')

# Characters the rule-based sentencizer treats as sentence-final.
# NOTE(review): the sentencizer appears to compare whole token texts against
# this list, so a whitespace token such as "\n\n" would not match "\n" - confirm
# and extend the list if double newlines should also split.
punct_marks = ["\n"]
config = {"punct_chars": punct_marks}
nlp1.add_pipe("sentencizer", config=config, before='parser')

# Re-process the text with the new pipeline; a doc created earlier would not
# reflect the sentencizer's boundaries.
doc1 = nlp1(mystring)
for s in doc1.sents:
    print(s)

In [None]:
# CHANGING THE RULES
from spacy.language import Language
 
@Language.component('split_on_newlines')
def split_on_newlines(doc):
    """Start a new sentence after every token that begins with a newline.

    Each token except the first gets an explicit ``is_sent_start`` value:
    ``True`` when the preceding token's text starts with ``'\\n'``, ``False``
    otherwise.

    Args:
        doc: The document being processed by the pipeline.

    Returns:
        The same ``doc`` with sentence starts set.
    """
    for idx in range(1, len(doc)):
        previous_text = doc[idx - 1].text
        doc[idx].is_sent_start = previous_text.startswith('\n')
    return doc
 
# Reload the stock model so the pipeline holds none of the components added
# earlier in the notebook.
nlp = spacy.load('en_core_web_sm')  # reset to the original
# Run the newline splitter before the parser.
nlp.add_pipe('split_on_newlines', before='parser')
 
# Small example with a double newline between "1" and "3".
doc = nlp("1\n\n3")
# Print each sentence as a list of token texts to make the boundaries visible.
for sent in doc.sents:
    print([token.text for token in sent])