In [1]:
import spacy

In [2]:
NLP=spacy.load('en_core_web_sm')

In [3]:
Doc1=NLP(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [4]:
# FIRST WAY:
for Sent in Doc1.sents:
    print(Sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [5]:
list(Doc1.sents)

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [6]:
list(Doc1.sents)[0]

This is the first sentence.

In [7]:
type(list(Doc1.sents)[0]) 

spacy.tokens.span.Span

In [8]:
# SECOND WAY:
Doc1_Sents=[Doc_Sents for Doc_Sents in Doc1.sents]
Doc1_Sents

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [9]:
print(Doc1_Sents[1])

This is another sentence.


In [10]:
print(Doc1_Sents[1].start,Doc1_Sents[1].end)

6 11


In [11]:
Doc2=NLP(u'This is a sentence1. This is a sentence2. This is a sentence3.')
for Token in Doc2:
    print(Token)

This
is
a
sentence1
.
This
is
a
sentence2
.
This
is
a
sentence3
.


In [12]:
Doc2=NLP(u'This is a sentence1. This is a sentence2. This is a sentence3.')
for Token in Doc2:
    print(Token.is_sent_start,'<----->'+Token.text)

True <----->This
None <----->is
None <----->a
None <----->sentence1
None <----->.
True <----->This
None <----->is
None <----->a
None <----->sentence2
None <----->.
True <----->This
None <----->is
None <----->a
None <----->sentence3
None <----->.


In [13]:
# SPACY'S DEFAULT BEHAVIOR
Doc3=NLP(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')
for Sent in Doc2.sents:
    print(Sent)

This is a sentence1.
This is a sentence2.
This is a sentence3.


In [14]:
# ADD A SEGMENTATION RULE
def Set_Custom_Boundary1(Doc):
    for Token in Doc:
        print(Token,'<----->',Token.i)

In [15]:
Set_Custom_Boundary1(Doc3)

" <-----> 0
Management <-----> 1
is <-----> 2
doing <-----> 3
things <-----> 4
right <-----> 5
; <-----> 6
leadership <-----> 7
is <-----> 8
doing <-----> 9
the <-----> 10
right <-----> 11
things <-----> 12
. <-----> 13
" <-----> 14
-Peter <-----> 15
Drucker <-----> 16


In [16]:
Doc3[:-1]

"Management is doing things right; leadership is doing the right things." -Peter

In [17]:
def Set_Custom_Boundary2(Doc):
    for Token in Doc[:-1]:
        if Token.text==';':
            print(Token)

In [18]:
Set_Custom_Boundary2(Doc3)

;


In [19]:
# ADD A NEW RULE TO THE PIPELINE
def Set_Custom_Boundary3(Doc):
    for Token in Doc[:-1]:
        if Token.text==';':
            Doc[Token.i+1].is_sent_start=True
    return Doc

In [20]:
# NER:NAMED ENTITY RECOGNITION
NLP.add_pipe(Set_Custom_Boundary3,before='parser')
NLP.pipe_names

['tagger', 'Set_Custom_Boundary3', 'parser', 'ner']

In [21]:
Doc4=NLP(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

In [22]:
for Sent in Doc4.sents:
    print(Sent)

"Management is doing things right;
leadership is doing the right things."
-Peter Drucker


In [23]:
for Sent in Doc3.sents:
    print(Sent)

"Management is doing things right; leadership is doing the right things."
-Peter Drucker


In [24]:
# CHANGE SEGMENTATION RULES:
String=u"This is a sentence. This is another.\n\nThis is a \nthird sentence."
Doc5=NLP(String)

In [25]:
for Sent in Doc5.sents:
    print(Sent)
    print([Token.text for Token in Sent])

This is a sentence.
['This', 'is', 'a', 'sentence', '.']
This is another.


['This', 'is', 'another', '.', '\n\n']
This is a 
third sentence.
['This', 'is', 'a', '\n', 'third', 'sentence', '.']


In [26]:
from spacy.pipeline import SentenceSegmenter

def Split_on_NewLines(Doc):
    Start=0
    Seen_NewLine=False
    for Token in Doc5:
        if Seen_NewLine:
            yield Doc[Start:Token.i]
            Start=Token.i
            Seen_NewLine=False
        elif Token.text.startswith('\n'):
            Seen_Newline=True
    yield Doc[Start:]
    
SBD=SentenceSegmenter(NLP.vocab,strategy=Split_on_NewLines)
NLP.add_pipe(SBD)

In [27]:
Doc6=NLP(String)

In [28]:
for Sent in Doc6.sents:
    print(Sent)
    print([Token.text for Token in Sent])

This is a sentence. This is another.

This is a 
third sentence.
['This', 'is', 'a', 'sentence', '.', 'This', 'is', 'another', '.', '\n\n', 'This', 'is', 'a', '\n', 'third', 'sentence', '.']
