## Sentence Segmentation

In [35]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [36]:
doc = nlp(u"This is the first sentence. This is the second sentence. This is the third sentence.")
for sent in doc.sents:
    print(sent)

# doc.sents[0] will give an error as this doc.sents is a generative statement and does not hold individual sentences in memory. 
# To do sentence interations, we need to do the following

print(list(doc.sents))

#The type of each sentence is Span object
print(type(list(doc.sents)[0]))


This is the first sentence.
This is the second sentence.
This is the third sentence.
[This is the first sentence., This is the second sentence., This is the third sentence.]
<class 'spacy.tokens.span.Span'>


In [37]:
doc1 = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')
print(doc1.text)

for sent in doc1.sents:
    print(sent)

"Management is doing the right things; leadership is doing the right things." -Peter Drucker
"Management is doing the right things; leadership is doing the right things."
-Peter Drucker


### Adding Segmentation Rules

In [38]:
from spacy.language import Language

@Language.component('set_custom_boundaries') # Added for version Spacy v3 - https://spacy.io/usage/v3#migrating-add-pipe
def set_custom_boundaries(doc):
    for token in doc[:-1]: #[:-1] is used to exclude the last token to avoid index out of range error
        if token.text == ';': # ; is the token to be used as a sentence boundary
            doc[token.i+1].is_sent_start = True
    return doc

print("Old Pipeline:", nlp.pipe_names)
nlp.add_pipe('set_custom_boundaries', before='parser')
print("Modified Pipeline:", nlp.pipe_names)

Old Pipeline: ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
Modified Pipeline: ['tok2vec', 'tagger', 'set_custom_boundaries', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']


In [39]:
print("\n\nUPDATED SENTENCE SEGMENTATION:\n")
for sent in doc1.sents:
    print(sent)




UPDATED SENTENCE SEGMENTATION:

"Management is doing the right things; leadership is doing the right things."
-Peter Drucker


### Changing Segmentation Rules

In [40]:
nlp1 = spacy.load('en_core_web_sm')
doc2 = nlp1(u"This is the first sentence. This is the second sentence.\n\nThis is the \nthird sentence.")
#print(doc2.text)

for sent in doc2.sents:
    print(sent)

This is the first sentence.
This is the second sentence.



This is the 
third sentence.


If this were a poem, we would be doing the sentence segmentation over the \n (line breaks) rather than the full stops, hence we need to entirely change the segmentation rules.

### Code to change sentence segmenter spacy v2

from spacy.pipeline import SentenceSegmenter

def split_on_newlines(doc):
    start = 0
    seen_newline = False

    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'):
            seen_newline = True
    yield doc[start:]     

    sbd = SentenceSegmenter(nlp.vocab,strategy=split_on_newlines)
    nlp1.add_pipe(sbd)
   