# Sentence Boundary Detection: REGEX vs. Spacy vs. NLTK

In [2]:
import string
import re
import time
import numpy as np
import pandas as pd
import nltk
import spacy

In [3]:
# SPACY
# As opposed to sm. md, lg models, English() lass in spacy.lang.en contains only language-specific code
# and rules – e.g. for tokenization, stop words, etc.

# TRAINED MODEL
nlp_lg = spacy.load('en_core_web_lg')

# RULES-BASED SENTENCIZER
from spacy.lang.en import English
nlp_sentencizer = English()
sentencizer = nlp_sentencizer.create_pipe('sentencizer')
nlp_sentencizer.add_pipe(sentencizer)

# SEPARATE PACKAGE THAT WORKS WITH SPACY 
from pysbd.utils import PySBDFactory
nlp_pysbd = spacy.blank('en')
nlp_pysbd.add_pipe( PySBDFactory( nlp_pysbd ) )

In [4]:
# NLTK - 2 WAYS. IDENTICAL RESULTS, nltk_data_tokenizer is ~1 sec faster on 25K articles (14 vs. 15 sec.)
def nltk_tokenize( _text ):
    return nltk.tokenize.sent_tokenize( _text )


tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def nltk_data_tokenizer( _text ):
    return tokenizer.tokenize( _text )

In [5]:
# REGEX - SIMPLE (WEAK) AND COMPLEX WAY
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov|ai|me|edu|co.uk|ru|info|biz|online)"
digits   = "([0-9])"

sent_bounds = re.compile('[.!?]')

# breaks on 9:30 p.m.
def simple_regex_sents( _text ):        
    return [ i.strip() for i in sent_bounds.split( _text ) ]
    

def regex_sents(text):
        
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
        
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    if "e.g." in text: text = text.replace("e.g.", "e<prd>g<prd>")
    if "i.e." in text: text = text.replace("i.e.", "i<prd>e<prd>")
    #if "..." in text: text = text.replace("...","<prd><prd><prd>")
                
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)
        
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
        
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    
    return sentences

In [6]:
def print_sents(_list):
    
    for _idx, _sent in enumerate(_list):
        print('{}. {}'.format(_idx+1, _sent))
                

def apply_sent_split(_list):
    
    res = ''
    for _idx, _sent in enumerate(_list):
        res += '{}. {}\n'.format(_idx+1, _sent)
        
    return res

In [7]:
text = ''' Hawaii community spreads power of 'aloha' after security guard returns wallet
HAWAII -- A community in Hawaii is spreading the power of aloha after coming together to purchase a car for a hard-working security officer.

It all started when Aina Townsend, 22, a security guard from the island of Maui in Hawaii, was working at Foodland supermarket in Kahlui last month. While on his shift, he found a brown wallet left behind in one of the shopping carts.
The wallet belonged to 30-year-old Chloe Marino of Maui, who didn't realize her wallet was missing until Townsend came to return it in person that same day.

"It was just such a selfless act and you know, he wasn't expecting anything in return," Chloe Marino told "Good Morning America." "He said, 'I wanted to return it to you so you didn't have to be without it for the holidays.'"
'''

In [8]:
sents = regex_sents( text )
print_sents( sents )

1. Hawaii community spreads power of 'aloha' after security guard returns wallet HAWAII -- A community in Hawaii is spreading the power of aloha after coming together to purchase a car for a hard-working security officer.
2. It all started when Aina Townsend, 22, a security guard from the island of Maui in Hawaii, was working at Foodland supermarket in Kahlui last month.
3. While on his shift, he found a brown wallet left behind in one of the shopping carts.
4. The wallet belonged to 30-year-old Chloe Marino of Maui, who didn't realize her wallet was missing until Townsend came to return it in person that same day.
5. "It was just such a selfless act and you know, he wasn't expecting anything in return," Chloe Marino told "Good Morning America".
6. "He said, 'I wanted to return it to you so you didn't have to be without it for the holidays.


In [9]:
sents = simple_regex_sents( text )
print_sents( sents )

1. Hawaii community spreads power of 'aloha' after security guard returns wallet
HAWAII -- A community in Hawaii is spreading the power of aloha after coming together to purchase a car for a hard-working security officer
2. It all started when Aina Townsend, 22, a security guard from the island of Maui in Hawaii, was working at Foodland supermarket in Kahlui last month
3. While on his shift, he found a brown wallet left behind in one of the shopping carts
4. The wallet belonged to 30-year-old Chloe Marino of Maui, who didn't realize her wallet was missing until Townsend came to return it in person that same day
5. "It was just such a selfless act and you know, he wasn't expecting anything in return," Chloe Marino told "Good Morning America
6. " "He said, 'I wanted to return it to you so you didn't have to be without it for the holidays
7. '"


In [10]:
doc = nlp_sentencizer( text )
sents = [ sent.text.strip() for sent in doc.sents ]
print_sents( sents )

1. Hawaii community spreads power of 'aloha' after security guard returns wallet
HAWAII -- A community in Hawaii is spreading the power of aloha after coming together to purchase a car for a hard-working security officer.
2. It all started when Aina Townsend, 22, a security guard from the island of Maui in Hawaii, was working at Foodland supermarket in Kahlui last month.
3. While on his shift, he found a brown wallet left behind in one of the shopping carts.
4. The wallet belonged to 30-year-old Chloe Marino of Maui, who didn't realize her wallet was missing until Townsend came to return it in person that same day.
5. "It was just such a selfless act and you know, he wasn't expecting anything in return," Chloe Marino told "Good Morning America." "
6. He said, 'I wanted to return it to you so you didn't have to be without it for the holidays.'"
7. 


In [11]:
doc = nlp_lg( text )
sents = [ sent.text.strip() for sent in doc.sents ]
print_sents( sents )

1. Hawaii community spreads power of 'aloha' after security guard returns wallet
HAWAII -- A community in Hawaii is spreading the power of aloha after coming together to purchase a car for a hard-working security officer.
2. It all started when Aina Townsend, 22, a security guard from the island of Maui in Hawaii, was working at Foodland supermarket in Kahlui last month.
3. While on his shift, he found a brown wallet left behind in one of the shopping carts.
4. The wallet belonged to 30-year-old Chloe Marino of Maui, who didn't realize her wallet was missing until Townsend came to return it in person that same day.
5. "It was just such a selfless act
6. and you know, he wasn't expecting anything in return," Chloe Marino told "Good Morning America."
7. "He said, 'I wanted to return it to you
8. so you didn't have to be without it for the holidays.'"


In [12]:
doc = nlp_pysbd( text )
sents = [ sent.text.strip() for sent in doc.sents ]
print_sents( sents )

1. 
2. Hawaii community spreads power of 'aloha' after security guard returns wallet
3. HAWAII -- A community in Hawaii is spreading the power of aloha after coming together to purchase a car for a hard-working security officer.
4. It all started when Aina Townsend, 22, a security guard from the island of Maui in Hawaii, was working at Foodland supermarket in Kahlui last month.
5. While on his shift, he found a brown wallet left behind in one of the shopping carts.
6. The wallet belonged to 30-year-old Chloe Marino of Maui, who didn't realize her wallet was missing until Townsend came to return it in person that same day.
7. "It was just such a selfless act and you know, he wasn't expecting anything in return," Chloe Marino told "Good Morning America." "He said, 'I wanted to return it to you so you didn't have to be without it for the holidays.'"


In [13]:
sents = nltk_tokenize( text)
print_sents( sents )

1.  Hawaii community spreads power of 'aloha' after security guard returns wallet
HAWAII -- A community in Hawaii is spreading the power of aloha after coming together to purchase a car for a hard-working security officer.
2. It all started when Aina Townsend, 22, a security guard from the island of Maui in Hawaii, was working at Foodland supermarket in Kahlui last month.
3. While on his shift, he found a brown wallet left behind in one of the shopping carts.
4. The wallet belonged to 30-year-old Chloe Marino of Maui, who didn't realize her wallet was missing until Townsend came to return it in person that same day.
5. "It was just such a selfless act and you know, he wasn't expecting anything in return," Chloe Marino told "Good Morning America."
6. "He said, 'I wanted to return it to you so you didn't have to be without it for the holidays.'"


In [14]:
sents = nltk_data_tokenizer( text )
print_sents( sents )

1.  Hawaii community spreads power of 'aloha' after security guard returns wallet
HAWAII -- A community in Hawaii is spreading the power of aloha after coming together to purchase a car for a hard-working security officer.
2. It all started when Aina Townsend, 22, a security guard from the island of Maui in Hawaii, was working at Foodland supermarket in Kahlui last month.
3. While on his shift, he found a brown wallet left behind in one of the shopping carts.
4. The wallet belonged to 30-year-old Chloe Marino of Maui, who didn't realize her wallet was missing until Townsend came to return it in person that same day.
5. "It was just such a selfless act and you know, he wasn't expecting anything in return," Chloe Marino told "Good Morning America."
6. "He said, 'I wanted to return it to you so you didn't have to be without it for the holidays.'"


In [16]:
#ADD NEW RULE TO PIPELINE - to add a sentence end after the article title before next line's capitalized word
def set_custom_Sentence_end_points(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc

nlp_lg.add_pipe(set_custom_Sentence_end_points, before='parser')
nlp_lg.pipe_names

['tagger', 'set_custom_Sentence_end_points', 'parser', 'ner']