In [446]:
# Tokenize the text
import re

def tokenize_sentences(text):
    """Split ``text`` into word and sentence-punctuation tokens.

    The delimiters are the sentence terminators ``.``, ``!`` and ``?`` plus
    runs of whitespace. Terminators are kept as their own tokens, whitespace
    is discarded, and empty pieces are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty, whitespace-stripped tokens in order of appearance.
    """
    # The capturing group makes re.split return the delimiters as well as the
    # text between them, so one pass yields every candidate token.
    pieces = re.split(r'([.!?]|\s+)', text)
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]

def tokenize_clauses(text):
    """Split ``text`` into word and clause-punctuation tokens.

    The delimiters are the clause separators ``,``, ``:`` and ``;`` plus runs
    of whitespace. Separators are kept as their own tokens, whitespace is
    discarded, and empty pieces are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty, whitespace-stripped tokens in order of appearance.
    """
    # The capturing group makes re.split return the delimiters as well as the
    # text between them, so one pass yields every candidate token.
    pieces = re.split(r'([,:;]|\s+)', text)
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]

def tokenize_regex(text):
    """Split ``text`` into word and punctuation tokens.

    The delimiters are ``.``, ``,`` and ``:`` plus runs of whitespace.
    Punctuation delimiters are kept as their own tokens, whitespace is
    discarded, and empty pieces are dropped. Apostrophes are not delimiters,
    so a contraction such as ``I'd`` stays in one token.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty, whitespace-stripped tokens in order of appearance.
    """
    # The capturing group makes re.split return the delimiters as well as the
    # text between them, so one pass yields every candidate token.
    pieces = re.split(r'([.,:]|\s+)', text)
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]

In [529]:
# Spell Checker
from spellchecker import SpellChecker

def spell_check(text):
    """Return ``text`` with each unknown word replaced by its suggested correction.

    The text is tokenized with ``tokenize_regex`` and rebuilt piece by piece,
    so a correction is applied only at the position of the misspelled token
    itself. (A global ``str.replace`` would also rewrite the same characters
    wherever they occur inside other, correctly spelled words.) Whitespace and
    punctuation between tokens are copied through unchanged.

    Args:
        text: The string to spell-check.

    Returns:
        The corrected string. Tokens for which the spell checker offers no
        correction are left as they were.
    """
    spell = SpellChecker()
    pieces = []
    cursor = 0

    for token in tokenize_regex(text):
        # Tokens come back in order of appearance and only whitespace can sit
        # between the cursor and the next token, so searching from the cursor
        # locates this exact occurrence.
        start = text.find(token, cursor)
        if start == -1:
            continue
        pieces.append(text[cursor:start])

        replacement = token
        if spell.unknown([token]):
            # Look the correction up once. Newer pyspellchecker releases
            # return None when there is no candidate, so fall back to the
            # original token in that case.
            suggestion = spell.correction(token)
            if suggestion:
                replacement = suggestion

        pieces.append(replacement)
        cursor = start + len(token)

    pieces.append(text[cursor:])
    return "".join(pieces)

In [546]:
# Expand English Contractions 
# list of contractions from https://englishstudypage.com/grammar/list-of-contractions-in-english/
!pip install pyspellchecker

import csv

# Read the English contractions CSV into a dictionary where
# key = contraction (first column), value = expanded words (second column).
# newline='' is what the csv module recommends for files handed to csv.reader.
with open('english-contractions-list.csv', mode='r', newline='') as contractions_file:
    reader = csv.reader(contractions_file)
    # Skip blank or single-column rows instead of failing with IndexError.
    contractionsDict = {row[0]: row[1] for row in reader if len(row) >= 2}
    # Drop the header row if present; the default avoids a KeyError when it is not.
    contractionsDict.pop('Abbreviation', None)
    
def expand_contractions(text):
    """Return ``text`` with known English contractions expanded.

    Each token from ``tokenize_regex`` is looked up, lower-cased, in
    ``contractionsDict``. A match is replaced by its expansion at the token's
    own position, so the same characters appearing inside other words are not
    touched. Whitespace and punctuation between tokens are copied through
    unchanged.

    Args:
        text: The string to process.

    Returns:
        The string with contractions expanded. If the original token started
        with an upper-case letter, the expansion is capitalized to match.
    """
    pieces = []
    cursor = 0

    for token in tokenize_regex(text):
        # Tokens come back in order of appearance and only whitespace can sit
        # between the cursor and the next token, so searching from the cursor
        # locates this exact occurrence.
        start = text.find(token, cursor)
        if start == -1:
            continue
        pieces.append(text[cursor:start])

        replacement = token
        expansion = contractionsDict.get(token.lower())
        if expansion is not None:
            # Keep sentence-initial capitalization ("They'd" -> "They would").
            if token[:1].isupper():
                expansion = expansion[:1].upper() + expansion[1:]
            replacement = expansion

        pieces.append(replacement)
        cursor = start + len(token)

    pieces.append(text[cursor:])
    return "".join(pieces)



In [526]:
# Sentence and Clause Segmentation
from typing import List
import re

# Single-digit pattern. The segmenters call it with .match(), which anchors at
# the start of the string, so it tests whether a token begins with a digit.
r_digits = re.compile(r'\d')

def segment_sentence(text):
    """Split ``text`` into sentences.

    ``!`` and ``?`` always end a sentence. A period ends a sentence unless it is:

    * a decimal point (the tokens on both sides start with a digit),
    * one of the periods in ``e.g.`` / ``i.e.``, or
    * the period after a common abbreviation such as ``pm``, ``Dr`` or ``tbsp``.

    Terminating punctuation is not included in the returned sentences, and the
    periods of abbreviations are dropped (``e.g.`` comes back as ``eg``).

    Because the tokenizer discards whitespace, "3. 5" and "3.5" look the same
    here and are both treated as a decimal number.

    Args:
        text: The string to segment.

    Returns:
        A list of sentence strings in order of appearance.
    """
    abbreviations = {'am', 'pm', 'mr', 'ms', 'dr', 'mrs', 'inc', 'tbsp', 'tsp',
                     'gal', 'lb', 'lbs', 'qt', 'pt'}

    result = []
    sentence = ""
    tokens = tokenize_sentences(text)
    last_index = len(tokens) - 1

    skip = False
    for index, token in enumerate(tokens):
        # The previous step already consumed this token.
        if skip:
            skip = False
            continue

        # Beginning of a new sentence.
        if sentence == "":
            sentence = token
            continue

        # Unambiguous punctuation that signifies the end of a sentence.
        if token in {'!', '?'}:
            result.append(sentence)
            sentence = ""
            continue

        # Ordinary word.
        if token != '.':
            sentence = sentence + " " + token
            continue

        # Ambiguous period: inspect the neighbours. The sentence is non-empty
        # here, so index >= 1 and a previous token always exists. A next token
        # may not exist when the text ends with this period.
        prev_token = tokens[index - 1]
        next_token = tokens[index + 1] if index < last_index else None

        # Period is a decimal point.
        if (next_token is not None
                and r_digits.match(prev_token) and r_digits.match(next_token)):
            sentence = sentence + token + next_token
            skip = True

        # First period of "e.g." / "i.e.": glue the second letter on.
        elif (next_token is not None
                and prev_token in {'e', 'i'} and next_token in {'g', 'e'}):
            sentence = sentence + next_token
            skip = True

        # Closing period of "e.g." / "i.e.". Checked against the tokens rather
        # than the sentence suffix so that ordinary words ending in "ie" or
        # "eg" ("movie.", "leg.") still end the sentence.
        elif (index >= 3 and prev_token in {'g', 'e'}
                and tokens[index - 2] == '.' and tokens[index - 3] in {'e', 'i'}):
            continue

        # Period after a common abbreviation written without inner periods.
        elif prev_token.lower() in abbreviations:
            continue

        # Period is the end of the sentence.
        else:
            result.append(sentence)
            sentence = ""

    if sentence:
        result.append(sentence)

    return result

def segment_clause(sentences: List[str]) -> List[str]:
    """Split each sentence into clauses.

    ``;`` always ends a clause. ``:`` and ``,`` end a clause unless they sit
    between two tokens that start with a digit (a ratio such as ``1:2`` or a
    number such as ``3,500``), in which case the pieces are joined back
    together. A comma that has another comma two tokens before or after it is
    treated as a list separator: it is dropped and the clause continues.

    Clause punctuation is not included in the returned clauses.

    Args:
        sentences: Sentence strings, e.g. the output of ``segment_sentence``.

    Returns:
        A flat list of clause strings, in order, across all sentences.
    """
    result = []

    for sentence in sentences:
        clause = ""
        tokens = tokenize_clauses(sentence)
        token_count = len(tokens)

        skip = False
        for index, token in enumerate(tokens):
            # The previous step already consumed this token.
            if skip:
                skip = False
                continue

            # Beginning of a new clause.
            if clause == "":
                clause = token
                continue

            # Ordinary word.
            if token not in {',', ';', ':'}:
                clause = clause + " " + token
                continue

            # Unambiguous punctuation that signifies the end of a clause.
            if token == ';':
                result.append(clause)
                clause = ""
                continue

            # Ambiguous ':' or ','. The clause is non-empty here, so
            # index >= 1. A following token may not exist when the sentence
            # ends with this punctuation mark, so check before indexing.
            has_next = index + 1 < token_count

            # Separator inside a number: ratio "1:2" or grouped digits "3,500".
            if (has_next and r_digits.match(tokens[index - 1])
                    and r_digits.match(tokens[index + 1])):
                clause = clause + token + tokens[index + 1]
                skip = True

            # Comma separating single-word items in a list.
            elif token == ',' and (
                    (index - 2 >= 0 and tokens[index - 2] == ',')
                    or (index + 2 < token_count and tokens[index + 2] == ',')):
                continue

            # Punctuation is the end of a clause.
            else:
                result.append(clause)
                clause = ""

        if clause:
            result.append(clause)

    return result

In [545]:
# Demo: run the whole pipeline (spell check -> expand contractions ->
# sentence segmentation -> clause segmentation) on a sample input.
#
# Other inputs to try:
#   "They'd can't beleeve you didn't do it!"
#   "Hello there, my name is: Chloe Lam! I'd like a cheese, tomato, parmesan, and egg sandwich. I had 3.5 eggs, e.g. on toast at 5 pm. today. There has been 1:2 ratios; I would like 3,500 of those"
text = "I beleeve I had 1 avocado toast, 2 slices of tomatoes, and 3 pieces of chocolate. I'd rather have had an egg; they'd love that"
text_sc = spell_check(text)
text_ec = expand_contractions(text_sc)
print("spell_check:", text_sc)
print("expand_contractions:", text_ec)

# Segment once and reuse the result for the clause step.
sentences = segment_sentence(text_ec)
print("segment_sentence:", sentences)
print("segment_clause:", segment_clause(sentences))


spell_check: I believe I had 1 avocado toast, 2 slices of tomatoes, and 3 pieces of chocolate. I'd rather have had an egg; they'd love that
expand_contractions: I believe I had 1 avocado toast, 2 slices of tomatoes, and 3 pieces of chocolate. I would rather have had an egg; they would love that
segment_sentence: ['I believe I had 1 avocado toast, 2 slices of tomatoes, and 3 pieces of chocolate', 'I would rather have had an egg; they would love that']
segment_clause: ['I believe I had 1 avocado toast', '2 slices of tomatoes', 'and 3 pieces of chocolate', 'I would rather have had an egg', 'they would love that']


In [515]:
# Convert written words to numbers
