In [2]:
!pip install word2number
!pip install num2words
!pip install pyspellchecker
!pip install spacy
!python3 -m spacy download en_core_web_sm

import re
import csv
import spacy
from word2number import w2n
from num2words import num2words
from typing import List
from spellchecker import SpellChecker

# Processing
# 1. Expand English contractions
# 2. Convert written words to numbers
# 3. Segment by sentences, then segment by clauses
# 4. Spell checker
# 5. Case folding, remove non-alphanumeric characters, remove stop-words
# 6. Stemming and lemmatization
# 7. Vectorization

Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 6.3 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Tokenize methods

def tokenize_sentences(text):
    # tokenize with . ! ? to segment sentences 
    RE_TOK = re.compile(r'([.!?]|\s+)')

    prev_idx = 0
    tokens = []
    for m in RE_TOK.finditer(text):
        t = text[prev_idx:m.start()].strip()
        if t: tokens.append(t)
        t = m.group().strip()
        if t: tokens.append(t)
        prev_idx = m.end()

    t = text[prev_idx:].strip()
    if t: tokens.append(t)
    return tokens

def tokenize_clauses(text):
    # tokenize with , : ; to segment clauses 
    RE_TOK = re.compile(r'([,:;]|\s+)')

    prev_idx = 0
    tokens = []
    for m in RE_TOK.finditer(text):
        t = text[prev_idx:m.start()].strip()
        if t: tokens.append(t)
        t = m.group().strip()
        if t: tokens.append(t)
        prev_idx = m.end()

    t = text[prev_idx:].strip()
    if t: tokens.append(t)
    return tokens

def tokenize_regex(text):
    # tokenize with . ! ? , : ; to segment sentences and clauses 
    RE_TOK = re.compile(r'([.,:]|\s+)')

    prev_idx = 0
    tokens = []
    for m in RE_TOK.finditer(text):
        t = text[prev_idx:m.start()].strip()
        if t: tokens.append(t)
        t = m.group().strip()
        if t: tokens.append(t)
        prev_idx = m.end()

    t = text[prev_idx:].strip()
    if t: tokens.append(t)
    return tokens

In [4]:
# Expand English Contractions 
# list of contractions from https://englishstudypage.com/grammar/list-of-contractions-in-english/

# Read the English contractions list csv file and convert to dictionary where key = abbreviation, value = contracted words
with open('english-contractions-list.csv', mode='r') as input:
        reader = csv.reader(input)
        contractionsDict = {rows[0]:rows[1] for rows in reader}
        contractionsDict.pop('Abbreviation')
    
def expand_contractions(text):
    result = text
    tokens = tokenize_regex(text)
    
    for token in tokens:
        # if the token is an abbreviation, then we will change it to its contracted English words
        if contractionsDict.get(token.lower()) != None: 
            new = contractionsDict.get(token.lower())

            # if the token was the beginning of a sentence, then we change the capitalization to match it
            if token[:1].isupper(): new = new[:1].upper() + new[1:]   
            result = result.replace(token, new)

    return result

In [5]:
# Convert written words to numbers

def word_to_number(text):
    # regular expression to find written number words and digits
    r = re.compile(r'(\d)|\b(zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|fourty|fifty|sixty|seventy|eight|ninety|hundred|thousand|million|billion|point)\b')
    r_digits = re.compile(r'(\d)')
    
    result = text
    tokens = tokenize_regex(text)
    words = []
    
    fraction = False
    mix = False
    for index in range(len(tokens)):

        # if we find a word that represents a number
        if r.match(tokens[index].lower()): 
            words.append(tokens[index])
        
        # if we find a fraction
        elif index+1 < len(tokens) and tokens[index].lower() == 'out' and tokens[index+1].lower() == 'of':
            words.append(tokens[index])
            words.append(tokens[index+1])
            fraction = True
            
        # if we reach here, then tokens[index] is not a number word but our previous token(s) were number words
        elif words:
            old, replace = "", ""
            for w in words: 
                # if there is a mix of digits and number words
                if r_digits.match(w):
                    replace = old + num2words(w) + " "
                    mix = True 
                elif mix: replace = replace + w + " "
                old = old + w + " "
                    
            old = old[:-1]
            if replace: replace = replace[:-1]
            
            # if old contains "out of", then we add a slash for the fraction
            if "out of" in old: 
                if r_digits.match(old): result = result.replace(" out of", "/")
                else: result = result.replace(old, str(w2n.word_to_num(str(old))) + '/')
                   
            # if we didn't have this "six out of ten" would be converted to "6/ 10" instead of "6/10"
            elif fraction: 
                result = result.replace(" " + old, str(w2n.word_to_num(old)))
                fraction = False
            
            # if the text we are converting has a mix of words and digits (eg. "fifty 4")
            elif mix: 
                result = result.replace(old, str(w2n.word_to_num(replace)))
                mix = False
                
            # w2n.word_to_num() handles the conversion of the number words to number form
            else: result = result.replace(old, str(w2n.word_to_num(old)))
            words.clear()
    
    return result

In [6]:
# Sentence and Clause Segmentation

r_digits = re.compile(r'\d')

def segment_sentence(text):
    result = []
    sentence = ""
    tokens = tokenize_sentences(text)
    
    skip = False
    for index in range(len(tokens)):   
        
        if skip:
            skip = False
            continue
        
        # beginning of a new sentence
        elif sentence == "": sentence = tokens[index]
            # sentence = tokens[index][:1].lower() + tokens[index][1:] #consider case folding
        
        # segment by sentences
        elif tokens[index] in {'.','!','?'}:
            
            # unambiguous punctuation that signifies end of a sentence
            if tokens[index] in {'!','?'}:
                result.append(sentence)
                sentence = ""
            
            # ambiguous punctuation that may or may not signify end of a sentece
            elif tokens[index] == '.':
                # period [.] represents a decimal number
                if r_digits.match(tokens[index-1]) and r_digits.match(tokens[index+1]): 
                    sentence = sentence + tokens[index] + tokens[index+1] 
                    skip = True
               
                # period [.] represents common abbreviations like e.g. and i.e.
                elif index+1 < len(tokens) and tokens[index-1] in {'e', 'i'} and tokens[index+1] in {'g','e'}: 
                    sentence = sentence + tokens[index+1]
                    skip = True
                
                # period[.] represents the end of an abbreviation with periods inbetween
                elif sentence[-2:] in {'eg','ie'}: continue
                    
                # period [.] represents end of common abbreviations with no periods inbetween
                elif index-1 >= 0 and tokens[index-1].lower() in {'am','pm','mr','ms','dr','mrs','inc','tbsp','tsp','gal','lb','lbs','qt','pt'}: continue
                
                # period [.] represents the end of a sentence
                else: 
                    result.append(sentence)
                    sentence = ""
            
            # punctuation represents token is a clause (, ; :)
            # else: sentence = sentence + tokens[index]
        
        else: sentence = sentence + " " + tokens[index]
    if sentence: result.append(sentence)
            
    return result

def segment_clause(sentences: List[str]):
    result = []

    for sentence in sentences:
        clause = ""
        tokens = tokenize_clauses(sentence)
        
        skip = False
        for index in range(len(tokens)):
            if skip:
                skip = False
                continue
            
            # beginning of a new clause
            elif clause == "": 
                clause = tokens[index]
            
            elif tokens[index] in {',',';',':'}:
                # unambiguous punctuation that signifies end of a clause
                if tokens[index] in {';'}:
                    result.append(clause)
                    clause = ""
                
                # ambiguous colons [:] that may or may not signify the end of a clause
                elif tokens[index] == ':':
                    # colon [:] represents a ratio
                    if r_digits.match(tokens[index-1]) and r_digits.match(tokens[index+1]): 
                        clause = clause + tokens[index] + tokens[index+1] 
                        skip = True
                    
                    # colon [:] represents end of a clause
                    else: 
                        result.append(clause)
                        clause = ""
                
                # ambiguous commas [,] that may or may not signify the end of a clause
                else: 
                    # comma [,] represents a number with commas
                    if r_digits.match(tokens[index-1]) and r_digits.match(tokens[index+1]): 
                        clause = clause + tokens[index] + tokens[index+1] 
                        skip = True
                        
                    # comma [,] used to separate items in a list
                    elif index-2 >= 0 and tokens[index-2] == ',' or index+2 < len(tokens) and tokens[index+2] == ',': continue
                    
                    # comma [,] represents end of a clause
                    else: 
                        result.append(clause)
                        clause = ""
            
            else: clause = clause + " " + tokens[index]

        if clause:
            result.append(clause)
            clause = ""
            
    return result

In [7]:
# Spell Checker

def spell_check(clauses: List[str]):
    spell = SpellChecker()
    result = clauses.copy()
    
    # for each clause in result
    for index in range(len(result)):
        
        # split each clause by whitespaces
        for word in result[index].split():
            mispelled = spell.unknown(word)
            if mispelled: result[index] = result[index].replace(word, spell.correction(word))
                
    return result

In [84]:
# Mihir's added code:

#  Case Folding
def case_fold(clauses: List[str]):
    for index in range(len(clauses)):
        clauses[index] = clauses[index].lower()
    return clauses

# Removing non-alphanumeric characters
def remove_non_alphanumeric(clauses: List[str]):
    result = []
    for clause in clauses:
        clause = re.sub(r'[^a-zA-Z0-9./ ]', '', clause)
        result.append(clause)
    return result

# # Removing stop-words
# def remove_stop_words(clauses: List[str]):
#     #"Long Stopword List" from https://www.ranks.nl/stopwords

#     result = clauses.copy()
#     stop_words_string = "a able about above abst accordance according accordingly across act actually added adj affected affecting affects after afterwards again against ah all almost alone along already also although always am among amongst an and announce another any anybody anyhow anymore anyone anything anyway anyways anywhere apparently approximately are aren arent arise around as aside ask asking at auth available away awfully b back be became because become becomes becoming been before beforehand begin beginning beginnings begins behind being believe below beside besides between beyond biol both brief briefly but by c ca came can cannot can't cause causes certain certainly co com come comes contain containing contains could couldnt d date did didn't different do does doesn't doing done don't down downwards due during e each ed edu effect eg eight eighty either else elsewhere end ending enough especially et et-al etc even ever every everybody everyone everything everywhere ex except f far few ff fifth first five fix followed following follows for former formerly forth found four from further furthermore g gave get gets getting give given gives giving go goes gone got gotten h had happens hardly has hasn't have haven't having he hed hence her here hereafter hereby herein heres hereupon hers herself hes hi hid him himself his hither home how howbeit however hundred i id ie if i'll im immediate immediately importance important in inc indeed index information instead into invention inward is isn't it itd it'll its itself i've j just k keep	keeps kept kg km know known knows l largely last lately later latter latterly least less lest let lets like liked likely line little 'll look looking looks ltd m made mainly make makes many may maybe me mean means meantime meanwhile merely mg might million miss ml more moreover most mostly mr mrs much mug must my myself n na name namely nay nd near nearly necessarily necessary need needs neither never nevertheless new next nine ninety no nobody non none nonetheless noone nor normally nos not noted nothing now nowhere o obtain obtained obviously of off often oh ok okay old omitted on once one ones only onto or ord other others otherwise ought our ours ourselves out outside over overall owing own p page pages part particular particularly past per perhaps placed please plus poorly possible possibly potentially pp predominantly present previously primarily probably promptly proud provides put q que quickly quite qv r ran rather rd re readily really recent recently ref refs regarding regardless regards related relatively research respectively resulted resulting results right run s said same saw say saying says sec section see seeing seem seemed seeming seems seen self selves sent seven several shall she shed she'll shes should shouldn't show showed shown showns shows significant significantly similar similarly since six slightly so some somebody somehow someone somethan something sometime sometimes somewhat somewhere soon sorry specifically specified specify specifying still stop strongly sub substantially successfully such sufficiently suggest sup sure	t take taken taking tell tends th than thank thanks thanx that that'll thats that've the their theirs them themselves then thence there thereafter thereby thered therefore therein there'll thereof therere theres thereto thereupon there've these they theyd they'll theyre they've think this those thou though thoughh thousand throug through throughout thru thus til tip to together too took toward towards tried tries truly try trying ts twice two u un under unfortunately unless unlike unlikely until unto up upon ups us use used useful usefully usefulness uses using usually v value various 've very via viz vol vols vs w want wants was wasnt way we wed welcome we'll went were werent we've what whatever what'll whats when whence whenever where whereafter whereas whereby wherein wheres whereupon wherever whether which while whim whither who whod whoever whole who'll whom whomever whos whose why widely willing wish with within without wont words world would wouldnt www x y yes yet you youd you'll your youre yours yourself yourselves you've z zero"
#     stop_words = stop_words_string.split()
    
#     for index in range(len(result)):
#         filtered = []
        
#         for word in result[index].split():
#             if word in stop_words: continue
#             filtered.append(word)
#         result[index] = " ".join(filtered)
#     return result

# Removing stop-words
def remove_stop_words(clauses: List[str]):
    #"Long Stopword List" from https://www.ranks.nl/stopwords
    nlp = spacy.load("en_core_web_sm")
#     doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

#     result = clauses.copy()
#     stop_words_string = "able about above abst accordance according accordingly across act actually added adj affected affecting affects after afterwards again against ah all almost alone along already also although always am among amongst and announce another any anybody anyhow anymore anyone anything anyway anyways anywhere apparently approximately are aren arent arise around as aside ask asking at auth available away awfully b back be became because become becomes becoming been before beforehand begin beginning beginnings begins behind being believe below beside besides between beyond biol both brief briefly but by c ca came can cannot can't cause causes certain certainly co com come comes contain containing contains could couldnt d date did didn't different do does doesn't doing done don't down downwards due during e each ed edu effect eg eight eighty either else elsewhere end ending enough especially et et-al etc even ever every everybody everyone everything everywhere ex except f far few ff fifth first five fix followed following follows for former formerly forth found four from further furthermore g gave get gets getting give given gives giving go goes gone got gotten h had happens hardly has hasn't have haven't having he hed hence her here hereafter hereby herein heres hereupon hers herself hes hi hid him himself his hither home how howbeit however hundred i id ie if i'll im immediate immediately importance important in inc indeed index information instead into invention inward is isn't it itd it'll its itself i've j just k keep	keeps kept kg km know known knows l largely last lately later latter latterly least less lest let lets like liked likely line little 'll look looking looks ltd m made mainly make makes many may maybe me mean means meantime meanwhile merely mg might million miss ml more moreover most mostly mr mrs much mug must my myself n na name namely nay nd near nearly necessarily necessary need needs neither never nevertheless new next nine ninety no nobody non none nonetheless noone nor normally nos not noted nothing now nowhere o obtain obtained obviously off often oh ok okay old omitted on once one ones only onto or ord other others otherwise ought our ours ourselves out outside over overall owing own p page pages part particular particularly past per perhaps placed please plus poorly possible possibly potentially pp predominantly present previously primarily probably promptly proud provides put q que quickly quite qv r ran rather rd re readily really recent recently ref refs regarding regardless regards related relatively research respectively resulted resulting results right run s said same saw say saying says sec section see seeing seem seemed seeming seems seen self selves sent seven several shall she shed she'll shes should shouldn't show showed shown showns shows significant significantly similar similarly since six slightly so some somebody somehow someone somethan something sometime sometimes somewhat somewhere soon sorry specifically specified specify specifying still stop strongly sub substantially successfully such sufficiently suggest sup sure	t take taken taking tell tends th than thank thanks thanx that that'll thats that've the their theirs them themselves then thence there thereafter thereby thered therefore therein there'll thereof therere theres thereto thereupon there've these they theyd they'll theyre they've think this those thou though thoughh thousand throug through throughout thru thus til tip to together too took toward towards tried tries truly try trying ts twice two u un under unfortunately unless unlike unlikely until unto up upon ups us use used useful usefully usefulness uses using usually v value various 've very via viz vol vols vs w want wants was wasnt way we wed welcome we'll went were werent we've what whatever what'll whats when whence whenever where whereafter whereas whereby wherein wheres whereupon wherever whether which while whim whither who whod whoever whole who'll whom whomever whos whose why widely willing wish with within without wont words world would wouldnt www x y yes yet you youd you'll your youre yours yourself yourselves you've z zero"
#     stop_words = stop_words_string.split()
    
    item = ""
    filtered = []
    for index in range(len(clauses)):
         
        for word in clauses[index].split():
            doc = nlp(word)
#                 print("word: ", word)
            for token in doc:
#                 print(token.text, token.pos_, token.is_stop)
#                 print("verb? ", token.pos_ == 'VERB')
            
#                 if (token.is_stop or token.pos_ == 'VERB') and item == "": continue
#                 elif (token.is_stop or token.pos_ == 'VERB') and item != "":
#                     filtered.append(item)
#                     item = ""
#                 elif token.is_stop == False and item == "": item = word
#                 elif token.is_stop == False and item != "": item = item + " " + word
                if (token.is_stop) and item == "": continue
                elif (token.is_stop) and item != "":
                    filtered.append(item)
                    item = ""
                elif token.is_stop == False and item == "": item = word
                elif token.is_stop == False and item != "": item = item + " " + word

                print("item: ", item)
                print("token.is_stop: ", token.is_stop)
                print(token.pos_)
                print(token.dep_)
                print()
        if item != "": 
            filtered.append(item)
            item = ""

#     if item != "": filtered.append(item)

    return filtered

doc = nlp('butter pancakes')
for token in doc:
    print(token.text, token.dep_)

# remove_stop_words(['i ate a turkey cheese sandwich with an apple and a glass of water'])
remove_stop_words(['i ate a turkey cheese sandwich with an apple and a glass of water', 'and butter pancakes'])
# remove_stop_words(['i believe i had 1 avocado toast', '2.5 slices of tomatoes', 'and 3/4 pieces of chocolate', 'i would rather have had an egg', 'they would love that'])

butter compound
pancakes ROOT
item:  ate
token.is_stop:  False
VERB
ROOT

item:  
token.is_stop:  True
DET
ROOT

item:  turkey
token.is_stop:  False
NOUN
ROOT

item:  turkey cheese
token.is_stop:  False
NOUN
ROOT

item:  turkey cheese sandwich
token.is_stop:  False
NOUN
ROOT

item:  
token.is_stop:  True
ADP
ROOT

item:  apple
token.is_stop:  False
NOUN
ROOT

item:  
token.is_stop:  True
CCONJ
ROOT

item:  glass
token.is_stop:  False
NOUN
ROOT

item:  
token.is_stop:  True
ADP
ROOT

item:  water
token.is_stop:  False
NOUN
ROOT

item:  butter
token.is_stop:  False
NOUN
ROOT

item:  butter pancakes
token.is_stop:  False
VERB
ROOT



['ate', 'turkey cheese sandwich', 'apple', 'glass', 'water', 'butter pancakes']

In [74]:
# Stemming and lemmatization
def lemmatize(clauses:List[str]):
    spacy_use = spacy.load("en_core_web_sm")
    lemmatizer = spacy_use.get_pipe("lemmatizer")

    for clause in clauses:
        doc = spacy_use(clause)
        new_clause = ""
        for token in doc:
            if token is not token.lemma_:
                if new_clause == "": new_clause = token.lemma_
                else: new_clause = new_clause + " " + token.lemma_
            else:
                if new_clause == "": new_clause = token
                else: new_clause = new_clause + " " + token.lemma_
        clauses[clauses.index(clause)] = new_clause
        
    return clauses
        
lemmatize(['ate turkey cheese sandwich apple glass water'])

['eat turkey cheese sandwich apple glass water']

In [41]:
#text = "Hello there, my name is: Chloe Lam! I'd like a cheese, tomato, parmesan, and egg sandwich. I had 3.5 eggs, e.g. on toast at 5 pm. today. There has been 1:2 ratios; I would like 3,500 of those"
text = "I beleeve I had one avocado toast, 2 point five slices of tomatoes, and 3 out of 4 pieces of chocolate. I'd rather have had an egg; they'd love that"
# text = "I had 1 cup of basmati rice with chicken tikka masala"
# text = "I ate a turkey cheese sandwich with an apple and a glass of water"

text_ec = expand_contractions(text)
text_w2n = word_to_number(text_ec)
text_ss = segment_sentence(text_w2n)
text_sc = segment_clause(text_ss)
text_sp = spell_check(text_sc)
text_cf = case_fold(text_sp)
text_rna = remove_non_alphanumeric(text_cf)
text_rsw = remove_stop_words(text_rna)

print("original:", text)
print()
print("expand_contractions:", text_ec)
print()
print("word to numbers:", text_w2n)
print()
print("segment_sentence:", text_ss)
print()
print("segment_clause:", text_sc)
print()
print("spell_check:", text_sp)
print()
print("case_fold:", text_cf)
print()
print("remove_non_alphanumeric:", text_rna)
print()
print("remove_stop_words:", text_rsw)

# Processing
# 1. Expand English contractions
# 2. Convert written words to numbers
# 3. Segment by sentences, then segment by clauses
# 4. Spell checker
# 5. Case folding, remove non-alphanumeric characters, remove stop-words
# 6. Stemming and lemmatization
# 7. Vectorization

original: I beleeve I had one avocado toast, 2 point five slices of tomatoes, and 3 out of 4 pieces of chocolate. I'd rather have had an egg; they'd love that

expand_contractions: I beleeve I had one avocado toast, 2 point five slices of tomatoes, and 3 out of 4 pieces of chocolate. I would rather have had an egg; they would love that

word to numbers: I beleeve I had 1 avocado toast, 2.5 slices of tomatoes, and 3/4 pieces of chocolate. I would rather have had an egg; they would love that

segment_sentence: ['I beleeve I had 1 avocado toast, 2.5 slices of tomatoes, and 3/4 pieces of chocolate', 'I would rather have had an egg; they would love that']

segment_clause: ['I beleeve I had 1 avocado toast', '2.5 slices of tomatoes', 'and 3/4 pieces of chocolate', 'I would rather have had an egg', 'they would love that']

spell_check: ['i believe i had 1 avocado toast', '2.5 slices of tomatoes', 'and 3/4 pieces of chocolate', 'i would rather have had an egg', 'they would love that']

case_fo

In [64]:
nlp = spacy.load("en_core_web_sm")
# doc = nlp("ate turkey cheese sandwich apple tablespoon cup water McDonald's 1 one")
    
# print(doc.ents)

nlp = spacy.load("en_core_web_sm")
# doc = nlp("Apple is the first U.S. public company to reach a $1 trillion market value")
doc = nlp("I ate a turkey cheese sandwich with an apple and a glass of water and butter pancakes")

# for token in doc:
#     print(token.text, token.pos_, token.dep_, token.head.text, token.head.pos_,
#             [child for child in token.children])

for chunk in doc.noun_chunks:
    print(chunk.text)

# for ent in doc.ents:
#     print(ent.text, ent.label_)

I
a turkey cheese sandwich
an apple
a glass
water and butter pancakes
