In [1]:
!pip install word2number
!pip install num2words
!pip install pyspellchecker
!pip install spacy
!python3 -m spacy download en_core_web_sm

import re
import csv
import spacy
import json
from word2number import w2n
from num2words import num2words
from typing import List
from spellchecker import SpellChecker

# Processing
# 1. Expand English contractions
# 2. Convert written words to numbers
# 3. Segment by sentences, then segment by clauses
# 4. Spell checker
# 5. Case folding, remove non-alphanumeric characters, remove stop-words
# 6. Stemming and lemmatization
# 7. Vectorization

Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 2.7 MB/s eta 0:00:01     |█████████████████████████▉      | 11.1 MB 2.7 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# Tokenize methods

def tokenize_sentences(text):
    # tokenize with . ! ? to segment sentences 
    RE_TOK = re.compile(r'([.!?]|\s+)')

    prev_idx = 0
    tokens = []
    for m in RE_TOK.finditer(text):
        t = text[prev_idx:m.start()].strip()
        if t: tokens.append(t)
        t = m.group().strip()
        if t: tokens.append(t)
        prev_idx = m.end()

    t = text[prev_idx:].strip()
    if t: tokens.append(t)
    return tokens

def tokenize_clauses(text):
    # tokenize with , : ; to segment clauses 
    RE_TOK = re.compile(r'([,:;]|\s+)')

    prev_idx = 0
    tokens = []
    for m in RE_TOK.finditer(text):
        t = text[prev_idx:m.start()].strip()
        if t: tokens.append(t)
        t = m.group().strip()
        if t: tokens.append(t)
        prev_idx = m.end()

    t = text[prev_idx:].strip()
    if t: tokens.append(t)
    return tokens

def tokenize_regex(text):
    # tokenize with . ! ? , : ; to segment sentences and clauses 
    RE_TOK = re.compile(r'([.,:]|\s+)')

    prev_idx = 0
    tokens = []
    for m in RE_TOK.finditer(text):
        t = text[prev_idx:m.start()].strip()
        if t: tokens.append(t)
        t = m.group().strip()
        if t: tokens.append(t)
        prev_idx = m.end()

    t = text[prev_idx:].strip()
    if t: tokens.append(t)
    return tokens

def tokenize_w2n(text):
    RE_TOK = re.compile(r'([[a-zA-Z]]|n\'t|\s+)')    # update with more symbols

    prev_idx = 0
    tokens = []
    for m in RE_TOK.finditer(text):
        t = text[prev_idx:m.start()].strip()
        if t: tokens.append(t)
        t = m.group().strip()
        if t: tokens.append(t)
        prev_idx = m.end()
#         if t:
#             if tokens and tokens[-1] in {'Mr', 'Ms'} and t == '.':
#                 tokens[-1] = tokens[-1] + t
#             else:
#                 tokens.append(t)
#         prev_idx = m.end()

    t = text[prev_idx:].strip()
    if t: tokens.append(t)
    return tokens

tokenize_w2n("I had 100ml of milk")

  RE_TOK = re.compile(r'([[a-zA-Z]]|n\'t|\s+)')    # update with more symbols


['I', 'had', '100ml', 'of', 'milk']

In [3]:
# Expand English Contractions 
# list of contractions from https://englishstudypage.com/grammar/list-of-contractions-in-english/

# Read the English contractions list csv file and convert to dictionary where key = abbreviation, value = contracted words
with open('english-contractions-list.csv', mode='r') as input:
        reader = csv.reader(input)
        contractionsDict = {rows[0]:rows[1] for rows in reader}
        contractionsDict.pop('Abbreviation')
    
def expand_contractions(text):
    result = text
    tokens = tokenize_regex(text)
    
    for token in tokens:
        # if the token is an abbreviation, then we will change it to its contracted English words
        if contractionsDict.get(token.lower()) != None: 
            new = contractionsDict.get(token.lower())

            # if the token was the beginning of a sentence, then we change the capitalization to match it
            if token[:1].isupper(): new = new[:1].upper() + new[1:]   
            result = result.replace(token, new)

    return result

In [4]:
# Convert written words to numbers

def word_to_number(text):
    # regular expression to find written number words and digits
    r = re.compile(r'(\d)|\b(zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|fourty|fifty|sixty|seventy|eight|ninety|hundred|thousand|million|billion|point)\b')
    r_digits = re.compile(r'(\d)')
    
    tokens = tokenize_regex(text)
#     tokens_updated = []
    for t in tokens:
#         print(t)
#         print("digits: ", r_digits.match(t))
#         print("letters: ", re.search('[a-zA-Z]', t))
        if r_digits.match(t) and re.search('[a-zA-Z]', t):
            for index in range(len(t)):
#                 print("index: ", index)
#                 print("t[index]: ", t[index])
                if re.search('[a-zA-Z]', t[index]) :
#                     print("index: ", index)
#                     print("index-1:", t[:index])
#                     print("index:", t[index:])
                    new_token = t[0:index] + " " + t[index:]
#                     print("t: ", t)
#                     print("new_token: ", new_token)
#                     print("old: ", text)
                    text = text.replace(t, new_token)
#                     print("new: ", text)
                    break
#     print("after: ", text)
    result = text
    tokens = tokenize_regex(text)
    
#     print(tokens)
    words = []
    
    fraction = False
    symbols_exist = False
    mix = False
    for index in range(len(tokens)):

        # if we find a word that represents a number
        if r.match(tokens[index].lower()): 
            words.append(tokens[index])
        
        # if we find a fraction
        elif index+1 < len(tokens) and tokens[index].lower() == 'out' and tokens[index+1].lower() == 'of':
            words.append(tokens[index])
            words.append(tokens[index+1])
            fraction = True
            
        # if we reach here, then tokens[index] is not a number word but our previous token(s) were number words
        elif words:
            old, replace = "", ""
            for w in words: 
#                 print(w)
                if '/' in w or '%' in w: symbols_exist = True
                
                # if there is a mix of digits and number words
#                 print(w)
                elif r_digits.match(w):
#                     if '/' in w: continue
                    replace = old + num2words(w) + " "
                    mix = True 
                elif mix: replace = replace + w + " "
                old = old + w + " "
                    
            old = old[:-1]
            if replace: replace = replace[:-1]
            if symbols_exist: 
                continue
                symbols_exist = False
            
            # if old contains "out of", then we add a slash for the fraction
            elif "out of" in old: 
                if r_digits.match(old): result = result.replace(" out of", "/")
                else: result = result.replace(old, str(w2n.word_to_num(str(old))) + '/')
                   
            # if we didn't have this "six out of ten" would be converted to "6/ 10" instead of "6/10"
            elif fraction: 
                result = result.replace(" " + old, str(w2n.word_to_num(old)))
                fraction = False
            
            # if the text we are converting has a mix of words and digits (eg. "fifty 4")
            elif mix: 
                result = result.replace(old, str(w2n.word_to_num(replace)))
                mix = False
                
            # w2n.word_to_num() handles the conversion of the number words to number form
            else: result = result.replace(old, str(w2n.word_to_num(old)))
            words.clear()
    
    return result

# word_to_number('I had a piece of pork chop, 1/2 cup of broccoli, and 1 cup of potatoes')
word_to_number('I had 100ml of 2% milk')

'I had 100 ml of 2% milk'

In [5]:
# Sentence and Clause Segmentation

r_digits = re.compile(r'\d')

def segment_sentence(text):
    result = []
    sentence = ""
    tokens = tokenize_sentences(text)
    
    skip = False
    for index in range(len(tokens)):   
        
        if skip:
            skip = False
            continue
        
        # beginning of a new sentence
        elif sentence == "": sentence = tokens[index]
            # sentence = tokens[index][:1].lower() + tokens[index][1:] #consider case folding
        
        # segment by sentences
        elif tokens[index] in {'.','!','?'}:
            
            # unambiguous punctuation that signifies end of a sentence
            if tokens[index] in {'!','?'}:
                result.append(sentence)
                sentence = ""
            
            # ambiguous punctuation that may or may not signify end of a sentece
            elif tokens[index] == '.':
                # period [.] represents a decimal number
                if r_digits.match(tokens[index-1]) and r_digits.match(tokens[index+1]): 
                    sentence = sentence + tokens[index] + tokens[index+1] 
                    skip = True
               
                # period [.] represents common abbreviations like e.g. and i.e.
                elif index+1 < len(tokens) and tokens[index-1] in {'e', 'i'} and tokens[index+1] in {'g','e'}: 
                    sentence = sentence + tokens[index+1]
                    skip = True
                
                # period[.] represents the end of an abbreviation with periods inbetween
                elif sentence[-2:] in {'eg','ie'}: continue
                    
                # period [.] represents end of common abbreviations with no periods inbetween
                elif index-1 >= 0 and tokens[index-1].lower() in {'am','pm','mr','ms','dr','mrs','inc','tbsp','tsp','gal','lb','lbs','qt','pt'}: continue
                
                # period [.] represents the end of a sentence
                else: 
                    result.append(sentence)
                    sentence = ""
            
            # punctuation represents token is a clause (, ; :)
            # else: sentence = sentence + tokens[index]
        
        else: sentence = sentence + " " + tokens[index]
    if sentence: result.append(sentence)
            
    return result

def segment_clause(sentences: List[str]):
    result = []

    for sentence in sentences:
        clause = ""
        tokens = tokenize_clauses(sentence)
        
        skip = False
        for index in range(len(tokens)):
            if skip:
                skip = False
                continue
            
            # beginning of a new clause
            elif clause == "": 
                clause = tokens[index]
            
            elif tokens[index] in {',',';',':'}:
                # unambiguous punctuation that signifies end of a clause
                if tokens[index] in {';'}:
                    result.append(clause)
                    clause = ""
                
                # ambiguous colons [:] that may or may not signify the end of a clause
                elif tokens[index] == ':':
                    # colon [:] represents a ratio
                    if r_digits.match(tokens[index-1]) and r_digits.match(tokens[index+1]): 
                        clause = clause + tokens[index] + tokens[index+1] 
                        skip = True
                    
                    # colon [:] represents end of a clause
                    else: 
                        result.append(clause)
                        clause = ""
                
                # ambiguous commas [,] that may or may not signify the end of a clause
                else: 
                    # comma [,] represents a number with commas
                    if r_digits.match(tokens[index-1]) and r_digits.match(tokens[index+1]): 
                        clause = clause + tokens[index] + tokens[index+1] 
                        skip = True
                        
                    # comma [,] used to separate items in a list
                    elif index-2 >= 0 and tokens[index-2] == ',' or index+2 < len(tokens) and tokens[index+2] == ',': continue
                    
                    # comma [,] represents end of a clause
                    else: 
                        result.append(clause)
                        clause = ""
            
            else: clause = clause + " " + tokens[index]

        if clause:
            result.append(clause)
            clause = ""
            
    return result

In [6]:
# Spell Checker

def spell_check(clauses: List[str]):
    spell = SpellChecker()
    result = clauses.copy()
    
    # for each clause in result
    for index in range(len(result)):
        
        # split each clause by whitespaces
        for word in result[index].split():
            mispelled = spell.unknown(word)
            if mispelled: result[index] = result[index].replace(word, spell.correction(word))
                
    return result

In [7]:
# Mihir's added code:

#  Case Folding
def case_fold(clauses: List[str]):
    for index in range(len(clauses)):
        clauses[index] = clauses[index].lower()
    return clauses

# Removing non-alphanumeric characters
def remove_non_alphanumeric(clauses: List[str]):
    result = []
    for clause in clauses:
        clause = re.sub(r'[^a-zA-Z0-9./ ]', '', clause)
        result.append(clause)
    return result

# Removing stop-words
def remove_stop_words(clauses: List[str]):
    #"Long Stopword List" from https://www.ranks.nl/stopwords
    nlp = spacy.load("en_core_web_sm")
    
    item = ""
    filtered = []
    for index in range(len(clauses)):
         
        for word in clauses[index].split():
            doc = nlp(word)
#                 print("word: ", word)
            for token in doc:
#                 print(token.text, token.pos_, token.is_stop)
#                 print("verb? ", token.pos_ == 'VERB')
            
#                 if (token.is_stop or token.pos_ == 'VERB') and item == "": continue
#                 elif (token.is_stop or token.pos_ == 'VERB') and item != "":
#                     filtered.append(item)
#                     item = ""
#                 elif token.is_stop == False and item == "": item = word
#                 elif token.is_stop == False and item != "": item = item + " " + word
                if (token.is_stop) and item == "": continue
                elif (token.is_stop) and item != "":
                    filtered.append(item)
                    item = ""
                elif token.is_stop == False and item == "": item = word
                elif token.is_stop == False and item != "": item = item + " " + word

        if item != "": 
            filtered.append(item)
            item = ""

#     if item != "": filtered.append(item)

    return filtered

# doc = nlp('butter pancakes')
# for token in doc:
#     print(token.text, token.dep_)

# remove_stop_words(['i ate a turkey cheese sandwich with an apple and a glass of water'])
# remove_stop_words(['i ate a turkey cheese sandwich with an apple and a glass of water', 'and butter pancakes'])
# remove_stop_words(['i believe i had 1 avocado toast', '2.5 slices of tomatoes', 'and 3/4 pieces of chocolate', 'i would rather have had an egg', 'they would love that'])

In [8]:
# Stemming and lemmatization
def lemmatize(clauses:List[str]):
    spacy_use = spacy.load("en_core_web_sm")
    lemmatizer = spacy_use.get_pipe("lemmatizer")

    for clause in clauses:
        doc = spacy_use(clause)
        new_clause = ""
        for token in doc:
            if token is not token.lemma_:
                if new_clause == "": new_clause = token.lemma_
                else: new_clause = new_clause + " " + token.lemma_
            else:
                if new_clause == "": new_clause = token
                else: new_clause = new_clause + " " + token.lemma_
        clauses[clauses.index(clause)] = new_clause
        
    return clauses
        
lemmatize(['ate turkey cheese sandwich apple glass water'])

['eat turkey cheese sandwich apple glass water']

In [32]:
!pip3 install pyahocorasick

import glob
import os
from types import SimpleNamespace
from typing import Iterable, Tuple, Any, List, Set

import ahocorasick



In [99]:
def create_ac(data: Iterable[Tuple[str, Any]]) -> ahocorasick.Automaton:
    AC = ahocorasick.Automaton(ahocorasick.STORE_ANY)

    for span, value in data:
        if span in AC:
            t = AC.get(span)
        else:
            t = SimpleNamespace(span=span, values=set())
            AC.add_word(span, t)
        t.values.add(value)

    AC.make_automaton()
    return AC


def read_gazetteers(dirname: str) -> ahocorasick.Automaton:
    data = []
    for filename in glob.glob(os.path.join(dirname, '*.txt')):
        label = os.path.basename(filename)[:-4]
        for line in open(filename):
            data.append((line.strip(), label))
    return create_ac(data)


def match(AC: ahocorasick.Automaton, tokens: List[str]) -> List[Tuple[str, int, int, Set[str]]]:
    tokens_updated = []
    for clause in tokens:
        for word in clause.split():
            tokens_updated.append(word)
    
    smap, emap, idx = dict(), dict(), 0
    
    for i, token in enumerate(tokens_updated):
        smap[idx] = i
        idx += len(token)
        emap[idx] = i
        idx += 1

    # find matches
    text = ' '.join(tokens_updated)
    spans = []
    for eidx, t in AC.iter(text):
        eidx += 1
        sidx = eidx - len(t.span)
        sidx = smap.get(sidx, None)
        eidx = emap.get(eidx, None)
        if sidx is None or eidx is None: continue
        spans.append((t.span, sidx, eidx + 1, t.values))
        
    return spans

def contains_match(word:str, matches:List[Tuple[str, int, int, Set[str]]]):
    result = False
    for match in matches:
#         print("word:", word)
#         print("match[0]:", match[0])
        if word == match[0]:
            result = True
            break
            
    return result

AC = read_gazetteers('/Users/chloelam/PycharmProjects/health-bot/data')
print(match(AC, ['ate 1 bowl', 'paneer tikka masala', 'basmati rice']))

[('bowl', 2, 3, {'units'})]


In [120]:
# Identify entities: quantity, unit, and food item
# Return a list of tuples; each item is a unique food item; tuple contains quantity (int or float), unit, and food item
def entity_recognition(clauses:List[str]):
    result = []
    
    matches = match(AC, clauses)
    nlp = spacy.load("en_core_web_sm")
    
    item = []
    for clause in clauses:
        add_to_food_entity = False
        doc = nlp(clause)
            
        for token in doc:            
            if token.pos_ == 'VERB' and token.idx == 0: 
#                 print("don't add verb")
                continue
            elif token.pos_ == 'NUM': 
                item.append(str(token))
#                 print("quantity:", item)
            elif contains_match(str(token), matches) and not item: 
                item.append("1")
                item.append(str(token))
#                 print("quantity default and unit:", item)
            elif contains_match(str(token), matches) and item: 
                item.append(str(token))
#                 print("unit:", item)
            elif add_to_food_entity:
                food_entity = item.pop()
                item.append(food_entity + " " + str(token))
#                 print("continue adding to food item:", item)
            elif not item:
                item.append("1")
                item.append("count")
                item.append(str(token))
#                 print("quantity and unit default and new food item:", item)
                add_to_food_entity = True
            elif len(item) == 1:
                item.append("count")
                item.append(str(token))
                add_to_food_entity = True
            else:
                item.append(str(token))
#                 print("new food item:", item)
                add_to_food_entity = True
            
        if len(item) < 3: continue
        else:
            result.append(item)
            item = []

    return result
                    
# entity_recognition(['ate 1 bowl', 'paneer tikka masala', 'basmati rice'])

In [41]:
#text = "Hello there, my name is: Chloe Lam! I'd like a cheese, tomato, parmesan, and egg sandwich. I had 3.5 eggs, e.g. on toast at 5 pm. today. There has been 1:2 ratios; I would like 3,500 of those"
text = "I beleeve I had one avocado toast, 2 point five slices of tomatoes, and 3 out of 4 pieces of chocolate. I'd rather have had an egg; they'd love that"
# text = "I had 1 cup of basmati rice with chicken tikka masala"
# text = "I ate a turkey cheese sandwich with an apple and a glass of water"

text_ec = expand_contractions(text)
text_w2n = word_to_number(text_ec)
text_ss = segment_sentence(text_w2n)
text_sc = segment_clause(text_ss)
text_sp = spell_check(text_sc)
text_cf = case_fold(text_sp)
text_rna = remove_non_alphanumeric(text_cf)
text_rsw = remove_stop_words(text_rna)

print("original:", text)
print()
print("expand_contractions:", text_ec)
print()
print("word to numbers:", text_w2n)
print()
print("segment_sentence:", text_ss)
print()
print("segment_clause:", text_sc)
print()
print("spell_check:", text_sp)
print()
print("case_fold:", text_cf)
print()
print("remove_non_alphanumeric:", text_rna)
print()
print("remove_stop_words:", text_rsw)

# Processing
# 1. Expand English contractions
# 2. Convert written words to numbers
# 3. Segment by sentences, then segment by clauses
# 4. Spell checker
# 5. Case folding, remove non-alphanumeric characters, remove stop-words
# 6. Stemming and lemmatization
# 7. Vectorization

original: I beleeve I had one avocado toast, 2 point five slices of tomatoes, and 3 out of 4 pieces of chocolate. I'd rather have had an egg; they'd love that

expand_contractions: I beleeve I had one avocado toast, 2 point five slices of tomatoes, and 3 out of 4 pieces of chocolate. I would rather have had an egg; they would love that

word to numbers: I beleeve I had 1 avocado toast, 2.5 slices of tomatoes, and 3/4 pieces of chocolate. I would rather have had an egg; they would love that

segment_sentence: ['I beleeve I had 1 avocado toast, 2.5 slices of tomatoes, and 3/4 pieces of chocolate', 'I would rather have had an egg; they would love that']

segment_clause: ['I beleeve I had 1 avocado toast', '2.5 slices of tomatoes', 'and 3/4 pieces of chocolate', 'I would rather have had an egg', 'they would love that']

spell_check: ['i believe i had 1 avocado toast', '2.5 slices of tomatoes', 'and 3/4 pieces of chocolate', 'i would rather have had an egg', 'they would love that']

case_fo

In [None]:
file = open('data_CL.json',)
data = json.load(file)

quantity_correct = 0
quantity_total = 0
unit_correct = 0
unit_total = 0
food_correct = 0
food_total = 0

for i in data:
    text = i['input']
    text_ec = expand_contractions(text)
    text_w2n = word_to_number(text_ec)
    text_ss = segment_sentence(text_w2n)
    text_sc = segment_clause(text_ss)
    text_sp = spell_check(text_sc)
    text_cf = case_fold(text_sp)
    text_rna = remove_non_alphanumeric(text_cf)
    text_rsw = remove_stop_words(text_rna)
    text_lemmatize = lemmatize(text_rsw)
    text_entities = entity_recognition(text_lemmatize)
    print(text)
    
    for index, entity in enumerate(text_entities):
        print("quantity:", entity[0])
        if index < len(i['entities']): print("actual quantity:", i['entities'][index]['quantity'])
        print("unit:", entity[1])
        if index < len(i['entities']): print("actual unit:", i['entities'][index]['unit'])
        print("food:", entity[2])
        if index < len(i['entities']): print("actual food:", i['entities'][index]['ingredient'])
        print()
        
#         print("entity[0]:", entity[0])
        if '/' in entity[0]: 
            dec_entity = fraction_to_decimal(entity[0])
            if index < len(i['entities']) and dec_entity == i['entities'][index]['quantity']: 
                quantity_correct = quantity_correct + 1
                quantity_total = quantity_total + 1
        elif index < len(i['entities']) and int(entity[0]) == i['entities'][index]['quantity']: 
            quantity_correct = quantity_correct + 1
            quantity_total = quantity_total + 1
        else: quantity_total = quantity_total + 1
            
        if index < len(i['entities']) and entity[1] == i['entities'][index]['unit']: 
            unit_correct = unit_correct + 1
            unit_total = unit_total + 1
        else: unit_total = unit_total + 1
            
        if index < len(i['entities']) and entity[2] == i['entities'][index]['ingredient']: 
            food_correct = food_correct + 1
            food_total = food_total + 1
        else: food_total = food_total + 1
            
quantity_score = (quantity_correct / quantity_total) * 100
unit_score = (unit_correct / unit_total) * 100
food_score = (food_correct / food_total) * 100

# print("quantity correct:", quantity_correct)
# print("quantity total:", quantity_total)
# print("unit correct:", unit_correct)
# print("unit total:", unit_total)
# print("food correct:", food_correct)
# print("food total:", food_total)

print()
print("quantity correct:", quantity_score)
print("unit correct:", unit_score)
print("food correct:", food_score)
            

I had 3 slices of avocado toast and an apple
quantity: 3
actual quantity: 3
unit: slice
actual unit: slices
food: avocado toast
actual food: avocado toast

quantity: 1
actual quantity: 1
unit: count
actual unit: count
food: apple
actual food: apple

I ate one cup of Noosa blueberry yogurt
quantity: 1
actual quantity: 1
unit: cup
actual unit: cup
food: noose blueberry yogurt
actual food: Noosa blueberry yogurt

i ate one bowl of paneer tikka masala with basmati rice
quantity: 1
actual quantity: 1
unit: bowl
actual unit: bowl
food: paneer tikka masala
actual food: paneer tikka masala

quantity: 1
actual quantity: 1
unit: count
actual unit: count
food: basmati rice
actual food: basmati rice

i had two chocolate chip cookies with a glass of whole milk
quantity: 2
actual quantity: 2
unit: count
actual unit: count
food: chocolate chip cookie
actual food: chocolate chip cookie

quantity: 1
actual quantity: 1
unit: glass
actual unit: glass
food: milk
actual food: whole milk

i had one ferrero 

I had an omelette with a cup of mixed greens and 1/2 cup of whole milk
quantity: 1
actual quantity: 1
unit: count
actual unit: count
food: omelette
actual food: omelette

quantity: 1
actual quantity: 1
unit: cup
actual unit: cup
food: mixed green
actual food: mixed greens

quantity: 1/2
actual quantity: 0.5
unit: cup
actual unit: cup
food: milk
actual food: whole milk

i ingested one piece of chicken breast with 1 cup of mashed potatoes
quantity: 1
actual quantity: 1
unit: piece
actual unit: piece
food: chicken breast
actual food: chicken breast

quantity: 1
actual quantity: 1
unit: cup
actual unit: cup
food: mash potato
actual food: mashed potatoes

i devoured a bowl of vegetable ramen and drank 1 cup of oolong milk tea
quantity: 1
actual quantity: 1
unit: bowl
actual unit: bowl
food: vegetable raman
actual food: vegetable ramen

quantity: 1
actual quantity: 1
unit: cup
actual unit: cup
food: oolong milk tea
actual food: oolong milk tea

i munched on one bowl of white cheddar popcorn 

I ate two chicken fajitas, a cup of baked beans, and a glass of mango lassi
quantity: 2
actual quantity: 2
unit: count
actual unit: count
food: chicken fajita
actual food: chicken fajita

quantity: 1
actual quantity: 1
unit: cup
actual unit: cup
food: baked bean
actual food: baked beans

quantity: 1
actual quantity: 1
unit: glass
actual unit: glass
food: mango lassi
actual food: mango lassi

I had a bowl of shrimp and grits!
quantity: 1
actual quantity: 1
unit: bowl
actual unit: bowl
food: grit
actual food: shrimp and grits

I had two slices of tuna casserole and a cup of chocolate milk
quantity: 2
actual quantity: 2
unit: slice
actual unit: slices
food: tuna casserole
actual food: tuna casserole

quantity: 1
actual quantity: 1
unit: cup
actual unit: cup
food: chocolate milk
actual food: chocolate milk

I ate a bowl of gnocchi and had a slice of tiramisu
quantity: 1
actual quantity: 1
unit: bowl
actual unit: bowl
food: gnocchi
actual food: gnocchi

quantity: 1
actual quantity: 1
unit: 

1 Hershey's chocolate bar
quantity: 1
actual quantity: 1
unit: count
actual unit: count
food: hershey chocolate bar
actual food: Hershey's chocolate bar

I ate a bowl of beef lo mein with six pieces of orange chicken
quantity: 1
actual quantity: 1
unit: bowl
actual unit: bowl
food: beef lo mein
actual food: beef lo mein

quantity: 6
actual quantity: 6
unit: piece
actual unit: pieces
food: orange chicken
actual food: orange chicken

I had 1 lobster tail with a glass of red wine and a piece of dark chocolate truffle
quantity: 1
actual quantity: 1
unit: count
actual unit: count
food: lobster tail
actual food: lobster tail

quantity: 1
actual quantity: 1
unit: glass
actual unit: glass
food: red wine
actual food: red wine

quantity: 1
actual quantity: 1
unit: piece
actual unit: piece
food: dark chocolate truffle
actual food: dark chocolate truffle

1 slice of sourdough with 1 tsp of blackberry preserve
quantity: 1
actual quantity: 1
unit: slice
actual unit: slice
food: sourdough
actual food

In [149]:
def fraction_to_decimal(fraction:str):
    index = fraction.index('/')  
    return int(fraction[:index]) / int(fraction[index+1:])

In [115]:
nlp = spacy.load("en_core_web_sm")
# doc = nlp("ate turkey cheese sandwich apple tablespoon cup water McDonald's 1 one")
    
# print(doc.ents)

nlp = spacy.load("en_core_web_sm")
# doc = nlp("Apple is the first U.S. public company to reach a $1 trillion market value")
doc = nlp("I ate a turkey cheese sandwich with an apple and a glass of water and butter pancakes")

# for token in doc:
#     print(token.text, token.pos_, token.dep_, token.head.text, token.head.pos_,
#             [child for child in token.children])

for chunk in doc.noun_chunks:
    print(chunk.text)

# for ent in doc.ents:
#     print(ent.text, ent.label_)

I
a turkey cheese sandwich
an apple
a glass
water and butter pancakes
