# Ingredient Parser Evaluation

In [1]:
ingredients = [
    "3 cups all-purpose flour",
    "1 teaspoon nutmeg",
    "1 teaspoon ground ginger",
    "2 teaspoons cinnamon",
    "1 teaspoon baking soda",
    "1 teaspoon salt",
    "1 cup chopped pecans",
    "3 ripe bananas",
    "2 cups granulated sugar",
    "1 20-ounce can of diced pineapple, drained",
    "1 cup canola oil",
    "3 large eggs",
    "16 ounces cream cheese",
    "4 ounces unsalted butter",
    "1 teaspoon vanilla extract",
    "½ teaspoon salt",
    "6 cups powdered sugar",
    "½ cup chopped pecans, for decorating",
    "9 tablespoons unsalted butter, at room temperature",
    "1 cup plus 2 tablespoons sugar",
    "3  large eggs",
    "1 ¼ cups all-purpose flour",
    "1 pinch salt",
    "1 cup fresh ricotta",
    "Zest of 1 lemon",
    "1 tablespoon baking powder",
    "1  apple, peeled and grated (should yield about 1 cup)",
    "Confectioners' sugar for serving",
    '1/2 large sweet red onion, thinly sliced'
]

import nltk
from nltk.tag.stanford import StanfordNERTagger

sentence = u"Twenty miles east of Reno, Nev., " \
    "where packs of wild mustangs roam free through " \
    "the parched landscape, Tesla Gigafactory 1 " \
    "sprawls near Interstate 80."

jar = './stanford-ner.jar'
model = './trained_ar_gk.ser.gz'

# Prepare NER tagger with english model
ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

for sample in ingredients:
    # Tokenize: Split sentence into words
    print(sample)
    words = nltk.word_tokenize(sample)

    # Run NER tagger on words
    print(ner_tagger.tag(words))
    print('\n')

3 cups all-purpose flour
[('3', 'QUANTITY'), ('cups', 'UNIT'), ('all-purpose', 'NAME'), ('flour', 'NAME')]


1 teaspoon nutmeg
[('1', 'QUANTITY'), ('teaspoon', 'UNIT'), ('nutmeg', 'NAME')]


1 teaspoon ground ginger
[('1', 'QUANTITY'), ('teaspoon', 'UNIT'), ('ground', 'STATE'), ('ginger', 'NAME')]


2 teaspoons cinnamon
[('2', 'QUANTITY'), ('teaspoons', 'UNIT'), ('cinnamon', 'NAME')]


1 teaspoon baking soda
[('1', 'QUANTITY'), ('teaspoon', 'UNIT'), ('baking', 'NAME'), ('soda', 'NAME')]


1 teaspoon salt
[('1', 'QUANTITY'), ('teaspoon', 'UNIT'), ('salt', 'NAME')]


1 cup chopped pecans
[('1', 'QUANTITY'), ('cup', 'UNIT'), ('chopped', 'STATE'), ('pecans', 'NAME')]


3 ripe bananas
[('3', 'QUANTITY'), ('ripe', 'STATE'), ('bananas', 'NAME')]


2 cups granulated sugar
[('2', 'QUANTITY'), ('cups', 'UNIT'), ('granulated', 'STATE'), ('sugar', 'NAME')]


1 20-ounce can of diced pineapple, drained
[('1', 'QUANTITY'), ('20-ounce', 'UNIT'), ('can', 'UNIT'), ('of', 'O'), ('diced', 'STATE'), ('pine

# Evaluation

In [1]:
def get_samples(file):
    label2id = {'DF': 0, 'NAME': 1, 'O': 2, 'QUANTITY': 3, 'SIZE': 4, 'STATE': 5, 'TEMP': 6, 'UNIT': 7}
    samples = []
    label_types = set()
    
    with open(file, 'r') as f:
        lines = f.readlines()
        label = []
        sentence = []
        for line in lines:
            line = line.strip().strip('\n')
            if not line:
                if label and sentence:
                    sample = {'text': " ".join(sentence), 'labels': label}
                    samples.append(sample)
                label = []
                sentence = []
            else:
                token, tag = line.split('\t')
#                 token = convert_number(token)
                token = token.lower()
                if len(token.split()) > 1:
                    tokensplit = token.split()
                    for tokensplit_item in tokensplit:
                        sentence.append(tokensplit_item)
                        label.append(label2id[tag])
                else:
                    sentence.append(token)
                    label.append(label2id[tag])
                    label_types.add(tag)
        sample = {'text': " ".join(sentence), 'labels': label}
        samples.append(sample)           
                    
    for sample in samples:
        sample['text'] = sample['text'].replace(' ,', ',')
        
    return samples, label_types

def compute_entity_level_f1_score(ground_truths, predictions):
    label2id = {'DF': 0, 'NAME': 1, 'O': 2, 'QUANTITY': 3, 'SIZE': 4, 'STATE': 5, 'TEMP': 6, 'UNIT': 7}
    id2label = {v: k for k, v in label2id.items()}
    statistics = {'tp': 0, 'fn': 0, 'fp': 0}
    counting = {k: statistics.copy() for k, v in id2label.items()}
    for g, p in zip(ground_truths, predictions):
        if g != p:
            counting[g]['fn'] += 1
            counting[p]['fp'] += 1
        else:
            counting[g]['tp'] += 1
    
    recall_precision_f1_score = {}    
    for k, v in counting.items():
        recall = v['tp'] / (v['tp'] + v['fn']) if v['tp'] + v['fn'] != 0 else 0
        precision = v['tp'] / (v['tp'] + v['fp']) if v['tp'] + v['fp'] != 0 else 0
        f1_score = 2 * precision * recall / (precision + recall) if precision + recall != 0 else 0
        recall_precision_f1_score[k] = {'Recall': recall, 'Precision': precision, 'F1_score': f1_score}
    return recall_precision_f1_score

In [2]:
test_file = ['./train/ar_gk_test.tsv', './train/ar_test.tsv', './train/gk_test.tsv']
samples, _ = get_samples(test_file)

In [9]:
import nltk
import tqdm
from nltk.tag.stanford import StanfordNERTagger

jar = './stanford-ner.jar'
model = './trained_ar.ser.gz' # './trained_ar_gk.ser.gz', './trained_ar.ser.gz', './trained_gk.ser.gz'

ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')
label2id = {'DF': 0, 'NAME': 1, 'O': 2, 'QUANTITY': 3, 'SIZE': 4, 'STATE': 5, 'TEMP': 6, 'UNIT': 7}
id2label = {v: k for k, v in label2id.items()}

test_files = ['./train/ar_gk_test.tsv', './train/ar_test.tsv', './train/gk_test.tsv']
for test_file in test_files:
    print('\n\nCurrent test file: {}\n'.format(test_file))
    test_ground_truth_list = []
    test_prediction_list = []
    samples, _ = get_samples(test_file)
    total_skip = 0
    for sample in tqdm.tqdm(samples):
        text, label = sample['text'], sample['labels']
        words = nltk.word_tokenize(text)
        output = ner_tagger.tag(words)
        prediction = [label2id[pred] for token, pred in output]
        if not len(label) == len(prediction): total_skip += 1
        test_ground_truth_list.extend(label)
        test_prediction_list.extend(prediction)

    test_f1_score = compute_entity_level_f1_score(test_ground_truth_list, test_prediction_list)
    correct = 0
    for p, g in zip(test_prediction_list, test_ground_truth_list):
        if p == g: correct += 1
    print(test_f1_score)
    print('Test Acc: {}'.format(correct / len(test_prediction_list)))
    print('Total Skip: {}'.format(total_skip))



Current test file: ./train/ar_gk_test.tsv



100%|█████████████████████████████████████████████████| 2188/2188 [08:12<00:00,  4.44it/s]


{0: {'Recall': 0.41836734693877553, 'Precision': 0.4205128205128205, 'F1_score': 0.4194373401534527}, 1: {'Recall': 0.68019747668678, 'Precision': 0.5746061167747915, 'F1_score': 0.6229590555136901}, 2: {'Recall': 0.4921109902067465, 'Precision': 0.611768684477511, 'F1_score': 0.5454545454545455}, 3: {'Recall': 0.6458885941644562, 'Precision': 0.6251604621309371, 'F1_score': 0.6353555120678408}, 4: {'Recall': 0.33653846153846156, 'Precision': 0.3333333333333333, 'F1_score': 0.33492822966507174}, 5: {'Recall': 0.49039341262580055, 'Precision': 0.46568201563857514, 'F1_score': 0.47771836007130125}, 6: {'Recall': 0.27906976744186046, 'Precision': 0.2608695652173913, 'F1_score': 0.2696629213483146}, 7: {'Recall': 0.5717501406865504, 'Precision': 0.6011834319526628, 'F1_score': 0.5860974906259014}}
Test Acc: 0.5798673429574717
Total Skip: 5


Current test file: ./train/ar_test.tsv



100%|███████████████████████████████████████████████████| 483/483 [01:47<00:00,  4.51it/s]


{0: {'Recall': 0.9607843137254902, 'Precision': 0.98, 'F1_score': 0.9702970297029702}, 1: {'Recall': 0.9765258215962441, 'Precision': 0.9433106575963719, 'F1_score': 0.9596309111880047}, 2: {'Recall': 0.9150779896013865, 'Precision': 0.947935368043088, 'F1_score': 0.9312169312169312}, 3: {'Recall': 0.9963702359346642, 'Precision': 1.0, 'F1_score': 0.9981818181818182}, 4: {'Recall': 1.0, 'Precision': 1.0, 'F1_score': 1.0}, 5: {'Recall': 0.9581993569131833, 'Precision': 0.952076677316294, 'F1_score': 0.9551282051282052}, 6: {'Recall': 0.7, 'Precision': 0.875, 'F1_score': 0.7777777777777777}, 7: {'Recall': 0.9661399548532731, 'Precision': 0.981651376146789, 'F1_score': 0.9738339021615472}}
Test Acc: 0.9630550621669627
Total Skip: 0


Current test file: ./train/gk_test.tsv



100%|█████████████████████████████████████████████████| 1705/1705 [06:20<00:00,  4.48it/s]

{0: {'Recall': 0.22758620689655173, 'Precision': 0.22758620689655173, 'F1_score': 0.22758620689655173}, 1: {'Recall': 0.5898353614889048, 'Precision': 0.47990681421083287, 'F1_score': 0.5292228644829802}, 2: {'Recall': 0.4132258064516129, 'Precision': 0.53375, 'F1_score': 0.46581818181818174}, 3: {'Recall': 0.5330216247808299, 'Precision': 0.5100671140939598, 'F1_score': 0.5212917976564733}, 4: {'Recall': 0.17857142857142858, 'Precision': 0.17647058823529413, 'F1_score': 0.1775147928994083}, 5: {'Recall': 0.30434782608695654, 'Precision': 0.2840095465393795, 'F1_score': 0.2938271604938272}, 6: {'Recall': 0.15151515151515152, 'Precision': 0.13157894736842105, 'F1_score': 0.14084507042253522}, 7: {'Recall': 0.44111027756939236, 'Precision': 0.4688995215311005, 'F1_score': 0.4545805952841129}}
Test Acc: 0.472
Total Skip: 5





In [8]:
import nltk
import tqdm
from nltk.tag.stanford import StanfordNERTagger

jar = './stanford-ner.jar'
model = './trained_gk.ser.gz' # './trained_ar_gk.ser.gz', './trained_ar.ser.gz', './trained_gk.ser.gz'

ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')
label2id = {'DF': 0, 'NAME': 1, 'O': 2, 'QUANTITY': 3, 'SIZE': 4, 'STATE': 5, 'TEMP': 6, 'UNIT': 7}
id2label = {v: k for k, v in label2id.items()}

test_files = ['./train/ar_gk_test.tsv', './train/ar_test.tsv', './train/gk_test.tsv']
for test_file in test_files:
    print('\n\nCurrent test file: {}\n'.format(test_file))
    test_ground_truth_list = []
    test_prediction_list = []
    samples, _ = get_samples(test_file)
    total_skip = 0
    for sample in tqdm.tqdm(samples):
        text, label = sample['text'], sample['labels']
        words = nltk.word_tokenize(text)
        output = ner_tagger.tag(words)
        prediction = [label2id[pred] for token, pred in output]
        if not len(label) == len(prediction): total_skip += 1
        test_ground_truth_list.extend(label)
        test_prediction_list.extend(prediction)

    test_f1_score = compute_entity_level_f1_score(test_ground_truth_list, test_prediction_list)
    correct = 0
    for p, g in zip(test_prediction_list, test_ground_truth_list):
        if p == g: correct += 1
    print(test_f1_score)
    print('Test Acc: {}'.format(correct / len(test_prediction_list)))
    print('Total Skip: {}'.format(total_skip))



Current test file: ./train/ar_gk_test.tsv



100%|█████████████████████████████████████████████████| 2188/2188 [10:20<00:00,  3.52it/s]


{0: {'Recall': 0.42857142857142855, 'Precision': 0.4329896907216495, 'F1_score': 0.43076923076923074}, 1: {'Recall': 0.6352166758091059, 'Precision': 0.642084835042972, 'F1_score': 0.6386322900868605}, 2: {'Recall': 0.6050054406964092, 'Precision': 0.5840336134453782, 'F1_score': 0.5943345804382685}, 3: {'Recall': 0.6370468611847923, 'Precision': 0.6398756660746003, 'F1_score': 0.638458130261409}, 4: {'Recall': 0.3269230769230769, 'Precision': 0.33663366336633666, 'F1_score': 0.33170731707317075}, 5: {'Recall': 0.48032936870997256, 'Precision': 0.48342541436464087, 'F1_score': 0.48187241854061497}, 6: {'Recall': 0.3488372093023256, 'Precision': 0.3409090909090909, 'F1_score': 0.3448275862068966}, 7: {'Recall': 0.6032639279684862, 'Precision': 0.6287390029325514, 'F1_score': 0.6157380815623205}}
Test Acc: 0.6017167381974249
Total Skip: 5


Current test file: ./train/ar_test.tsv



100%|███████████████████████████████████████████████████| 483/483 [02:18<00:00,  3.49it/s]


{0: {'Recall': 0.9607843137254902, 'Precision': 0.9423076923076923, 'F1_score': 0.9514563106796117}, 1: {'Recall': 0.9248826291079812, 'Precision': 0.9680589680589681, 'F1_score': 0.9459783913565427}, 2: {'Recall': 0.9618717504332756, 'Precision': 0.7838983050847458, 'F1_score': 0.8638132295719845}, 3: {'Recall': 0.9600725952813067, 'Precision': 1.0, 'F1_score': 0.9796296296296296}, 4: {'Recall': 0.95, 'Precision': 1.0, 'F1_score': 0.9743589743589743}, 5: {'Recall': 0.9228295819935691, 'Precision': 0.959866220735786, 'F1_score': 0.9409836065573771}, 6: {'Recall': 0.9, 'Precision': 1.0, 'F1_score': 0.9473684210526316}, 7: {'Recall': 0.8510158013544018, 'Precision': 0.9792207792207792, 'F1_score': 0.9106280193236714}}
Test Acc: 0.9282415630550621
Total Skip: 0


Current test file: ./train/gk_test.tsv



100%|█████████████████████████████████████████████████| 1705/1705 [08:10<00:00,  3.48it/s]

{0: {'Recall': 0.2413793103448276, 'Precision': 0.24647887323943662, 'F1_score': 0.24390243902439024}, 1: {'Recall': 0.5468861846814602, 'Precision': 0.5470819906910133, 'F1_score': 0.5469840701628781}, 2: {'Recall': 0.5383870967741935, 'Precision': 0.5383870967741935, 'F1_score': 0.5383870967741935}, 3: {'Recall': 0.5330216247808299, 'Precision': 0.5293093441671504, 'F1_score': 0.5311589982527666}, 4: {'Recall': 0.17857142857142858, 'Precision': 0.18292682926829268, 'F1_score': 0.1807228915662651}, 5: {'Recall': 0.30434782608695654, 'Precision': 0.30241423125794153, 'F1_score': 0.30337794773741233}, 6: {'Recall': 0.18181818181818182, 'Precision': 0.17142857142857143, 'F1_score': 0.1764705882352941}, 7: {'Recall': 0.5213803450862715, 'Precision': 0.5265151515151515, 'F1_score': 0.5239351677346399}}
Test Acc: 0.5098
Total Skip: 5





In [10]:
import nltk
import tqdm
from nltk.tag.stanford import StanfordNERTagger

jar = './stanford-ner.jar'
model = './trained_ar_gk.ser.gz' # './trained_ar_gk.ser.gz', './trained_ar.ser.gz', './trained_gk.ser.gz'

ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')
label2id = {'DF': 0, 'NAME': 1, 'O': 2, 'QUANTITY': 3, 'SIZE': 4, 'STATE': 5, 'TEMP': 6, 'UNIT': 7}
id2label = {v: k for k, v in label2id.items()}

test_files = ['./train/ar_gk_test.tsv', './train/ar_test.tsv', './train/gk_test.tsv']
for test_file in test_files:
    print('\n\nCurrent test file: {}\n'.format(test_file))
    test_ground_truth_list = []
    test_prediction_list = []
    samples, _ = get_samples(test_file)
    total_skip = 0
    for sample in tqdm.tqdm(samples):
        text, label = sample['text'], sample['labels']
        words = nltk.word_tokenize(text)
        output = ner_tagger.tag(words)
        prediction = [label2id[pred] for token, pred in output]
        if not len(label) == len(prediction): total_skip += 1
        test_ground_truth_list.extend(label)
        test_prediction_list.extend(prediction)

    test_f1_score = compute_entity_level_f1_score(test_ground_truth_list, test_prediction_list)
    correct = 0
    for p, g in zip(test_prediction_list, test_ground_truth_list):
        if p == g: correct += 1
    print(test_f1_score)
    print('Test Acc: {}'.format(correct / len(test_prediction_list)))
    print('Total Skip: {}'.format(total_skip))



Current test file: ./train/ar_gk_test.tsv



100%|█████████████████████████████████████████████████| 2188/2188 [10:44<00:00,  3.39it/s]


{0: {'Recall': 0.42346938775510207, 'Precision': 0.43005181347150256, 'F1_score': 0.4267352185089974}, 1: {'Recall': 0.6453647833241909, 'Precision': 0.6448342011510003, 'F1_score': 0.6450993831391364}, 2: {'Recall': 0.6006528835690969, 'Precision': 0.5978878960194963, 'F1_score': 0.5992672004342515}, 3: {'Recall': 0.6436781609195402, 'Precision': 0.6422584913983238, 'F1_score': 0.642967542503864}, 4: {'Recall': 0.3269230769230769, 'Precision': 0.3333333333333333, 'F1_score': 0.3300970873786408}, 5: {'Recall': 0.4876486733760293, 'Precision': 0.48498635122838946, 'F1_score': 0.4863138686131387}, 6: {'Recall': 0.3488372093023256, 'Precision': 0.3333333333333333, 'F1_score': 0.3409090909090909}, 7: {'Recall': 0.6268992684299382, 'Precision': 0.6369353916523728, 'F1_score': 0.6318774815655134}}
Test Acc: 0.6083495903238393
Total Skip: 5


Current test file: ./train/ar_test.tsv



100%|███████████████████████████████████████████████████| 483/483 [02:23<00:00,  3.37it/s]


{0: {'Recall': 0.9411764705882353, 'Precision': 0.9411764705882353, 'F1_score': 0.9411764705882353}, 1: {'Recall': 0.9624413145539906, 'Precision': 0.9738717339667459, 'F1_score': 0.9681227863046045}, 2: {'Recall': 0.9601386481802426, 'Precision': 0.8878205128205128, 'F1_score': 0.9225645295587012}, 3: {'Recall': 0.9872958257713249, 'Precision': 1.0, 'F1_score': 0.993607305936073}, 4: {'Recall': 0.95, 'Precision': 1.0, 'F1_score': 0.9743589743589743}, 5: {'Recall': 0.9453376205787781, 'Precision': 0.9735099337748344, 'F1_score': 0.9592169657422512}, 6: {'Recall': 0.9, 'Precision': 1.0, 'F1_score': 0.9473684210526316}, 7: {'Recall': 0.945823927765237, 'Precision': 0.9882075471698113, 'F1_score': 0.9665513264129182}}
Test Acc: 0.9616341030195382
Total Skip: 0


Current test file: ./train/gk_test.tsv



100%|█████████████████████████████████████████████████| 1705/1705 [08:24<00:00,  3.38it/s]

{0: {'Recall': 0.2413793103448276, 'Precision': 0.24647887323943662, 'F1_score': 0.24390243902439024}, 1: {'Recall': 0.5486757337151038, 'Precision': 0.5461346633416458, 'F1_score': 0.5474022495982861}, 2: {'Recall': 0.5335483870967742, 'Precision': 0.5389377647442164, 'F1_score': 0.5362295347706275}, 3: {'Recall': 0.5330216247808299, 'Precision': 0.5293093441671504, 'F1_score': 0.5311589982527666}, 4: {'Recall': 0.17857142857142858, 'Precision': 0.18072289156626506, 'F1_score': 0.17964071856287425}, 5: {'Recall': 0.30562659846547313, 'Precision': 0.2998745294855709, 'F1_score': 0.3027232425585814}, 6: {'Recall': 0.18181818181818182, 'Precision': 0.16666666666666666, 'F1_score': 0.17391304347826086}, 7: {'Recall': 0.5213803450862715, 'Precision': 0.5245283018867924, 'F1_score': 0.5229495861550038}}
Test Acc: 0.5089
Total Skip: 5



