# Recipe data from https://www.kaggle.com/hugodarwood/epirecipes

# Grab a subset of the data and convert to JSONL for labeling in Prodigy

In [2]:
import json

import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Local path of the Epicurious recipe dump downloaded from Kaggle.
recipe_file = (
    "/Users/Carol/Documents/epicurious-recipes-with-rating-and-nutrition"
    "/full_format_recipes.json"
)

In [4]:
# Parse the recipe JSON into a DataFrame, then preview the first five rows.
recipe_df = pd.read_json(path_or_buf=recipe_file, orient="records")
recipe_df.head(n=5)

Unnamed: 0,calories,categories,date,desc,directions,fat,ingredients,protein,rating,sodium,title
0,426.0,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",2006-09-01 04:00:00,,"[1. Place the stock, lentils, celery, carrot, ...",7.0,"[4 cups low-sodium vegetable or chicken stock,...",30.0,2.5,559.0,"Lentil, Apple, and Turkey Wrap"
1,403.0,"[Food Processor, Onion, Pork, Bake, Bastille D...",2004-08-20 04:00:00,This uses the same ingredients found in boudin...,[Combine first 9 ingredients in heavy medium s...,23.0,"[1 1/2 cups whipping cream, 2 medium onions, c...",18.0,4.375,1439.0,Boudin Blanc Terrine with Red Onion Confit
2,165.0,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",2004-08-20 04:00:00,,[In a large heavy saucepan cook diced fennel a...,7.0,"[1 fennel bulb (sometimes called anise), stalk...",6.0,3.75,165.0,Potato and Fennel Soup Hodge
3,,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",2009-03-27 04:00:00,The Sicilian-style tomato sauce has tons of Me...,[Heat oil in heavy large skillet over medium-h...,,"[2 tablespoons extra-virgin olive oil, 1 cup c...",,5.0,,Mahi-Mahi in Tomato Olive Sauce
4,547.0,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",2004-08-20 04:00:00,,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,"[1 12-ounce package frozen spinach soufflé, th...",20.0,3.125,452.0,Spinach Noodle Casserole


In [5]:
# Size of the loaded frame as (rows, columns).
recipe_df.shape

(20130, 11)

# Make a sample of the data for labeling 

Start with ~2000 recipes

In [6]:
# Set aside 90% of the recipes; the remaining 10% (`use`) is the labeling sample.
# The fixed random_state keeps the selection identical across re-runs.
use, ignore = train_test_split(
    recipe_df,
    test_size=0.9,
    random_state=12,
)
print(use.shape, ignore.shape)

(2013, 11) (18117, 11)


In [7]:
# Persist both halves of the split as tab-separated (.tsv) files.
use_file = (
    "/Users/Carol/Documents/epicurious-recipes-with-rating-and-nutrition"
    "/use_2013.tsv"
)
ignore_file = (
    "/Users/Carol/Documents/epicurious-recipes-with-rating-and-nutrition"
    "/ignore_18117.tsv"
)

use.to_csv(path_or_buf=use_file, sep="\t")
ignore.to_csv(path_or_buf=ignore_file, sep="\t")

In [8]:
# Pull the sample's direction lists and row labels out as plain Python lists
# so they can be zipped together when writing the JSONL file.
directions = use["directions"].tolist()
indices = use.index.tolist()

In [9]:
# Write one JSONL record per direction step. Each record carries the step text
# plus "meta" holding the recipe's row label and the step's position.
outfile = "/Users/Carol/Documents/epicurious-recipes-with-rating-and-nutrition/use_directions.jsonl"

with open(outfile, 'w', encoding='utf-8') as out:
    for index, direction_list in zip(indices, directions):
        # Recipes with no directions show up as NaN rather than a list; skip them.
        if not isinstance(direction_list, list):
            continue
        # "subpart" is the 1-based position of the step within its recipe.
        for subindex, direction_item in enumerate(direction_list, start=1):
            row = {"text": direction_item, "meta": {"row": index, "subpart": subindex}}
            out.write(json.dumps(row) + "\n")

# Make patterns file for teaching Prodigy food terms 

Start with some simple one-word terms:

```
prodigy terms.teach food_terms en_core_web_lg --seeds "flour, chicken, tomato"

prodigy terms.to-patterns food_terms /Users/Carol/Documents/epicurious-recipes-with-rating-and-nutrition/food_term_patterns.jsonl --label FOOD
```

## Load resulting terms and merge with hand-written patterns

In [16]:
# Load the term patterns from the JSONL file: one JSON object per line.
pattern_file_1 = "/Users/Carol/Documents/epicurious-recipes-with-rating-and-nutrition/food_term_patterns.jsonl"

# Read as UTF-8 explicitly, matching how this notebook writes its JSONL files,
# instead of relying on the platform's default encoding. Blank lines (e.g. a
# trailing newline at the end of the file) are skipped so json.loads never
# sees an empty string.
with open(pattern_file_1, 'r', encoding='utf-8') as p:
    term_patterns = [json.loads(line) for line in p if line.strip()]

term_patterns

[{'label': 'FOOD', 'pattern': [{'lower': 'flour'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'chicken'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'tomato'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'sauce'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'onion'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'garlic'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'onions'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'potato'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'butter'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'bread'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'rice'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'spinach'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'salad'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'pepper'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'soup'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'pasta'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'vegetable'}]},
 {'label': 'FOOD', 'pattern': [{'lower': 'chili'}]},
 {'label': 'FOOD', 'pattern': [{'l

In [18]:
# Hand-written token patterns: single-word foods plus two- and three-token
# combinations keyed on part of speech (e.g. NOUN + "juice", ADJ + "oil").
hand_written_patterns = [
    {"label": "FOOD", "pattern": [{"lower": "butter"}]},
    {"label": "FOOD", "pattern": [{"POS": "NOUN"}, {"lower": "butter"}]},
    {"label": "FOOD", "pattern": [{"lower": "salt"}]},
    {"label": "FOOD", "pattern": [{"lower": "flour"}]},
    {"label": "FOOD", "pattern": [{"lower": "vinegar"}]},
    {"label": "FOOD", "pattern": [{"lower": "chicken"}]},
    {"label": "FOOD", "pattern": [{"lower": "rice"}]},
    {"label": "FOOD", "pattern": [{"POS": "ADJ"}, {"lower": "oil"}]},
    {"label": "FOOD", "pattern": [{"POS": "ADJ"}, {"lower": "pepper"}]},
    {"label": "FOOD", "pattern": [{"POS": "ADJ"}, {"lower": "pepper"}, {"POS": "NOUN"}]},
    {"label": "FOOD", "pattern": [{"POS": "NOUN"}, {"lower": "sauce"}]},
    {"label": "FOOD", "pattern": [{"POS": "NOUN"}, {"lower": "extract"}]},
    {"label": "FOOD", "pattern": [{"POS": "NOUN"}, {"lower": "juice"}]},
    {"label": "FOOD", "pattern": [{"POS": "NOUN"}, {"lower": "stock"}]},
]

# Several single-word entries above (e.g. "butter", "flour", "chicken", "rice")
# are also present in term_patterns, so a plain concatenation would repeat
# them. Keying on the canonical JSON text keeps each distinct pattern once,
# in first-seen order (dicts preserve insertion order).
patterns = list(
    {
        json.dumps(pattern, sort_keys=True): pattern
        for pattern in hand_written_patterns + term_patterns
    }.values()
)

pattern_file_2 = "/Users/Carol/Documents/epicurious-recipes-with-rating-and-nutrition/food_patterns.jsonl"

# Write the merged patterns as JSONL: one JSON object per line.
with open(pattern_file_2, 'w', encoding='utf-8') as out:
    for pattern in patterns:
        out.write(json.dumps(pattern) + "\n")

## Begin binary labeling in Prodigy:

```
prodigy ner.teach food_ner en_core_web_lg '/Users/Carol/Documents/epicurious-recipes-with-rating-and-nutrition/use_directions.jsonl' --label FOOD --patterns '/Users/Carol/Documents/epicurious-recipes-with-rating-and-nutrition/food_patterns.jsonl' --unsegmented

```
  


## Batch train after accumulating ~2000 decisions

```
prodigy ner.batch-train food_ner en_core_web_lg --output '/Users/Carol/Documents/epicurious-recipes-with-rating-and-nutrition/ner_models/food_2192' --unsegmented --n-iter 20

```

## Continue labeling, now without patterns

```
prodigy ner.teach food_ner '/Users/Carol/Documents/epicurious-recipes-with-rating-and-nutrition/ner_models/food_2192' '/Users/Carol/Documents/epicurious-recipes-with-rating-and-nutrition/use_directions.jsonl' --label FOOD --unsegmented
```

In [None]:
# Possible other categories: tool/equipment (label "bowl" or "small bowl"? "saucepan" or "heavy saucepan"?)
# Interesting items: "butter" and "salt" occur as both verb and noun (could handle this in patterns)
# Pattern ideas: noun + juice, noun + extract, chicken + noun