In [1]:
from collections import Counter
import os

import pandas as pd
from sklearn.model_selection import train_test_split 

In [2]:
recipe_file = "/Users/Carol/Dropbox/epicurious-recipes-with-rating-and-nutrition/full_format_recipes.json"

In [3]:
recipe_df = pd.read_json(recipe_file, orient='records')
recipe_df.head(2)

Unnamed: 0,calories,categories,date,desc,directions,fat,ingredients,protein,rating,sodium,title
0,426.0,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",2006-09-01 04:00:00,,"[1. Place the stock, lentils, celery, carrot, ...",7.0,"[4 cups low-sodium vegetable or chicken stock,...",30.0,2.5,559.0,"Lentil, Apple, and Turkey Wrap"
1,403.0,"[Food Processor, Onion, Pork, Bake, Bastille D...",2004-08-20 04:00:00,This uses the same ingredients found in boudin...,[Combine first 9 ingredients in heavy medium s...,23.0,"[1 1/2 cups whipping cream, 2 medium onions, c...",18.0,4.375,1439.0,Boudin Blanc Terrine with Red Onion Confit


In [29]:
recipe_df.shape

(20130, 11)

In [24]:
recipe_df['categories'] = recipe_df['categories'].fillna(value="")

In [32]:
deduped = recipe_df.drop_duplicates(subset="title", keep="first")

In [33]:
deduped.shape

(17776, 11)

In [37]:
# compile all the categories and their counts
cats = deduped.categories.tolist()
all_cats = []
for cat in cats:
    if isinstance(cat, float):
        print(cat)
    else:
        all_cats.extend(cat)

In [38]:
cat_counts = Counter(all_cats)
cat_counts.most_common()

[('Bon Appétit', 8470),
 ('Peanut Free', 7564),
 ('Soy Free', 7279),
 ('Tree Nut Free', 6302),
 ('Vegetarian', 6027),
 ('Gourmet', 5745),
 ('Kosher', 5511),
 ('Pescatarian', 5399),
 ('Quick & Easy', 4728),
 ('Wheat/Gluten-Free', 4357),
 ('Bake', 4035),
 ('Summer', 3762),
 ('Dessert', 3153),
 ('Dairy Free', 2853),
 ('No Sugar Added', 2792),
 ('Winter', 2768),
 ('Side', 2710),
 ('Fall', 2676),
 ('Dinner', 2535),
 ('Sugar Conscious', 2172),
 ('Healthy', 2158),
 ('Tomato', 1961),
 ('Kidney Friendly', 1955),
 ('Onion', 1934),
 ('Sauté', 1900),
 ('Vegetable', 1876),
 ('Fruit', 1807),
 ('Milk/Cream', 1791),
 ('Kid-Friendly', 1653),
 ('Egg', 1599),
 ('Spring', 1540),
 ('Vegan', 1535),
 ('Herb', 1490),
 ('Garlic', 1438),
 ('Salad', 1432),
 ('Dairy', 1336),
 ('Appetizer', 1305),
 ('Lunch', 1284),
 ('Cheese', 1273),
 ('Chicken', 1240),
 ('Thanksgiving', 1228),
 ('Roast', 1173),
 ('Cocktail Party', 1041),
 ('Potato', 1032),
 ('Ginger', 1030),
 ('Soup/Stew', 1014),
 ('No-Cook', 1012),
 ('Grill/Barb

In [129]:
# Some interesting categories

cat_lists = [["Breakfast", "Lunch", "Dinner", "Appetizer", "Dessert"], ["Winter", "Spring", "Summer", "Fall"],
            ["Quick & Easy", "Gourmet"], ["Sugar Conscious", "Kidney Friendly", "Healthy", "Wheat/Gluten-Free", "Kosher", "Vegan"]]

for c_list in cat_lists:
    for c in c_list:
        print("{} : ".format(c), cat_counts[c])
    print("========")

Breakfast :  677
Lunch :  1284
Dinner :  2535
Appetizer :  1305
Dessert :  3153
Winter :  2768
Spring :  1540
Summer :  3762
Fall :  2676
Quick & Easy :  4728
Gourmet :  5745
Sugar Conscious :  2172
Kidney Friendly :  1955
Healthy :  2158
Wheat/Gluten-Free :  4357
Kosher :  5511
Vegan :  1535


In [69]:
def filter_by_category(df, category):
    mask = df.categories.apply(lambda x: category in x)
    selected = df[mask]
    return selected

def exclude_category(df, category):
    mask = df.categories.apply(lambda x: category not in x)
    selected = df[mask]
    return selected

winter = filter_by_category(deduped, "Winter")
winter.head()

Unnamed: 0,calories,categories,date,desc,directions,fat,ingredients,protein,rating,sodium,title
1,403.0,"[Food Processor, Onion, Pork, Bake, Bastille D...",2004-08-20 04:00:00,This uses the same ingredients found in boudin...,[Combine first 9 ingredients in heavy medium s...,23.0,"[1 1/2 cups whipping cream, 2 medium onions, c...",18.0,4.375,1439.0,Boudin Blanc Terrine with Red Onion Confit
7,,"[Egg, Fruit, No-Cook, Cocktail Party, Vegetari...",2004-08-20 04:00:00,Pop one of these in your mouth for a burst of ...,[Stir together sugar and chili powder. Whisk e...,,"[6 tablespoons granulated sugar, 1 1/2 tablesp...",,3.75,,Spicy-Sweet Kumquats
15,382.0,"[Soup/Stew, Garlic, Onion, No-Cook, Vegetarian...",2004-08-20 04:00:00,This uncooked soup isn't for the timid — raw g...,"[Purée vegetable juice, spinach, and escarole ...",31.0,[2 cups fresh tomato and/or carrot juice (from...,5.0,4.375,977.0,Raw Cream of Spinach Soup
24,279.0,"[Nut, Bake, Cocktail Party, Super Bowl, Quick ...",2007-01-18 04:25:29,,[Preheat oven to 325°F. Toss pecans and melted...,30.0,"[2 cups pecan halves, 3 tablespoons unsalted b...",3.0,3.75,206.0,Sea Salt-Roasted Pecans
25,95.0,"[Bread, Condiment/Spread, Fry, No-Cook, Quick ...",2004-08-20 04:00:00,,[Mince garlic and mash to a paste with a pinch...,7.0,"[1 garlic clove, 2 tablespoons olive oil, 1/4 ...",1.0,0.0,103.0,Garlic Baguette Crumbs


In [61]:
winter.iloc[1,1]

['Egg',
 'Fruit',
 'No-Cook',
 'Cocktail Party',
 'Vegetarian',
 'Winter',
 'Kumquat',
 'Gourmet']

In [67]:
gf = filter_by_category(deduped, 'Wheat/Gluten-Free')

In [68]:
gf.head()

Unnamed: 0,calories,categories,date,desc,directions,fat,ingredients,protein,rating,sodium,title
6,,"[Salad, Potato, Side, Easter, Low Fat, Quick &...",2004-08-20 04:00:00,Serve this newfangled main-course salad with a...,[Cook potatoes and carrots in large pot of boi...,,"[1 1/2 pounds small red-skinned potatoes, each...",,4.375,,Ham and Spring Vegetable Salad with Shallot Vi...
9,602.0,"[Salad, Mustard, Potato, Picnic, Lunch, Mayonn...",2008-10-23 22:24:26,Transform your picnic into un pique-nique to r...,[Chop enough parsley leaves to measure 1 table...,41.0,"[6 long parsley sprigs, divided, 1 3/4 cups re...",23.0,3.75,1696.0,Ham Persillade with Mustard Potato Salad and M...
13,174.0,"[Garlic, Sauté, Low Carb, Quick & Easy, Wheat/...",2004-08-20 04:00:00,This recipe can be prepared in 45 minutes or l...,[Sprinkle steaks with salt and pepper. Heat oi...,12.0,[4 6- to 7-ounce beef tenderloin steaks (each ...,11.0,4.375,176.0,Beef Tenderloin with Garlic and Brandy
16,146.0,"[Bread, Milk/Cream, Breakfast, Brunch, Dessert...",2008-12-04 04:00:00,Classic spoon bread is a savory pudding served...,[Butter and sugar six 2/3-to 3/4-cup ramekins....,5.0,"[1 cup water, 2/3 cup buttermilk, 1/3 cup heav...",4.0,1.875,160.0,Sweet Buttermilk Spoon Breads
19,421.0,"[Potato, High Fiber, Lunch, Tuna, Asparagus, R...",2009-03-04 04:00:00,"A springy take on Niçoise salad, with radishes...",[Puree first 5 ingredients in blender until sm...,33.0,"[1/3 cup chopped fresh chives, 1/4 cup Champag...",10.0,5.0,383.0,"Tuna, Asparagus, and New Potato Salad with Chi..."


In [70]:
non_gf = exclude_category(deduped, "Wheat/Gluten-Free")
non_gf.head()

Unnamed: 0,calories,categories,date,desc,directions,fat,ingredients,protein,rating,sodium,title
0,426.0,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",2006-09-01 04:00:00,,"[1. Place the stock, lentils, celery, carrot, ...",7.0,"[4 cups low-sodium vegetable or chicken stock,...",30.0,2.5,559.0,"Lentil, Apple, and Turkey Wrap"
1,403.0,"[Food Processor, Onion, Pork, Bake, Bastille D...",2004-08-20 04:00:00,This uses the same ingredients found in boudin...,[Combine first 9 ingredients in heavy medium s...,23.0,"[1 1/2 cups whipping cream, 2 medium onions, c...",18.0,4.375,1439.0,Boudin Blanc Terrine with Red Onion Confit
2,165.0,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",2004-08-20 04:00:00,,[In a large heavy saucepan cook diced fennel a...,7.0,"[1 fennel bulb (sometimes called anise), stalk...",6.0,3.75,165.0,Potato and Fennel Soup Hodge
3,,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",2009-03-27 04:00:00,The Sicilian-style tomato sauce has tons of Me...,[Heat oil in heavy large skillet over medium-h...,,"[2 tablespoons extra-virgin olive oil, 1 cup c...",,5.0,,Mahi-Mahi in Tomato Olive Sauce
4,547.0,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",2004-08-20 04:00:00,,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,"[1 12-ounce package frozen spinach soufflé, th...",20.0,3.125,452.0,Spinach Noodle Casserole


In [74]:
non_gf.iloc[1, 6]

['1 1/2 cups whipping cream',
 '2 medium onions, chopped',
 '5 teaspoons salt',
 '3 bay leaves',
 '3 whole cloves',
 '1 large garlic clove, crushed',
 '1 teaspoon pepper',
 '1/8 teaspoon ground nutmeg',
 'Pinch of dried thyme, crumbled',
 '8 large shallots, minced',
 '1 tablespoon butter',
 '1 pound trimmed boneless center pork loin, sinew removed cut into 1-inch chunks, well chilled',
 '3 eggs',
 '6 tablespoon all purpose flour',
 '1/4 cup tawny Port',
 '3 tablespoons dried currants, minced',
 'Lettuce leaves',
 'Cracked peppercorns',
 'Minced fresh parsley',
 'Bay leaves',
 'French bread baguette slices',
 '3 tablespoons olive oil',
 '2 large red onions, halved, sliced',
 '3 tablespoons dried currants',
 '3 tablespoons red wine vinegar',
 '1 tablespoons canned chicken broth',
 '2 teaspoons chopped fresh thyme or 3/4 teaspoon dried, crumbled',
 '1/2 teaspoon sugar']

In [75]:
kosher = filter_by_category(deduped, "Kosher")
kosher.head()

Unnamed: 0,calories,categories,date,desc,directions,fat,ingredients,protein,rating,sodium,title
3,,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",2009-03-27 04:00:00,The Sicilian-style tomato sauce has tons of Me...,[Heat oil in heavy large skillet over medium-h...,,"[2 tablespoons extra-virgin olive oil, 1 cup c...",,5.0,,Mahi-Mahi in Tomato Olive Sauce
12,766.0,"[Cake, Chocolate, Fruit, Dessert, Bake, Kid-Fr...",2012-09-18 04:00:00,"This cake is perfect for beginners—it's moist,...",[Preheat oven to 350°F. Coat cake pans with no...,48.0,"[Nonstick vegetable oil spray, 3 cups all-purp...",12.0,4.375,439.0,Banana-Chocolate Chip Cake With Peanut Butter ...
16,146.0,"[Bread, Milk/Cream, Breakfast, Brunch, Dessert...",2008-12-04 04:00:00,Classic spoon bread is a savory pudding served...,[Butter and sugar six 2/3-to 3/4-cup ramekins....,5.0,"[1 cup water, 2/3 cup buttermilk, 1/3 cup heav...",4.0,1.875,160.0,Sweet Buttermilk Spoon Breads
19,421.0,"[Potato, High Fiber, Lunch, Tuna, Asparagus, R...",2009-03-04 04:00:00,"A springy take on Niçoise salad, with radishes...",[Puree first 5 ingredients in blender until sm...,33.0,"[1/3 cup chopped fresh chives, 1/4 cup Champag...",10.0,5.0,383.0,"Tuna, Asparagus, and New Potato Salad with Chi..."
26,215.0,"[Egg, Herb, Vegetable, Side, Easter, Vegetaria...",2011-03-31 04:00:00,The glories of summer are captured in this pal...,"[Gently combine the eggs, cucumbers, shallots,...",20.0,"[6 hard-cooked eggs, diced (2 cups), 3/4 cup s...",6.0,3.75,250.0,Cucumber-Basil Egg Salad


In [76]:
non_kosher = exclude_category(deduped, "Kosher")
non_kosher.head()

Unnamed: 0,calories,categories,date,desc,directions,fat,ingredients,protein,rating,sodium,title
0,426.0,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",2006-09-01 04:00:00,,"[1. Place the stock, lentils, celery, carrot, ...",7.0,"[4 cups low-sodium vegetable or chicken stock,...",30.0,2.5,559.0,"Lentil, Apple, and Turkey Wrap"
1,403.0,"[Food Processor, Onion, Pork, Bake, Bastille D...",2004-08-20 04:00:00,This uses the same ingredients found in boudin...,[Combine first 9 ingredients in heavy medium s...,23.0,"[1 1/2 cups whipping cream, 2 medium onions, c...",18.0,4.375,1439.0,Boudin Blanc Terrine with Red Onion Confit
2,165.0,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",2004-08-20 04:00:00,,[In a large heavy saucepan cook diced fennel a...,7.0,"[1 fennel bulb (sometimes called anise), stalk...",6.0,3.75,165.0,Potato and Fennel Soup Hodge
4,547.0,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",2004-08-20 04:00:00,,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,"[1 12-ounce package frozen spinach soufflé, th...",20.0,3.125,452.0,Spinach Noodle Casserole
5,948.0,"[Sandwich, Food Processor, Tomato, Kid-Friendl...",2004-08-20 04:00:00,This recipe can be prepared in 45 minutes or l...,"[Mix basil, mayonnaise and butter in processor...",79.0,[2 1/2 cups (lightly packed) fresh basil leave...,19.0,4.375,1042.0,The Best Blts


# Make a dataset for classifying by vegan, gluten-free, and kosher categories

In [92]:
cats = deduped.categories.tolist()
ingreds = deduped.ingredients.tolist()
titles = deduped.title.tolist()

new_rows = []

for cat, ingred, title in zip(cats, ingreds, titles):
    gf = False
    k = False
    v = False
    if "Wheat/Gluten-Free" in cat:
        gf = True
    if "Kosher" in cat:
        k = True
    if "Vegan" in cat:
        v = True
    labels = (gf, k, v)
    new_row = {"title": title, "ingredients": ingred, "gluten_free": gf, "kosher" : k,
              "vegan": v}
    new_rows.append(new_row)
    

labeled = pd.DataFrame(new_rows)
labeled = labeled[["title", "ingredients", "gluten_free", "kosher", "vegan"]]
labeled.head()

Unnamed: 0,title,ingredients,gluten_free,kosher,vegan
0,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",False,False,False
1,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",False,False,False
2,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",False,False,False
3,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",False,True,False
4,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",False,False,False


In [96]:
labeled.groupby(["gluten_free"])["title"].count()

gluten_free
False    13418
True      4357
Name: title, dtype: int64

In [97]:
labeled.groupby(["kosher"])["title"].count()

kosher
False    12264
True      5511
Name: title, dtype: int64

In [98]:
labeled.groupby(["vegan"])["title"].count()

vegan
False    16240
True      1535
Name: title, dtype: int64

In [107]:
non = labeled[labeled["gluten_free"]==False]
non = non[non["kosher"]==False]
non  = non[non["vegan"]==False]
print(non.shape)
non.head()

(10497, 5)


Unnamed: 0,title,ingredients,gluten_free,kosher,vegan
0,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",False,False,False
1,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",False,False,False
2,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",False,False,False
4,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",False,False,False
5,The Best Blts,[2 1/2 cups (lightly packed) fresh basil leave...,False,False,False


In [110]:
pos = labeled[~labeled.index.isin(non.index)]
pos.shape

(7279, 5)

In [112]:
pos.groupby(["vegan"])["title"].count()

vegan
False    5744
True     1535
Name: title, dtype: int64

In [114]:
pos.groupby(["kosher"])["title"].count()

kosher
False    1768
True     5511
Name: title, dtype: int64

In [115]:
pos.groupby(["gluten_free"])["title"].count()

gluten_free
False    2922
True     4357
Name: title, dtype: int64

In [117]:
# use all rows that are vegan, gluten-free, or kosher and add 3743 rows that have none of
# those labels
non_to_use = non.sample(3743)

In [118]:
non_to_use.shape

(3743, 5)

In [119]:
all_to_use = pd.concat([pos, non_to_use])
all_to_use.shape

(11022, 5)

In [121]:
all_to_use.groupby(["vegan"])["title"].count()

vegan
False    9486
True     1535
Name: title, dtype: int64

In [122]:
all_to_use.groupby(["kosher"])["title"].count()

kosher
False    5510
True     5511
Name: title, dtype: int64

In [123]:
all_to_use.groupby(["gluten_free"])["title"].count()

gluten_free
False    6664
True     4357
Name: title, dtype: int64

In [124]:
all_to_use.head()

Unnamed: 0,title,ingredients,gluten_free,kosher,vegan
3,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",False,True,False
6,Ham and Spring Vegetable Salad with Shallot Vi...,"[1 1/2 pounds small red-skinned potatoes, each...",True,False,False
9,Ham Persillade with Mustard Potato Salad and M...,"[6 long parsley sprigs, divided, 1 3/4 cups re...",True,False,False
12,Banana-Chocolate Chip Cake With Peanut Butter ...,"[Nonstick vegetable oil spray, 3 cups all-purp...",False,True,False
13,Beef Tenderloin with Garlic and Brandy,[4 6- to 7-ounce beef tenderloin steaks (each ...,True,False,False


In [131]:
outfile = "/Users/Carol/Google Drive/nlp_data/recipe_data/20200524_gf_k_v_cats.tsv"
all_to_use.to_csv(outfile, sep="\t", index=False)

In [167]:
# write out the training, dev, and test sets
outpath = "/Users/Carol/Google Drive/nlp_data/recipe_data/20200524_categories"
for name, df in zip(["train", "dev", "test"], [train, dev, test]):
    outfile = os.path.join(outpath, name + ".tsv")
    df.to_csv(outfile, sep="\t", index=False)