In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the training and test tables from JSON files in the sibling ../data directory.
train = pd.read_json("../data/train.json")
test  = pd.read_json("../data/test.json")

In [3]:
# Explode the ingredients column so each row holds a single ingredient string;
# a recipe's original row label is repeated once per ingredient.
train_ingredients_list = train[['ingredients']].explode('ingredients')
train_ingredients_list.sample(n=5)

Unnamed: 0,ingredients
522,salt
31588,mayonaise
33965,cardamom
29483,white onion
26646,lemon juice


In [272]:
# Explode the test set's ingredients column so each row holds a single ingredient string;
# a recipe's original row label is repeated once per ingredient.
test_ingredients_list = test[['ingredients']].explode('ingredients')
test_ingredients_list.sample(n=5)

Unnamed: 0,ingredients
1755,ground black pepper
1490,hellmann' or best food real mayonnais
2002,sugar
3218,unsalted butter
9351,worcestershire sauce


In [266]:
# For every distinct ingredient string, count the exploded training rows that carry it
# (counted through the non-null "id" values) and expose the result as a "count" column.
train_ingredient_counts = (
    train.explode("ingredients")
    .groupby('ingredients')['id']
    .count()
    .to_frame("count")
)
train_ingredient_counts.sample(n=5)

Unnamed: 0_level_0,count
ingredients,Unnamed: 1_level_1
meat marinade,2
dry red wine,309
salmon fillets,159
barley flakes,1
mixed greens,63


In [267]:
# For every distinct ingredient string, count the exploded test rows that carry it
# (counted through the non-null "id" values) and expose the result as a "count" column.
test_ingredient_counts = (
    test.explode("ingredients")
    .groupby('ingredients')['id']
    .count()
    .to_frame("count")
)
test_ingredient_counts.sample(n=5)

Unnamed: 0_level_0,count
ingredients,Unnamed: 1_level_1
pineapple chunks,14
acorn squash,3
stone-ground cornmeal,5
poblano peppers,28
spring water,1


In [268]:
# Descriptors removed wherever they appear as whole, whitespace-delimited tokens.
_REMOVABLE_DESCRIPTORS = (
    # sizes
    "small", "medium", "large", "extra large", "jumbo",
    # preparation
    "chopped", "shredded", "sliced", "firmly packed", "diced", "finely",
    # state / texture
    "fresh", "frozen", "firm", "extra firm",
    # dietary modifiers
    "reduced fat", "reduced sodium", "fat free", "low-fat",
)
# Descriptors removed only when they are the first word of the name.
_LEADING_DESCRIPTORS = ("ground", "dried")

# Trailing processing instructions, e.g. ", thawed and squeezed dry".
_INSTRUCTIONS_RE = re.compile(r", .* and .*")
# Parenthesised oz sizing, e.g. "(14.5 oz.)"; [^)]* keeps the match inside one pair of parentheses.
_OZ_SIZE_RE = re.compile(r"\([^)]*oz\.[^)]*\)")
_GARLIC_CLOVES_RE = re.compile(r"garlic cloves?")
# Longest phrases first so "extra firm" wins over "firm". The lookarounds demand
# whitespace (or the string edge) on both sides, so "medium-grain" and "freshly"
# are left untouched.
_DESCRIPTOR_RE = re.compile(
    r"(?<!\S)(?:"
    + "|".join(
        re.escape(phrase)
        for phrase in sorted(_REMOVABLE_DESCRIPTORS, key=len, reverse=True)
    )
    + r")(?!\S)"
)
_LEADING_RE = re.compile(
    r"^(?:" + "|".join(re.escape(word) for word in _LEADING_DESCRIPTORS) + r")(?=\s)"
)


def clean_ingredient(ingredient):
    """Normalise a raw ingredient name to a shorter canonical form.

    Steps, in order:
      1. Drop trailing processing instructions (", thawed and squeezed dry").
      2. Drop parenthesised oz sizing ("(10 oz.)"), typically found on cans.
      3. Rewrite "garlic clove(s)" to "garlic".
      4. Map anything that still contains "black pepper" to plain "pepper"
         (substring test, so "black peppercorns" is mapped as well).
      5. Remove a leading "ground"/"dried" and any size, preparation, texture
         or dietary descriptor that appears as a whole token.

    Parameters
    ----------
    ingredient : str
        Raw ingredient text. Non-string values (e.g. the NaN produced by
        exploding an empty list) are returned unchanged.

    Returns
    -------
    str
        The cleaned name with whitespace collapsed. If removing descriptors
        would leave nothing (e.g. the input is just "fresh"), the name as it
        stood before descriptor removal is returned instead.
    """
    if not isinstance(ingredient, str):
        return ingredient

    text = _INSTRUCTIONS_RE.sub("", ingredient)
    text = _OZ_SIZE_RE.sub("", text)
    text = _GARLIC_CLOVES_RE.sub("garlic", text)
    if "black pepper" in text:
        return "pepper"

    # Collapse whitespace first so the leading-word check sees the real first word
    # even when a size prefix such as "(10 oz.) " has just been removed.
    text = " ".join(text.split())
    cleaned = _LEADING_RE.sub("", text)
    cleaned = _DESCRIPTOR_RE.sub(" ", cleaned)
    cleaned = " ".join(cleaned.split())
    # Never hand back an empty name just because every word was a descriptor.
    return cleaned or text

In [292]:
# Display only the id and cuisine columns of the training table.
# NOTE(review): the saved output also lists an ingredients column, so it was presumably
# produced by an earlier version of this cell — re-run to refresh.
train[['id','cuisine']]

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [296]:
# Clean the ingredients of the first three recipes, keeping one row per (id, ingredient).
# Series.map replaces DataFrame.applymap, which is deprecated since pandas 2.1.
processed_ingredients = (
    train.head(3)[['id', 'ingredients']]
    .explode("ingredients")
    .assign(ingredients=lambda exploded: exploded['ingredients'].map(clean_ingredient))
    .reset_index(drop=True)
)
# Inner merge on id re-attaches the cuisine label; only ids present in
# processed_ingredients survive.
df = train[['id','cuisine']].merge(processed_ingredients, on='id')
df

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,romaine lettuce
1,10259,greek,black olives
2,10259,greek,grape tomatoes
3,10259,greek,garlic
4,10259,greek,pepper
5,10259,greek,purple onion
6,10259,greek,seasoning
7,10259,greek,garbanzo beans
8,10259,greek,feta cheese crumbles
9,25693,southern_us,plain flour


In [302]:
# Fold the per-ingredient rows back into one row per recipe id: keep the first cuisine
# label of each group and gather the group's ingredient strings into a list.
df2 = (
    df.groupby('id')
    .agg({'cuisine': 'first', "ingredients": lambda names: [name for name in names]})
    .reset_index()
)
df2

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
2,25693,southern_us,"[plain flour, pepper, salt, tomatoes, pepper, ..."


In [313]:
# Write df2 to temp.json in the working directory, one JSON object per row (orient='records').
df2.to_json("temp.json",orient='records')

In [314]:
# Read temp.json back and display it; the saved output shows the same three recipes as df2.
pd.read_json("temp.json")

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
2,25693,southern_us,"[plain flour, pepper, salt, tomatoes, pepper, ..."


In [269]:
def get_altered_ingredients(ingredients, cleaning_func):
    """Return the ingredients that ``cleaning_func`` changes, next to their cleaned form.

    Parameters
    ----------
    ingredients : pandas.DataFrame
        Frame with an ``'ingredients'`` column of raw ingredient values.
    cleaning_func : callable
        Function applied to each value of that column; it is called once per row.

    Returns
    -------
    pandas.DataFrame
        Columns ``ingredients_orig`` and ``ingredients_cleaned``, restricted to the
        rows whose cleaned value differs from the original and carrying the input's
        row labels. Rows that are missing both before and after cleaning are not
        reported as altered.
    """
    original = ingredients['ingredients']
    cleaned = original.map(cleaning_func)
    # NaN != NaN is True, so rows that are missing on both sides are masked out explicitly.
    both_missing = original.isna() & cleaned.isna()
    altered = ((cleaned != original) & ~both_missing).to_numpy()
    # Assemble the result positionally rather than merging on the index: an
    # index-on-index merge would cross-join rows that share a label, which happens
    # whenever the frame comes straight from DataFrame.explode.
    return pd.DataFrame(
        {
            "ingredients_orig": original[altered].to_numpy(),
            "ingredients_cleaned": cleaned[altered].to_numpy(),
        },
        index=original.index[altered],
    )

In [270]:
# Spot-check the cleaner on 500 randomly sampled training ingredients, listing only the
# ones it changes. The row labels are renumbered first so each sampled row has its own label.
get_altered_ingredients(
    train_ingredients_list.reset_index(drop=True).sample(n=500),
    clean_ingredient,
)

Unnamed: 0,ingredients_orig,ingredients_cleaned
210858,diced tomatoes,tomatoes
346898,ground black pepper,pepper
129578,chopped celery,celery
243036,black pepper,pepper
316839,diced tomatoes,tomatoes
...,...,...
34808,black pepper,pepper
208758,fresh cilantro,cilantro
176771,ground beef,beef
404403,ground black pepper,pepper


In [280]:
# Spot-check the cleaner on 500 randomly sampled test ingredients, listing only the
# ones it changes. The row labels are renumbered first so each sampled row has its own label.
get_altered_ingredients(
    test_ingredients_list.reset_index(drop=True).sample(n=500),
    clean_ingredient,
)

Unnamed: 0,ingredients_orig,ingredients_cleaned
61247,large egg whites,egg whites
30002,ground turmeric,turmeric
104090,fresh thyme,thyme
47532,ground black pepper,pepper
82009,fresh basil leaves,basil leaves
...,...,...
89987,fresh rosemary,rosemary
106074,fresh chili,chili
60398,dried porcini mushrooms,porcini mushrooms
52622,large eggs,eggs


In [282]:
# Bind the generic names read by the exploration cells below to the training-set tables.
ingredient_counts = train_ingredient_counts
ingredients_list = train_ingredients_list

In [16]:
def get_ingredients_containing(ingredients_list, ingredient_str):
    """Return the entries of ``ingredients_list`` that contain ``ingredient_str``.

    The check is a plain, case-sensitive substring test (``in``), so a query such
    as "firm" also returns "firmly packed brown sugar". Matches are collected into
    a new list in the order they are iterated.
    """
    matches = []
    for candidate in ingredients_list:
        if ingredient_str in candidate:
            matches.append(candidate)
    return matches

In [261]:
# Names in ingredient_counts.index containing "tofu"; the output shows it stacked with
# descriptors such as "extra firm", "low-fat" and "reduced fat".
get_ingredients_containing(ingredient_counts.index, "tofu")

['deep-fried tofu',
 'extra firm silken tofu',
 'extra firm tofu',
 'firm silken tofu',
 'firm tofu',
 'fresh tofu',
 'low-fat firm silken tofu',
 'medium firm tofu',
 'nigari tofu',
 'pressed tofu',
 'reduced fat firm tofu',
 'regular tofu',
 'semi firm tofu',
 'silken tofu',
 'soft tofu',
 'tofu',
 'tofu puffs',
 'tofu sour cream']

In [19]:
# Names containing "oz." — in the output each one starts with a parenthesised package size.
get_ingredients_containing(ingredient_counts.index, "oz.")

['(    oz.) tomato sauce',
 '(   oz.) tomato paste',
 '(10 oz.) frozen chopped spinach',
 '(10 oz.) frozen chopped spinach, thawed and squeezed dry',
 '(14 oz.) sweetened condensed milk',
 '(14.5 oz.) diced tomatoes',
 '(15 oz.) refried beans']

In [20]:
# Names containing "chopped"; it occurs as a leading word, mid-name ("canned chopped tomatoes")
# and inside trailing instructions ("water chestnuts, drained and chopped").
get_ingredients_containing(ingredient_counts.index, "chopped")

['(10 oz.) frozen chopped spinach',
 '(10 oz.) frozen chopped spinach, thawed and squeezed dry',
 'Old El Paso™ chopped green chiles',
 'canned chopped tomatoes',
 'chopped almonds',
 'chopped bacon',
 'chopped bell pepper',
 'chopped celery',
 'chopped cilantro',
 'chopped cilantro fresh',
 'chopped cooked ham',
 'chopped cooked meat',
 'chopped fresh chives',
 'chopped fresh herbs',
 'chopped fresh mint',
 'chopped fresh sage',
 'chopped fresh thyme',
 'chopped garlic',
 'chopped green bell pepper',
 'chopped green chilies',
 'chopped ham',
 'chopped hazelnuts',
 'chopped leaves',
 'chopped nuts',
 'chopped onion',
 'chopped parsley',
 'chopped pecans',
 'chopped potatoes',
 'chopped tomatoes',
 'chopped walnuts',
 'finely chopped fresh parsley',
 'finely chopped onion',
 'frozen chopped broccoli',
 'frozen chopped spinach',
 'frozen chopped spinach, thawed and squeezed dry',
 'water chestnuts, drained and chopped']

In [21]:
# Names containing "frozen"; usually a prefix, but also a suffix ("crabmeat frozen").
get_ingredients_containing(ingredient_counts.index, "frozen")

['(10 oz.) frozen chopped spinach',
 '(10 oz.) frozen chopped spinach, thawed and squeezed dry',
 'coffee low-fat frozen yogurt',
 'crabmeat frozen',
 'disco empanada frozen',
 'fat free frozen top whip',
 'french fri frozen',
 'frozen artichoke hearts',
 'frozen banana',
 'frozen banana leaf',
 'frozen basil',
 'frozen blackberries',
 'frozen blueberries',
 'frozen bread dough',
 'frozen broad beans',
 'frozen broccoli',
 'frozen broccoli florets',
 'frozen brussels sprouts',
 'frozen carrots',
 'frozen cheese ravioli',
 'frozen cherries',
 'frozen chopped broccoli',
 'frozen chopped spinach',
 'frozen chopped spinach, thawed and squeezed dry',
 'frozen cod fillets',
 'frozen corn',
 'frozen corn kernels',
 'frozen crabmeat, thaw and drain',
 'frozen cranberry juice concentrate',
 'frozen edamame beans',
 'frozen fruit',
 'frozen garden peas',
 'frozen green beans',
 'frozen hash browns',
 'frozen lemonade concentrate',
 'frozen lemonade concentrate, thawed and undiluted',
 'frozen li

In [23]:
# Names containing "low-fat", including ones with a percentage prefix ("1% low-fat milk").
get_ingredients_containing(ingredient_counts.index, "low-fat")

['1% low-fat buttermilk',
 '1% low-fat chocolate milk',
 '1% low-fat cottage cheese',
 '1% low-fat milk',
 '2% low-fat cottage cheese',
 'coffee low-fat frozen yogurt',
 'evapor low-fat milk',
 'evaporated low-fat 2% milk',
 'low-fat baked tortilla chips',
 'low-fat balsamic vinaigrette',
 'low-fat bottled italian dressing',
 'low-fat buttermilk',
 'low-fat caesar dressing',
 'low-fat canned coconut milk',
 'low-fat cheddar',
 'low-fat cheddar cheese',
 'low-fat cheese',
 'low-fat chicken broth',
 'low-fat coconut milk',
 'low-fat coffee ice cream',
 'low-fat cottage cheese',
 'low-fat cream cheese',
 'low-fat crème fraîche',
 'low-fat deli ham',
 'low-fat feta',
 'low-fat firm silken tofu',
 'low-fat flour tortillas',
 'low-fat goat cheese',
 'low-fat greek yogurt',
 'low-fat marinara sauce',
 'low-fat mayonnaise',
 'low-fat milk',
 'low-fat monterey jack',
 'low-fat mozzarella cheese',
 'low-fat natural yogurt',
 'low-fat parmesan cheese',
 'low-fat pasta sauce',
 'low-fat plain gree

In [24]:
# Names containing "shredded"; the output shows fat modifiers in several spellings
# alongside it ("low-fat", "lowfat", "fat-free", "reduced fat").
get_ingredients_containing(ingredient_counts.index, "shredded")

['2% milk shredded mozzarella cheese',
 'fat-free shredded cheddar cheese',
 'low-fat shredded cheddar cheese',
 'reduced fat shredded cheese',
 'shredded American cheese',
 'shredded Italian cheese',
 'shredded Monterey Jack cheese',
 'shredded bamboo',
 'shredded basil',
 'shredded cabbage',
 'shredded carrots',
 'shredded cheddar cheese',
 'shredded cheese',
 'shredded coconut',
 'shredded colby',
 'shredded coleslaw mix',
 'shredded extra sharp cheddar cheese',
 'shredded lettuce',
 'shredded low-fat cheddar',
 'shredded low-fat cheddar cheese',
 'shredded low-fat jarlsberg cheese',
 'shredded low-fat mozzarella cheese',
 'shredded low-fat sharp cheddar',
 'shredded lowfat monterey jack cheese',
 'shredded mild cheddar cheese',
 'shredded monterey jack cheese',
 'shredded mozzarella cheese',
 'shredded nori',
 'shredded parmesan cheese',
 'shredded pepper jack cheese',
 'shredded reduced fat cheddar cheese',
 'shredded reduced fat reduced sodium swiss cheese',
 'shredded romano che

In [97]:
# Names containing a percent sign; the output shows milk-fat, lean-meat and
# "less sodium" percentages.
get_ingredients_containing(ingredient_counts.index, "%")

['1% low-fat buttermilk',
 '1% low-fat chocolate milk',
 '1% low-fat cottage cheese',
 '1% low-fat milk',
 '2% low fat cheddar chees',
 '2% low-fat cottage cheese',
 '2% lowfat greek yogurt',
 '2% milk shredded mozzarella cheese',
 '2% reduced-fat milk',
 '25% less sodium chicken broth',
 '33% less sodium cooked deli ham',
 '33% less sodium cooked ham',
 '33% less sodium ham',
 '33% less sodium smoked fully cooked ham',
 '40% less sodium taco seasoning',
 '40% less sodium taco seasoning mix',
 '95% lean ground beef',
 'KNUDSEN 2% Milkfat Low Fat Cottage Cheese',
 'KRAFT Mexican Style 2% Milk Finely Shredded Four Cheese',
 'Yoplait® Greek 2% caramel yogurt',
 'evaporated low-fat 2% milk',
 'low sodium 96% fat free ham']

In [98]:
# Names containing "reduced fat"; some also carry "reduced sodium" in the same name.
get_ingredients_containing(ingredient_counts.index, "reduced fat")

['condensed reduced fat reduced sodium cream of chicken soup',
 'condensed reduced fat reduced sodium cream of mushroom soup',
 'condensed reduced fat reduced sodium tomato soup',
 'less sodium reduced fat ham',
 'reduced fat Mexican cheese',
 'reduced fat alfredo sauce',
 'reduced fat cheddar cheese',
 'reduced fat chunky peanut butter',
 'reduced fat coconut milk',
 'reduced fat cream cheese',
 'reduced fat cream of mushroom soup',
 'reduced fat creamy peanut butter',
 'reduced fat firm tofu',
 'reduced fat italian dressing',
 'reduced fat mayonnaise',
 'reduced fat milk',
 'reduced fat monterey jack cheese',
 'reduced fat mozzarella',
 'reduced fat provolone cheese',
 'reduced fat ranch dressing',
 'reduced fat reduced sodium cream of mushroom soup',
 'reduced fat reduced sodium tomato and herb pasta sauce',
 'reduced fat ricotta cheese',
 'reduced fat sharp cheddar cheese',
 'reduced fat shredded cheese',
 'reduced fat swiss cheese',
 'reduced fat whipped topping',
 'reduced sodium

In [99]:
# Names containing "reduced sodium"; it combines with "reduced fat" and "fat free" in some names.
get_ingredients_containing(ingredient_counts.index, "reduced sodium")

['condensed reduced fat reduced sodium cream of chicken soup',
 'condensed reduced fat reduced sodium cream of mushroom soup',
 'condensed reduced fat reduced sodium tomato soup',
 'fat free reduced sodium chicken broth',
 'fat skimmed reduced sodium chicken broth',
 'low fat reduced sodium pasta sauce',
 'progresso reduced sodium chicken broth',
 'reduced fat reduced sodium cream of mushroom soup',
 'reduced fat reduced sodium tomato and herb pasta sauce',
 'reduced sodium beef broth',
 'reduced sodium beef stock',
 'reduced sodium black beans',
 'reduced sodium canned chicken broth',
 'reduced sodium chicken bouillon granules',
 'reduced sodium chicken broth',
 'reduced sodium chicken flavor stuffing mix',
 'reduced sodium chicken stock',
 'reduced sodium condensed cream of chicken soup',
 'reduced sodium cream of mushroom soup',
 'reduced sodium fat free chicken broth',
 'reduced sodium garbanzos',
 'reduced sodium ham',
 'reduced sodium italian style stewed tomatoes',
 'reduced sod

In [100]:
# Names containing "fat free"; it appears both first and mid-name
# ("low sodium 96% fat free ham").
get_ingredients_containing(ingredient_counts.index, "fat free")

['and fat free half half',
 'fat free beef broth',
 'fat free cream cheese',
 'fat free cream of mushroom soup',
 'fat free frozen top whip',
 'fat free greek yogurt',
 'fat free ground turkey breast',
 'fat free ice cream',
 'fat free lemon curd',
 'fat free less sodium beef broth',
 'fat free less sodium chicken broth',
 'fat free less sodium vegetable broth',
 'fat free milk',
 'fat free reduced sodium chicken broth',
 'fat free whipped topping',
 'fat free yogurt',
 'less sodium fat free chicken broth',
 'low sodium 96% fat free ham',
 'low sodium fat free vegetable broth',
 'reduced sodium fat free chicken broth']

In [25]:
# Names containing "cheese"; the substring also picks up products that are not cheeses
# themselves ("cheese ravioli", "cream cheese frosting", "blue cheese dressing").
get_ingredients_containing(ingredient_counts.index, "cheese")

['1% low-fat cottage cheese',
 '2% low-fat cottage cheese',
 '2% milk shredded mozzarella cheese',
 'American cheese',
 'Italian cheese',
 'Italian cheese blend',
 'Mexican cheese',
 'Mexican cheese blend',
 'aged Manchego cheese',
 'aged cheddar cheese',
 'american cheese food',
 'american cheese slices',
 'blue cheese',
 'blue cheese dressing',
 'brie cheese',
 'cheddar cheese',
 'cheddar cheese soup',
 'cheese',
 'cheese cubes',
 'cheese curds',
 'cheese dip',
 'cheese ravioli',
 'cheese sauce',
 'cheese slices',
 'cheese soup',
 'cheese spread',
 'cheese sticks',
 'cheese tortellini',
 'chihuahua cheese',
 'colby cheese',
 'colby jack cheese',
 'condensed cheddar cheese soup',
 'condensed fiesta nacho cheese soup',
 'cottage cheese',
 'cream cheese',
 'cream cheese frosting',
 'cream cheese lowfat',
 'cream cheese spread',
 'cream cheese with chives',
 'cream cheese with chives and onion',
 'cream cheese, soften',
 'cream style cottage cheese',
 'crumbled blue cheese',
 'crumbled c

In [26]:
# Names containing "sliced"; mostly a leading word, but also trailing after a comma
# ("red bell pepper, sliced") and after a brand name.
get_ingredients_containing(ingredient_counts.index, "sliced")

['Green Giant™ sliced mushrooms',
 'red bell pepper, sliced',
 'sliced almonds',
 'sliced apples',
 'sliced beets',
 'sliced black olives',
 'sliced carrots',
 'sliced chicken',
 'sliced chorizo',
 'sliced cucumber',
 'sliced fresh fruit',
 'sliced green olives',
 'sliced green onions',
 'sliced ham',
 'sliced kalamata olives',
 'sliced leeks',
 'sliced mango',
 'sliced meat',
 'sliced mushrooms',
 'sliced olives',
 'sliced pears',
 'sliced salami',
 'sliced shallots',
 'sliced tomatoes',
 'sliced turkey']

In [27]:
# Names containing "diced"; it can follow a size prefix, a brand name or another
# descriptor ("fire roasted diced tomatoes").
get_ingredients_containing(ingredient_counts.index, "diced")

['(14.5 oz.) diced tomatoes',
 'Italian seasoned diced tomatoes',
 'Red Gold® diced tomatoes',
 'diced apples',
 'diced bacon',
 'diced bell pepper',
 'diced celery',
 'diced chicken',
 'diced green chilies',
 'diced ham',
 'diced lamb',
 'diced mushrooms',
 'diced onions',
 'diced pimentos',
 'diced potatoes',
 'diced red onions',
 'diced tomatoes',
 'diced tomatoes and green chilies',
 'diced tomatoes in juice',
 'diced tomatoes with garlic and onion',
 'diced yellow onion',
 'fire roasted diced tomatoes',
 'low sodium diced tomatoes',
 'no-salt-added diced tomatoes',
 'peeled diced tomatoes']

In [28]:
# Names containing "minced".
# NOTE(review): "minced" is explored here but is not among the words clean_ingredient removes.
get_ingredients_containing(ingredient_counts.index, "minced")

['dried minced garlic',
 'dried minced onion',
 'extra lean minced beef',
 'lean minced beef',
 'lean minced lamb',
 'minced beef',
 'minced chicken',
 'minced garlic',
 'minced ginger',
 'minced lean steak',
 'minced meat',
 'minced onion',
 'minced peperoncini',
 'minced pork']

In [29]:
# Names containing "pepper"; because this is a substring test it also returns unrelated
# items such as "dr pepper" and "crushed peppermint candy".
get_ingredients_containing(ingredient_counts.index, "pepper")

['aleppo pepper',
 'ancho chile pepper',
 'ancho chili ground pepper',
 'banana peppers',
 'bell pepper',
 'bird pepper',
 'black pepper',
 'black peppercorns',
 'blackpepper',
 'canned jalapeno peppers',
 'cayenne pepper',
 'cayenne pepper sauce',
 'cherry peppers',
 'chile pepper',
 'chili habanero pepper',
 'chili pepper',
 'chili pepper flakes',
 'chinese pepper',
 'chipotle peppers',
 'chopped bell pepper',
 'chopped green bell pepper',
 'coars ground black pepper',
 'cracked black pepper',
 'crushed peppercorn',
 'crushed peppermint candy',
 'crushed red pepper',
 'crushed red pepper flakes',
 'cuban peppers',
 'diced bell pepper',
 'diet dr. pepper',
 'dr pepper',
 'dr. pepper',
 'dried chile peppers',
 'dried chipotle pepper',
 'dried red chile peppers',
 'fresh poblano pepper',
 'freshly ground pepper',
 'fresno pepper',
 'frozen peppers and onions',
 'garlic pepper blend',
 'garlic pepper seasoning',
 'green bell pepper',
 'green bell pepper, slice',
 'green bellpepper',
 'gr

In [32]:
# Names containing ", " — mostly trailing preparation instructions, but the comma is part
# of the name itself in "boneless, skinless chicken breast".
get_ingredients_containing(ingredient_counts.index, ", ")

['(10 oz.) frozen chopped spinach, thawed and squeezed dry',
 '2 1/2 to 3 lb. chicken, cut into serving pieces',
 '8 ounc ziti pasta, cook and drain',
 'bacon, crisp-cooked and crumbled',
 'boneless, skinless chicken breast',
 'bread, cut french into loaf',
 'bread, cut into italian loaf',
 'chop green chilies, undrain',
 'clams, well scrub',
 'clove garlic, fine chop',
 'cream cheese, soften',
 'dri basil leaves, crush',
 'dri oregano leaves, crush',
 'dri thyme leaves, crush',
 'egg noodles, cooked and drained',
 'english muffins, split and toasted',
 'fettuccine, cook and drain',
 'fresh spinach leaves, rins and pat dry',
 'frozen chopped spinach, thawed and squeezed dry',
 'frozen crabmeat, thaw and drain',
 'frozen lemonade concentrate, thawed and undiluted',
 'frozen orange juice concentrate, thawed and undiluted',
 'frozen whip topping, thaw',
 'green bell pepper, slice',
 'jumbo shell pasta , cook and drain',
 'lasagna noodles, cooked and drained',
 'linguine, cook and drain',


In [33]:
# Names containing "ground"; it covers both spices ("ground cumin") and meats ("ground beef")
# and is not always the first word ("extra lean ground beef").
get_ingredients_containing(ingredient_counts.index, "ground")

['95% lean ground beef',
 'ancho chili ground pepper',
 'coars ground black pepper',
 'coarse ground mustard',
 'extra lean ground beef',
 'extra-lean ground beef',
 'fat free ground turkey breast',
 'finely ground coffee',
 'freshly ground pepper',
 'ground Italian sausage',
 'ground allspice',
 'ground almonds',
 'ground asafetida',
 'ground beef',
 'ground bison',
 'ground black pepper',
 'ground blanched almonds',
 'ground caraway',
 'ground cardamom',
 'ground cashew',
 'ground cayenne pepper',
 'ground chicken',
 'ground chicken breast',
 'ground chile',
 'ground chipotle chile pepper',
 'ground chuck',
 'ground cinnamon',
 'ground cloves',
 'ground coffee',
 'ground coriander',
 'ground cumin',
 'ground dried shrimp',
 'ground espresso',
 'ground fennel',
 'ground flaxseed',
 'ground ginger',
 'ground hazelnuts',
 'ground lamb',
 'ground meat',
 'ground mustard',
 'ground nutmeg',
 'ground nuts',
 'ground oregano',
 'ground paprika',
 'ground peanut',
 'ground pecans',
 'ground 

In [84]:
# Names containing "small"; it also occurs as a trailing word ("pasta shell small").
get_ingredients_containing(ingredient_counts.index, "small")

['pasta shell small',
 'shrimp small uncook',
 'small capers, rins and drain',
 'small curd cottage cheese',
 'small eggs',
 'small green chile',
 'small new potatoes',
 'small pasta',
 'small pearl tapioca',
 'small potatoes',
 'small red beans',
 'small red potato',
 'small shells',
 'small tomatoes',
 'small white beans',
 'small yellow onion']

In [85]:
# Names containing "medium"; it is not always a size ("medium salsa", "medium dry sherry")
# and also matches the hyphenated "medium-grain rice".
get_ingredients_containing(ingredient_counts.index, "medium")

['medium cheddar cheese',
 'medium curry powder',
 'medium dry sherry',
 'medium egg noodles',
 'medium eggs',
 'medium firm tofu',
 'medium potatoes',
 'medium salsa',
 'medium shrimp',
 'medium shrimp uncook',
 'medium tomatoes',
 'medium whole wheat tortillas',
 'medium zucchini',
 'medium-grain rice',
 'uncook medium shrimp, peel and devein']

In [86]:
# Names containing "large", including the "extra large ..." variants and a mid-name
# occurrence ("rigatoni or large tube pasta").
get_ingredients_containing(ingredient_counts.index, "large")

['extra large eggs',
 'extra large shrimp',
 'large curd cottage cheese',
 'large egg whites',
 'large egg yolks',
 'large eggs',
 'large flour tortillas',
 'large free range egg',
 'large garlic cloves',
 'large marshmallows',
 'large sausage casing',
 'large shrimp',
 'large snails',
 'large tomato',
 'rigatoni or large tube pasta']

In [89]:
# Names containing "extra large" — only two in the output (eggs and shrimp).
get_ingredients_containing(ingredient_counts.index, "extra large")

['extra large eggs', 'extra large shrimp']

In [93]:
# Names containing "jumbo" — pasta shells and shrimp in the output.
get_ingredients_containing(ingredient_counts.index, "jumbo")

['jumbo macaroni shells',
 'jumbo pasta shells',
 'jumbo shell pasta , cook and drain',
 'jumbo shells',
 'jumbo shrimp']

In [94]:
# Names containing "firm"; besides the tofu variants the substring also matches
# "firmly packed ... brown sugar".
get_ingredients_containing(ingredient_counts.index, "firm")

['extra firm silken tofu',
 'extra firm tofu',
 'firm silken tofu',
 'firm tofu',
 'firmly packed brown sugar',
 'firmly packed light brown sugar',
 'low-fat firm silken tofu',
 'medium firm tofu',
 'reduced fat firm tofu',
 'semi firm tofu']

In [95]:
# Names containing "brown sugar", to see which qualifiers precede it (dark, light, golden,
# firmly packed).
get_ingredients_containing(ingredient_counts.index, "brown sugar")

['brown sugar',
 'dark brown sugar',
 'firmly packed brown sugar',
 'firmly packed light brown sugar',
 'golden brown sugar',
 'light brown sugar']

In [96]:
# Names containing "firmly packed" — only the two brown-sugar entries in the output.
get_ingredients_containing(ingredient_counts.index, "firmly packed")

['firmly packed brown sugar', 'firmly packed light brown sugar']

In [34]:
# Names containing "fresh"; it occurs first, mid-name ("chopped fresh chives") and last
# ("bread crumb fresh").
get_ingredients_containing(ingredient_counts.index, "fresh")

['assorted fresh vegetables',
 'bertolli vodka sauc made with fresh cream',
 'bread crumb fresh',
 'chees fresh mozzarella',
 'chopped cilantro fresh',
 'chopped fresh chives',
 'chopped fresh herbs',
 'chopped fresh mint',
 'chopped fresh sage',
 'chopped fresh thyme',
 'finely chopped fresh parsley',
 'fresh angel hair',
 'fresh asparagus',
 'fresh basil',
 'fresh basil leaves',
 'fresh bay leaves',
 'fresh bean',
 'fresh blueberries',
 'fresh brussels sprouts',
 'fresh cheese',
 'fresh chervil',
 'fresh chevre',
 'fresh chicken stock',
 'fresh chile',
 'fresh chili',
 'fresh chives',
 'fresh chorizo',
 'fresh cilantro',
 'fresh cod',
 'fresh coriander',
 'fresh corn',
 'fresh cranberries',
 'fresh curry',
 'fresh curry leaves',
 'fresh dates',
 'fresh dill',
 'fresh fava bean',
 'fresh flounder fillets',
 'fresh ginger',
 'fresh ginger root',
 'fresh green bean',
 'fresh green peas',
 'fresh ham',
 'fresh herbs',
 'fresh lavender',
 'fresh leav spinach',
 'fresh lemon',
 'fresh lemo

In [217]:
# Names containing "cloves"; note that "ground cloves" and "whole cloves" are the spice,
# not garlic.
get_ingredients_containing(ingredient_counts.index, "cloves")

['garlic cloves', 'ground cloves', 'large garlic cloves', 'whole cloves']

In [45]:
# Rows of ingredients_list whose ingredient text contains "oz." (regex search, so the
# leading ".*" in the pattern does not change which rows match).
ingredients_list[ingredients_list['ingredients'].str.contains(r".*oz\.",regex=True)]

Unnamed: 0,ingredients
285,( oz.) tomato sauce
1224,"(10 oz.) frozen chopped spinach, thawed and sq..."
3472,( oz.) tomato sauce
3605,( oz.) tomato sauce
3605,( oz.) tomato paste
6597,(14.5 oz.) diced tomatoes
10258,( oz.) tomato sauce
11932,( oz.) tomato sauce
11981,( oz.) tomato sauce
14201,( oz.) tomato paste


In [52]:
# Pick one example string to experiment with: positional row 5, column 0 of the "oz." rows.
exp_oz = ingredients_list[ingredients_list['ingredients'].str.contains(r".*oz\.",regex=True)].iloc[5,0]

In [53]:
# Display the selected example string.
exp_oz

'(14.5 oz.) diced tomatoes'

In [249]:
# Keep every row of ingredients_list whose ingredient mentions "oz." and preview the first five.
# Row labels come from the exploded frame, so one label can appear more than once (3605 below).
ing_with_oz = ingredients_list[ingredients_list['ingredients'].str.contains(r".*oz\.",regex=True)]
ing_with_oz.head()

Unnamed: 0,ingredients
285,( oz.) tomato sauce
1224,"(10 oz.) frozen chopped spinach, thawed and sq..."
3472,( oz.) tomato sauce
3605,( oz.) tomato sauce
3605,( oz.) tomato paste


In [246]:
# Pair each "(... oz.) name" ingredient with the text that follows the size prefix.
ing_with_oz = ingredients_list[ingredients_list['ingredients'].str.contains(r".*oz\.",regex=True)]
# Row labels repeat in the exploded frame (e.g. 3605 occurs twice among these rows), so an
# index-on-index merge would cross-join those rows and pair a name with the wrong original.
# Attach the original column positionally instead; column 0 holds the extracted name.
out = ing_with_oz['ingredients'].str.extract(r"\(.*oz\.\) (.*)")
out['ingredients'] = ing_with_oz['ingredients'].to_numpy()

In [248]:
# Preview rows whose ingredient has a comma followed somewhere later by "and"; the five rows
# shown all end in preparation instructions.
ingredients_list[ingredients_list['ingredients'].str.contains(r",.*and.*",regex=True)].head()

Unnamed: 0,ingredients
100,"english muffins, split and toasted"
196,"red kidnei beans, rins and drain"
212,"water chestnuts, drained and chopped"
283,"bacon, crisp-cooked and crumbled"
419,"uncook medium shrimp, peel and devein"


In [244]:
# Pair each ingredient that carries trailing instructions with the text before the comma.
ing_with_instructs = ingredients_list[ingredients_list['ingredients'].str.contains(r",.*and.*",regex=True)]

# Row labels repeat in the exploded frame, so an index-on-index merge would cross-join rows
# that share a label. Attach the original column positionally instead; column 0 holds the
# extracted name.
# NOTE(review): the filter above is broader than the extract pattern (which needs " and "
# with spaces), so some rows may get NaN in column 0.
compare_remove_instructs = ing_with_instructs['ingredients'].str.extract(r"(.*),.* and .*")
compare_remove_instructs['ingredients'] = ing_with_instructs['ingredients'].to_numpy()

In [243]:
# compare_remove_instructs.sample(50)

In [242]:
# get_ingredients_containing(ingredient_counts.index, "dried")