A notebook for cleaning celebrity chef recipe data and loading it into two collections (recipes and recipe links) in a local MongoDB database.

# 0. Setup

In [None]:
import pickle
import re
import nltk
from nltk.stem import WordNetLemmatizer
import numpy as np
import pymongo
from tqdm import tqdm

Mongo initialization:

In [None]:
# Connect to the default local MongoDB instance and grab handles to the
# database and the two collections this notebook writes to.
client = pymongo.MongoClient()
chefs = client['chefs_db']
celebrity_recipes = chefs['celebrity_recipes']
recipe_links = chefs['celebrity_recipe_links']  # more metadata-y

Barrel of picklefile names:

In [None]:
# Pickle files to load, in processing order (one or more files per chef).
barrel = [
    'giada_betterinfo_0.pkl',
    'giada_betterinfo_1.pkl',
    'giada_betterinfo_2.pkl',
    'alton_betterinfo_0.pkl',
    'alton_betterinfo_1.pkl',
    'yearwood_betterinfo.pkl',
    'fieri_betterinfo_0.pkl',
    'fieri_betterinfo_1.pkl',
    'bertinelli_betterinfo.pkl',
    'valladolid_betterinfo.pkl',
    'garten_betterinfo_0.pkl',
    'garten_betterinfo_1.pkl',
    'garten_betterinfo_2.pkl',
    'drummond_betterinfo_0.pkl',
    'drummond_betterinfo_1.pkl',
    'flay_betterinfo_0.pkl',
    'flay_betterinfo_1.pkl',
    'flay_betterinfo_2.pkl',
    'irvine_betterinfo_0.pkl',
    'irvine_betterinfo_1.pkl',
    'sunny_betterinfo.pkl',
    'duff_betterinfo.pkl',
]

# 1. NLP setup

Stopwords to remove from ingredients text:

In [None]:
# Words stripped from ingredient lines: units, sizes, preparation terms and
# filler words. Written as one whitespace-separated string and split into a
# list so the vocabulary is easy to scan and extend.
specs = '''
    dash pinch teaspoon tablespoon cup scoop pound ounce oz
    quart pint gallon milliliter ml liter small medium large
    freshly fresh ground piece clove boneless cube dice finely
    grated to inch each whole about as thawed by all a
    chopped crushed plus minus such the an slice approximately
    and or weight of recipe basic slab stick pure melt
    melted dry dried
'''.split()

Helper function to clean ingredients text:

In [None]:
def scrub_ingredients(ingredient_list, specs=specs):
    '''Reduces raw ingredient lines to bare ingredient names.

    For each line: everything after the first comma is discarded, the rest is
    lower-cased and split on whitespace, each term is lemmatized, and terms
    whose letters-only form appears in `specs` are dropped. Wherever a term
    starting with a digit is directly followed by a term that does not start
    with a digit, the remaining terms are joined, stripped of everything but
    lowercase letters, spaces and hyphens, and appended to the result. A line
    may therefore contribute zero, one or several entries.

    Written for Python 2 byte strings: terms are decoded from latin-1 before
    lemmatizing and re-encoded to UTF-8 afterwards.

    Args:
        ingredient_list: iterable of ingredient strings, e.g. '2 cups water'.
        specs: collection of words (units, sizes, filler) to remove.

    Returns:
        A list of cleaned ingredient name strings, in input order.
    '''
    # Build these once instead of once per term: constructing a lemmatizer for
    # every word is wasted work, and a set makes each membership test O(1).
    lemmatizer = WordNetLemmatizer()
    stopwords = frozenset(specs)

    ingredients = []
    for item in ingredient_list:
        # Keep only the text before the first comma (drops notes such as
        # ', peeled and crushed'); a no-op when there is no comma.
        item = item.split(',')[0]
        lemmas = [lemmatizer.lemmatize(term.decode('latin-1'))
                  for term in item.lower().split()]
        line = [term.encode('utf-8') for term in lemmas
                if re.sub('[^a-z]+', '', term) not in stopwords]
        # Stop one short of the end so line[i+1] always exists; this replaces
        # the bare except that previously swallowed the IndexError (and any
        # other error) raised on the last term.
        for i in range(len(line) - 1):
            if re.match('[0-9]', line[i]) and not re.match('[0-9]', line[i+1]):
                ingredient = ' '.join(line[i+1:])
                # Strip after removing punctuation/digits: removing a leading
                # token such as '(8' leaves a space behind, which the old
                # strip-before-clean order never caught.
                ingredients.append(re.sub('[^a-z -]+', '', ingredient).strip())
    return ingredients

Sample input:
```
['2 cups water', '1 cup apple cider vinegar', '1/4 cup kosher salt', '6 cloves garlic, peeled and crushed', '2 tablespoons sugar', '2 tablespoons yellow mustard seed', '2 tablespoons hot sauce', '1 tablespoon celery seed', '1 bay leaf', '1/4 teaspoon whole black peppercorns', '8 ounces ice', '1 1/2 pounds fresh boneless pork butt, cut into 2-inch cubes']
```

Sample output (listed here in no particular order; the function itself returns ingredients in input order):
```
['bay leaf', 'celery seed', 'sugar', 'water', 'garlic', 'ice', 'hot sauce', 'apple cider vinegar', 'black peppercorn', 'yellow mustard seed', 'pork butt', 'kosher salt']
```

Future optimization: extract and normalize ingredient and yield amounts. Not useful for this project but would be useful for a 'recipe quantity multiplier' side project.

Helper function to extract durations (in minutes) from text about recipe times:

In [None]:
def get_duration(time_string):
    '''Converts a string representing a duration of time to an integer of that duration in 
    minutes.

    The string is split on whitespace. Every 'hr' token adds 60 times the
    integer token immediately before it, and every 'min' token adds the
    integer token immediately before it (e.g. '1 hr 30 min' -> 90). A unit
    token in first position has no preceding amount and is ignored.

    Args:
        time_string: text such as '1 hr 30 min' or '45 min'.

    Returns:
        The total number of minutes as an int, or np.nan when the total is
        zero (including when no 'hr'/'min' token is found).

    Raises:
        ValueError: if the token before a unit is not an integer literal.
    '''
    tokens = time_string.split()
    duration = 0
    # Start at index 1: at index 0 the "previous" token would be tokens[-1],
    # i.e. the LAST token, which silently produced wrong totals or a spurious
    # ValueError.
    for i in range(1, len(tokens)):
        if tokens[i] == 'hr':
            duration += 60 * int(tokens[i-1])
        elif tokens[i] == 'min':
            duration += int(tokens[i-1])
    if duration == 0:
        return np.nan
    return duration

# 2. Implementation

In [None]:
# Start from an empty collection before (re)loading every recipe.
celebrity_recipes.drop()

for pik in tqdm(barrel):
    # Pickles are binary data, so open in 'rb' rather than text mode.
    # NOTE(review): pickle.load can execute arbitrary code; only load pickle
    # files you produced yourself.
    with open(pik, 'rb') as picklefile:
        info = pickle.load(picklefile)
    # Only the first key/value pair of each pickle is read: the key is used as
    # the chef name and the value as a {title: recipe} mapping.
    chef_name = info.keys()[0]
    chef_recipes = info.values()[0]
    for title, recipe in chef_recipes.items():
        recipe['chef'] = chef_name
        recipe['title'] = title
        recipe['ingredients'] = scrub_ingredients(recipe['ingredients'])
        # Replace each raw time string with its duration in minutes.
        for field in ('cook_time', 'inactive_time', 'prep_time', 'total_time'):
            recipe[field] = get_duration(recipe[field])
        celebrity_recipes.save(recipe)

In [None]:
# NOTE(review): unlike celebrity_recipes, recipe_links is not dropped before
# loading, so re-running this cell appends a second copy of every entry.
# Confirm whether a recipe_links.drop() is wanted here.
nopic = 0  # number of recipes that had no 'img_link' key
for pik in tqdm(barrel):
    # Pickles are binary data, so open in 'rb' rather than text mode.
    # NOTE(review): pickle.load can execute arbitrary code; only load pickle
    # files you produced yourself.
    with open(pik, 'rb') as picklefile:
        info = pickle.load(picklefile)
    # Only the first key/value pair of each pickle is read: the key is used as
    # the chef name and the value as a {title: recipe} mapping.
    chef_name = info.keys()[0]
    chef_recipes = info.values()[0]
    for title, recipe in chef_recipes.items():
        entry = {}
        entry['chef'] = chef_name
        entry['title'] = title
        entry['categories'] = recipe['categories']
        # Catch only the missing-key case; a bare except would also hide
        # unrelated errors (and KeyboardInterrupt) behind the placeholder.
        try:
            entry['img_link'] = recipe['img_link']
        except KeyError:
            entry['img_link'] = 'Image Unavailable'
            nopic += 1
        entry['page_link'] = recipe['page_link']
        recipe_links.save(entry)