A notebook for gathering celebrity chef recipe data via Food Network.

# 0. Setup

In [1]:
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import unicodedata
import re
import pickle
from tqdm import tqdm

BeautifulSoup setup:

In [2]:
def get_page(url, timeout=30):
    '''
    Download a web page and return its body as text.

    url: str, webpage address to scrape
    timeout: seconds to wait on the server before requests gives up
        (passed straight to requests.get). Without a timeout a single
        stalled connection would block the scrape indefinitely.
    page: str, webpage DOM
    '''
    response = requests.get(url, timeout=timeout)
    page = response.text
    return page

def get_soup(webpage, parser=None):
    '''
    Parse an HTML document into a BeautifulSoup tree.

    webpage: str, page to soup
    parser: str or None, name of the parser BeautifulSoup should use
        (for example 'html.parser' or 'lxml'). The default None leaves
        the choice to BeautifulSoup, exactly as before; pass a name to
        get the same parse tree regardless of which parsers are installed.
    soup: bs4.BeautifulSoup object, souped page
    '''
    soup = BeautifulSoup(webpage, features=parser)
    return soup

# 1. Assemble links to chef pages

Chefs and the numbers of pages of recipes they have on the Food Network website:

In [3]:
# Chef display name paired with the number of recipe listing pages that
# chef has on the Food Network website.
chefs_and_pages = dict([
    ('Valerie Bertinelli', 11),
    ('Ina Garten', 110),
    ('Ree Drummond', 77),
    ('Giada De Laurentiis', 173),
    ('Trisha Yearwood', 33),
    ('Guy Fieri', 85),
    ('Robert Irvine', 99),
    ('Alton Brown', 76),
    ('Bobby Flay', 198),
    ('Duff Goldman', 5),
    ('Sunny Anderson', 52),
    ('Marcela Valladolid', 33),
])

Helper function to change name format:

In [4]:
def change_chef_name_format(chef):
    '''
    Turn a chef's display name into its lowercase, hyphen-separated form.

    chef: str, name with whitespace-separated parts, e.g. 'Giada De Laurentiis'
    returns: str, the lowercased parts joined by '-', e.g. 'giada-de-laurentiis'
        (an empty or all-whitespace name gives '')
    '''
    # str.join replaces the manual "append part + '-' then drop the last
    # character" loop; split() with no argument also collapses runs of spaces.
    return '-'.join(chef.lower().split())

In [5]:
def get_chef_links(chefs_and_pages):
    '''
    Build the URL of every "most popular recipes" listing page per chef.

    chefs_and_pages: dict, chef name -> number of listing pages
    links: dict, chef name -> list of listing-page URLs, page 1 first
        (a chef with no pages gets no entry)
    '''
    links = {}
    for chef, page_count in chefs_and_pages.items():
        # Everything in the URL up to the page number is the same for a chef.
        url_prefix = 'http://www.foodnetwork.com/chefs/' \
            + change_chef_name_format(chef) + '/recipes.mostpopular.page-'
        for page_number in range(1, page_count + 1):
            page_url = url_prefix + str(page_number) + '.html'
            links.setdefault(chef, []).append(page_url)
    return links

In [6]:
# Listing-page URLs for every chef in chefs_and_pages (chef name -> list of URLs).
chef_page_links = get_chef_links(chefs_and_pages)

# 2. Get recipe page links from chef pages

In [7]:
def get_recipe_links(chef_page_links):
    '''
    Collect the recipe-page URLs found on each chef's listing pages.

    Every listing page is downloaded, parsed and scanned before the next one
    is fetched, so only one parsed page is alive at a time instead of the
    parsed pages of every chef being kept until a second pass.

    chef_page_links: dict, chef name -> list of listing-page URLs
    recipe_links: dict, chef name -> list of recipe-page URLs, in the order
        they appear on the listing pages
    '''
    recipe_links = {}

    for chef, page_urls in chef_page_links.items():
        # Only links under /recipes/<chef-name>/ are kept for this chef.
        chef_marker = '/recipes/' + change_chef_name_format(chef) + '/'
        recipes = []
        for page_url in tqdm(page_urls):
            soup = get_soup(get_page(page_url))
            for heading in soup.find_all('h6'):
                # Split the heading's markup on whitespace and look at the
                # second token only.
                # NOTE(review): this assumes that token is the anchor's
                # href="..." attribute -- confirm against the live markup.
                tokens = str(heading).split()
                if len(tokens) > 1 and chef_marker in tokens[1]:
                    # [6:-1] drops the first six characters (presumably
                    # 'href="') and the final character (the closing quote).
                    recipes.append('http://www.foodnetwork.com' + tokens[1][6:-1])
            # Brief pause between consecutive page requests.
            time.sleep(.002)
        recipe_links[chef] = recipes

    return recipe_links

In [8]:
# Fetch every listing page and gather the recipe URLs (chef name -> list of URLs).
recipe_links = get_recipe_links(chef_page_links)

In [9]:
# Pickling the recipe links.
# The file is opened in binary mode ('wb'): pickle writes bytes, and a
# text-mode handle fails on Python 3 and can mangle the data on Windows.
with open('recipe_links.pkl', 'wb') as picklefile:
    pickle.dump(recipe_links, picklefile)

# 3. Get recipe info from recipe pages

In [10]:
# Unpickling the recipe links.
# Binary mode ('rb') is required: pickle reads bytes. Only load pickle
# files you created yourself -- unpickling untrusted data can run code.
with open('recipe_links.pkl', 'rb') as picklefile:
    recipe_links = pickle.load(picklefile)

In [11]:
# Names of the per-recipe fields that the scraper is expected to fill in.
recipe_info_keys = [
    'total_time',
    'cook_time',
    'ingredients',
    'difficulty_level',
    'yield',
    'inactive_time',
    'prep_time',
    'directions',
    'categories',
    'page_link',
    'img_link',
]

In [12]:
def _text_between(line, start, end='<'):
    '''
    Return the text in `line` after the first `start` marker, up to the next
    `end` marker. Returns None when `start` does not occur in `line`.
    '''
    pieces = line.split(start)
    if len(pieces) < 2:
        return None
    return pieces[1].split(end)[0]


def _dd_on_next_line(lines, index):
    '''
    Return the text following '<dd>' on the line after lines[index], or None
    when there is no following line or it holds no '<dd>'.
    '''
    if index + 1 >= len(lines):
        return None
    return _text_between(lines[index + 1], '<dd>')


def get_recipe_info(recipe_links=recipe_links, keys=recipe_info_keys):
    '''
    Scrape every recipe page and collect its details.

    Each page is fetched, parsed, and then read line by line from the
    rendered markup of a handful of <div> blocks. A field whose expected
    marker is missing from a line is skipped rather than raising, so a single
    oddly formatted page does not abort the batch being scraped.

    recipe_links: dict, chef name -> list of recipe-page URLs
    keys: list of field names; only referenced by the optional NaN-filling
        snippet kept as a comment below, otherwise unused
    chefs_and_recipes: dict, chef name -> {recipe title -> info dict}.
        An info dict always has 'page_link', 'ingredients', 'categories' and
        'title'; the remaining fields are present only when found on the
        page. Recipes are keyed by title, so two recipes of one chef with the
        same title overwrite one another (the later one wins).
    '''
    # (word that flags the line, key to store under, marker preceding the value)
    time_fields = (
        ('Total', 'total_time', '<dd class="total">'),
        ('Prep', 'prep_time', '<dd>'),
        ('Inactive', 'inactive_time', '<dd>'),
        ('Cook', 'cook_time', '<dd>'),
    )

    chefs_and_recipes = {}
    for chef, links in recipe_links.items():
        # Parenthesised so the statement is valid on Python 2 and Python 3.
        print(chef)
        recipes = {}
        for link in tqdm(links):
            soup = get_soup(get_page(link))
            info = {'page_link': link}

            # Image: the last line holding a src="..." attribute wins.
            img = soup.find_all('div', {'class': 'col12 pic collapsed'})
            for line in str(img).split('\n'):
                if 'src=' in line:
                    src = _text_between(line, 'src="', '"')
                    if src is not None:
                        info['img_link'] = src

            # Cooking times: each value sits on the same line as its label.
            times = soup.find('div', {'class': 'cooking-times'})
            for line in str(times).split('\n'):
                for label, field, marker in time_fields:
                    if label in line:
                        value = _text_between(line, marker)
                        if value is not None:
                            info[field] = value

            # Yield and difficulty: the value is read from the line that
            # follows the label line.
            yield_and_level = soup.find('div', {'class': 'difficulty'})
            yield_and_level_split = str(yield_and_level).split('\n')
            for i, line in enumerate(yield_and_level_split):
                if 'Yield' in line:
                    value = _dd_on_next_line(yield_and_level_split, i)
                    if value is not None:
                        info['yield'] = value
                if 'Level' in line:
                    value = _dd_on_next_line(yield_and_level_split, i)
                    if value is not None:
                        info['difficulty_level'] = value

            ingredients = soup.find_all('div', {'class': 'col8 ingredients responsive'})
            ingredient_list = []
            for line in str(ingredients).split('\n'):
                if 'itemprop="ingredients"' in line:
                    ingredient = _text_between(line, '"ingredients">')
                    if ingredient is not None:
                        ingredient_list.append(ingredient)
            info['ingredients'] = ingredient_list

            # Directions: the text of the matching divs is rendered as a list
            # repr, split on the escaped newline sequence, and the last chunk
            # longer than 200 characters is kept.
            directions = soup.find_all('div', {'class': 'col10 directions'})
            if directions != []:
                for line in str([line.text for line in directions]).split('\\n'):
                    if len(line) > 200:
                        info['directions'] = line

            categories = soup.find_all('div', {'class': 'categories'})
            categories_list = []
            for line in str(categories).split('\n'):
                if 'href="/topics/' in line:
                    category = _text_between(line, '.html">')
                    if category is not None:
                        categories_list.append(category)
            info['categories'] = categories_list

            # Title: stays None when no name line is found, so a parsed tag
            # object is never stored as the title or used as a dict key.
            title_div = soup.find('div', {'class': 'tier-3 title'})
            title = None
            for line in str(title_div).split('\n'):
                if 'itemprop="name"' in line:
                    name = _text_between(line, '"name">')
                    if name is not None:
                        title = name
            info['title'] = title

            ## If wanting NaNs, include these lines:
            #             entries = info.keys()
            #             na = str(np.nan)
            #             for key in recipe_info_keys:
            #                 if key not in entries:
            #                     info[key] = na
            #                 if info[key] == []:
            #                     info[key] = [na]

            recipes[title] = info
            # Brief pause between consecutive page requests.
            time.sleep(.002)

        chefs_and_recipes[chef] = recipes
    return chefs_and_recipes

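Scraping one chef at a time, splitting the longer link lists into batches of a few hundred recipes, and pickling each batch's results: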

In [13]:
# Guy Fieri: scraped in two batches (first 400 links, then the rest).
fieri0 = {'Guy Fieri': recipe_links['Guy Fieri'][:400]}
fieri1 = {'Guy Fieri': recipe_links['Guy Fieri'][400:]}
fieri_info_0 = get_recipe_info(fieri0)
fieri_info_1 = get_recipe_info(fieri1)
###pickling!#####################################################
##### -- "but first you've got to barrel your scrapes" -- ########
# Files are opened in binary mode ('wb') because pickle writes bytes.
with open('fieri_betterinfo_0.pkl', 'wb') as picklefile:
    pickle.dump(fieri_info_0, picklefile)
with open('fieri_betterinfo_1.pkl', 'wb') as picklefile:
    pickle.dump(fieri_info_1, picklefile)

In [14]:
# Trisha Yearwood: all links scraped in a single batch.
yearwood = {'Trisha Yearwood': recipe_links['Trisha Yearwood']}
yearwood_info = get_recipe_info(yearwood)
###pickling!#####################################################
##### -- "but first you've got to barrel your scrapes" -- ########
# The file is opened in binary mode ('wb') because pickle writes bytes.
with open('yearwood_betterinfo.pkl', 'wb') as picklefile:
    pickle.dump(yearwood_info, picklefile)

In [15]:
# Giada De Laurentiis: scraped in four batches split at links 400, 850 and 1300.
giada0 = {'Giada De Laurentiis': recipe_links['Giada De Laurentiis'][:400]}
giada1 = {'Giada De Laurentiis': recipe_links['Giada De Laurentiis'][400:850]}
giada2 = {'Giada De Laurentiis': recipe_links['Giada De Laurentiis'][850:1300]}
giada3 = {'Giada De Laurentiis': recipe_links['Giada De Laurentiis'][1300:]}
giada_info_0 = get_recipe_info(giada0)
giada_info_1 = get_recipe_info(giada1)
giada_info_2 = get_recipe_info(giada2)
giada_info_3 = get_recipe_info(giada3)
###pickling!#####################################################
##### -- "but first you've got to barrel your scrapes" -- ########
# Files are opened in binary mode ('wb') because pickle writes bytes.
with open('giada_betterinfo_0.pkl', 'wb') as picklefile:
    pickle.dump(giada_info_0, picklefile)
with open('giada_betterinfo_1.pkl', 'wb') as picklefile:
    pickle.dump(giada_info_1, picklefile)
with open('giada_betterinfo_2.pkl', 'wb') as picklefile:
    pickle.dump(giada_info_2, picklefile)
with open('giada_betterinfo_3.pkl', 'wb') as picklefile:
    pickle.dump(giada_info_3, picklefile)

In [16]:
# Alton Brown: scraped in two batches (first 400 links, then the rest).
alton0 = {'Alton Brown': recipe_links['Alton Brown'][:400]}
alton1 = {'Alton Brown': recipe_links['Alton Brown'][400:]}
alton_info_0 = get_recipe_info(alton0)
alton_info_1 = get_recipe_info(alton1)
###pickling!##################################################
##### -- "but first you've got to barrel your scrapes" -- #####
# Files are opened in binary mode ('wb') because pickle writes bytes.
with open('alton_betterinfo_0.pkl', 'wb') as picklefile:
    pickle.dump(alton_info_0, picklefile)
with open('alton_betterinfo_1.pkl', 'wb') as picklefile:
    pickle.dump(alton_info_1, picklefile)

In [17]:
# Valerie Bertinelli: all links scraped in a single batch.
bertinelli = {'Valerie Bertinelli': recipe_links['Valerie Bertinelli']}
bertinelli_info = get_recipe_info(bertinelli)
###pickling!#####################################################
##### -- "but first you've got to barrel your scrapes" -- ########
# The file is opened in binary mode ('wb') because pickle writes bytes.
with open('bertinelli_betterinfo.pkl', 'wb') as picklefile:
    pickle.dump(bertinelli_info, picklefile)

In [18]:
# Marcela Valladolid: all links scraped in a single batch.
valladolid = {'Marcela Valladolid': recipe_links['Marcela Valladolid']}
valladolid_info = get_recipe_info(valladolid)
###pickling!#####################################################
##### -- "but first you've got to barrel your scrapes" -- ########
# The file is opened in binary mode ('wb') because pickle writes bytes.
with open('valladolid_betterinfo.pkl', 'wb') as picklefile:
    pickle.dump(valladolid_info, picklefile)

In [19]:
# Ina Garten: scraped in three batches split at links 400 and 800.
garten0 = {'Ina Garten': recipe_links['Ina Garten'][:400]}
garten1 = {'Ina Garten': recipe_links['Ina Garten'][400:800]}
garten2 = {'Ina Garten': recipe_links['Ina Garten'][800:]}
garten_info_0 = get_recipe_info(garten0)
garten_info_1 = get_recipe_info(garten1)
garten_info_2 = get_recipe_info(garten2)
###pickling!#####################################################
##### -- "but first you've got to barrel your scrapes" -- ########
# Files are opened in binary mode ('wb') because pickle writes bytes.
with open('garten_betterinfo_0.pkl', 'wb') as picklefile:
    pickle.dump(garten_info_0, picklefile)
with open('garten_betterinfo_1.pkl', 'wb') as picklefile:
    pickle.dump(garten_info_1, picklefile)
with open('garten_betterinfo_2.pkl', 'wb') as picklefile:
    pickle.dump(garten_info_2, picklefile)

In [20]:
# Ree Drummond: scraped in two batches (first 400 links, then the rest).
drummond0 = {'Ree Drummond': recipe_links['Ree Drummond'][:400]}
drummond1 = {'Ree Drummond': recipe_links['Ree Drummond'][400:]}
drummond_info_0 = get_recipe_info(drummond0)
drummond_info_1 = get_recipe_info(drummond1)
###pickling!#####################################################
##### -- "but first you've got to barrel your scrapes" -- ########
# Files are opened in binary mode ('wb') because pickle writes bytes.
with open('drummond_betterinfo_0.pkl', 'wb') as picklefile:
    pickle.dump(drummond_info_0, picklefile)
with open('drummond_betterinfo_1.pkl', 'wb') as picklefile:
    pickle.dump(drummond_info_1, picklefile)

In [21]:
# Bobby Flay: scraped in four batches split at links 500, 1000 and 1500.
flay0 = {'Bobby Flay': recipe_links['Bobby Flay'][:500]}
flay1 = {'Bobby Flay': recipe_links['Bobby Flay'][500:1000]}
flay2 = {'Bobby Flay': recipe_links['Bobby Flay'][1000:1500]}
flay3 = {'Bobby Flay': recipe_links['Bobby Flay'][1500:]}
flay_info_0 = get_recipe_info(flay0)
flay_info_1 = get_recipe_info(flay1)
flay_info_2 = get_recipe_info(flay2)
flay_info_3 = get_recipe_info(flay3)
###pickling!#####################################################
##### -- "but first you've got to barrel your scrapes" -- ########
# Files are opened in binary mode ('wb') because pickle writes bytes.
with open('flay_betterinfo_0.pkl', 'wb') as picklefile:
    pickle.dump(flay_info_0, picklefile)
with open('flay_betterinfo_1.pkl', 'wb') as picklefile:
    pickle.dump(flay_info_1, picklefile)
with open('flay_betterinfo_2.pkl', 'wb') as picklefile:
    pickle.dump(flay_info_2, picklefile)
with open('flay_betterinfo_3.pkl', 'wb') as picklefile:
    pickle.dump(flay_info_3, picklefile)

In [22]:
# Robert Irvine: scraped in two batches (first 450 links, then the rest).
irvine0 = {'Robert Irvine': recipe_links['Robert Irvine'][:450]}
irvine1 = {'Robert Irvine': recipe_links['Robert Irvine'][450:]}
irvine_info_0 = get_recipe_info(irvine0)
irvine_info_1 = get_recipe_info(irvine1)
###pickling!#####################################################
##### -- "but first you've got to barrel your scrapes" -- ########
# Files are opened in binary mode ('wb') because pickle writes bytes.
with open('irvine_betterinfo_0.pkl', 'wb') as picklefile:
    pickle.dump(irvine_info_0, picklefile)
with open('irvine_betterinfo_1.pkl', 'wb') as picklefile:
    pickle.dump(irvine_info_1, picklefile)

In [23]:
# Sunny Anderson: all links scraped in a single batch.
sunny = {'Sunny Anderson': recipe_links['Sunny Anderson']}
sunny_info = get_recipe_info(sunny)
###pickling!#####################################################
##### -- "but first you've got to barrel your scrapes" -- ########
# The file is opened in binary mode ('wb') because pickle writes bytes.
with open('sunny_betterinfo.pkl', 'wb') as picklefile:
    pickle.dump(sunny_info, picklefile)

In [24]:
# Duff Goldman: all links scraped in a single batch.
duff = {'Duff Goldman': recipe_links['Duff Goldman']}
duff_info = get_recipe_info(duff)
###pickling!#####################################################
##### -- "but first you've got to barrel your scrapes" -- ########
# The file is opened in binary mode ('wb') because pickle writes bytes.
with open('duff_betterinfo.pkl', 'wb') as picklefile:
    pickle.dump(duff_info, picklefile)