In [1]:
from time import sleep
import requests
from bs4 import BeautifulSoup
from math import ceil
import pandas as pd
from pymongo import MongoClient

In [2]:
SCRAPING_REQUEST_STAGGER = 2.0 # in seconds
# the link for Epicurious cuisine recipes
CUISINE_URL = 'http://www.epicurious.com/tools/searchresults?type=simple&att={}&search={}'
# the link for Epicurious cuisine recipes for pages 2 and higher
CUISINE_RECIPES_URL = 'http://www.epicurious.com/tools/searchresults?att={}&search={}&type=simple&pageNumber={}&pageSize=20&resultOffset={}'
RECIPE_URL = 'http://www.epicurious.com{}'
NUMBER_OF_RECIPES_PER_PAGE = 20.

DB_NAME = 'PROJECT_RECIPIES'
COLLECTION_NAME = 'EPICUR'

client = MongoClient()
db = client[DB_NAME]
coll = db[COLLECTION_NAME]

In [3]:
def get_content_from_url(link):
    # make request with the input link and return the content of the web link
    sleep(SCRAPING_REQUEST_STAGGER)        
    response = requests.get(link)
    
    # if the status code of response is not 200, then raise a warning
    if response.status_code != 200:
        return False
    
    return response.content

In [4]:
def get_number_of_recipes(cuisine, att_value):
    # find number of recipes for any given cuisine
    
    # create a full link using the cuisine_link and cuisine name to obtain web content from link
    cuisine_link = CUISINE_URL.format(att_value, cuisine)
    cuisine_recipes = get_content_from_url(cuisine_link)
    
    # if no content is returned from the cuisine link page
    if not cuisine_recipes:
        print "no content for:", cuisine_link
        return None  
    
    # convert web content to soup to better access the content
    soup_cuisine = BeautifulSoup(cuisine_recipes)
    
    # select characters from phrase and convert it into integer
    return int(soup_cuisine.find("div", {"id": "sr_sortby"}).find("p").get_text().split()[2])

In [5]:
def get_cuisine_pages(cuisine, att_value, page):
    # create link for given cuisine and page number and obtain recipe links from page 
    
    if page == 1:
        link = CUISINE_URL.format(att_value, cuisine)
    else:
        link = CUISINE_RECIPES_URL.format(att_value, cuisine, page, (page-1)*20 + 1)
    cuisine_recipe_links = get_content_from_url(link)
    if not cuisine_recipe_links:
        print "no content for:", link
        return None
    
    soup_search = BeautifulSoup(cuisine_recipe_links)

    # return the list of links for the cuisine page
    return soup_search.find("div", {"id": "recipe_main"}).find_all("a", {"class": "recipeLnk"})

In [6]:
def get_recipe(r_link):
    # make web request and obtain site content for recipe page
    recipe_link = RECIPE_URL.format(r_link)
    recipe_response = get_content_from_url(recipe_link)
    if not recipe_response:
        print "no content for:", recipe_link
        return None
    
    return BeautifulSoup(recipe_response)

In [7]:
def get_recipe_title(soup_recipe):
    # recipe title
    return soup_recipe.find("h1",{"itemprop":"name"}).get_text().strip()

In [8]:
def get_recipe_chef(soup_recipe):
    # recipe chef name
    chef_name = soup_recipe.find("span",{"itemprop":"author"})
    
    # if chef name is not there for the recipe
    if not chef_name:
        return None
    
    # strip new line characters on both sides and the "By " from front
    return chef_name.get_text().strip()[3:]

In [9]:
def get_description(soup_recipe):
    # recipe description
    summary = soup_recipe.find("div",{"itemprop":"description"})
    
    # if summary is not there for the recipe
    if not summary:
        return None
    
    return summary.find("p").get_text().strip()

In [10]:
def get_recipe_ingredients(soup_recipe):
    #recipe ingredients
    ingredients_list = soup_recipe.find_all("li", {"class":"ingredient"})
    ingredients = []
    for ing in ingredients_list:
        ingredients.append(ing.get_text())
    return ingredients

In [11]:
def get_recipe_preperation(soup_recipe):
    # recipe preperation steps
    prep_steps = soup_recipe.find_all("li", {"class":"preparation-step"})
    prep = []
    for step in prep_steps:
        # get text for the each step and strip of new line characters at both ends
        prep.append(step.get_text().strip())
    return prep

In [12]:
def get_recipe_time(soup_recipe):
    # recipe preperation time
    prep_time = soup_recipe.find("p", {"class":"recipe-metadata__prep-time"})
    # if recipe has prep_time given
    if prep_time:
        prep_time = prep_time.get_text()
    else:
        prep_time = None
    # recipe cooking time
    cooking_time = soup_recipe.find("p", {"class":"recipe-metadata__cook-time"})
    # if recipe has cooking_time given
    if cooking_time:
        cooking_time = cooking_time.get_text()
    else:
        cooking_time = None
    return prep_time, cooking_time

In [13]:
def get_servings(soup_recipe):
    # recipe servings
    servings = soup_recipe.find("dd", {"itemprop":"recipeYield"})
    if not servings:
        return None
    
    return servings.get_text()

In [26]:
def get_recommendations(soup_recipe):
    # recipe recommendations
    ratings = soup_recipe.find("span", {"class":"rating"})
    # if recipe has ratings
    if not ratings:
        ratings = None
    else:
        ratings = ratings.get_text()
    recommendations = soup_recipe.find("div", {"class":"prepare-again-rating"})
    # if recipe has recommendations
    if not recommendations:
        recommendations = None
    else:
        recommendations = recommendations.find("span").get_text()
    return ratings, recommendations

In [27]:
def get_nutrition_per_serving(soup_recipe):
    # recipe servings
    nutritional_info = soup_recipe.find("div", {"class":"nutritional-info"})
    if not nutritional_info:
        return None
    
    return nutritional_info.get_text()

In [28]:
def get_image_source(soup_recipe):
    # recipe image source
    image_source = soup_recipe.find("div", {"class":"recipe-image"})
    if not image_source:
        return None
    
    return image_source.find("img")["src"]

In [29]:
def get_recipe_details(recipe_links):
    # obtain necessary recipe details from recipe page
    
    cuisine_recipes = {}
    
    # loop over recipe links to get recipe details for each recipe page
    for r in recipe_links:
        recipe = {}
        recipe['r_link'] = r["href"]
        print "recipe link: ", recipe['r_link']
        
        sleep(SCRAPING_REQUEST_STAGGER)        
        soup_recipe = get_recipe(recipe['r_link'])
        
        recipe['recipe title'] = get_recipe_title(soup_recipe)
        recipe['chef'] = get_recipe_chef(soup_recipe)
        recipe['description'] = get_description(soup_recipe)
        recipe['ingredient list'] = get_recipe_ingredients(soup_recipe)
        recipe['preperation steps'] = get_recipe_preperation(soup_recipe)
        recipe['prep_time'], recipe['cook_time'] = get_recipe_time(soup_recipe)
        recipe['servings'] = get_servings(soup_recipe)
        recipe['rating'], recipe['recommendation'] = get_recommendations(soup_recipe)
        recipe['nutritional_info'] = get_nutrition_per_serving(soup_recipe)
        recipe['image_source'] = get_image_source(soup_recipe)
        
        cuisine_recipes[recipe['recipe title']] = recipe
    
    return cuisine_recipes

In [30]:
def get_recipe_links(cuisine, att_value, pages):
    # loop over each page in the cuisine to obtain all the recipe details
    
    recipe_links = []
    
    for page in xrange(1, pages + 1):
        sleep(SCRAPING_REQUEST_STAGGER)
        # recipe links from each cuisine search page
        recipe_links.extend(get_cuisine_pages(att_value, cuisine, page))
        
    cuisine_recipes = get_recipe_details(recipe_links)
    return cuisine_recipes

In [31]:
def get_cuisine_recipes(cuisines, att_values):
    # loop over each cuisine and store data regarding that cuisine recipies
    
    # to store the data in a pandas Dataframe
    cuisine_df = pd.DataFrame()
    
    for cuisine, att_value in zip(cuisines, att_values):
        cuisine_dict = {}
        
        cuisine_dict['cuisine'] = cuisine
        cuisine_dict['source'] = 'Epicurious'
        
        # replace all spaces in the cuisine title with '%20', as used by website
        cuisine_no_space = cuisine.replace(" ", "%20")
        
        # obtain the number of recipes for each cuisine
        cuisine_dict['num_recipes'] = get_number_of_recipes(cuisine_no_space, att_value)
        
        # convert number of recipes into pages to scrape
        cuisine_dict['pages'] = int(ceil(cuisine_dict['num_recipes'] / NUMBER_OF_RECIPES_PER_PAGE))

        # print the cuisine details
        print '#####'
        print "Cuisine: %s \t Number of recipes: %r \t\t Number of pages: %r" \
                    % (cuisine, cuisine_dict['num_recipes'], cuisine_dict['pages'])
        
        cuisine_dict['recipes_details'] = get_recipe_links(cuisine_no_space, att_value, cuisine_dict['pages'])
    
        #coll.insert_one(cuisine_dict)
        
        # convert the dictionary into a dataframe and append it to the final dataframe
        cuisine_df = cuisine_df.append(pd.DataFrame.from_dict(cuisine_dict, orient='columns'), ignore_index=True)
        
    return cuisine_df

In [49]:
if __name__ == '__main__':
    
    # list of cuisines on Epicurious
    cuisines = ['Greek']#['African', 'American', 'Asian', 'Cajun/Creole', 'Central/South American', 'Chinese', 'Eastern European/Russian', 'German', 'Greek', 'Indian', 'Irish', 'Italian', 'Jewish', 'Mediterranean', 'Mexican', 'Middle Eastern', 'Moroccan', 'Scandinavian', 'Southwestern', 'Spanish/Portuguese', 'Thai', 'Vietnamese']
    att_values = [12]#[1, 2, 3, 4, 6, 7, 8, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
    #coll.delete_many({})
    cuisine_dataframe = get_cuisine_recipes(cuisines, att_values)
    

#####
Cuisine: Greek 	 Number of recipes: 287 		 Number of pages: 15
recipe link:  /recipes/food/views/pecan-molasses-bundt-cake-with-bourbon-glaze-107886
recipe link:  /recipes/food/views/cassoulet-107409
recipe link:  /recipes/food/views/moroccan-chickpea-soup-104356
recipe link:  /recipes/food/views/spring-vegetable-paella-106419
recipe link:  /recipes/food/views/steamed-mussels-103839
recipe link:  /recipes/food/views/black-bean-corn-burger-367251
recipe link:  /recipes/food/views/steak-and-mushroom-reubens-356330
recipe link:  /recipes/food/views/spicy-sweet-potato-spread-358369
recipe link:  /recipes/food/views/lime-vanilla-frozen-yogurt-101768
recipe link:  /recipes/food/views/giant-chocolate-cake-with-bittersweet-chocolate-ganache-and-edible-flowers-353421
recipe link:  /recipes/food/views/white-chocolate-tiramisu-trifle-with-spiced-pears-240701
recipe link:  /recipes/food/views/strawberry-and-blueberry-summer-pudding-238537
recipe link:  /recipes/food/views/grilled-vanilla-fre

In [50]:
cuisine_dataframe

Unnamed: 0,cuisine,num_recipes,pages,recipes_details,source
0,Greek,287,15,"{u'cook_time': None, u'description': u'Soundtr...",Epicurious
1,Greek,287,15,"{u'cook_time': None, u'description': None, u'r...",Epicurious
2,Greek,287,15,"{u'cook_time': None, u'description': u'Chocola...",Epicurious
3,Greek,287,15,"{u'cook_time': None, u'description': None, u'r...",Epicurious
4,Greek,287,15,"{u'cook_time': None, u'description': u'In Asia...",Epicurious
5,Greek,287,15,"{u'cook_time': None, u'description': u'With th...",Epicurious
6,Greek,287,15,"{u'cook_time': None, u'description': u'Gravlak...",Epicurious
7,Greek,287,15,"{u'cook_time': None, u'description': u'This sa...",Epicurious
8,Greek,287,15,"{u'cook_time': None, u'description': u'Why you...",Epicurious
9,Greek,287,15,"{u'cook_time': None, u'description': u'Chinese...",Epicurious


In [51]:
cuisine_dataframe.to_csv("recipes_data_Epicurious_Greek.csv")

Cuisines that had to removed due to page error
(cuisine, att_value) = [('English/Scottish', 9), ('French', 10), ('Japanese', 16), ('Southern/Soul Food', 167)]