In [1]:
from time import sleep
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from math import ceil
import pandas as pd
from pymongo import MongoClient

In [2]:
# Pause, in seconds, inserted between consecutive scraping requests.
SCRAPING_REQUEST_STAGGER = 2.0
# Search-results URL template; the placeholder receives a cuisine's filter[2] id.
CUISINE_URL = 'http://www.saveur.com/recipes-search?filter[2]={}'
# Recipe-page URL template; the placeholder receives a site-relative href.
RECIPE_URL = 'http://www.saveur.com{}'
# Results per search page. Kept as a float so the page-count division that
# uses it is never an integer (truncating) division.
NUMBER_OF_RECIPES_PER_PAGE = 48.0

In [None]:
# Names of the MongoDB database and collection this notebook writes to.
# NOTE(review): 'PROJECT_RECIPIES' looks like a misspelling of "RECIPES". It is
# a runtime identifier, so renaming it would point at a different database;
# confirm before changing.
DB_NAME = 'PROJECT_RECIPIES'
COLLECTION_NAME = 'Saveur'

# MongoClient() is called without arguments, i.e. with pymongo's default
# connection settings. `coll` is the handle the scraping cells insert into.
client = MongoClient()
db = client[DB_NAME]
coll = db[COLLECTION_NAME]

In [3]:
def get_content_from_url(link, timeout=30):
    """Fetch a static page over HTTP and return its raw body.

    Sleeps for SCRAPING_REQUEST_STAGGER seconds before every request so that
    consecutive calls are spaced out.

    Args:
        link: absolute URL to request.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        ``response.content`` when the status code is 200, otherwise ``False``.
        Network-level failures (timeouts, connection errors) also return
        ``False`` so that callers have a single failure value to test for.
    """
    sleep(SCRAPING_REQUEST_STAGGER)

    try:
        response = requests.get(link, timeout=timeout)
    except requests.exceptions.RequestException:
        # Same failure signal as a non-200 response.
        return False

    # Anything other than 200 is reported to the caller as a failure.
    if response.status_code != 200:
        return False

    return response.content

In [None]:
def get_content_from_dynamic_url(link, driver_path="/Applications/chromedriver",
                                 page_load_timeout=45):
    """Load a page in Chrome through Selenium and return its HTML source.

    Args:
        link: URL to open in the browser.
        driver_path: location of the chromedriver executable.
        page_load_timeout: seconds Selenium waits for the page load before
            raising.

    Returns:
        ``browser.page_source`` after the load attempt. If the load raises
        (for example on a page-load timeout) whatever the browser holds at
        that point is still returned, as a best effort.
    """
    browser = webdriver.Chrome(driver_path)
    try:
        browser.set_page_load_timeout(page_load_timeout)
        try:
            browser.get(link)
        except Exception:
            # Best effort: keep going and return whatever the browser holds.
            print("page load failed or timed out for: %s" % link)
        return browser.page_source
    finally:
        # quit() ends the whole driver session on every path; close() would
        # only close the current window.
        browser.quit()

In [5]:
def get_number_of_recipes(filter2_value):
    # find number of recipes for any given cuisine
    
    # create a full link using the cuisine_link and cuisine name to obtain web content from link
    cuisine_link = CUISINE_URL.format(filter2_value)
    cuisine_recipes = get_content_from_dynamic_url(cuisine_link)
    
    # if no content is returned from the cuisine link page
    if not cuisine_recipes:
        print "no content for:", cuisine_link
        return None  
    
    # convert web content to soup to better access the content
    soup_cuisine = BeautifulSoup(cuisine_recipes)
    
    # select characters from phrase and convert it into integer
    return int(soup_cuisine.find("div", {"class": "results_label"}).get_text().split()[0].replace(",", ""))

In [None]:
def get_recipe(r_link):
    # make web request and obtain site content for recipe page
    recipe_link = RECIPE_URL.format(r_link)
    recipe_response = get_content_from_url(recipe_link)
    if not recipe_response:
        print "no content for:", recipe_link
        return None
    
    return BeautifulSoup(recipe_response)

In [None]:
def get_recipe_title(soup_recipe):
    """Return the recipe title with surrounding whitespace removed.

    The title is the text of the first <h1> inside the
    "content-onecol one-col" div. Neither lookup is guarded, so a page
    without that div or heading raises AttributeError.
    """
    container = soup_recipe.find("div", {"class": "content-onecol one-col"})
    heading = container.find("h1")
    return heading.get_text().strip()

In [None]:
def get_recipe_chef(soup_recipe):
    """Return the recipe author's name, or None when it is not available.

    The name is the text of the link inside ``<span class="author">``. It is
    returned as-is, without stripping. None is returned when the span is
    absent and also when the span contains no link.
    """
    author_span = soup_recipe.find("span", {"class": "author"})
    if author_span is None:
        return None

    author_link = author_span.find("a")
    if author_link is None:
        # find() returns None here; calling get_text() on it would raise
        return None

    return author_link.get_text()

In [None]:
def get_description(soup_recipe):
    """Return the recipe's description text, or None when there is none.

    The description is the stripped text of the first <p> inside the
    "field-body inContentAds-processed" div. None is returned when that div
    is absent and also when it holds no paragraph.
    """
    summary = soup_recipe.find("div", {"class": "field-body inContentAds-processed"})
    if summary is None:
        return None

    first_paragraph = summary.find("p")
    if first_paragraph is None:
        # find() returns None here; calling get_text() on it would raise
        return None

    return first_paragraph.get_text().strip()

In [None]:
def get_recipe_ingredients(soup_recipe):
    """Return the text of every <div class="ingredient"> as a list.

    The text is not stripped; an empty list means no such div was found.
    """
    ingredient_divs = soup_recipe.find_all("div", {"class": "ingredient"})
    return [div.get_text() for div in ingredient_divs]

In [None]:
def get_recipe_preperation(soup_recipe):
    """Return the preparation steps as a list of stripped strings.

    One entry per <div class="instruction">, in document order.
    """
    instruction_divs = soup_recipe.find_all("div", {"class": "instruction"})
    # whitespace is trimmed from both ends of every step
    return [div.get_text().strip() for div in instruction_divs]

In [None]:
def get_recipe_time(soup_recipe):
    """Return ``(prep_time, cooking_time)`` for a recipe page.

    Each value is the stripped text of the matching div ("prep-time" and
    "cook-time" respectively), or None when the page has no such div.
    """
    def _stripped_text_or_none(css_class):
        # shared lookup for both timing fields
        node = soup_recipe.find("div", {"class": css_class})
        return node.get_text().strip() if node else None

    prep_time = _stripped_text_or_none("prep-time")
    cooking_time = _stripped_text_or_none("cook-time")
    return prep_time, cooking_time

In [None]:
def get_servings(soup_recipe):
    """Return the stripped text of <div class="yield">, or None if absent."""
    yield_div = soup_recipe.find("div", {"class": "yield"})
    return yield_div.get_text().strip() if yield_div else None

In [None]:
def get_recommendations(soup_recipe):
    """Return ``(ratings, recommendations)`` for a recipe page.

    ratings is the text of ``<span class="rating">``. recommendations is the
    text of the first <span> inside ``<div class="prepare-again-rating">``.
    Either value is None when its element is missing, including when the
    "prepare-again-rating" div itself is absent.
    """
    rating_span = soup_recipe.find("span", {"class": "rating"})
    ratings = rating_span.get_text() if rating_span is not None else None

    recommendations = None
    again_div = soup_recipe.find("div", {"class": "prepare-again-rating"})
    # Check the div before looking inside it: find() returns None for a
    # missing element, and None has no find().
    if again_div is not None:
        again_span = again_div.find("span")
        if again_span is not None:
            recommendations = again_span.get_text()

    return ratings, recommendations

In [None]:
def get_nutrition_per_serving(soup_recipe):
    """Return the text of <div class="nutritional-info">, or None if absent.

    The text is returned unstripped.
    """
    info_div = soup_recipe.find("div", {"class": "nutritional-info"})
    if info_div:
        return info_div.get_text()
    return None

In [None]:
def get_image_source(soup_recipe):
    """Return the recipe image's ``data-src`` value, or None when unavailable.

    The image is the first <img> inside ``<div class="field-image-inner">``.
    None is returned when the div is absent, when it holds no <img>, or when
    the <img> has no ``data-src`` attribute.
    """
    image_wrapper = soup_recipe.find("div", {"class": "field-image-inner"})
    if image_wrapper is None:
        return None

    image_tag = image_wrapper.find("img")
    if image_tag is None:
        return None

    # .get() yields None for a missing attribute where [] would raise KeyError
    return image_tag.get("data-src")

In [31]:
def get_recipe_details(recipe_links):
    # obtain necessary recipe details from recipe page
    
    cuisine_recipes = {}
    
    # loop over recipe links to get recipe details for each recipe page
    for r in recipe_links:
        recipe = {}
        recipe['r_link'] = r.a["href"]
        print "recipe link: ", recipe['r_link']
        
        sleep(SCRAPING_REQUEST_STAGGER)        
        soup_recipe = get_recipe(recipe['r_link'])
        
        recipe['recipe title'] = get_recipe_title(soup_recipe)
        recipe['chef'] = get_recipe_chef(soup_recipe)
        recipe['description'] = get_description(soup_recipe)
        recipe['ingredient list'] = get_recipe_ingredients(soup_recipe)
        recipe['preperation steps'] = get_recipe_preperation(soup_recipe)
        recipe['prep_time'], recipe['cook_time'] = get_recipe_time(soup_recipe)
        recipe['servings'] = get_servings(soup_recipe)
        recipe['rating'], recipe['recommendation'] = get_recommendations(soup_recipe)
        recipe['nutritional_info'] = get_nutrition_per_serving(soup_recipe)
        recipe['image_source'] = get_image_source(soup_recipe)
        
        cuisine_recipes[recipe['recipe title']] = recipe
    
    return cuisine_recipes

In [38]:
def get_recipe_links(filter2_value, pages):
    """Walk a cuisine's search-result pages and scrape every recipe listed.

    Opens the cuisine search page in Chrome, collects the "result_title"
    elements from each of the `pages` result pages (moving on with the
    "pager-next" element), then passes the collected elements to
    get_recipe_details.

    Args:
        filter2_value: value substituted into CUISINE_URL to select the cuisine.
        pages: number of result pages to visit.

    Returns:
        Whatever get_recipe_details returns for the collected links.

    A failure of the initial page load (for example a timeout) is tolerated
    and whatever the browser holds at that point is parsed. Errors while
    paging propagate to the caller, but the browser is always shut down first.
    """
    recipe_links = []
    browser = webdriver.Chrome("/Applications/chromedriver")
    try:
        browser.set_page_load_timeout(45)
        link = CUISINE_URL.format(filter2_value)
        try:
            browser.get(link)
        except Exception:
            # Best effort: carry on with whatever the browser holds, after a
            # short pause.
            print("page load failed or timed out for: %s" % link)
            sleep(SCRAPING_REQUEST_STAGGER)

        # Each page is visited exactly once, so no link is collected twice.
        for page in xrange(1, pages + 1):
            page_soup = BeautifulSoup(browser.page_source)
            recipe_links.extend(page_soup.find_all("div", {"class": "result_title"}))
            sleep(SCRAPING_REQUEST_STAGGER)
            if page < pages:
                browser.find_element_by_class_name("pager-next").click()
            sleep(SCRAPING_REQUEST_STAGGER)
    finally:
        # Runs on success and on error, so no driver session is left behind.
        browser.quit()

    return get_recipe_details(recipe_links)

In [39]:
def get_cuisine_recipes(cuisines, filter2_values):
    # loop over each cuisine and store data regarding that cuisine recipies
    
    # to store the data in a pandas Dataframe
    cuisine_df = pd.DataFrame()
    
    for cuisine, filter2_value in zip(cuisines, filter2_values):
        cuisine_dict = {}
        
        cuisine_dict['cuisine'] = cuisine
        cuisine_dict['source'] = 'Saveur'
        
        # obtain the number of recipes for each cuisine
        cuisine_dict['num_recipes'] = get_number_of_recipes(filter2_value)
        
        # convert number of recipes into pages to scrape
        cuisine_dict['pages'] = int(ceil(cuisine_dict['num_recipes'] / NUMBER_OF_RECIPES_PER_PAGE))

        # print the cuisine details
        print '#####'
        print "Cuisine: %s \t Number of recipes: %r \t\t Number of pages: %r" \
                    % (cuisine, cuisine_dict['num_recipes'], cuisine_dict['pages'])
        
        
        cuisine_dict['recipes_details'] = get_recipe_links(filter2_value, cuisine_dict['pages'])
    
        coll.insert_one(cuisine_dict)
        
        # convert the dictionary into a dataframe and append it to the final dataframe
        cuisine_df = cuisine_df.append(pd.DataFrame.from_dict(cuisine_dict, orient='columns'), ignore_index=True)
        
    return cuisine_df

In [40]:
if __name__ == '__main__':

    # Cuisine names on Saveur; entry i pairs with filter2_values[i].
    cuisines = [
        'African', 'American', 'Asian', 'Cajun/Creole', 'Caribbean',
        'Chinese', 'Cuban', 'Eastern European/Russian', 'English/Scottish',
        'French', 'German', 'Greek', 'Indian', 'Indonesian', 'Italian',
        'Japanese', 'Jewish', 'Mediterranean', 'Mexican', 'Middle Eastern',
        'Moroccan', 'Scandinavian', 'Southwestern/Soul Food',
        'Spanish/Portuguese', 'Tex-Mex', 'Thai', 'Vietnamese',
    ]
    # Search-filter ids substituted into the cuisine URL, in the same order.
    filter2_values = [
        1000489, 1000490, 1000491, 1000493, 1000494, 1000496, 1000497,
        1000498, 1000499, 1000500, 1000501, 1000503, 1000506, 1000932,
        1000508, 1000509, 1000510, 1000512, 1000513, 1000514, 1000515,
        1000517, 1000518, 1000520, 1000521, 1000522, 1000525,
    ]
    # Empty the collection so this run starts from scratch.
    coll.delete_many({})
    cuisine_dataframe = get_cuisine_recipes(cuisines, filter2_values)

#####
Cuisine: African 	 Number of recipes: 73 		 Number of pages: 2
1
2
[<div class="result_title"><a href="/semolina-coconut-cake-with-orange-and-rose-waters-recipe">Semolina Coconut Cake with Orange and Rose Water</a></div>, <div class="result_title"><a href="/date-semolina-cookies-makroud-recipe">Date-Filled Semolina Cookies (Makroud)</a></div>, <div class="result_title"><a href="/sugared-rosewater-marzipan-balls-kaber-ellouz-recipe">Sugared Rosewater Marzipan Balls (Kaber Ellouz)</a></div>, <div class="result_title"><a href="/syrup-soaked-pastries-with-hazelnuts-pistachios-and-pine-nuts-deblah-recipe">Syrup-Soaked Pastries with Hazelnuts, Pistachios, and Pine Nuts (Deblah)</a></div>, <div class="result_title"><a href="/1000-hole-crepes-baghrir-recipe">1,000-Hole Crêpes (Baghrir)</a></div>, <div class="result_title"><a href="/shredded-collard-green-salad-roasted-sweet-potatoes-cashews-recipe">Shredded Collard Green Salad with Roasted Sweet Potatoes and Cashews</a></div>, <div class

TimeoutException: Message: timeout: Timed out receiving message from renderer: 14.693
  (Session info: chrome=47.0.2526.106)
  (Driver info: chromedriver=2.20.353124 (035346203162d32c80f1dce587c8154a1efa0c3b),platform=Mac OS X 10.10.5 x86_64)


In [28]:
if __name__ == '__main__':

    # Cuisine names on Saveur; entry i pairs with filter2_values[i].
    cuisines = [
        'African', 'American', 'Asian', 'Cajun/Creole', 'Caribbean',
        'Chinese', 'Cuban', 'Eastern European/Russian', 'English/Scottish',
        'French', 'German', 'Greek', 'Indian', 'Indonesian', 'Italian',
        'Japanese', 'Jewish', 'Mediterranean', 'Mexican', 'Middle Eastern',
        'Moroccan', 'Scandinavian', 'Southwestern/Soul Food',
        'Spanish/Portuguese', 'Tex-Mex', 'Thai', 'Vietnamese',
    ]
    # Search-filter ids substituted into the cuisine URL, in the same order.
    filter2_values = [
        1000489, 1000490, 1000491, 1000493, 1000494, 1000496, 1000497,
        1000498, 1000499, 1000500, 1000501, 1000503, 1000506, 1000932,
        1000508, 1000509, 1000510, 1000512, 1000513, 1000514, 1000515,
        1000517, 1000518, 1000520, 1000521, 1000522, 1000525,
    ]
    # The wipe is left disabled in this cell, so this run adds to whatever
    # the collection already holds.
    #coll.delete_many({})
    cuisine_dataframe = get_cuisine_recipes(cuisines, filter2_values)
    

#####
Cuisine: African 	 Number of recipes: 73 		 Number of pages: 2
1
2
[<div class="result_title"><a href="/semolina-coconut-cake-with-orange-and-rose-waters-recipe">Semolina Coconut Cake with Orange and Rose Water</a></div>, <div class="result_title"><a href="/date-semolina-cookies-makroud-recipe">Date-Filled Semolina Cookies (Makroud)</a></div>, <div class="result_title"><a href="/sugared-rosewater-marzipan-balls-kaber-ellouz-recipe">Sugared Rosewater Marzipan Balls (Kaber Ellouz)</a></div>, <div class="result_title"><a href="/syrup-soaked-pastries-with-hazelnuts-pistachios-and-pine-nuts-deblah-recipe">Syrup-Soaked Pastries with Hazelnuts, Pistachios, and Pine Nuts (Deblah)</a></div>, <div class="result_title"><a href="/1000-hole-crepes-baghrir-recipe">1,000-Hole Crêpes (Baghrir)</a></div>, <div class="result_title"><a href="/shredded-collard-green-salad-roasted-sweet-potatoes-cashews-recipe">Shredded Collard Green Salad with Roasted Sweet Potatoes and Cashews</a></div>, <div class

CannotSendRequest: 