In [1]:
from time import sleep
import requests
from bs4 import BeautifulSoup
from math import ceil
import pandas as pd
from pymongo import MongoClient
from selenium import webdriver  
import time
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
SCRAPING_REQUEST_STAGGER = 3.0 # in seconds
# the main link for BBC Good Food
SEARCH_URL = 'http://www.bbcgoodfood.com/search/recipes?query=#page={}&path=cuisine/{}'
COLLECTION_URL = 'http://www.bbcgoodfood.com/recipes/collection/{}'
RECIPE_URL = 'http://www.bbcgoodfood.com{}'
NUMBER_OF_RECIPES_PER_SEARCH_PAGE = 15.

DB_NAME = 'PROJECT_RECIPIES'
COLLECTION_NAME = 'BBC'

client = MongoClient()
db = client[DB_NAME]
coll = db[COLLECTION_NAME]

In [3]:
def get_content_from_static_url(link):
    # make request with the input link and return the content of the web link
    
    sleep(SCRAPING_REQUEST_STAGGER)        
    headers = {"User-agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36"}
    response = requests.get(link, headers=headers)
    # if the status code of response is not 200, then raise a warning
    if response.status_code != 200:
        return False
    return response.content

In [49]:
def get_content_from_dynamic_url(link):
    # make request with the input link and return the content of the web link
    browser = webdriver.Chrome("/Applications/chromedriver")  
    browser.set_page_load_timeout(15)
    try:
        browser.get(link)
    except:
        page_source = browser.page_source
        browser.quit()
        return page_source
    page_source = browser.page_source
    browser.quit()
    return page_source

In [50]:
def get_number_of_search_recipes(cuisine):
    # calculate number of recipe search results for any given cuisine
    
    # create a full link using the page number and cuisine name to obtain web content from link
    cuisine_search_link = SEARCH_URL.format(0, cuisine)
    cuisine_recipes = get_content_from_dynamic_url(cuisine_search_link)
    
    # if no content is returned from the cuisine search link page
    if not cuisine_recipes:
        print "no content for:", cuisine_search_link
        return None  
    # convert web content to soup to better access the content
    soup_cuisine = BeautifulSoup(cuisine_recipes)
    # get recipe-count and convert it into integer
    return int(soup_cuisine.find("h1",{"class": "search-title"}).find("em").get_text())

In [51]:
def get_cuisine_search_pages(cuisine, page):
    # create link for given cuisine and page number and obtain recipe links from page 
    print cuisine, page
    link = SEARCH_URL.format(page, cuisine)
    cuisine_recipe_links = get_content_from_dynamic_url(link)
    if not cuisine_recipe_links:
        print "no content for:", link
        return None
    
    soup_search = BeautifulSoup(cuisine_recipe_links)

    # return the list of links for the cuisine search page
    return soup_search.find_all("h2",{"class":"node-title"})

In [52]:
def get_cuisine_collection_page(cuisine):
    # create collection link for given cuisine and obtain recipe links from the page
    
    # create a full link using the cuisine name to obtain web content from link
    cuisine_link = COLLECTION_URL.format(cuisine)
    cuisine_recipes = get_content_from_static_url(cuisine_link)
    
    # if no content is returned from the cuisine link page
    if not cuisine_recipes:
        print "no content for:", cuisine_link
        return None  
    
    # convert web content to soup to better access the content
    soup_cuisine = BeautifulSoup(cuisine_recipes)
    
    # return the list of links for the cuisine collection page
    return soup_cuisine.find_all("h2",{"class": "node-title"})

In [53]:
def get_recipe(r_link):
    # make web request and obtain site content for recipe page
    recipe_link = RECIPE_URL.format(r_link)
    recipe_response = get_content_from_static_url(recipe_link)
    if not recipe_response:
        print "no content for:", recipe_link
        return None
    
    return BeautifulSoup(recipe_response)

In [54]:
def get_recipe_title(soup_recipe):
    # recipe title
    return soup_recipe.find("h1",{"itemprop":"name"}).get_text()

In [55]:
def get_recipe_chef(soup_recipe):
    # recipe chef name
    chef_name = soup_recipe.find("div",{"class":"recipe-header__chef recipe-header__chef--first"}).find("a")
    
    # if chef name is not there for the recipe
    if not chef_name:
        chef_name = soup_recipe.find("div",{"class":"recipe-header__chef recipe-header__chef--first"}).find("span")
    if not chef_name:
        return None
    
    return chef_name.get_text()

In [56]:
def get_description(soup_recipe):
    # recipe description
    summary = soup_recipe.find("div",{"itemprop":"description"}).find("div", {"class":"field-items"}).find("div")
    if not summary:
        return None
    
    # if summary is not there for the recipe
    return summary.get_text()

In [57]:
def get_recipe_ingredients(soup_recipe):
    # recipe ingredients
    ingredients_list = soup_recipe.find_all("li", {"itemprop":"ingredients"})
    ingredients = []
    for ing in ingredients_list:
        ingredients.append(ing.get_text().split('\n')[0])
    return ingredients

In [58]:
def get_recipe_preperation(soup_recipe):
    # recipe preperation steps
    prep_steps = soup_recipe.find_all("li", {"itemprop":"recipeInstructions"})
    prep = []
    for step in prep_steps:
        # get text for the each step and strip of new line characters at both ends
        prep.append(step.get_text().strip())
    return prep

In [59]:
def get_recipe_time(soup_recipe):
    # recipe preperation time
    prep_time_check = soup_recipe.find("span", {"class":"recipe-details__cooking-time-prep"})
    # default for prep_time and cooking_time is None, which will be updated if there is a value for the recipe
    prep_time, cooking_time = None, None
    # if recipe has prep_time given
    if prep_time_check:
        prep_time = prep_time_check.get_text().split(":")[1].strip()
    
    # recipe cooking time
    cooking_time_check = soup_recipe.find("span", {"class":"recipe-details__cooking-time-cook"})
    # if recipe has cooking_time given
    if cooking_time_check:
        cooking_time = cooking_time_check.get_text().split(":")[1].strip()
    
    return prep_time, cooking_time

In [60]:
def get_servings(soup_recipe):
    # recipe servings
    servings = soup_recipe.find("span", {"itemprop":"recipeYield"})
    if not servings:
        return None
    return servings.get_text().strip()

In [61]:
def get_skill_level(soup_recipe):
    # recipe servings
    skill_level = soup_recipe.find("section", {"class":"recipe-details__item recipe-details__item--skill-level"}).find("span" , {"class":"recipe-details__text"})
    if not skill_level:
        return None
    return skill_level.get_text().strip()

In [62]:
def get_recommendations(soup_recipe):
    # recipe recommendations
    ratings = soup_recipe.find("meta", {"itemprop":"ratingValue"})["content"]
    ratings_count = soup_recipe.find("meta", {"itemprop":"ratingCount"})["content"]
    if ratings == 0:
        return None, None
    
    return ratings, ratings_count

In [63]:
def get_nutrition_per_serving(soup_recipe):
    # recipe servings
    nutritional_info = soup_recipe.find_all("span", {"class":"nutrition__value"})
    if not nutritional_info:
        return None
    
    nutrition_name, nutrition_value = [], []
    for nutrition in nutritional_info:
        # get text for the each step
        if nutrition.get_text() != '-':
            nutrition_name.append(nutrition["itemprop"])
            nutrition_value.append(nutrition.get_text())
    return dict(zip(nutrition_name, nutrition_value))

In [64]:
def get_image_source(soup_recipe):
    # recipe image source
    image_source = soup_recipe.find("img", {"itemprop":"image"})
    if not image_source:
        return None
    
    return image_source["src"]

In [65]:
def get_recipe_details(recipe_links):
    # obtain necessary recipe details from recipe page
    
    cuisine_recipes = {}
    print "recipe_links", recipe_links
    # loop over recipe links to get recipe details for each recipe page
    for r in recipe_links:
        recipe = {}
        recipe['r_link'] = r.a["href"]
        print "recipe link: ", recipe['r_link']
        
        sleep(SCRAPING_REQUEST_STAGGER)        
        soup_recipe = get_recipe(recipe['r_link'])
        
        recipe['recipe title'] = get_recipe_title(soup_recipe)
        recipe['chef'] = get_recipe_chef(soup_recipe)
        recipe['description'] = get_description(soup_recipe)
        recipe['ingredient list'] = get_recipe_ingredients(soup_recipe)
        recipe['preperation steps'] = get_recipe_preperation(soup_recipe)
        recipe['prep_time'], recipe['cook_time'] = get_recipe_time(soup_recipe)
        recipe['servings'] = get_servings(soup_recipe)
        recipe['skill_level'] = get_skill_level(soup_recipe)
        recipe['rating'], recipe['rating count'] = get_recommendations(soup_recipe)
        recipe['nutritional_info'] = get_nutrition_per_serving(soup_recipe)
        recipe['image_source'] = get_image_source(soup_recipe)
        
        cuisine_recipes[recipe['recipe title']] = recipe
        
    return cuisine_recipes

In [101]:
def get_recipe_links(cuisine, pages, collection):
    # loop over each page in the cuisine to obtain all the recipe details
    recipe_links = []
    
    for page in xrange(50, 10ß0):#pages + 1):
        sleep(SCRAPING_REQUEST_STAGGER)
        # recipe links from each cuisine search page
        recipe_links.extend(get_cuisine_search_pages(cuisine, page))
        
    # if cuisine is a collection, add unique recipe links from the collection page to the list
    if collection:
        recipe_links.extend(get_cuisine_collection_page(cuisine))
    
    '''
    cuisine_recipes = get_recipe_details(list(set(recipe_links)))
    return cuisine_recipes
    '''
    return recipe_links

In [102]:
def get_cuisine_recipes(search_cuisisnes, cuisines):
    # loop over each cuisine and store data regarding that cuisine recipies
    
    # to store the data in a pandas Dataframe
    cuisine_df = pd.DataFrame()
    
    for cuisine in search_cuisisnes:
        cuisine_dict = {}
        cuisine_dict['cuisine'] = cuisine
        cuisine_dict['source'] = 'BBC Good Food'
        
        # convert text to lower case and convert spaces and '&' to '-', as used by website
        cuisine_no_space = cuisine.lower().replace(' & ', '-').replace(' ', '-')
        
        # obtain the number of recipe search results for each cuisine
        recipes_cuisine_search = get_number_of_search_recipes(cuisine_no_space)
        # convert number of recipes into pages to scrape, number search pages and one page for cuisine collection
        cuisine_dict['pages'] = int(ceil(recipes_cuisine_search / NUMBER_OF_RECIPES_PER_SEARCH_PAGE))

        # default value for collection is 'No', to indicate that the cuisine is not in the collection
        collection = False
        
        # if cuisine is in collections get the links from there as well, and change collection = 'Yes'
        if cuisine in cuisines:
            cuisine_dict['pages'] += 1
            collection = True
        cuisine_dict['recipes_links'] = get_recipe_links(cuisine_no_space, cuisine_dict['pages']-1, collection)
        
        cuisine_dict['num_recipes'] = len(cuisine_dict['recipes_links'])
        
        # print the cuisine details
        print '#####'
        print "Cuisine: %s \t Number of recipes: %d \t\t Number of pages: %d" \
                    % (cuisine, cuisine_dict['num_recipes'], cuisine_dict['pages'])
        
        #coll.insert_one(cuisine_dict)
        
        '''
        # convert the dictionary into a dataframe and append it to the final dataframe
        cuisine_df = cuisine_df.append(pd.DataFrame.from_dict(cuisine_dict, orient='columns'), ignore_index=True)
        
    return cuisine_df 
    '''
    return pd.DataFrame.from_dict(cuisine_dict, orient='columns')

In [103]:
if __name__ == '__main__':
    
    # list of cuisines on BBC Food
    cuisines = ['American', 'British', 'Caribbean', 'Chinese', 'French', 'Greek', 'Indian', \
                'Italian', 'Japanese', 'Mediterranean', 'Mexican', 'Moroccan', 'Spanish', \
                'Thai', 'Turkish', 'Vietnamese']
    search_cuisisnes = ['British']
#     search_cuisisnes = ['African', 'American', 'Asian', 'Australian', 'Austrian', \
#                     'Balinese', 'Belgian', 'Brazilian', 'British', 'Cajun & Creole', \
#                     'Caribbean', 'Chilean', 'Chinese', 'Cuban', 'Danish', 'Eastern European', \
#                     'English', 'French', 'German', 'Greek', 'Hungarian', 'Indian', 'Irish', \
#                     'Italian', 'Japanese', 'Jewish', 'Korean', 'Latin American', \
#                     'Mediterranean', 'Mexican', 'Middle Eastern', 'Moroccan', 'North Africa', \
#                     'Portuguese', 'Scandinavian', 'Scottish', 'Southern & Soul', 'Spanish', \
#                     'Swedish', 'Swiss', 'Thai', 'Tunisian', 'Turkish', 'Vietnamese']
    #coll.delete_many({})
    cuisine_dataframe = get_cuisine_recipes(search_cuisisnes, cuisines)

british 0
british 1
british 2
british 3
british 4
british 5
british 6
british 7
british 8
british 9
british 10
british 11
british 12
british 13
british 14
british 15
british 16
british 17
british 18
british 19
british 20
british 21
british 22
british 23
british 24
british 25
british 26
british 27
british 28
british 29
british 30
british 31
british 32
british 33
british 34
british 35
british 36
british 37
british 38
british 39
british 40
british 41
british 42
british 43
british 44
british 45
british 46
british 47
british 48
british 49
#####
Cuisine: British 	 Number of recipes: 715 		 Number of pages: 275


In [104]:
British_dataframe = cuisine_dataframe
British_dataframe

Unnamed: 0,cuisine,num_recipes,pages,recipes_links,source
0,British,715,275,"<h2 class=""node-title""><a href=""/recipes/4942/...",BBC Good Food
1,British,715,275,"<h2 class=""node-title""><a href=""/recipes/3092/...",BBC Good Food
2,British,715,275,"<h2 class=""node-title""><a href=""/recipes/3229/...",BBC Good Food
3,British,715,275,"<h2 class=""node-title""><a href=""/recipes/1521/...",BBC Good Food
4,British,715,275,"<h2 class=""node-title""><a href=""/recipes/4622/...",BBC Good Food
5,British,715,275,"<h2 class=""node-title""><a href=""/recipes/11695...",BBC Good Food
6,British,715,275,"<h2 class=""node-title""><a href=""/recipes/1997/...",BBC Good Food
7,British,715,275,"<h2 class=""node-title""><a href=""/recipes/2174/...",BBC Good Food
8,British,715,275,"<h2 class=""node-title""><a href=""/recipes/2818/...",BBC Good Food
9,British,715,275,"<h2 class=""node-title""><a href=""/recipes/4382/...",BBC Good Food


#cuisine_recipe_link_dataframe = pd.DataFrame()
#African_dataframe = cuisine_dataframe
African_dataframe
#American_dataframe = cuisine_dataframe
American_dataframe
#Asian_dataframe = cuisine_dataframe
Asian_dataframe
#Australian_dataframe = cuisine_dataframe
Australian_dataframe
#Austrian_dataframe = cuisine_dataframe
Austrian_dataframe
#Balinese_dataframe = cuisine_dataframe
Balinese_dataframe
#Belgian_dataframe = cuisine_dataframe
Belgian_dataframe
#Brazilian_dataframe = cuisine_dataframe
Brazilian_dataframe
#British_dataframe = cuisine_dataframe
British_dataframe

In [105]:
cuisine_recipe_link_dataframe = cuisine_recipe_link_dataframe.append(British_dataframe, ignore_index=True)

In [106]:
cuisine_recipe_link_dataframe

Unnamed: 0,cuisine,num_recipes,pages,recipes_links,source
0,African,5,1,"<h2 class=""node-title""><a href=""/recipes/4750/...",BBC Good Food
1,African,5,1,"<h2 class=""node-title""><a href=""/recipes/25526...",BBC Good Food
2,African,5,1,"<h2 class=""node-title""><a href=""/recipes/cape-...",BBC Good Food
3,African,5,1,"<h2 class=""node-title""><a href=""/recipes/south...",BBC Good Food
4,African,5,1,"<h2 class=""node-title""><a href=""/recipes/dukka...",BBC Good Food
5,American,335,23,"<h2 class=""node-title""><a href=""/recipes/1223/...",BBC Good Food
6,American,335,23,"<h2 class=""node-title""><a href=""/recipes/3431/...",BBC Good Food
7,American,335,23,"<h2 class=""node-title""><a href=""/recipes/2869/...",BBC Good Food
8,American,335,23,"<h2 class=""node-title""><a href=""/recipes/4915/...",BBC Good Food
9,American,335,23,"<h2 class=""node-title""><a href=""/recipes/2882/...",BBC Good Food


In [107]:
cuisine_recipe_link_dataframe.to_csv("recipes_links_A_Balinese_Belgian_Brazilian_British1.csv")

In [None]:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

browser = webdriver.Chrome("/Applications/chromedriver")  
#browser.wait = WebDriverWait(browser, 5)
browser.set_page_load_timeout(10)
#browser.set_preference('webdriver.load.strategy', 'unstable')
#WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.CLASS_NAME,'first last')))
try:
    browser.get("http://www.bbcgoodfood.com/search/recipes?query=#page=3&path=cuisine/american")
except:
    browser.page_source
browser.page_source

import requests, browsercookie
URL = 'http://www.bbcgoodfood.com/recipes/collection/american'

#cj = browsercookie.chrome()
#print dict_from_cookiejar(cj)
#my_cookies = {'__gads': 'ID=34a7a4f38cb8817a:T=1451292952:S=ALNI_MbDpXpQCYmlnxDPCWH-fs0lSnMDMQ', '__utma': '113338708.821724792.1451292937.1451292937.1451292937.1', '__utmb': '113338708.1.10.1451292937', '__utmc': '113338708', '__utmt': '1', '__utmz': '113338708.1451292937.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', '_qsst_s': '1451292929507', '_qst_s': '1', 'has_js':'1', 'x_qtag_1309579': 'EYWbbcgoodfood.com*1451292929509*category*cuisines*recipes@*a*Qsc*Q*j1*C*B1*C*P1*5-@1-*C*R*Z*a*Idirect*Y*9-*@0-/@4-/@2-/@3-*Y*A@1-*b*E*C*F*Q*@0-/@4-/@2-/@3-*Y*Q__v*z'}

headers = {"User-agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36"}

r = requests.get(URL,headers=headers)#, cookies=my_cookies)
print r.content
r.status_code