In [2]:
from time import sleep
import requests
from bs4 import BeautifulSoup
import re
from math import ceil
import pandas as pd
from pymongo import MongoClient

In [3]:
SCRAPING_REQUEST_STAGGER = 2.0 # delay slept before each HTTP request, in seconds
# prefix of a cuisine landing page on BBC Food; the cuisine slug is appended to it
URL = 'http://www.bbc.co.uk/food/cuisines/'
# recipe search results for one cuisine; format() receives (page number, cuisine slug)
CUISINE_URL = 'http://www.bbc.co.uk/food/recipes/search?page={}&cuisines%5B0%5D={}&sortBy=lastModified'
# site root; format() receives the site-relative recipe path taken from a result link
RECIPE_URL = 'http://www.bbc.co.uk{}'
# kept as a float so that recipe_count / NUMBER_OF_RECIPES_PER_PAGE is a true
# division (not Python 2 integer division) before ceil() is applied
NUMBER_OF_RECIPES_PER_PAGE = 15.

# MongoDB database and collection names.
# NOTE(review): 'RECIPIES' is misspelled, but the string is the actual database
# name - changing it would point the notebook at a different database.
DB_NAME = 'PROJECT_RECIPIES'
COLLECTION_NAME = 'BBC'

# MongoClient() is called without arguments, i.e. with the driver's default
# connection settings. `coll` is only referenced from commented-out lines in
# the visible cells.
client = MongoClient()
db = client[DB_NAME]
coll = db[COLLECTION_NAME]

In [4]:
def get_content_from_url(link, timeout=30):
    """Fetch ``link`` after the politeness delay and return the raw body.

    Parameters
    ----------
    link : str
        Absolute URL to request.
    timeout : float, optional
        Seconds to wait for the server before giving up; passed straight to
        ``requests.get``. Without a timeout a stalled connection would block
        the whole scrape indefinitely.

    Returns
    -------
    bytes or False
        ``response.content`` when the server answers with HTTP 200.
        ``False`` for any other status code, or when the request itself
        fails (connection error, timeout, invalid URL, ...). Callers test
        the result with ``if not ...``.
    """
    # throttle: wait before every single request so the site is not hammered
    sleep(SCRAPING_REQUEST_STAGGER)

    try:
        response = requests.get(link, timeout=timeout)
    except requests.exceptions.RequestException as error:
        # a network failure is reported through the same "no content" signal
        # as a bad status code, so one broken page cannot abort a long run
        print("request failed for %s: %s" % (link, error))
        return False

    # anything other than 200 is treated as "no usable content"
    if response.status_code != 200:
        return False
    return response.content

In [5]:
def get_number_of_recipes(cuisine):
    """Return the number of recipes BBC Food lists for ``cuisine``.

    Parameters
    ----------
    cuisine : str
        Cuisine slug as used in the site URL; it is appended to ``URL``.

    Returns
    -------
    int or None
        The recipe count read from the ``span.recipe-count`` element, or
        ``None`` when the page could not be fetched, the element is missing,
        or it contains no digits.
    """
    cuisine_link = URL + cuisine
    cuisine_recipes = get_content_from_url(cuisine_link)

    # nothing came back from the cuisine page
    if not cuisine_recipes:
        print("no content for: %s" % cuisine_link)
        return None

    # name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed (and warns), so results can differ between machines
    soup_cuisine = BeautifulSoup(cuisine_recipes, "html.parser")

    count_tag = soup_cuisine.find("span", {"class": "recipe-count"})
    if count_tag is None:
        print("no recipe count found on: %s" % cuisine_link)
        return None

    # keep only the digits so separators or surrounding words cannot break int()
    digits = re.sub(r'\D', '', count_tag.get_text())
    if not digits:
        print("recipe count is not a number on: %s" % cuisine_link)
        return None
    return int(digits)

In [6]:
def get_cuisine_pages(cuisine, page):
    """Return the recipe entries found on one search-result page of a cuisine.

    Parameters
    ----------
    cuisine : str
        Cuisine slug as used in the site URL.
    page : int
        Page number of the search results; substituted into ``CUISINE_URL``
        together with ``cuisine``.

    Returns
    -------
    list or None
        The ``h3`` tags inside the ``div#article-list`` container (each wraps
        the link to one recipe). ``None`` when the page could not be fetched;
        an empty list when the page loaded but has no result container.
    """
    link = CUISINE_URL.format(str(page), cuisine)
    cuisine_recipe_links = get_content_from_url(link)
    if not cuisine_recipe_links:
        print("no content for: %s" % link)
        return None

    # name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed (and warns), so results can differ between machines
    soup_search = BeautifulSoup(cuisine_recipe_links, "html.parser")

    article_list = soup_search.find("div", {"id": "article-list"})
    if article_list is None:
        # the page was served but holds no results: report "no entries"
        # instead of failing with AttributeError on None
        print("no recipe list found on: %s" % link)
        return []

    # one h3 per recipe on the page
    return article_list.find_all("h3")

In [7]:
def get_recipe(r_link):
    """Download one recipe page and parse it.

    Parameters
    ----------
    r_link : str
        Site-relative recipe path; it is substituted into ``RECIPE_URL`` to
        build the absolute URL.

    Returns
    -------
    BeautifulSoup or None
        The parsed page, or ``None`` when no content could be fetched.
    """
    recipe_link = RECIPE_URL.format(r_link)
    recipe_response = get_content_from_url(recipe_link)
    if not recipe_response:
        print("no content for: %s" % recipe_link)
        return None

    # name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed (and warns), so results can differ between machines
    return BeautifulSoup(recipe_response, "html.parser")

In [8]:
def get_recipe_title(soup_recipe):
    """Return the recipe title, or ``None`` when the page has no title element.

    Parameters
    ----------
    soup_recipe : BeautifulSoup
        Parsed recipe page.

    Returns
    -------
    str or None
        Text of the ``h1.fn`` element; ``None`` if that element is absent,
        matching how the other field extractors report a missing element.
    """
    title_tag = soup_recipe.find("h1", {"class": "fn"})

    # find() yields None when nothing matches; do not call get_text() on it
    if title_tag is None:
        return None

    return title_tag.get_text()

In [9]:
def get_recipe_chef(soup_recipe):
    """Return the text of the page's ``span.author`` element, or ``None``."""
    author_tag = soup_recipe.find("span", {"class": "author"})

    # not every recipe credits a chef
    return author_tag.get_text() if author_tag else None

In [10]:
def get_description(soup_recipe):
    """Return the text of the page's ``span.summary`` element, or ``None``."""
    summary_tag = soup_recipe.find("span", {"class": "summary"})

    if summary_tag:
        return summary_tag.get_text()

    # the recipe page carries no summary
    return None

In [11]:
def get_recipe_ingredients(soup_recipe):
    """Return the text of every ``p.ingredient`` element, in page order."""
    ingredient_tags = soup_recipe.find_all("p", {"class": "ingredient"})
    return [tag.get_text() for tag in ingredient_tags]

In [12]:
def get_recipe_preperation(soup_recipe):
    """Return the preparation steps: text of each ``li.instruction`` element.

    Leading and trailing whitespace (including newlines) is stripped from
    every step.
    """
    instruction_tags = soup_recipe.find_all("li", {"class": "instruction"})
    return [tag.get_text().strip() for tag in instruction_tags]

In [13]:
def get_recipe_time(soup_recipe):
    """Return ``(prep_time, cooking_time)`` as shown on the recipe page.

    Each value is the text of the matching ``span`` (class ``prepTime`` and
    ``cookTime`` respectively), or ``None`` when the page does not have it.
    """
    times = []
    # preparation time first, cooking time second
    for css_class in ("prepTime", "cookTime"):
        time_tag = soup_recipe.find("span", {"class": css_class})
        times.append(time_tag.get_text() if time_tag else None)
    return tuple(times)

In [14]:
def get_servings(soup_recipe):
    """Return the text of the page's ``h3.yield`` element, or ``None``."""
    selector = {"class": "yield"}
    yield_tag = soup_recipe.find("h3", selector)
    return None if not yield_tag else yield_tag.get_text()

In [15]:
def get_recommendations(soup_recipe):
    """Return the text of the page's ``h2.description`` element, or ``None``."""
    heading = soup_recipe.find("h2", {"class": "description"})

    if heading:
        return heading.get_text()

    # no recommendation heading on this page
    return None

In [16]:
def get_image_source(soup_recipe):
    """Return the ``src`` of the ``img#food-image`` element, or ``None``."""
    image_tag = soup_recipe.find("img", {"id": "food-image"})

    # recipes without a photo have no such element
    return image_tag["src"] if image_tag else None

In [17]:
def get_recipe_details(recipe_links):
    """Scrape the details of every recipe referenced by ``recipe_links``.

    Parameters
    ----------
    recipe_links : iterable
        Search-result entries; each is expected to expose the recipe anchor
        as ``entry.a`` with an ``href`` holding the site-relative path.

    Returns
    -------
    dict
        One item per successfully scraped recipe. The value is a dict with
        the keys ``'r_link'``, ``'recipe title'``, ``'chef'``,
        ``'description'``, ``'ingredient list'``, ``'preperation steps'``,
        ``'prep_time'``, ``'cook_time'``, ``'servings'``,
        ``'recommendations'`` and ``'image_source'``. The item key is the
        recipe title; the recipe link is used instead when the title is
        missing or already taken by a different recipe, so no recipe is
        silently overwritten.
    """
    cuisine_recipes = {}

    for r in recipe_links:
        # an entry without an anchor carries no link to follow
        if r.a is None:
            print("skipping entry without a recipe link")
            continue

        recipe = {}
        recipe['r_link'] = r.a["href"]
        print("recipe link:  %s" % recipe['r_link'])

        # no extra sleep here: get_content_from_url (reached through
        # get_recipe) already waits SCRAPING_REQUEST_STAGGER before each request
        soup_recipe = get_recipe(recipe['r_link'])
        if soup_recipe is None:
            # the page could not be fetched (get_recipe reported it); move on
            # instead of crashing on the first extractor
            continue

        recipe['recipe title'] = get_recipe_title(soup_recipe)
        recipe['chef'] = get_recipe_chef(soup_recipe)
        recipe['description'] = get_description(soup_recipe)
        recipe['ingredient list'] = get_recipe_ingredients(soup_recipe)
        recipe['preperation steps'] = get_recipe_preperation(soup_recipe)
        recipe['prep_time'], recipe['cook_time'] = get_recipe_time(soup_recipe)
        recipe['servings'] = get_servings(soup_recipe)
        recipe['recommendations'] = get_recommendations(soup_recipe)
        recipe['image_source'] = get_image_source(soup_recipe)

        # key by title, but never let a missing title or a second recipe with
        # the same title replace an entry that is already stored
        key = recipe['recipe title'] or recipe['r_link']
        existing = cuisine_recipes.get(key)
        if existing is not None and existing['r_link'] != recipe['r_link']:
            key = recipe['r_link']
        cuisine_recipes[key] = recipe

    return cuisine_recipes

In [18]:
def get_recipe_links(cuisine, pages):
    """Collect the recipe entries of a cuisine and scrape each recipe.

    Parameters
    ----------
    cuisine : str
        Cuisine slug as used in the site URL.
    pages : int
        Number of search-result pages to walk (pages ``1 .. pages``).

    Returns
    -------
    dict
        The result of ``get_recipe_details`` for all entries gathered from
        the pages that could be fetched.
    """
    recipe_links = []

    # range() instead of xrange(): page counts are small and range() exists
    # on both Python 2 and 3
    for page in range(1, pages + 1):
        # no extra sleep here: get_content_from_url (reached through
        # get_cuisine_pages) already waits before each request
        page_links = get_cuisine_pages(cuisine, page)
        if not page_links:
            # get_cuisine_pages returns None when the page could not be
            # fetched; skip that page rather than fail in extend()
            continue
        recipe_links.extend(page_links)

    return get_recipe_details(recipe_links)

In [19]:
def get_cuisine_recipes(cuisines):
    """Scrape every cuisine in ``cuisines`` and return the data as one frame.

    Parameters
    ----------
    cuisines : iterable of str
        Cuisine titles; each is lower-cased and its spaces are replaced with
        underscores to form the slug used in the site URLs.

    Returns
    -------
    pandas.DataFrame
        The per-cuisine frames stacked with a fresh integer index. Each
        per-cuisine frame is built from a dict with the keys ``'cuisine'``,
        ``'source'``, ``'num_recipes'``, ``'pages'`` and
        ``'recipes_details'``. Empty when ``cuisines`` is empty or no cuisine
        could be scraped.
    """
    # collect the per-cuisine frames and concatenate once at the end; growing
    # a frame with DataFrame.append inside the loop is quadratic, and that
    # method no longer exists in pandas 2.x
    cuisine_frames = []

    for cuisine in cuisines:
        cuisine_dict = {}

        cuisine_dict['cuisine'] = cuisine
        cuisine_dict['source'] = 'BBC Food'

        # convert text to lower case and replace all spaces in the cuisine
        # title with '_', as used by the website
        cuisine_no_space = cuisine.lower().replace(" ", "_")

        # the count is None when the cuisine page could not be read; skip the
        # cuisine instead of failing on the division below
        num_recipes = get_number_of_recipes(cuisine_no_space)
        if num_recipes is None:
            print("skipping cuisine, recipe count unavailable: %s" % cuisine)
            continue
        cuisine_dict['num_recipes'] = num_recipes

        # number of result pages to scrape; float() keeps this a true
        # division on Python 2 whatever the type of the constant
        cuisine_dict['pages'] = int(ceil(num_recipes / float(NUMBER_OF_RECIPES_PER_PAGE)))

        # print the cuisine details
        print('#####')
        print("Cuisine: %s \t Number of recipes: %d \t\t Number of pages: %d"
              % (cuisine, cuisine_dict['num_recipes'], cuisine_dict['pages']))

        cuisine_dict['recipes_details'] = get_recipe_links(cuisine_no_space, cuisine_dict['pages'])

        # persistence to MongoDB is currently disabled:
        #coll.insert_one(cuisine_dict)

        cuisine_frames.append(pd.DataFrame.from_dict(cuisine_dict, orient='columns'))

    # pd.concat refuses an empty list, so return an empty frame explicitly
    if not cuisine_frames:
        return pd.DataFrame()
    return pd.concat(cuisine_frames, ignore_index=True)

In [30]:
if __name__ == '__main__':

    # Cuisines to scrape in this run; only one is enabled. The list of cuisine
    # titles on BBC Food recorded by the original comment is:
    #   'African', 'American', 'British', 'Caribbean', 'Chinese', 'French',
    #   'Greek', 'Indian', 'Irish', 'Italian', 'Japanese', 'Mexican', 'Nordic',
    #   'North African', 'Portuguese', 'South American', 'Spanish',
    #   'Thai and South-east Asian', 'Turkish and Middle Eastern'
    # NOTE(review): the original comment had a gap after 'Chinese' (", ,"),
    # so an entry may be missing from this list - confirm against the site.
    cuisines = ['Thai and South-east Asian']
    # disabled: would delete every document in the MongoDB collection first
    #coll.delete_many({})
    cuisine_dataframe = get_cuisine_recipes(cuisines)
    # NOTE(review): original author's note - "the A's and C's are not there in
    # mongo"; presumably the cuisines starting with A and C were not stored
    # in MongoDB - confirm.

#####
Cuisine: Thai and South-east Asian 	 Number of recipes: 223 		 Number of pages: 15
recipe link:  /food/recipes/nasi_goreng_with_lime_87031
recipe link:  /food/recipes/thai_prawn_curry_with_09136
recipe link:  /food/recipes/halibut_with_spicy_37680
recipe link:  /food/recipes/stirfrylimeandcoconu_80402
recipe link:  /food/recipes/thai_noodles_with_72276
recipe link:  /food/recipes/hot_and_spicy_thai_squid_01824
recipe link:  /food/recipes/nice_and_spicy_thai_90314
recipe link:  /food/recipes/sea_bass_with_spring_70065
recipe link:  /food/recipes/chickenlimeandcashew_4133
recipe link:  /food/recipes/helens_coconut_pandan_32016
recipe link:  /food/recipes/summer_salad_with_72489
recipe link:  /food/recipes/thaigreenchickencurr_92440
recipe link:  /food/recipes/whole_baked_seabass_with_36512
recipe link:  /food/recipes/grilled_lamb_chops_with_01274
recipe link:  /food/recipes/mickeys_khao_soi_noodles_24933
recipe link:  /food/recipes/bobbys_vegetables_cooked_00387
recipe link:  /food

In [31]:
# preview only the first rows instead of dumping the whole frame
# (it holds one row per scraped recipe)
cuisine_dataframe.head(10)

Unnamed: 0,cuisine,num_recipes,pages,recipes_details,source
0,Thai and South-east Asian,223,15,"{u'cook_time': u'30 mins to 1 hour', u'descrip...",BBC Food
1,Thai and South-east Asian,223,15,"{u'cook_time': u'30 mins to 1 hour', u'descrip...",BBC Food
2,Thai and South-east Asian,223,15,"{u'cook_time': u'30 mins to 1 hour', u'descrip...",BBC Food
3,Thai and South-east Asian,223,15,"{u'cook_time': u'No cooking required', u'descr...",BBC Food
4,Thai and South-east Asian,223,15,"{u'cook_time': u'Over 2 hours', u'description'...",BBC Food
5,Thai and South-east Asian,223,15,"{u'cook_time': u'10 to 30 mins', u'description...",BBC Food
6,Thai and South-east Asian,223,15,"{u'cook_time': u'10 to 30 mins', u'description...",BBC Food
7,Thai and South-east Asian,223,15,"{u'cook_time': u'30 mins to 1 hour', u'descrip...",BBC Food
8,Thai and South-east Asian,223,15,"{u'cook_time': u'30 mins to 1 hour', u'descrip...",BBC Food
9,Thai and South-east Asian,223,15,"{u'cook_time': u'30 mins to 1 hour', u'descrip...",BBC Food


In [32]:
# persist the scraped frame as CSV; the relative file name is resolved against
# the current working directory and names the single cuisine scraped above
cuisine_dataframe.to_csv("recipes_data_BBC_Food_Thai_and_South-east_Asian.csv")

# --- scratch cells: exploratory code, not part of the scraping pipeline ---
# NOTE(review): `links`, `namespace`, `single_query`, `cuisine` and `page` are
# not defined in any visible cell, so these lines fail on a fresh kernel unless
# they are defined elsewhere - confirm, or remove the cells.

# rebinds `cuisines`, replacing the list used by the main cell
cuisines = ['indian']

# print the href of every result entry
for l in links:
    print (l.a["href"])
    #print single_query("http://www.bbc.co.uk{}".format(l.a["href"]))

# this statement sits outside the loop, so it uses the last `l` only: the page
# of the final link is stored under a key built from `cuisine` and `page`
namespace['recipe_str_%s_%d_1' % (cuisine, page)] = single_query("http://www.bbc.co.uk{}".format(l.a["href"]))

# display the stored page
namespace['recipe_str_%s_%d_1' % (cuisine, page)]

# Scratch: parse the page stored under the 'recipe_str_...' key and pull out
# its title, summary, ingredients and preparation steps.
# NOTE(review): `namespace`, `cuisine` and `page` are not defined in any
# visible cell - confirm where they come from.
soup = BeautifulSoup(namespace['recipe_str_%s_%d_1' % (cuisine, page)])
recipe_title = soup.find("div",{"class":"article-title"}).find("h1").get_text()
description_summary = soup.find("span",{"class":"summary"}).get_text()
ingredient_list = soup.find_all("p",{"class":"ingredient"})

# an empty list is created under a key starting with 'ingredients_' ...
namespace['ingredients_%s_%s' % (cuisine, recipe_title[:20])] = []

# NOTE(review): ... but the loop appends under a key starting with
# 'ingredients_with quant_', which is not the key initialised above. If
# `namespace` is a plain dict and that key does not already exist, this raises
# KeyError - confirm which of the two keys is intended.
for ing in ingredient_list:
    namespace['ingredients_with quant_%s_%s' % (cuisine, recipe_title[:20])].append(ing.get_text())

# preparation steps: every <li> of ol.instructions inside div#preparation
prep_steps = soup.find("div",{"id":"preparation"}).find("ol", {"class":"instructions"}).find_all("li")

# the key uses the first 20 characters of the title; each step is stored
# with surrounding whitespace stripped
namespace['prep_%s_%s' % (cuisine, recipe_title[:20])] = []
for step in prep_steps:
    namespace['prep_%s_%s' % (cuisine, recipe_title[:20])].append(step.get_text().strip())

# list every key created so far
namespace.keys()

# Scratch: nest per-recipe dicts inside an outer dict keyed by each dict's
# own 'recipe' value.
list1 = list(range(1, 5))
list2 = list(range(2, 6))
d1 = {'recipe': "a", "desc": 'b', 'prep': 'c'}
d2 = {'recipe': "d", "desc": 'e', 'prep': 'f'}
# the outer dict holds the very same dict objects, not copies
d3 = dict((entry['recipe'], entry) for entry in (d1, d2))