In [1]:
from time import sleep
import requests
from bs4 import BeautifulSoup
from math import ceil
import pandas as pd
from pymongo import MongoClient

In [2]:
# pause applied before each HTTP request, to avoid hammering the site
SCRAPING_REQUEST_STAGGER = 4.0 # in seconds
# paginated recipe listing on Chowhound; '{}' is replaced by the page number
URL = 'http://www.chowhound.com/recipes?page={}'
# listing page size; only used to estimate the total recipe count in get_recipes()
NUMBER_OF_RECIPES_PER_PAGE = 27

# MongoDB database / collection names
# NOTE(review): 'RECIPIES' looks like a typo for 'RECIPES', but it is the name
# used at runtime -- renaming it would point at a different database
DB_NAME = 'PROJECT_RECIPIES'
COLLECTION_NAME = 'Chowhound'

# MongoClient() without arguments relies on pymongo's default connection settings
# NOTE(review): `coll` is only referenced from commented-out lines in the code shown here
client = MongoClient()
db = client[DB_NAME]
coll = db[COLLECTION_NAME]

In [3]:
def get_content_from_url(link, timeout=30):
    """Fetch ``link`` after a politeness delay and return the raw response body.

    Args:
        link: absolute URL to request.
        timeout: seconds to wait for the server before giving up.  Without a
            timeout a stalled connection would block the whole scrape.

    Returns:
        ``response.content`` when the server answers with HTTP 200, otherwise
        ``False``.  Network-level failures (timeout, connection error, ...)
        are reported as ``False`` too, so callers can keep relying on their
        ``if not content`` check.
    """
    # throttle every request so the site is not hammered
    sleep(SCRAPING_REQUEST_STAGGER)

    try:
        response = requests.get(link, timeout=timeout)
    except requests.RequestException:
        # one unreachable page should not abort a long-running scrape
        return False

    # any status other than 200 is treated as "no content"
    if response.status_code != 200:
        return False

    return response.content

In [4]:
def get_number_of_pages():
    """Read the number of listing pages from the first recipe listing page.

    The pagination widget is expected to contain a ``<span class="last">``
    whose anchor points at the final page through a ``page=<n>`` parameter
    (the same parameter ``URL`` uses).

    Returns:
        The last page number as an ``int``, or ``None`` when the first page
        cannot be downloaded or no usable pagination link is found.
    """
    first_page_link = URL.format("1")
    first_page = get_content_from_url(first_page_link)

    # nothing came back for the first listing page
    if not first_page:
        print("no content for: %s" % first_page_link)
        return None

    # convert web content to soup to better access the content
    soup = BeautifulSoup(first_page)

    # Walk the chain step by step: the one-line lookup used before raised
    # AttributeError/TypeError as soon as the span or its anchor was missing.
    last_span = soup.find("span", {"class": "last"})
    last_anchor = last_span.a if last_span is not None else None
    href = last_anchor.get("href") if last_anchor is not None else None

    # keep only the value of the "page" parameter, dropping anything after it
    page_value = href.partition("page=")[2].split("&")[0] if href else ""
    if not page_value.isdigit():
        print("no pagination link found on: %s" % first_page_link)
        return None

    return int(page_value)

In [5]:
def get_recipe_links_by_page(page):
    """Collect the recipe teaser elements found on one listing page.

    Args:
        page: page number substituted into ``URL``.

    Returns:
        The ``<div class="image_link_medium">`` elements of the page (callers
        read the recipe URL from each element's first anchor), or an empty
        list when the page cannot be downloaded.
    """
    page_link = URL.format(page)
    listing_html = get_content_from_url(page_link)
    if not listing_html:
        # this message used to reference `link`, an undefined name, so every
        # failed page ended in a NameError instead of being reported
        print("no content for: %s" % page_link)
        # an empty list keeps callers that extend()/iterate the result working
        return []

    soup_search = BeautifulSoup(listing_html)

    # return the list of recipe teaser elements for the page
    return soup_search.find_all("div", {"class": "image_link_medium"})

In [6]:
def get_recipe(recipe_link):
    # make web request and obtain site content for recipe page
    recipe_response = get_content_from_url(recipe_link)
    if not recipe_response:
        print "no content for:", recipe_link
        return None
    
    return BeautifulSoup(recipe_response)

In [7]:
def get_recipe_title(soup_recipe):
    """Return the recipe title, or None when the page has no title heading.

    Args:
        soup_recipe: parsed recipe page.

    Returns:
        Text of ``<h1 itemprop="name">`` with surrounding whitespace removed,
        or ``None`` if that heading is absent (same convention as the other
        ``get_*`` helpers, instead of raising AttributeError).
    """
    title = soup_recipe.find("h1", {"itemprop": "name"})
    if title is None:
        return None

    return title.get_text().strip()

In [8]:
def get_recipe_chef(soup_recipe):
    """Return the author name from ``<span itemprop="author">``, or None if absent.

    Only surrounding whitespace is removed; the text is otherwise returned
    exactly as it appears on the page.
    """
    author_tag = soup_recipe.find("span",{"itemprop":"author"})
    return author_tag.get_text().strip() if author_tag else None

In [9]:
def get_description(soup_recipe):
    """Return the stripped text of ``<div itemprop="description">``, or None if absent."""
    description_tag = soup_recipe.find("div",{"itemprop":"description"})
    if description_tag:
        return description_tag.get_text().strip()

    # the page carries no description block
    return None

In [10]:
def get_recipe_ingredients(soup_recipe):
    """Return the text of every ``<li itemprop="ingredients">`` in page order.

    The text is returned as found (no stripping); a page without ingredient
    items yields an empty list.
    """
    return [item.get_text()
            for item in soup_recipe.find_all("li", {"itemprop":"ingredients"})]

In [11]:
def get_recipe_preperation(soup_recipe):
    """Return the preparation steps of a recipe as a list of strings.

    Args:
        soup_recipe: parsed recipe page.

    Returns:
        One stripped string per ``<li>`` inside
        ``<div itemprop="recipeInstructions">``, in page order.  A page
        without that container yields an empty list instead of raising
        AttributeError.
    """
    instructions = soup_recipe.find("div", {"itemprop": "recipeInstructions"})
    if instructions is None:
        return []

    # get text for each step and strip whitespace/new lines at both ends
    return [step.get_text().strip() for step in instructions.find_all("li")]

In [12]:
def get_recipe_time(soup_recipe):
    """Return the ``(total_time, active_time)`` strings of a recipe.

    Args:
        soup_recipe: parsed recipe page.

    Returns:
        A 2-tuple of stripped strings.  ``total_time`` comes from
        ``<time itemprop="totalTime">``; ``active_time`` from the ``<time>``
        nested in ``<span class="frr_totaltime frr_active">``.  Either entry
        is ``None`` when the corresponding markup is missing.
    """
    # total preparation time
    total_tag = soup_recipe.find("time", {"itemprop": "totalTime"})
    total_time = total_tag.get_text().strip() if total_tag is not None else None

    # active (hands-on) time lives in a <time> nested inside its own span
    active_time = None
    active_span = soup_recipe.find("span", {"class": "frr_totaltime frr_active"})
    if active_span is not None:
        active_tag = active_span.find("time")
        # the span can exist without the inner <time>; that used to raise
        # AttributeError on the chained get_text()
        if active_tag is not None:
            active_time = active_tag.get_text().strip()

    return total_time, active_time

In [13]:
def get_servings(soup_recipe):
    """Return the text of ``<span itemprop="recipeYield">`` as found, or None if absent."""
    yield_tag = soup_recipe.find("span", {"itemprop":"recipeYield"})
    return yield_tag.get_text() if yield_tag else None

In [14]:
def get_recipe_difficulty(soup_recipe):
    """Return the stripped text of ``<span class="frr_difficulty fr_sep">``, or None if absent."""
    level_tag = soup_recipe.find("span", {"class":"frr_difficulty fr_sep"})
    if level_tag:
        return level_tag.get_text().strip()

    # no difficulty badge on this page
    return None

In [15]:
def get_ratings(soup_recipe):
    """Return ``(rating, rating_count)`` as the raw text of the rating spans.

    ``rating`` is read from ``<span itemprop="ratingValue">`` and
    ``rating_count`` from ``<span itemprop="reviewCount">``; each entry is
    None when its span is missing.  The values are not converted to numbers.
    """
    value_tag = soup_recipe.find("span", {"itemprop":"ratingValue"})
    count_tag = soup_recipe.find("span", {"itemprop":"reviewCount"})

    rating_value = value_tag.get_text() if value_tag else None
    review_count = count_tag.get_text() if count_tag else None
    return rating_value, review_count

In [16]:
def get_nutrition_per_serving(soup_recipe):
    """Return the raw text of ``<div class="nutritional-info">``, or None if absent."""
    nutrition_tag = soup_recipe.find("div", {"class":"nutritional-info"})
    return nutrition_tag.get_text() if nutrition_tag else None

In [17]:
def get_image_source(soup_recipe):
    """Return the ``src`` of ``<img id="recipe_top_img">``, or None when the image is absent."""
    img_tag = soup_recipe.find("img", {"id":"recipe_top_img"})
    if img_tag:
        return img_tag["src"]

    # recipe page has no top image
    return None

In [18]:
def get_recipe_details(recipe_links):
    """Download every recipe in ``recipe_links`` and extract its details.

    Args:
        recipe_links: iterable of listing elements whose first ``<a>`` carries
            the recipe URL in its ``href``.

    Returns:
        dict mapping recipe title -> dict of scraped fields.  Elements without
        an anchor/href, links that do not contain ``www.chowhound.com`` and
        pages that cannot be downloaded are skipped.  Recipes that share a
        title overwrite each other, because the title is the key.
    """
    cuisine_recipes = {}

    # loop over recipe links to get recipe details for each recipe page
    for link_block in recipe_links:
        anchor = link_block.a
        href = anchor.get("href") if anchor is not None else None
        # a teaser without a usable link cannot be scraped; off-site links are ignored
        if not href or "www.chowhound.com" not in href:
            continue

        print("recipe link:  %s" % href)

        # NOTE(review): get_recipe() goes through get_content_from_url(), which
        # sleeps as well, so each recipe waits twice; kept as it was
        sleep(SCRAPING_REQUEST_STAGGER)
        soup_recipe = get_recipe(href)
        if soup_recipe is None:
            # download failed (already reported by get_recipe); the extractors
            # below would crash on None, so move on to the next recipe
            continue

        recipe = {'r_link': href}
        recipe['recipe title'] = get_recipe_title(soup_recipe)
        recipe['chef'] = get_recipe_chef(soup_recipe)
        recipe['description'] = get_description(soup_recipe)
        recipe['ingredient list'] = get_recipe_ingredients(soup_recipe)
        recipe['preperation steps'] = get_recipe_preperation(soup_recipe)
        recipe['total_time'], recipe['active_time'] = get_recipe_time(soup_recipe)
        recipe['servings'] = get_servings(soup_recipe)
        recipe['skill_level'] = get_recipe_difficulty(soup_recipe)
        recipe['rating'], recipe['rating count'] = get_ratings(soup_recipe)
        recipe['nutritional_info'] = get_nutrition_per_serving(soup_recipe)
        recipe['image_source'] = get_image_source(soup_recipe)

        # key by title; fall back to the URL so an untitled page is not filed under None
        cuisine_recipes[recipe['recipe title'] or href] = recipe

    return cuisine_recipes

In [78]:
def get_recipe_links(pages, page_start=31, page_end=40):
    """Scrape a range of listing pages and return the details of their recipes.

    Args:
        pages: total number of listing pages on the site, or ``None`` when
            unknown.  Used to keep ``page_end`` from running past the last page.
        page_start: first listing page to scrape (inclusive).
        page_end: last listing page to scrape (inclusive).  The defaults
            reproduce the 31-40 batch this function used to have hard-coded.

    Returns:
        dict mapping recipe title -> scraped details, as produced by
        ``get_recipe_details``.
    """
    # never ask for pages beyond the last one the site reports
    if pages is not None:
        page_end = min(page_end, pages)
    print("page_start: %d, page_end: %d" % (page_start, page_end))

    recipe_links = []
    seen_hrefs = set()
    for page in range(page_start, page_end + 1):
        # NOTE(review): get_recipe_links_by_page() goes through
        # get_content_from_url(), which sleeps as well; kept as it was
        sleep(SCRAPING_REQUEST_STAGGER)
        # `or []`: a page that failed to download contributes nothing
        # instead of breaking the loop
        for link_block in get_recipe_links_by_page(page) or []:
            anchor = link_block.a
            href = anchor.get("href") if anchor is not None else None
            # de-duplicate on the URL itself (first occurrence wins, order kept);
            # elements without a link cannot be scraped and are dropped here
            if not href or href in seen_hrefs:
                continue
            seen_hrefs.add(href)
            recipe_links.append(link_block)

    # get recipe details for the unique recipe links
    return get_recipe_details(recipe_links)

In [79]:
def get_recipes(num_of_pages):
    """Scrape Chowhound recipes and return them as a DataFrame.

    Args:
        num_of_pages: number of listing pages on the site, or ``None`` when it
            could not be determined (the recipe estimate is then ``None`` too,
            instead of the call failing with a TypeError).

    Returns:
        ``pandas.DataFrame`` built from one dict holding the site-level values
        ('cuisine', 'source', 'num_recipes', 'pages') plus 'recipes_details',
        the title -> details mapping returned by ``get_recipe_links``.
    """
    cuisine_dict = {
        'cuisine': 'Unknown',
        'source': 'Chowhound',
        # number of listing pages reported by the site
        'pages': num_of_pages,
    }

    # estimate only: assumes every listing page is completely filled
    if num_of_pages is None:
        cuisine_dict['num_recipes'] = None
    else:
        cuisine_dict['num_recipes'] = NUMBER_OF_RECIPES_PER_PAGE * num_of_pages

    # print the scrape summary
    print('#####')
    print("Cuisine: %s \t Number of recipes: %r \t\t Number of pages: %r"
          % (cuisine_dict['cuisine'], cuisine_dict['num_recipes'], cuisine_dict['pages']))

    cuisine_dict['recipes_details'] = get_recipe_links(cuisine_dict['pages'])

    # persisting to MongoDB is currently disabled:
    #coll.insert_one(cuisine_dict)

    # convert the dictionary into a dataframe
    return pd.DataFrame.from_dict(cuisine_dict, orient='columns')

In [80]:
if __name__ == '__main__':
    # entry point: find out how many listing pages exist, then scrape
    # (__name__ is '__main__' inside a notebook kernel, so the cell runs when executed)
    
    # disabled: would empty the Mongo collection before a fresh scrape
    #coll.delete_many({})
    num_of_pages = get_number_of_pages()
    cuisine_dataframe = get_recipes(num_of_pages)

#####
Cuisine: Unknown 	 Number of recipes: 6318 		 Number of pages: 234
page_start: 31, page_end: 40
recipe link:  http://www.chowhound.com/recipes/the-sweeter-welcome-punch-30602
recipe link:  http://www.chowhound.com/recipes/tarragon-chicken-salad-30698
recipe link:  http://www.chowhound.com/recipes/copa-verde-11038
recipe link:  http://www.chowhound.com/recipes/roasted-shrimp-with-romesco-sauce-10752
recipe link:  http://www.chowhound.com/recipes/green-garlic-aioli-11705
recipe link:  http://www.chowhound.com/recipes/ham-cheese-and-mushroom-strata-30893
recipe link:  http://www.chowhound.com/recipes/stuffed-bell-peppers-with-feta-and-herbs-10934
recipe link:  http://www.chowhound.com/recipes/orange-honey-and-thyme-brined-turkey-breast-30869
recipe link:  http://www.chowhound.com/recipes/warm-cheesy-swiss-chard-and-roasted-garlic-dip-30685
recipe link:  http://www.chowhound.com/recipes/sauteed-asian-broccoli-30676
recipe link:  http://www.chowhound.com/recipes/rib-eye-with-pineapple

In [85]:
# NOTE(review): this cell carries execution count 85 but sits above cells 82/83;
# the 'recipe_title' column in its output only exists once those cells have run
cuisine_dataframe

Unnamed: 0,recipe_title,cuisine,num_recipes,pages,recipes_details,source
0,Adobo-Marinated BBQ Chicken,Unknown,6318,234,"{u'total_time': u'1 hr 30 mins', u'active_time...",Chowhound
1,Adobo-Marinated Chicken Tacos,Unknown,6318,234,"{u'total_time': u'1 hr, plus marinating time',...",Chowhound
2,Almost Arnie,Unknown,6318,234,"{u'total_time': u'Under 5 mins', u'active_time...",Chowhound
3,Apple Brandy Hot Toddy,Unknown,6318,234,"{u'total_time': u'Under 5 mins', u'active_time...",Chowhound
4,Apple-Cinnamon Waffles,Unknown,6318,234,"{u'total_time': u'55 mins', u'active_time': No...",Chowhound
5,Asparagus Benedict with Chèvre-Dijon Sauce,Unknown,6318,234,"{u'total_time': u'50 mins', u'active_time': No...",Chowhound
6,BBQ Chicken Pizza,Unknown,6318,234,"{u'total_time': u'2 hrs 15 mins', u'active_tim...",Chowhound
7,BLT Scrambled Eggs,Unknown,6318,234,"{u'total_time': u'40 mins', u'active_time': No...",Chowhound
8,Bacon Cheeseburger,Unknown,6318,234,"{u'total_time': u'45 mins', u'active_time': No...",Chowhound
9,Bacon Jam,Unknown,6318,234,"{u'total_time': u'55 mins', u'active_time': No...",Chowhound


In [82]:
# move the current index into an ordinary column named 'index'
# mutates the frame in place, so running this cell a second time is not a no-op
cuisine_dataframe.reset_index(inplace=True)

In [83]:
# give the column produced by reset_index() a meaningful name (in place)
cuisine_dataframe.rename(columns={'index': 'recipe_title'}, inplace=True)

In [86]:
# persist this batch; "31-40" in the file name is the page range scraped by get_recipe_links()
# the row index is written too (to_csv writes it unless index=False is passed)
cuisine_dataframe.to_csv("recipes_data_Chowhound_31-40.csv", encoding='UTF-8')

In [84]:
# show the frame after the index reset and column rename
cuisine_dataframe

Unnamed: 0,recipe_title,cuisine,num_recipes,pages,recipes_details,source
0,Adobo-Marinated BBQ Chicken,Unknown,6318,234,"{u'total_time': u'1 hr 30 mins', u'active_time...",Chowhound
1,Adobo-Marinated Chicken Tacos,Unknown,6318,234,"{u'total_time': u'1 hr, plus marinating time',...",Chowhound
2,Almost Arnie,Unknown,6318,234,"{u'total_time': u'Under 5 mins', u'active_time...",Chowhound
3,Apple Brandy Hot Toddy,Unknown,6318,234,"{u'total_time': u'Under 5 mins', u'active_time...",Chowhound
4,Apple-Cinnamon Waffles,Unknown,6318,234,"{u'total_time': u'55 mins', u'active_time': No...",Chowhound
5,Asparagus Benedict with Chèvre-Dijon Sauce,Unknown,6318,234,"{u'total_time': u'50 mins', u'active_time': No...",Chowhound
6,BBQ Chicken Pizza,Unknown,6318,234,"{u'total_time': u'2 hrs 15 mins', u'active_tim...",Chowhound
7,BLT Scrambled Eggs,Unknown,6318,234,"{u'total_time': u'40 mins', u'active_time': No...",Chowhound
8,Bacon Cheeseburger,Unknown,6318,234,"{u'total_time': u'45 mins', u'active_time': No...",Chowhound
9,Bacon Jam,Unknown,6318,234,"{u'total_time': u'55 mins', u'active_time': No...",Chowhound


recipe link:  http://www.chowhound.com/recipes/devils-on-horseback-29172
{'total_time': u'40 mins', 'active_time': u'\n                20 mins\n              ', 'description': u'We know\u2014wrap pretty much anything in bacon, cook until crisp, and it\u2019s bound to be eaten. This British pub snack, though, with a name that sounds like it\u2019s from some inverted version of the apocalypse, combines salty, crunchy bacon and boozy, sweet plums (a.k.a. prunes). It\u2019s a major crowd-pleaser, and simple to prepare. Steep the dried prunes in an easy syrup of port wine and sugar, then drain (you can do this ahead of time). Wrap a slice of uncooked bacon around each plum and secure with a toothpick. When guests arrive, just pop the prunes in a hot oven and bake until the bacon\u2019s cooked and crispy, the plums hot all the way through. Feel free to experiment, for instance stuff the prunes with blue cheese or toasted walnuts before wrapping.\n\nFor additional appetizer inspiration, check out our Chicken Skewers with Dukkah Crust, Mongolian Beef Kebabs with Chili Jam, and Sesame Shrimp with Cilantro-Lime Sauce.', 'rating': u'5.0', 'nutritional_info': None, 'skill_level': u'Easy', 'ingredient list': [u'30 pitted prunes (about 8 ounces)', u'1 cup tawny port', u'1 tablespoon granulated sugar', u'10 thin slices smoky bacon, cut crosswise into thirds', u'30 toothpicks, soaked in water for at least 15 minutes'], 'preperation steps': [u'1Heat the oven to 500\xb0F and arrange a rack in the middle.', u'2Combine the prunes, port, and sugar in a medium saucepan. Bring to a boil over high heat, stirring until the sugar has dissolved. Reduce the heat to medium low and simmer, stirring occasionally, until the port thickens into a loose syrup, about 15 minutes. 
Let cool slightly, for about 10 minutes, then drain the prunes of excess syrup or reserve syrup for other uses (for example, serve with seared meats or drizzle into a glass of seltzer or Champagne).', u'3Line a rimmed baking sheet with aluminum foil, then set a cooling rack over the foil.', u'4Wrap a piece of bacon around each prune and secure with a toothpick. Place the prunes at least 1 inch apart on the cooling rack. Bake for 7 minutes, then flip the prunes with tongs and continue to bake until crispy, about 7 to 9 minutes more. Transfer to a paper-towel-lined plate to drain. Cool slightly before serving.'], 'chef': u'Jill Santopietro', 'r_link': 'http://www.chowhound.com/recipes/devils-on-horseback-29172', 'servings': u'30 pieces', 'recipe title': u'Devils on Horseback', 'rating count': u'7', 'image_source': 'http://search.chow.com/thumbnail/320/0/www.chowstatic.com/assets/recipe_photos/29172_devils_horseback.jpg'}