# Web Scraping: Yummly

Scraping Yummly recipe data using the `requests` and `BeautifulSoup` packages.

## Importing the Libraries

In [1]:
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd


## Read the Webpage

Read the content of a web page directly into a Python object.
We use the `requests` library to fetch the page
and the response's `text` attribute to extract the HTML as a string.

In [2]:
recipe_url = "http://yummly.com/recipe/Hot-and-Sweet-Cubano-Sandwich-1313298"

# Upper bound (seconds) on how long to wait for the server. requests has no
# default timeout, so without this a stalled connection blocks the notebook forever.
REQUEST_TIMEOUT_SECONDS = 30

# make a GET request for the web page
recipe_request = requests.get(recipe_url, timeout=REQUEST_TIMEOUT_SECONDS)

# Fail here with a clear HTTPError on a 4xx/5xx response instead of parsing an
# error page and hitting confusing IndexErrors in the cells below.
recipe_request.raise_for_status()

# .text is the response body decoded to a str
recipe_html = recipe_request.text

## Parse with BeautifulSoup
We use `BeautifulSoup` to parse the HTML string into a tree of tags, then search that tree for the tags we're interested in.

In [3]:
# Build the searchable tag tree from the raw HTML using the built-in 'html.parser' backend
recipe_soup = BeautifulSoup(markup=recipe_html, features='html.parser')

## Getting the Name of Recipe

In [4]:
# The recipe name is the text of the first <h1> whose class attribute is this exact string
title_class = "recipe-title font-bold h2-text primary-dark"
title_tags = recipe_soup.find_all("h1", attrs={'class': title_class})
Recipe_Name = title_tags[0].text

print('Recipe Name: ', Recipe_Name)

Recipe Name:  Hot and Sweet Cubano Sandwich


## Getting the List of Ingredients 

In [5]:
# One <li class="IngredientLine"> per ingredient row; later cells reuse this list
ingredient_info = recipe_soup.find_all("li", attrs={'class': 'IngredientLine'})

# Ingredient name = text of the row's first "ingredient" span, trailing whitespace removed
ingredients = []
for ingredient_row in ingredient_info:
    name_spans = ingredient_row.find_all('span', attrs={'class': 'ingredient'})
    ingredients.append(name_spans[0].text.rstrip())

# Count distinct names, so an ingredient listed on several rows is counted once
unique_ingredients = set(ingredients)
Number_of_ingredients = len(unique_ingredients)

print('List of  Ingredients: ', unique_ingredients)
print('Number of  Ingredients: ', Number_of_ingredients)


List of  Ingredients:  {'honey glazed ham', 'salt', 'unsalted butter', 'green chile', 'pepper', 'swiss cheese', 'submarine rolls', 'chinese mustard', 'boneless pork loin roast', 'plum jelly'}
Number of  Ingredients:  10


## Getting the Ingredients' Amount and Unit

In [6]:
def _first_span_text(row, attrs, default=None):
    """Return the text of the first <span> inside `row` matching `attrs`, or `default` when none match."""
    matches = row.find_all('span', attrs)
    return matches[0].text if len(matches) > 0 else default


def _first_span_int(row, attrs, first_token_only=False):
    """Return the first matching <span>'s text as an int, or 0 when no span matches.

    With `first_token_only=True` only the part before the first space is parsed.
    Raises ValueError if the selected text is not an integer literal.
    """
    text = _first_span_text(row, attrs)
    if text is None:
        return 0
    if first_token_only:
        text = text.split(' ')[0]
    return int(text)


# Whole-number part of each amount: leading token of the data-singular="false" span
ingredient_amount_integer = [
    _first_span_int(row, {'data-singular': "false"}, first_token_only=True)
    for row in ingredient_info
]

# Fraction part, if any: numerator / denominator spans (0 when absent)
ingredient_amount_numerator = [_first_span_int(row, {'class': "numerator"}) for row in ingredient_info]
ingredient_amount_denominator = [_first_span_int(row, {'class': "denominator"}) for row in ingredient_info]

# Combine whole part and fraction, rounded to 2 decimals; a denominator of 0
# means "no fraction found", so only the whole part is used.
ingredient_amounts = []
for whole_part, num, den in zip(ingredient_amount_integer,
                                ingredient_amount_numerator,
                                ingredient_amount_denominator):
    if den != 0:
        amount = round(whole_part + (num / den), 2)
    else:
        amount = whole_part
    # An amount of 0 is reported as 1.
    # NOTE(review): presumably quantities of exactly one are not matched by the
    # data-singular="false" query above and end up as 0 - confirm against the page markup.
    ingredient_amounts.append(amount if amount != 0 else 1)

# Unit text per row; a single space stands in for rows without a unit span
ingredient_units = [_first_span_text(row, {'class': "unit"}, default=' ') for row in ingredient_info]

# Map ingredient name -> "<amount> <unit>". Rows sharing the same ingredient
# name collapse to one key (the later row wins).
Amounts = dict(zip(
    ingredients,
    [str(amount) + ' ' + unit for amount, unit in zip(ingredient_amounts, ingredient_units)],
))
print('Ingredient Amounts: ', Amounts)

Ingredient Amounts:  {'boneless pork loin roast': '2 pounds ', 'honey glazed ham': '1 pound ', 'salt': '1 teaspoon ', 'pepper': '1 teaspoon ', 'chinese mustard': '1 cup ', 'submarine rolls': '8 whole ', 'plum jelly': '0.75 cup ', 'swiss cheese': '8 slices ', 'green chile': '14 ounces ', 'unsalted butter': '4 tablespoons '}


## Getting the Cooking Time

In [7]:
# The second "recipe-summary-item" <div> is read as the cooking-time tile
summary_items = recipe_soup.find_all("div", attrs={'class': 'recipe-summary-item'})
Time = summary_items[1]

# Inside the tile: one span for the unit label, one for the numeric value (kept as text)
time_unit_spans = Time.find_all('span', attrs={'class': 'unit font-normal p3-text'})
Time_unit = time_unit_spans[0].text.rstrip()
time_value_spans = Time.find_all('span', attrs={'class': 'value font-light h2-text'})
Time_value = time_value_spans[0].text

print('Cooking time: ', Time_value, " ", Time_unit)

Cooking time:  105   Minutes


## Getting the Calories

In [8]:
# The third "recipe-summary-item" <div> is read as the calories tile. Not every
# page has it, so fall back to empty strings when the tile or its spans are missing.
try:
    calories = recipe_soup.find_all("div", {'class' : 'recipe-summary-item'})[2]
    unit = calories.find_all('span', {'class': 'unit font-normal p3-text'})[0].text.rstrip()
    Calories_value = calories.find_all('span', {'class': 'value font-light h2-text'})[0].text
except IndexError:
    # Only a missing tile/span is expected here; anything else (e.g. an
    # undefined recipe_soup) should surface instead of being silently swallowed.
    Calories_value = ''
    unit = ''

print('Calories : ',Calories_value, " ",unit)

Calories :     


## Getting the Nutrition Information

In [9]:
# All <div class="recipe-nutrition"> containers found on the page; the next cell reads the first one
Nutrition_info = recipe_soup.find_all("div", attrs={'class': 'recipe-nutrition'})

In [10]:
def _nutrition_value(label, labels, values):
    """Return the value paired with the first occurrence of `label`, or '' when it is absent."""
    try:
        return values[labels.index(label)]
    except (ValueError, IndexError):
        # ValueError: label not present; IndexError: fewer values than labels
        return ''


# Labels and raw values are parallel <span> lists inside the first nutrition container.
try:
    nutrition_panel = Nutrition_info[0]
except IndexError:
    # No nutrition container on this page. Always (re)assign nutrition_labels so
    # the lookups below never see an undefined name or labels left over from a
    # previously scraped recipe in the same kernel.
    nutrition_labels = []
    Nutrition_values = ['', '', '', '', '']
else:
    nutrition_labels = [
        span.text
        for span in nutrition_panel.find_all('span', {'class': 'label font-bold micro-caps'})
    ]
    Nutrition_values = [
        span.text
        for span in nutrition_panel.find_all('span', {'class': "raw-value micro-text"})
    ]

Sodium = _nutrition_value('Sodium', nutrition_labels, Nutrition_values)
Fat = _nutrition_value('Fat', nutrition_labels, Nutrition_values)
Protein = _nutrition_value('Protein', nutrition_labels, Nutrition_values)
Carbs = _nutrition_value('Carbs', nutrition_labels, Nutrition_values)
Fiber = _nutrition_value('Fiber', nutrition_labels, Nutrition_values)

if nutrition_labels:
    print('Nutrition: ', dict(zip(nutrition_labels, Nutrition_values)))
else:
    print('No Information on Nutrition Values' )



No Information on Nutrition Values


## Getting the Star Rating and Number of Reviews

In [11]:
# Star rating = full-star icons + 0.5 per half-star icon inside the first rating
# link; the review count is the number shown in parentheses next to the stars.
# When the page has no rating, fall back to '' stars and 0 reviews.
try:
    rating = recipe_soup.find_all('a', {'class': 'recipe-details-rating p2-text primary-orange'})
    rating_link = rating[0]  # IndexError when the page has no rating link
    full_stars = rating_link.find_all('span', {'class': "icon full-star y-icon"})
    half_stars = rating_link.find_all('span', {'class': "icon half-star y-icon"})
    star_rating = len(full_stars) + 0.5 * len(half_stars)
    review_count_text = rating_link.find_all('span', {'class': "count font-bold micro-text"})[0].text
    # Drop the surrounding parentheses, e.g. "(12)" -> 12
    num_reviews = int(review_count_text.replace('(', '').replace(')', ''))
except (IndexError, ValueError):
    # IndexError: rating link or count span missing; ValueError: count text is
    # not an integer. Other errors are real bugs and are no longer swallowed.
    num_reviews = 0
    star_rating = ''

print('star rating is',star_rating, ' with ',num_reviews, 'reviews' )


star rating is   with  0 reviews


## Getting the Servings

In [12]:
# Servings is the `value` attribute of the first matching <input> inside the first matching <label>
serving_labels = recipe_soup.find_all('label', attrs={'class': 'micro-caps greyscale-1'})
serving_inputs = serving_labels[0].find_all('input', attrs={'class': 'font-bold greyscale-1'})
servings = int(serving_inputs[0]['value'])

print('Servings: ', servings)


Servings:  8


## Getting the Course

In [13]:
# The course is taken from the `title` attribute of the first recipe tag,
# which looks like "Course: Main Dish".
course_tags = recipe_soup.find_all('li', {'class' : 'recipe-tag micro-text font-bold'})
course_title = course_tags[0]['title']

# Remove the literal "Course: " prefix. str.strip('Course: ') must not be used
# here: it strips any of those *characters* from both ends, so
# "Course: Desserts" would become "Dessert" and "Course: Appetizers" "Appetiz".
course_prefix = 'Course: '
if course_title.startswith(course_prefix):
    course_title = course_title[len(course_prefix):]
course = course_title.strip()

print('Course: ', course)


Course:  Main Dish


## Getting the url

In [14]:
# Alias the address of the scraped page under the name used for the URL column below
rec_url = recipe_url
print(rec_url)

http://yummly.com/recipe/Hot-and-Sweet-Cubano-Sandwich-1313298


## Putting everything together

In [15]:
import pandas as pd

In [16]:
# Column headers for the single-row recipe table; the order must match the
# order in which the scraped values are supplied when the frame is built.
column_names = [
    'Recipe_Name',
    'Ingredients',
    'Number_of_Ingredients',
    'Amounts',
    'Cooking_Time',
    'Cooking_Time_Unit',
    'Calories',
    'Sodium',
    'Fat',
    'Protein',
    'Carbs',
    'Fiber',
    'Servings',
    'Star_Rating',
    'Number_of_Reviews',
    'Course',
    'URL',
]


In [17]:
# Gather every scraped field into one record, in the same order as column_names
recipe_row = [
    Recipe_Name, ingredients, Number_of_ingredients, Amounts,
    Time_value, Time_unit, Calories_value,
    Sodium, Fat, Protein, Carbs, Fiber,
    servings, star_rating, num_reviews, course, rec_url,
]

# One recipe -> a one-row DataFrame; the trailing expression renders it
df = pd.DataFrame(data=[recipe_row], columns=column_names)
df.head()

Unnamed: 0,Recipe_Name,Ingredients,Number_of_Ingredients,Amounts,Cooking_Time,Cooking_Time_Unit,Calories,Sodium,Fat,Protein,Carbs,Fiber,Servings,Star_Rating,Number_of_Reviews,Course,URL
0,Hot and Sweet Cubano Sandwich,"[boneless pork loin roast, honey glazed ham, s...",10,"{'boneless pork loin roast': '2 pounds ', 'hon...",105,Minutes,,,,,,,8,,0,Main Dish,http://yummly.com/recipe/Hot-and-Sweet-Cubano-...
