# Web Scraping: Yummly - Getting All the Features

Scraping Yummly recipe data using the `requests` and `BeautifulSoup` packages.

## Importing the Libraries

In [1]:
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [5]:
# Errors that signal a missing or malformed optional element in the parsed
# page. Only these are swallowed by the best-effort sections below; anything
# else (e.g. KeyboardInterrupt) propagates to the caller.
_SCRAPE_ERRORS = (AttributeError, IndexError, KeyError, TypeError, ValueError)


def _span_int(row, attrs, first_token_only=False):
    """Return the first <span> matching *attrs* in *row* as an int (0 if none)."""
    spans = row.find_all('span', attrs)
    if not spans:
        return 0
    text = spans[0].text
    if first_token_only:
        # Only the text before the first space is converted.
        text = text.split(' ')[0]
    return int(text)


def _nutrition_value(labels, values, label):
    """Return the value at the position of *label* in *labels*, or '' if absent."""
    try:
        return values[labels.index(label)]
    except (ValueError, IndexError):
        return ''


def recipe_details(recipe_url, timeout=30):
    """Scrape one Yummly recipe page and return its features as a flat list.

    Args:
        recipe_url: Absolute URL of the recipe page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A 17-item list, in this order: recipe name, list of unique ingredient
        names (in page order), number of unique ingredients, dict mapping
        ingredient name to an "<amount> <unit>" string, cooking time value,
        cooking time unit, calories, sodium, fat, protein, carbs, fiber,
        servings, star rating, number of reviews, course, and the recipe URL.
        Optional fields that cannot be found are returned as '' (the number
        of reviews falls back to 0).

    Raises:
        IndexError: If the page has no recipe title or an ingredient row has
            no ingredient-name span.
        ValueError: If an ingredient quantity span is not an integer string.
        Exceptions raised by ``requests.get`` (connection errors, timeouts)
        are not caught here.
    """
    # Download the page; without a timeout a stalled server would block forever.
    recipe_request = requests.get(recipe_url, timeout=timeout)

    # Parse the response text into a tag tree we can search.
    recipe_soup = BeautifulSoup(recipe_request.text, 'html.parser')

    # Recipe name (mandatory: a page without it is treated as a failure).
    Recipe_Name = recipe_soup.find_all(
        "h1", {'class': "recipe-title font-bold h2-text primary-dark"})[0].text

    # Ingredient rows: one <li class="IngredientLine"> per ingredient.
    ingredient_info = recipe_soup.find_all("li", {'class': 'IngredientLine'})
    row_names = []
    row_amounts = []
    for row in ingredient_info:
        row_names.append(
            row.find_all('span', {'class': 'ingredient'})[0].text.rstrip())

        # Quantity = whole part + numerator/denominator (when a fraction exists).
        whole = _span_int(row, {'data-singular': "false"}, first_token_only=True)
        numerator = _span_int(row, {'class': "numerator"})
        denominator = _span_int(row, {'class': "denominator"})
        if denominator != 0:
            amount = round(whole + (numerator / denominator), 2)
        else:
            amount = whole
        # A quantity of 0 (nothing found) is reported as 1.
        if amount == 0:
            amount = 1

        unit_spans = row.find_all('span', {'class': "unit"})
        unit_text = unit_spans[0].text if unit_spans else ' '
        row_amounts.append(str(amount) + ' ' + unit_text)

    # Pair names and amounts while both are still in page-row order, so each
    # ingredient keeps its own quantity. A name that appears on several rows
    # keeps the amount of its last row.
    Amounts = dict(zip(row_names, row_amounts))

    # De-duplicate names while preserving first-seen order.
    ingredients = list(dict.fromkeys(row_names))
    Number_of_ingredients = len(ingredients)

    summary_items = recipe_soup.find_all("div", {'class': 'recipe-summary-item'})

    # Cooking time: second summary item.
    try:
        time_item = summary_items[1]
        Time_unit = time_item.find_all(
            'span', {'class': 'unit font-normal p3-text'})[0].text.rstrip()
        Time_value = time_item.find_all(
            'span', {'class': 'value font-light h2-text'})[0].text
    except _SCRAPE_ERRORS:
        Time_unit = ''
        Time_value = ''

    # Calories: third summary item.
    try:
        Calories_value = summary_items[2].find_all(
            'span', {'class': 'value font-light h2-text'})[0].text
    except _SCRAPE_ERRORS:
        Calories_value = ''

    # Nutrition information: labels and values are parallel lists taken from
    # the first nutrition block; both stay empty when the block is missing.
    nutrition_blocks = recipe_soup.find_all("div", {'class': 'recipe-nutrition'})
    if nutrition_blocks:
        nutrition_block = nutrition_blocks[0]
        nutrition_labels = [
            span.text for span in nutrition_block.find_all(
                'span', {'class': 'label font-bold micro-caps'})]
        nutrition_values = [
            span.text for span in nutrition_block.find_all(
                'span', {'class': "raw-value micro-text"})]
    else:
        nutrition_labels = []
        nutrition_values = []

    Sodium = _nutrition_value(nutrition_labels, nutrition_values, 'Sodium')
    Fat = _nutrition_value(nutrition_labels, nutrition_values, 'Fat')
    Protein = _nutrition_value(nutrition_labels, nutrition_values, 'Protein')
    Carbs = _nutrition_value(nutrition_labels, nutrition_values, 'Carbs')
    Fiber = _nutrition_value(nutrition_labels, nutrition_values, 'Fiber')

    # Servings: the 'value' attribute of the servings <input>.
    try:
        servings_labels = recipe_soup.find_all(
            'label', {'class': 'micro-caps greyscale-1'})
        servings = int(servings_labels[0].find_all(
            'input', {'class': 'font-bold greyscale-1'})[0]['value'])
    except _SCRAPE_ERRORS:
        servings = ''

    # Star rating (full stars + 0.5 per half star) and number of reviews.
    try:
        rating_link = recipe_soup.find_all(
            'a', {'class': 'recipe-details-rating p2-text primary-orange'})[0]
        full_stars = rating_link.find_all('span', {'class': "icon full-star y-icon"})
        half_stars = rating_link.find_all('span', {'class': "icon half-star y-icon"})
        star_rating = len(full_stars) + 0.5 * len(half_stars)
        review_spans = rating_link.find_all(
            'span', {'class': "count font-bold micro-text"})
        # The count is rendered in parentheses, e.g. "(12)".
        num_reviews = int(review_spans[0].text.replace('(', '').replace(')', ''))
    except _SCRAPE_ERRORS:
        num_reviews = 0
        star_rating = ''

    # Course: 'title' attribute of the first recipe tag.
    try:
        course_tags = recipe_soup.find_all(
            'li', {'class': 'recipe-tag micro-text font-bold'})
        # NOTE: str.strip() removes any of the characters "Course: " from BOTH
        # ends, not just the prefix, so e.g. 'Course: Beverages' becomes
        # 'Beverag'. This is kept on purpose: the course filter applied before
        # the CSV export lists these truncated forms ('Beverag', 'Appetiz').
        # Change both places together if this is ever corrected.
        course = course_tags[0]['title'].strip('Course: ')
    except _SCRAPE_ERRORS:
        course = ''

    # The last item is the page URL itself (no directions text is scraped).
    rec_url = recipe_url
    return [Recipe_Name, ingredients, Number_of_ingredients, Amounts, Time_value, Time_unit, Calories_value,
            Sodium, Fat, Protein, Carbs, Fiber, servings, star_rating, num_reviews, course, rec_url]
    
    
    





## Putting everything together

In [6]:
# Output schema, one entry per item of the list returned by recipe_details(),
# in the same order. Note that the final item returned there is the recipe URL,
# which therefore lands in the 'Directions' column.
column_names = [
    'Recipe_Name',
    'Ingredients',
    'Number_of_Ingredients',
    'Amounts',
    'Cooking_Time',
    'Cooking_Time_Unit',
    'Calories',
    'Sodium',
    'Fat',
    'Protein',
    'Carbs',
    'Fiber',
    'Servings',
    'Star_Rating',
    'Number_of_Reviews',
    'Course',
    'Directions',
]

# Start from an empty frame that only carries the column layout.
df = pd.DataFrame(columns=column_names)

In [7]:
# Load the array of recipe URL paths from 'Recipes.npy'. The entries are
# site-relative: the scraping loop prepends 'http://yummly.com' to each one.
recipe_urls = np.load('Recipes.npy')
# Number of paths loaded (displayed as the cell output).
len(recipe_urls)

1033

In [8]:
# Scrape every recipe page and collect one row per successful page. The
# DataFrame is built once after the loop: calling pd.concat on every iteration
# re-copies all previous rows each time and concatenates with an empty frame.
scraped_rows = []
for recipe_url in recipe_urls:
    url = 'http://yummly.com' + recipe_url
    try:
        scraped_rows.append(recipe_details(url))
    except Exception:
        # Best effort: skip pages that cannot be fetched or parsed and report
        # them. The message is also printed for pages that exist but fail to
        # parse. `except Exception` (rather than a bare except) keeps
        # Ctrl-C / KeyboardInterrupt able to stop a long run.
        print('This URL Does Not Exist', url)

df = pd.DataFrame(scraped_rows, columns=column_names)


This URL Does Not Exist http://yummly.com/recipes/salmon
This URL Does Not Exist http://yummly.com/recipes/midori-pineapple-juice
This URL Does Not Exist http://yummly.com/recipes/chicken-pasta
This URL Does Not Exist http://yummly.com/recipes/pasta
This URL Does Not Exist http://yummly.com/recipes/sandwiches
This URL Does Not Exist http://yummly.com/recipes/appetizers
This URL Does Not Exist http://yummly.com/recipes/pepperoncini-peppers
This URL Does Not Exist http://yummly.com/recipes/potato-soup-with-beef-broth
This URL Does Not Exist http://yummly.com/recipes/wild-rice-dried-cranberries-pine-nuts
This URL Does Not Exist http://yummly.com/recipe/Chicken-Soup-2346075
This URL Does Not Exist http://yummly.com/recipes/parsnips-turnips
This URL Does Not Exist http://yummly.com/recipes/ground-beef
This URL Does Not Exist http://yummly.com/recipe/The-Ultimate-Beef-Stew-2590056
This URL Does Not Exist http://yummly.com/recipe/Chicken-a-Jardineira-2569427
This URL Does Not Exist http://yum

970

In [None]:
# Course labels that are kept in the export. The spellings match what
# recipe_details() produces: its .strip('Course: ') call also trims trailing
# characters, hence 'Beverag' and 'Appetiz'.
acceptable_course_list = [
    'Main Dish',
    'Lunch',
    'Beverag',
    'Salad',
    'Appetiz',
    'Breakfast and Brunch',
    'Dessert',
    'Side Dish',
]

# Blank out every course value that is not one of the accepted labels.
df['Course'] = [
    label if label in acceptable_course_list else ''
    for label in df['Course']
]

# Write the final table to disk without the index column.
df.to_csv("all_recipes_new.csv", index=False)
print('Number of Recipes: ' , len(df))