In [86]:
import time
import random
import requests
import selenium
from string import ascii_lowercase
from selenium.webdriver import Firefox, Chrome
from bs4 import BeautifulSoup
import pandas as pd
import string
import numpy as np
from selenium.webdriver.common.keys import Keys

In [182]:
# Start a Chrome WebDriver session; the scraping helpers in this notebook
# all receive this object as their `browser` argument.
browser = Chrome()

In [314]:
# Point the shared browser session at the recipe page that the demo calls
# in the following cells are run against.
url='https://www.allrecipes.com/recipe/141833/thai-green-curry-chicken/'
browser.get(url)

In [84]:
# Fixed two-second pause. NOTE(review): presumably gives the page time to
# finish loading before it is scraped - confirm.
time.sleep(2)

In [307]:
def _get_ingredients(browser):
    all_items = []
    count = 1
    while True:
        try:
            sel = 'ul#lst_ingredients_{0}'.format(count)
            ing_list = browser.find_element_by_css_selector(sel)
            all_items += ing_list.text.split('\n')
            count += 1
        except:
            break
    ingredients = []
    for item in all_items:
        if item[0] in string.digits:
            ingredients.append(item)
    return ingredients

In [302]:
# Scrape the ingredient lines from the page currently loaded in `browser`.
# The helper defined in this notebook is `_get_ingredients(browser)`; the
# earlier zero-argument `get_ingredients()` spelling no longer exists.
ingredients = _get_ingredients(browser)
ingredients

['1 pound bulk Italian sausage',
 '4 cups half-and-half',
 '3 cups cubed potatoes',
 '2 cups low-sodium chicken broth',
 '2 cups whole milk',
 '1 onion, chopped',
 '1/2 teaspoon dried oregano',
 '1/2 teaspoon red pepper flakes, or more to taste',
 '1/2 teaspoon ground black pepper',
 '2 cups torn kale leaves (bite-size pieces)']

In [261]:
def _determine_quantity(item):
    quantity = 0
    for i, elem in enumerate(item.split()):
        if elem[0] in string.digits:
            try:
                quantity += float(elem)
            except:
                numer, denom = elem.split('/')
                quantity += float(numer) / float(denom)
        else:
            idx = i
            break
    remainder = ' '.join(item.split()[idx:])
    return quantity, remainder

In [262]:
def _parse_special(item, flag_words):
    '''
    Parse an ingredient whose amount is "count x container size", e.g.
    '2 (14 ounce) cans coconut milk'.

    The text is cut at the first container word found in `flag_words`
    (surrounded by spaces) or, when no container word is present, at the
    first ')'.

    Args:
        item (str): full ingredient line.
        flag_words: container words to look for, checked in order
            (e.g. ['can', 'cans', 'package', ...]).

    Returns:
        tuple: (quantity, unit, remainder).
        * With a parenthesised size, quantity is count * size and unit is
          the text following the size inside the parentheses.
        * With a container word but no parenthesised size
          ('2 cans tomato sauce'), quantity is the count and unit is the
          container word itself.
        remainder is everything after the first separator.

    Raises:
        ValueError: if the separator does not occur in `item` (for instance
            a container word that is the last token of the line).
    '''
    tokens = item.split()
    # Determine the separator: the first matching container word, else ')'.
    flag_word = next((word for word in flag_words if word in tokens), None)
    sp_word = ')' if flag_word is None else ' ' + flag_word + ' '

    # Cut at the FIRST separator only, so a later ')' or a repeated container
    # word stays in the remainder instead of truncating it.
    count_and_size, found, remainder = item.partition(sp_word)
    if not found:
        raise ValueError(
            'cannot locate {0!r} in ingredient {1!r}'.format(sp_word, item))

    if flag_word is not None and '(' not in count_and_size:
        # No parenthesised size: count the containers themselves. The
        # container word is appended so the quantity parser always sees a
        # non-numeric token after the count.
        count, _ = _determine_quantity(count_and_size + ' ' + flag_word)
        return count, flag_word, remainder

    count, rest = _determine_quantity(count_and_size)
    if flag_word is None:
        # rest looks like '(3 pound' - the ')' was consumed by the cut.
        size, unit = _determine_quantity(rest[1:])
    else:
        # rest looks like '(14 ounce)' - strip both parentheses.
        size, unit = _determine_quantity(rest[1:-1])
    return count * size, unit, remainder

In [263]:
def _remove_descriptors(item,
                        phrases=phrases,
                        stopwords=stopwords,
                        suffixes=suffixes):
    # Remove common/unnecessary ending phrases
    for phrase in phrases:
        if len(item.split(phrase)) > 1:
            item = item.split(phrase)[0]
    # Remove punctuation and stopwords
    words = []
    for elem in item.split():
        word = ''.join([letter for letter in elem.lower() if letter in string.ascii_lowercase])
        if word not in stopwords:
            words.append(word)
    # Remove adjectives and adverbs    
    for suffix in suffixes:
        for word in words.copy():
            try:
                if word[-len(suffix):] == suffix:
                    words.remove(word)
            except:
                continue
    return ' '.join(words)

In [264]:
def _singular(unit):
    '''Drop one trailing 's' from a unit label ('cups' -> 'cup').'''
    return unit[:-1] if unit.endswith('s') else unit


def parse_ingredients(ingredients, units=None, flag_words=None):
    '''
    Parses a list of ingredients into a list of dictionaries with the following format:
        {'quantity': (float),
         'units': (str),
         'ingredient': (str)}

    Args:
        ingredients: iterable of ingredient lines such as '2 cups whole milk'.
            Blank lines are skipped.
        units: accepted unit words (e.g., ['cups', 'tablespoon']); defaults to
            the module-level `units`. If an ingredient does not specify a unit
            in this list, the label 'each' is applied.
        flag_words: container words (e.g., ['can', 'jar']) that route a line
            through `_parse_special`; defaults to the module-level
            `flag_words`. A line whose second token starts with '(' is routed
            the same way.

    Returns:
        list of dict: one dictionary per non-blank input line, in input order.
        Unit labels have one trailing 's' removed.
    '''
    # Resolve defaults at call time: binding the module-level lists in the
    # signature raises NameError when this cell runs before they are defined.
    if units is None:
        units = globals()['units']
    if flag_words is None:
        flag_words = globals()['flag_words']

    ing_list = []
    for item in ingredients:
        tokens = item.split()
        if not tokens:
            continue
        # Check item for flag words (require special parsing treatment).
        # The length check keeps a one-token line from raising IndexError.
        flag = (any(token in flag_words for token in tokens)
                or (len(tokens) > 1 and tokens[1].startswith('(')))
        item_dict = {}
        # Parse quantities and units
        if flag:
            quantity, unit, remainder = _parse_special(item, flag_words)
            item_dict['quantity'] = quantity
            # An empty unit falls back to 'each' instead of failing on unit[-1].
            item_dict['units'] = _singular(unit) or 'each'
        else:
            quantity, remainder = _determine_quantity(item)
            item_dict['quantity'] = quantity
            rest = remainder.split()
            if rest and rest[0] in units:
                item_dict['units'] = _singular(rest[0])
                remainder = ' '.join(rest[1:])
            else:
                item_dict['units'] = 'each'
        # Remove preparation instructions from remaining text to isolate ingredient
        item_dict['ingredient'] = _remove_descriptors(remainder)
        # Add item dictionary to list
        ing_list.append(item_dict)
    return ing_list

In [304]:
# Unit words recognised immediately after the leading quantity.
units = [
    'pound', 'pounds',
    'cup', 'cups',
    'tablespoon', 'tablespoons',
    'teaspoon', 'teaspoons',
    'clove', 'cloves',
    'stalk', 'stalks',
    'ounce', 'ounces', 'oz.',
    'cubes',
    'pint', 'pints',
    'quart', 'quarts',
]

# Substrings at which trailing preparation notes are cut off.
phrases = [
    ' - ',
    ', or',
    ', for garnish',
    ', cut',
]

# Words discarded outright when isolating the ingredient name.
stopwords = [
    'and', 'into', 'very',
    'hot', 'cold', 'fresh',
    'large', 'medium', 'small',
    'halves', 'torn',
]

# Word endings treated as adjective/adverb markers.
suffixes = [
    'ed',
    'less',
    'ly',
]

# Container words that send an ingredient line through the special parser.
flag_words = [
    'can', 'cans',
    'package', 'packages',
    'jar', 'jars',
    'container', 'containers',
]

In [303]:
pd.DataFrame(parse_ingredients(ingredients))

Unnamed: 0,ingredient,quantity,units
0,bulk italian sausage,1.0,pound
1,halfandhalf,4.0,cup
2,potatoes,3.0,cup
3,lowsodium chicken broth,2.0,cup
4,whole milk,2.0,cup
5,onion,1.0,each
6,oregano,0.5,teaspoon
7,pepper flakes,0.5,teaspoon
8,ground black pepper,0.5,teaspoon
9,torn kale leaves bitesize pieces,2.0,cup


In [297]:
def _get_id(browser):
    id_and_name = browser.current_url.split('recipe/')[1]
    return id_and_name.split('/')[0]

In [273]:
def _get_name(browser):
    sel = 'h1#recipe-main-content'
    name = browser.find_element_by_css_selector(sel)
    return name.text

In [275]:
_get_name(browser)

"Catherine's Spicy Chicken Soup"

In [285]:
def _get_rating_info(browser):
    rating_info = {}
    sel = 'div.rating-stars'
    rating = browser.find_element_by_css_selector(sel)
    rating_info['rating'] = float(rating.get_attribute('data-ratingstars'))
    sel = 'div.summary-stats-box a.read--reviews'
    reviews = browser.find_element_by_css_selector(sel).text.split()
    try:
        n_made = int(reviews[0])
    except:
        n_made = int(reviews[0][:-1]) * 1000
    try:
        n_reviews = int(reviews[4])
    except:
        n_reviews = int(reviews[4][:-1]) * 1000    
    rating_info['made_by'] = n_made
    rating_info['reviews'] = n_reviews
    return rating_info

In [284]:
# Inspect the raw tokens of the "made it | reviews" summary link; these are
# the tokens that _get_rating_info indexes by position.
sel = 'div.summary-stats-box a.read--reviews'
reviews = browser.find_element_by_css_selector(sel)
reviews.text.split()

['2k', 'made', 'it', '|', '1k', 'reviews']

In [286]:
_get_rating_info(browser)

{'rating': 4.74810457229614, 'made_by': 2000, 'reviews': 1000}

In [305]:
def _get_categories(browser):
    sel = 'ol.breadcrumbs li'
    categories = browser.find_elements_by_css_selector(sel)
    cat_list = [category.text for category in categories]
    cat_dict = {}
    cat_dict['lvl_1'] = cat_list[2]
    try:
        cat_dict['lvl_2'] = cat_list[3]
    except:
        cat_dict['lvl_2'] = None
    try:
        cat_dict['lvl_3'] = cat_list[4]
    except:
        cat_dict['lvl_3'] = None
    return cat_dict

In [290]:
_get_categories(browser)

{'cat_1': 'Soups, Stews and Chili', 'cat_2': 'Soup', 'cat_3': 'Chicken Soup'}

In [310]:
def _get_submitter_info(browser):
    submitter_info = {}
    sel = 'div.summary-background div.submitter'
    submitter = browser.find_element_by_css_selector(sel)
    followers = submitter.find_element_by_css_selector('div.submitter__img span').text
    name = submitter.find_element_by_css_selector('p span.submitter__name').text
    href = (submitter.find_element_by_css_selector('div.submitter__img a')
                     .get_attribute('href'))
    id_num = href.split('/')[-2]
    submitter_info['id'] = int(id_num)
    submitter_info['name'] = name
    submitter_info['followers'] = int(followers)
    submitter_info['href'] = href
    return submitter_info

In [281]:
_get_submitter_info(browser)

{'href': 'https://www.allrecipes.com/cook/177251/',
 'id': 177251,
 'name': 'AUNTTAF',
 'followers': 14}

In [338]:
def get_recipe_info(browser):
    '''
    Assemble everything scraped from the recipe page loaded in `browser`.

    Returns:
        dict with the keys 'id', 'name', 'href' (current URL without its
        query string), 'category', 'rating_info', 'submitter_info',
        'ingredients' (parsed ingredient dictionaries) and 'directions'.
    '''
    # Entries are evaluated top to bottom, so the helpers run in the same
    # order as the keys are listed.
    return {
        'id': _get_id(browser),
        'name': _get_name(browser),
        'href': browser.current_url.split('?')[0],
        'category': _get_categories(browser),
        'rating_info': _get_rating_info(browser),
        'submitter_info': _get_submitter_info(browser),
        'ingredients': parse_ingredients(_get_ingredients(browser)),
        'directions': _get_directions(browser),
    }

In [339]:
get_recipe_info(browser)

{'id': '141833',
 'name': 'Thai Green Curry Chicken',
 'href': 'https://www.allrecipes.com/recipe/141833/thai-green-curry-chicken/',
 'category': {'lvl_1': 'World Cuisine', 'lvl_2': 'Asian', 'lvl_3': 'Thai'},
 'rating_info': {'rating': 4.53484010696411, 'made_by': 813, 'reviews': 378},
 'submitter_info': {'id': 2370128,
  'name': 'laus',
  'followers': 0,
  'href': 'https://www.allrecipes.com/cook/2370128/'},
 'ingredients': [{'quantity': 1.0,
   'units': 'pound',
   'ingredient': 'chicken breast'},
  {'quantity': 1.0, 'units': 'tablespoon', 'ingredient': 'dark soy sauce'},
  {'quantity': 1.0, 'units': 'tablespoon', 'ingredient': 'allpurpose flour'},
  {'quantity': 2.0, 'units': 'tablespoon', 'ingredient': 'cooking oil'},
  {'quantity': 2.0, 'units': 'tablespoon', 'ingredient': 'green curry paste'},
  {'quantity': 2.0, 'units': 'each', 'ingredient': 'green onions with tops'},
  {'quantity': 3.0, 'units': 'clove', 'ingredient': 'garlic'},
  {'quantity': 1.0, 'units': 'teaspoon', 'ingred

In [335]:
def _get_directions(browser):
    '''
    Collect the timing summary and the non-empty direction steps.

    Args:
        browser: object exposing `find_elements_by_css_selector(selector)`.

    Returns:
        dict: {'timing': <result of _get_timing(browser)>,
               'steps': [text of each `li.step` that has text]}.
    '''
    directions = {}
    directions['timing'] = _get_timing(browser)
    steps = browser.find_elements_by_css_selector(
        'div.directions--section li.step')
    # Read each step's text once: every `.text` access on a WebElement is a
    # separate request to the driver, so filtering and collecting from the
    # same read halves the calls and cannot observe two different values.
    step_texts = (step.text for step in steps)
    directions['steps'] = [text for text in step_texts if text]
    return directions

In [336]:
_get_directions(browser)

{'timing': {'prep': 20, 'cook': 40, 'ready_in': 60},
 'steps': ['Toss chicken first in 1 tablespoon dark soy sauce, then in the flour, coating pieces evenly. Heat the oil in a large skillet over medium high heat. Place chicken in the skillet, cook and stir chicken until browned, about 5 minutes. Remove chicken.',
  'Reduce heat to medium and stir in curry paste. Cook for 1 minute until fragrant, then stir in green onions, garlic, and ginger; cook an additional 2 minutes. Return chicken to the skillet, stirring to coat with the curry mixture. Stir the coconut milk, fish sauce, 1 tablespoon soy sauce, and sugar into the chicken-curry mixture. Allow to simmer over medium heat for 20 minutes until the chicken is tender. Serve garnished with cilantro leaves.']}

In [330]:
def _get_timing(browser):
    timing = {}
    sel = 'div.directions--section ul.prepTime li.prepTime__item'
    timing_list = browser.find_elements_by_css_selector(sel)
    prep_time = timing_list[1].get_attribute('aria-label')
    num, unit = prep_time.split(': ')[1].split()
    timing['prep'] = int(num) if unit == 'Minutes' else 60 * int(num)
    cook_time = timing_list[2].get_attribute('aria-label')
    num, unit = cook_time.split(': ')[1].split()
    timing['cook'] = int(num) if unit == 'Minutes' else 60 * int(num)
    total_time = timing_list[3].get_attribute('aria-label')
    num, unit = total_time.split('Ready in ')[1].split()
    timing['ready_in'] = int(num) if unit == 'Minutes' else 60 * int(num)
    return timing

In [337]:
_get_timing(browser)

{'prep': 20, 'cook': 40, 'ready_in': 60}