In [86]:
import time
import random
import requests
import selenium
from string import ascii_lowercase
from selenium.webdriver import Firefox, Chrome
from bs4 import BeautifulSoup
import pandas as pd
import string
import numpy as np
from selenium.webdriver.common.keys import Keys

In [182]:
# Start a Selenium-driven Chrome session. The scraping helpers in this notebook
# read from this driver, either as an argument or as the module-level `browser`.
# NOTE(review): no browser.quit() is visible in this part of the notebook.
browser = Chrome()

In [183]:
# Recipe page to scrape; it is loaded into the shared `browser` session.
# NOTE(review): the outputs recorded further down name a different recipe
# ("Catherine's Spicy Chicken Soup"), so they appear to come from another
# page load -- re-run the notebook top to bottom to confirm.
url='https://www.allrecipes.com/recipe/141833/thai-green-curry-chicken/'
browser.get(url)

In [84]:
# Fixed two-second pause; presumably gives the page time to finish rendering
# before the scraping cells run -- TODO confirm.
time.sleep(2)

In [90]:
def get_ingredients(driver=None):
    '''
    Scrapes the quantified ingredient lines from the recipe page loaded in a browser.

    Ingredients are read from consecutively numbered lists
    ('ul#lst_ingredients_1', 'ul#lst_ingredients_2', ...), in order, until the
    first number that cannot be looked up.

    Parameters:
        driver: Selenium WebDriver to read from. When omitted, the notebook-level
                `browser` session is used.

    Returns:
        list of str: the lines whose first character is an ASCII digit, in page
        order. Blank lines and lines without a leading quantity are dropped.
    '''
    if driver is None:
        driver = browser  # fall back to the shared notebook session
    all_items = []
    count = 1
    while True:
        sel = 'ul#lst_ingredients_{0}'.format(count)
        try:
            ing_list = driver.find_element_by_css_selector(sel)
            lines = ing_list.text.split('\n')
        except Exception:
            # A failed lookup marks the end of the numbered lists. Catching
            # Exception instead of using a bare except keeps KeyboardInterrupt
            # and SystemExit from being swallowed.
            break
        all_items += lines
        count += 1
    # Blank lines are skipped explicitly: indexing item[0] on '' raises IndexError.
    return [item for item in all_items if item and item[0] in string.digits]

In [240]:
# Scrape the ingredient lines from the page currently loaded in `browser`
# and display them.
ingredients = get_ingredients()
ingredients

['2 quarts water',
 '8 skinless, boneless chicken breast halves',
 '1/2 teaspoon salt',
 '1 teaspoon ground black pepper',
 '1 teaspoon garlic powder',
 '2 tablespoons dried parsley',
 '1 tablespoon onion powder',
 '5 cubes chicken bouillon',
 '3 tablespoons olive oil',
 '1 onion, chopped',
 '3 cloves garlic, chopped',
 '1 (16 ounce) jar chunky salsa',
 '2 (14.5 ounce) cans peeled and diced tomatoes',
 '1 (14.5 ounce) can whole peeled tomatoes',
 '1 (10.75 ounce) can condensed tomato soup',
 '3 tablespoons chili powder',
 '1 (15 ounce) can whole kernel corn, drained',
 '2 (16 ounce) cans chili beans, undrained',
 '1 (8 ounce) container sour cream']

In [261]:
def _determine_quantity(item):
    quantity = 0
    for i, elem in enumerate(item.split()):
        if elem[0] in string.digits:
            try:
                quantity += float(elem)
            except:
                numer, denom = elem.split('/')
                quantity += float(numer) / float(denom)
        else:
            idx = i
            break
    remainder = ' '.join(item.split()[idx:])
    return quantity, remainder

In [262]:
def _parse_special(item, flag_words):
    '''
    Parses an ingredient whose amount is a count of sized containers,
    e.g. '2 (14.5 ounce) cans peeled and diced tomatoes'.

    The text is split at the first flag word found in the item (or at ')' when
    no flag word is present). The part before the split supplies the container
    count and the parenthesised size; the part after it is the ingredient text.

    Parameters:
        item (str): ingredient text.
        flag_words (list of str): container words such as 'can' or 'jar'.

    Returns:
        tuple: (quantity, unit, remainder). quantity is count * size and unit is
        the size unit ('2 (14.5 ounce) cans ...' -> 29.0, 'ounce'). When a flag
        word is present but no '(size unit)' group precedes it ('2 cans corn'),
        quantity is the count and unit is the flag word itself.

    Raises:
        IndexError: if the separator does not occur in the item (for example a
        flag word that is the last word of the text).
    '''
    # Determine special word
    tokens = item.split()
    flag_word = next((word for word in flag_words if word in tokens), None)
    sp_word = ')' if flag_word is None else ' ' + flag_word + ' '

    # Split once only, so text after a second occurrence of the separator
    # (e.g. a later parenthetical) stays in the remainder instead of being lost.
    parts = item.split(sp_word, 1)
    count_and_size = parts[0]
    remainder = parts[1]

    if flag_word is not None and '(' not in count_and_size:
        # No parenthesised size to multiply by: count the containers themselves.
        count, _ = _determine_quantity(item)
        return count, flag_word, remainder

    count, rest = _determine_quantity(count_and_size)
    if sp_word == ')':
        # rest looks like '(16 ounce': drop the opening parenthesis only.
        size, unit = _determine_quantity(rest[1:])
    else:
        # rest looks like '(16 ounce)': drop both parentheses.
        size, unit = _determine_quantity(rest[1:-1])
    quantity = count * size
    return quantity, unit, remainder

In [263]:
def _remove_descriptors(item,
                        phrases=phrases,
                        stopwords=stopwords,
                        suffixes=suffixes):
    # Remove common/unnecessary ending phrases
    for phrase in phrases:
        if len(item.split(phrase)) > 1:
            item = item.split(phrase)[0]
    # Remove punctuation and stopwords
    words = []
    for elem in item.split():
        word = ''.join([letter for letter in elem.lower() if letter in string.ascii_lowercase])
        if word not in stopwords:
            words.append(word)
    # Remove adjectives and adverbs    
    for suffix in suffixes:
        for word in words.copy():
            try:
                if word[-len(suffix):] == suffix:
                    words.remove(word)
            except:
                continue
    return ' '.join(words)

In [264]:
def parse_ingredients(ingredients, units=None, flag_words=None):
    '''
    Parses a list of ingredients into a list of dictionaries with the following format:
        {'quantity': (float),
         'units': (str),
         'ingredient': (str)}

    Parameters:
        ingredients (list of str): lines such as '1 (16 ounce) jar chunky salsa'.
        units (list of str): accepted units (e.g., ['cups', 'tablespoon']). If an
            ingredient does not specify a unit in this list, the label 'each' is
            applied.
        flag_words (list of str): container words (e.g., ['can', 'jar']). Items
            containing one of them, or whose second token opens a parenthesis,
            are handled by _parse_special.
        When `units` or `flag_words` is omitted, the module-level list of the
        same name is looked up at call time, so this function can be defined
        before the cell that creates those lists.

    Returns:
        list of dict: one record per non-blank ingredient, in input order. A
        trailing 's' is stripped from the unit ('cups' -> 'cup').

    Raises:
        KeyError: if a table is omitted and the module-level list is not defined.
    '''
    if units is None:
        units = globals()['units']
    if flag_words is None:
        flag_words = globals()['flag_words']

    ing_list = []
    for item in ingredients:
        tokens = item.split()
        if not tokens:
            # Blank line: nothing to parse.
            continue
        item_dict = {}
        # Check item for flag words (require special parsing treatment)
        flag = any(word in flag_words for word in tokens)
        # Guard the length first: one-word items have no second token.
        if len(tokens) > 1 and tokens[1][0] == '(':
            flag = True
        # Parse quantities and units
        if flag:
            quantity, unit, remainder = _parse_special(item, flag_words)
        else:
            quantity, remainder = _determine_quantity(item)
            rest = remainder.split()
            # rest is empty when the item is only a number.
            if rest and rest[0] in units:
                unit = rest[0]
                remainder = ' '.join(rest[1:])
            else:
                unit = 'each'
        item_dict['quantity'] = quantity
        # endswith is safe on an empty unit, unlike unit[-1].
        item_dict['units'] = unit[:-1] if unit.endswith('s') else unit
        # Remove preparation instructions from remaining text to isolate ingredient
        item_dict['ingredient'] = _remove_descriptors(remainder)
        # Add item dictionary to list
        ing_list.append(item_dict)
    return ing_list

In [260]:
# Lookup tables for the ingredient-parsing helpers.
# NOTE(review): `_remove_descriptors` and `parse_ingredients` use these names as
# their defaults, yet this cell sits below them in the notebook -- make sure it
# has been run before those helpers are defined or called.

# Accepted measurement units, singular and plural. 'cube' is listed alongside
# 'cubes' so that '1 cube chicken bouillon' is recognised like the plural form.
units = ['pound', 'pounds', 'cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons',
         'clove', 'cloves', 'stalk', 'stalks', 'ounce', 'ounces', 'oz.', 'cube', 'cubes',
         'pint', 'pints', 'quart', 'quarts']
# Text from the first occurrence of any of these phrases onward is discarded.
phrases = [' - ',', or', ', for garnish', ', cut']
# Words removed from the ingredient name outright.
stopwords = ['and', 'into', 'very', 'hot', 'cold', 'fresh', 'large', 'medium', 'small', 'halves']
# Words ending in one of these are removed from the ingredient name.
suffixes = ['ed','less','ly']
# Container words that route an item to _parse_special.
flag_words = ['can', 'cans', 'package', 'packages', 'jar', 'jars', 'container', 'containers']

In [265]:
# Parse the scraped ingredient lines and display the records as a table.
pd.DataFrame(parse_ingredients(ingredients))

Unnamed: 0,ingredient,quantity,units
0,water,2.0,quart
1,chicken breast,8.0,each
2,salt,0.5,teaspoon
3,ground black pepper,1.0,teaspoon
4,garlic powder,1.0,teaspoon
5,parsley,2.0,tablespoon
6,onion powder,1.0,tablespoon
7,chicken bouillon,5.0,cube
8,olive oil,3.0,tablespoon
9,onion,1.0,each


In [273]:
def _get_name(browser):
    sel = 'h1#recipe-main-content'
    name = browser.find_element_by_css_selector(sel)
    return name.text

In [275]:
# Check the title scraper against the page currently loaded in `browser`.
_get_name(browser)

"Catherine's Spicy Chicken Soup"

In [285]:
def _get_rating_info(browser):
    rating_info = {}
    sel = 'div.rating-stars'
    rating = browser.find_element_by_css_selector(sel)
    rating_info['rating'] = float(rating.get_attribute('data-ratingstars'))
    sel = 'div.summary-stats-box a.read--reviews'
    reviews = browser.find_element_by_css_selector(sel).text.split()
    try:
        n_made = int(reviews[0])
    except:
        n_made = int(reviews[0][:-1]) * 1000
    try:
        n_reviews = int(reviews[4])
    except:
        n_reviews = int(reviews[4][:-1]) * 1000    
    rating_info['made_by'] = n_made
    rating_info['reviews'] = n_reviews
    return rating_info

In [284]:
# Inspect the raw tokens of the "made it | reviews" summary link; _get_rating_info
# reads the counts from positions 0 and 4 of this list.
sel = 'div.summary-stats-box a.read--reviews'
reviews = browser.find_element_by_css_selector(sel)
reviews.text.split()

['2k', 'made', 'it', '|', '1k', 'reviews']

In [286]:
# Check the rating scraper against the page currently loaded in `browser`.
_get_rating_info(browser)

{'rating': 4.74810457229614, 'made_by': 2000, 'reviews': 1000}

In [289]:
def _get_categories(browser):
    sel = 'ol.breadcrumbs li'
    categories = browser.find_elements_by_css_selector(sel)
    cat_list = [category.text for category in categories]
    cat_dict = {}
    cat_dict['cat_1'] = cat_list[2]
    cat_dict['cat_2'] = cat_list[3]
    cat_dict['cat_3'] = cat_list[4]
    return cat_dict

In [290]:
# Check the breadcrumb scraper against the page currently loaded in `browser`.
_get_categories(browser)

{'cat_1': 'Soups, Stews and Chili', 'cat_2': 'Soup', 'cat_3': 'Chicken Soup'}

In [280]:
def _get_submitter_info(browser):
    submitter_info = {}
    sel = 'div.summary-background div.submitter'
    submitter = browser.find_element_by_css_selector(sel)
    href = (submitter.find_element_by_css_selector('div.submitter__img a')
                     .get_attribute('href'))
    id_num = href.split('/')[-2]
    followers = submitter.find_element_by_css_selector('div.submitter__img span').text
    name = submitter.find_element_by_css_selector('p span.submitter__name').text
    submitter_info['href'] = href
    submitter_info['id'] = int(id_num)
    submitter_info['name'] = name
    submitter_info['followers'] = int(followers)
    return submitter_info

In [281]:
# Check the submitter scraper against the page currently loaded in `browser`.
_get_submitter_info(browser)

{'href': 'https://www.allrecipes.com/cook/177251/',
 'id': 177251,
 'name': 'AUNTTAF',
 'followers': 14}

In [291]:
def get_recipe_info(browser):
    '''
    Collects the recipe-level metadata of the loaded page into one dictionary.

    Parameters:
        browser: Selenium WebDriver with a recipe page loaded.

    Returns:
        dict with the keys 'name', 'category', 'rating_info' and
        'submitter_info', filled by the matching _get_* helper.
    '''
    # Dict displays evaluate their values in written order, so the helpers run
    # in the same sequence: name, categories, rating, submitter.
    return {
        'name': _get_name(browser),
        'category': _get_categories(browser),
        'rating_info': _get_rating_info(browser),
        'submitter_info': _get_submitter_info(browser),
    }

In [292]:
# Run all metadata scrapers against the page currently loaded in `browser`.
get_recipe_info(browser)

{'name': "Catherine's Spicy Chicken Soup",
 'category': {'cat_1': 'Soups, Stews and Chili',
  'cat_2': 'Soup',
  'cat_3': 'Chicken Soup'},
 'rating_info': {'rating': 4.74810457229614, 'made_by': 2000, 'reviews': 1000},
 'submitter_info': {'href': 'https://www.allrecipes.com/cook/177251/',
  'id': 177251,
  'name': 'AUNTTAF',
  'followers': 14}}