In [86]:
import time
import random
import requests
import selenium
from string import ascii_lowercase
from selenium.webdriver import Firefox, Chrome
from bs4 import BeautifulSoup
import pandas as pd
import string
import numpy as np
from selenium.webdriver.common.keys import Keys

In [182]:
# Start a Chrome WebDriver session. The scraping helpers defined below
# (get_ingredients, get_name, get_rating, get_categories) all read this
# module-level `browser` object.
browser = Chrome()

In [183]:
# Recipe page to scrape; point the shared browser session at it.
url='https://www.allrecipes.com/recipe/141833/thai-green-curry-chicken/'
browser.get(url)

In [84]:
# Fixed 2-second pause; presumably gives the page time to finish rendering
# before the scraping helpers run (TODO confirm the delay is still needed).
time.sleep(2)

In [90]:
def get_ingredients():
    '''
    Collect the ingredient lines from the page currently loaded in `browser`.

    Looks up the lists `ul#lst_ingredients_1`, `ul#lst_ingredients_2`, ... in
    order, splitting each list's text on newlines, until a lookup (or reading
    the element's text) fails. Only lines whose first character is an ASCII
    digit are kept; blank lines are ignored.

    Returns:
        list of str: the retained ingredient lines, in page order.
    '''
    all_items = []
    count = 1
    while True:
        sel = 'ul#lst_ingredients_{0}'.format(count)
        try:
            ing_list = browser.find_element_by_css_selector(sel)
            all_items += ing_list.text.split('\n')
        except Exception:
            # The first list that cannot be read marks the end of the
            # numbered lists. `Exception` (rather than a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            break
        count += 1
    # `item and ...` guards against empty lines: an empty list element yields
    # [''] after the split, and ''[0] would raise IndexError.
    return [item for item in all_items if item and item[0] in string.digits]

In [225]:
# Scrape the ingredient lines from whatever page `browser` currently has loaded.
# NOTE(review): the saved output below lists pork-chop ingredients rather than
# the Thai green curry URL loaded above, so it was presumably captured with a
# different page open -- re-run on a fresh kernel to confirm.
ingredients = get_ingredients()
ingredients

['2 red bell pepper, seeded and sliced into strips',
 '4 (8 ounce) boneless pork loin chops',
 '1/2 cup teriyaki sauce',
 '1/4 cup creamy peanut butter',
 '2 tablespoons rice vinegar',
 '1 teaspoon crushed red pepper flakes',
 '2 cloves garlic, minced',
 '1/2 cup chopped green onions',
 '1/4 cup chopped roasted peanuts',
 '2 limes, cut into wedges']

In [215]:
def _parse_number(token):
    '''Return `token` as a float ('2', '0.5', '1/2'), or None if it is not a number.'''
    try:
        return float(token)
    except ValueError:
        pass
    numer, slash, denom = token.partition('/')
    if not slash:
        return None
    try:
        return float(numer) / float(denom)
    except (ValueError, ZeroDivisionError):
        return None


def _determine_quantity(item):
    '''
    Split a leading numeric quantity off an ingredient string.

    Leading whitespace-separated tokens are summed for as long as they start
    with an ASCII digit and parse as a plain number or a simple fraction, so
    '1 1/2 cups flour' yields 1.5. Scanning stops at the first token that does
    not qualify (including digit-leading words such as '1-inch').

    Args:
        item (str): ingredient text, e.g. '1 1/2 cups flour'.

    Returns:
        tuple: (quantity, remainder). `quantity` is the summed value (0 if no
        leading number was found); `remainder` is the rest of the tokens
        joined by single spaces, or '' when every token was numeric.
    '''
    tokens = item.split()
    quantity = 0
    # Default to "nothing left" so an all-numeric (or empty) item does not
    # leave the index undefined.
    idx = len(tokens)
    for i, elem in enumerate(tokens):
        value = _parse_number(elem) if elem[0] in string.digits else None
        if value is None:
            idx = i
            break
        quantity += value
    return quantity, ' '.join(tokens[idx:])

In [216]:
def _parse_special(item, flag_words):
    '''
    Parse an ingredient written as "<count> (<size> <unit>) [<container>] <name>".

    The item is cut at the first flag word found among its tokens (surrounded
    by spaces), or at the first ')' when no flag word is present. The text
    before the cut supplies the count and the parenthesised size/unit; the
    text after it is returned untouched as the remainder.

    Example: '4 (8 ounce) boneless pork loin chops' gives
    (32.0, 'ounce', ' boneless pork loin chops').

    Args:
        item (str): the full ingredient line.
        flag_words (list of str): container words such as 'can' or 'package'.

    Returns:
        tuple: (quantity, unit, remainder) where quantity = count * size.

    Raises:
        ValueError: if the chosen separator does not occur in `item`
            (e.g. the flag word is the final token).
    '''
    # Determine special word
    sp_word = ')'
    tokens = item.split()
    for word in flag_words:
        if word in tokens:
            sp_word = ' ' + word + ' '
            break

    # Parse item. partition() cuts at the FIRST occurrence only, so any later
    # repeat of the separator stays in the remainder instead of being dropped.
    count_and_size, found, remainder = item.partition(sp_word)
    if not found:
        raise ValueError(
            'separator {0!r} not found in ingredient {1!r}'.format(sp_word, item))
    count, rest = _determine_quantity(count_and_size)
    if sp_word == ')':
        # The closing paren was consumed by the cut: strip only the opening one.
        size, unit = _determine_quantity(rest[1:])
    else:
        # Both parentheses are still present: strip one character from each end.
        size, unit = _determine_quantity(rest[1:-1])
    quantity = count * size
    return quantity, unit, remainder

In [221]:
def _remove_descriptors(item,
                        phrases=None,
                        stopwords=None,
                        suffixes=None):
    '''
    Reduce the text after quantity/unit to a bare ingredient name.

    Steps, in order:
      1. Truncate `item` at the first occurrence of each phrase in `phrases`.
      2. Lower-case each remaining token, keep only the letters a-z, and drop
         tokens that end up empty or appear in `stopwords`.
      3. Drop every word that ends with one of `suffixes`.

    Args:
        item (str): ingredient text without its quantity/unit.
        phrases, stopwords, suffixes (list of str or None): when None, the
            module-level list of the same name is used, looked up at call time.

    Returns:
        str: the surviving words joined by single spaces.

    NOTE(review): suffix matching is purely textual, so with 'ed' in
    `suffixes` a word such as 'red' is removed as well.
    '''
    # Resolve defaults lazily so this function can be defined before the
    # configuration lists exist. The parameters shadow the globals of the same
    # name, hence the explicit globals() lookup.
    config = globals()
    if phrases is None:
        phrases = config['phrases']
    if stopwords is None:
        stopwords = config['stopwords']
    if suffixes is None:
        suffixes = config['suffixes']

    # Remove common/unnecessary ending phrases
    for phrase in phrases:
        if phrase in item:
            item = item.split(phrase)[0]
    # Remove punctuation and stopwords; tokens with no letters at all
    # (e.g. '&') are skipped so they do not leave stray spaces behind.
    words = []
    for elem in item.split():
        word = ''.join(letter for letter in elem.lower() if letter in string.ascii_lowercase)
        if word and word not in stopwords:
            words.append(word)
    # Remove adjectives and adverbs (empty suffixes would match every word,
    # so they are ignored).
    endings = tuple(suffix for suffix in suffixes if suffix)
    return ' '.join(word for word in words if not word.endswith(endings))

In [222]:
def parse_ingredients(ingredients, units=None, flag_words=None):
    '''
    Parses a list of ingredients into a list of dictionaries with the following format:
        {'quantity': (float),
         'units': (str),
         'ingredient': (str)}

    Args:
        ingredients (list of str): raw ingredient lines, each starting with a quantity.
        units (list of str or None): accepted unit words (e.g., ['cups', 'tablespoon']).
            If an ingredient does not specify a unit in this list, the label 'each'
            is applied. A trailing 's' on a matched unit is removed.
        flag_words (list of str or None): container words (e.g., 'can') that send an
            item through _parse_special, as does a second token starting with '('.

    When `units` or `flag_words` is None, the module-level list of the same name
    is used, looked up at call time. Blank lines are skipped.

    Returns:
        list of dict: one dictionary per non-blank ingredient line.
    '''
    # Resolve defaults lazily so this function can be defined before the
    # configuration lists exist. The parameters shadow the globals of the same
    # name, hence the explicit globals() lookup.
    config = globals()
    if units is None:
        units = config['units']
    if flag_words is None:
        flag_words = config['flag_words']

    ing_list = []
    for item in ingredients:
        tokens = item.split()
        if not tokens:
            continue
        item_dict = {}
        # Check item for flag words (require special parsing treatment)
        flag = any(word in flag_words for word in tokens)
        if len(tokens) > 1 and tokens[1].startswith('('):
            flag = True
        # Parse quantities and units
        if flag:
            quantity, unit, remainder = _parse_special(item, flag_words)
            item_dict['quantity'] = quantity
            # endswith() tolerates an empty unit; fall back to 'each' then.
            item_dict['units'] = (unit[:-1] if unit.endswith('s') else unit) or 'each'
        else:
            quantity, remainder = _determine_quantity(item)
            item_dict['quantity'] = quantity
            rest_tokens = remainder.split()
            if rest_tokens and rest_tokens[0] in units:
                unit = rest_tokens[0]
                item_dict['units'] = unit[:-1] if unit.endswith('s') else unit
                remainder = ' '.join(rest_tokens[1:])
            else:
                item_dict['units'] = 'each'
        # Remove preparation instructions from remaining text to isolate ingredient
        item_dict['ingredient'] = _remove_descriptors(remainder)
        # Add item dictionary to list
        ing_list.append(item_dict)
    return ing_list

In [223]:
# Configuration lists read by the ingredient-parsing helpers.

# Unit words recognised as the first token after the quantity in
# parse_ingredients (singular and plural spellings are listed explicitly).
units = ['pound', 'pounds', 'cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons',
         'clove', 'cloves', 'stalk', 'stalks', 'ounce', 'ounces', 'oz.']
# Substrings at which _remove_descriptors cuts an ingredient line, discarding
# everything from the phrase onward.
phrases = [' - ',', or', ', for garnish', ', cut']
# Words _remove_descriptors drops outright from the ingredient name.
stopwords = ['and', 'into', 'very', 'hot', 'cold', 'fresh', 'large', 'medium', 'small']
# Word endings used by _remove_descriptors to drop descriptive words
# (e.g. 'chopped', 'boneless').
suffixes = ['ed','less','ly']
# Container words that route an item through _parse_special.
flag_words = ['can', 'cans', 'package', 'packages']

In [227]:
pd.DataFrame(parse_ingredients(ingredients))

Unnamed: 0,ingredient,quantity,units
0,bell pepper strips,2.0,each
1,pork loin chops,32.0,ounce
2,teriyaki sauce,0.5,cup
3,creamy peanut butter,0.25,cup
4,rice vinegar,2.0,tablespoon
5,pepper flakes,1.0,teaspoon
6,garlic,2.0,clove
7,green onions,0.5,cup
8,peanuts,0.25,cup
9,limes,2.0,each


In [95]:
def get_name():
    '''Return the text of the `h1#recipe-main-content` heading on the page loaded in `browser`.'''
    heading = browser.find_element_by_css_selector('h1#recipe-main-content')
    title = heading.text
    return title

In [98]:
get_name()

'Thai Green Curry Chicken'

In [93]:
def get_rating():
    '''
    Return the recipe rating from the page loaded in `browser`.

    Reads the `data-ratingstars` attribute of the first `div.rating-stars`
    element and converts it to a float.
    '''
    stars = browser.find_element_by_css_selector('div.rating-stars')
    raw_value = stars.get_attribute('data-ratingstars')
    return float(raw_value)

In [94]:
get_rating()

4.53484010696411

In [96]:
def get_categories():
    '''Return the text of every `ol.breadcrumbs li` element on the page loaded in `browser`, in order.'''
    labels = []
    for crumb in browser.find_elements_by_css_selector('ol.breadcrumbs li'):
        labels.append(crumb.text)
    return labels

In [97]:
get_categories()

['Home', 'Recipes', 'World Cuisine', 'Asian', 'Thai']

In [100]:
a = '1/2'

In [101]:
x, y = a.split('/')

In [102]:
x

'1'

In [159]:
a = [1,4,5]

In [160]:
a.remove(4)

In [161]:
a

[1, 5]

In [194]:
s = 'chicken, for garnish'

In [196]:
len(s.split(', for garnish'))

2

In [None]:
s