In [86]:
import time
import random
import requests
import selenium
from string import ascii_lowercase
from selenium.webdriver import Firefox, Chrome
from bs4 import BeautifulSoup
import pandas as pd
import string
import numpy as np
from selenium.webdriver.common.keys import Keys

In [182]:
# Start a Chrome WebDriver session. The scraping helpers in this notebook
# (get_ingredients, get_name, get_rating, get_categories) read this
# module-level `browser` object directly.
browser = Chrome()

In [183]:
# Recipe page to scrape; load it into the browser session.
url='https://www.allrecipes.com/recipe/141833/thai-green-curry-chicken/'
browser.get(url)

In [84]:
# Pause for two seconds — presumably to let the page finish rendering before
# scraping (TODO confirm; an explicit wait on a page element would be more reliable).
time.sleep(2)

In [90]:
def get_ingredients():
    """Scrape the ingredient lines from the recipe page loaded in ``browser``.

    Walks the numbered lists ``ul#lst_ingredients_1``, ``ul#lst_ingredients_2``,
    ... until a list cannot be read, then keeps only the lines that begin with
    a digit (i.e. lines that start with a quantity).

    Returns:
        list[str]: the ingredient lines, in page order.
    """
    all_items = []
    count = 1
    while True:
        sel = 'ul#lst_ingredients_{0}'.format(count)
        try:
            # 'css selector' is the value of selenium's By.CSS_SELECTOR. The
            # generic find_element(by, value) call exists in Selenium 3 and 4,
            # whereas find_element_by_css_selector was removed in Selenium 4.3.
            ing_list = browser.find_element('css selector', sel)
            all_items += ing_list.text.split('\n')
        except Exception:
            # Best effort: the first list that cannot be read ends the walk.
            # Exception (rather than a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            break
        count += 1

    # A blank line would make item[0] raise IndexError, so check truthiness first.
    return [item for item in all_items if item and item[0] in string.digits]

In [184]:
# Scrape the ingredient lines from the currently loaded page and display them.
ingredients = get_ingredients()
ingredients

['1 pound skinless, boneless chicken breast halves - cut into 1 inch cubes',
 '1 tablespoon dark soy sauce',
 '1 tablespoon all-purpose flour',
 '2 tablespoons cooking oil',
 '2 tablespoons green curry paste',
 '2 green onions with tops, chopped',
 '3 cloves garlic, peeled and chopped',
 '1 teaspoon fresh ginger, peeled and finely chopped',
 '2 cups coconut milk',
 '1 tablespoon fish sauce',
 '1 tablespoon dark soy sauce',
 '2 tablespoons white sugar',
 '1/2 cup cilantro leaves, for garnish']

In [172]:
def _determine_quantity(item):
    quantity = 0
    for i, elem in enumerate(item.split()):
        if elem[0] in string.digits:
            try:
                quantity += float(elem)
            except:
                numer, denom = elem.split('/')
                quantity += float(numer) / float(denom)
        else:
            idx = i
            break
    remainder = ' '.join(item.split()[idx:])
    return quantity, remainder

In [167]:
# NOTE(review): _remove_descriptors is only defined in a later cell, so on a fresh
# top-to-bottom run this call raises NameError. The recorded output also does not
# correspond to the ingredients list displayed earlier — presumably it is left
# over from a run against a different recipe.
_remove_descriptors(ingredients[1])

' ounces pancetta bacon'

In [177]:
def _parse_special(item, flag_words):
    # Determine special word
    for word in flag_words:
        if word in item.split():
            sp_word = ' ' + word + ' '
            break
    
    # Parse item 
    count_and_size = item.split(sp_word)[0]
    remainder = item.split(sp_word)[1]
    count, rest = _determine_quantity(count_and_size)
    size, unit = _determine_quantity(rest[1:-1])
    quantity = count * size
    return quantity, unit, remainder

In [189]:
def _remove_descriptors(item, stopwords=['and'], endings=['ed','less','ly']):
    
    # Remove meat preparation instructions
    if len(item.split(' - ')) > 1:
        item = item.split(' - ')[0]
    
    # Remove punctuation and stopwords
    words = []
    for elem in item.split():
        word = ''.join([letter for letter in elem.lower() if letter in string.ascii_lowercase])
        if word not in stopwords:
            words.append(word)
    
    # Remove adjectives and adverbs    
    for ending in endings:
        for word in words.copy():
            try:
                if word[-len(ending):] == ending:
                    words.remove(word)
            except:
                continue
    return ' '.join(words)

In [190]:
def parse_ingredients(ingredients, units, flag_words=['can', 'cans', 'package', 'packages']):
    '''
    Parses a list of ingredients into a list of dictionaries with the following format:
        {'quantity': (float),
         'units': (str),
         'ingredient': (str)}
    Also takes argument 'units', a list of accepted units (e.g., ['cups', 'tablespoon']).
    If an ingredient does not specify a unit in this list, the label 'each' will be applied.

    Items containing one of 'flag_words' (container words such as 'can') are handed to
    _parse_special, which supplies the quantity and unit itself. Unit names are stored
    with one trailing 's' removed ('cups' -> 'cup'). 'flag_words' is never mutated.
    '''
    def _singular(unit):
        # endswith (rather than unit[-1]) keeps an empty unit from raising IndexError.
        return unit[:-1] if unit.endswith('s') else unit

    ing_list = []
    for item in ingredients:
        item_dict = {}

        # Parse quantities and units; flagged items require special parsing treatment
        if any(word in flag_words for word in item.split()):
            quantity, unit, remainder = _parse_special(item, flag_words)
            item_dict['quantity'] = quantity
            item_dict['units'] = _singular(unit)
        else:
            quantity, remainder = _determine_quantity(item)
            item_dict['quantity'] = quantity
            words = remainder.split()
            # `words` is empty when the item is nothing but a quantity.
            if words and words[0] in units:
                item_dict['units'] = _singular(words[0])
                remainder = ' '.join(words[1:])
            else:
                item_dict['units'] = 'each'

        # Remove preparation instructions from remaining text to isolate ingredient
        item_dict['ingredient'] = _remove_descriptors(remainder)

        # Add item dictionary to list
        ing_list.append(item_dict)

    return ing_list

In [157]:
# Unit words (singular and plural forms) passed to parse_ingredients as its
# `units` argument. A non-flagged item whose first word after the quantity is
# not listed here is labelled 'each'.
units = ['pound', 'pounds', 'cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons',
         'clove', 'cloves', 'stalk', 'stalks', 'ounce', 'ounces', 'oz.']

In [191]:
# Parse the scraped ingredient lines into quantity / units / ingredient dictionaries.
parse_ingredients(ingredients, units)

[{'quantity': 1.0, 'units': 'pound', 'ingredient': 'chicken breast halves'},
 {'quantity': 1.0, 'units': 'tablespoon', 'ingredient': 'dark soy sauce'},
 {'quantity': 1.0, 'units': 'tablespoon', 'ingredient': 'allpurpose flour'},
 {'quantity': 2.0, 'units': 'tablespoon', 'ingredient': 'cooking oil'},
 {'quantity': 2.0, 'units': 'tablespoon', 'ingredient': 'green curry paste'},
 {'quantity': 2.0, 'units': 'each', 'ingredient': 'green onions with tops'},
 {'quantity': 3.0, 'units': 'clove', 'ingredient': 'garlic'},
 {'quantity': 1.0, 'units': 'teaspoon', 'ingredient': 'fresh ginger'},
 {'quantity': 2.0, 'units': 'cup', 'ingredient': 'coconut milk'},
 {'quantity': 1.0, 'units': 'tablespoon', 'ingredient': 'fish sauce'},
 {'quantity': 1.0, 'units': 'tablespoon', 'ingredient': 'dark soy sauce'},
 {'quantity': 2.0, 'units': 'tablespoon', 'ingredient': 'white sugar'},
 {'quantity': 0.5,
  'units': 'cup',
  'ingredient': 'cilantro leaves for garnish'}]

In [95]:
def get_name():
    """Return the recipe title from the page loaded in ``browser``.

    Returns:
        str: the text of the ``h1#recipe-main-content`` element.
    """
    sel = 'h1#recipe-main-content'
    # 'css selector' is the value of selenium's By.CSS_SELECTOR. The generic
    # find_element(by, value) call exists in Selenium 3 and 4, whereas
    # find_element_by_css_selector was removed in Selenium 4.3.
    name = browser.find_element('css selector', sel)
    return name.text

In [98]:
# Display the recipe title scraped from the loaded page.
get_name()

'Thai Green Curry Chicken'

In [93]:
def get_rating():
    """Return the recipe's star rating from the page loaded in ``browser``.

    Returns:
        float: the ``data-ratingstars`` attribute of the ``div.rating-stars``
        element, converted to a float.
    """
    sel = 'div.rating-stars'
    # 'css selector' is the value of selenium's By.CSS_SELECTOR. The generic
    # find_element(by, value) call exists in Selenium 3 and 4, whereas
    # find_element_by_css_selector was removed in Selenium 4.3.
    rating = browser.find_element('css selector', sel)
    return float(rating.get_attribute('data-ratingstars'))

In [94]:
# Display the star rating scraped from the loaded page.
get_rating()

4.53484010696411

In [96]:
def get_categories():
    """Return the breadcrumb labels from the page loaded in ``browser``.

    Returns:
        list[str]: the text of each ``ol.breadcrumbs li`` element, in the
        order the lookup returns them.
    """
    sel = 'ol.breadcrumbs li'
    # 'css selector' is the value of selenium's By.CSS_SELECTOR. The generic
    # find_elements(by, value) call exists in Selenium 3 and 4, whereas
    # find_elements_by_css_selector was removed in Selenium 4.3.
    categories = browser.find_elements('css selector', sel)
    return [category.text for category in categories]

In [97]:
# Display the breadcrumb categories scraped from the loaded page.
get_categories()

['Home', 'Recipes', 'World Cuisine', 'Asian', 'Thai']

In [100]:
# Sample fraction string — appears to be a scratch check of the 'numer/denom'
# splitting used in _determine_quantity.
a = '1/2'

In [101]:
# Split the fraction string on '/' into its numerator and denominator strings.
x, y = a.split('/')

In [102]:
# Display the numerator string.
x

'1'

In [159]:
# Sample list — appears to be a scratch check of list.remove, which
# _remove_descriptors relies on. Note that this rebinds `a`, previously a string.
a = [1,4,5]

In [160]:
# list.remove deletes the first matching element in place and returns None.
a.remove(4)

In [161]:
# Display the list after the removal.
a

[1, 5]