In [391]:
import time
import random
import requests
import selenium
from string import ascii_lowercase
from selenium.webdriver import Firefox, Chrome
from bs4 import BeautifulSoup
import pandas as pd
import string
import numpy as np
from selenium.webdriver.common.keys import Keys
import pymongo

In [182]:
# Start a Chrome WebDriver session; the scraping helpers in this notebook receive it as `browser`.
browser = Chrome()

In [314]:
# Point the shared browser session at the recipe page the following cells inspect.
url='https://www.allrecipes.com/recipe/141833/thai-green-curry-chicken/'
browser.get(url)

In [84]:
# Pause for two seconds — presumably to let the page finish rendering before it is queried; TODO confirm.
time.sleep(2)

In [307]:
def _get_ingredients(browser):
    all_items = []
    count = 1
    while True:
        try:
            sel = 'ul#lst_ingredients_{0}'.format(count)
            ing_list = browser.find_element_by_css_selector(sel)
            all_items += ing_list.text.split('\n')
            count += 1
        except:
            break
    ingredients = []
    for item in all_items:
        if item[0] in string.digits:
            ingredients.append(item)
    return ingredients

In [403]:
# Scrape the ingredient lines from the page currently loaded in `browser`.
# (The helper defined in this notebook is `_get_ingredients(browser)`; no
# zero-argument `get_ingredients` is defined in the cells shown.)
ingredients = _get_ingredients(browser)
ingredients

['2 pounds lean ground beef',
 '1 (46 fluid ounce) can tomato juice',
 '1 (29 ounce) can tomato sauce',
 '1 1/2 cups chopped onion',
 '1/2 cup chopped celery',
 '1/4 cup chopped green bell pepper',
 '1/4 cup chili powder',
 '2 teaspoons ground cumin',
 '1 1/2 teaspoons garlic powder',
 '1 teaspoon salt',
 '1/2 teaspoon ground black pepper',
 '1/2 teaspoon dried oregano',
 '1/2 teaspoon white sugar',
 '1/8 teaspoon ground cayenne pepper',
 '2 cups canned red beans, drained and rinsed']

In [369]:
def _determine_quantity(item):
    quantity = 0
    for i, elem in enumerate(item.split()):
        if elem[0] in string.digits:
            try:
                quantity += float(elem)
            except:
                numer, denom = elem.split('/')
                quantity += float(numer) / float(denom)
        else:
            idx = i
            break
    remainder = ' '.join(item.split()[idx:])
    return quantity, remainder

In [370]:
def _parse_special(item, flag_words):
    """Parse an item whose amount is given as ``count (size unit) container``.

    The item is split on the first flag word (in ``flag_words`` order) that
    occurs as a whole token, padded with spaces; if none occurs it is split on
    ``')'`` instead. The text before the split yields the count and the
    parenthesised size; the text after it is returned as the remainder.

    Args:
        item (str): Ingredient text, e.g. ``'1 (29 ounce) can tomato sauce'``.
        flag_words: Container words to look for (e.g. ``'can'``, ``'jar'``).

    Returns:
        tuple: ``(quantity, unit, remainder)`` with ``quantity`` equal to
        count times size.
    """
    tokens = item.split()
    # First matching flag word wins; ')' is the fallback delimiter.
    delimiter = next((' ' + flag + ' ' for flag in flag_words if flag in tokens), ')')

    pieces = item.split(delimiter)
    head, remainder = pieces[0], pieces[1]

    count, size_text = _determine_quantity(head)
    # Drop the opening '('; when splitting on a flag word the closing ')'
    # is still attached and is dropped as well.
    inner = size_text[1:] if delimiter == ')' else size_text[1:-1]
    size, unit = _determine_quantity(inner)
    return count * size, unit, remainder

In [371]:
def _remove_descriptors(item,
                        phrases=phrases,
                        stopwords=stopwords,
                        suffixes=suffixes):
    # Remove common/unnecessary ending phrases
    for phrase in phrases:
        if len(item.split(phrase)) > 1:
            item = item.split(phrase)[0]
    # Remove punctuation and stopwords
    words = []
    for elem in item.split():
        word = ''.join([letter for letter in elem.lower() if letter in string.ascii_lowercase])
        if word not in stopwords:
            words.append(word)
    # Remove adjectives and adverbs    
    for suffix in suffixes:
        for word in words.copy():
            try:
                if (word[-len(suffix):] == suffix) and word != 'red':
                    words.remove(word)
            except:
                continue
    return ' '.join(words)

In [372]:
def parse_ingredients(ingredients, units=None, flag_words=None):
    '''
    Parses a list of ingredients into a list of dictionaries with the following format:
        {'quantity': (float),
         'units': (str),
         'ingredient': (str)}

    Arguments:
        ingredients: iterable of ingredient strings that start with a quantity
            (e.g., '1 1/2 cups chopped onion').
        units: accepted unit words (e.g., ['cups', 'tablespoon']). If an
            ingredient does not specify a unit in this list, the label 'each'
            is applied. None (default) uses the notebook-level `units` list.
        flag_words: container words (e.g., ['can', 'jar']) that route an item
            through `_parse_special`. None (default) uses the notebook-level
            `flag_words` list.

    A trailing 's' is removed from the unit, so 'cups' is stored as 'cup'.
    Items that contain no text at all are skipped.
    '''
    # Resolve the notebook-level defaults at call time so that this cell does
    # not depend on the configuration cell having been executed first.
    # `globals()` is needed because the parameters shadow the global names.
    if units is None:
        units = globals()['units']
    if flag_words is None:
        flag_words = globals()['flag_words']

    ing_list = []
    for item in ingredients:
        tokens = item.split()
        if not tokens:
            continue
        # Items that mention a container word, or whose second token opens a
        # parenthesis (e.g. '1 (29 ounce) ...'), require special parsing.
        # The length check avoids an IndexError on single-token items.
        needs_special = (any(token in flag_words for token in tokens)
                         or (len(tokens) > 1 and tokens[1][0] == '('))
        # Parse quantities and units
        if needs_special:
            quantity, unit, remainder = _parse_special(item, flag_words)
        else:
            quantity, remainder = _determine_quantity(item)
            rest = remainder.split()
            if rest and rest[0] in units:
                unit = rest[0]
                remainder = ' '.join(rest[1:])
            else:
                unit = 'each'
        item_dict = {
            'quantity': quantity,
            # `endswith` (instead of unit[-1]) tolerates an empty unit string.
            'units': unit[:-1] if unit.endswith('s') else unit,
            # Remove preparation instructions from remaining text to isolate ingredient
            'ingredient': _remove_descriptors(remainder),
        }
        ing_list.append(item_dict)
    return ing_list

In [386]:
# Unit words that parse_ingredients recognises as the first token after the quantity;
# an item whose next token is not listed here is recorded with the unit 'each'.
# NOTE(review): 'cubes' has no singular 'cube' counterpart, unlike the other entries — confirm intent.
units = ['pound', 'pounds', 'cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons',
         'clove', 'cloves', 'stalk', 'stalks', 'ounce', 'ounces', 'oz.', 'cubes', 'pint', 'pints',
         'quart', 'quarts']
# Substrings at which _remove_descriptors cuts an ingredient description off.
phrases = [' - ',', or', ', for garnish', ', cut', ' such as', ' like', 'e.g.']
# Words that _remove_descriptors drops after lower-casing and stripping punctuation.
stopwords = ['and', 'into', 'very', 'hot', 'cold', 'fresh', 'large', 'medium', 'small', 'halves', 'torn', 'bulk']
# Word endings that cause _remove_descriptors to drop a word (it exempts 'red').
suffixes = ['ed','less','ly']
# Container words that make parse_ingredients hand an item to _parse_special.
flag_words = ['can', 'cans', 'package', 'packages', 'jar', 'jars', 'container', 'containers', 'bag', 'bags']

In [373]:
# Tabulate the parsed ingredients (one row per ingredient) for a quick visual check.
pd.DataFrame(parse_ingredients(ingredients))

Unnamed: 0,ingredient,quantity,units
0,ground beef chuck,2.0,pound
1,italian sausage,1.0,pound
2,chili beans,45.0,ounce
3,chili beans in spicy sauce,15.0,ounce
4,tomatoes with juice,56.0,ounce
5,tomato paste,6.0,ounce
6,yellow onion,1.0,each
7,celery,3.0,stalk
8,green bell pepper,1.0,each
9,red bell pepper,1.0,each


In [400]:
def _get_id(browser):
    id_and_name = browser.current_url.split('recipe/')[1]
    return int(id_and_name.split('/')[0])

In [273]:
def _get_name(browser):
    sel = 'h1#recipe-main-content'
    name = browser.find_element_by_css_selector(sel)
    return name.text

In [404]:
# Try the title helper on the page currently loaded in `browser`.
_get_name(browser)

'Flatlander Chili'

In [285]:
def _get_rating_info(browser):
    rating_info = {}
    sel = 'div.rating-stars'
    rating = browser.find_element_by_css_selector(sel)
    rating_info['rating'] = float(rating.get_attribute('data-ratingstars'))
    sel = 'div.summary-stats-box a.read--reviews'
    reviews = browser.find_element_by_css_selector(sel).text.split()
    try:
        n_made = int(reviews[0])
    except:
        n_made = int(reviews[0][:-1]) * 1000
    try:
        n_reviews = int(reviews[4])
    except:
        n_reviews = int(reviews[4][:-1]) * 1000    
    rating_info['made_by'] = n_made
    rating_info['reviews'] = n_reviews
    return rating_info

In [284]:
# Inspect the raw whitespace-separated tokens of the review summary link
# (the same element and selector that _get_rating_info parses).
sel = 'div.summary-stats-box a.read--reviews'
reviews = browser.find_element_by_css_selector(sel)
reviews.text.split()

['2k', 'made', 'it', '|', '1k', 'reviews']

In [286]:
# Try the rating helper on the page currently loaded in `browser`.
_get_rating_info(browser)

{'rating': 4.74810457229614, 'made_by': 2000, 'reviews': 1000}

In [305]:
def _get_categories(browser):
    sel = 'ol.breadcrumbs li'
    categories = browser.find_elements_by_css_selector(sel)
    cat_list = [category.text for category in categories]
    cat_dict = {}
    cat_dict['lvl_1'] = cat_list[2]
    try:
        cat_dict['lvl_2'] = cat_list[3]
    except:
        cat_dict['lvl_2'] = None
    try:
        cat_dict['lvl_3'] = cat_list[4]
    except:
        cat_dict['lvl_3'] = None
    return cat_dict

In [290]:
# Try the category helper on the page currently loaded in `browser`.
# NOTE(review): the output recorded below uses 'cat_N' keys, whereas the current
# _get_categories builds 'lvl_N' keys — re-run this cell to refresh it.
_get_categories(browser)

{'cat_1': 'Soups, Stews and Chili', 'cat_2': 'Soup', 'cat_3': 'Chicken Soup'}

In [310]:
def _get_submitter_info(browser):
    submitter_info = {}
    sel = 'div.summary-background div.submitter'
    submitter = browser.find_element_by_css_selector(sel)
    followers = submitter.find_element_by_css_selector('div.submitter__img span').text
    name = submitter.find_element_by_css_selector('p span.submitter__name').text
    href = (submitter.find_element_by_css_selector('div.submitter__img a')
                     .get_attribute('href'))
    id_num = href.split('/')[-2]
    submitter_info['id'] = int(id_num)
    submitter_info['name'] = name
    submitter_info['followers'] = int(followers)
    submitter_info['href'] = href
    return submitter_info

In [406]:
# Try the submitter helper on the page currently loaded in `browser`.
# NOTE: the run recorded below raised NoSuchElementException for 'div.submitter__img span'.
_get_submitter_info(browser)

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"div.submitter__img span"}
  (Session info: chrome=73.0.3683.86)
  (Driver info: chromedriver=73.0.3683.20 (8e2b610813e167eee3619ac4ce6e42e3ec622017),platform=Mac OS X 10.14.3 x86_64)


In [414]:
def get_recipe_info(browser):
    """Assemble the full record for the recipe page loaded in ``browser``.

    Args:
        browser: The Selenium WebDriver session showing a recipe page.

    Returns:
        dict with the keys ``id``, ``name``, ``href`` (current URL without its
        query string), ``category``, ``rating_info``, ``submitter_info``
        (``None`` when it cannot be scraped), ``ingredients`` (parsed) and
        ``directions``.

    Raises:
        Any error raised by the helpers other than ``_get_submitter_info``.
    """
    recipe_info = {}
    recipe_info['id'] = _get_id(browser)
    recipe_info['name'] = _get_name(browser)
    # Keep only the part of the URL before any query string.
    recipe_info['href'] = browser.current_url.split('?')[0]
    recipe_info['category'] = _get_categories(browser)
    recipe_info['rating_info'] = _get_rating_info(browser)
    try:
        recipe_info['submitter_info'] = _get_submitter_info(browser)
    except Exception:
        # Submitter details are best-effort. ``Exception`` (rather than a bare
        # ``except``) lets Ctrl-C / kernel interrupts through.
        recipe_info['submitter_info'] = None
    ingredients = _get_ingredients(browser)
    recipe_info['ingredients'] = parse_ingredients(ingredients)
    recipe_info['directions'] = _get_directions(browser)
    return recipe_info

In [415]:
# Build the complete record for the loaded page.
# NOTE(review): get_recipe_info calls _get_directions, which is defined in a later cell,
# so on a fresh kernel that cell has to be executed before this one.
get_recipe_info(browser)

{'id': 13079,
 'name': 'Flatlander Chili',
 'href': 'https://www.allrecipes.com/recipe/13079/flatlander-chili/',
 'category': {'lvl_1': 'Soups, Stews and Chili',
  'lvl_2': 'Chili',
  'lvl_3': 'Beef Chili'},
 'rating_info': {'rating': 4.73050260543823, 'made_by': 3000, 'reviews': 1000},
 'submitter_info': None,
 'ingredients': [{'quantity': 2.0,
   'units': 'pound',
   'ingredient': 'lean ground beef'},
  {'quantity': 46.0, 'units': 'fluid ounce', 'ingredient': 'tomato juice'},
  {'quantity': 29.0, 'units': 'ounce', 'ingredient': 'tomato sauce'},
  {'quantity': 1.5, 'units': 'cup', 'ingredient': 'onion'},
  {'quantity': 0.5, 'units': 'cup', 'ingredient': 'celery'},
  {'quantity': 0.25, 'units': 'cup', 'ingredient': 'green bell pepper'},
  {'quantity': 0.25, 'units': 'cup', 'ingredient': 'chili powder'},
  {'quantity': 2.0, 'units': 'teaspoon', 'ingredient': 'ground cumin'},
  {'quantity': 1.5, 'units': 'teaspoon', 'ingredient': 'garlic powder'},
  {'quantity': 1.0, 'units': 'teaspoon',

In [387]:
def _get_directions(browser):
    """Collect timing, preparation steps and servings for the loaded recipe.

    Args:
        browser: The Selenium WebDriver session showing a recipe page.

    Returns:
        dict: ``{'timing': <result of _get_timing>, 'steps': [str, ...],
        'servings': <result of _get_servings>}``. Steps whose text is empty
        are omitted.
    """
    directions = {}
    directions['timing'] = _get_timing(browser)
    steps = browser.find_elements_by_css_selector('div.directions--section li.step')
    # Read each element's text once: every ``.text`` access asks the driver
    # for the value again, so filtering and collecting from a single read
    # halves the number of round trips.
    step_texts = (step.text for step in steps)
    directions['steps'] = [text for text in step_texts if text]
    directions['servings'] = _get_servings(browser)
    return directions

In [336]:
# Try the directions helper on the page currently loaded in `browser`.
# NOTE(review): the output recorded below has a 'ready_in' timing key and no 'servings' entry,
# whereas the current code writes 'total' and 'servings' — re-run this cell to refresh it.
_get_directions(browser)

{'timing': {'prep': 20, 'cook': 40, 'ready_in': 60},
 'steps': ['Toss chicken first in 1 tablespoon dark soy sauce, then in the flour, coating pieces evenly. Heat the oil in a large skillet over medium high heat. Place chicken in the skillet, cook and stir chicken until browned, about 5 minutes. Remove chicken.',
  'Reduce heat to medium and stir in curry paste. Cook for 1 minute until fragrant, then stir in green onions, garlic, and ginger; cook an additional 2 minutes. Return chicken to the skillet, stirring to coat with the curry mixture. Stir the coconut milk, fish sauce, 1 tablespoon soy sauce, and sugar into the chicken-curry mixture. Allow to simmer over medium heat for 20 minutes until the chicken is tender. Serve garnished with cilantro leaves.']}

In [411]:
def _get_timing(browser):
    """Read prep, cook and total time (in minutes) from the page's timing list.

    The ``aria-label`` of list items 1, 2 and 3 is used. For prep and cook the
    text after ``': '`` is parsed; for the total, the text after
    ``'Ready in '``.

    Args:
        browser: Object exposing ``find_elements_by_css_selector``.

    Returns:
        dict: ``{'prep': ..., 'cook': ..., 'total': ...}`` with the values
        produced by ``_parse_timing_string``.
    """
    items = browser.find_elements_by_css_selector(
        'div.directions--section ul.prepTime li.prepTime__item')
    # (result key, index into the timing list, text that precedes the duration)
    layout = (
        ('prep', 1, ': '),
        ('cook', 2, ': '),
        ('total', 3, 'Ready in '),
    )
    timing = {}
    for key, position, marker in layout:
        label = items[position].get_attribute('aria-label')
        timing[key] = _parse_timing_string(label.split(marker)[1])
    return timing

In [410]:
def _parse_timing_string(string):
    total = 0
    if len(string.split('Hours')) > 1:
        total += 60 * int(string.split('Hours')[0])
        string = string.split('Hours')[1]
    if len(string.split('Hour')) > 1:
        total += 60 * int(string.split('Hour')[0])
        string = string.split('Hour')[1]
    if len(string.split('Minutes')) > 1:
        total += int(string.split('Minutes')[0])
    return total

In [382]:
def _get_servings(browser):
    sel = 'span.servings-count span.ng-binding'
    servings = browser.find_element_by_css_selector(sel)
    return int(servings.text)

In [383]:
# Try the servings helper on the page currently loaded in `browser`.
_get_servings(browser)

12

In [392]:
# Create a MongoDB client; no host or port is passed, so pymongo's defaults apply.
mc = pymongo.MongoClient()

In [396]:
# Handles for the 'recipes_test' database and its 'recipes' collection.
test_db = mc['recipes_test']
recipes_coll = test_db['recipes']

In [398]:
# Scrape the loaded page and store the record.
# NOTE(review): insert_one adds a new document on every run, so re-running this cell
# stores the same recipe again.
recipes_coll.insert_one(get_recipe_info(browser))

<pymongo.results.InsertOneResult at 0x10ef51dc8>

In [399]:
# Read back every stored document to check what was written.
# NOTE(review): the output recorded below shows 'id' as a string ('78299'), whereas the
# current _get_id returns an int — the stored document predates that version.
list(recipes_coll.find())

[{'_id': ObjectId('5ca3adba8fd54b1b485da598'),
  'id': '78299',
  'name': 'Boilermaker Tailgate Chili',
  'href': 'https://www.allrecipes.com/recipe/78299/boilermaker-tailgate-chili/',
  'category': {'lvl_1': 'Soups, Stews and Chili',
   'lvl_2': 'Chili',
   'lvl_3': 'Pork Chili'},
  'rating_info': {'rating': 4.8183069229126,
   'made_by': 12000,
   'reviews': 4000},
  'submitter_info': {'id': 591623,
   'name': 'MIGHTYPURDUE22',
   'followers': 41,
   'href': 'https://www.allrecipes.com/cook/591623/'},
  'ingredients': [{'quantity': 2.0,
    'units': 'pound',
    'ingredient': 'ground beef chuck'},
   {'quantity': 1.0, 'units': 'pound', 'ingredient': 'italian sausage'},
   {'quantity': 45.0, 'units': 'ounce', 'ingredient': 'chili beans'},
   {'quantity': 15.0,
    'units': 'ounce',
    'ingredient': 'chili beans in spicy sauce'},
   {'quantity': 56.0, 'units': 'ounce', 'ingredient': 'tomatoes with juice'},
   {'quantity': 6.0, 'units': 'ounce', 'ingredient': 'tomato paste'},
   {'quan