Goals
-
To collect recipe data that covers the necessary recipe parameters. To upload the formmatted data to a mongodb database where fronted ios application can access information using a proprietary API. It is important to find recipe sources with as much recipe information as possible

Necessary Recipe Parameters:
- 

1. Name : String
2. Servings : Number
3. Prep time : Number
4. Image URL : String
5. Ingredients : Array(String)
6. Directions : Array(String)
7. Nutrition : Dict(name: String, unit: String?, amount: Number, dailyPercent: Number?)
8. Cuisines : Array(String)
9. Courses : Array(String)
10. Diets: Array(String)

In [76]:
from bs4 import BeautifulSoup
import requests
import re
import string

In [154]:
#CLASSES

class Recipe:

    def __init__(self):
        self.name = None
        self.servings = None
        self.prepTime = None
        self.imageURL = None
        self.ingredients = []
        self.directions = []
        self.nutrition = []
        self.cuisines = []
        self.courses = []

In [19]:
#REGEX

units = """(teaspoons|teaspoon|tsps|tsp|ts|t|tablespoons|tablespoon|tbls|tbl|tbs|tb|tbsps|tbsp|
cup|cups|pint|p|pt|fl pt|quarts|qts|qt|quart|fl qts|fl qt|gallons|gallon|gs|g|
gals|gal|mls|ml|milliliters|milliliters|ls|l|liters|liter|dls|dl|
pounds|pound|lb|ounces|ounce|ozs|oz|mgs|mg|milligram|g|gram|kgs|kg|kilograms|kilogram)""".replace('\n', '')
meals = """([B,b]reakfast|[L,l]unch|[D,d]inner|[B,b]runch)"""
mixedNumberRegex = "\d+(-\d+)*(/\d+)*"
ingredientRegex = re.compile(mixedNumberRegex + " " + units + "[\s)]")
nutrientRegex = re.compile("[0-9]+" + units)
removeBadChars = re.compile("[,.!?()]")

In [85]:
def cleanText(text):
    return text.lower().strip().translate(str.maketrans('', '', string.punctuation))

In [20]:
def handleNumber(text):
    text = text.split()
    value = 0
    for number in text:
        mixedNumber = number.split("-")
        fraction = None
        if len(mixedNumber) > 1:
            value += float(mixedNumber[0])
            fraction = mixedNumber[1].split("/")
        else:
            fraction = number.split("/")
        if len(fraction) > 1:
            value += float(fraction[0]) / float(fraction[1])
        else:
            if number.isnumeric():
                value += float(number)
    return value

In [21]:
def parseNutrients(regex, data):
    
    amount = None
    unit = None
    name = None

    if re.search(regex, data) is None:
        search = re.search("[0-9]+", data)
        if search is not None:
            amount = search[0]
            name = removeBadChars.sub('', data[0:search.span()[0]] + data[search.span()[1]:len(data)]).strip()
        else:
            name = data
    else:
        matches = regex.finditer(data)
        for match in matches:
            name = removeBadChars.sub('', data[0:match.span()[0]] + data[match.span()[1]:len(data)]).strip()
            match = removeBadChars.sub('', match[0])
            aAndu = re.search(mixedNumberRegex, match)
            amount = handleNumber(aAndu[0])
            unit = match[0:aAndu.span()[0]] + match[aAndu.span()[1]:len(match)].strip()
            break

    return (name, amount, unit)

In [9]:
def tasteOfHomeCreateRecipe(soup):
    
    recipe = Recipe()
    
    name = soup.find("h1", class_="recipe-title").text
    servings = soup.find("div", class_="recipe-time-yield__label-servings")
    prepTime = soup.find("div", class_="recipe-time-yield__label-prep")
    imageURL = soup.find("div", class_="recipe-image-and-meta-sidebar__featured-container")
    ingredients = [ingredient.text.strip() for ingredient in soup.find("div", class_="recipe-ingredients").find_all("li")]
    directions = [direction.text.strip() for direction in soup.find("ul", class_="recipe-directions__list").find_all("li")]
    nutrients = soup.find("div", class_="recipe-nutrition-facts")
    cuisines = []
    courses = []
    
    if name is None: return None
    recipe.name = name
    
    if servings is not None:
        servings = re.search("[0-9]+\sservings", servings.text.strip())
        if servings is not None:
            recipe.servings = re.search("[0-9]+", servings[0])[0]
    
    if imageURL is not None:
        recipe.imageURL = imageURL.find("img")['src']
        
    if prepTime is not None:
        prepTime = re.search("[0-9]+", prepTime.text.strip())
        if prepTime is not None:
            recipe.prepTime = prepTime[0]

    if nutrients is not None:   

        nutrients = re.split('[:,().]', nutrients.text)
        nutrients = [' '.join(nt.split()) for nt in nutrients if nt is not '']
        nutrients = [nt for nt in nutrients if re.search('[0-9]+', nt) is not None]
        if len(nutrients) > 0: 
            servings = re.search('[0-9]+', nutrients[0])
            if servings is not None:
                recipe.nutritionServings = servings[0]
                nutrients = nutrients[1:len(nutrients)]
                
    if ingredients is not None:
        for ingredient in ingredients:
            recipe.ingredients.append(ingredient)

    if directions is not None:
        for direction in directions:
            recipe.directions.append(direction)

    if nutrients is not None:
        for nutrient in nutrients:
            info = parseNutrients(nutrientRegex, nutrient)
            recipe.nutrition.append(Nutrient(info[0],info[1],info[2]))
            
    return recipe.asDict()

In [156]:
def iHeartEatingCreateRecipe(soup):
    
    recipe = Recipe()
    
    card = soup.find("div", class_="wprm-recipe-ihe")
    
    if card is None: return None
    
    name = card.find("h2", class_="wprm-recipe-name")
    servings = card.find("span", class_="wprm-recipe-servings")
    prepTime = card.find("div", class_="wprm-recipe-total-time-container")
    imageURL = card.find("div", class_="wprm-recipe-image")
    ingredients = [ing for ing in card.find("ul", class_="wprm-recipe-ingredients").find_all("li")]
    directions = [direction.find("div", class_="wprm-recipe-instruction-text") for direction in card.find("ul", class_="wprm-recipe-instructions").find_all("li")]
    nutrients = [nt for nt in card.find("div", class_="wprm-nutrition-label-container").find_all("span", class_="wprm-nutrition-label-text-nutrition-container")]
    cuisines = card.find("span", class_="wprm-recipe-cuisine")
    courses = card.find("span", class_="wprm-recipe-course")
    
    if name is None: return None
    recipe.name = name.text
    
    if servings is not None: recipe.servings = servings.text
    if prepTime is not None:
        times = prepTime.find_all("span", class_="wprm-recipe-details")
        if times is not None:
            if len(times) == 2:
                recipe.prepTime = str(int(times[0].text) * 60 + int(times[1].text))
            else:
                unit = prepTime.find("span", class_="wprm-recipe-total_time-unit")
                if (unit.text == "hour"): recipe.prepTime = str(int(times[0].text) * 60)
                else: recipe.prepTime = str(times[0].text)
                
    if imageURL is not None: 
        recipe.imageURL = imageURL.find("img")['data-lazy-src']
    if ingredients is not None: 
        for ing in ingredients:
            amount = ing.find("span", class_="wprm-recipe-ingredient-amount")
            unit = ing.find("span", class_="wprm-recipe-ingredient-unit")
            name = ing.find("span", class_="wprm-recipe-ingredient-name")
            if amount is None: recipe.ingredients.append(name.text)
            else:
                if unit is None: recipe.ingredients.append(amount.text + " " + name.text)
                else:
                    recipe.ingredients.append(amount.text + " " + unit.text + " " + name.text)
    if directions is not None:
        for direction in directions:
            if direction is not None:
                recipe.directions.append(direction.text)
    if nutrients is not None:   
        for nt in nutrients:
            label = nt.find("span", class_="wprm-nutrition-label-text-nutrition-label")
            
            if re.search("serving|calories|carbohydrates|protein|fat|saturated fat|cholestoral|sodium|fiber|sugar", cleanText(label.text)) is None: continue
            
            value = nt.find("span", class_="wprm-nutrition-label-text-nutrition-value")
            daily = nt.find("span", class_="wprm-nutrition-label-text-nutrition-daily")
            
            nutrient = {}
            
            nutrient["label"] = cleanText(label.text)
            nutrient["value"] = cleanText(value.text)
            if daily is not None: nutrient["daily"] = cleanText(daily.text)
                
            recipe.nutrition.append(nutrient)
    
    if cuisines is not None: recipe.cuisines = cuisines.text.split(",")
    if courses is not None: recipe.courses = courses.text.split(",")

    return recipe.__dict__

Mongo Setup and DB Connection

In [40]:
import pymongo
import ssl
import json

In [31]:
client = pymongo.MongoClient("mongodb+srv://barak:barakh123@recipeappcluster-4ywv5.mongodb.net/test?retryWrites=true&w=majority", ssl=True, ssl_cert_reqs=ssl.CERT_NONE)
headers = {"User-Agent":"Mozilla/5.0"}

In [32]:
db = client["Recipes"]

General Strategy:
- 

1. Create function for parsing specific webpages
2. Loop through recipe index of webpage
3. Upload results to mongodb

Sites:
- https://www.ihearteating.com/recipe-index-2/
- https://letthebakingbegin.com/page/2/
- https://www.healthyseasonalrecipes.com/browse-recipes/
- https://itdoesnttastelikechicken.com/recipes/
- https://www.veggiesdontbite.com/recipes/ (Diets included)
- https://cooking.nytimes.com/
- https://www.recipetineats.com/recipes/
- https://www.rebootedmom.com/recipe-index/
- https://thevegan8.com/recipe-index/
- https://www.smalltownwoman.com/recipe-index/
- https://gatherforbread.com/recipe-index/
- https://www.occasionallyeggs.com/mains/ (Have to rely on categorization)
- https://www.cookingclassy.com/recipes/appetizer/
- https://rawmanda.com/recipes/ (Cuisine and course inconsistent)
- https://www.thekitchn.com/collection/recipe/breakfast (No cuisine)
- https://www.savorytooth.com/category/poultry-mains/ (No course or cuisine)
- https://tutti-dolci.com/recipe-index/ (No course/nutrition facts)

In [150]:
#Test recipe retrieval
url = "https://www.ihearteating.com/taco-casserole/"
page = requests.get(url, headers=headers).text
soup = BeautifulSoup(page, "lxml")
print(iHeartEatingCreateRecipe(soup))

{'name': 'Taco Casserole', 'servings': '8', 'prepTime': '33', 'imageURL': 'https://www.ihearteating.com/wp-content/uploads/2019/09/taco-casserole-6-1200-1-215x215.jpg', 'ingredients': ['1 ½ pounds lean ground beef', '1 tablespoon olive oil', '1 medium white or yellow onion', '4 cloves garlic', '15 ounce can black beans', '1 tablespoon ancho chili powder', '2 teaspoons ground cumin', '1 teaspoon smoked paprika', '1 teaspoon dried oregano', '1/2 teaspoon kosher salt', '1/4 teaspoon freshly ground black pepper', '2 tablespoons tomato paste', '2-15 ounce jars tomato salsa', '2 cups shredded cheddar or cheddar Jack cheese'], 'directions': ['Arrange a rack in the middle of the oven and heat to 350F. ', 'Heat a large, high-sided skillet over medium heat.', 'Add the beef and cook, breaking up the meat with a spoon, until browned and cooked through, about 6-8 minutes.', 'Remove ground beef to a plate, and drain any remaining fat from the skillet.', 'Add the tablespoon of oil to the now-empty sk

In [157]:
#Upload to DB


collection = db["All"]

indexURLs = ["https://www.ihearteating.com/recipe-index-2/page/" + str(i) for i in range(1, 44)]

for url in indexURLs:
    page = requests.get(url, headers=headers).text
    soup = BeautifulSoup(page, "lxml")
    links = [link["href"] for link in soup.find_all("a", class_="entry-title-link")]
    recipes = []
    for link in links:
        page = requests.get(link, headers=headers).text
        soup = BeautifulSoup(page, "lxml")
        recipe = iHeartEatingCreateRecipe(soup)
        if recipe is not None:  
            recipes.append(recipe)
    collection.insert_many(recipes)

AttributeError: 'NoneType' object has no attribute 'find_all'

In [94]:
collection = db["All"]
# query = {"ingredients": {"$elemMatch": {"name": "half-and-half cream", "name": "butter"}}}
# results = collection.find({})

collection.aggregate([
    {"$addFields": {"nservings": {"$toInt": "$nutritionServings"}}}
])
    
# collection = db["All"]
# collection.insert_many(results)

<pymongo.command_cursor.CommandCursor at 0x10d0e2390>

In [155]:
# #Delete records
collection = db["All"]
collection.delete_many({})

<pymongo.results.DeleteResult at 0x10cc2d488>