Goals
-
Collect recipe data that covers the necessary recipe parameters, then upload the formatted data to a MongoDB database where the front-end iOS application can access it through a proprietary API. It is important to find recipe sources with as much recipe information as possible.

Necessary Recipe Parameters:
- 

1. Name : String
2. Servings : Number
3. Prep time : Number
4. Image URL : String
5. Ingredients : Array(String)
6. Directions : Array(String)
7. Nutrition : Dict(name: String, unit: String?, amount: Number, dailyPercent: Number?)
8. Cuisines : Array(String)
9. Courses : Array(String)
10. Diets: Array(String)

In [3]:
from bs4 import BeautifulSoup
import requests
import re
import string

In [4]:
#CLASSES

class Recipe:
    """Plain container for the fields scraped from a single recipe page.

    Single-valued fields start as None and multi-valued fields start as empty
    lists; the site-specific parsers fill in whatever they manage to find.
    """

    def __init__(self):
        # Single-valued fields, unknown until a parser assigns them.
        self.name = self.servings = self.prepTime = self.imageURL = None
        # Multi-valued fields; every attribute receives its own fresh list.
        for listField in ("ingredients", "directions", "nutrition",
                          "cuisines", "courses", "diets"):
            setattr(self, listField, [])

In [5]:
#REGEX
# Alternation of measurement-unit spellings. The literal is wrapped over several
# source lines for readability, so the embedded newlines are stripped afterwards.
# NOTE(review): alternatives are tried in order, so short forms listed early
# (e.g. "t") win over longer ones listed later (e.g. "tablespoons") in an
# unanchored search - confirm the ordering before relying on full-word matches.
unitsRegex = """(teaspoons|teaspoon|tsps|tsp|ts|t|tablespoons|tablespoon|tbls|tbl|tbs|tb|tbsps|tbsp|
cup|cups|pint|p|pt|fl pt|quarts|qts|qt|quart|fl qts|fl qt|gallons|gallon|gs|g|
gals|gal|mls|ml|milliliters|milliliters|ls|l|liters|liter|dls|dl|
pounds|pound|lb|ounces|ounce|ozs|oz|mgs|mg|milligram|g|gram|kgs|kg|kilograms|kilogram)""".replace('\n', '')
# Nutrition labels worth keeping; "saturated fat" is listed before "fat" so the
# longer label is the one reported when both could match at the same position.
nutrientsRegex = "serving|calories|carbohydrates|protein|saturated fat|fat|cholesterol|sodium|fiber|sugar"
# Course names searched for in a recipe's tag text.
coursesRegex = """(main course|side dish|dessert|appetizer|salad|bread|breakfast|soup|beverage|sauce|marinade|fingerfood|snack|drink)"""
# Diet names searched for in a recipe's tag text.
dietsRegex = """(gluten free|ketogenic|keto|vegetarian|vegan|pescetarian|paleo|primal|whole30)"""
# Cuisine names searched for in a recipe's tag text. The correct spelling
# "caribbean" is matched; the historical misspelling "carribean" is kept so
# text that used to match still does.
cuisinesRegex = """(african|american|british|cajun|caribbean|carribean|chinese|eastern european|european|french|german|greek|indian|irish|italian|japanese|jewish|korean|latin american|mediterranean|mexican|middle eastern|nordic|southern|spanish|thai|vietnamese)"""
# A whole number, optionally followed by " <number>" and/or "/<denominator>",
# e.g. "2", "3/4" or "1 1/2".
mixedNumberRegex = "[0-9]+( [0-9]+)?(/[0-9]+)?"
# Punctuation removed from nutrient names and matched amount/unit text.
removeBadChars = re.compile("[,.!?()]")

In [6]:
def cleanText(text):
    """Normalise *text* for matching: lowercase it, trim surrounding
    whitespace, then delete every ASCII punctuation character."""
    punctuationRemover = str.maketrans(dict.fromkeys(string.punctuation))
    lowered = text.lower()
    trimmed = lowered.strip()
    return trimmed.translate(punctuationRemover)

In [7]:
def _parsePlainNumber(token):
    """Return *token* as a float when it is a plain non-negative decimal such
    as "3" or "1.5"; return None for anything else."""
    if token.replace(".", "", 1).isdecimal():
        return float(token)
    return None


def handleNumber(text):
    """Add up the numbers in a whitespace-separated quantity string.

    Each token is either a plain number ("2", "1.5") or a fraction ("3/4"),
    so a mixed number such as "1 1/2" evaluates to 1.5. Tokens that are not
    numbers, malformed fractions and fractions with a zero denominator are
    skipped rather than raising.

    Args:
        text: Quantity text, e.g. "1 1/2".

    Returns:
        The sum of all recognised tokens; 0 when nothing could be parsed.
    """
    value = 0
    for token in text.split():
        if "/" in token:
            pieces = token.split("/")
            # Only the first two pieces are used, i.e. "a/b/c" is read as a/b.
            numerator = _parsePlainNumber(pieces[0])
            denominator = _parsePlainNumber(pieces[1])
            # A missing or zero denominator makes the fraction unusable.
            if numerator is not None and denominator:
                value += numerator / denominator
        else:
            number = _parsePlainNumber(token)
            if number is not None:
                value += number
    return value

In [8]:
def parseNutrients(regex, data):
    """Split a free-text line into a name, an amount and a unit.

    Args:
        regex: Pattern (string or compiled) that matches the "amount + unit"
            part of *data*.
        data: Text to analyse.

    Returns:
        A tuple ``(name, amount, unit)``:
        * when *regex* matches, ``name`` is *data* without the first match,
          ``amount`` is the number inside the match as returned by
          ``handleNumber`` (None if the match holds no number) and ``unit``
          is the rest of the match;
        * when *regex* does not match but *data* contains digits, ``amount``
          is the first run of digits as a string and ``unit`` is None;
        * otherwise ``name`` is *data* unchanged and the rest is None.
    """
    # re.search accepts both pattern strings and compiled patterns, so callers
    # may pass either form.
    match = re.search(regex, data)

    if match is None:
        digits = re.search("[0-9]+", data)
        if digits is None:
            return (data, None, None)
        start, end = digits.span()
        name = removeBadChars.sub('', data[:start] + data[end:]).strip()
        return (name, digits[0], None)

    start, end = match.span()
    name = removeBadChars.sub('', data[:start] + data[end:]).strip()

    matched = removeBadChars.sub('', match[0])
    quantity = re.search(mixedNumberRegex, matched)
    if quantity is None:
        # The match carries no number: report it as a unit only.
        return (name, None, matched.strip())

    amount = handleNumber(quantity[0])
    numberStart, numberEnd = quantity.span()
    unit = (matched[:numberStart] + matched[numberEnd:]).strip()
    return (name, amount, unit)

In [9]:
def iHeartEatingCreateRecipe(soup):
    """Build a recipe dictionary from a parsed ihearteating.com recipe page.

    Args:
        soup: Parsed page; it is searched for the ``div.wprm-recipe-ihe``
            recipe card and every field is read from inside that card.

    Returns:
        The ``__dict__`` of a populated ``Recipe``, or None when the page has
        no recipe card or the card has no ``h2.wprm-recipe-name`` title.
        Optional sections missing from the card are left at their ``Recipe``
        defaults instead of raising.
    """
    card = soup.find("div", class_="wprm-recipe-ihe")
    if card is None:
        return None

    title = card.find("h2", class_="wprm-recipe-name")
    if title is None:
        return None

    recipe = Recipe()
    recipe.name = title.text

    servings = card.find("span", class_="wprm-recipe-servings")
    if servings is not None:
        recipe.servings = servings.text

    prepTime = card.find("div", class_="wprm-recipe-total-time-container")
    if prepTime is not None:
        times = prepTime.find_all("span", class_="wprm-recipe-details")
        if len(times) == 2:
            # Two values are read as hours followed by minutes.
            recipe.prepTime = str(int(times[0].text) * 60 + int(times[1].text))
        elif times:
            unit = prepTime.find("span", class_="wprm-recipe-total_time-unit")
            # Substring test so that a plural unit ("hours") is converted too.
            if unit is not None and "hour" in unit.text.lower():
                recipe.prepTime = str(int(times[0].text) * 60)
            else:
                recipe.prepTime = str(times[0].text)

    imageContainer = card.find("div", class_="wprm-recipe-image")
    if imageContainer is not None:
        image = imageContainer.find("img")
        if image is not None:
            # Prefer the lazy-load attribute; fall back to the plain src.
            recipe.imageURL = image.get('data-lazy-src') or image.get('src')

    ingredientList = card.find("ul", class_="wprm-recipe-ingredients")
    if ingredientList is not None:
        for item in ingredientList.find_all("li"):
            amount = item.find("span", class_="wprm-recipe-ingredient-amount")
            unit = item.find("span", class_="wprm-recipe-ingredient-unit")
            ingredientName = item.find("span", class_="wprm-recipe-ingredient-name")
            if ingredientName is None:
                # Not an ingredient row (nothing to name), skip it.
                continue
            if amount is None:
                recipe.ingredients.append(ingredientName.text)
            elif unit is None:
                recipe.ingredients.append(amount.text + " " + ingredientName.text)
            else:
                recipe.ingredients.append(amount.text + " " + unit.text + " " + ingredientName.text)

    instructionList = card.find("ul", class_="wprm-recipe-instructions")
    if instructionList is not None:
        for item in instructionList.find_all("li"):
            step = item.find("div", class_="wprm-recipe-instruction-text")
            if step is not None:
                recipe.directions.append(step.text)

    nutritionContainer = card.find("div", class_="wprm-nutrition-label-container")
    if nutritionContainer is not None:
        # Same clean-up as cleanText, except that "." is kept so decimal
        # amounts such as "1.5" are not collapsed into "15".
        keepDecimalPoint = str.maketrans('', '', string.punctuation.replace('.', ''))
        rows = nutritionContainer.find_all("span", class_="wprm-nutrition-label-text-nutrition-container")
        for row in rows:
            label = row.find("span", class_="wprm-nutrition-label-text-nutrition-label")
            value = row.find("span", class_="wprm-nutrition-label-text-nutrition-value")
            if label is None or value is None:
                continue

            labelText = cleanText(label.text)
            # Keep only the nutrients listed in nutrientsRegex.
            if re.search(nutrientsRegex, labelText) is None:
                continue

            nutrient = {}
            nutrient["label"] = labelText
            nutrient["amount"] = value.text.lower().strip().translate(keepDecimalPoint)

            daily = row.find("span", class_="wprm-nutrition-label-text-nutrition-daily")
            if daily is not None:
                nutrient["daily"] = daily.text.lower().strip().translate(keepDecimalPoint)

            recipe.nutrition.append(nutrient)

    cuisines = card.find("span", class_="wprm-recipe-cuisine")
    if cuisines is not None:
        # Comma-separated list; surrounding spaces and empty entries dropped.
        recipe.cuisines = [c.strip() for c in cuisines.text.split(",") if c.strip()]

    courses = card.find("span", class_="wprm-recipe-course")
    if courses is not None:
        recipe.courses = [c.strip() for c in courses.text.split(",") if c.strip()]

    return recipe.__dict__

In [10]:
def letTheBakingBeginCreateRecipe(soup):
    """Build a recipe dictionary from a parsed letthebakingbegin.com page.

    Args:
        soup: Parsed page; it is searched for the ``div.wprm-recipe`` card
            and every field is read from inside that card.

    Returns:
        The ``__dict__`` of a populated ``Recipe``, or None when the card,
        its title, its ingredient lists or its instruction list is missing.
        Other missing sections are left at their ``Recipe`` defaults.
    """
    card = soup.find("div", class_="wprm-recipe")
    if card is None:
        return None

    title = card.find("h2", class_="wprm-recipe-name")
    if title is None:
        return None

    recipe = Recipe()
    recipe.name = title.text

    servings = card.find("div", class_="wprm-recipe-servings-container")
    if servings is not None:
        servingsValue = servings.find("span", "wprm-recipe-servings")
        if servingsValue is not None:
            recipe.servings = servingsValue.text

    prepTime = card.find("div", class_="wprm-recipe-total-time-container")
    if prepTime is not None:
        times = prepTime.find_all("span", class_="wprm-recipe-details")
        if len(times) == 2:
            # Two values are read as hours followed by minutes.
            recipe.prepTime = str(int(times[0].text) * 60 + int(times[1].text))
        elif times:
            unit = prepTime.find("span", class_="wprm-recipe-total_time-unit")
            # Substring test so that a plural unit ("hours") is converted too.
            if unit is not None and "hour" in unit.text.lower():
                recipe.prepTime = str(int(times[0].text) * 60)
            else:
                recipe.prepTime = str(times[0].text)

    imageContainer = card.find("div", class_="wprm-recipe-image")
    if imageContainer is not None:
        image = imageContainer.find("img")
        if image is not None:
            recipe.imageURL = image.get('src')

    # A card may hold several ingredient lists; a card with none is rejected.
    ingredientLists = card.find_all("ul", class_="wprm-recipe-ingredients")
    if len(ingredientLists) == 0:
        return None
    for ingredientList in ingredientLists:
        for item in ingredientList.find_all("li"):
            amount = item.find("span", class_="wprm-recipe-ingredient-amount")
            unit = item.find("span", class_="wprm-recipe-ingredient-unit")
            ingredientName = item.find("span", class_="wprm-recipe-ingredient-name")
            if ingredientName is None:
                continue
            if amount is None:
                recipe.ingredients.append(ingredientName.text)
            elif unit is None:
                recipe.ingredients.append(amount.text + " " + ingredientName.text)
            else:
                recipe.ingredients.append(amount.text + " " + unit.text + " " + ingredientName.text)

    instructionList = card.find("ol", class_="wprm-recipe-instructions")
    if instructionList is None:
        return None
    steps = instructionList.find_all("li")
    if len(steps) == 0:
        return None
    for item in steps:
        step = item.find("div", class_="wprm-recipe-instruction-text")
        if step is not None:
            recipe.directions.append(step.text)

    nutrients = []
    if card.find("div", class_="wprm-nutrition-label") is not None:
        nutrients = list(card.find_all("div", class_="nutrition-item"))
        nutrients += list(card.find_all("div", class_="nutrition-sub-item"))

    # Integer or decimal amount; read from the raw text because cleanText
    # would delete the decimal point.
    numberPattern = re.compile(r"\d+(?:\.\d+)?")
    for row in nutrients:
        nutrient = {}

        main = row.find("span", class_="nutrition-main")
        if main is not None:
            strong = main.find("strong")
            if strong is None or re.search(nutrientsRegex, cleanText(strong.text)) is None:
                continue
            nutrient["label"] = strong.text
            num = numberPattern.search(main.text)
            if num is not None:
                nutrient["amount"] = str(num[0])
        else:
            sub = row.find("span", class_="nutrition-sub")
            if sub is None:
                continue
            if re.search(nutrientsRegex, cleanText(sub.text)) is None:
                continue
            num = numberPattern.search(sub.text)
            if num is None:
                # Without a number the label cannot be separated from the
                # amount, so there is nothing meaningful to store.
                continue
            # Everything before the number is the label.
            nutrient["label"] = cleanText(sub.text[:num.start()])
            nutrient["amount"] = num[0]

        daily = row.find("span", class_="nutrition-percentage")
        if daily is not None:
            daily = daily.find("strong")
            if daily is not None:
                num = re.search(r"\d+", daily.text)
                if num is not None:
                    nutrient["daily"] = str(num[0])

        recipe.nutrition.append(nutrient)

    # The tags are optional: test for None (len() of a missing tag raises).
    cuisines = card.find("span", class_="wprm-recipe-cuisine")
    if cuisines is not None:
        recipe.cuisines = [c.strip() for c in cuisines.text.split(",") if c.strip()]

    courses = card.find("span", class_="wprm-recipe-course")
    if courses is not None:
        recipe.courses = [c.strip() for c in courses.text.split(",") if c.strip()]

    return recipe.__dict__

In [151]:
def _newYorkTimesMinutes(timeText):
    """Convert a time label such as "45 minutes", "1 1/2 hours" or
    "1 hour 30 minutes" to minutes; return None when it holds no number."""
    hours = re.search("(" + mixedNumberRegex + r")\s*hour", timeText)
    minutes = re.search("(" + mixedNumberRegex + r")\s*minute", timeText)

    if hours is None and minutes is None:
        # No number sits directly in front of a unit: use the first number and
        # treat it as hours only if the word "hour" appears somewhere.
        number = re.search(mixedNumberRegex, timeText)
        if number is None:
            return None
        total = handleNumber(number[0])
        return total * 60 if "hour" in timeText else total

    total = 0
    if hours is not None:
        total += handleNumber(hours[1]) * 60
    if minutes is not None:
        total += handleNumber(minutes[1])
    return total


def newYorkTimesCreateRecipe(soup):
    """Build a recipe dictionary from a parsed cooking.nytimes.com page.

    Args:
        soup: Parsed recipe page.

    Returns:
        The ``__dict__`` of a populated ``Recipe``, or None when the page has
        no ``ul.recipe-ingredients`` list or no recipe title. Other missing
        sections are left at their ``Recipe`` defaults. ``prepTime`` is a
        number of minutes.
    """
    ingredientList = soup.find("ul", class_="recipe-ingredients")
    if ingredientList is None:
        return None

    titleContainer = soup.find("div", class_="title-container")
    if titleContainer is None:
        return None
    title = titleContainer.find("h1", class_="recipe-title")
    if title is None:
        return None

    recipe = Recipe()
    recipe.name = title.text.strip()

    servingsAndTime = soup.find("ul", class_="recipe-time-yield")
    if servingsAndTime is not None:
        for item in servingsAndTime.find_all("li"):
            valueTag = item.find("span", class_="recipe-yield-value")
            if valueTag is None:
                continue
            if item.find("span", class_="recipe-yield") is not None:
                servings = re.search("[0-9]+", valueTag.text.strip())
                if servings is not None:
                    recipe.servings = servings[0]
            elif item.find("span", class_="recipe-time") is not None:
                totalMinutes = _newYorkTimesMinutes(valueTag.text.strip())
                if totalMinutes is not None:
                    recipe.prepTime = totalMinutes

    imageContainer = soup.find("div", class_="media-container")
    if imageContainer is not None:
        image = imageContainer.find("img")
        if image is not None:
            recipe.imageURL = image.get('src')

    for item in ingredientList.find_all("li"):
        quantity = item.find("span", class_="quantity")
        ingredientName = item.find("span", class_="ingredient-name")
        if ingredientName is None:
            continue
        quantityText = quantity.text.strip() if quantity is not None else ""
        # strip() removes the leading space left by an empty quantity.
        recipe.ingredients.append((quantityText + " " + ingredientName.text.strip()).strip())

    stepList = soup.find("ol", class_="recipe-steps")
    if stepList is not None:
        recipe.directions = [step.text.strip() for step in stepList.find_all("li")]

    nutritionTooltip = soup.find("div", class_="nutrition-tooltip")
    nutritionList = nutritionTooltip.find("ul") if nutritionTooltip is not None else None
    if nutritionList is not None:
        # The spans are glued into one string and re-split on ";" so that each
        # piece is handled as one "<amount> <label>" entry.
        nutrientString = "".join(" " + span.text.strip().lower() for span in nutritionList.find_all("span"))
        for entry in nutrientString.split(";"):
            label = re.search(nutrientsRegex, entry)
            amount = re.search(r"\d+", entry)
            if label is None or amount is None:
                continue
            # Keep only the first entry seen for each label.
            if any(existing["label"] == label[0] for existing in recipe.nutrition):
                continue
            recipe.nutrition.append({"label": label[0], "amount": amount[0]})

    tagContainer = soup.find("div", class_="tags-nutrition-container")
    if tagContainer is not None:
        # Joined with a separator so a keyword cannot be formed across the
        # boundary of two neighbouring tags.
        tagText = " ".join(anchor.text for anchor in tagContainer.find_all("a"))
        recipe.courses = re.findall(coursesRegex, tagText, re.IGNORECASE)
        recipe.cuisines = re.findall(cuisinesRegex, tagText, re.IGNORECASE)
        recipe.diets = re.findall(dietsRegex, tagText, re.IGNORECASE)

    return recipe.__dict__

In [170]:
def clickNCookCreateRecipe(soup):
    """Build a recipe dictionary from a parsed clickncook.org recipe page.

    The page exposes no preparation time, so ``prepTime`` is an estimate: for
    every direction the first number followed by whitespace is added, or 4
    when the direction contains no such number.

    Args:
        soup: Parsed recipe page.

    Returns:
        The ``__dict__`` of a populated ``Recipe``, or None when the page has
        no ``ul.wpurp-recipe-ingredients`` list or no recipe title. Other
        missing sections are left at their defaults.
    """
    ingredientList = soup.find("ul", class_="wpurp-recipe-ingredients")
    if ingredientList is None:
        return None

    container = soup.find("div", class_="c-recipe")
    if container is None:
        return None
    title = container.find("h2")
    if title is None:
        return None

    recipe = Recipe()
    recipe.prepTime = 0
    recipe.name = title.text.strip()

    servings = soup.find("div", class_="serves")
    if servings is not None:
        # Every span except the one whose class is "title".
        valueSpans = servings.find_all("span", class_=lambda x: x != 'title')
        if valueSpans:
            number = re.search("[0-9]+", valueSpans[0].text.strip())
            if number is not None:
                recipe.servings = number[0]

    imageHolder = container.find("div", class_="img")
    if imageHolder is not None:
        # The image URL is embedded in the inline style; stop at the first
        # quote, parenthesis or space so only the URL itself is captured.
        imageURL = re.search(r"""http[^'")\s]+""", imageHolder.get('style', ''))
        if imageURL is not None:
            recipe.imageURL = imageURL[0]

    for item in ingredientList.find_all("li"):
        quantityUnit = item.find("span", class_="recipe-ingredient-quantity-unit")
        ingredientName = item.find("span", class_="recipe-ingredient-name")
        if ingredientName is None:
            continue
        quantityText = quantityUnit.text if quantityUnit is not None else ""
        recipe.ingredients.append(quantityText + ingredientName.text)

    stepList = soup.find("ol", class_="wpurp-recipe-instructions")
    if stepList is not None:
        recipe.directions = [step.text.strip() for step in stepList.find_all("li")]
    for direction in recipe.directions:
        num = re.search(r'\d+\s', direction)
        if num is None:
            recipe.prepTime += 4
        else:
            recipe.prepTime += int(num[0])

    nutrients = soup.find("div", class_="wpurp-nutrition-label")
    if nutrients is not None:
        nutritionServing = nutrients.find("div", class_="nutrition-serving")
        if nutritionServing is not None:
            num = re.search(mixedNumberRegex, nutritionServing.text)
            if num is not None:
                recipe.nutrition.append({"label": "serving", "amount": num[0]})

        for row in nutrients.find_all("div", class_=["nutrition-item", "nutrition-sub-item"]):
            main = row.find("span", class_=["nutrition-main", "nutrition-sub"])
            if main is None:
                continue
            label = re.search(nutrientsRegex, main.text, re.IGNORECASE)
            # Escaped dot: only a real decimal point may join two digit runs.
            amount = re.search(r"\d+(\.\d+)?", main.text)
            if label is None or amount is None:
                continue
            # Keep only the first entry seen for each label (case-insensitive).
            if any(existing["label"].lower() == label[0].lower() for existing in recipe.nutrition):
                continue
            recipe.nutrition.append({"label": label[0], "amount": amount[0]})

    tagColumn = soup.find("div", class_="d-2of5")
    tagBox = tagColumn.find("div", class_="tags") if tagColumn is not None else None
    if tagBox is not None:
        tagList = tagBox.find("ul")
        anchors = [li.find("a") for li in tagList.find_all("li")] if tagList is not None else []
        # Joined with a separator so a keyword cannot be formed across the
        # boundary of two neighbouring tags.
        tagText = " ".join(anchor.text for anchor in anchors if anchor is not None).lower()
        recipe.courses = re.findall(coursesRegex, tagText)
        recipe.cuisines = re.findall(cuisinesRegex, tagText)
        recipe.diets = re.findall(dietsRegex, tagText)

    return recipe.__dict__

Mongo Setup and DB Connection

In [35]:
import pymongo
import ssl
import json

In [36]:
import os

# The connection string (which embeds the user name and password) is read from
# the environment so that credentials are never stored in the notebook.
# Expected shape:
#   mongodb+srv://<user>:<password>@<cluster-host>/test?retryWrites=true&w=majority
# NOTE(review): the password that used to be hard-coded here has been exposed
# in source control and should be rotated.
mongoUri = os.environ.get("RECIPE_MONGODB_URI")
if not mongoUri:
    raise RuntimeError("Set the RECIPE_MONGODB_URI environment variable to the MongoDB connection string")

# NOTE(review): ssl_cert_reqs=ssl.CERT_NONE disables verification of the
# server certificate. It is left unchanged here to avoid breaking existing
# setups, but it should be replaced by proper CA configuration.
client = pymongo.MongoClient(mongoUri, ssl=True, ssl_cert_reqs=ssl.CERT_NONE)
# Browser-like User-Agent sent with every scraping request.
headers = {"User-Agent":"Mozilla/5.0"}

In [37]:
# Handle to the "Recipes" database on the connected cluster.
db = client["Recipes"]
# Handle to the "All" collection; this is the target of the (currently
# commented-out) insert_many calls in the upload cells.
collection = db["All"]

General Strategy:
- 

1. Create function for parsing specific webpages
2. Loop through recipe index of webpage
3. Upload results to mongodb

Sites:
- https://www.ihearteating.com/recipe-index-2/
- https://letthebakingbegin.com/page/2/
- https://www.healthyseasonalrecipes.com/browse-recipes/
- https://itdoesnttastelikechicken.com/recipes/
- https://www.veggiesdontbite.com/recipes/ (Diets included)
- https://cooking.nytimes.com/
- https://www.recipetineats.com/recipes/
- https://www.rebootedmom.com/recipe-index/
- https://thevegan8.com/recipe-index/
- https://www.smalltownwoman.com/recipe-index/
- https://gatherforbread.com/recipe-index/
- https://www.occasionallyeggs.com/mains/ (Have to rely on categorization)
- https://www.cookingclassy.com/recipes/appetizer/
- https://rawmanda.com/recipes/ (Cuisine and course inconsistent)
- https://www.thekitchn.com/collection/recipe/breakfast (No cuisine)
- https://www.savorytooth.com/category/poultry-mains/ (No course or cuisine)
- https://tutti-dolci.com/recipe-index/ (No course/nutrition facts)

In [173]:
#Test recipe retrieval
# url = "https://letthebakingbegin.com/basic-macarons-italian-meringue-method/#wprm-recipe-container-9127"
# page = requests.get(url, headers=headers).text
# soup = BeautifulSoup(page, "lxml")
# print(letTheBakingBeginCreateRecipe(soup))

# url = "https://www.ihearteating.com/banana-cake-with-vanilla-cream-cheese-frosting/"
# page = requests.get(url, headers=headers).text
# soup = BeautifulSoup(page, "lxml")
# print(iHeartEatingCreateRecipe(soup))

# url = "https://cooking.nytimes.com/recipes/1016832-craig-claibornes-smothered-chicken?action=click&module=Collection%20Page%20Recipe%20Card&region=Times%20Classics&pgType=collection&rank=1"
# page = requests.get(url, headers=headers).text
# soup = BeautifulSoup(page, "lxml")
# print(newYorkTimesCreateRecipe(soup))

# url = "https://clickncook.org/recipe/almond-vegetable-stir-fry/"
# page = requests.get(url, headers=headers).text
# soup = BeautifulSoup(page, "lxml")
# print(clickNCookCreateRecipe(soup))

{'name': 'Almond Vegetable Stir Fry', 'servings': '8', 'prepTime': 28, 'imageURL': 'https://clickncook.org/wp-content/uploads/2018/10/stir-fry-vegetable-mixture-e1532623926918.jpg', 'ingredients': ['1 1/2 tablespoons vegetable oil, divided', '1/2 cup whole almonds', '8 cups assorted vegetables, thinly sliced into 2" strips', '1/2 teaspoon garlic powder', '1 teaspoon powdered ginger', '2 tablespoons cornstarch', '3 tablespoons reduced-sodium soy sauce', '1/3 cup water', '4 cups brown rice, cooked'], 'directions': ['Heat half of the oil in a non-stick skillet.', 'Add almonds, cooking and tossing for about 8 minutes until lightly browned.  Remove from heat and set aside.', 'Pour remaining oil in the skillet and add vegetables.  Stir-fry, about five minutes, stirring often until vegetables are crisp-tender.', 'In a small bowl, mix garlic powder, ginger, cornstarch, soy sauce, and water until smooth.', 'Add liquid mixture to skillet, cooking for about two minutes until thickened.', 'Serve o

In [7]:
#NYTimes upload to DB
# Crawl: collection index pages -> collections -> recipe pages.
# Every request carries a timeout (seconds) so a stalled connection cannot
# hang the whole crawl.
collections = ["https://cooking.nytimes.com/collections?page=" + str(i) for i in range(1,8)]
for col in collections:
    page = requests.get(col, headers=headers, timeout=30).text
    indexSoup = BeautifulSoup(page, "lxml")
    for card in indexSoup.find_all("div", class_="popular-collections-card-container"):
        cardLink = card.find("a")
        cardTitle = card.find("h3")
        if cardLink is None or not cardLink.get("href"):
            # A card without a link cannot be followed.
            continue
        if cardTitle is not None:
            print(cardTitle.text)
        container = "https://cooking.nytimes.com" + cardLink['href']

        page = requests.get(container, headers=headers, timeout=30).text
        collectionSoup = BeautifulSoup(page, "lxml")
        recipeAnchors = [r.find('a') for r in collectionSoup.find_all("div", class_="card-info-wrapper")]
        recipeURLs = ["https://cooking.nytimes.com" + a['href'] for a in recipeAnchors if a is not None and a.get('href')]

        recipes = []
        for url in recipeURLs:
            page = requests.get(url, headers=headers, timeout=30).text
            soup = BeautifulSoup(page, "lxml")
            recipe = newYorkTimesCreateRecipe(soup)
            if recipe is not None:
                recipes.append(recipe)
        print(len(recipes))
        # Upload is disabled; when re-enabling, skip empty batches.
#         if recipes: collection.insert_many(recipes)

In [None]:
#Let the baking begin upload to DB
# Crawl: recipe index -> section listing pages -> recipe pages.
# Every request carries a timeout (seconds) so a stalled connection cannot
# hang the whole crawl.
mainURL = "https://letthebakingbegin.com/recipe-index/"
page = requests.get(mainURL, headers=headers, timeout=30).text
soup = BeautifulSoup(page, "lxml")

alphaList = soup.find("div", class_="alphalist")
sections = alphaList.find_all("ul") if alphaList is not None else []

# Parallel lists: listing URL and display name of every index entry.
indexUrls = []
sNames = []
for section in sections:
    for item in section.find_all("li"):
        anchor = item.find("a")
        if anchor is None or not anchor.get("href"):
            continue
        indexUrls.append(anchor["href"])
        sNames.append(anchor.text)


# Index of the first entry to process; earlier entries are skipped, which
# allows an interrupted run to be resumed.
num = 12

#Do 11 again?

for sectionIndex in range(num, len(indexUrls)):
    print(sNames[sectionIndex])

    # Walk the paginated listing of THIS section (the section URL was
    # previously ignored in favour of a fixed "/cuisine/american" listing).
    # NOTE(review): assumes the hrefs are absolute URLs and that listings are
    # paginated as "<section>/page/<n>" - confirm against the site.
    sectionUrl = indexUrls[sectionIndex].rstrip("/")
    recipeLinks = []
    pageNumber = 1
    while True:
        page = requests.get(sectionUrl + "/page/" + str(pageNumber), headers=headers, timeout=30)
        # Stop on any failure (not only 404) so an error page cannot keep
        # the loop running forever.
        if page.status_code != 200:
            break
        soup = BeautifulSoup(page.text, "lxml")
        pageAnchors = [link.find('a') for link in soup.find_all("div", class_="postgrid-item-a")]
        pageLinks = [a['href'] for a in pageAnchors if a is not None and a.get('href')]
        if not pageLinks:
            # A listing page without recipes marks the end of the section.
            break
        recipeLinks += pageLinks
        pageNumber += 1

    recipes = []

    for link in recipeLinks:
        page = requests.get(link, headers=headers, timeout=30).text
        soup = BeautifulSoup(page, "lxml")
        recipe = letTheBakingBeginCreateRecipe(soup)
        if recipe is not None:
            recipes.append(recipe)

    # Upload is disabled; when re-enabling, skip empty batches.
#     if recipes: collection.insert_many(recipes)
    


In [138]:
#iHeartEating Upload to DB
# Crawl: paginated recipe index -> recipe pages.
# Every request carries a timeout (seconds) so a stalled connection cannot
# hang the whole crawl.
indexURLs = ["https://www.ihearteating.com/recipe-index-2/page/" + str(i) for i in range(1, 44)]

for url in indexURLs:
    page = requests.get(url, headers=headers, timeout=30).text
    soup = BeautifulSoup(page, "lxml")
    # Anchors without an href cannot be followed and are skipped.
    links = [link["href"] for link in soup.find_all("a", class_="entry-title-link") if link.get("href")]
    recipes = []
    for link in links:
        page = requests.get(link, headers=headers, timeout=30).text
        soup = BeautifulSoup(page, "lxml")
        recipe = iHeartEatingCreateRecipe(soup)
        if recipe is not None:
            recipes.append(recipe)

    # Upload is disabled; when re-enabling, skip empty batches.
#     if recipes: collection.insert_many(recipes)