In [2]:
from bs4 import BeautifulSoup
import requests
import re

Classes

In [3]:
class Recipe:

    def __init__(self):
        self.ingredients = []
        self.directions = []
        self.nutrition = []
        self.quote = None
        self.prepTime = None
        self.imageURL = None
        self.name = None
        self.servings = None
        self.nutritionServings = None
    
    def addIngredient(self, ingredient):
        self.ingredients.append(ingredient)
    
    def addDirection(self, direction):
        self.directions.append(direction)
    
    def addNutrient(self, nutrient):
        self.nutrition.append(nutrient)

    def asDict(self):
        dt = self.__dict__
        dt["ingredients"] = [ing.__dict__ for ing in self.ingredients]
        dt["nutrition"] = [nt.__dict__ for nt in self.nutrition]
        return dt

class Ingredient:    
    def __init__(self, name, amount, unit):
        self.name = name
        self.amount = amount
        self.unit = unit

class Nutrient:
    def __init__(self, name, amount, unit):
        self.name = name
        self.amount = amount
        self.unit = unit

Regex

In [4]:
units = """(teaspoons|teaspoon|tsps|tsp|ts|t|tablespoons|tablespoon|tbls|tbl|tbs|tb|tbsps|tbsp|
cup|cups|pint|p|pt|fl pt|quarts|qts|qt|quart|fl qts|fl qt|gallons|gallon|gs|g|
gals|gal|mls|ml|milliliters|milliliters|ls|l|liters|liter|dls|dl|
pounds|pound|lb|ounces|ounce|ozs|oz|mgs|mg|milligram|g|gram|kgs|kg|kilograms|kilogram)""".replace('\n', '')
meals = """([B,b]reakfast|[L,l]unch|[D,d]inner|[B,b]runch)"""
mixedNumberRegex = "\d+(-\d+)*(/\d+)*"
ingredientRegex = re.compile(mixedNumberRegex + " " + units + "[\s)]")
nutrientRegex = re.compile("[0-9]+" + units)
removeBadChars = re.compile("[,.!?()]")

In [5]:
def createRecipe(soup):
    
    recipe = Recipe()
    
    name = soup.find("h1", class_="recipe-title").text
    ingredients = [ingredient.text.strip() for ingredient in soup.find("div", class_="recipe-ingredients").find_all("li")]
    directions = [direction.text.strip() for direction in soup.find("ul", class_="recipe-directions__list").find_all("li")]
    nutrients = soup.find("div", class_="recipe-nutrition-facts")
    imageURL = soup.find("div", class_="recipe-image-and-meta-sidebar__featured-container")
    prepTime = soup.find("div", class_="recipe-time-yield__label-prep")
    servings = soup.find("div", class_="recipe-time-yield__label-servings")
    quote = soup.find("div", class_="recipe-tagline__text")
 
    if name is None:
        return None
    recipe.name = name
    
    if imageURL is not None:
        imageURL = imageURL.find("img")['src']
        recipe.imageURL = imageURL

    if prepTime is not None:
        prepTime = re.search("[0-9]+", prepTime.text.strip())
        if prepTime is not None:
            recipe.prepTime = prepTime[0]

    if servings is not None:
        servings = re.search("[0-9]+\sservings", servings.text.strip())
        if servings is not None:
            recipe.servings = re.search("[0-9]+", servings[0])[0]

    if nutrients is not None:   
        #Handle diabetic exchanges!

        nutrients = re.split('[:,().]', nutrients.text)
        nutrients = [' '.join(nt.split()) for nt in nutrients if nt is not '']
        for i in range(0,len(nutrients)):
            if re.search("Diabetic Exchanges", nutrients[i]) is not None:
                nutrients = nutrients[0:i]
                break
        nutrients = [nt for nt in nutrients if re.search('[0-9]+', nt) is not None]
        if len(nutrients) > 0: 
            servings = re.search('[0-9]+', nutrients[0])
            if servings is not None:
                recipe.nutritionServings = servings[0]
                nutrients = nutrients[1:len(nutrients)]
            
    updateArrays(recipe, ingredients, directions, nutrients)

    return recipe.asDict()

In [6]:
def handleNumber(text):
    text = text.split()
    value = 0
    for number in text:
        mixedNumber = number.split("-")
        fraction = None
        if len(mixedNumber) > 1:
            value += float(mixedNumber[0])
            fraction = mixedNumber[1].split("/")
        else:
            fraction = number.split("/")
        if len(fraction) > 1:
            value += float(fraction[0]) / float(fraction[1])
        else:
            if number.isnumeric():
                value += float(number)
    return value

Function for parsing food data

In [7]:
def parseData(regex, data):
    
    amount = None
    unit = None
    name = None

    if re.search(regex, data) is None:
        search = re.search("[0-9]+", data)
        if search is not None:
            amount = search[0]
            name = removeBadChars.sub('', data[0:search.span()[0]] + data[search.span()[1]:len(data)]).strip()
        else:
            name = data
    else:
        matches = regex.finditer(data)
        for match in matches:
            name = removeBadChars.sub('', data[0:match.span()[0]] + data[match.span()[1]:len(data)]).strip()
            match = removeBadChars.sub('', match[0])
            aAndu = re.search(mixedNumberRegex, match)
            amount = handleNumber(aAndu[0])
            unit = match[0:aAndu.span()[0]] + match[aAndu.span()[1]:len(match)].strip()
            break

    return (name, amount, unit)

Add Ingredients/Nutrients/Directions

In [8]:
def updateArrays(recipe, ingredients, directions, nutrients):

    if ingredients is not None:
        for ingredient in ingredients:
            info = parseData(ingredientRegex, ingredient)
            if info[1] != None:
                recipe.addIngredient(Ingredient(info[0],info[1],info[2]))
            else:
                break

    if directions is not None:
        for direction in directions:
            recipe.addDirection(direction)

    if nutrients is not None:
        for nutrient in nutrients:
            info = parseData(nutrientRegex, nutrient)
            recipe.addNutrient(Nutrient(info[0],info[1],info[2]))

Scraping

In [9]:
headers = {"User-Agent":"Mozilla/5.0"}

In [10]:
url = "https://www.tasteofhome.com/recipes/hummus-veggie-wrap-up/"
page = requests.get(url, headers=headers).text
soup = BeautifulSoup(page, "lxml")
createRecipe(soup)

{'ingredients': [{'name': 'hummus', 'amount': 2.0, 'unit': 'tablespoons'},
  {'name': 'whole wheat tortilla 8 inches', 'amount': '1', 'unit': None},
  {'name': 'torn mixed salad greens', 'amount': 0.25, 'unit': 'cup'},
  {'name': 'finely chopped sweet onion', 'amount': 2.0, 'unit': 'tablespoons'},
  {'name': 'thinly sliced cucumber', 'amount': 2.0, 'unit': 'tablespoons'},
  {'name': 'alfalfa sprouts', 'amount': 2.0, 'unit': 'tablespoons'},
  {'name': 'shredded carrot', 'amount': 2.0, 'unit': 'tablespoons'},
  {'name': 'balsamic vinaigrette', 'amount': 1.0, 'unit': 'tablespoon'}],
 'directions': ['Spread hummus over tortilla. Layer with salad greens, onion, cucumber, sprouts and carrot. Drizzle with vinaigrette. Roll up tightly.'],
 'nutrition': [{'name': 'calories', 'amount': '235', 'unit': None},
  {'name': 'fat', 'amount': 8.0, 'unit': 'g'},
  {'name': 'saturated fat', 'amount': 1.0, 'unit': 'g'},
  {'name': 'cholesterol', 'amount': '0', 'unit': None},
  {'name': 'sodium', 'amount': 

Mongo stuff

In [11]:
import pymongo
import ssl
import json

In [12]:
client = pymongo.MongoClient("mongodb+srv://barak:barakh123@recipeappcluster-4ywv5.mongodb.net/test?retryWrites=true&w=majority", ssl=True, ssl_cert_reqs=ssl.CERT_NONE)

In [13]:
db = client["Recipes"]

In [12]:
url = "https://www.tasteofhome.com/course/"
page = requests.get(url, headers=headers).text
soup = BeautifulSoup(page, "lxml")
courses = [json.loads(c['data-analytics-metrics'])['link_name'] for c in soup.find("div", class_="tax-list").find_all("a")]
mainUrls = [("https://www.tasteofhome.com/course/" + c).replace(' ', '-') for c in courses]

In [None]:
#dinnerUrl = "https://www.tasteofhome.com/collection/best-dinner-recipes/"
#dinnerUrl = https://www.tasteofhome.com/collection/contest-winning-healthy-dinner-recipes/
#dinnerUrl = https://www.tasteofhome.com/collection/contest-winning-quick-dinner-recipes/
#appetizersUrl = "https://www.tasteofhome.com/search/index?search=appetizers&st=7&vw=1&page=&fc=Appetizers&sort=0&fc=Taste+of+Home+Magazine+Recipes"
#breakfastUrl = https://www.tasteofhome.com/collection/top-rated-breakfast-recipes/
#breakfastUrl = https://www.tasteofhome.com/collection/work-from-home-breakfasts/
#breakfasrUrl = https://www.tasteofhome.com/collection/pantry-staples-breakfast/
#dessertURL = https://www.tasteofhome.com/collection/easy-dessert-ideas/
#dessertUrl = https://www.tasteofhome.com/collection/sure-to-please-contest-winning-pies/
#drinkUrl = https://www.tasteofhome.com/collection/copycat-drink-recipes/
#drinkUrl = https://www.tasteofhome.com/collection/patio-drinks/
#drunkUrl = https://www.tasteofhome.com/collection/refreshing-strawberry-drinks/
#drinkUrl = https://www.tasteofhome.com/collection/bridal-shower-drinks/
#drinkUrl
#brunchURL = https://www.tasteofhome.com/collection/best-brunch-recipes/
#chickenURL = "https://www.tasteofhome.com/collection/marinated-chicken-recipes-that-put-your-fridge-to-work/"

Different scripts for uploading to DB using tasteofhome website info

In [32]:
#Filter load

numPages = 15

for i in range(1, numPages + 1):
    url = "https://www.tasteofhome.com/search/index?search=appetizers&st=7&vw=1&page=" + str(i) + "&fc=Appetizers&sort=0&fc=Taste+of+Home+Magazine+Recipes"
    page = requests.get(url, headers=headers).text
    soup = BeautifulSoup(page, "lxml")
    recipeURLs = [r.find('a')['href'] for r in soup.find_all("div", class_="rd_search_result_detail pull-left")]
    recipes = []
    for recipeURL in recipeURLs:
        page = requests.get(recipeURL, headers=headers).text
        soup = BeautifulSoup(page, "lxml")
        recipes.append(createRecipe(soup))
    db[courses[0]].insert_many(recipes)

In [24]:
#Listing load


collection = db["All"]
courseUrl = "https://www.tasteofhome.com/collection/marinated-chicken-recipes-that-put-your-fridge-to-work/"


page = requests.get(courseUrl, headers=headers).text
soup = BeautifulSoup(page, "lxml")
recipeUrls = [url.find('a')['href'] for url in soup.find_all('span', class_='listicle-page__cta-button')]
recipes = []
for url in recipeUrls:
    page = requests.get(url, headers=headers).text
    recipeSoup = BeautifulSoup(page, "lxml")
    recipe = createRecipe(recipeSoup)
    recipe["meal"] = None
    if recipe is not None:   
        recipes.append(recipe)
collection.insert_many(recipes)

<pymongo.results.InsertManyResult at 0x10d8c8f48>

In [31]:
collection = db["Dinner"]
# query = {"ingredients": {"$elemMatch": {"name": "half-and-half cream", "name": "butter"}}}
results = collection.find({})
collection = db["All"]
collection.insert_many(results)

<pymongo.results.InsertManyResult at 0x10c81a708>

In [21]:
# #Delete records
# collection = db["Dinner"]
# collection.delete_many({})

<pymongo.results.DeleteResult at 0x10bdcf688>