# WEB SCRAPING 

Since our project relies on a working database of recipes, we need to find recipes and be able to load them into our database. An efficient way of doing this is by web scraping online recipe sites (while giving credit, of course).

This means that whether the project has any data at all depends largely on this web scraper, and on the quality of the recipes and information it can extract.

In [102]:
import selenium
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [12]:
# Accumulates one dict per scraped recipe.
recipes = []

# Landing page whose links are the starting point of the crawl.
URL = 'https://www.allrecipes.com/recipes/1058/fruits-and-vegetables/fruits/'

# A timeout is required: without one requests.get() can wait forever on a
# stalled connection.
r = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page and silently
# ending up with nothing to crawl.
r.raise_for_status()
Overallsoup = BeautifulSoup(r.content, 'html5lib')

In [None]:
# HTML class names / element ids used to locate sections of a recipe page.

# Breadcrumb items; their text is concatenated into the cuisine path.
CUISINE_CLASS = 'comp mntl-breadcrumbs__item mntl-block'

# Element whose <tr> rows hold the nutrition facts.
NUTRITION_CLASS = "mntl-nutrition-facts-label__table-body type--cat"

# Recipe-details section: label and value elements are looked up separately.
TIMING_CLASS = 'mntl-recipe-details__content'
TIMING_LABEL = 'mntl-recipe-details__label'
TIMING_VALUE = 'mntl-recipe-details__value'

# Element id of the rating text.
RATING_ID = 'mntl-recipe-review-bar__rating_2-0'

# Image lookup: <img> classes tried first, then a fallback <img> id.
IMG_CLASS = [
    "primary-image__image",
    "mntl-primary-image--blurry",
]
IMG_ID2 = 'mntl-sc-block-image_1-0-1'

In [114]:
# Seconds to wait for any single HTTP request; requests.get() has no default
# timeout and would otherwise be able to block the crawl indefinitely.
REQUEST_TIMEOUT = 30


def _parse_recipe(soup, website):
    """Build one recipe record from a parsed recipe page.

    Args:
        soup: BeautifulSoup tree of the recipe page.
        website: URL the page was fetched from; stored under "url" and used
            in the error message when the recipe name is empty.

    Returns:
        dict with the keys recipe_name, prep_time, cook_time, total_time,
        servings, yield, ingredients, directions, rating, url, cuisine_path,
        nutrition, timing and img_src. Timing fields and the image URL are
        empty strings when the page does not provide them.

    Raises:
        LookupError: if the recipe name, ingredients or directions are empty.
        AttributeError: if the heading, ingredient list, directions block,
            rating or nutrition table is absent (``find`` returned ``None``).
    """
    # Recipe name (required).
    heading = soup.find(id=["article-heading_2-0", "article-heading_1-0"])
    recipe_name = heading.get_text().strip()
    if recipe_name == '':
        raise LookupError(f'{website} not found')

    # Ingredients (required): comma-separated list of the <li> texts.
    # The slice drops the first and last character of each item
    # (presumably the newlines surrounding the text -- TODO confirm).
    ing_list_soup = soup.find(class_="mntl-structured-ingredients__list")
    ingredient_items = [li.text[1:len(li.text) - 1]
                        for li in ing_list_soup.find_all("li")
                        if li.text != ""]
    if not ingredient_items:
        raise LookupError("Ingredients Not Found")
    ingr = ", ".join(ingredient_items)

    # Directions (required): step texts concatenated without a separator.
    # Each step loses its first character and last three characters, and the
    # joined result loses one more leading character (presumably markup
    # whitespace -- TODO confirm against a live page).
    directions_soup = soup.find(id="mntl-sc-block_2-0")
    directions = "".join(li.text[1:len(li.text) - 3]
                         for li in directions_soup.find_all("li")
                         if li.text != "")
    if directions == '':
        raise LookupError("Directions Not Found")
    directions = directions[1:]

    # Rating text.
    rating = soup.find(id=RATING_ID).get_text().strip()

    # Cuisine path: "/crumb/crumb/", leaving out the generic "Recipes" crumb.
    crumbs = [div.text.strip() for div in soup.find_all(class_=CUISINE_CLASS)]
    cuisine_path = "/" + "".join(crumb + "/"
                                 for crumb in crumbs if crumb != "Recipes")

    # Nutrition: one whitespace-normalised entry per table row.
    nutr_rows = []
    for tr in soup.find(class_=NUTRITION_CLASS).find_all("tr"):
        if tr.text.strip() == "% Daily Value *":
            continue
        cells = tr.text.split()
        # Rows holding only whitespace are skipped so they cannot leave
        # stray separators in the result.
        if cells:
            nutr_rows.append(" ".join(cells))
    nutr = ", ".join(nutr_rows)

    # Timing: pair labels with values directly.  zip() stops at the shorter
    # list, so a label/value count mismatch cannot raise, and a comma inside
    # a value cannot shift the pairing.
    labels = [div.text.strip() for div in soup.find_all(class_=TIMING_LABEL)]
    values = [div.text.strip() for div in soup.find_all(class_=TIMING_VALUE)]
    pairs = list(zip(labels, values))
    timing = ", ".join(f"{label} {value}" for label, value in pairs)
    # Key by the label text before any ":" so "Prep Time:" maps to "Prep Time".
    by_label = {label.split(":")[0].strip(): value for label, value in pairs}

    # Image URL: optional, so a page without an image keeps an empty string
    # instead of being discarded.
    img_src = ""
    img_soup = soup.find_all("img", class_=IMG_CLASS)
    img2_soup = soup.find("img", id=IMG_ID2)
    if img_soup:
        img_src = (img_soup[0].get('src') or "").strip()
    elif img2_soup is not None:
        img_src = (img2_soup.get('data-src') or "").strip()

    return {"recipe_name": recipe_name,
            "prep_time": by_label.get("Prep Time", ""),
            "cook_time": by_label.get("Cook Time", ""),
            "total_time": by_label.get("Total Time", ""),
            "servings": by_label.get("Servings", ""),
            "yield": by_label.get("Yield", ""),
            "ingredients": ingr,
            "directions": directions,
            "rating": rating,
            "url": website,
            "cuisine_path": cuisine_path,
            "nutrition": nutr,
            "timing": timing,
            "img_src": img_src}


# Walk every category linked from the landing page, then every recipe card in
# that category, appending one record per parsed recipe to `recipes`.
# Scraping is best-effort: a page that cannot be fetched or parsed is reported
# and skipped instead of aborting the crawl.
for fruit in Overallsoup.find_all(
        'a',
        attrs={'class': ['taxonomy-nodes__link mntl-text-link type--squirrel-link']}):

    fruit_url = fruit.get('href')
    try:
        fruit_request = requests.get(fruit_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f'Skipping category {fruit_url}: {exc!r}')
        continue
    fruit_soup = BeautifulSoup(fruit_request.content, 'html5lib')

    for row in fruit_soup.find_all(
            'a',
            attrs={'class': ['comp mntl-card-list-items mntl-document-card mntl-card card card--no-image']}):

        website = row.get('href')
        try:
            html_doc = requests.get(website, timeout=REQUEST_TIMEOUT).content
            soup = BeautifulSoup(html_doc, 'html.parser')
            recipes.append(_parse_recipe(soup, website))
        except Exception as exc:
            # Deliberately broad (one bad page must not stop the crawl), but
            # unlike a bare `except:` this lets Ctrl-C through and says why
            # the page was skipped.
            print(f'Skipping {website}: {exc!r}')
            continue

In [115]:
# Turn the list of recipe dicts into a table (one row per recipe, one column
# per key) and write it to disk; to_csv also writes the row index by default.
df = pd.DataFrame(recipes)
df.to_csv(path_or_buf='recipes.csv')