In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import selenium
from selenium import webdriver
import subprocess
from pathlib import Path
import time
import sys

In [13]:
def tasty_scrape(search, rec_type="recipe"):
    '''
    scrapes tasty.co for recipes matching *search*
    RETURNS DataFrame of:
        'href': 
            rec_type=="all" ---> all recipe links on page
            rec_type=="recipe" ---> all singular recipes on page
            rec_type=="compilation" ---> all recipe compilations on page
        'title':
            title of link
    '''
    # selenium driver setup
    print("retrieving webpage...")
    options = webdriver.chrome.options.Options()
    options.add_argument("--headless")
    node_modules_bin = subprocess.run(
        ["npm", "bin"],
        stdout=subprocess.PIPE,
        universal_newlines=True,
        check=True
    )
    node_modules_bin_path = node_modules_bin.stdout.strip()
    chromedriver_path = Path(node_modules_bin_path) / "chromedriver"

    driver = selenium.webdriver.Chrome(
        options=options,
        executable_path=str(chromedriver_path),
    )
    driver.implicitly_wait(1)
    url = "https://tasty.co/search?q=" + search
    driver.get(url)

    # click "see more" button repeatedly
    print("clicking buttons...")
    bttndiv = driver.find_elements_by_xpath("//div[@class='show-more']")
    while (bttndiv):
        bttn = bttndiv[0].find_elements_by_tag_name('button')[0]
        bttn.click()
        time.sleep(0.2)
        bttndiv = driver.find_elements_by_xpath("//div[@class='show-more']")

    # compile all recipe data
    print("getting all links...")
    soup = BeautifulSoup(driver.page_source, "html.parser")
    all_links = soup.find_all('a', class_='feed-item')
    data = {
        'href': [],
        'title': [],
        'rectype': [],
    }
    for link in all_links:
        ttl = link.find('div', class_='feed-item__title').text
        href = link['href']
        rtype = href.split('/')[1]
        if rec_type == 'all' or rtype == rec_type:
            url = "https://tasty.co" + href
            data['href'].append(url)
            data['title'].append(ttl)
            data['rectype'].append(rtype)

    return pd.DataFrame(data)

In [6]:
def tasty_recipe_scraper(url):
    '''
    INPUT: url of recipe to be scraped
    OUTPUT: dictionary of recipe's info
    '''
    recipe_page = requests.get(url)
    soup = BeautifulSoup(recipe_page.text, 'html.parser')
    info = {
        'title': '',
        'link': url,
        'score': None,
        'total_time': None,
        'prep': None,
        'cook': None,
        'num_ing': 0,
        'num_steps': 0
    }
    title_element = soup.find('h1', class_='recipe-name')
    if title_element:
        info['title'] = title_element.text
    scoretext = soup.find('h4', class_='tips-score-heading')
    if scoretext:
        info['score'] = int(scoretext.text.split('%')[0])  
    times = soup.find_all('div', class_='recipe-time')
    if times:
        info['total_time'] = times[0].find('p').text
        info['prep'] = times[1].find('p').text
        info['cook'] = times[2].find('p').text
    ingredients = soup.find_all('li', class_='ingredient')
    info['num_ing'] = len(ingredients)
    steps = soup.find('ol', class_='prep-steps').find_all('li')
    info['num_steps'] = len(steps)
    return info

In [7]:
def tasty_compilation_scraper(url):
    '''
    INPUT: URL for a recipe compilation page
    OUTPUT: DataFrame with recipe info 
            compatible with scrape_all_recipes.all_recipe_info
    '''
    all_recipe_info = pd.DataFrame({
        'title': [],
        'link': [],
        'score': [],
        'total_time': [],
        'prep': [],
        'cook': [],
        'num_ing': [],
        'num_steps': []
    })
    comp_page = requests.get(url)
    soup = BeautifulSoup(comp_page.text, 'html.parser')
    all_recipes = soup.find_all('a', class_='feed-item')
    for recipe in all_recipes:
        url = "https://tasty.co" + recipe['href'] 
        info = tasty_recipe_scraper(url)
        all_recipe_info = all_recipe_info.append(info, ignore_index=True)
    return all_recipe_info

In [8]:
def scrape_all_comps(compdf):
    '''
    TODO: scrape all compilations in compdf
    scrapes each compilation page in DataFrame compdf
    returns DataFrame with all recipe info
    '''
    all_recipe_info = pd.DataFrame({
        'title': [],
        'link': [],
        'score': [],
        'total_time': [],
        'prep': [],
        'cook': [],
        'num_ing': [],
        'num_steps': []
    })
    for index, row in compdf.iterrows():
        info = tasty_compilation_scraper(row['href'])
        all_recipe_info = all_recipe_info.append(info, ignore_index=True)
    return all_recipe_info

In [12]:
def scrape_all_types(alldf):
    '''
    TODO: scrape all types of recipes in alldf
    scrapes each compilation or recipe page in DataFrame alldf
    returns DataFrame with all recipe info
    '''
    all_recipe_info = pd.DataFrame({
        'title': [],
        'link': [],
        'score': [],
        'total_time': [],
        'prep': [],
        'cook': [],
        'num_ing': [],
        'num_steps': []
    })
    for index, row in alldf.iterrows():
        if row['rectype'] == 'recipe':
            info = tasty_recipe_scraper(row['href'])
            all_recipe_info = all_recipe_info.append(info, ignore_index=True)
        elif row['rectype'] == 'compilation':
            info = tasty_compilation_scraper(row['href'])
            all_recipe_info = all_recipe_info.append(info, ignore_index=True)
        info = tasty_compilation_scraper(row['href'])
        all_recipe_info = all_recipe_info.append(info, ignore_index=True)
    return all_recipe_info

In [10]:
def scrape_all_recipes(recipedf):
    '''
    scrapes each recipe page in DataFrame recipedf
    returns DataFrame with all recipe info
    '''
    all_recipe_info = pd.DataFrame({
        'title': [],
        'link': [],
        'score': [],
        'total_time': [],
        'prep': [],
        'cook': [],
        'num_ing': [],
        'num_steps': []
    })
    for index, row in recipedf.iterrows():
        info = tasty_recipe_scraper(row['href'])
        all_recipe_info = all_recipe_info.append(info, ignore_index=True)
    return all_recipe_info

In [16]:
def tasty_recipes(search, rectype='recipe', sortcol=None):
    '''
    wrapper function for all tasty scraping
    searches tasty.co for all recipes related to search
    returns a duplicate-free DataFrame of all rec_type recipes,
        sorted on sortcol if provided
    rectype: {'recipe', 'compilation', 'all'}
    sortcol: {'title', 'link', 'score', 'total_time', 'prep', 'cook', 'num_ing', 'num_steps'}
    '''
    recipedf = scrape_all_types(tasty_scrape(search, rec_type=rectype))
    recipedf.drop_duplicates(subset="link", keep="first", inplace=True)
    if sortcol:
        recipedf.sort_values(sortcol, inplace=True)

In [14]:
all_recipes = scrape_all_types(tasty_scrape('brownie', rec_type='all'))
print(all_recipes.info(verbose=True))

retrieving webpage...
clicking buttons...
getting all links...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452 entries, 0 to 451
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       452 non-null    object 
 1   link        452 non-null    object 
 2   score       441 non-null    float64
 3   total_time  61 non-null     object 
 4   prep        61 non-null     object 
 5   cook        61 non-null     object 
 6   num_ing     452 non-null    float64
 7   num_steps   452 non-null    float64
dtypes: float64(3), object(5)
memory usage: 28.4+ KB
None


In [70]:
info = tasty_recipe_scraper("https://tasty.co/recipe/brownie-truffle-stuffed-strawberries")
print(info)

{'title': 'Brownie Truffle-Stuffed Strawberries', 'link': 'https://tasty.co/recipe/brownie-truffle-stuffed-strawberries', 'score': 92, 'total_time': '2 hr 5 min', 'prep': '10 minutes', 'cook': '25 minutes', 'num_ing': 13, 'num_steps': 11}


In [65]:
times = soup.find_all('div', class_='recipe-time')
print(times)

[<div class="recipe-time xs-col-12 xs-pr3 md-pr2"><h5 class="extra-bold xs-mb05">Total<!-- --> Time</h5><p class="xs-text-4 xs-hide md-block">2 hr 5 min</p><p class="xs-text-4 md-hide">2 hr 5 min</p></div>, <div class="recipe-time xs-col-12 xs-px3 md-px2"><h5 class="extra-bold xs-mb05">Prep<!-- --> Time</h5><p class="xs-text-4 xs-hide md-block">10 minutes</p><p class="xs-text-4 md-hide">10 min</p></div>, <div class="recipe-time xs-col-12 xs-pl3 md-pl2"><h5 class="extra-bold xs-mb05">Cook<!-- --> Time</h5><p class="xs-text-4 xs-hide md-block">25 minutes</p><p class="xs-text-4 md-hide">25 min</p></div>, <div class="recipe-time xs-col-12 xs-pr3 md-pr2"><h5 class="extra-bold xs-mb05">Total<!-- --> Time</h5><p class="xs-text-4 xs-hide md-block">2 hr 5 min</p><p class="xs-text-4 md-hide">2 hr 5 min</p></div>, <div class="recipe-time xs-col-12 xs-px3 md-px2"><h5 class="extra-bold xs-mb05">Prep<!-- --> Time</h5><p class="xs-text-4 xs-hide md-block">10 minutes</p><p class="xs-text-4 md-hide">10

In [34]:
allrecipes = driver.find_elements_by_xpath("//a[@class='feed-item']")
print(len(allrecipes))

119


In [11]:
url = "https://bsmalladi.github.io/"
mypage = requests.get(url)
soup = BeautifulSoup(mypage.text, 'html.parser')
resume = soup.find('a', class_='nava-resume')
resumeurl = url + resume['href'].split('/')[1]
myresume = requests.get(resumeurl)

In [12]:
myresume

<Response [200]>