In [3]:
import json
import time
import random
import pathlib
import urllib.request

import bs4
from tqdm import tqdm

In [10]:
# Fetch slugs
# Query the "recent recipes" listing endpoint 80 times and collect the slug of
# every returned item whose type is 'recipe'. The result is saved to
# slugs.json so later cells do not need to repeat the crawl.
#
# NOTE(review): `from` advances by 1 per request while `size` is 20 and `page`
# is `idx // 20` - confirm against the API that consecutive requests do not
# return overlapping items (i.e. that `slugs` contains no duplicates).
REQUEST_TIMEOUT_S = 30  # give up on a stalled connection instead of hanging forever

targets = [
    'https://tasty.co/api/recipes/recent?size=20&from={}&page={}&from_offset=1&__amp_source_origin=https%3A%2F%2Ftasty.co'.format(idx, idx // 20)
    for idx in range(80)
]

slugs = []
for url in tqdm(targets):
    # urlopen accepts the URL string directly; no Request wrapper is needed
    # because no custom headers are set.
    with urllib.request.urlopen(url, timeout=REQUEST_TIMEOUT_S) as response:
        data = json.loads(response.read())
    # The listing mixes item types; only entries typed 'recipe' are kept.
    slugs.extend(item['slug'] for item in data['items'] if item['type'] == 'recipe')
    # Random 2-6 second pause between requests.
    time.sleep(random.randint(2, 6))

# ensure_ascii=False may emit non-ASCII characters, so the encoding is pinned
# instead of relying on the platform's locale default.
with open('slugs.json', 'w', encoding='utf-8') as f:
    json.dump(slugs, f, indent=2, ensure_ascii=False)

100%|██████████| 80/80 [06:32<00:00,  4.90s/it]


In [4]:
# Reload the slug list from slugs.json (written by the slug-fetching cell) and
# report how many entries it holds.
slugs = json.loads(pathlib.Path('slugs.json').read_text())
print('#slugs:', len(slugs))

# Output directory used by the following cells for the downloaded pages;
# created on first use, left alone if it already exists.
dst_dir = pathlib.Path('./data/')
dst_dir.mkdir(exist_ok=True)

#slugs: 1266


In [15]:
# Fetch html
# Download the page of every slug and store it, prettified, as
# <dst_dir>/<index>.html, where <index> is the slug's zero-padded position in
# `slugs`.
#
# The loop is resumable: a page whose file already exists is skipped, so an
# interrupted run can simply be re-executed. Delete the file (or the whole
# directory) to force a fresh download.
for i, slug in enumerate(tqdm(slugs)):
    dst_path = dst_dir / '{:05d}.html'.format(i)
    if dst_path.exists():
        continue

    url = 'https://tasty.co/recipe/{}'.format(slug)
    # The timeout keeps one stalled connection from blocking the whole crawl.
    with urllib.request.urlopen(url, timeout=30) as response:
        dom = bs4.BeautifulSoup(response.read(), 'lxml')

    # Write to a temporary sibling first and rename afterwards, so a crash
    # mid-write never leaves a truncated .html that the skip check above would
    # mistake for a finished download. The encoding is pinned because the
    # pages contain non-ASCII characters and the locale default is not
    # guaranteed to be able to represent them.
    tmp_path = dst_path.with_name(dst_path.name + '.part')
    with tmp_path.open('w', encoding='utf-8') as f:
        f.write(dom.prettify())
    tmp_path.replace(dst_path)

    # Random 2-6 second pause between requests.
    time.sleep(random.randint(2, 6))

100%|██████████| 1266/1266 [1:38:46<00:00,  4.68s/it]


In [9]:
# Extract data
# Parse every downloaded page in dst_dir and write a JSON record next to it
# (same file name, .json suffix) holding the slug, page title, and the
# lower-cased ingredient and preparation-step texts with their counts.
html_paths = sorted(list(dst_dir.glob('*.html')))
for path in tqdm(html_paths):
    # The file name encodes the slug's index in `slugs`. Using it, rather than
    # the position in the sorted listing, keeps records correctly labelled
    # even when some pages are missing from the directory.
    if not path.stem.isdecimal():
        continue  # not a file produced by the download cell
    i = int(path.stem)

    # Hand BeautifulSoup the raw bytes and let it detect the encoding, instead
    # of decoding with whatever the platform's locale default happens to be.
    dom = bs4.BeautifulSoup(path.read_bytes(), 'lxml')
    step = [elem.text.strip().lower() for elem in dom.select('.prep-steps > li')]
    ingr = [elem.text.strip().lower() for elem in dom.select('div.ingredients__section li')]

    # A page without a <title>, or whose title has no single string, yields
    # None here instead of aborting the whole extraction run.
    title_text = dom.title.string if dom.title is not None else None
    title = title_text.strip() if title_text is not None else None

    data = {
        'slug': slugs[i],
        'title': title,
        'n_step': len(step),
        'n_ingr': len(ingr),
        'ingr': ingr,
        'step': step,
    }
    # ensure_ascii=False writes non-ASCII characters verbatim, so the encoding
    # is pinned to UTF-8.
    with path.with_suffix('.json').open('w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

100%|██████████| 1266/1266 [01:04<00:00, 19.66it/s]


In [10]:
# Print the first JSON record in ./data to spot-check the extracted fields.
!cat ./data/00000.json

{
  "slug": "tortilla-bowl-southwestern-salad",
  "title": "Tortilla Bowl Southwestern Salad Recipe by Tasty",
  "n_step": 12,
  "n_ingr": 16,
  "ingr": [
    "4 teaspoons vegetable oil",
    "4  large flour tortillas",
    "2  romaine lettuce hearts",
    "2  tomatoes",
    "½  red onion",
    "2  avocados",
    "1 cup (175 g) corn, canned, rinsed and drained",
    "1 cup (170 g) black beans, canned, rinsed and drained",
    "¼ cup (60 ml) olive oil",
    "¼ cup (60 ml) lime juice",
    "1 clove garlic, minced",
    "⅛ teaspoon cumin",
    "½ teaspoon red pepper flakes",
    "3 tablespoons fresh cilantro, chopped",
    "½ teaspoon salt",
    "½ teaspoon pepper"
  ],
  "step": [
    "preheat the oven to 350°f (180°c).",
    "pour the vegetable oil (1 teaspoon per bowl) into medium (1.2 quart) oven-proof bowls and rub around to coat the surface. press each tortilla into a greased bowl.",
    "bake for about 10 minutes, until golden brown. let the tortilla bowls