<a href="https://colab.research.google.com/github/bobby-py2002/FoodVision/blob/main/scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

scrape.py



In [None]:
import requests
from bs4 import BeautifulSoup
import json

def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    ``str.split()`` without arguments splits on any whitespace (newlines
    included) and discards leading/trailing whitespace, so re-joining the
    pieces with one space is the complete normalisation.

    Args:
        text: Raw string to normalise; may be ``None`` or empty.

    Returns:
        The normalised string, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""
    return ' '.join(text.split())

def scrape_recipe(url):
    """Download one recipe page and extract its name, ingredients and steps.

    Args:
        url: Address of the recipe page to fetch.

    Returns:
        A dict with the keys ``'name'`` (the stripped heading text, or ``None``
        when the page has no ``<h1>``), ``'ingredients'`` and
        ``'instructions'`` (lists of cleaned strings). ``None`` is returned
        when the request or the parsing fails; the error is printed.
    """
    try:
        response = requests.get(url, timeout=30)
        # raise_for_status() returns None on success, so it is called once
        # for its side effect only; there is nothing useful to print.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Try the most specific title selectors first, then fall back to any <h1>.
        name = (
            soup.find('h1', class_='recipe-title')
            or soup.select_one('h1[data-testid="recipe-title"]')
            or soup.find('h1')
        )
        if name is not None:
            name = name.text.strip()

        ingredients = [clean_text(ing.text) for ing in soup.select('.ingredient-list li')]
        instructions = [clean_text(ins.text) for ins in soup.select('.direction-list li')]
        return {
            'name': name,
            'ingredients': ingredients,
            'instructions': instructions,
        }
    except requests.exceptions.Timeout:
        print(f"🕒 Timeout: {url} (server too slow)")
        return None
    except requests.exceptions.HTTPError as e:
        print(f"🚨 HTTP {e.response.status_code} error: {url}")
        return None
    except Exception as e:
        print(f"💥 Unexpected error on {url}: {str(e)}")
        return None


recipes = []
urls = ['https://www.food.com/recipe/bagel-french-toast-casserole-362199']

for url in urls:
    try:
        recipe = scrape_recipe(url)
    except Exception as e:
        print(f"Error: {e}")
        continue
    # scrape_recipe() reports failure by returning None; keep those out of
    # the list so the JSON file contains only real recipes, never `null`.
    if recipe is not None:
        recipes.append(recipe)

# Write the successfully scraped recipes as pretty-printed UTF-8 JSON.
with open('/home/recipes.json', 'w', encoding='utf-8') as f:
    json.dump(recipes, f, indent=2, ensure_ascii=False)


None


In [None]:
import json
def get_recipe_links():
    """Collect recipe links from the food.com top-breakfast-recipes page.

    Returns:
        A sorted list of the distinct ``href`` values that contain
        ``/recipe/`` (compared case-insensitively). An empty list is
        returned, and the error printed, when the page cannot be fetched
        or parsed, so callers always receive a list.
    """
    try:
        response = requests.get(
            "https://www.food.com/ideas/top-breakfast-recipes-6935#c-796349",
            timeout=30,  # never hang indefinitely on an unresponsive server
        )
        # Do not harvest links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        links = {
            a['href'] for a in soup.find_all('a', href=True)
            if '/recipe/' in a['href'].lower()
        }
        # Sorting makes the output stable from run to run (set order is not).
        return sorted(links)
    except Exception as e:
        print(f"LINK ERROR : {e}")
        return []

# Persist the collected recipe links as indented JSON.
with open('/home/links.json', 'w', encoding='utf-8') as links_file:
    recipe_links = get_recipe_links()
    json.dump(recipe_links, links_file, indent=2)


In [None]:
import requests
import gzip
import io
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import time
import json
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm  # Progress bar library

# Configuration
# Headers sent with each request: a browser-like User-Agent, and an
# Accept-Encoding value advertising gzip support.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept-Encoding': 'gzip',
}
# Gzipped sitemap files sitemap-1.xml.gz through sitemap-24.xml.gz.
TEST_SITEMAPS = [
    f"https://www.food.com/sitemap-{index}.xml.gz" for index in range(1, 25)
]
REQUEST_TIMEOUT = (3, 5)  # 3s connect, 5s read

def clean_text(text):
    """Normalise whitespace in recipe text; falsy input yields an empty string."""
    if not text:
        return ""
    # split() with no separator already drops newlines and surrounding
    # whitespace, so joining the pieces with single spaces is sufficient.
    words = text.split()
    return ' '.join(words)

def extract_urls(sitemap_url):
    """Get recipe URLs from a single sitemap.

    Args:
        sitemap_url: Address of a (normally gzip-compressed) XML sitemap.

    Returns:
        The text of every XML element whose text contains ``/recipe/``.
        An empty list is returned, and a warning printed, if the download,
        decompression or XML parsing fails.
    """
    try:
        response = requests.get(sitemap_url, headers=HEADERS, timeout=10)
        # Fail fast on 4xx/5xx instead of trying to gunzip an error page.
        response.raise_for_status()

        payload = response.content
        # The HTTP layer may already have decoded a gzip Content-Encoding,
        # leaving plain XML; only decompress when the gzip magic number
        # is actually present.
        if payload[:2] == b"\x1f\x8b":
            payload = gzip.decompress(payload)

        root = ET.fromstring(payload)
        return [elem.text for elem in root.iter()
                if elem.text and "/recipe/" in elem.text]
    except Exception as e:
        print(f"⚠️ Sitemap error ({sitemap_url}): {type(e).__name__}")
        return []

def scrape_recipe(url):
    """Scrape a single recipe with fault tolerance.

    Args:
        url: Address of the recipe page to fetch.

    Returns:
        A dict with the keys ``'name'``, ``'url'``, ``'ingredients'`` and
        ``'instructions'``. ``None`` is returned when anything goes wrong;
        the failure is reported rather than silently discarded.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        # Extract data with fallbacks: most specific title selector first.
        title_tag = (soup.find('h1', class_='recipe-title') or
                     soup.select_one('h1[data-testid="recipe-title"]') or
                     soup.find('h1'))
        name = clean_text(title_tag.get_text()) if title_tag else "Untitled Recipe"

        # Use the first selector that matches anything.
        ingredients = []
        for selector in ('.structured-ingredients__list li', '.ingredient-list li'):
            ingredients = [clean_text(ing.get_text())
                           for ing in soup.select(selector)]
            if ingredients:
                break

        instructions = []
        for selector in ('.recipe__steps-content li', '.direction-list li'):
            instructions = [clean_text(step.get_text())
                            for step in soup.select(selector)]
            if instructions:
                break

        return {
            'name': name,
            'url': url,
            'ingredients': ingredients,
            'instructions': instructions
        }
    except Exception as e:
        # Report the failure so systematic problems (e.g. a missing parser or
        # blocked requests) are visible; tqdm.write keeps an active progress
        # bar intact while printing.
        tqdm.write(f"⚠️ Failed to scrape {url}: {type(e).__name__}: {e}")
        return None

def main():
    """Run the full pipeline: collect recipe URLs, scrape them, save JSON.

    URLs are gathered from every sitemap in ``TEST_SITEMAPS`` in parallel,
    each one is scraped with ``scrape_recipe`` on a pool of five worker
    threads, and the successful results are written to
    ``/home/recipes.json``. Nothing is written when no recipe succeeds.
    """
    print("🚀 Starting scrape...")

    # Step 1: Get URLs from sitemaps (parallel)
    print("🔍 Extracting recipe URLs from sitemaps...")
    with ThreadPoolExecutor(max_workers=5) as executor:
        # The set union removes duplicates across sitemaps; sorting makes the
        # scrape (and therefore the output) order reproducible between runs.
        all_urls = sorted(set().union(*executor.map(extract_urls, TEST_SITEMAPS)))

    print(f"📊 Found {len(all_urls)} recipes to scrape")
    if not all_urls:
        # Nothing was attempted, so reporting that every recipe "failed"
        # would be misleading.
        print("\n⚠️ No recipe URLs found; nothing to scrape.")
        return

    # Step 2: Scrape recipes with progress bar
    recipes = []
    failed = 0
    with tqdm(total=len(all_urls), desc="Scraping Recipes") as pbar:
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = []
            for url in all_urls:
                future = executor.submit(scrape_recipe, url)
                # Advance the bar as each download finishes, regardless of
                # the order in which results are collected below.
                future.add_done_callback(lambda _: pbar.update(1))
                futures.append(future)

            # Collect in submission order so the output follows all_urls.
            for future in futures:
                result = future.result()
                if result:
                    recipes.append(result)
                else:
                    failed += 1
                pbar.set_postfix({"Success": len(recipes), "Failed": failed})

    # Save results
    if recipes:
        with open('/home/recipes.json', 'w', encoding='utf-8') as f:
            json.dump(recipes, f, indent=2, ensure_ascii=False)
        print(f"\n✨ Saved {len(recipes)} recipes ({failed} failed)")
    else:
        print("\n💥 All recipes failed!")

# Start the scrape only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()