## Extracting Data from Web Pages Using Beautiful Soup

In [1]:
import re
from typing import Any, cast
import json
import os
from tqdm import tqdm

from bs4 import BeautifulSoup, Tag

In [2]:
# Directory scanned for the ``recipes_batch_<n>.json`` input files.
base_dir = 'data/raw'
# Directory the generated Markdown batch files are written into.
out_path = 'data'
# Parser name handed to BeautifulSoup when parsing each recipe's HTML.
bs_parser = 'html.parser'

### Helper Functions

In [3]:
def clean(text: Any) -> str:
    """Convert ``text`` to a string and normalise its whitespace.

    Args:
        text: ``None``, a string, a BeautifulSoup ``Tag`` (its rendered text
            is used) or any other object (converted with ``str``).

    Returns:
        The cleaned string: non-breaking and hair spaces become regular
        spaces, zero-width spaces are dropped, runs of blank lines collapse
        to a single blank line, spaces before a newline are removed and the
        result is stripped. ``None`` yields an empty string.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        # Tags contribute their rendered text; anything else is stringified.
        text = text.get_text() if isinstance(text, Tag) else str(text)
    # Explicit escapes keep these invisible characters readable in the source.
    text = text.replace("\u00a0", " ").replace("\u200b", "").replace("\u200a", " ")
    # Collapse any run of blank (or whitespace-only) lines to one blank line.
    text = re.sub(r"(\n\s*)+\n", "\n\n", text)
    # Drop spaces left dangling at the end of a line.
    text = re.sub(r" +\n", "\n", text)
    return text.strip()

In [4]:
def get_general_info(soup, link):
    """Collect the headline facts of a recipe page.

    Args:
        soup: Parsed recipe page exposing ``select`` / ``select_one``.
        link: URL of the recipe; copied into the result unchanged.

    Returns:
        A dict with the keys ``link``, ``title``, ``prep_time``,
        ``cook_time``, ``difficulty``, ``serves``, ``description`` and
        ``diet_type``. Any field whose element is absent from the page gets
        a placeholder string instead.
    """

    def text_or(selector, fallback):
        # Cleaned text of the first match, or the placeholder when absent.
        node = soup.select_one(selector)
        return clean(node.get_text()) if node else fallback

    # Prep and cook time share the same markup; they are told apart by order.
    times = soup.select("ul.recipe__cook-and-prep li span time")

    info = {"link": link}
    info["title"] = text_or(".heading-1", "No Title")
    info["prep_time"] = clean(times[0].get_text()) if times else "No Prep Time"
    info["cook_time"] = clean(times[1].get_text()) if len(times) > 1 else "No Cook Time"
    info["difficulty"] = text_or(
        "ul.recipe__cook-and-prep li:nth-child(2) .icon-with-text__children",
        "No Difficulty",
    )
    info["serves"] = text_or(
        "ul.recipe__cook-and-prep li:nth-child(3) .icon-with-text__children",
        "No Serves",
    )
    info["description"] = text_or(".post-header__description", "No Description")
    info["diet_type"] = text_or(".terms-icons-list__text", "None")
    return info

In [5]:
# Helper function to obtain nutritional info from recipes
def get_nutrition_info(soup):
    """Read the nutrition table of a recipe page into a dict.

    Args:
        soup: Parsed recipe page exposing ``select``.

    Returns:
        A dict mapping each nutrient label to its value, both stripped of
        surrounding whitespace. Rows that lack either the key cell or the
        value cell are skipped; a page without the table yields ``{}``.
    """
    nutrition_values = {}

    for row in soup.select('tbody.key-value-blocks__batch tr.key-value-blocks__item'):
        key_cell = row.select_one('td.key-value-blocks__key')
        value_cell = row.select_one('td.key-value-blocks__value')
        # select_one returns None for a missing cell; skip incomplete rows
        # instead of failing on ``None.text``.
        if key_cell is None or value_cell is None:
            continue
        nutrition_values[key_cell.text.strip()] = value_cell.text.strip()

    return nutrition_values

In [6]:
def get_instructions(soup):
    """Extract the ingredient lines of a recipe page.

    Despite its name, this function returns ingredients (not method steps).

    Args:
        soup: Parsed recipe page exposing ``find``.

    Returns:
        A flat list of strings. For every sub-section of the
        ``recipe__ingredients`` section, the sub-section heading (``<h3>``,
        e.g. 'For the dressing'), when present, comes first and is followed
        by the text of each ``<li>`` in that sub-section. Returns an empty
        list when the page has no ingredients section.
    """
    ingredients_list = []

    # Find the section with ingredients
    ingredients_section = soup.find('section', class_='recipe__ingredients')
    if not ingredients_section:
        return ingredients_list

    for subsection in ingredients_section.find_all('section'):
        # Emit the group heading once, ahead of its items, so items are not
        # listed twice and no lookup of a following <ul> is required.
        heading = subsection.find('h3')
        if heading:
            ingredients_list.append(heading.get_text(strip=True))
        ingredients_list.extend(
            item.get_text(strip=True) for item in subsection.find_all('li')
        )

    return ingredients_list


In [7]:
def extract_methods(soup):
    """Extract the method (preparation) steps of a recipe page.

    Args:
        soup: Parsed recipe page exposing ``find``.

    Returns:
        A list with the stripped text of each step, in page order. Steps
        without an ``editor-content`` block are skipped. Returns an empty
        list when the page has no ``recipe__method-steps`` section.
    """
    # Find the section with methods
    methods_section = soup.find('section', class_='recipe__method-steps')
    if not methods_section:
        return []

    methods_list = []
    for step in methods_section.find_all('li', class_='list-item'):
        content = step.find('div', class_='editor-content')
        # find returns None when the step has no content block; skip it
        # rather than failing on ``None.get_text``.
        if content is None:
            continue
        methods_list.append(content.get_text(strip=True))

    return methods_list

In [8]:
def process_recipe(recipe):
    """Parse one scraped recipe record into its structured parts.

    Args:
        recipe: Mapping with the page markup under ``'html'`` and the page
            address under ``'url'``.

    Returns:
        A dict with the keys ``general_info``, ``nutrition_info``,
        ``instructions`` and ``methods``, each filled by the matching
        extraction helper.
    """
    parsed = BeautifulSoup(recipe['html'], bs_parser)
    return {
        "general_info": get_general_info(parsed, recipe['url']),
        "nutrition_info": get_nutrition_info(parsed),
        "instructions": get_instructions(parsed),
        "methods": extract_methods(parsed),
    }

In [9]:
def process_recipes(data):
    """Run ``process_recipe`` over every record, showing a progress bar.

    Args:
        data: Iterable of scraped recipe records.

    Returns:
        A list with one processed recipe per input record, in input order.
    """
    return [
        process_recipe(item)
        for item in tqdm(data, desc="Processing recipes", unit="recipe")
    ]

In [10]:
def _recipe_to_markdown(recipe):
    """Render one processed recipe as a Markdown section ending in a rule."""
    info = recipe['general_info']
    parts = [
        f"## {info['title']}\n",
        f"**Link:** {info['link']}\n\n",
        f"**Description:** {info['description']}\n\n",
        f"**Prep Time:** {info['prep_time']}\n\n",
        f"**Cook Time:** {info['cook_time']}\n\n",
        f"**Difficulty:** {info['difficulty']}\n\n",
        f"**Serves:** {info['serves']}\n\n",
        f"**Diet Type:** {info['diet_type']}\n\n",
        "### Nutrition Information\n",
    ]
    parts.extend(
        f"- **{key}:** {value}\n" for key, value in recipe['nutrition_info'].items()
    )
    parts.append("\n### Ingredients\n")
    parts.extend(f"- {ingredient}\n" for ingredient in recipe['instructions'])
    parts.append("\n### Method\n")
    parts.extend(f"- {step}\n" for step in recipe['methods'])
    parts.append("\n\n---\n\n")
    return "".join(parts)


def save_recipes_to_markdown(recipes, batch_size=30):
    """Write the processed recipes to Markdown files, ``batch_size`` per file.

    Files are named ``recipes_batch_<n>.md`` (numbered from 1) and written
    into ``out_path``; an existing file with the same name is overwritten.

    Args:
        recipes: Sequence of processed recipes (dicts with the keys
            ``general_info``, ``nutrition_info``, ``instructions`` and
            ``methods``).
        batch_size: Maximum number of recipes written to a single file.
    """
    for start in tqdm(range(0, len(recipes), batch_size), desc="Guardando recetas en Markdown", unit="batch"):
        batch = recipes[start:start + batch_size]
        batch_number = start // batch_size + 1
        # Explicit UTF-8 so non-ASCII recipe text does not depend on the
        # platform's default encoding.
        with open(f'{out_path}/recipes_batch_{batch_number}.md', 'w', encoding='utf-8') as f:
            f.write("".join(_recipe_to_markdown(recipe) for recipe in batch))


In [11]:
def load_json_files(base_dir):
    """Load and concatenate every ``recipes_batch_<n>.json`` file in a folder.

    Only file names that match the pattern exactly are read, and they are
    processed in ascending batch-number order so the result is
    deterministic. Progress is printed roughly at the 1/25/50/75/100 %
    marks.

    Args:
        base_dir: Directory containing the JSON batch files.

    Returns:
        A single list holding the items of every batch file, in batch
        order. Returns an empty list when no file matches.
    """
    pattern = re.compile(r'recipes_batch_(\d+)\.json')
    batches = []
    for name in os.listdir(base_dir):
        # fullmatch rejects look-alikes such as 'recipes_batch_1.json.bak'.
        found = pattern.fullmatch(name)
        if found:
            batches.append((int(found.group(1)), name))
    # os.listdir gives no ordering guarantee; sort numerically by batch.
    batches.sort()

    all_data = []
    total_files = len(batches)
    progress_points = [1, 25, 50, 75, 100]

    for i, (_, filename) in enumerate(batches):
        file_path = os.path.join(base_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            all_data.extend(json.load(file))

        progress = (i + 1) / total_files * 100
        if any(progress >= p for p in progress_points):
            print(f"Processing files: {int(progress)}% completed")
            # Drop every milestone already reached so each prints only once.
            progress_points = [p for p in progress_points if progress < p]

    return all_data


In [12]:
# Load data from all JSON files
data = load_json_files(base_dir)

# Process recipes
recipes = process_recipes(data)

# Save recipes in Markdown files
save_recipes_to_markdown(recipes)

Processing files: 2% completed
Processing files: 26% completed
Processing files: 50% completed
Processing files: 76% completed
Processing files: 100% completed


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f033f0fe360>>
Traceback (most recent call last):
  File "/home/acrisvall/recipes_rag/.venv/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


KeyboardInterrupt: 