In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
import time
from datetime import datetime
from requests.exceptions import RequestException
import xml.etree.ElementTree as ET

In [8]:
import pandas as pd
import re
import xml.etree.ElementTree as ET
from datetime import datetime

class DataProcessor:
    """Helpers for cleaning scraped nutritional data held in pandas DataFrames.

    Every method is static; the class only acts as a namespace.
    """

    def __init__(self):
        """Initialize the DataProcessor class."""
        pass  # No instance variables needed, as all methods are static

    @staticmethod
    def clean_nutrition_data(df):
        """
        Clean the recipe nutrition columns of ``df`` in place.

        Every object-dtype column is passed through :meth:`clean_value`,
        except the columns at positions 0, 1, 3, 4, 5 and 10 and the
        descriptive columns listed in ``excluded_columns``.

        Args:
            df (pd.DataFrame): DataFrame containing nutritional data.

        Returns:
            pd.DataFrame: The same DataFrame object, with cleaned columns.
        """
        excluded_indexes = {0, 1, 3, 4, 5, 10}
        excluded_columns = {
            'Fogás', 'Konyha', 'Nehézség', 'Elkészítési idő', 'Szakács elkészítette',
            'Speciális étrendek', 'Vegetáriánus', 'Alkalom', 'Költség egy főre',
            'Konyhatechnológia',
        }

        for index, col in enumerate(df.columns):
            if index in excluded_indexes or col in excluded_columns:
                continue
            if df[col].dtype == 'object':
                df[col] = df[col].apply(DataProcessor.clean_value)

        return df

    @staticmethod
    def clean_nutrition_data_ingredients(df):
        """
        Clean the ingredient nutrition columns of ``df`` in place.

        Every object-dtype column except the three descriptive ones
        ('Alapanyag neve', 'Elsődleges kategória', 'Másodlagos kategória')
        is passed through :meth:`clean_value`.

        Args:
            df (pd.DataFrame): The dataframe containing nutritional data.

        Returns:
            pd.DataFrame: The same DataFrame object, with cleaned columns.
        """
        excluded_columns = {'Alapanyag neve', 'Elsődleges kategória', 'Másodlagos kategória'}

        for col in df.columns:
            if col in excluded_columns:
                continue  # Skip descriptive (non-numeric) columns
            if df[col].dtype == 'object':
                # The helper must be referenced through the class: a bare
                # ``clean_value`` is not in scope inside a static method.
                df[col] = df[col].apply(DataProcessor.clean_value)
        return df

    @staticmethod
    def clean_value(value):
        """
        Convert a single "<number> <unit>" string to a float.

        Recognised units are 'mg' (divided by 1000), 'µg' and 'micro'
        (divided by 1,000,000), and 'g', 'perc', '°C' (number kept as is).
        Matching is case-insensitive.

        Args:
            value: The value to clean.

        Returns:
            float when the value is a string holding a number and a known
            unit; the stripped, lower-cased string when it is a string that
            cannot be converted; the value itself when it is not a string.
        """
        if not isinstance(value, str):
            return value

        value = value.strip().lower()
        if not any(char.isdigit() for char in value):
            return value  # Nothing numeric to convert

        # Order matters: 'mg' and 'µg' must be tested before the bare 'g'.
        unit_divisors = (
            ('mg', 1000),
            ('µg', 1_000_000),
            ('micro', 1_000_000),
            ('g', 1),
            ('perc', 1),
            ('°c', 1),
        )
        for unit, divisor in unit_divisors:
            if unit in value:
                try:
                    number = float(value.replace(unit, '').strip())
                except ValueError:
                    # Free text that merely contains a digit and a unit
                    # substring (e.g. "1 óra 20 perc"): leave it unconverted
                    # instead of aborting the whole clean-up.
                    return value
                return number / divisor

        return value  # Number without a recognised unit

    @staticmethod
    def clean_ingredient_value(value):
        """
        Extract the first number from a string and scale it by its unit.

        'mg' values are divided by 1000; 'µg', 'mcg' and 'micro' values are
        divided by 1,000,000; anything else (including 'g' or no unit) keeps
        the extracted number unchanged.

        Args:
            value: The original value.

        Returns:
            float when a number is found in a string; the stripped,
            lower-cased string when no number is found; the value itself
            when it is not a string.
        """
        if not isinstance(value, str):
            return value  # Return original if not a string

        value = value.strip().lower()

        # Extract the first numeric token (optional sign, optional decimals)
        num_match = re.search(r"[-+]?\d*\.?\d+", value)
        if not num_match:
            return value  # Return as-is if no number is found

        num = float(num_match.group())

        if "mg" in value:
            return num / 1000  # Convert mg to g
        if "µg" in value or "mcg" in value or "micro" in value:
            return num / 1000000  # Convert µg/micro/mcg to g

        return num  # Grams or no unit: the number is used as is


In [21]:
class RecipeScraper:
    """Scrape the details of a single recipe page from Nosalty.

    The page is downloaded when the instance is created; ``self.soup`` is
    the parsed page, or None when the download failed.
    """

    BASE_URL = "https://www.nosalty.hu/recept/"

    def __init__(self, url):
        """Store ``url`` and download/parse the page it points to."""
        self.url = url
        self.soup = self.get_soup()

    @staticmethod
    def extract_urls_from_xml(file_path, check_url_format=False, date_range=None):
        """
        Extract recipe URLs from a sitemap XML file.

        Only ``<loc>`` values starting with ``BASE_URL`` are returned.

        Args:
            file_path (str): Path to the XML file.
            check_url_format (bool): If True, URLs that do not look like a
                well-formed http(s) URL are skipped.
            date_range (tuple): Optional (start_date, end_date) for filtering
                URLs by ``<lastmod>``. Format: ('YYYY-MM-DD', 'YYYY-MM-DD').
                Entries without a ``<lastmod>`` element are kept.

        Returns:
            list: List of matching URLs, in document order.

        Raises:
            ValueError: If ``date_range`` does not hold two 'YYYY-MM-DD' dates.
        """
        url_pattern = re.compile(r"^(http|https)://[^\s/$.?#].[^\s]*$")

        # Parse the requested range once instead of once per <url> element.
        start_date = end_date = None
        if date_range:
            start_date, end_date = (datetime.strptime(d, "%Y-%m-%d") for d in date_range)

        def is_within_date_range(date_text):
            """Check whether the date part of ``date_text`` is in range."""
            try:
                # <lastmod> may be a full W3C datetime
                # (e.g. 2023-06-01T10:00:00+00:00); only the date is compared.
                date = datetime.strptime((date_text or "").strip()[:10], "%Y-%m-%d")
            except ValueError:
                return False
            return start_date <= date <= end_date

        # Sitemap elements live in this XML namespace
        namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

        root = ET.parse(file_path).getroot()

        urls = []
        for url in root.findall('ns:url', namespace):
            loc = url.find('ns:loc', namespace)
            if loc is None:
                continue

            # An empty <loc/> has text None; surrounding whitespace is not
            # part of the URL.
            url_text = (loc.text or "").strip()
            if not url_text.startswith(RecipeScraper.BASE_URL):
                continue
            if check_url_format and url_pattern.match(url_text) is None:
                continue

            lastmod = url.find('ns:lastmod', namespace)
            if date_range and lastmod is not None and not is_within_date_range(lastmod.text):
                continue

            urls.append(url_text)

        return urls

    def get_soup(self):
        """Fetch and parse the webpage.

        Returns:
            BeautifulSoup or None: The parsed page, or None when the request
            fails or the response status is not 200.
        """
        try:
            response = requests.get(self.url, timeout=10)
            if response.status_code != 200:
                return None
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException:
            return None

    def get_title(self):
        """Extract the recipe title, or return 'N/A' if it is not found."""
        title_tag = self.soup.find('h1', class_='p-article__title -recipe pt-10 px-3 mb-5 d-block')
        return title_tag.text.strip() if title_tag else 'N/A'

    def get_additional_info(self):
        """Extract additional metadata as one comma-separated string.

        Collects the text of every <span>/<a> element carrying one of the
        listed class strings.
        """
        classes = [
            'a-text -fontSize-9 -fontColorSenary d-none d-sm-block',
            'a-text -fontSize-9 -fontColorQuaternary d-none d-sm-block',
            'a-link -fontColorPrimary -fontSize-16 -recipe pr-3 pl-2'  # <a> elements
        ]

        info = [
            element.text.strip()
            for class_name in classes
            for element in self.soup.find_all(['span', 'a'], class_=class_name)
        ]
        return ', '.join(info)

    def get_recipe_details(self):
        """Extract the available recipe details: Time, Cost, and Difficulty.

        Returns:
            dict: Keys 'Time', 'Cost' and/or 'Difficulty' for the details
            found on the page.
        """
        details = {}

        # Mapping from the Hungarian labels on the page to English keys
        rename_map = {
            "Idő": "Time",
            "Költség": "Cost",
            "Nehézség": "Difficulty"
        }

        detail_sections = self.soup.find_all('div', class_='p-recipe__detailsBody d-print-flex justify-content-end text-center mb-1 w-100')

        for section in detail_sections:
            label_tag = section.find('span', class_='p-recipe__detailsHeading d-block text-uppercase')
            # The value is taken from the first <div>, else <time>, else <span>
            value_tag = section.find('div') or section.find('time') or section.find('span')

            if label_tag and value_tag:
                label_text = label_tag.text.strip()
                if label_text in rename_map:
                    details[rename_map[label_text]] = value_tag.text.strip()

        return details

    def get_preparation_and_baking_time(self):
        """Extract preparation and baking time from the recipe details list.

        Returns:
            dict: Keys 'Preparation Time' and/or 'Baking Time'; empty when
            the details list is missing.
        """
        details = {}

        # Mapping from the Hungarian labels on the page to English keys
        rename_map = {
            "Előkészítés ideje": "Preparation Time",
            "Sütés ideje": "Baking Time"
        }

        details_section = self.soup.find('div', class_='p-recipe__detailsList')
        if not details_section:
            return details  # Return empty if section is missing

        list_section = details_section.find('ul', class_='m-list__list')
        if not list_section:
            return details  # Return empty if no list is found

        for item in list_section.find_all('li', class_='m-list__item -simple -fontSize-18'):
            spans = item.find_all('span')
            if not spans:
                continue

            # First span holds the label, last span holds the value
            label_text = spans[0].text.strip()
            if label_text in rename_map:
                details[rename_map[label_text]] = spans[-1].text.strip()

        return details

    def get_portions(self):
        """Extract the portion count from the input with id="adag".

        Returns:
            str: The input's value, or 'N/A' when the input or its value
            attribute is missing.
        """
        portion_input = self.soup.find('input', {'id': 'adag'})
        if portion_input and 'value' in portion_input.attrs:
            return portion_input['value'].strip()
        return 'N/A'

    def get_nutrition_data(self):
        """Extract the nutrition values listed under each nutrition heading.

        Returns:
            dict: Nutrient name -> value text ('N/A' when the value element
            is missing). 'Összesen' (total) entries are stored as
            '<section> - Összesen' so the sections do not overwrite each other.
        """
        nutrition_data = {}

        # Section heading on the page -> nutrient names looked up in it
        nutrition_sections = {
            'Fehérje': ['Összesen'],  # Protein
            'Zsír': ['Összesen', 'Telített zsírsav', 'Egyszeresen telítetlen zsírsav', 'Többszörösen telítetlen zsírsav', 'Koleszterin'],  # Fats
            'Szénhidrátok': ['Összesen', 'Cukor', 'Élelmi rost'],  # Carbs
            'VÍZ': ['Összesen'],  # Water
            'Ásványi anyagok': ['Cink', 'Szelén', 'Kálcium', 'Vas', 'Magnézium', 'Foszfor', 'Nátrium', 'Réz', 'Mangán'],  # Minerals
            'Vitaminok': ['A vitamin', 'B6 vitamin', 'B12 Vitamin', 'E vitamin', 'C vitamin', 'D vitamin', 'K vitamin',
                          'Tiamin - B1 vitamin', 'Riboflavin - B2 vitamin', 'Niacin - B3 vitamin', 'Pantoténsav - B5 vitamin',
                          'Folsav - B9-vitamin', 'Kolin', 'Retinol', 'α-karotin', 'β-karotin', 'β-crypt', 'Likopin', 'Lut-zea']  # Vitamins
        }

        for section, nutrients in nutrition_sections.items():
            tag = self.soup.find('h4', string=section)  # Section header, e.g. "Fehérje"
            if not tag:
                continue

            list_section = tag.find_next('ul')  # First <ul> after the header
            if not list_section:
                continue

            for li in list_section.find_all('li'):
                for nutrient in nutrients:
                    if nutrient not in li.text:
                        continue

                    value = li.find('div', class_='kc')
                    value_text = value.text.strip() if value else 'N/A'

                    # Totals share one name across sections; qualify them
                    if nutrient == "Összesen":
                        renamed_nutrient = f"{section} - Összesen"
                    else:
                        renamed_nutrient = nutrient

                    nutrition_data[renamed_nutrient] = value_text

        return nutrition_data

    def get_ingredients(self):
        """Extract ingredient names and their quantities.

        Returns:
            list: One {"Ingredient": name, "Quantity": quantity} dict per
            list item; a missing name or quantity is reported as 'N/A'.
        """
        ingredients_list = []

        ingredients_section = self.soup.find('ul', class_='m-list__list -nutrition')
        if not ingredients_section:
            return ingredients_list

        items = ingredients_section.find_all('li', class_='m-list__item p-2 -dotted -fontSize-18 d-flex justify-content-between pl-5')

        for item in items:
            quantity_tag = item.find('span', class_='-fontSize-lg-16')
            ingredient_tag = item.find('a', class_='a-link -primaryHoverEffect -fontSize-18 -fontSize-lg-16')

            ingredients_list.append({
                "Ingredient": ingredient_tag.text.strip() if ingredient_tag else 'N/A',
                "Quantity": quantity_tag.text.strip() if quantity_tag else 'N/A'
            })

        return ingredients_list

    def to_dataframe(self):
        """Convert the extracted recipe details to a one-row pandas DataFrame.

        Raises:
            RuntimeError: If the page could not be fetched (``self.soup`` is
                None), so the failure is reported with its cause instead of
                an opaque AttributeError from the first extractor.
        """
        if self.soup is None:
            raise RuntimeError(f"Could not fetch or parse recipe page: {self.url}")

        return pd.DataFrame([{
            'Recipe name': self.get_title(),
            **self.get_recipe_details(),
            'Categories': self.get_additional_info(),
            **self.get_preparation_and_baking_time(),
            'Portions': self.get_portions(),
            'Ingredients': self.get_ingredients(),
            **self.get_nutrition_data()
        }])

In [22]:
# Sitemap XML files that list the recipe URLs to scrape
file_paths = [
    r'C:\Users\Bálint\Desktop\Asztal\Projekt\inputs\receptek_url.xml',
    r'C:\Users\Bálint\Desktop\Asztal\Projekt\inputs\receptek_url_2.xml',
    r'C:\Users\Bálint\Desktop\Asztal\Projekt\inputs\receptek_url_3.xml',
]

# Gather the recipe URLs of every sitemap into one flat list,
# keeping only the ones that pass the URL format check
all_urls = []
for file_path in file_paths:
    urls = RecipeScraper.extract_urls_from_xml(
        file_path,
        check_url_format=True,
    )
    all_urls += urls

In [24]:
import traceback

# Start time tracking
start_time = time.time()

all_recipe_details = []
# Initialize counters
success_count = 0
total_count = len(all_urls)

# Process each recipe URL
with tqdm(total=total_count, desc="Processing Recipes") as pbar:
    for i, url in enumerate(all_urls, start=1):
        try:
            scraper = RecipeScraper(url)
            recipe_details_df = scraper.to_dataframe()
        except Exception as e:
            # Best-effort scraping: one bad page must not stop the run, but
            # the cause is reported so failures can be diagnosed afterwards.
            reason = traceback.format_exception_only(type(e), e)[-1].strip()
            print(f"Error while processing {url}: {reason}")
        else:
            if recipe_details_df is not None:
                all_recipe_details.append(recipe_details_df)
                success_count += 1  # Count only successful scrapes

        # Periodically log progress
        if i % 5000 == 0 or i == total_count:
            elapsed_time = time.time() - start_time
            print(f"Processed {i}/{total_count} URLs. Successful recipes: {success_count}. Elapsed time: {elapsed_time:.2f} seconds.")

        pbar.update(1)

# Combine all extracted data into a single DataFrame.
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no recipe could be scraped.
if all_recipe_details:
    final_df = pd.concat(all_recipe_details, ignore_index=True)
else:
    print("No recipes were scraped successfully; writing an empty result.")
    final_df = pd.DataFrame()
final_df_cleaned = DataProcessor.clean_nutrition_data(final_df)

# Save the cleaned results to a CSV file
output_path = r"C:\Users\Bálint\Desktop\Asztal\Projekt\receptek_jav.csv"
final_df_cleaned.to_csv(output_path, index=False)

# End time tracking
end_time = time.time()
print(f"Total execution time: {end_time - start_time:.2f} seconds")
# Display the first few rows of the cleaned dataset
final_df_cleaned.head()

Processing Recipes:   5%|██▋                                                 | 3614/71260 [1:08:11<16:59:08,  1.11it/s]

Error while processing https://www.nosalty.hu/recept/olga-almas 


Processing Recipes:   5%|██▋                                                 | 3648/71260 [1:08:50<14:24:23,  1.30it/s]

Error while processing https://www.nosalty.hu/recept/hideg-gorog-gyumolcsleves-fozes-nelkul 


Processing Recipes:   5%|██▋                                                 | 3749/71260 [1:10:45<15:37:59,  1.20it/s]

Error while processing https://www.nosalty.hu/recept/flan-napolitano 


Processing Recipes:   7%|███▋                                                | 5000/71260 [1:34:52<20:40:17,  1.12s/it]

Processed 5000/71260 URLs. Successful recipes: 4997. Elapsed time: 5692.42 seconds.


Processing Recipes:   7%|███▋                                                | 5010/71260 [1:35:12<70:18:09,  3.82s/it]

Error while processing https://www.nosalty.hu/recept/borban-parolt-zoldsegek 


Processing Recipes:  13%|██████▊                                             | 9399/71260 [3:03:51<68:39:54,  4.00s/it]

Error while processing https://www.nosalty.hu/recept/somloi-kocka 


Processing Recipes:  13%|██████▊                                             | 9401/71260 [3:04:02<86:27:39,  5.03s/it]

Error while processing https://www.nosalty.hu/recept/sult-hekk-rizzsel 


Processing Recipes:  13%|██████▋                                            | 9402/71260 [3:04:12<112:30:07,  6.55s/it]

Error while processing https://www.nosalty.hu/recept/sult-turos-nutellas-palacsinta 


Processing Recipes:  13%|██████▋                                            | 9403/71260 [3:04:22<130:41:35,  7.61s/it]

Error while processing https://www.nosalty.hu/recept/husos-samosa-snidlinges-fokhagymas-martogatossal 


Processing Recipes:  13%|██████▊                                             | 9411/71260 [3:04:46<45:34:33,  2.65s/it]

Error while processing https://www.nosalty.hu/recept/tejszines-kelkaposztaleves 


Processing Recipes:  13%|██████▊                                             | 9413/71260 [3:04:47<30:38:42,  1.78s/it]

Error while processing https://www.nosalty.hu/recept/edes-babos-retes 
Error while processing https://www.nosalty.hu/recept/thai-daralt-husos-uvegteszta 


Processing Recipes:  13%|███████                                              | 9422/71260 [3:04:50<8:06:25,  2.12it/s]

Error while processing https://www.nosalty.hu/recept/pesto-s-csirkeszarnyak 
Error while processing https://www.nosalty.hu/recept/kinai-rizses-csirke 
Error while processing https://www.nosalty.hu/recept/rozmaringos-kacsa 
Error while processing https://www.nosalty.hu/recept/roszti-3 
Error while processing https://www.nosalty.hu/recept/zacskoban-sult-tarja-zoldsegekkel 
Error while processing https://www.nosalty.hu/recept/gyumolcsos-palacsintakehely 


Processing Recipes:  13%|███████                                              | 9425/71260 [3:04:50<5:32:51,  3.10it/s]

Error while processing https://www.nosalty.hu/recept/csirke-pataki-talban-burgonyaval-gyumolcsokkel 
Error while processing https://www.nosalty.hu/recept/teli-gyumolcskehely 
Error while processing https://www.nosalty.hu/recept/tormas-sonkatekercsek-3 
Error while processing https://www.nosalty.hu/recept/szines-rakott-kinai-kel 


Processing Recipes:  13%|███████                                              | 9427/71260 [3:04:50<5:12:19,  3.30it/s]

Error while processing https://www.nosalty.hu/recept/maktorta-10-csokolades-meggymartassal 
Error while processing https://www.nosalty.hu/recept/mogyorokremmel-toltott-narancsos-muffin 


Processing Recipes:  13%|███████                                              | 9430/71260 [3:04:52<6:39:35,  2.58it/s]

Error while processing https://www.nosalty.hu/recept/majonezes-burgonyasalata-6-szojajoghurttal 
Error while processing https://www.nosalty.hu/recept/pillecukros-csokis-halmok 


Processing Recipes:  13%|███████                                              | 9437/71260 [3:04:56<7:30:59,  2.28it/s]

Error while processing https://www.nosalty.hu/recept/sult-fokhagymas-juhturos-piritos 
Error while processing https://www.nosalty.hu/recept/katica-torta-4 
Error while processing https://www.nosalty.hu/recept/dios-csiga-6 
Error while processing https://www.nosalty.hu/recept/bananos-joghurt 
Error while processing https://www.nosalty.hu/recept/tukortojasos-rakott-krumpli 
Error while processing https://www.nosalty.hu/recept/normann-rizskremleves 


Processing Recipes:  13%|███████                                              | 9445/71260 [3:04:56<3:07:21,  5.50it/s]

Error while processing https://www.nosalty.hu/recept/sajtos-lapcsanka-tocsni-14 
Error while processing https://www.nosalty.hu/recept/sport-szelet-3-0 
Error while processing https://www.nosalty.hu/recept/tepsis-hal-olajbogyoval 
Error while processing https://www.nosalty.hu/recept/kakaos-kalacs-9 
Error while processing https://www.nosalty.hu/recept/kokuszkremes-torta-3-fondant-nal-bevonva 
Error while processing https://www.nosalty.hu/recept/csirkemell-kinai-modra-2 
Error while processing https://www.nosalty.hu/recept/meggyleves-5-almaval 


Processing Recipes:  13%|███████                                              | 9448/71260 [3:04:56<2:26:39,  7.02it/s]

Error while processing https://www.nosalty.hu/recept/paradicsomsalata-5-burgonyas 
Error while processing https://www.nosalty.hu/recept/omlett-11-tarkonnyal 


Processing Recipes:  13%|███████                                              | 9454/71260 [3:04:58<2:59:05,  5.75it/s]

Error while processing https://www.nosalty.hu/recept/baconnel-mozzarellaval-toltott-gombafejek 
Error while processing https://www.nosalty.hu/recept/sult-hagymas-sajtos-muffin 
Error while processing https://www.nosalty.hu/recept/farsangi-torta 
Error while processing https://www.nosalty.hu/recept/zold-teas-panna-cotta-csokiszosszal 
Error while processing https://www.nosalty.hu/recept/pizzalap-paradicsomszosz-4-olaszos 


Processing Recipes:  13%|███████                                              | 9459/71260 [3:04:59<3:31:29,  4.87it/s]

Error while processing https://www.nosalty.hu/recept/ijaszhus 
Error while processing https://www.nosalty.hu/recept/truffel-9-zold-teas-rozsaborsos 
Error while processing https://www.nosalty.hu/recept/tejszines-sertesragu-zold-metelttel 
Error while processing https://www.nosalty.hu/recept/toltott-tojasok-husvetra 
Error while processing https://www.nosalty.hu/recept/hagymas-husos-kacsazsir 


Processing Recipes:  13%|███████                                              | 9462/71260 [3:04:59<2:40:14,  6.43it/s]

Error while processing https://www.nosalty.hu/recept/curry-s-rizs-2-zoldsegekkel 
Error while processing https://www.nosalty.hu/recept/csoki-mousse-5 
Error while processing https://www.nosalty.hu/recept/leveles-csik 


Processing Recipes:  13%|███████                                              | 9468/71260 [3:05:01<2:57:48,  5.79it/s]

Error while processing https://www.nosalty.hu/recept/bananos-sajttorta 
Error while processing https://www.nosalty.hu/recept/kaposztas-hagymas-pogacsa 
Error while processing https://www.nosalty.hu/recept/fank-17-almas 
Error while processing https://www.nosalty.hu/recept/francia-kremes-6-egyszeruen 
Error while processing https://www.nosalty.hu/recept/zoldseges-mini-muffin 


Processing Recipes:  13%|███████                                              | 9474/71260 [3:05:01<1:44:58,  9.81it/s]

Error while processing https://www.nosalty.hu/recept/ceklasalata-7 
Error while processing https://www.nosalty.hu/recept/almas-tortacska 
Error while processing https://www.nosalty.hu/recept/sargarepa-salata-7-grillezett 
Error while processing https://www.nosalty.hu/recept/citrompite-3-gyors-tojasmentes 
Error while processing https://www.nosalty.hu/recept/gnocchi-gruyere-sajttal-tejszinnel 
Error while processing https://www.nosalty.hu/recept/gnocchi-paradicsommal-pesto-val 


Processing Recipes:  13%|███████                                              | 9480/71260 [3:05:01<1:13:16, 14.05it/s]

Error while processing https://www.nosalty.hu/recept/serpenyos-gnocchi 
Error while processing https://www.nosalty.hu/recept/vanilias-fahejas-kiflik 
Error while processing https://www.nosalty.hu/recept/epres-muffin 
Error while processing https://www.nosalty.hu/recept/almas-batyu 
Error while processing https://www.nosalty.hu/recept/maglyarakas-4-extra 
Error while processing https://www.nosalty.hu/recept/csokolades-kavetorta 


Processing Recipes:  13%|███████▎                                               | 9486/71260 [3:05:01<54:46, 18.80it/s]

Error while processing https://www.nosalty.hu/recept/brownies-focirajongoknak 
Error while processing https://www.nosalty.hu/recept/atlanti-lazac-2 
Error while processing https://www.nosalty.hu/recept/atlanti-lazac-1 
Error while processing https://www.nosalty.hu/recept/daras-turoszelet-2 
Error while processing https://www.nosalty.hu/recept/buzas-zoldsegkoret 
Error while processing https://www.nosalty.hu/recept/sult-husgolyok 


Processing Recipes:  13%|███████▎                                               | 9490/71260 [3:05:01<47:08, 21.84it/s]

Error while processing https://www.nosalty.hu/recept/fank-18 
Error while processing https://www.nosalty.hu/recept/pasztorrizs 
Error while processing https://www.nosalty.hu/recept/amerikai-palacsinta-7-kefires 
Error while processing https://www.nosalty.hu/recept/szilvatorta-3-fahejas 


Processing Recipes:  13%|███████                                              | 9496/71260 [3:05:03<2:07:34,  8.07it/s]

Error while processing https://www.nosalty.hu/recept/hagymas-szalonnas-csirkemaj 
Error while processing https://www.nosalty.hu/recept/narancsszirupos-narancsos-suti 
Error while processing https://www.nosalty.hu/recept/csirkezuzas-becsinalt-leves 


Processing Recipes:  14%|███████▏                                           | 10000/71260 [3:15:08<20:11:56,  1.19s/it]

Processed 10000/71260 URLs. Successful recipes: 9917. Elapsed time: 11708.14 seconds.


Processing Recipes:  17%|████████▊                                          | 12398/71260 [4:04:13<61:45:05,  3.78s/it]

Error while processing https://www.nosalty.hu/recept/kiskegyed-csokolades-pitetortaja 


Processing Recipes:  19%|█████████▌                                         | 13423/71260 [4:25:39<66:55:28,  4.17s/it]

Error while processing https://www.nosalty.hu/recept/spenotos-ricottas-palacsintatekercsek 


Processing Recipes:  19%|█████████▌                                         | 13424/71260 [4:25:49<95:04:55,  5.92s/it]

Error while processing https://www.nosalty.hu/recept/csirkeporkolt-zoldseges-lecsos-rizzsel 


Processing Recipes:  19%|█████████▍                                        | 13425/71260 [4:25:59<114:48:09,  7.15s/it]

Error while processing https://www.nosalty.hu/recept/zoldseges-husos-teszta 


Processing Recipes:  21%|██████████▋                                        | 15000/71260 [4:58:11<18:53:21,  1.21s/it]

Processed 15000/71260 URLs. Successful recipes: 14913. Elapsed time: 17891.77 seconds.


Processing Recipes:  28%|██████████████▎                                    | 20000/71260 [6:40:04<18:57:59,  1.33s/it]

Processed 20000/71260 URLs. Successful recipes: 19913. Elapsed time: 24004.89 seconds.


Processing Recipes:  34%|█████████████████▎                                 | 24225/71260 [8:06:25<50:46:29,  3.89s/it]

Error while processing https://www.nosalty.hu/recept/cukkinis-quesadilla 


Processing Recipes:  35%|█████████████████▉                                 | 25000/71260 [8:22:20<18:28:56,  1.44s/it]

Processed 25000/71260 URLs. Successful recipes: 24912. Elapsed time: 30140.90 seconds.


Processing Recipes:  35%|█████████████████▉                                 | 25079/71260 [8:24:05<48:46:51,  3.80s/it]

Error while processing https://www.nosalty.hu/recept/szilvas-kocka 


Processing Recipes:  42%|█████████████████████                             | 30000/71260 [10:04:49<16:04:41,  1.40s/it]

Processed 30000/71260 URLs. Successful recipes: 29911. Elapsed time: 36289.55 seconds.


Processing Recipes:  46%|███████████████████████▏                          | 33068/71260 [11:08:30<41:26:32,  3.91s/it]

Error while processing https://www.nosalty.hu/recept/ciabatta-evi-neni-konyhajabol 


Processing Recipes:  49%|████████████████████████▌                         | 35000/71260 [11:48:29<12:36:35,  1.25s/it]

Processed 35000/71260 URLs. Successful recipes: 34910. Elapsed time: 42509.24 seconds.


Processing Recipes:  50%|█████████████████████████▏                        | 35852/71260 [12:06:47<38:53:34,  3.95s/it]

Error while processing https://www.nosalty.hu/recept/cseresznyes-pitetorta-zsuzsamamatol 


Processing Recipes:  56%|████████████████████████████                      | 40000/71260 [13:33:19<11:24:13,  1.31s/it]

Processed 40000/71260 URLs. Successful recipes: 39909. Elapsed time: 48799.63 seconds.


Processing Recipes:  56%|████████████████████████████▏                     | 40086/71260 [13:35:16<33:35:33,  3.88s/it]

Error while processing https://www.nosalty.hu/recept/palocleves-ahogy-sylvia-kesziti 


Processing Recipes:  56%|████████████████████████████▏                     | 40087/71260 [13:35:26<49:30:41,  5.72s/it]

Error while processing https://www.nosalty.hu/recept/vadkovaszos-bukta 


Processing Recipes:  56%|████████████████████████████▏                     | 40088/71260 [13:35:36<60:40:34,  7.01s/it]

Error while processing https://www.nosalty.hu/recept/rakott-csirkemell-sonkaval-gombaval-sajttal 


Processing Recipes:  56%|████████████████████████████▏                     | 40090/71260 [13:35:49<60:10:37,  6.95s/it]

Error while processing https://www.nosalty.hu/recept/egeszsegesebb-pizzateszta 


Processing Recipes:  56%|████████████████████████████▏                     | 40091/71260 [13:35:59<68:07:44,  7.87s/it]

Error while processing https://www.nosalty.hu/recept/tarkonyos-raguleves-citrommal 


Processing Recipes:  56%|████████████████████████████▏                     | 40092/71260 [13:36:09<73:41:35,  8.51s/it]

Error while processing https://www.nosalty.hu/recept/mac-and-cheese-gonoszka-konyhajabol 


Processing Recipes:  60%|██████████████████████████████▏                   | 43090/71260 [14:39:26<30:42:50,  3.93s/it]

Error while processing https://www.nosalty.hu/recept/dios-sullyesztett-racsos-pite 


Processing Recipes:  63%|████████████████████████████████▏                  | 45000/71260 [15:18:23<8:06:19,  1.11s/it]

Processed 45000/71260 URLs. Successful recipes: 44902. Elapsed time: 55103.64 seconds.


Processing Recipes:  69%|██████████████████████████████████▍               | 49106/71260 [16:40:14<23:01:27,  3.74s/it]

Error while processing https://www.nosalty.hu/recept/epres-pite-lesteria-konyhajabol 


Processing Recipes:  70%|███████████████████████████████████▊               | 50001/71260 [16:57:27<5:34:09,  1.06it/s]

Processed 50000/71260 URLs. Successful recipes: 49901. Elapsed time: 61047.13 seconds.


Processing Recipes:  77%|███████████████████████████████████████▎           | 55000/71260 [18:33:06<5:19:06,  1.18s/it]

Processed 55000/71260 URLs. Successful recipes: 54901. Elapsed time: 66786.44 seconds.


Processing Recipes:  80%|████████████████████████████████████████▏         | 57329/71260 [19:16:44<14:19:22,  3.70s/it]

Error while processing https://www.nosalty.hu/recept/csicsokas-savanyukaposzta-kremleves 


Processing Recipes:  84%|██████████████████████████████████████████▉        | 60000/71260 [20:07:21<3:57:22,  1.26s/it]

Processed 60000/71260 URLs. Successful recipes: 59900. Elapsed time: 72441.35 seconds.


Processing Recipes:  89%|█████████████████████████████████████████████▍     | 63483/71260 [21:13:57<8:09:11,  3.77s/it]

Error while processing https://www.nosalty.hu/recept/barnasoros-csokoladetorta 


Processing Recipes:  91%|██████████████████████████████████████████████▌    | 65000/71260 [21:42:39<2:15:23,  1.30s/it]

Processed 65000/71260 URLs. Successful recipes: 64899. Elapsed time: 78159.37 seconds.


Processing Recipes:  98%|████████████████████████████████████████████████████ | 70000/71260 [23:24:34<25:06,  1.20s/it]

Processed 70000/71260 URLs. Successful recipes: 69899. Elapsed time: 84274.05 seconds.


Processing Recipes:  99%|████████████████████████████████████████████████████▍| 70440/71260 [23:33:48<54:29,  3.99s/it]

Error while processing https://www.nosalty.hu/recept/nasi-goreng-indonez-piritott-rizs 


Processing Recipes:  99%|██████████████████████████████████████████████████▍| 70441/71260 [23:33:58<1:19:05,  5.79s/it]

Error while processing https://www.nosalty.hu/recept/joghurtos-vajas-kifli 


Processing Recipes:  99%|██████████████████████████████████████████████████▍| 70442/71260 [23:34:08<1:36:15,  7.06s/it]

Error while processing https://www.nosalty.hu/recept/turos-vanilias-parna-lekvarral 


Processing Recipes:  99%|████████████████████████████████████████████████████▌| 70750/71260 [23:40:36<35:13,  4.14s/it]

Error while processing https://www.nosalty.hu/recept/sajtos-rakott-cukkini-szaftos-raguval 


Processing Recipes:  99%|████████████████████████████████████████████████████▌| 70751/71260 [23:40:46<50:05,  5.90s/it]

Error while processing https://www.nosalty.hu/recept/sult-varganya-szelet-gesztenyevel 


Processing Recipes:  99%|██████████████████████████████████████████████████▋| 70752/71260 [23:40:57<1:00:25,  7.14s/it]

Error while processing https://www.nosalty.hu/recept/nagy-ozlabspagetti 


Processing Recipes:  99%|████████████████████████████████████████████████████▋| 70772/71260 [23:41:34<37:32,  4.62s/it]

Error while processing https://www.nosalty.hu/recept/kekszszalami-keksz-datolya-pisztacia-cukormentes 


Processing Recipes:  99%|████████████████████████████████████████████████████▋| 70788/71260 [23:41:45<06:59,  1.13it/s]

Error while processing https://www.nosalty.hu/recept/narancskenyer-narancslikorrel 
Error while processing https://www.nosalty.hu/recept/finom-almas-suti 
Error while processing https://www.nosalty.hu/recept/csirkecomb-gombas-tejfolos-szoszban 
Error while processing https://www.nosalty.hu/recept/hasselback-burgonya-avagy-a-sved-burgonya 
Error while processing https://www.nosalty.hu/recept/krumplis-cukkinis-lepeny 
Error while processing https://www.nosalty.hu/recept/mandulas-almas-pite 
Error while processing https://www.nosalty.hu/recept/sargabaracklekvaros-bukta 
Error while processing https://www.nosalty.hu/recept/klasszikus-darazsfeszek 
Error while processing https://www.nosalty.hu/recept/egyszeru-bundas-alma 
Error while processing https://www.nosalty.hu/recept/koktel-gin-fizz-feketeribizli-halloween 
Error while processing https://www.nosalty.hu/recept/chili-vega-sutotok-bab-nachos-jalapeno 
Error while processing https://www.nosalty.hu/recept/sajtkremleves-sajt-leves 
Error w

Processing Recipes:  99%|████████████████████████████████████████████████████▋| 70845/71260 [23:41:45<00:53,  7.71it/s]

Error while processing https://www.nosalty.hu/recept/tejszines-pulykaragu 
Error while processing https://www.nosalty.hu/recept/sutotok-tarte-tatin-forditott-pite 
Error while processing https://www.nosalty.hu/recept/sutotok-dio-piskota-tekercs-bosch-mascarpone-robotgep-osz-adventvaro 
Error while processing https://www.nosalty.hu/recept/egyszeru-paradicsomos-gombaleves-nokedlivel 
Error while processing https://www.nosalty.hu/recept/avokados-csirkes-teszta-gyorsan 
Error while processing https://www.nosalty.hu/recept/spenotos-omlett 
Error while processing https://www.nosalty.hu/recept/pan-de-muerto 
Error while processing https://www.nosalty.hu/recept/csemegeuborka-leveben-pacolt-sult-csirke-zoldfuszeres-uborkas-burgonyasalataval 
Error while processing https://www.nosalty.hu/recept/mezes-granola 
Error while processing https://www.nosalty.hu/recept/magyaros-burgonyaleves-kolbasszal 
Error while processing https://www.nosalty.hu/recept/bejglicsiga-cukormentes-bejgli-mak-dio-unnep 
Er

Processing Recipes: 100%|████████████████████████████████████████████████████▊| 71051/71260 [23:41:45<00:03, 59.30it/s]

Error while processing https://www.nosalty.hu/recept/csipos-tatar-beefsteak 
Error while processing https://www.nosalty.hu/recept/karacsonyi-citromos-kevert-sutemeny 
Error while processing https://www.nosalty.hu/recept/sos-karamellas-mezeskalacs-torta 
Error while processing https://www.nosalty.hu/recept/sajtburger-falat-taska-burgerszosz 
Error while processing https://www.nosalty.hu/recept/rakoczi-turos-baracklekvarral 
Error while processing https://www.nosalty.hu/recept/csulok-sorben-sulve 
Error while processing https://www.nosalty.hu/recept/baileys-ir-kremlikor 
Error while processing https://www.nosalty.hu/recept/gyumolcskenyer-szaftos-puha 
Error while processing https://www.nosalty.hu/recept/selymes-kremes-tejfolos-harcsapaprikas-kapros-turos-csusza 
Error while processing https://www.nosalty.hu/recept/zsurek-zurek-lengyel-kovaszos-leves 
Error while processing https://www.nosalty.hu/recept/toltott-zokni 
Error while processing https://www.nosalty.hu/recept/berliner-1 
Error 

Processing Recipes: 100%|███████████████████████████████████████████████████▉| 71185/71260 [23:41:46<00:00, 124.83it/s]

Error while processing https://www.nosalty.hu/recept/konnyu-kraterszelet 
Error while processing https://www.nosalty.hu/recept/kuglof-tom-cruise-kokusz-fehercsoki 
Error while processing https://www.nosalty.hu/recept/fahej-keksz-ciniminis 
Error while processing https://www.nosalty.hu/recept/aszalt-szilvaval-gongyolt-csirkemell-baconkontos 
Error while processing https://www.nosalty.hu/recept/pulyka-wellington-modra 
Error while processing https://www.nosalty.hu/recept/dios-szilvas-papucs-karacsony-sutemeny-aprosuti-auchan-vendegvaro 
Error while processing https://www.nosalty.hu/recept/mikulas-suti 
Error while processing https://www.nosalty.hu/recept/dios-makos-bejgliszelet 
Error while processing https://www.nosalty.hu/recept/legegyszerubb-puspokkenyer 
Error while processing https://www.nosalty.hu/recept/grillazs-hazilag 
Error while processing https://www.nosalty.hu/recept/egyszeru-lazac 
Error while processing https://www.nosalty.hu/recept/lazacos-rakott-teszta 
Error while proce

Processing Recipes: 100%|█████████████████████████████████████████████████████| 71260/71260 [23:41:46<00:00,  1.20s/it]


Error while processing https://www.nosalty.hu/recept/sajtos-karfiolnuggets 
Error while processing https://www.nosalty.hu/recept/narancslekvar-marmalade-narancsdzsem 
Error while processing https://www.nosalty.hu/recept/kevert-salata-savanyu-kaposztaval 
Error while processing https://www.nosalty.hu/recept/kokuszos-puncsgolyo-piritott-mandulaval 
Error while processing https://www.nosalty.hu/recept/turoval-toltott-csoroge-cannoli 
Error while processing https://www.nosalty.hu/recept/egyszeru-csorogefank 
Error while processing https://www.nosalty.hu/recept/vegan-csokitorta-egyszeruen 
Error while processing https://www.nosalty.hu/recept/brownie-sajttorta-eper-valentinnap-sutemeny 
Error while processing https://www.nosalty.hu/recept/churros-csokiszosszal 
Error while processing https://www.nosalty.hu/recept/paradicsomos-babfozelek-villamgyorsan 
Error while processing https://www.nosalty.hu/recept/citrompite-fank 
Error while processing https://www.nosalty.hu/recept/klasszikus-skot-toj

Unnamed: 0,Recipe name,Time,Cost,Difficulty,Categories,Preparation Time,Baking Time,Portions,Ingredients,Fehérje - Összesen,...,Niacin - B3 vitamin,Pantoténsav - B5 vitamin,Folsav - B9-vitamin,Kolin,Retinol,α-karotin,β-karotin,β-crypt,Likopin,Lut-zea
0,Tőzegáfonyás répatorta,50perc,olcsó,könnyű,"torta, répatorta",5 perc,45.0,8,"[{'Ingredient': 'nádcukor', 'Quantity': '150 g...",4.8,...,0.0,0.0,1.8e-05,0.058,2.9e-05,0.000619,0.001478,2e-06,0.0,0.000149
1,Hideg joghurtos őszibarackleves,51perc,olcsó,könnyű,"levesek, gyümölcsleves",6 perc,45.0,4,"[{'Ingredient': 'őszibarack', 'Quantity': '8 k...",4.2,...,0.002,0.0,1.4e-05,0.023,1.4e-05,0.0,0.000398,0.000163,0.0,0.000222
2,Házi készítésű pappardelle tészta pesto rossoval,120perc,olcsó,közepes,"tészta, tészta alaprecept",110 perc,,4,"[{'Ingredient': 'rétesliszt', 'Quantity': '165...",10.8,...,0.001,0.0,2.2e-05,0.075,3.9e-05,0.0,0.0,2e-06,0.0,0.000129
3,Hargita citromtortája,55perc,olcsó,könnyű,"édes süti, kevert sütemény",15 perc,40.0,8,"[{'Ingredient': 'vaj', 'Quantity': '125 g'}, {...",4.2,...,0.0,0.0,1.4e-05,0.045,0.000162,0.0,3.3e-05,2e-06,0.0,6.6e-05
4,Tiramisu 1.,45perc,megfizethető,könnyű,"édes süti, tiramisu",45 perc,,6,"[{'Ingredient': 'babapiskóta', 'Quantity': '40...",15.8,...,0.002,0.0,2.8e-05,0.139,6.2e-05,6e-06,1.5e-05,6e-06,0.0,0.000186


In [180]:
class IngredientScraper:
    """Scrape the details of a single ingredient page from Nosalty.

    Creating an instance immediately downloads and parses ``url``. When the
    download fails, ``self.soup`` is ``None`` and every getter returns a
    placeholder value instead of raising.
    """

    BASE_URL = "https://www.nosalty.hu/alapanyag/"

    def __init__(self, url):
        """Store ``url`` and fetch its page right away.

        Args:
            url (str): Address of the ingredient page to scrape.
        """
        self.url = url
        self.soup = self.get_soup()

    @staticmethod
    def extract_urls_from_xml(file_path, check_url_format=False, date_range=None):
        """
        Extracts ingredient URLs from an XML sitemap file.

        Only ``<loc>`` values that start with ``BASE_URL`` are kept. Entries
        with an empty ``<loc>`` are skipped, and surrounding whitespace is
        stripped from the URL text.

        Args:
            file_path (str): Path to the XML file.
            check_url_format (bool): If True, drops URLs that do not match a
                basic http(s) URL pattern.
            date_range (tuple): Optional (start_date, end_date) for filtering
                URLs by lastmod. Format: ('YYYY-MM-DD', 'YYYY-MM-DD'), both
                ends inclusive. Entries without a ``<lastmod>`` element are
                kept regardless of the range.

        Returns:
            list: Matching URLs in document order.

        Raises:
            ValueError: If ``date_range`` is given but is not in the
                'YYYY-MM-DD' format.
        """
        def is_valid_url(url):
            """Validates URL format."""
            pattern = r"^(http|https)://[^\s/$.?#].[^\s]*$"
            return re.match(pattern, url) is not None

        def is_within_date_range(date_text, start_date, end_date):
            """Checks if the date falls within the specified range."""
            try:
                # <lastmod> may hold a full W3C datetime such as
                # 2020-01-01T10:00:00+00:00; only the calendar date matters.
                date = datetime.strptime(date_text.strip()[:10], "%Y-%m-%d")
            except (ValueError, AttributeError):
                # Unparseable or empty (None) lastmod text.
                return False
            return start_date <= date <= end_date

        # Define XML namespace
        namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

        # Parse XML
        tree = ET.parse(file_path)
        root = tree.getroot()

        # The range bounds do not change per URL, so parse them once.
        if date_range:
            start_date, end_date = (
                datetime.strptime(bound, "%Y-%m-%d") for bound in date_range
            )

        urls = []
        for url in root.findall('ns:url', namespace):
            loc = url.find('ns:loc', namespace)
            if loc is None:
                continue

            # ``loc.text`` is None for an empty element; treat it as no URL.
            url_text = (loc.text or "").strip()
            if not url_text.startswith(IngredientScraper.BASE_URL):
                continue
            if check_url_format and not is_valid_url(url_text):
                continue

            lastmod = url.find('ns:lastmod', namespace)
            if date_range and lastmod is not None:
                if not is_within_date_range(lastmod.text, start_date, end_date):
                    continue
            urls.append(url_text)

        return urls

    def get_soup(self):
        """Fetch and parse the ingredient webpage.

        Returns:
            BeautifulSoup | None: Parsed page, or None when the request
            raises or the response status is not 200.
        """
        try:
            response = requests.get(self.url, timeout=5)
            if response.status_code != 200:
                return None
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException:
            return None

    def get_ingredient_name(self):
        """Extracts the ingredient name from the ``og:title`` meta tag.

        Returns:
            str: The tag's ``content`` value; "N/A" when the page was not
            loaded; "Unknown Ingredient" when the tag or its content is
            missing.
        """
        if not self.soup:
            return "N/A"
        meta_tag = self.soup.find("meta", property="og:title")
        return meta_tag["content"] if meta_tag and "content" in meta_tag.attrs else "Unknown Ingredient"

    def get_categories(self):
        """Extracts primary and secondary ingredient categories.

        The second and third matching links are used; the first one is
        skipped (presumably a generic root link - verify against the page).

        Returns:
            tuple: (primary_category, secondary_category); a missing entry
            is reported as "N/A".
        """
        if not self.soup:
            return "N/A", "N/A"
        category_links = self.soup.find_all("a", class_="a-link -fontColorPrimary -fontSize-16 -recipe pr-3 pl-2")
        categories = [link.text.strip() for link in category_links]

        # Index 1 needs at least two links; the previous ``> 0`` guard raised
        # IndexError when exactly one link was found.
        primary_category = categories[1] if len(categories) > 1 else "N/A"
        secondary_category = categories[2] if len(categories) > 2 else "N/A"
        return primary_category, secondary_category

    def get_nutrition_data(self):
        """Extracts nutrition details for the ingredient.

        Returns:
            dict: Label -> raw value text for every row of the
            ``ingredientCalculatorResult`` section that has exactly two
            ``<p>`` elements; empty when the page or the section is missing.
        """
        if not self.soup:
            return {}

        nutrition_data = {}

        # Locate the nutrition section
        ingredient_section = self.soup.find("div", id="ingredientCalculatorResult")
        if not ingredient_section:
            return {}  # No nutrition section found, return empty

        # Find all nutrition data rows
        rows = ingredient_section.find_all('div', class_='d-flex align-items-center flex-row w-100')

        for row in rows:
            # Extract labels and values
            paragraphs = row.find_all('p')
            if len(paragraphs) == 2:
                label = paragraphs[0].text.strip()
                value = paragraphs[1].text.strip()

                # Save to dictionary
                nutrition_data[label] = value

        return nutrition_data

    def to_dataframe(self):
        """Converts extracted ingredient details into a pandas DataFrame.

        Returns:
            pd.DataFrame: Always exactly one row holding the name, both
            categories and one column per nutrition label. When the page
            could not be loaded, the row holds only "N/A" placeholders.
        """
        primary_category, secondary_category = self.get_categories()
        return pd.DataFrame([{
            "Ingredient Name": self.get_ingredient_name(),
            "Primary Category": primary_category,
            "Secondary Category": secondary_category,
            **self.get_nutrition_data()
        }])


In [191]:
# Sitemap XML files that list the site's URLs
file_paths = [
    r'C:\Users\Bálint\Desktop\Asztal\Projekt\inputs\receptek_url.xml',
    r'C:\Users\Bálint\Desktop\Asztal\Projekt\inputs\receptek_url_2.xml',
    r'C:\Users\Bálint\Desktop\Asztal\Projekt\inputs\receptek_url_3.xml'
]

# Gather the format-checked ingredient URLs of every sitemap into one flat
# list, keeping the order of the files and of the entries inside each file.
all_ingredient_urls = [
    ingredient_url
    for sitemap_path in file_paths
    for ingredient_url in IngredientScraper.extract_urls_from_xml(
        sitemap_path, check_url_format=True
    )
]

In [190]:
# Start measuring execution time
start_time = time.time()

# Initialize counters
ingredient_data = []
success_count = 0
total_count = len(all_ingredient_urls)

# Process each ingredient URL
with tqdm(total=total_count, desc="Processing Ingredients") as pbar:
    for i, url in enumerate(all_ingredient_urls, start=1):
        ingredient_scraper = IngredientScraper(url)

        # to_dataframe() always returns one row (filled with "N/A" when the
        # page could not be fetched), so checking `.empty` can never detect a
        # failure. Whether the page was parsed is the reliable signal.
        if ingredient_scraper.soup is not None:
            ingredient_details_df = ingredient_scraper.to_dataframe()
            ingredient_data.append(ingredient_details_df)
            success_count += 1  # Count successful scrapes
        else:
            print(f"Error while processing {url}")

        # Periodically print status updates
        if i % 150 == 0 or i == total_count:
            elapsed_time = time.time() - start_time
            print(f"Processed {i}/{total_count} URLs. Successful ingredients: {success_count}. Elapsed time: {elapsed_time:.2f} seconds.")

        pbar.update(1)  # Update progress bar

# pd.concat rejects an empty list; fail with a message that names the cause.
if not ingredient_data:
    raise RuntimeError("No ingredient page could be scraped; nothing to clean or save.")

# Combine all ingredient data into a single DataFrame
final_ingredient_df = pd.concat(ingredient_data, ignore_index=True)

# Clean the extracted nutritional data
final_ingredient_df_cleaned = DataProcessor.clean_nutrition_data_ingredients(final_ingredient_df)

# Save the cleaned data to CSV
output_path = r"C:\Users\Bálint\Desktop\Asztal\Projekt\outputs\ingredients_cleaned.csv"
final_ingredient_df_cleaned.to_csv(output_path, index=False)

# End measuring execution time
end_time = time.time()
print(f"Total runtime: {end_time - start_time:.2f} seconds.")
# Display the first few rows of the cleaned dataset
final_ingredient_df_cleaned.head()

Processing Ingredients:  10%|█████▋                                                 | 151/1465 [01:44<09:38,  2.27it/s]

Processed 150/1465 URLs. Successful ingredients: 150. Elapsed time: 103.87 seconds.


Processing Ingredients:  20%|███████████▎                                           | 300/1465 [03:26<11:55,  1.63it/s]

Processed 300/1465 URLs. Successful ingredients: 300. Elapsed time: 206.73 seconds.


Processing Ingredients:  31%|████████████████▉                                      | 450/1465 [05:09<10:48,  1.56it/s]

Processed 450/1465 URLs. Successful ingredients: 450. Elapsed time: 309.54 seconds.


Processing Ingredients:  41%|██████████████████████▌                                | 600/1465 [06:55<10:55,  1.32it/s]

Processed 600/1465 URLs. Successful ingredients: 600. Elapsed time: 415.49 seconds.


Processing Ingredients:  51%|████████████████████████████▏                          | 750/1465 [08:37<08:09,  1.46it/s]

Processed 750/1465 URLs. Successful ingredients: 750. Elapsed time: 517.66 seconds.


Processing Ingredients:  61%|█████████████████████████████████▊                     | 900/1465 [10:15<05:52,  1.60it/s]

Processed 900/1465 URLs. Successful ingredients: 900. Elapsed time: 615.87 seconds.


Processing Ingredients:  72%|██████████████████████████████████████▋               | 1050/1465 [11:56<03:43,  1.86it/s]

Processed 1050/1465 URLs. Successful ingredients: 1050. Elapsed time: 716.74 seconds.


Processing Ingredients:  82%|████████████████████████████████████████████▏         | 1200/1465 [13:31<02:45,  1.60it/s]

Processed 1200/1465 URLs. Successful ingredients: 1200. Elapsed time: 811.96 seconds.


Processing Ingredients:  92%|█████████████████████████████████████████████████▊    | 1350/1465 [15:00<01:09,  1.65it/s]

Processed 1350/1465 URLs. Successful ingredients: 1350. Elapsed time: 900.80 seconds.


Processing Ingredients: 100%|██████████████████████████████████████████████████████| 1465/1465 [16:11<00:00,  1.51it/s]

Processed 1465/1465 URLs. Successful ingredients: 1465. Elapsed time: 971.72 seconds.





Total runtime: 973.58 seconds.


Unnamed: 0,Ingredient Name,Primary Category,Secondary Category,Foszfor,Magnézium,Nátrium,Kalcium,B6 vitamin,Kolin,Niacin - B3 vitamin,...,Béta-karotin,Lutein+zeaxantin,K vitamin,A vitamin,Likopin,Alfa-karotin,Béta-kriptoxantin,koleszterin,Retinol,Folsav
0,vargánya gomba,gombák,erdei gomba,0.086,0.009,0.005,0.003,0.104,0.0173,0.003607,...,,,,,,,,,,
1,áfonya,gyümölcsök,bogyós gyümölcs,0.012,0.006,0.001,0.006,0.052,0.006,0.000418,...,3.2e-05,8e-05,1.9e-05,3e-06,,,,,,
2,vöröskáposzta,zöldségek,káposztafélék,0.03,0.016,0.027,0.045,0.209,0.0171,0.000418,...,0.00067,0.000329,3.8e-05,5.6e-05,2e-05,,,,,
3,citrom,gyümölcsök,déli gyümölcs,0.016,0.008,0.002,0.026,0.08,0.0051,0.0001,...,3e-06,1.1e-05,,1e-06,,1e-06,2e-05,,,
4,sárgarépa,zöldségek,gyökérzöldség,0.035,0.012,0.069,0.033,0.138,0.0088,0.000983,...,0.008285,0.000256,1.3e-05,0.000835,1e-06,0.003477,,,,
