In [16]:
import requests
from requests.exceptions import HTTPError
from bs4 import BeautifulSoup as bs


In [None]:
# Collect the recipe cards from every page of the recipe index.
pages=50
# Seconds to wait on the server before giving up. requests applies no timeout
# by default, so without this a stalled connection would hang the cell forever.
request_timeout=10
scraped_recipes=[]
for i in range(1,pages+1):
  try:
    r = requests.get(
      f'https://www.skinnytaste.com/recipe-index/?_paged={i}',
      timeout=request_timeout,
    )
    print(f"Page {i}")
    # Turn 4xx/5xx responses into HTTPError so they are reported below.
    r.raise_for_status()
    soup = bs(r.text,'html.parser')
    recipe_cards=soup.select('.post-content')
    scraped_recipes.extend(recipe_cards)
    print(f"{len(recipe_cards)} items found on page {i}")
    print(f"Items so far {len(scraped_recipes)} ")
    print("\n----------\n")
  except HTTPError as http_error:
    print(f"HTTP error occurred: {http_error} - {i} - request")
  except Exception as err:
    # Timeouts and connection errors land here; the loop moves on to the next page.
    print(f"Unknown occurred: {err} ---- {i} - request")
    
  
  
  

In [None]:

# Total number of recipe cards collected across all scraped index pages.
len(scraped_recipes)

In [None]:
# Build one dict per recipe card: fields read from the card itself, plus
# calories and summary fetched from the recipe's own page.
data=[]
# Seconds to wait on each detail page. requests applies no timeout by default,
# so without this a single stalled connection would hang the whole loop.
detail_timeout = 10

for recipe in scraped_recipes:
    try:
        # Initialize a dictionary to store recipe properties
        recipe_info = {}

        # Name
        title_tag = recipe.find('h2', class_='entry-title')
        recipe_info['name'] = title_tag.text if title_tag else None

        # Image
        image_tag = recipe.find('img')
        recipe_info['image'] = image_tag['src'] if image_tag else None

        # Keys
        recipe_info['keys'] = [li.get_text(strip=True) for li in recipe.find_all('li')]

        # Personal Points
        points_div = recipe.find('div', class_='wprm-ww-points')
        recipe_info['points'] = points_div.text if points_div else None

        # Get the URL of the recipe
        link_tag = recipe.find('a')
        url = link_tag['href'] if link_tag else None

        # Defaults, so every record has the same keys (in the same order)
        # whether or not the detail page can be fetched and parsed.
        recipe_info['calories'] = None
        recipe_info['summary'] = None

        # Fetch the recipe details page
        if url:
            try:
                print(f"Finding details for {recipe_info['name']}")
                recipe_request = requests.get(url, timeout=detail_timeout)
                recipe_request.raise_for_status()
                recipe_soup = bs(recipe_request.text, 'html.parser')

                # Summary: the first paragraph of the page
                summary_paragraph = recipe_soup.find('p')
                recipe_info['summary'] = summary_paragraph.text if summary_paragraph else None

                # Calories: the number after the first ':' in the span text
                calories_span = recipe_soup.find('span', class_='value-calories')
                if calories_span:
                    try:
                        recipe_info['calories'] = float(calories_span.text.split(':')[1].strip())
                    except (IndexError, ValueError) as parse_err:
                        # Keep the recipe; only the calorie value is unusable.
                        print(f"Could not parse calories for {recipe_info['name']}: {parse_err}")
            except requests.HTTPError as http_err:
                print(f"HTTP error occurred while fetching details for {recipe_info['name']}: {http_err}")
            except requests.RequestException as request_err:
                # Timeouts and connection failures: keep the card-level data
                # instead of discarding the whole recipe.
                print(f"Request failed while fetching details for {recipe_info['name']}: {request_err}")

        # Append the recipe info to the list
        data.append(recipe_info)

    except Exception as e:
        print(f"An error occurred while processing the recipe: {e}")



In [None]:
# Number of recipes that were processed without an error and stored in `data`.
len(data)

In [210]:
import pandas as pd

In [211]:
# One row per scraped recipe; columns come from the keys of each recipe dict.
df = pd.DataFrame(data)

In [None]:
# Preview the first rows of the raw (uncleaned) recipe table.
df.head()

In [None]:
# Keep only recipes that have at least one key, a points value and a calorie
# value, then convert the scraped points text to int.
# `.assign` builds a new frame from the filtered rows, so the conversion never
# writes into a slice of `df` (no SettingWithCopyWarning) and re-running the
# cell always starts from the untouched `df`.
has_keys = df['keys'].apply(lambda x: len(x) > 0)
df_cleaned = (
    df[has_keys & df['points'].notna() & df['calories'].notna()]
    .assign(points=lambda frame: frame['points'].apply(lambda x: int(x)))
)

In [212]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# 1. Calories Distribution
# Histogram of calories per recipe in the cleaned table (20 bins).
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df_cleaned['calories'], bins=20, ax=ax)
ax.set_title('Calories Distribution')
ax.set_xlabel('Calories')
ax.set_ylabel('Frequency')
ax.grid()
plt.show()

In [None]:
# Expand the list-valued 'keys' column so that each key gets its own row.
df_cleaned.explode('keys')

In [None]:
# 2. Recipe Key Distribution
# Long format: one row per (recipe, key) pair, so each key can be counted.
keys_exploded = df_cleaned.explode('keys')
# Bars are drawn from the most frequent key to the least frequent one.
key_order = keys_exploded['keys'].value_counts().index

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=keys_exploded, x='keys', order=key_order, ax=ax)
ax.set_title('Recipe Key Distribution')
ax.set_xlabel('Recipe Keys')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
ax.grid(axis='y')
plt.show()

In [None]:
# 3. Points Distribution
# Histogram of points per recipe in the cleaned table (10 bins).
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df_cleaned['points'], bins=10, ax=ax)
ax.set_title('Points Distribution')
ax.set_xlabel('Points')
ax.set_ylabel('Frequency')
ax.grid()
plt.show()

In [111]:
from IPython.display import HTML,display

In [154]:
def image_formatter(url: str):
    """Return an HTML ``<img>`` tag, 60 pixels wide, that shows ``url``.

    Parameters
    ----------
    url : str
        Image address. Any non-string value (for example ``None`` or NaN for a
        recipe without an image) yields an empty string instead of raising.

    Returns
    -------
    str
        The ``<img>`` markup, or ``''`` when there is no usable URL.
    """
    # Local import keeps this cell runnable on its own.
    from html import escape

    if not isinstance(url, str):
        return ''
    # The URL is scraped text: escape it so a quote or angle bracket cannot
    # break out of the src attribute when the table is rendered as raw HTML.
    return '<img src="' + escape(url, quote=True) + '" width="60" >'

In [196]:
def filter_recipe(calorie_range: tuple, point_range: tuple, data: pd.DataFrame):
    """Display the ten highest-calorie recipes inside the given ranges.

    Parameters
    ----------
    calorie_range : tuple
        ``(min, max)`` calorie bounds, both inclusive; values must be int or float.
    point_range : tuple
        ``(min, max)`` point bounds, both inclusive; values must be int or float.
    data : pd.DataFrame
        Recipe table with 'points', 'calories' and 'image' columns. It is not
        modified.

    Returns
    -------
    None
        The table is rendered with IPython's ``display``. Invalid arguments,
        missing columns and empty results are reported with ``print`` rather
        than raised.
    """
    try:
        # Validate the containers BEFORE unpacking them; otherwise a wrong
        # length or type fails inside the unpacking itself and the messages
        # below are never shown.
        if not isinstance(calorie_range, tuple) or not isinstance(point_range, tuple):
            raise ValueError("Calorie range and point range must be tuples.")
        if len(calorie_range) != 2 or len(point_range) != 2:
            raise ValueError("Both ranges must have two values (min, max).")

        # Unpack calorie and point ranges
        min_calorie, max_calorie = calorie_range
        min_point, max_point = point_range

        if not isinstance(min_calorie, (int, float)) or not isinstance(max_calorie, (int, float)):
            raise ValueError("Calorie values must be numeric.")
        if not isinstance(min_point, (int, float)) or not isinstance(max_point, (int, float)):
            raise ValueError("Point values must be numeric.")

        # Ensure ranges are logical
        if min_calorie > max_calorie:
            raise ValueError("Minimum calorie value cannot be greater than maximum calorie value.")
        if min_point > max_point:
            raise ValueError("Minimum point value cannot be greater than maximum point value.")

        # Filter based on the provided ranges (bounds are inclusive)
        in_range = (
            (data["points"] >= min_point) &
            (data["points"] <= max_point) &
            (data["calories"] >= min_calorie) &
            (data["calories"] <= max_calorie)
        )
        filtered_data = data[in_range]

        # Check if any data remains after filtering
        if filtered_data.empty:
            raise ValueError("No recipes found within the specified ranges.")

        # Pick the ten rows to show first, then format only those. `.assign`
        # returns a new frame, so neither the caller's data nor a slice of it
        # is written to.
        top_recipes = (
            filtered_data
            .sort_values(by=['calories'], ascending=False)
            .head(10)
            .assign(image=lambda frame: frame['image'].apply(image_formatter))
        )

        # escape=False so the <img> markup is rendered instead of shown as text
        return display(HTML(top_recipes.to_html(escape=False)))

    except ValueError as ve:
        print(f"ValueError: {ve}")
    except KeyError as ke:
        print(f"KeyError: One or more expected columns are missing: {ke}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
  
  

In [None]:
# Preview the first rows of the cleaned recipe table.
df_cleaned.head()

In [None]:
# Show recipes with 60-200 calories and 3-5 points (both ranges inclusive).
filter_recipe((60,200),(3,5),df_cleaned)