# Recipe Scraping Analysis 

In [22]:
from pymongo import MongoClient
import pandas as pd 
pd.set_option('display.max_columns', None) 
from IPython.display import display 

In [2]:
# Open a MongoDB client (no connection arguments are passed) and select
# the `recipe_db` database -- change the name to match your own database.
client = MongoClient()
db = client['recipe_db']

# Show which collections (tables) the database contains.
collections = db.list_collection_names()
print(f"Collections: {collections}")

Collections: ['whatsgabycooking', 'sallysbakingaddiction', 'wellplated', 'acouplecooks_recipes', 'wedishitup', 'ahealthysliceoflife', 'abeautifulmess_recipes']


In [3]:
# Flatten every collection of a database into one DataFrame
def load_all_collections(db):
    """Combine the documents of every collection in ``db`` into one DataFrame.

    Args:
        db: Database handle exposing ``list_collection_names()`` and
            item access by collection name, where each collection offers
            ``find()`` (e.g. a pymongo database).

    Returns:
        pandas.DataFrame with one row per document. Each row gains a
        ``_collection`` column naming the collection it came from, and the
        ``_id`` field is removed when present.
    """
    records = []

    for name in db.list_collection_names():
        for record in db[name].find():
            record['_collection'] = name  # remember where the row came from
            record.pop('_id', None)       # drop the `_id` key if it exists
            records.append(record)

    return pd.DataFrame(records)

# Load the documents of every collection in `db` into a single DataFrame
df = load_all_collections(db)

In [23]:
# Preview the first rows of the combined frame
df.head()

Unnamed: 0,canonical_url,host,image,language,site_name,_collection,author,category,cook_time,cuisine,description,ingredient_groups,ingredients,instructions,instructions_list,keywords,nutrients,prep_time,ratings,ratings_count,title,total_time,yields,cooking_method,dietary_restrictions,equipment
1,https://whatsgabycooking.com/14-of-our-favorit...,whatsgabycooking.com,https://whatsgabycooking.com/wp-content/upload...,en-US,What's Gaby Cooking,whatsgabycooking,Gaby Dalkin,Dessert,20.0,American,Strawberry Blueberry Shortcakes are the desser...,"[{'ingredients': ['3 cups all purpose flour', ...","[3 cups all purpose flour, 2 teaspoons lemon z...","For the Biscuits\nCombine the flour, lemon zes...","[For the Biscuits, Combine the flour, lemon ze...","[strawberry shortcake, blueberry shortcake]","{'calories': '731 kcal', 'carbohydrateContent'...",15.0,5.0,1.0,Strawberry Blueberry Shortcakes,35.0,9 servings,,,
2,https://whatsgabycooking.com/raspberry-brownies/,whatsgabycooking.com,https://whatsgabycooking.com/wp-content/upload...,en-US,What's Gaby Cooking,whatsgabycooking,Gaby Dalkin,Dessert,50.0,American,These Raspberry Swirled Brownies are the most ...,[{'ingredients': ['240 grams dark chocolate (a...,[240 grams dark chocolate (approximately 1½ cu...,Preheat the oven to 325° F.\nCombine the choco...,"[Preheat the oven to 325° F., Combine the choc...","[brownies, raspberry brownies, how to make bro...","{'calories': '358 kcal', 'carbohydrateContent'...",10.0,4.67,21.0,Raspberry Swirled Brownies,195.0,16 servings,,,
3,https://whatsgabycooking.com/pineapple-sheet-c...,whatsgabycooking.com,https://whatsgabycooking.com/wp-content/upload...,en-US,What's Gaby Cooking,whatsgabycooking,Gaby Dalkin,Dessert,35.0,American,Pineapple Sheet Cake – the breakfast and desse...,"[{'ingredients': ['2 cups white sugar', '2 egg...","[2 cups white sugar, 2 eggs (at room temperatu...",Preheat the oven to 350 degrees F.\nLine a 9x1...,"[Preheat the oven to 350 degrees F., Line a 9x...","[cream cheese frosting, pineapple sheet cake]","{'calories': '233 kcal', 'carbohydrateContent'...",15.0,4.81,21.0,Pineapple Sheet Cake with Cream Cheese Frosting,50.0,20 servings,,,
4,https://whatsgabycooking.com/magic-bar-brownies/,whatsgabycooking.com,https://whatsgabycooking.com/wp-content/upload...,en-US,What's Gaby Cooking,whatsgabycooking,Gaby Dalkin,"Dessert,Snack",60.0,American,Magic Bar Brownies have been big since before ...,[{'ingredients': ['240 grams dark chocolate (a...,[240 grams dark chocolate (approx 1½ cups regu...,Preheat the oven to 325° F. Line a 9×13 pan wi...,[Preheat the oven to 325° F. Line a 9×13 pan w...,[best brownies],"{'calories': '611 kcal', 'carbohydrateContent'...",30.0,5.0,3.0,Magic Bar Brownies,90.0,16 servings,,,
5,https://whatsgabycooking.com/grilled-peaches-w...,whatsgabycooking.com,https://whatsgabycooking.com/wp-content/upload...,en-US,What's Gaby Cooking,whatsgabycooking,Gaby Dalkin,Dessert,4.0,"Mediterranean,American",There’s really nothing quite like Grilled Peac...,"[{'ingredients': ['2 ripe peaches', '2 ripe ne...","[2 ripe peaches, 2 ripe nectarines, canola oil...",Heat grill to medium high heat.\nBrush the pea...,"[Heat grill to medium high heat., Brush the pe...","[grilled peaches, peaches and ice cream]","{'calories': '166 kcal', 'carbohydrateContent'...",1.0,5.0,2.0,Grilled Peaches with Vanilla Ice Cream,5.0,8 servings,,,


In [5]:
# Data cleaning: inspect the schema, then drop recipes whose ingredients or instructions are missing or empty

In [6]:
# Column dtypes and non-null counts of the combined frame, before filtering
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9848 entries, 0 to 9847
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   canonical_url         9848 non-null   object 
 1   host                  9848 non-null   object 
 2   image                 9823 non-null   object 
 3   language              9848 non-null   object 
 4   site_name             9848 non-null   object 
 5   _collection           9848 non-null   object 
 6   author                8916 non-null   object 
 7   category              8470 non-null   object 
 8   cook_time             6030 non-null   float64
 9   cuisine               8346 non-null   object 
 10  description           8915 non-null   object 
 11  ingredient_groups     8923 non-null   object 
 12  ingredients           8930 non-null   object 
 13  instructions          9000 non-null   object 
 14  instructions_list     9000 non-null   object 
 15  keywords             

In [44]:
# Keep only recipes whose ingredients AND instructions are both usable.
# A value is treated as unusable when it is null or when its string form
# is an empty container ('{}' or '[]').
df = df[
    ~(
        df['ingredients'].isna()
        | df['ingredients'].astype(str).isin(['{}', '[]'])
        | df['instructions'].isna()
        | df['instructions'].astype(str).isin(['{}', '[]'])
    )
]

In [45]:
# Re-check row and non-null counts after dropping recipes without ingredients/instructions
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8819 entries, 1 to 9847
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   canonical_url         8819 non-null   object 
 1   host                  8819 non-null   object 
 2   image                 8819 non-null   object 
 3   language              8819 non-null   object 
 4   site_name             8819 non-null   object 
 5   _collection           8819 non-null   object 
 6   author                8819 non-null   object 
 7   category              8445 non-null   object 
 8   cook_time             5965 non-null   float64
 9   cuisine               8329 non-null   object 
 10  description           8818 non-null   object 
 11  ingredient_groups     8812 non-null   object 
 12  ingredients           8819 non-null   object 
 13  instructions          8819 non-null   object 
 14  instructions_list     8819 non-null   object 
 15  keywords             

In [46]:
# Keep only recipes that actually carry nutrition data.
# A bare `!= {}` comparison evaluates True for missing values (NaN/None),
# so such rows would be kept as if they had nutrients. Exclude them
# explicitly, the same way missing ingredients/instructions are treated
# as absent during cleaning.
nutrition_df = df[df['nutrients'].notna() & (df['nutrients'] != {})]

# Distribution of the nutrition-bearing recipes across category labels
nutrition_df['category'].value_counts()

Main Course                    686
Dessert                        676
Side Dish                      433
Breakfast                      353
Main Dish                      323
                              ... 
Breakfast,Drinks,Smoothie        1
Snacks                           1
Appetizer,Snack,Lunch            1
Main Course,Lunch,Breakfast      1
brunch,Drinks                    1
Name: category, Length: 241, dtype: int64

In [47]:
# Row count and per-column non-null counts for the nutrition subset
nutrition_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5582 entries, 1 to 9847
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   canonical_url         5582 non-null   object 
 1   host                  5582 non-null   object 
 2   image                 5582 non-null   object 
 3   language              5582 non-null   object 
 4   site_name             5582 non-null   object 
 5   _collection           5582 non-null   object 
 6   author                5582 non-null   object 
 7   category              5544 non-null   object 
 8   cook_time             4153 non-null   float64
 9   cuisine               5481 non-null   object 
 10  description           5582 non-null   object 
 11  ingredient_groups     5578 non-null   object 
 12  ingredients           5582 non-null   object 
 13  instructions          5582 non-null   object 
 14  instructions_list     5582 non-null   object 
 15  keywords             

In [66]:
# Show the `ingredient_groups` value of the recipe with index label 1
print(nutrition_df.loc[1, ['ingredient_groups']])

ingredient_groups    [{'ingredients': ['3 cups all purpose flour', ...
Name: 1, dtype: object


In [55]:
# Display the per-recipe ingredient lists of the nutrition subset
nutrition_df['ingredients']

1       [3 cups all purpose flour, 2 teaspoons lemon z...
2       [240 grams dark chocolate (approximately 1½ cu...
3       [2 cups white sugar, 2 eggs (at room temperatu...
4       [240 grams dark chocolate (approx 1½ cups regu...
5       [2 ripe peaches, 2 ripe nectarines, canola oil...
                              ...                        
9836    [2 1/2 pounds apples, ½ cup brown sugar, ½ cup...
9840    [2 ¼ cups cranberry juice cocktail, 1 box cran...
9843    [6 ounces hot coffee, 1-2 teaspoons sugar, 1½ ...
9844    [12 ounces Chex cereal, 1 cup semi-sweet choco...
9847    [3 ounces Prosecco, 2 ounces Campari, 1 ounce ...
Name: ingredients, Length: 5582, dtype: object

In [52]:
# Narrow the nutrition subset to the ingredients, nutrients and dietary-restriction columns
indr = nutrition_df[['ingredients', 'nutrients', 'dietary_restrictions']]