# Recipe Scraping Analysis 

In [21]:
from pymongo import MongoClient
import pandas as pd 
pd.set_option('display.max_columns', None) 
from IPython.display import display 
from ingredient_parser import parse_ingredient
import regex as re 

In [2]:
# Connect to MongoDB
client = MongoClient()
db = client['recipe_db']  # Replace with your database name

# List all collections (tables)
collections = db.list_collection_names()
print("Collections:", collections)

Collections: ['whatsgabycooking', 'sallysbakingaddiction', 'wellplated', 'acouplecooks_recipes', 'wedishitup', 'ahealthysliceoflife', 'abeautifulmess_recipes']


In [3]:
# Function to convert all collections into a single DataFrame
def load_all_collections(db):
    all_data = []  # List to store all documents

    for collection_name in db.list_collection_names():
        collection = db[collection_name]  # Access collection
        documents = list(collection.find())  # Convert to list
        for doc in documents:
            doc['_collection'] = collection_name  # Track source collection
            doc.pop('_id', None)  # Remove MongoDB's default `_id`
        all_data.extend(documents)

    # Convert list of dicts to DataFrame
    df = pd.DataFrame(all_data)
    return df

# Load into DataFrame
df = load_all_collections(db)

In [4]:
df.head()

Unnamed: 0,canonical_url,host,image,language,site_name,_collection,author,category,cook_time,cuisine,description,ingredient_groups,ingredients,instructions,instructions_list,keywords,nutrients,prep_time,ratings,ratings_count,title,total_time,yields,cooking_method,dietary_restrictions,equipment
0,https://whatsgabycooking.com/meal-plan-week-20/,whatsgabycooking.com,https://whatsgabycooking.com/wp-content/upload...,en-US,What's Gaby Cooking,whatsgabycooking,,,,,,,,,,,,,,,,,,,,
1,https://whatsgabycooking.com/14-of-our-favorit...,whatsgabycooking.com,https://whatsgabycooking.com/wp-content/upload...,en-US,What's Gaby Cooking,whatsgabycooking,Gaby Dalkin,Dessert,20.0,American,Strawberry Blueberry Shortcakes are the desser...,"[{'ingredients': ['3 cups all purpose flour', ...","[3 cups all purpose flour, 2 teaspoons lemon z...","For the Biscuits\nCombine the flour, lemon zes...","[For the Biscuits, Combine the flour, lemon ze...","[strawberry shortcake, blueberry shortcake]","{'calories': '731 kcal', 'carbohydrateContent'...",15.0,5.0,1.0,Strawberry Blueberry Shortcakes,35.0,9 servings,,,
2,https://whatsgabycooking.com/raspberry-brownies/,whatsgabycooking.com,https://whatsgabycooking.com/wp-content/upload...,en-US,What's Gaby Cooking,whatsgabycooking,Gaby Dalkin,Dessert,50.0,American,These Raspberry Swirled Brownies are the most ...,[{'ingredients': ['240 grams dark chocolate (a...,[240 grams dark chocolate (approximately 1½ cu...,Preheat the oven to 325° F.\nCombine the choco...,"[Preheat the oven to 325° F., Combine the choc...","[brownies, raspberry brownies, how to make bro...","{'calories': '358 kcal', 'carbohydrateContent'...",10.0,4.67,21.0,Raspberry Swirled Brownies,195.0,16 servings,,,
3,https://whatsgabycooking.com/pineapple-sheet-c...,whatsgabycooking.com,https://whatsgabycooking.com/wp-content/upload...,en-US,What's Gaby Cooking,whatsgabycooking,Gaby Dalkin,Dessert,35.0,American,Pineapple Sheet Cake – the breakfast and desse...,"[{'ingredients': ['2 cups white sugar', '2 egg...","[2 cups white sugar, 2 eggs (at room temperatu...",Preheat the oven to 350 degrees F.\nLine a 9x1...,"[Preheat the oven to 350 degrees F., Line a 9x...","[cream cheese frosting, pineapple sheet cake]","{'calories': '233 kcal', 'carbohydrateContent'...",15.0,4.81,21.0,Pineapple Sheet Cake with Cream Cheese Frosting,50.0,20 servings,,,
4,https://whatsgabycooking.com/magic-bar-brownies/,whatsgabycooking.com,https://whatsgabycooking.com/wp-content/upload...,en-US,What's Gaby Cooking,whatsgabycooking,Gaby Dalkin,"Dessert,Snack",60.0,American,Magic Bar Brownies have been big since before ...,[{'ingredients': ['240 grams dark chocolate (a...,[240 grams dark chocolate (approx 1½ cups regu...,Preheat the oven to 325° F. Line a 9×13 pan wi...,[Preheat the oven to 325° F. Line a 9×13 pan w...,[best brownies],"{'calories': '611 kcal', 'carbohydrateContent'...",30.0,5.0,3.0,Magic Bar Brownies,90.0,16 servings,,,


In [5]:
# data cleaning

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9848 entries, 0 to 9847
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   canonical_url         9848 non-null   object 
 1   host                  9848 non-null   object 
 2   image                 9823 non-null   object 
 3   language              9848 non-null   object 
 4   site_name             9848 non-null   object 
 5   _collection           9848 non-null   object 
 6   author                8916 non-null   object 
 7   category              8470 non-null   object 
 8   cook_time             6030 non-null   float64
 9   cuisine               8346 non-null   object 
 10  description           8915 non-null   object 
 11  ingredient_groups     8923 non-null   object 
 12  ingredients           8930 non-null   object 
 13  instructions          9000 non-null   object 
 14  instructions_list     9000 non-null   object 
 15  keywords             

In [7]:
df = df[
    ~((df['ingredients'].isna()) | df['ingredients'].astype(str).isin(['{}', '[]'])) & 
    ~((df['instructions'].isna()) | df['instructions'].astype(str).isin(['{}', '[]']))
]

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8819 entries, 1 to 9847
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   canonical_url         8819 non-null   object 
 1   host                  8819 non-null   object 
 2   image                 8819 non-null   object 
 3   language              8819 non-null   object 
 4   site_name             8819 non-null   object 
 5   _collection           8819 non-null   object 
 6   author                8819 non-null   object 
 7   category              8445 non-null   object 
 8   cook_time             5965 non-null   float64
 9   cuisine               8329 non-null   object 
 10  description           8818 non-null   object 
 11  ingredient_groups     8812 non-null   object 
 12  ingredients           8819 non-null   object 
 13  instructions          8819 non-null   object 
 14  instructions_list     8819 non-null   object 
 15  keywords             

In [9]:
nutrition_df = df[df['nutrients']!= {}] # 5k recipes not bad 
nutrition_df['category'].value_counts()

Main Course                    686
Dessert                        676
Side Dish                      433
Breakfast                      353
Main Dish                      323
                              ... 
Breakfast,Drinks,Smoothie        1
Snacks                           1
Appetizer,Snack,Lunch            1
Main Course,Lunch,Breakfast      1
brunch,Drinks                    1
Name: category, Length: 241, dtype: int64

In [10]:
nutrition_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5582 entries, 1 to 9847
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   canonical_url         5582 non-null   object 
 1   host                  5582 non-null   object 
 2   image                 5582 non-null   object 
 3   language              5582 non-null   object 
 4   site_name             5582 non-null   object 
 5   _collection           5582 non-null   object 
 6   author                5582 non-null   object 
 7   category              5544 non-null   object 
 8   cook_time             4153 non-null   float64
 9   cuisine               5481 non-null   object 
 10  description           5582 non-null   object 
 11  ingredient_groups     5578 non-null   object 
 12  ingredients           5582 non-null   object 
 13  instructions          5582 non-null   object 
 14  instructions_list     5582 non-null   object 
 15  keywords             

In [11]:
print(nutrition_df['ingredient_groups'].loc[1])

[{'ingredients': ['3 cups all purpose flour', '2 teaspoons lemon zest', '3 tablespoons granulated sugar', '1 1/2 tablespoons baking powder', '1 teaspoon salt', '12 tablespoons cold unsalted butter (cut into small pieces)', '1 1/2 cups heavy cream', '1 1/2 teaspoons vanilla extract'], 'purpose': 'For the Shortcakes'}, {'ingredients': ['1 pound strawberries (tops removed and sliced)', '1/3 cup white sugar', '1 pint blueberries'], 'purpose': 'For the Strawberries + Blueberries'}, {'ingredients': ['2 cups heavy cream', '2/3 cup powdered sugar', '2 teaspoons vanilla extract'], 'purpose': 'For the Cream'}]


In [12]:
nutrition_df['ingredients'].loc[1]

['3 cups all purpose flour',
 '2 teaspoons lemon zest',
 '3 tablespoons granulated sugar',
 '1 1/2 tablespoons baking powder',
 '1 teaspoon salt',
 '12 tablespoons cold unsalted butter (cut into small pieces)',
 '1 1/2 cups heavy cream',
 '1 1/2 teaspoons vanilla extract',
 '1 pound strawberries (tops removed and sliced)',
 '1/3 cup white sugar',
 '1 pint blueberries',
 '2 cups heavy cream',
 '2/3 cup powdered sugar',
 '2 teaspoons vanilla extract']

In [13]:
indr = nutrition_df[['ingredients', 'nutrients', 'dietary_restrictions']]
indr.head()

Unnamed: 0,ingredients,nutrients,dietary_restrictions
1,"[3 cups all purpose flour, 2 teaspoons lemon z...","{'calories': '731 kcal', 'carbohydrateContent'...",
2,[240 grams dark chocolate (approximately 1½ cu...,"{'calories': '358 kcal', 'carbohydrateContent'...",
3,"[2 cups white sugar, 2 eggs (at room temperatu...","{'calories': '233 kcal', 'carbohydrateContent'...",
4,[240 grams dark chocolate (approx 1½ cups regu...,"{'calories': '611 kcal', 'carbohydrateContent'...",
5,"[2 ripe peaches, 2 ripe nectarines, canola oil...","{'calories': '166 kcal', 'carbohydrateContent'...",


In [24]:
# establish all ingredient names 

def extract_text_with_spaces(string):
    """Extract only letters, commas, and spaces from a given string."""
    return ' '.join(re.findall(r'[A-Za-z,]+', string))

In [31]:
ingredient_lists = indr['ingredients'].to_list()
ingredients_list = []
for i in ingredient_lists:
    ingredients_stripped = extract_text_with_spaces(str(i)).split(',')
    ingredients_list+=ingredients_stripped
    
all_ingredients = list(set(ingredients_list))
all_ingredients[:10]

['',
 ' pound bacon cooked and crumbled ',
 ' when in season ',
 ' cups shredded Monterey Jack cheese ',
 'cups farro ',
 ' Gala',
 ' large green zucchini squash ',
 ' pounds Yukon gold potatoes ',
 ' sour cream or vegan sour cream or cashew cream ',
 ' cup shredded Monterey Jack cheese ']

In [57]:
unique_ingredient_columns = set()
fixed_ings = []
for ing in all_ingredients:
    info = parse_ingredient(ing)
    if info.name != []:
        info_name = info.name[0].text.lower()
        if info_name not in unique_ingredient_columns and info_name not in fixed_ings :
            if info.name[0].confidence < .80:
                print(f"ingredient in recipe: {ing}\nproposed ingredient: {info_name}")
                response = input("if changing ingredient type new name, else enter")
                if response == '':
                    unique_ingredient_columns.add(info_name)
                elif response[0] == '/':
                    continue
                else:
                    unique_ingredient_columns.add(response[1:])
            else:
                unique_ingredient_columns.add(info_name)

ingredient in recipe:  when in season 
proposed ingredient: when in season


if changing ingredient type new name, else hit space /


ingredient in recipe:  cup half and half or heavy cream if you want to make this outrageously decadent 
proposed ingredient: half and half


if changing ingredient type new name, else hit space 


IndexError: string index out of range

In [16]:
# break out recipes by nutritional info 
macros_df = pd.json_normalize(indr['nutrients'])
macros_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5582 entries, 0 to 5581
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   calories               5578 non-null   object
 1   carbohydrateContent    5548 non-null   object
 2   proteinContent         5522 non-null   object
 3   fatContent             5516 non-null   object
 4   saturatedFatContent    5426 non-null   object
 5   transFatContent        3770 non-null   object
 6   cholesterolContent     4753 non-null   object
 7   sodiumContent          4190 non-null   object
 8   fiberContent           5382 non-null   object
 9   sugarContent           5492 non-null   object
 10  unsaturatedFatContent  2963 non-null   object
 11  servingSize            3963 non-null   object
dtypes: object(12)
memory usage: 523.4+ KB


In [17]:
macros_func = macros_df[~((macros_df['calories'].isna()) | (macros_df['carbohydrateContent'].isna()) | (macros_df['proteinContent'].isna()) | (macros_df['fatContent'].isna()))]

In [18]:
# make note of dietary restrictions 
indr[indr['dietary_restrictions'].notna()]
# only present for 1250 recipes 

Unnamed: 0,ingredients,nutrients,dietary_restrictions
4259,[1 ball Best Pizza Dough (or Thin Crust Dough)...,"{'servingSize': '1 slice', 'calories': '121 ca...",[Vegetarian Diet]
4260,"[¾ cup unsalted cashews, 1 medium yellow onion...","{'calories': '380 calories', 'sugarContent': '...",[Vegan Diet]
4263,[1 ½ ounces (3 tablespoons) tequila blanco or ...,"{'calories': '166 calories', 'sugarContent': '...",[Vegan Diet]
4264,"[1 pound ripe strawberries, 3 tablespoons suga...","{'calories': '310 calories', 'sugarContent': '...",[Vegetarian Diet]
4267,"[1 ¾ cups [245 g] all-purpose flour, ½ cup [10...","{'calories': '267 calories', 'sugarContent': '...",[Vegetarian Diet]
...,...,...,...
7494,"[1 ¼ cup all-purpose flour, 3/4 cup light brow...","{'calories': '207 calories', 'sugarContent': '...",[Vegetarian Diet]
7499,"[1 cup dried chickpeas (not cooked or canned),...","{'servingSize': '4 falafel', 'calories': '258 ...",[Vegan Diet]
7502,"[⅓ cup hummus, 8 English cucumber slices (or s...","{'calories': '552 calories', 'sugarContent': '...",[Vegan Diet]
7505,[2 cups Old Fashioned rolled oats (don’t subst...,"{'calories': '293 calories', 'sugarContent': '...",[Vegan Diet]
