## Scraping from Documents

Amy Goldlis, 2025
(adapted from R)

In [1]:
#%pip install python-docx  # the PyPI package "python-docx" provides the `docx` module imported below
from docx import Document
import re
import os
import random


## Class

Let's pull in some recipes:

In [2]:
# Section headers that can follow the method; each one ends the section before it.
_TRAILING_SECTIONS = ("Variations:", "Nutritional Information:", "Source:", "Notes:")


def _find_header(lines, marker, start=0):
    """Return the index of the first line at or after `start` that contains `marker`, or None."""
    return next((i for i in range(start, len(lines)) if marker in lines[i]), None)


def _section_end(lines, header_idx, markers=_TRAILING_SECTIONS):
    """Return the index of the earliest line after `header_idx` containing any marker, or len(lines)."""
    hits = (_find_header(lines, marker, header_idx + 1) for marker in markers)
    return min((hit for hit in hits if hit is not None), default=len(lines))


def read_mom_word_doc(filename: str) -> dict:
    """
    This is a very specific script to extract info from my mom's collection of Word docs.
    Luckily, my mom is super organized, so extracting the data is not too terrible.

    Expected layout, one item per line: recipe name, an optional subtitle, a line
    containing "Ingredients", the ingredient lines, a line containing "Method", the
    method steps, then optionally any of "Variations:", "Nutritional Information:",
    "Source:" or "Notes:". Header matching is a case-sensitive substring test.

    Args:
        filename: Path to the .docx file, passed straight to ``Document``.

    Returns:
        A dict with keys ``"recipe_name"`` (str), ``"subtitle"`` (str, ``""`` when the
        second line is already a section header or does not exist), ``"ingredients"``
        (list of str) and ``"method"`` (list of str). A section whose header is not
        found is returned as an empty list; a document with no text at all yields an
        empty ``"recipe_name"``.
    """
    # Open the .docx file
    doc = Document(filename)

    # One entry per non-blank line. A paragraph whose text contains "\n" is split
    # into separate lines, and surrounding whitespace is dropped.
    recipe_list = [
        line.strip()
        for para in doc.paragraphs
        for line in para.text.split("\n")
        if line.strip()
    ]

    recipe_name = recipe_list[0] if recipe_list else ""

    # Line 0 is always the title, so headers are searched from line 1 onwards; a
    # title such as "Three Ingredients Cookies" must not be mistaken for the header.
    ingredients_idx = _find_header(recipe_list, "Ingredients", 1)
    # The method header can only come after the ingredients header.
    method_search_start = 1 if ingredients_idx is None else ingredients_idx + 1
    method_idx = _find_header(recipe_list, "Method", method_search_start)

    # The second line is a subtitle unless it is itself a section header
    # (this also covers header variants such as "Ingredients:").
    if len(recipe_list) > 1 and 1 not in (ingredients_idx, method_idx):
        recipe_subtitle = recipe_list[1]
    else:
        recipe_subtitle = ""

    # Extract ingredients: everything between the Ingredients header and the Method
    # header (or the next trailing section when there is no Method header).
    # Compare against None explicitly: an index is a valid position even when falsy.
    if ingredients_idx is None:
        ingredients = []
    else:
        if method_idx is not None:
            ingredients_end = method_idx
        else:
            ingredients_end = _section_end(recipe_list, ingredients_idx)
        ingredients = recipe_list[ingredients_idx + 1:ingredients_end]

    # Extract method: stop at whichever trailing section comes first, not only at
    # "Variations:", so Source/Notes/Nutritional lines never leak into the steps.
    if method_idx is None:
        method = []
    else:
        method = recipe_list[method_idx + 1:_section_end(recipe_list, method_idx)]

    # Create a structured dictionary
    new_recipe = {
        "recipe_name": recipe_name,
        "subtitle": recipe_subtitle,
        "ingredients": ingredients,
        "method": method
    }

    return new_recipe



In [3]:
# Example usage: parse two recipe documents (given as relative paths) into
# dicts, then print the whole parsed pancake recipe.
pancake_recipe = read_mom_word_doc("../Recipes/Pancakes.docx")
saffron_chicken = read_mom_word_doc("../Recipes/Saffron Chicken.docx")
print(pancake_recipe)

{'recipe_name': 'Pancakes for Twins', 'subtitle': 'AKA:  Pancakes', 'ingredients': ['1½ cups flour', '1½ tsp. baking powder', '¼ - ½ tsp. salt', '1½ cups milk', '3 tbsp melted butter', '2 large eggs', '3 tbsp maple syrup', '½ tsp. vanilla'], 'method': ['Mix dry ingredients', 'Quickly add wet ingredients, gently stir together just until combined with lumps left', 'Pour onto preheated buttered pan, flip when bubbles form, ready when other side is golden brown', 'Serve to two hungry boys – no syrup necessary. Also surprisingly yummy cold from the fridge as a snack.']}


In [4]:
# Each parsed recipe is a plain dict, so a single field is read by its key.
pancake_recipe['recipe_name']

'Pancakes for Twins'

In [5]:

# "ingredients" holds a list of strings, one per ingredient line in the document.
saffron_chicken['ingredients']

['16 Chicken wings (or 8-10 Thighs/Drumsticks)',
 'Salt / Pepper',
 'Olive Oil',
 '2 Tbsp Butter',
 '1 Large Onion (diced)',
 '3 Cloves Garlic (minced)',
 '2 Tsp Fresh Ginger (minced) or 1 Tsp powdered',
 '½ Tsp Paprika',
 '¼ Tsp Black Pepper (or Cayenne if you want more “zing”)',
 '1 Tsp Powdered Chicken Stock (or 1 Maggi Cube)',
 '2 Tbsp Flour',
 '¼ Tsp Saffron (Get the good stuff straight from Iran)',
 '2 Cups Chicken Broth',
 '3 Tbsp Fresh Parsley or 3 Tsp dried']