### Functions:
- compile list of recipes
- open files in the list (set) of recipes
- parse ingredient amount and unit from line and add to dict
- create weekly folder and populate with recipes and shopping list

##### Find quantity and unit of ingredient

In [None]:
def findAmount(amtStr):
    """
    Extracts and converts ingredient amount and unit from a string.

    Understands integers ("2"), decimals ("1.5", ".25"), fractions ("1/2")
    and mixed numbers ("1 1/2"), each optionally followed by a unit.

    Args:
        amtStr: String containing the amount and unit, e.g. "1 1/2 CUP".

    Returns:
        the amount as a float (rounded to 2 places) and the unit as a string.
        A unit of "DOZEN" (any case) is folded into the amount (x12) and
        reported as "". When no usable number can be read, the amount is 0.0
        and the stripped input text is returned as the unit so it is not lost.
    """
    # define regex: a leading number, optionally followed by a decimal part,
    # a fraction, or a whitespace-separated fraction (mixed number)
    regex = r"(\d+|\.\d+)(\.\d+\s+\d+\/\d+|\.\d+|\/\d+|\s+\d+\/\d+)?"

    # clean passed string
    amtStr = amtStr.strip()

    found = re.match(regex, amtStr)
    if found is None:
        # nothing the regex understands; still accept whatever float() can
        # read (e.g. "-1"), otherwise hand the text back as the unit
        try:
            return round(float(amtStr), 2), ""
        except ValueError:
            return 0.0, amtStr

    rawAmnt = found.group()
    # everything after the number is the unit; strip() copes with zero,
    # one or several separating spaces
    unit = amtStr[found.end():].strip()

    # split() (no argument) tolerates any run of whitespace between the
    # whole part and the fraction of a mixed number
    pieces = rawAmnt.split()
    try:
        if "/" in pieces[-1]:
            numer, denom = pieces[-1].split("/")
            whole = float(pieces[0]) if len(pieces) > 1 else 0.0
            amount = whole + float(numer) / float(denom)
        else:
            amount = float(rawAmnt)
    except ZeroDivisionError:
        # a fraction such as "1/0" is not a usable amount
        return 0.0, amtStr

    # convert to numbers if dozen is unit (use the parsed amount so that
    # fractions such as "1/2 DOZEN" work too)
    if unit.upper() == "DOZEN":
        amount *= 12
        unit = ""

    # return quantity and unit
    return round(float(amount), 2), unit

##### Find recipe paths

In [None]:
def findRecipes(recipeDir):
    """
    Collects the paths of every ".txt" file found under a directory tree.

    Args:
        recipeDir: Path to the directory containing recipe files.

    Returns:
        A set with one joined path (folder + file name) per ".txt" file found
        in recipeDir or any of its sub-directories.
    """
    # walk bottom-up and keep only the text files
    return {
        os.path.join(folder, entry)
        for folder, _subDirs, entries in os.walk(recipeDir, topdown=False)
        for entry in entries
        if entry.endswith(".txt")
    }

##### Read Recipe and Store Ingredients as List

In [None]:
def readRecipe(fileName):
    """
    Reads a recipe file and extracts the ingredients.

    A recipe file is a series of sections separated by blank lines. The
    second section must start with the line "Ingredients:" (any case); the
    remaining lines of that section are the ingredients.

    Args:
        fileName: Path to the recipe file.

    Returns:
        ingredientList: A list of ingredient strings. Empty (after printing a
        message) when the ingredients section is not the second section,
        including when the file has fewer than two sections.
    """
    # Open the recipe
    with open(fileName, 'r') as recipe:
        # Store the files contents as lists of lines
        lines = [line.strip() for line in recipe]

    # split the recipe by section: every run of non-blank lines is one
    # section, so leading, trailing or repeated blank lines cannot create
    # empty sections or shift the section numbering
    recipeList = [
        list(section)
        for isBlank, section in itertools.groupby(lines, lambda z: z == '')
        if not isBlank
    ]

    # check to see if ingredients are where expected
    # if not where expected, return empty list
    if len(recipeList) < 2 or recipeList[1][0].upper() != "INGREDIENTS:":
        # the recipe name is the second line of the first section; fall back
        # to the file path when the file is too short to contain one
        if recipeList and len(recipeList[0]) > 1:
            recipeName = recipeList[0][1].upper()
        else:
            recipeName = fileName
        print(
            f"Could not find the ingredient's section for the recipe {recipeName}. Recipe files should have ingredients as the second section.")
        return []

    # else, return ingredient list (everything after the section header)
    return recipeList[1][1:]

##### Move Weekly Recipes and write shopping list to weekly dir

In [None]:
def createWeeklyFolder(recipes, shoppingList, storageDir):
    """
    Creates a new folder with the week's recipes and shopping list.

    The folder is named after today's date and placed inside storageDir. An
    existing folder of the same name is deleted first, so re-running on the
    same day replaces the earlier result.

    Args:
        recipes: Set of paths to the recipe files.
        shoppingList: Dictionary mapping each ingredient to a two-item
            sequence: [quantity, unit].
        storageDir: Path to the directory where weekly folders are stored.
            It is created if it does not exist yet.
    """
    # make newDir (remove if already exists)
    newDir = f'{storageDir}/{str(date.today()).split(" ")[0]}'
    if os.path.isdir(newDir):
        shutil.rmtree(newDir)

    # makedirs also creates storageDir itself when it is missing
    os.makedirs(newDir)

    # iterate over each recipe and copy to newDir
    for path in recipes:
        # basename understands the running platform's path separators, so
        # the file name is extracted correctly outside Windows as well
        recipeName = os.path.basename(path)
        dst = f'{newDir}/{recipeName}'
        shutil.copy(path, dst)

    # write shoppingList to txt file, one "ingredient: quantity unit" per line
    with open(f'{newDir}/shoppingList.txt', 'w') as file:
        for ingredient, quantity in shoppingList.items():
            file.write(f'{ingredient}: {quantity[0]} {quantity[1]}\n')

### Put it all together...

In [None]:
"""
Set up imports and directory stuff
"""
# Make imports
import os
import itertools
import re
import shutil
from datetime import datetime as date

# Directory configuration for the shopping-list run
# NOTE(review): no value is assigned to wd on the next line; supply the
# working directory before running this cell
wd = 
# folder handed to createWeeklyFolder for the weekly output
storageDir = "mealPlans"
# sub-folder of wd that findRecipes searches
recipeDir = 'recipes'

# find the recipes as a set of paths
recipeSet = findRecipes(f'{wd}/{recipeDir}')

# open each recipe, and store the ingredients used
ingredientList = []
for recipe in recipeSet:
    ingredientList.extend(readRecipe(recipe))

# regex for a unit spelling -> standard abbreviation. Each ".*" swallows the
# rest of the text, so these must only ever be applied to the quantity part
# of a line. Raw strings keep the regex escapes ("\.\Z") intact.
cleanerDict = {r"tea.*": "tsp", r"table.*": "tbs", r"tbl.*": "tbs",
               r"ounce.*": "oz", r"pound.*": "lb", r"\.\Z": "",
               r"cup.*": "cup"}

# extract each ingredient, adding amounts in a dictionary
shoppingList = {}
for line in ingredientList:
    # lines are expected to look like "item - quantity unit"
    item, separator, quant = line.partition("-")
    if not separator:
        print(f"Skipping ingredient line without a '-' separator: {line}")
        continue

    # standardize the unit names in the quantity only, so ingredient names
    # that contain "tea", "table", "cup", ... are left alone
    for oldStr, newStr in cleanerDict.items():
        quant = re.sub(oldStr, newStr, quant, flags=re.IGNORECASE)

    # make uppercase so the same ingredient always maps to the same key
    item = item.strip().upper()
    amnt, unit = findAmount(quant.upper())

    if item not in shoppingList:
        shoppingList[item] = [amnt, unit.strip()]
    else:
        # amounts are added as plain numbers under the first unit seen;
        # re-round so float noise does not reach the shopping list
        shoppingList[item][0] = round(shoppingList[item][0] + amnt, 2)

# create a folder with the shopping list and recipes
createWeeklyFolder(recipeSet, shoppingList, storageDir)

### Build web scraper
- functions to:
    - find the website domain, i.e., verify the html structure
    - extract data from html using different tag/attribute combos
    - parse the html and return the data

##### Function to find domain

In [1]:
#function to find the domain of a url
def parseDomain(url):
    """
    Extracts the site name from a url, e.g. "allrecipes" from
    "https://www.allrecipes.com/recipe/26472/".

    Only the host part of the url is inspected, so dots in the path are never
    mistaken for the domain. The scheme and a "www." style prefix are
    optional, and host labels may contain hyphens.

    Args:
        url: The url (or bare host) to inspect.

    Returns:
        For hosts with three or more labels ("www.MATCH.com") the second
        label; for two-label hosts ("MATCH.com") the first label; the string
        "bad_url" when no dotted host name can be found.
    """
    # optional scheme, optional "user@", then a dotted host name
    findHost = re.match(
        r"(?:[A-Za-z][A-Za-z0-9+.\-]*://)?(?:[^/?#@]*@)?"
        r"([A-Za-z0-9\-]+(?:\.[A-Za-z0-9\-]+)+)",
        url.strip())

    # report bad url when there is no host to read the domain from
    if not findHost:
        return "bad_url"

    labels = findHost.group(1).split(".")
    # "www.MATCH.com" -> MATCH ; "MATCH.com" -> MATCH
    return labels[1] if len(labels) >= 3 else labels[0]


##### Function to parse html

In [2]:
#function to parse the ingredients and recipe contained
#by a website. htmlKey is a dictionary containing the relevant html tags etc.
def parseRecipeSite(url, htmlKey, timeout=10):
    """
    Downloads a recipe page and extracts its ingredients and directions.

    Args:
        url: Address of the recipe page.
        htmlKey: Pair of (extractor, key) pairs. htmlKey[0] is used for the
            ingredients and htmlKey[1] for the directions; each extractor is
            called as extractor(soup, key).
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a stalled server would block the run forever.

    Returns:
        (ingredientStr, recipeStr) when the page answers with status 200.
        Otherwise a single string describing the failed request, so callers
        must check the type of the result before unpacking it.
    """
    #send request and read html if good request
    page = requests.get(url, timeout=timeout) #read page
    if page.status_code != 200:
        return f'This url "{url}" had a bad request. Error #: {page.status_code}'

    #get the soup
    soup = BeautifulSoup(page.content, 'html.parser')

    #isolate the text from the website and return
    ingredientFinder, ingredientKey = htmlKey[0]
    recipeFinder, recipeKey = htmlKey[1]
    ingredientStr = ingredientFinder(soup, ingredientKey)
    recipeStr = recipeFinder(soup, recipeKey)
    return ingredientStr, recipeStr


##### Functions to extract data from html

In [3]:
#function to find all matching tags; provide the soup and the relevant tags
def findAll(soup, key):
    """
    Collects the text of every element matching a tag/attribute pair.

    Args:
        soup: Parsed page; must offer find_all(tag, attrs).
        key: Sequence whose first item is the tag name and whose second item
            is the attribute filter handed to find_all.

    Returns:
        The stripped text of each match, every one followed by a newline.
        When nothing matches, an error string containing key and soup.
    """
    matches = soup.find_all(key[0], key[1])
    #nothing found: report the tags and the soup that were searched
    if not matches:
        return f"Error with these tags:\n{key}\nAnd this soup:\n{soup}"

    #one line of text per matching tag
    return "".join(tag.text.strip() + "\n" for tag in matches)

#function to find a specific section, then read the text of a certain tag
def find_findAll(soup, key):
    """
    Locates one container element, then collects the text of the tags in it.

    Args:
        soup: Parsed page; must offer find(tag, attrs).
        key: Sequence of (container tag, container attribute filter, tag name
            to collect inside the container).

    Returns:
        The stripped text of each inner tag joined by newlines. When the
        container is missing, an error string containing key and soup.
    """
    container = soup.find(key[0], key[1])

    #no container: report the tags and the soup that were searched
    if not container:
        return f"Error with these tags:\n{key}\nAnd this soup:\n{soup}"

    #gather the text of every requested tag inside the container
    texts = []
    for tag in container.find_all(key[2]):
        texts.append(tag.text.strip())
    return "\n".join(texts)

##### Function to store scraped recipes

In [42]:


def storeScrapedRecipes (file, recipeTuple):
    """
    Writes a scraped recipe to an open file in the recipe text layout.

    Args:
        file: Writable text file. Its name supplies the recipe name: the part
            after the last "/" and before the first ".", with "-" turned into
            spaces.
        recipeTuple: (ingredients text, directions text). Runs of newlines in
            either text are collapsed to a single newline.
    """
    baseName = file.name.split("/")[-1]
    recipeName = baseName.split(".")[0]
    ingredients, recipe = (re.sub(r"\n+", "\n", text) for text in (recipeTuple[0], recipeTuple[1]))
    today = str(date.today()).split(" ")[0]

    #sections in the order they appear in the file
    sections = [
        f'Name:\n{recipeName.replace("-", " ")}\n\n',
        f'Ingredients:\n{ingredients}\n\n',
        f'Directions:\n{recipe}\n\n',
        f'Notes:\nScraped from a website on {today}\n\n',
        f'Date:\n{today}',
    ]
    file.write("".join(sections))

#with open(f'{newDir}/{"https://www.allrecipes.com/recipe/26472/the-best-chicken-soup-ever/".split("/")[-2]}.txt', "w+", encoding="utf-8") as file:
            
            #storeScrapedRecipes(file, ("testIngre", "TestReci"))

##### get urls from file

In [5]:
def findRecipeUrls(scrapeFilePath):
    """
    Reads the urls to scrape from a text file holding one url per line.

    Args:
        scrapeFilePath: Path to the text file.

    Returns:
        A tuple of the urls with surrounding whitespace removed; blank lines
        are skipped. An empty tuple (after printing a message) when the file
        is missing or cannot be read.
    """
    try:
        with open(scrapeFilePath, "r") as file:
            # strip whitespace and drop blank lines, which are not urls
            return tuple(line.strip() for line in file if line.strip())
    except FileNotFoundError:
        print(f"Error: File '{scrapeFilePath}' not found.")
        return tuple()
    except (OSError, UnicodeDecodeError) as error:
        # the file exists but could not be read; say so rather than
        # claiming it is missing
        print(f"Error: File '{scrapeFilePath}' could not be read ({error}).")
        return tuple()


##### Main Web Scraper

In [46]:
import requests
from bs4 import BeautifulSoup
import re
import os
import shutil
from datetime import datetime as date

#set-up html parsing keys
#each key is (extractor function, arguments for that extractor)
#allrecipes key
allRecipe_ingredient = (findAll,('li', {'class':'mntl-structured-ingredients__list-item'}))
allRecipe_recipe = (findAll,("li", {"class" :"comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI"}))

#afamilyfeast key
familyFeast_ingredient = (find_findAll, ('div', {'class':"tasty-recipes-ingredients-body"}, 'li'))
familyFeast_recipe = (findAll, ('li', {'id':re.compile(r'instruction-step-*')}))

#compile keys in a dictionary of nested tuples
#"bad_url" is not a parsing key: it accumulates the urls that could not be scraped
htmlKeys = {"allrecipes":(allRecipe_ingredient,allRecipe_recipe), 
            "afamilyfeast": (familyFeast_ingredient,familyFeast_recipe),
            "bad_url": ""}

#set up folder to house the scraped recipes
wd = ""
scrapedDir = os.path.join(wd, "scrapedRecipes")
newDir = f'{scrapedDir}/{str(date.today()).split(" ")[0]}'
#start from an empty folder when today's folder already exists
if os.path.isdir(newDir):
    shutil.rmtree(newDir)

#makedirs also creates scrapedDir when it is missing
os.makedirs(newDir)

#define web pages of interest
#url = 'https://www.allrecipes.com/recipe/26472/the-best-chicken-soup-ever/\nhttps://www.afamilyfeast.com/strawberry-torte/' 
#urlList = url.split("\n")

urlTuple = findRecipeUrls(f'{scrapedDir}/recipeURLs.txt')
#print(urlTuple)

for url in urlTuple:
    #blank lines in the url file are not urls
    if not url:
        continue

    domain = parseDomain(url)
    if domain == "bad_url":
        htmlKeys["bad_url"] += url +"\n"
        print("BAD")
        continue

    #a valid url from a site without a parsing key cannot be scraped
    if domain not in htmlKeys:
        htmlKeys["bad_url"] += url +"\n"
        print(f'No html key is defined for "{domain}"; skipping {url}')
        continue

    #parseRecipeSite returns a message string instead of a pair when the
    #request fails, so check before unpacking
    scraped = parseRecipeSite(url, htmlKeys[domain])
    if isinstance(scraped, str):
        print(scraped)
        continue
    ingredients, recipe = scraped

    #name the file after the last path segment, with or without a trailing "/"
    recipeSlug = url.rstrip("/").split("/")[-1]
    #print(f'{newDir}/{recipeSlug}')
    with open(f'{newDir}/{recipeSlug}.txt', "w+", encoding="utf-8") as file:
        print("here")
        storeScrapedRecipes(file, (ingredients, recipe))

    

here
here
here
here
here
here
here
here
here


### Test

In [47]:

#Test run of the scraper: build the parsing keys, recreate today's folder,
#then scrape every url listed in recipeURLs.txt

#set-up html parsing keys
#each key is (extractor function, arguments for that extractor)
#allrecipes key
allRecipe_ingredient = (findAll,('li', {'class':'mntl-structured-ingredients__list-item'}))
allRecipe_recipe = (findAll,("li", {"class" :"comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI"}))

#afamilyfeast key
familyFeast_ingredient = (find_findAll, ('div', {'class':"tasty-recipes-ingredients-body"}, 'li'))
familyFeast_recipe = (findAll, ('li', {'id':re.compile(r'instruction-step-*')}))

#compile keys in a dictionary of nested tuples
#the "bad_url" entry is a string that collects the urls parseDomain rejects
htmlKeys = {"allrecipes":(allRecipe_ingredient,allRecipe_recipe), 
            "afamilyfeast": (familyFeast_ingredient,familyFeast_recipe),
            "bad_url": ""}

#set up folder to house the scraped recipes
#NOTE(review): no value is assigned to wd on the next line; supply the
#working directory before running this cell
wd = 
scrapedDir = os.path.join(wd, "scrapedRecipes")
#folder named after today's date inside scrapedDir
newDir = f'{scrapedDir}/{str(date.today()).split(" ")[0]}'
#delete an existing folder for today so the run starts empty
if os.path.isdir(newDir):
    shutil.rmtree(newDir)

os.mkdir(newDir)

#define web pages of interest
#url = 'https://www.allrecipes.com/recipe/26472/the-best-chicken-soup-ever/\nhttps://www.afamilyfeast.com/strawberry-torte/' 
#urlList = url.split("\n")

urlTuple = findRecipeUrls(f'{scrapedDir}/recipeURLs.txt')
#print(urlTuple)

for url in urlTuple:
    domain = parseDomain(url)
    if domain == "bad_url":
        #remember the rejected url and move on
        htmlKeys["bad_url"] += url +"\n"
        print("BAD")
    else:
        #NOTE(review): htmlKeys[domain] raises KeyError for a domain without a key,
        #and the unpacking assumes parseRecipeSite returned a pair
        ingredients, recipe = parseRecipeSite(url, htmlKeys[domain])
        #print(f'{newDir}/{url.split("/")[-2]}')
        #file is named after the second-to-last "/"-separated piece of the url
        with open(f'{newDir}/{url.split("/")[-2]}.txt', "w+", encoding="utf-8") as file:
            print("here")
            storeScrapedRecipes(file, (ingredients, recipe))
            

    

here
here
here
here
here
here
here
here
here
