In [117]:
%pip install eep153_tools
%pip install python_gnupg
%pip install -U gspread_pandas
#load in file from class
def format_id(id,zeropadding=0):
    """Nice string format for any id, string or numeric.

    Optional zeropadding parameter takes an integer
    formats as {id:0z} where
    """
    if pd.isnull(id) or id in ['','.']: return None

    try:  # If numeric, return as string int
        return ('%d' % id).zfill(zeropadding)
    except TypeError:  # Not numeric
        return id.split('.')[0].strip().zfill(zeropadding)
    except ValueError:
        return None

#data_url = "https://docs.google.com/spreadsheets/d/12Z4n8HbFZRYvH6B-D8EDLDibRiL50zNMlSBLMJ41C1o/"
data_url = "https://docs.google.com/spreadsheets/d/1GTo423_gUJe1Von9jypWAbC0zSQ7WGegAWPuRi7eJAI/edit?gid=1410082681#gid=1410082681"

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [118]:
import pandas as pd
import re 
from eep153_tools.sheets import read_sheets

#create recipes df
recipes = read_sheets(data_url, sheet="recipes")
recipes = (recipes
           .assign(parent_foodcode = lambda df: df["parent_foodcode"].apply(format_id),
                   ingred_code = lambda df: df["ingred_code"].apply(format_id))
           .rename(columns={"parent_desc": "recipe"}))
recipes.head()
#recipes.shape


Unnamed: 0,parent_foodcode,recipe,ingred_code,ingred_desc,ingred_wt
0,11340000,"Imitation milk, non-soy, sweetened",43543,"Milk, imitation, non-soy",100.0
1,11460150,"Yogurt, frozen, NS as to flavor, lowfat milk",1298,"Yogurt, frozen, flavors other than chocolate, ...",100.0
2,11460160,"Yogurt, frozen, chocolate, lowfat milk",1117,"Yogurt, plain, low fat, 12 grams protein per 8...",81.8
3,11460160,"Yogurt, frozen, chocolate, lowfat milk",19166,"Cocoa, dry powder, unsweetened, processed with...",5.2
4,11460160,"Yogurt, frozen, chocolate, lowfat milk",19335,"Sugars, granulated",13.0


In [119]:
#check the data types
recipes.dtypes

parent_foodcode     object
recipe              object
ingred_code         object
ingred_desc         object
ingred_wt          float64
dtype: object

In [120]:
#List of non-vegan keywords
NON_VEGAN_KEYWORDS = [
    "beef", "pork", "chicken", "turkey", "fish", "seafood", "shellfish", "shrimp", "crab","crabs",
    "lamb", "goat", "duck", "goose", "tuna", "salmon", "cod", "bacon", "ham",
    "shellfish", "lobster", "mussels", "oysters", "scallops", "octopus", "eel",
    "organ meat", "milk","Eggnog" "cheese", "butter", "cream","ice cream", "yogurt", "whey",
    "casein", "lactose", "ghee", "buttermilk", "egg", "eggs", "mayo", "mayonnaise", "albumen",
    "albumin", "lysozyme", "ovomucoid", "ovomucin", "ovovitellin", "honey",
    "bee pollen", "royal jelly", "propolis", "shellac", "confectioner’s glaze",
    "carmine", "cochineal", "lard", "tallow", "suet", "gelatin", "collagen",
    "isinglass", "bone broth", "bone stock", "fish sauce", "oyster sauce",
    "shrimp paste", "worcestershire sauce", "anchovies", "rennet", "pepsin",
    "bone char", "vitamin d3", "lanolin", "omega-3 fish oil", "caseinate",
    "lecithin (egg)", "cysteine", "l-cysteine", "glycerin (animal)",
    "glycerol (animal)", "stearic acid (animal)", "tallowate", "sodium tallowate",
    "capric acid", "caprylic acid", "cheese", "pudding", "processed", "veal",'sirloin', "steak", "animal",
    "Custard", "Mousse", "chocolate", "Meatballs", "meat", "Gravy", "poultry","baby"
]

#this partal match: "milkshake" or "eggroll" will get flagged (since "milk" or "egg" is in the keyword list).
NON_VEGAN_PATTERN = re.compile(
    '|'.join(map(re.escape, NON_VEGAN_KEYWORDS)),
    re.IGNORECASE
)

def filter_vegan_ingredients(df: pd.DataFrame) -> pd.DataFrame:
    # 1) Convert to string, lowercase, remove punctuation
    df["recipe"] = df["recipe"].astype(str).str.lower().fillna("")
    df["recipe"] = df["recipe"].str.replace(r"[^\w\s]", "", regex=True)

    df["ingred_desc"] = df["ingred_desc"].astype(str).str.lower().fillna("")
    df["ingred_desc"] = df["ingred_desc"].str.replace(r"[^\w\s]", "", regex=True)

    # 2) Create a mask for rows that do NOT contain non-vegan keywords
    mask = ~(df["recipe"].str.contains(NON_VEGAN_PATTERN, na=False, regex=True) |
             df["ingred_desc"].str.contains(NON_VEGAN_PATTERN, na=False, regex=True))

    return df[mask]

In [121]:
vegan_recipes = filter_vegan_ingredients(recipes)
vegan_recipes.shape

(12809, 5)

In [136]:
#start copying code from mini lecture VEGAN

#create nutrition df
nutrition = (read_sheets(data_url, sheet="nutrients")
             .assign(ingred_code = lambda df: df["ingred_code"].apply(format_id)))

display(nutrition.head())
nutrition.columns
nutrition.shape



# normalize weights to percentage terms. 
vegan_recipes['ingred_wt'] = vegan_recipes['ingred_wt']/vegan_recipes.groupby(['parent_foodcode'])['ingred_wt'].transform("sum")

# we're going to extend the recipes data frame to include the nutrient profiles of its ingredients (in 100g)
df_vegan = vegan_recipes.merge(nutrition, how="left", on="ingred_code")

# multiply all nutrients per 100g of an ingredient by the weight of that ingredient in a recipe.
numeric_cols = list(df_vegan.select_dtypes(include=["number"]).columns)
numeric_cols.remove("ingred_wt")
df_vegan[numeric_cols] = df_vegan[numeric_cols].mul(df_vegan["ingred_wt"], axis=0)

# sum nutrients of food codes (over the multiple ingredients)
# python tip: one can merge dictionaries dict1 dict2 using **, that is: dict_merge = {**dict1, **dict2}. The ** effectively "unpacks" the key value pairs in each dictionary
df_vegan = df_vegan.groupby('parent_foodcode').agg({**{col: "sum" for col in numeric_cols},
                                        "recipe": "first"})

df_vegan.index.name = "recipe_id"

food_names = df_vegan["recipe"]
print(food_names.head())
df_vegan.head()
df_vegan.shape

Unnamed: 0,ingred_code,Ingredient description,Capric acid,Lauric acid,Myristic acid,Palmitic acid,Palmitoleic acid,Stearic acid,Oleic acid,Linoleic Acid,...,Vitamin B12,"Vitamin B-12, added",Vitamin B6,Vitamin C,Vitamin D,Vitamin E,"Vitamin E, added",Vitamin K,Water,Zinc
0,1001,"Butter, salted",2.529,2.587,7.436,21.697,0.961,9.999,19.961,2.728,...,0.17,0.0,0.003,0.0,0.0,2.32,0.0,7.0,15.87,0.09
1,1002,"Butter, whipped, with salt",2.039,2.354,7.515,20.531,1.417,7.649,17.37,2.713,...,0.07,0.0,0.008,0.0,0.0,1.37,0.0,4.6,16.72,0.05
2,1003,"Butter oil, anhydrous",2.495,2.793,10.005,26.166,2.228,12.056,25.026,2.247,...,0.01,0.0,0.001,0.0,0.0,2.8,0.0,8.6,0.24,0.01
3,1004,"Cheese, blue",0.601,0.491,3.301,9.153,0.816,3.235,6.622,0.536,...,1.22,0.0,0.166,0.0,0.5,0.25,0.0,2.4,42.41,2.66
4,1005,"Cheese, brick",0.585,0.482,3.227,8.655,0.817,3.455,7.401,0.491,...,1.26,0.0,0.065,0.0,0.5,0.26,0.0,2.5,41.11,2.6


recipe_id
11115400                   kefir ns as to fat content
11440060                                 tzatziki dip
11551050                            licuado or batido
11553100                           fruit smoothie nfs
11553110    fruit smoothie with whole fruit and dairy
Name: recipe, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vegan_recipes['ingred_wt'] = vegan_recipes['ingred_wt']/vegan_recipes.groupby(['parent_foodcode'])['ingred_wt'].transform("sum")


(4351, 66)

In [137]:
# NONVEGAN

#create NONVEGAN nutrition df
nutrition = (read_sheets(data_url, sheet="nutrients")
             .assign(ingred_code = lambda df: df["ingred_code"].apply(format_id)))

display(nutrition.head())
nutrition.columns
nutrition.shape



# normalize weights to percentage terms. 
recipes['ingred_wt'] = recipes['ingred_wt']/recipes.groupby(['parent_foodcode'])['ingred_wt'].transform("sum")

# we're going to extend the recipes data frame to include the nutrient profiles of its ingredients (in 100g)
df_non_vegan = recipes.merge(nutrition, how="left", on="ingred_code")

# multiply all nutrients per 100g of an ingredient by the weight of that ingredient in a recipe.
numeric_cols = list(df_non_vegan.select_dtypes(include=["number"]).columns)
numeric_cols.remove("ingred_wt")
df_non_vegan[numeric_cols] = df_non_vegan[numeric_cols].mul(df_non_vegan["ingred_wt"], axis=0)

# sum nutrients of food codes (over the multiple ingredients)
# python tip: one can merge dictionaries dict1 dict2 using **, that is: dict_merge = {**dict1, **dict2}. The ** effectively "unpacks" the key value pairs in each dictionary
df_non_vegan = df_non_vegan.groupby('parent_foodcode').agg({**{col: "sum" for col in numeric_cols},
                                        "recipe": "first"})

df_non_vegan.index.name = "recipe_id"

food_names = df_non_vegan["recipe"]
print(food_names.head())
df_non_vegan.head()
df_non_vegan.shape


Unnamed: 0,ingred_code,Ingredient description,Capric acid,Lauric acid,Myristic acid,Palmitic acid,Palmitoleic acid,Stearic acid,Oleic acid,Linoleic Acid,...,Vitamin B12,"Vitamin B-12, added",Vitamin B6,Vitamin C,Vitamin D,Vitamin E,"Vitamin E, added",Vitamin K,Water,Zinc
0,1001,"Butter, salted",2.529,2.587,7.436,21.697,0.961,9.999,19.961,2.728,...,0.17,0.0,0.003,0.0,0.0,2.32,0.0,7.0,15.87,0.09
1,1002,"Butter, whipped, with salt",2.039,2.354,7.515,20.531,1.417,7.649,17.37,2.713,...,0.07,0.0,0.008,0.0,0.0,1.37,0.0,4.6,16.72,0.05
2,1003,"Butter oil, anhydrous",2.495,2.793,10.005,26.166,2.228,12.056,25.026,2.247,...,0.01,0.0,0.001,0.0,0.0,2.8,0.0,8.6,0.24,0.01
3,1004,"Cheese, blue",0.601,0.491,3.301,9.153,0.816,3.235,6.622,0.536,...,1.22,0.0,0.166,0.0,0.5,0.25,0.0,2.4,42.41,2.66
4,1005,"Cheese, brick",0.585,0.482,3.227,8.655,0.817,3.455,7.401,0.491,...,1.26,0.0,0.065,0.0,0.5,0.26,0.0,2.5,41.11,2.6


recipe_id
11000000                      milk human
11100000                        milk nfs
11111000                      milk whole
11111100           milk low sodium whole
11111150    milk calcium fortified whole
Name: recipe, dtype: object


(8888, 66)

In [138]:
prices = read_sheets(data_url, sheet="prices")[["food_code", "year", "price"]]

prices["food_code"] = prices["food_code"].apply(format_id)

prices = prices.set_index(["year", "food_code"])
print(prices.index.levels[0])

# we'll focus on the latest price data
prices = prices.xs("2017/2018", level="year")

# drop rows of prices where the price is "NA"
prices = prices.dropna(subset="price")

print(f"We have prices for {prices.shape[0]} unique recipes (FNDDS food codes)")

Index(['2011/2012', '2013/2014', '2015/2016', '2017/2018'], dtype='object', name='year')
We have prices for 4435 unique recipes (FNDDS food codes)


In [139]:
#add diet requirements

rda = read_sheets(data_url, sheet="rda")

rda = rda.set_index("Nutrient")

rda.columns, rda.head()

(Index(['Nutrient Type', 'Unit', 'Constraint Type', 'Female_19_30',
        'Female_endurance', 'Male_19_30', 'Male_endurance'],
       dtype='object'),
               Nutrient Type  Unit Constraint Type  Female_19_30  \
 Nutrient                                                          
 Energy                Macro  kcal             RDA        2000.0   
 Protein               Macro     g             RDA          46.0   
 Carbohydrate          Macro     g             RDA         130.0   
 Dietary Fiber         Macro     g             RDA          28.0   
 Linoleic Acid         Macro     g              AI          12.0   
 
                Female_endurance  Male_19_30  Male_endurance  
 Nutrient                                                     
 Energy                   2800.0      2400.0          3500.0  
 Protein                    80.0        56.0           112.0  
 Carbohydrate              488.0       130.0           560.0  
 Dietary Fiber              28.0        33.6          

In [140]:
common_recipes = df.index.intersection(prices.index)

# python tip: given a list of indices, "loc" both subsets and sorts. 
df = df.loc[common_recipes]
prices = prices.loc[common_recipes]

# lets remap the price dataframe index to be the actual food names.
prices.index = prices.index.map(food_names)

A_all = df.T

print(prices.head())
print(A_all.head())

                                              price
kefir ns as to fat content                 0.345625
tzatziki dip                               1.217789
licuado or batido                          0.189099
fruit smoothie nfs                         0.462558
fruit smoothie with whole fruit and dairy  0.402191
                 11115400  11440060  11551050  11553100  11553110  11553120  \
Capric acid        0.0195  0.005673   0.00048  0.000266  0.000266       0.0   
Lauric acid         0.026  0.000273  0.000959  0.000531  0.000531       0.0   
Myristic acid      0.0945  0.000545  0.000959  0.000531  0.000531       0.0   
Palmitic acid      0.2805    5.7604  0.053557  0.039154  0.039149  0.016415   
Palmitoleic acid   0.0185  0.638909  0.005183  0.003332  0.003332   0.00092   

                 11710051 11710055  11710357  11710358  ... 95312410 95312560  \
Capric acid         0.689    0.689  0.078597  0.078597  ...      0.0      0.0   
Lauric acid         0.023    0.023  0.486647  0.486

In [141]:
# pick a demographic (column from rda dataframe)
'''
select from 
['Female_19_30', 'Female_endurance', 'Male_19_30', 'Male_endurance']
'''
group = "Male_endurance"

# create lower bounds and upper bounds.
bmin = rda.loc[rda['Constraint Type'].isin(['RDA', 'AI']), group]
bmax = rda.loc[rda['Constraint Type'].isin(['UL']), group]

# reindex ensures we only keep nutrients in bmin/bmax
Amin = A_all.reindex(bmin.index).dropna(how='all')
Amax = A_all.reindex(bmax.index).dropna(how='all')

b = pd.concat([bmin, -bmax])
A = pd.concat([Amin, -Amax])

#python tip: by typing "=" after the variable name inside the curly braces, it formats the output so we don't have to write f"variable = {variable}"
print(f"{bmin.shape=}")
print(f"{Amin.shape=}")
print(f"{bmax.shape=}")
print(f"{Amax.shape=}")
print(f"{b.shape=}")
print(f"{A.shape=}")
print(f"{prices.shape=}")

bmin.shape=(26,)
Amin.shape=(26, 1974)
bmax.shape=(1,)
Amax.shape=(1, 1974)
b.shape=(27,)
A.shape=(27, 1974)
prices.shape=(1974, 1)


In [142]:
from  scipy.optimize import linprog as lp
import numpy as np
p = prices
tol = 1e-6 # Numbers in solution smaller than this (in absolute value) treated as zeros
result = lp(p, -A, -b, method='highs')

#result

In [143]:
print(f"Cost of diet for {group} is ${result.fun:.2f} per day.")

Cost of diet for Male_endurance is $3.90 per day.


In [130]:
# lets mess with the index on price df so they are recipe names not ids.

# get the result x in a series with food names
diet = pd.Series(result.x,index=prices.index)

print("\nYou'll be eating (in 100s of grams or milliliters):")
print(round(diet[diet >= tol], 2))


You'll be eating (in 100s of grams or milliliters):
split peas from dried fat added                             4.79
vermicelli made from soybeans                               3.28
peanuts unroasted                                           1.47
pancakes whole grain reduced fat from frozen                0.73
cereal rice flakes                                          0.44
cereal toasted oat                                          1.42
orange juice 100 with calcium added frozen reconstituted    0.38
cilantro raw                                                0.16
sugar white granulated or lump                              1.40
nutritional powder mix high protein herbalife               0.13
dtype: float64


In [157]:
df_vegan.shape

(4351, 66)

In [158]:
df_non_vegan.shape

(8888, 66)

In [151]:
prices.he

Unnamed: 0,price
kefir ns as to fat content,0.345625
tzatziki dip,1.217789
licuado or batido,0.189099
fruit smoothie nfs,0.462558
fruit smoothie with whole fruit and dairy,0.402191


# Female Min Cost Diet


In [160]:
import numpy as np
import pandas as pd
from scipy.optimize import linprog

def run_min_cost_diet(
    group: str,
    diet_type: str,
    df_vegan: pd.DataFrame,
    df_non_vegan: pd.DataFrame,
    prices: pd.DataFrame,
    rda: pd.DataFrame,
    food_names: dict,
) -> float:

    # 1. Select the correct dataset
    df_selected = df_vegan if diet_type.lower() == "vegan" else df_non_vegan

    # 2. Find matching recipes
    common_recipes = df_selected.index.intersection(prices.index)
    df_selected = df_selected.loc[common_recipes]
    prices = prices.loc[common_recipes]

    # 3. Check if prices is empty
    if prices.empty:
        raise ValueError("[ERROR] No matching prices found. `prices` is empty after filtering.")

    # 4. Map food names
    prices.index = prices.index.map(food_names)

    # 5. Transpose for constraints
    A_all = df_selected.T

    # 6. Extract RDA constraints
    bmin = rda.loc[rda["Constraint Type"].isin(["RDA", "AI"]), group]
    bmax = rda.loc[rda["Constraint Type"].isin(["UL"]), group]

    # 7. Reindex for constraints
    Amin = A_all.reindex(bmin.index).dropna(how="all")
    Amax = A_all.reindex(bmax.index).dropna(how="all")

    # 8. Construct constraint matrices
    b_ub = pd.concat([bmin, -bmax])
    A_ub = pd.concat([Amin, -Amax])

    # 9. Check if A_ub is empty
    if A_ub.shape[0] == 0 or A_ub.shape[1] == 0:
        raise ValueError("[ERROR] `A_ub` is empty. No valid nutrient constraints found.")

    # 10. Extract and convert cost_vector
    if "price" not in prices.columns:
        raise ValueError("[ERROR] 'prices' DataFrame must have a 'price' column.")

    cost_vector = prices["price"].values
    cost_vector = pd.to_numeric(cost_vector, errors="coerce").flatten()  # Ensure numeric

    # 11. Convert matrices to NumPy arrays
    A_ub = A_ub.values
    b_ub = b_ub.values.flatten()

    # ---- Debug Prints ----
    print("---- DEBUG ----")
    print(f"cost_vector.shape = {cost_vector.shape} (should be (n,))")
    print(f"A_ub.shape = {A_ub.shape} (should be (m, n))")
    print(f"b_ub.shape = {b_ub.shape} (should be (m,))")
    print("cost_vector.dtype =", cost_vector.dtype)

    # 12. Solve linear program
    result = linprog(
        c=cost_vector,
        A_ub=A_ub,
        b_ub=b_ub,
        bounds=(0, None),
        method="highs"
    )

    # 13. Check result
    if result.success:
        print(f"[SUCCESS] Min cost for {diet_type} diet, group={group} is ${result.fun:.2f} per day.")
        return result.fun
    else:
        raise ValueError(f"[ERROR] Linprog failed: {result.message}")


In [161]:
cost_vegan_female = run_min_cost_diet(
    group="Female_19_30",
    diet_type="vegan",
    df_vegan=df_vegan,
    df_non_vegan=df_non_vegan,
    prices=prices,
    rda=rda,
    food_names=food_names,
)

ValueError: [ERROR] No matching prices found. `prices` is empty after filtering.

In [159]:
cost_vector.shape = (0,)  (should be (n,))
A_ub.shape = (0,0)  (should be (m,n))
b_ub.shape = (27,)  (should be (m,))


SyntaxError: invalid syntax. Perhaps you forgot a comma? (3716623170.py, line 1)

 ## ADD Write a function that takes as arguments the characteristics of a person (e.g., age, sex) and returns a `pandas.Series' of Dietary Reference Intakes (DRI's) or "Recommended Daily Allowances" (RDA) of a variety of nutrients appropriate for your population of interest.

# Female VEGAN Min Cost Diet



# Female Endurance Athlete Min Cost Diet




# Female Endurance Athlete  VEGAN Min Cost Diet


 # Male Min Cost Diet

 # Male VEGAN Min Cost Diet

# Male Endurance Athlete Min Cost Diet

# Male VEGAN Endurance Athlete Min Cost Diet