In [9]:
%pip install eep153_tools
%pip install python_gnupg
%pip install -U gspread_pandas
#load in file from class
def format_id(id,zeropadding=0):
    """Nice string format for any id, string or numeric.

    Optional zeropadding parameter takes an integer
    formats as {id:0z} where
    """
    if pd.isnull(id) or id in ['','.']: return None

    try:  # If numeric, return as string int
        return ('%d' % id).zfill(zeropadding)
    except TypeError:  # Not numeric
        return id.split('.')[0].strip().zfill(zeropadding)
    except ValueError:
        return None

data_url = "https://docs.google.com/spreadsheets/d/1GTo423_gUJe1Von9jypWAbC0zSQ7WGegAWPuRi7eJAI/edit?gid=1410082681#gid=1410082681"

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [10]:
import pandas as pd
import re 
from eep153_tools.sheets import read_sheets

#create recipes df
recipes = read_sheets(data_url, sheet="recipes")
recipes = (recipes
           .assign(parent_foodcode = lambda df: df["parent_foodcode"].apply(format_id),
                   ingred_code = lambda df: df["ingred_code"].apply(format_id))
           .rename(columns={"parent_desc": "recipe"}))
recipes.head()


Unnamed: 0,parent_foodcode,recipe,ingred_code,ingred_desc,ingred_wt
0,11340000,"Imitation milk, non-soy, sweetened",43543,"Milk, imitation, non-soy",100.0
1,11460150,"Yogurt, frozen, NS as to flavor, lowfat milk",1298,"Yogurt, frozen, flavors other than chocolate, ...",100.0
2,11460160,"Yogurt, frozen, chocolate, lowfat milk",1117,"Yogurt, plain, low fat, 12 grams protein per 8...",81.8
3,11460160,"Yogurt, frozen, chocolate, lowfat milk",19166,"Cocoa, dry powder, unsweetened, processed with...",5.2
4,11460160,"Yogurt, frozen, chocolate, lowfat milk",19335,"Sugars, granulated",13.0


In [11]:
#List of words we dont want in our non-vegan diet
KEYWORDS = [
    "mix","frozen", 'powder', 'processed', 'textured'
]

#this partial match
PATTERN = re.compile(
    '|'.join(map(re.escape,KEYWORDS)),
    re.IGNORECASE
)

#Filters out all the foods that we didn't want in our final recipe (ie frozen and powder foods)

def filter_non_vegan_ingredients(df: pd.DataFrame) -> pd.DataFrame:
    # 1) Convert to string, lowercase, remove punctuation
    df["recipe"] = df["recipe"].astype(str).str.lower().fillna("")
    df["recipe"] = df["recipe"].str.replace(r"[^\w\s]", "", regex=True)

    df["ingred_desc"] = df["ingred_desc"].astype(str).str.lower().fillna("")
    df["ingred_desc"] = df["ingred_desc"].str.replace(r"[^\w\s]", "", regex=True)

    # 2) Create a mask for rows that do NOT contain non-vegan keywords
    mask = ~(df["recipe"].str.contains(PATTERN, na=False, regex=True) |
             df["ingred_desc"].str.contains(PATTERN, na=False, regex=True))

    return df[mask]

In [12]:
non_vegan_recipes = filter_non_vegan_ingredients(recipes)
non_vegan_recipes.shape

(31057, 5)

In [13]:
# NONVEGAN

#create NONVEGAN nutrition df
nutrition = (read_sheets(data_url, sheet="nutrients")
             .assign(ingred_code = lambda df: df["ingred_code"].apply(format_id)))

display(nutrition.head())
nutrition.columns
nutrition.shape


# normalize weights to percentage terms. 
non_vegan_recipes['ingred_wt'] = non_vegan_recipes['ingred_wt']/non_vegan_recipes.groupby(['parent_foodcode'])['ingred_wt'].transform("sum")

# we're going to extend the recipes data frame to include the nutrient profiles of its ingredients (in 100g)
df_non_vegan = non_vegan_recipes.merge(nutrition, how="left", on="ingred_code")

# multiply all nutrients per 100g of an ingredient by the weight of that ingredient in a recipe.
numeric_cols = list(df_non_vegan.select_dtypes(include=["number"]).columns)
numeric_cols.remove("ingred_wt")
df_non_vegan[numeric_cols] = df_non_vegan[numeric_cols].mul(df_non_vegan["ingred_wt"], axis=0)

# sum nutrients of food codes (over the multiple ingredients)
# python tip: one can merge dictionaries dict1 dict2 using **, that is: dict_merge = {**dict1, **dict2}. The ** effectively "unpacks" the key value pairs in each dictionary
df_non_vegan = df_non_vegan.groupby('parent_foodcode').agg({**{col: "sum" for col in numeric_cols},
                                        "recipe": "first"})

df_non_vegan.index.name = "recipe_id"

food_names = df_non_vegan["recipe"]
print(food_names.head())
df_non_vegan.head()
df_non_vegan.shape


Unnamed: 0,ingred_code,Ingredient description,Capric acid,Lauric acid,Myristic acid,Palmitic acid,Palmitoleic acid,Stearic acid,Oleic acid,Linoleic Acid,...,Vitamin B12,"Vitamin B-12, added",Vitamin B6,Vitamin C,Vitamin D,Vitamin E,"Vitamin E, added",Vitamin K,Water,Zinc
0,1001,"Butter, salted",2.529,2.587,7.436,21.697,0.961,9.999,19.961,2.728,...,0.17,0.0,0.003,0.0,0.0,2.32,0.0,7.0,15.87,0.09
1,1002,"Butter, whipped, with salt",2.039,2.354,7.515,20.531,1.417,7.649,17.37,2.713,...,0.07,0.0,0.008,0.0,0.0,1.37,0.0,4.6,16.72,0.05
2,1003,"Butter oil, anhydrous",2.495,2.793,10.005,26.166,2.228,12.056,25.026,2.247,...,0.01,0.0,0.001,0.0,0.0,2.8,0.0,8.6,0.24,0.01
3,1004,"Cheese, blue",0.601,0.491,3.301,9.153,0.816,3.235,6.622,0.536,...,1.22,0.0,0.166,0.0,0.5,0.25,0.0,2.4,42.41,2.66
4,1005,"Cheese, brick",0.585,0.482,3.227,8.655,0.817,3.455,7.401,0.491,...,1.26,0.0,0.065,0.0,0.5,0.26,0.0,2.5,41.11,2.6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_vegan_recipes['ingred_wt'] = non_vegan_recipes['ingred_wt']/non_vegan_recipes.groupby(['parent_foodcode'])['ingred_wt'].transform("sum")


recipe_id
11000000                      milk human
11100000                        milk nfs
11111000                      milk whole
11111100           milk low sodium whole
11111150    milk calcium fortified whole
Name: recipe, dtype: object


(8031, 66)

In [14]:
# This reads food price data from a Google Sheet, formats it, and extracts the latest available price data for food items identified by their "food_code". 
#It then removes missing price values and provides a count of the number of unique food codes with prices.


# Read the "prices" sheet from the Google Sheet data_url
# Keeping only the relevant columns: "food_code", "year", and "price"

prices = read_sheets(data_url, sheet="prices")[["food_code", "year", "price"]]

# Format the "food_code" column using a function `format_id`
# (Assumption: `format_id` is defined elsewhere to standardize the format)
prices["food_code"] = prices["food_code"].apply(format_id)

# Set a MultiIndex with levels: "year" and "food_code"
prices = prices.set_index(["year", "food_code"])

# Print available years in the dataset (first level of MultiIndex)
print(prices.index.levels[0])

# Extract price data for the most recent year (assumed to be "2017/2018")
prices = prices.xs("2017/2018", level="year")

# Drop rows where the price is missing ("NA" values in the "price" column)
prices = prices.dropna(subset="price")

# Print the number of unique recipes (food codes) with valid price data
print(f"We have prices for {prices.shape[0]} unique recipes (FNDDS food codes)")

Index(['2011/2012', '2013/2014', '2015/2016', '2017/2018'], dtype='object', name='year')
We have prices for 4435 unique recipes (FNDDS food codes)


In [15]:
# Reads dietary reference intake (RDA/AI) data from a Google Sheet, formats it, and extracts the  minimum required intake values for various nutrients.


# Read the "rda" sheet from the Google Sheet data_url
rda = read_sheets(data_url, sheet="rda")

# Set "Nutrient" as the index for easier access to nutrient-based data
rda = rda.set_index("Nutrient")

rda.columns, rda.head()

(Index(['Nutrient Type', 'Unit', 'Constraint Type', 'Female_19_30',
        'Female_endurance_athlete', 'Male_19_30', 'Male_endurance_athlete'],
       dtype='object'),
               Nutrient Type  Unit Constraint Type  Female_19_30  \
 Nutrient                                                          
 Energy                Macro  kcal             RDA        2000.0   
 Protein               Macro     g             RDA          46.0   
 Carbohydrate          Macro     g             RDA         130.0   
 Dietary Fiber         Macro     g             RDA          28.0   
 Linoleic Acid         Macro     g              AI          12.0   
 
                Female_endurance_athlete  Male_19_30  Male_endurance_athlete  
 Nutrient                                                                     
 Energy                           2800.0      2400.0                  3500.0  
 Protein                            80.0        56.0                   112.0  
 Carbohydrate                      48

In [16]:

# This script ensures that the non-vegan food dataset (df_non_vegan) and the prices dataset contain only matching food items.
# It then maps food codes to their actual names and transposes the nutrient data for further analysis.

# Find the common food items between df_vegan and prices datasets
common_recipes = df_non_vegan.index.intersection(prices.index)

# Subset both dataframes to include only common food items
df_non_vegan = df_non_vegan.loc[common_recipes]
prices = prices.loc[common_recipes]

# Remap the index of the prices DataFrame to use actual food names instead of food codes by applying the "food_names" mapping.
prices.index = prices.index.map(food_names)

# Transpose df_vegan so that nutrients become rows and food items become columns
A_all = df_non_vegan.T

print(prices.head())
print(A_all.head())

                       price
milk nfs            0.100484
milk whole           0.09828
milk reduced fat 2  0.092085
milk low fat 1      0.090914
milk fat free skim  0.092441
                 11100000 11111000 11112110 11112210 11113000 11114300  \
Capric acid       0.03825    0.075    0.049    0.027    0.002    0.027   
Lauric acid        0.0405    0.077    0.055    0.029    0.001    0.029   
Myristic acid     0.14275    0.297    0.175    0.091    0.008    0.091   
Palmitic acid     0.42475    0.829    0.558    0.287    0.025    0.287   
Palmitoleic acid  0.01175      0.0    0.027    0.017    0.003    0.017   

                 11114330 11114350 11115000 11115100  ... 95311000 95312410  \
Capric acid         0.049    0.075    0.022    0.022  ...      0.0      0.0   
Lauric acid         0.055    0.077    0.025    0.025  ...      0.0      0.0   
Myristic acid       0.175    0.297    0.089    0.089  ...      0.0      0.0   
Palmitic acid       0.558    0.829    0.231    0.231  ...      0.

In [17]:
# In order to change which population, make sure to pick a demographic (column from rda dataframe)

'''
select from 
['Female_19_30', 'Female_endurance_athlete', 'Male_19_30', 'Male_endurance_athlete']
'''
group = "Female_endurance_athlete"

# create lower bounds and upper bounds.
bmin = rda.loc[rda['Constraint Type'].isin(['RDA', 'AI']), group]
bmax = rda.loc[rda['Constraint Type'].isin(['UL']), group]

# reindex ensures we only keep nutrients in bmin/bmax
Amin = A_all.reindex(bmin.index).dropna(how='all')
Amax = A_all.reindex(bmax.index).dropna(how='all')

b = pd.concat([bmin, -bmax])
A = pd.concat([Amin, -Amax])

#python tip: by typing "=" after the variable name inside the curly braces, it formats the output so we don't have to write f"variable = {variable}"
print(f"{bmin.shape=}")
print(f"{Amin.shape=}")
print(f"{bmax.shape=}")
print(f"{Amax.shape=}")
print(f"{b.shape=}")
print(f"{A.shape=}")
print(f"{prices.shape=}")

bmin.shape=(26,)
Amin.shape=(26, 4055)
bmax.shape=(1,)
Amax.shape=(1, 4055)
b.shape=(27,)
A.shape=(27, 4055)
prices.shape=(4055, 1)


In [18]:
from  scipy.optimize import linprog as lp
import numpy as np
p = prices
tol = 1e-6 # Numbers in solution smaller than this (in absolute value) treated as zeros
result = lp(p, -A, -b, method='highs')

In [19]:

# get the result x in a series with food names
diet = pd.Series(result.x,index=prices.index)


#prints all of the results
print(f"Cost of diet for a non-vegan {group} is ${result.fun:.2f} per day. \n")

print(f"As a non-vegan {group} you'll be eating (in 100s of grams or milliliters): \n")

print(round(diet[diet >= tol], 2))

Cost of diet for a non-vegan Female_endurance_athlete is $2.98 per day. 

As a non-vegan Female_endurance_athlete you'll be eating (in 100s of grams or milliliters): 

milk low fat 1                           6.07
carp steamed or poached                  0.52
egg yolk only raw                        0.09
split peas from dried fat added          4.97
rice white cooked made with margarine    1.32
cereal rice flakes                       0.05
cereal toasted oat                       0.32
ripe plantain raw                        3.39
sugar white granulated or lump           2.01
dtype: float64


In [20]:
#Function for deliverable [A] Dietary Reference Intakes

def get_population_dri(population, rda_df) -> pd.Series:
    
    # 1. Filter rows to only those where Constraint Type is RDA or AI
    rda_filtered = rda_df[rda_df["Constraint Type"].isin(["RDA", "AI"])].copy()

    # 3. Extract the column for the chosen population as a Series
    dri_series = rda_filtered[population]

    # 4. Drop any rows that are NaN (just in case)
    dri_series.dropna(inplace=True)

    # 5. Return the final Series
    return dri_series


In [21]:

#example of get_population_dri function
#YOU NEED TO SET POPULATION = TO ONE OF THE POPULATIONS BELOW 
'''
select population from 
['Female_19_30', 'Female_endurance_athlete', 'Male_19_30', 'Male_endurance_athlete']
'''

population = "Male_19_30"
dri_for_female_19_30 = get_population_dri(population, rda)

print("Dietary recommendations (RDA) for", population)
print(dri_for_female_19_30)

Dietary recommendations (RDA) for Male_19_30
Nutrient
Energy            2400.0
Protein             56.0
Carbohydrate       130.0
Dietary Fiber       33.6
Linoleic Acid       17.0
Linolenic Acid       1.6
Calcium           1000.0
Iron                 8.0
Magnesium          400.0
Phosphorus         700.0
Potassium         4700.0
Zinc                11.0
Copper               0.9
Selenium            55.0
Vitamin A          900.0
Vitamin E           15.0
Vitamin D           15.0
Vitamin C           90.0
Thiamin              1.2
Riboflavin           1.3
Niacin              16.0
Vitamin B6           1.3
Vitamin B12          2.4
Choline            550.0
Vitamin K          120.0
Folate             400.0
Name: Male_19_30, dtype: float64
