In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import cvxpy as cp
import json
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [21]:
#definitions
amino_acids = ['alanine', 
       'arginine', 'aspartic_acid', 'cystine', 'glutamic_acid', 'glycine',
       'histidine', 'hydroxyproline', 'isoleucine', 'leucine', 'lysine',
       'methionine', 'phenylalanine', 'proline', 'serine', 'threonine',
       'tryptophan', 'tyrosine', 'valine',]
essential = ['histidine', 'isoleucine', 'leucine', 'lysine', 
                         'methionine', 'phenylalanine', 'threonine', 'tryptophan', 'valine']
minerals = ['calcium','copper','irom','magnesium','manganese','phosphorous','potassium','zink']
vitamins = []
summary = ['protein','carbohydrate','total_fat','water']

In [22]:
df = pd.read_csv('../data/nutrition_clean.csv')
names = pd.read_csv('../data/food_names.csv')
names = names['name']
name_to_idx = {v: k for k, v in names.to_dict().items()}
columns_to_idx = {v: i for i, v in enumerate(df.columns)}

with open('../data/units.json') as json_file:
    units = json.load(json_file)

## Constraints

In [82]:
lower = {'calories': 2000,
 'total_fat': 50,
 'saturated_fat': 10,
 'cholesterol': 50,
 'sodium': 1500,
 'choline': 550,
 'folate': 400,
 'folic_acid': 400,
 'niacin': 16,
 'pantothenic_acid': 5,
 'riboflavin': 1.3,
 'thiamin': 1.2,
 'vitamin_a': 3000,
 'vitamin_a_rae': ' mcg',
 'carotene_alpha': ' mcg',
 'carotene_beta': ' mcg',
 'cryptoxanthin_beta': ' mcg',
 'lutein_zeaxanthin': ' mcg',
 'vitamin_b12': 2.4,
 'vitamin_b6': 1.3,
 'vitamin_c': 90,
 'vitamin_d': 600,
 'vitamin_e': 15,
 'vitamin_k': 120,
 'calcium': 1300,
 'copper': 0.9,
 'irom': 8,
 'magnesium': 400,
 'manganese': 2.3,
 'phosphorous': 700,
 'potassium': 4700,
 'selenium': 55,
 'zink': 11,
 'protein': 70,
 'alanine': ' g',
 'arginine': ' g',
 'aspartic_acid': ' g',
 'cystine': 0.287,
 'glutamic_acid': ' g',
 'glycine': ' g',
 'histidine': 0.7,
 'hydroxyproline': 'g',
 'isoleucine': 1.4,
 'leucine': 2.73,
 'lysine': 2.1,
 'methionine': 0.728,
 'phenylalanine': 0.875,
 'proline': ' g',
 'serine': ' g',
 'threonine': 1.05,
 'tryptophan': 0.28,
 'tyrosine': 0.875,
 'valine': 1.82,
 'carbohydrate': 130, #130
 'fiber': 38,
 'sugars': 'g',
 'fructose': 'g',
 'galactose': 'g',
 'glucose': 'g',
 'lactose': 'g',
 'maltose': 'g',
 'sucrose': 'g',
 'fat': 'g', #78
 'saturated_fatty_acids': ' g',
 'monounsaturated_fatty_acids': ' g',
 'polyunsaturated_fatty_acids': 17,
 'alcohol': 'g',
 'ash': 'g',
 'caffeine': 'g',
 'theobromine': 'g',
 'water': 100} #500

upper = {'calories': 2500,
 'total_fat': 120,
 'saturated_fat': 25,
 'cholesterol': 250,
 'sodium': 3000,
 'choline': 2500,
 'folate': 900,
 'folic_acid': 900,
 'niacin': 30,
 'pantothenic_acid': 100,
 'riboflavin': 500,
 'thiamin': 200,
 'vitamin_a': 8000,
 'vitamin_a_rae': ' mcg',
 'carotene_alpha': ' mcg',
 'carotene_beta': ' mcg',
 'cryptoxanthin_beta': ' mcg',
 'lutein_zeaxanthin': ' mcg',
 'vitamin_b12': 8,
 'vitamin_b6': 4,
 'vitamin_c': 1000,
 'vitamin_d': 5000,
 'vitamin_e': 500,
 'vitamin_k': 500,
 'calcium': 2000,
 'copper': 7,
 'irom': 35,
 'magnesium': 500,
 'manganese': 7,
 'phosphorous': 3500,
 'potassium': 7000,
 'selenium': 300,
 'zink': 35,
 'protein': 100,
 'alanine': ' g',
 'arginine': ' g',
 'aspartic_acid': ' g',
 'cystine': ' g',
 'glutamic_acid': ' g',
 'glycine': ' g',
 'histidine': ' g',
 'hydroxyproline': 'g',
 'isoleucine': ' g',
 'leucine': ' g',
 'lysine': ' g',
 'methionine': ' g',
 'phenylalanine': ' g',
 'proline': ' g',
 'serine': ' g',
 'threonine': ' g',
 'tryptophan': ' g',
 'tyrosine': ' g',
 'valine': ' g',
 'carbohydrate': 400, #300
 'fiber': 55,
 'sugars': 40,
 'fructose': ' g',
 'galactose': 'g',
 'glucose': ' g',
 'lactose': ' g',
 'maltose': ' g',
 'sucrose': ' g',
 'fat': 'g', #120
 'saturated_fatty_acids': ' g',
 'monounsaturated_fatty_acids': ' g',
 'polyunsaturated_fatty_acids': 30,
 'alcohol': 5,
 'ash': ' g',
 'caffeine': ' mg',
 'theobromine': ' mg',
 'water': 2000}

In [83]:
constraints = {k_up: (v_low, v_up) for (k_up, v_up), (_, v_low) in zip(upper.items(), lower.items()) 
 if not (isinstance(v_low, str) and isinstance(v_up, str))}

In [84]:
constraints

{'calories': (2000, 2500),
 'saturated_fat': (10, 25),
 'cholesterol': (50, 250),
 'sodium': (1500, 3000),
 'choline': (550, 2500),
 'folate': (400, 900),
 'folic_acid': (400, 900),
 'niacin': (16, 30),
 'pantothenic_acid': (5, 100),
 'riboflavin': (1.3, 500),
 'thiamin': (1.2, 200),
 'vitamin_a': (3000, 8000),
 'vitamin_b12': (2.4, 8),
 'vitamin_b6': (1.3, 4),
 'vitamin_c': (90, 1000),
 'vitamin_d': (600, 5000),
 'vitamin_e': (15, 500),
 'vitamin_k': (120, 500),
 'calcium': (1300, 2000),
 'copper': (0.9, 7),
 'irom': (8, 35),
 'magnesium': (400, 500),
 'manganese': (2.3, 7),
 'phosphorous': (700, 3500),
 'potassium': (4700, 7000),
 'selenium': (55, 300),
 'zink': (11, 35),
 'protein': (70, 100),
 'cystine': (0.287, ' g'),
 'histidine': (0.7, ' g'),
 'isoleucine': (1.4, ' g'),
 'leucine': (2.73, ' g'),
 'lysine': (2.1, ' g'),
 'methionine': (0.728, ' g'),
 'phenylalanine': (0.875, ' g'),
 'threonine': (1.05, ' g'),
 'tryptophan': (0.28, ' g'),
 'tyrosine': (0.875, ' g'),
 'valine': (1.

Minimize $|\mathbf{x}|_1$

subject to $b_1 \leq A\mathbf{x} \leq b_2$

where $\mathbf{x} \in \mathcal{R}_+$

$\mathbf{x}: 1\times 8763$

$\mathbf{x}$ is the vector of all food weights in units of 100g

In [85]:
def get_constraint_vectors(constraints):
    lower = []
    upper = []
    lower_idx = []
    upper_idx = []
    for k,v in constraints.items():
        if not isinstance(v[0], str):
            lower_idx.append(k)
            lower.append(v[0])
        if not isinstance(v[1], str):
            upper_idx.append(k)
            upper.append(v[1])
    return lower, upper, lower_idx, upper_idx

In [86]:
l, u, l_idxs, u_idxs = get_constraint_vectors(constraints)

In [87]:
#should be roughly 70 x 8700, i.e. A.T.
Al = df[l_idxs].values.T
Au = df[u_idxs].values.T

In [88]:
# Least calorie solution (may need to control for beverages by placing constraint 
# or regularization on x, e.g. weight constraint)
n = len(df)

# Define and solve the CVXPY problem.
x = cp.Variable(n)
prob = cp.Problem(cp.Minimize(cp.norm1(x)),
                 [Au @ x <= u, Al @ x >= l,])
prob.solve()

# Print result.
print('Total weight: ',prob.value * 100, 'g')

Total weight:  457.3053003755828 g


In [89]:
final = df.values.T @ x.value
for i, col in enumerate(df.columns):
    print(f"{final[i]:.2f}{units[col]}  {col}")

2000.00kcal  calories
120.14g  total_fat
25.00g  saturated_fat
250.00mg  cholesterol
1500.00 mg  sodium
550.00 mg  choline
781.16 mcg  folate
400.00 mcg  folic_acid
24.75 mg  niacin
5.25 mg  pantothenic_acid
4.15 mg  riboflavin
3.82 mg  thiamin
4366.61 IU  vitamin_a
1230.50 mcg  vitamin_a_rae
1.24 mcg  carotene_alpha
159.20 mcg  carotene_beta
1.08 mcg  cryptoxanthin_beta
219.35 mcg  lutein_zeaxanthin
4.78 mcg  vitamin_b12
1.31 mg  vitamin_b6
90.00 mg  vitamin_c
600.00 IU  vitamin_d
15.00 mg  vitamin_e
120.00 mcg  vitamin_k
1300.00 mg  calcium
2.78 mg  copper
21.15 mg  irom
500.00 mg  magnesium
2.30 mg  manganese
1599.59 mg  phosphorous
4700.00 mg  potassium
66.35 mcg  selenium
13.81 mg  zink
78.59 g  protein
3.97 g  alanine
4.80 g  arginine
7.52 g  aspartic_acid
0.90 g  cystine
14.19 g  glutamic_acid
4.25 g  glycine
2.03 g  histidine
0.00g  hydroxyproline
3.79 g  isoleucine
6.40 g  leucine
5.64 g  lysine
1.55 g  methionine
3.55 g  phenylalanine
5.77 g  proline
3.93 g  serine
3.23 g  th

In [92]:
u_dual = prob.constraints[0].dual_value
tol = 1e-6
u_lim = pd.DataFrame([(nutrient, val) for nutrient, val in zip(np.array(u_idxs)[u_dual > tol], u_dual[u_dual > tol])])
print(u_lim)

               0         1
0  saturated_fat  0.000997
1    cholesterol  0.000146
2      magnesium  0.000136
3         sugars  0.001274
4            fat  0.009582
5        alcohol  0.002667


In [93]:
l_dual = prob.constraints[1].dual_value
tol = 1e-6
l_lim = pd.DataFrame([(nutrient, val) for nutrient, val in zip(np.array(l_idxs)[l_dual > tol], l_dual[l_dual > tol])])
print(l_lim)

             0         1
0     calories  0.002176
1       sodium  0.000015
2      choline  0.000297
3   folic_acid  0.000015
4    vitamin_c  0.000032
5    vitamin_d  0.000010
6    vitamin_e  0.000123
7    vitamin_k  0.000064
8      calcium  0.000074
9    manganese  0.000460
10   potassium  0.000067
11       fiber  0.001647
12       water  0.008779


In [94]:
result = x.value
idxs = np.array([i for i in range(len(result))])
tol = 1e-6
final_foods = pd.DataFrame([(names[food], food, val) for food, val in zip(idxs[result > tol], result[result > tol])])

In [95]:
final_foods.sort_values(by=2,ascending=False)

Unnamed: 0,0,1,2
13,"Milk, with added vitamin A and vitamin D, regu...",5090,0.63947
5,"Soybeans, raw, mature seeds",853,0.609297
17,"Pork, simmered, cooked, chitterlings, variety ...",6541,0.548811
7,"Potato chips, reduced fat, without salt",1954,0.517495
9,"Oil, high oleic, canola, industrial",2378,0.476247
6,"Frankfurter, low fat, beef",994,0.471681
14,"Cocoa, processed with alkali, hi-fat or breakf...",5185,0.368832
11,"Gums, seed gums (includes locust bean, guar)",2569,0.198783
4,"Fat, chicken",580,0.195035
10,"Vanilla extract, alcohol, imitation",2452,0.151976


In [35]:
for idx in final_foods[1]:
    print(names[idx], df.iloc[idx]['potassium'])

Yeast extract spread 2100.0
Fish oil, cod liver 0.0
Vanilla extract 148.0
Fat, chicken 0.0
Soybeans, raw, mature seeds 1797.0
Frankfurter, low fat, beef 129.0
Snacks, plain, pork skins 127.0
Egg Mix, USDA Commodity 373.0
Potato chips, reduced fat, without salt 1744.0
Formulated Bar, SOUTH BEACH protein bar 793.0
Gums, seed gums (includes locust bean, guar) 0.0
Cereals, dry, plain, original, MALT-O-MEAL 100.0
Cereals ready-to-eat, KELLOGG'S PRODUCT 19, KELLOGG 179.0
Protein supplement, powder, Muscle Milk Light, milk based 840.0
Milk, with added vitamin A and vitamin D, regular, nonfat, dry 1794.0
Formulated bar, all flavors, SNICKERS MARATHON Energy Bar, MARS SNACKFOOD US 351.0
Pork, simmered, cooked, chitterlings, variety meats and by-products, fresh 14.0


In [69]:
for i, v in enumerate(df.iloc[1954].values):
    if v != 0:
        print(df.columns[i], v)

calories 487.0
total_fat 21.0
saturated_fat 4.2
sodium 8.0
choline 53.0
folate 10.0
niacin 7.0
riboflavin 0.27
thiamin 0.21
vitamin_b6 0.67
vitamin_c 25.7
vitamin_e 5.47
vitamin_k 13.3
calcium 21.0
copper 0.348
irom 1.35
magnesium 89.0
phosphorous 193.0
potassium 1744.0
selenium 8.1
zink 1.01
protein 7.1
alanine 0.3586224489795918
arginine 0.39405
aspartic_acid 0.5819391086771264
cystine 0.0733306010928961
glutamic_acid 1.01042486411489
glycine 0.2915444573591923
histidine 0.1782435996687584
isoleucine 0.2859009365940474
leucine 0.500967094076655
lysine 0.4915781017369727
methionine 0.152961143405384
phenylalanine 0.2599805624666785
proline 0.2883880813605632
serine 0.2629565149188422
threonine 0.2574170616113744
tryptophan 0.0716600678825051
tyrosine 0.2136265247460558
valine 0.317183661441189
carbohydrate 67.8
fiber 6.1
sugars 0.22
fat 20.8
saturated_fatty_acids 4.16
monounsaturated_fatty_acids 4.8
polyunsaturated_fatty_acids 10.94
ash 3.3
water 1.0


In [37]:
low_lim_sum = []
low_lim_ess = []
low_lim_min = []
for nutrient in summary:
    low, upp = constraints[nutrient]
    low_lim_sum.append(low)
for nutrient in essential:
    low, upp = constraints[nutrient]
    low_lim_ess.append(low)
for nutrient in minerals:
    low, upp = constraints[nutrient]
    low_lim_min.append(low)

In [55]:
idx = 2569
print(df[summary].iloc[idx].sum())

fig = make_subplots(rows=1, cols=3, specs=[[{'type':'polar'},{'type':'polar'},{'type':'polar'}]])

fig.add_trace(go.Scatterpolar(
      r=low_lim_sum + [low_lim_sum[0]],theta=summary + [summary[0]],
      name='lower_lim'
), row=1, col=1)

fig.add_trace(go.Scatterpolar(
      r=df[summary+[summary[0]]].iloc[idx],
      theta=summary+ [summary[0]],
      fill='toself',
      name=names[idx]
), row=1, col=1)

fig.add_trace(go.Scatterpolar(
      r=low_lim_ess + [low_lim_ess[0]],theta=essential + [essential[0]],
      name='lower_lim'
), row=1, col=2)

fig.add_trace(go.Scatterpolar(
      r=df[essential+[essential[0]]].iloc[idx],
      theta=essential+ [essential[0]],
      fill='toself',
      name=names[idx]
), row=1, col=2)

fig.add_trace(go.Scatterpolar(
      r=low_lim_min + [low_lim_min[0]],theta=minerals + [minerals[0]],
      name='lower_lim'
), row=1, col=3)

fig.add_trace(go.Scatterpolar(
      r=df[minerals+[minerals[0]]].iloc[idx],
      theta=minerals+ [minerals[0]],
      fill='toself',
      name=names[idx]
), row=1, col=3)



fig.update_layout(
    height=600, width=1800,
    title=names[idx],
  polar=dict(
    radialaxis=dict(
      visible=True,
#       range=[0, 100]
    )),
  showlegend=False
)

fig.show()

97.39999999999999


In [50]:
df[summary].iloc[1954]

protein          7.1
carbohydrate    67.8
total_fat       21.0
water            1.0
Name: 1954, dtype: float64

In [51]:
df[summary].iloc[995]

protein         12.34
carbohydrate    66.36
total_fat       15.00
water            5.05
Name: 995, dtype: float64

In [52]:
df[summary].iloc[994]

protein         12.0
carbohydrate     1.6
total_fat        9.5
water           63.8
Name: 994, dtype: float64

In [74]:
df[(df['protein'] > 0) * (df[amino_acids].sum(axis=1) == 0)][['protein']+amino_acids]

Unnamed: 0,protein,alanine,arginine,aspartic_acid,cystine,glutamic_acid,glycine,histidine,hydroxyproline,isoleucine,leucine,lysine,methionine,phenylalanine,proline,serine,threonine,tryptophan,tyrosine,valine
4,1.10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,23.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,19.40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.91,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8764,0.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8767,1.78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8770,1.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8771,3.20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
df[(df['protein'] == 0) * (df[amino_acids].sum(axis=1) > 0)][['protein']+amino_acids]

Unnamed: 0,protein,alanine,arginine,aspartic_acid,cystine,glutamic_acid,glycine,histidine,hydroxyproline,isoleucine,leucine,lysine,methionine,phenylalanine,proline,serine,threonine,tryptophan,tyrosine,valine
615,0.0,0.0,0.0,0.002,0.0,0.002,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1221,0.0,0.072,0.088,0.155,0.019,0.234,0.07,0.027,0.0,0.06,0.094,0.078,0.016,0.075,0.067,0.067,0.062,0.023,0.058,0.093
1474,0.0,0.003,0.002,0.003,0.001,0.007,0.001,0.001,0.0,0.001,0.005,0.001,0.001,0.002,0.003,0.002,0.001,0.0,0.001,0.002
2186,0.0,0.036,0.064,0.105,0.014,0.18,0.026,0.031,0.0,0.007,0.017,0.019,0.029,0.017,0.029,0.041,0.024,0.0,0.016,0.024
4438,0.0,0.013,0.007,0.003,0.0,0.017,0.002,0.001,0.0,0.001,0.002,0.002,0.0,0.002,0.002,0.002,0.002,0.0,0.0,0.002
4621,0.0,0.003,0.002,0.003,0.001,0.007,0.001,0.001,0.0,0.001,0.005,0.001,0.001,0.002,0.003,0.002,0.001,0.0,0.001,0.002


In [76]:
complete = df[(df['protein'] > 0) * (df[amino_acids].sum(axis=1) > 0)][['protein']+amino_acids]

In [77]:
amino_median = []
for idx in range(len(amino_acids)):
    ratio = complete[[amino_acids[idx]]].sum(axis=1) / complete['protein']
    amino_median.append(ratio.median())
amino_median = 0.9 * np.array(amino_median)

In [78]:
mask = (df['protein'] > 0) * (df[amino_acids].sum(axis=1) == 0)
fix = df[mask][['protein']].values * amino_median
for idx in range(len(amino_acids)):
    df[amino_acids[idx]][mask] = fix[:,idx]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [82]:
#Save amended data
df.to_csv('../data/nutrition_augment.csv', index=False)