# Setup Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
import re
import itertools
from IPython.display import clear_output

# Load Data

In [2]:
nutrient_dvs = {'Calories': 2000,
                'Protein': 50,
                'Trans Fat': 58,
                'Saturated Fat': 20,
                'Cholesterol': 300,
                'Sodium': 2300,
                'Carbohydrates': 275,
                'Dietary Fiber': 28,
                'Sugars': 50,
                'Vitamin D': 20,
                'Calcium': 1300,
                'Iron': 18,
                'Potassium': 4700}
sorted(list(nutrient_dvs.keys()))

['Calcium',
 'Calories',
 'Carbohydrates',
 'Cholesterol',
 'Dietary Fiber',
 'Iron',
 'Potassium',
 'Protein',
 'Saturated Fat',
 'Sodium',
 'Sugars',
 'Trans Fat',
 'Vitamin D']

In [3]:
sorted(['Calories', 'Calories From Fat', 'Total Fat', 'Saturated Fat',
       'Trans Fat', 'Polyunsaturated Fat', 'Monounsaturated Fat',
       'Cholesterol', 'Sodium', 'Carbohydrates', 'Dietary Fiber', 'Sugars',
       'Protein', 'Servings Per Container', 'NMV Score'])

['Calories',
 'Calories From Fat',
 'Carbohydrates',
 'Cholesterol',
 'Dietary Fiber',
 'Monounsaturated Fat',
 'NMV Score',
 'Polyunsaturated Fat',
 'Protein',
 'Saturated Fat',
 'Servings Per Container',
 'Sodium',
 'Sugars',
 'Total Fat',
 'Trans Fat']

In [4]:
products_nutrition = pd.read_csv('Safeway Product Data.csv', index_col=0)
products_nutrition.head(2)
print("Shape:", products_nutrition.shape)

Shape: (3248, 20)


In [5]:
products_data = pd.read_csv('Safeway Search Data.csv', index_col=0)
display(products_data.head(2))
print("Shape:", products_data.shape)

Unnamed: 0_level_0,price,unitOfMeasure,sellByWeight,aisleName,name,departmentName,aisleId,upc,restrictedValue,displayType,...,featured,pastPurchased,promoDescription,promoType,isArProduct,unitQuantity,displayEstimateText,displayUnitQuantityText,promoEndDate,promoText
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
960016747,149.99,ML,I,Spirits & Mixers|1_29_2,Don Julio Tequila 1942 80 Proof - 750 Ml,"Wine, Beer & Spirits",1_29_2_8,67454500032,303,-1,...,False,False,Safeway Club Price: $149.99&lt;BR&gt;SAVE up t...,P,False,FL.OZ,,ea,2021-05-02T23:59:00,
960127189,94.99,LT,I,Spirits & Mixers|1_29_2,Hennessy Cognac VS Very Special 80 Proof - 1.7...,"Wine, Beer & Spirits",1_29_2_3,8811015058,303,-1,...,False,False,Safeway Club Price: $94.99&lt;BR&gt;SAVE up to...,P,False,FL.OZ,,ea,2021-05-02T23:59:00,


Shape: (10400, 27)


In [6]:
product_info = products_nutrition.merge(products_data, left_index=True, right_index=True)
product_info = product_info.reset_index(drop=False, ).drop_duplicates(subset=['index']).set_index('index')
display(product_info.iloc[:2])
print("Shape:", product_info.shape)

Unnamed: 0_level_0,Name,Servings Per Container,Calories,Calories From Fat,Total Fat,Saturated Fat,Trans Fat,Polyunsaturated Fat,Monounsaturated Fat,Cholesterol,...,featured,pastPurchased,promoDescription,promoType,isArProduct,unitQuantity,displayEstimateText,displayUnitQuantityText,promoEndDate,promoText
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101050229,HERSHEYS Milk Chocolate Full Size - 6-1.55 Oz,6,220,0,13g,8g,0g,0,0,10mg,...,False,False,Safeway Club Price: $5.99&lt;BR&gt;SAVE up to:...,P,False,OUNCE,,ea,2021-04-25T23:59:00,
101050230,HERSHEYS Milk Chocolate with Almonds Full Size...,6,210,0,14g,7g,0g,0,0,10mg,...,False,False,Safeway Club Price: $5.99&lt;BR&gt;SAVE up to:...,P,False,OUNCE,,ea,2021-04-25T23:59:00,


Shape: (3007, 47)


In [7]:
product_info.columns

Index(['Name', 'Servings Per Container', 'Calories', 'Calories From Fat',
       'Total Fat', 'Saturated Fat', 'Trans Fat', 'Polyunsaturated Fat',
       'Monounsaturated Fat', 'Cholesterol', 'Sodium', 'Total Carbohydrate',
       'Dietary Fiber', 'Sugars', 'Protein', 'Vitamin A', 'Vitamin C',
       'Calcium', 'Iron', 'Image Link', 'price', 'unitOfMeasure',
       'sellByWeight', 'aisleName', 'name', 'departmentName', 'aisleId', 'upc',
       'restrictedValue', 'displayType', 'basePrice', 'inventoryAvailable',
       'averageWeight', 'pricePer', 'salesRank', 'shelfName', 'id', 'featured',
       'pastPurchased', 'promoDescription', 'promoType', 'isArProduct',
       'unitQuantity', 'displayEstimateText', 'displayUnitQuantityText',
       'promoEndDate', 'promoText'],
      dtype='object')

In [8]:
random_row_idx = np.random.choice(product_info.index)
product_info.loc[random_row_idx]

Name                              O Organics Organic Broth Vegetable - 32 Oz
Servings Per Container                                               about 4
Calories                                                                  10
Calories From Fat                                                          0
Total Fat                                                                 0g
Saturated Fat                                                             0g
Trans Fat                                                                 0g
Polyunsaturated Fat                                                        0
Monounsaturated Fat                                                        0
Cholesterol                                                              0mg
Sodium                                                                 630mg
Total Carbohydrate                                                        2g
Dietary Fiber                                                             0g

In [9]:
nutrition_columns = ['Calories', 'Calories From Fat',
       'Total Fat', 'Saturated Fat', 'Trans Fat', 'Polyunsaturated Fat',
       'Monounsaturated Fat', 'Cholesterol', 'Sodium', 'Total Carbohydrate',
       'Dietary Fiber', 'Sugars', 'Protein', 'Servings Per Container']#'Vitamin A', 'Vitamin C', ""'Calcium', 'Iron']

#UPDATE COLUMN NAMES IN THE DICT BELOW TO MATCH `nutrient_dvs`
fixed_column_names = {"Total Carbohydrate": "Carbohydrates"}
nutrition_data = product_info.loc[:, nutrition_columns].rename(columns=fixed_column_names)
nutrition_data = nutrition_data.fillna(0)
nutrition_data

Unnamed: 0_level_0,Calories,Calories From Fat,Total Fat,Saturated Fat,Trans Fat,Polyunsaturated Fat,Monounsaturated Fat,Cholesterol,Sodium,Carbohydrates,Dietary Fiber,Sugars,Protein,Servings Per Container
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
101050229,220,0,13g,8g,0g,0,0,10mg,35mg,26g,1g,0,3g,6
101050230,210,0,14g,7g,0g,0,0,10mg,30mg,22g,2g,0,4g,6
102010102,130,50,6g,4g,0g,0,0,5mg,55mg,17g,1g,9g,1g,6
102010157,130,0,7g,3.5g,0g,0,0,10mg,75mg,17g,0g,0,2g,8
102010163,140,0,7g,4g,0g,0,0,<5mg,35mg,17g,<1g,9g,1g,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970033608,30,0,3g,0.5g,0g,0,0,0mg,240mg,1g,<1g,0,0g,About 15
970058173,210,0,12g,7g,0g,0,0,60mg,55mg,24g,0g,0,4g,9
970065047,0,0,0,0,0,0,0,0,0,0,0,0,0,0
970065217,140,0,5g,0g,0g,0,0,0mg,200mg,22g,3g,0,2g,about 13


In [10]:
def clean_servings(s):
    s = str(s).replace("about", "")
    return s

nutrition_data = nutrition_data[nutrition_data['Servings Per Container'].apply(lambda s: 'var' not in str(s).lower())]
nutrition_data['Servings Per Container'] = nutrition_data['Servings Per Container'].apply(clean_servings)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:
def to_int(item):
    try:
        item = float(item)
    except ValueError as e:
        if item.strip() == '': 
            return 0
        item = re.sub("[^0-9]", "", item)
        return to_int(item)
    return int(item)

for col in nutrition_data.columns:
    nutrition_data.loc[:, col] = pd.Series(nutrition_data.loc[:, col].apply(to_int), dtype=int)

nutrition_data.dtypes

Calories                  int64
Calories From Fat         int64
Total Fat                 int64
Saturated Fat             int64
Trans Fat                 int64
Polyunsaturated Fat       int64
Monounsaturated Fat       int64
Cholesterol               int64
Sodium                    int64
Carbohydrates             int64
Dietary Fiber             int64
Sugars                    int64
Protein                   int64
Servings Per Container    int64
dtype: object

In [None]:
nutrition_data

# Nutrition Data Compared Against Nutrition Needs & In-Product Amounts

In [14]:
nutrition_data_by_serving = nutrition_data[nutrition_data['Servings Per Container'] != 0]
nutrition_data_by_serving.iloc[:, :-1] = nutrition_data_by_serving.apply(func=lambda arr: arr/nutrition_data_by_serving['Servings Per Container'], axis=0).iloc[:, :-1]
nutrition_data_by_serving

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, v)


Unnamed: 0_level_0,Calories,Calories From Fat,Total Fat,Saturated Fat,Trans Fat,Polyunsaturated Fat,Monounsaturated Fat,Cholesterol,Sodium,Carbohydrates,Dietary Fiber,Sugars,Protein,Servings Per Container
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
101050229,36.666667,0.000000,2.166667,1.333333,0.0,0.0,0.0,1.666667,5.833333,4.333333,0.166667,0.0,0.500000,6
101050230,35.000000,0.000000,2.333333,1.166667,0.0,0.0,0.0,1.666667,5.000000,3.666667,0.333333,0.0,0.666667,6
102010102,21.666667,8.333333,1.000000,0.666667,0.0,0.0,0.0,0.833333,9.166667,2.833333,0.166667,1.5,0.166667,6
102010157,16.250000,0.000000,0.875000,4.375000,0.0,0.0,0.0,1.250000,9.375000,2.125000,0.000000,0.0,0.250000,8
102010371,14.444444,0.000000,0.444444,0.000000,0.0,0.0,0.0,0.000000,10.555556,2.444444,0.222222,0.0,0.222222,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970033083,0.219780,0.000000,0.164835,0.010989,0.0,0.0,0.0,0.054945,0.604396,0.000000,0.000000,0.0,0.010989,91
970033608,2.000000,0.000000,0.200000,0.333333,0.0,0.0,0.0,0.000000,16.000000,0.066667,0.066667,0.0,0.000000,15
970058173,23.333333,0.000000,1.333333,0.777778,0.0,0.0,0.0,6.666667,6.111111,2.666667,0.000000,0.0,0.444444,9
970065217,10.769231,0.000000,0.384615,0.000000,0.0,0.0,0.0,0.000000,15.384615,1.692308,0.230769,0.0,0.153846,13


In [15]:
average_quantities_per_serving = nutrition_data_by_serving.apply(np.mean)
average_quantities_per_serving

Calories                  29.377554
Calories From Fat          5.348847
Total Fat                  1.739779
Saturated Fat              1.062531
Trans Fat                  0.019213
Polyunsaturated Fat        0.139144
Monounsaturated Fat        0.183763
Cholesterol                3.401673
Sodium                    55.852774
Carbohydrates              3.301343
Dietary Fiber              0.248312
Sugars                     0.723114
Protein                    1.318831
Servings Per Container    79.638793
dtype: float64

In [16]:
nutrient_dvs_df = pd.DataFrame(nutrient_dvs, index=['% DV']).T
av_servings_for_dv = pd.DataFrame(average_quantities_per_serving).merge(right=nutrient_dvs_df, left_index=True, right_index=True)
av_servings_for_dv = av_servings_for_dv.iloc[:, 1] / av_servings_for_dv.iloc[:, 0]
av_servings_for_dv

Calories           68.079187
Saturated Fat      18.822985
Trans Fat        3018.768306
Cholesterol        88.191900
Sodium             41.179691
Carbohydrates      83.299432
Dietary Fiber     112.761193
Sugars             69.145425
Protein            37.912376
dtype: float64

In [17]:
key_nutrients = ["Protein", "Sodium", "Carbohydrates", 'Cholesterol', 'Dietary Fiber', 'Calories']
nutrient_metric_vector = av_servings_for_dv.loc[key_nutrients]
nutrient_metric_vector = nutrient_metric_vector / np.sum(nutrient_metric_vector)
nutrient_metric_vector

Protein          0.087877
Sodium           0.095451
Carbohydrates    0.193080
Cholesterol      0.204421
Dietary Fiber    0.261370
Calories         0.157801
dtype: float64

In [18]:
def evaluate_product(product_nutrient_vector):
    return np.dot(nutrient_metric_vector, product_nutrient_vector)

nutrition_data_by_serving.loc[:, 'NMV Score'] = nutrition_data_by_serving.loc[:, key_nutrients].apply(func=evaluate_product, axis=1, raw=True)
nutrition_data_by_serving.sort_values('NMV Score')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0_level_0,Calories,Calories From Fat,Total Fat,Saturated Fat,Trans Fat,Polyunsaturated Fat,Monounsaturated Fat,Cholesterol,Sodium,Carbohydrates,Dietary Fiber,Sugars,Protein,Servings Per Container,NMV Score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
960195844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,189,0.000000
960358690,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,75,0.000000
208100707,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0.000000
960037372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17,0.000000
960450534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960010754,560.0,250.0,28.0,12.0,0.0,0.0,0.0,90.0,1310.0,42.0,4.0,9.0,34.0,1,243.949587
960010564,690.0,0.0,29.0,13.0,5.0,0.0,0.0,40.0,1160.0,83.0,4.0,0.0,25.0,1,247.050508
960189877,710.0,370.0,41.0,17.0,1.0,0.0,0.0,85.0,1200.0,54.0,2.0,7.0,32.0,1,257.716558
148011577,780.0,0.0,39.0,7.0,5.0,0.0,0.0,85.0,1120.0,81.0,7.0,0.0,26.0,1,267.119341


In [19]:
top_items = nutrition_data_by_serving.sort_values(by='NMV Score', ascending=False).iloc[:100, :].loc[:, key_nutrients]
display(top_items.head())
combo_scores = pd.DataFrame(columns=[*key_nutrients, "PIDs", "NMV Score", "Has DVs"])
for items_per_combo in range(2, 4+1):
    combinations = list(itertools.combinations_with_replacement(top_items.index, r=items_per_combo))
    k_combo_scores = pd.DataFrame({"PIDs": pd.Series(combinations)})
    for nutrient in key_nutrients:
        nutrient_info = k_combo_scores["PIDs"].apply(lambda combo: np.sum(top_items.loc[combo, :], axis=0)[nutrient])
        k_combo_scores[nutrient] = nutrient_info
    k_combo_scores["NMV Score"] = k_combo_scores[key_nutrients].apply(lambda combo: evaluate_product(combo), axis=1)
    k_combo_scores["Has DVs"] = k_combo_scores[key_nutrients].apply(lambda combo: np.all(combo > av_servings_for_dv[key_nutrients]), axis=1)
    combo_scores = combo_scores.append(k_combo_scores)
combo_scores

Unnamed: 0_level_0,Protein,Sodium,Carbohydrates,Cholesterol,Dietary Fiber,Calories
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
960021867,41.0,2880.0,102.0,410.0,8.0,1250.0
148011577,26.0,1120.0,81.0,85.0,7.0,780.0
960189877,32.0,1200.0,54.0,85.0,2.0,710.0
960010564,25.0,1160.0,83.0,40.0,4.0,690.0
960010754,34.0,1310.0,42.0,90.0,4.0,560.0


KeyboardInterrupt: 

In [None]:
combo_scores

In [None]:
healthy_products = combo_scores[combo_scores["Has DVs"]]
def get_combo_price_and_names(pids):
    info = product_info.loc[pids, ["basePrice", "Name"]]
    prices, names = info["basePrice"], info["Name"]
    combiner = "\" AND \""
    return np.round(np.sum(prices), 2), f'"{combiner.join([str(n) for n in names])}"'

names_and_prices = healthy_products["PIDs"].apply(get_combo_price_and_names)
healthy_products.loc[:, "Price"] = names_and_prices.apply(lambda row: row[0])
healthy_products.loc[:, "Names"] = names_and_prices.apply(lambda row: row[1])
healthy_products = healthy_products.sort_values("Price")
healthy_products

In [None]:
healthy_products.to_csv('Safeway Recommendations.csv')

# PCA On Nutrition Data

In [None]:
def print_component(idx, component):
    print(f"Component {idx}: ")
    [print(f"{round(weight, 2)}*'{feature}'", end=" + ") for feature, weight in zip(nutrition_columns, component)]
    print("") # gap to next component


num_components = 5
pca_pipeline = make_pipeline(StandardScaler(), PCA(n_components=num_components))
pcs = pca_pipeline.fit(nutrition_data)
_ = [print_component(i, pc) for i, pc in enumerate(pcs.named_steps['pca'].components_)]

In [None]:
plt.rcParams['figure.figsize'] = (9.5, 9.5)

# What will be the angle of each axis in the plot? (we divide the plot / number of variable)
N = len(nutrition_columns)
angles = [n / float(N) * 2 * np.pi for n in range(N)]
angles += angles[:1]

# Initialise the spider plot
ax = plt.subplot(111, polar=True)

# Draw one axe per variable + add labels
plt.xticks(angles[:-1], nutrition_columns, color='grey', size=8)

# Draw ylabels
ax.set_rlabel_position(0)
domain = np.linspace(-1, 1, 10+1)
plt.yticks(domain, [*[str(round(el, 1)) for el in domain][:-1], ""], color="black", size=7)
plt.ylim(min(domain), max(domain))

for component_number in range(num_components):
    # Load data
    values = pcs.named_steps['pca'].components_[component_number].flatten().tolist()
    values += values[:1]
    values
    
    # Plot data
    ax.plot(angles, values, linewidth=1, linestyle='solid', label=f'Component {component_number}')

    # Fill area
    ax.fill(angles, values, 'b', alpha=0.1)

# Show the graph
plt.show()