In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import gzip
import copy
import pickle as pk
import heapq as hp

import networkx as nx
from networkx.algorithms import bipartite, components, shortest_paths

# Importing the datasets, cleaning, transforming

In [2]:
products_litteral = pd.read_pickle('products_litteral.pk')

# Making graphs

In [8]:
with open('food_aisles.pk', 'rb') as file:
    food_aisles = pk.load(file)

In [9]:
by_aisle = {}
for aisle in food_aisles:
    by_aisle[aisle] = pd.read_pickle('by_aisle_'+aisle+'.pk')

In [10]:
def find_recommendation_path(source_id, G=None, aisle=None, health_thresh=2, product_scores=None, hscores=None, fscores=None, alpha=.3, beta=.2):
    if G is None:
        G = nx.read_gpickle('thresholded_inv_'+aisle+'.gpk')
    if product_scores is None:
        product_scores = pd.read_pickle('product_scores')
        
    targets = set()
    for x in G.nodes:
        if x in product_scores.index and product_scores.at[x, "healthiness"] >= health_thresh:
            targets.add(x)
            
    # do not allow paths through even less healthy products:
    G_ok = G.subgraph({x for x in G.nodes if x in product_scores.index and product_scores.at[x, "healthiness"] >= product_scores.at[source_id, "healthiness"]})
    pred, lengths = nx.dijkstra_predecessor_and_distance(G_ok, source_id, weight='weight')
    candidates = [(-np.inf, None)]
    sep = []
    for candidate in targets:
        if candidate not in lengths:
            sep.append(candidate)
        if candidate in lengths and - lengths[candidate] > candidates[0][0]:   # ie candidate is closer than all current candidates
            if len(candidates) >= 10:
                hp.heappop(candidates)
            hp.heappush(candidates, (- lengths[candidate], candidate))
    #if fscores is None:
    #    fscores = pd.read_pickle('fidelity_scores.pk')
    if len(sep) == len(targets):
        print("not connected to a healthy product")
        return
    rec = (0, None)
    fids = []
    dists = []
    healths = []
    if candidates == [(-np.inf, None)]:
        print('No path to healthy products')
        return
    for (mdist, candidate) in candidates:
        if mdist == -np.inf:
            continue
        fids.append(product_scores.at[candidate, 'fidelity'])
        dists.append(- mdist)
        healths.append(product_scores.at[candidate, 'healthiness'])
    max_fid = max(fids)
    min_fid = min(fids)
    max_dist = max(dists)
    min_dist = min(dists)
    max_health = max(healths)
    min_health = min(healths)
    for (mdist, candidate) in candidates:
        if mdist == -np.inf:
            continue
        fid = ((product_scores.at[candidate, 'fidelity'] - min_fid) / (max_fid - min_fid)) if max_fid != min_fid else 0
        dist = ((max_dist + mdist) / (max_dist - min_dist)) if max_dist != min_dist else 0
        health = ((product_scores.at[candidate, "healthiness"] - min_health) / (max_health - min_health)) if max_health != min_health else 0
        score = (1+health)**alpha * (1+fid)**beta * (1+dist)**(1-alpha-beta)
        if score > rec[0]:
            rec = (score, candidate)
    orig = rec[1]
    path = [orig]
    while orig != source_id:
        orig = pred[orig][0]
        path.append(orig)
    return path


# Prints some good and bad examples of predictions

good_product_ids = [8923,16419,4313,12218,516]
bad_product_ids = [28199,6184, 14809]

print("Good recommendation examples : ")
for p in good_product_ids :
    target = find_recommendation_path(row.product_id, aisle=prod_lit.at[row.product_id, 'aisle'], product_scores=product_scores)[0]
    s = prod_lit.at[p, 'product_name']+' should be replaced by '+ prod_lit.at[target, 'product_name']+ ' (healthiness gain = '+str (round(product_scores.at[target, 'healthiness']-product_scores.at[p, 'healthiness'],2))+")"
    print(s)
    
print("\nBad or strange recommendation examples : ")
for p in bad_product_ids :
    target = find_recommendation_path(row.product_id, aisle=prod_lit.at[row.product_id, 'aisle'], product_scores=product_scores)[0]
    s = prod_lit.at[p, 'product_name']+' should be replaced by '+ prod_lit.at[target, 'product_name']+ ' (healthiness gain = '+str (round(product_scores.at[target, 'healthiness']-product_scores.at[p, 'healthiness'],2))+")"
    print(s)

In [11]:
product_scores = pd.read_pickle('GOOD_product_scores.pk')

In [12]:
product_scores.head()

Unnamed: 0_level_0,healthiness,count,fidelity
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,-0.09073,507,0.679629
2,2.47619,21,0.093217
3,1.306452,62,1.630632
4,-0.166667,90,0.975122
5,3.25,4,1.829932


In [13]:
# Find real recommendations
with open('THE_GOOD_users_cluster.pk', 'rb') as f:
    users_cluster = pk.load(f)

In [14]:
prod_lit = products_litteral.set_index('product_id')
prod_lit.head()

Unnamed: 0_level_0,product_name,aisle,department
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Chocolate Sandwich Cookies,cookies cakes,snacks
78,Nutter Butter Cookie Bites Go-Pak,cookies cakes,snacks
102,Danish Butter Cookies,cookies cakes,snacks
172,Gluten Free All Natural Chocolate Chip Cookies,cookies cakes,snacks
285,Mini Nilla Wafers Munch Pack,cookies cakes,snacks


In [15]:
unhealthy_food_orders = pd.read_pickle('unhealthy_food_orders.pk')

In [16]:
unhealthy_food_orders.head()

Unnamed: 0,user_id,product_id
150,4,1200
151,4,17769
152,4,43704
153,4,37646
154,4,11865


In [17]:
some_unhealthy_orders = unhealthy_food_orders.sample(100)

In [18]:
some_unhealthy_orders.head()

Unnamed: 0,user_id,product_id
14177866,204571,12761
8808,154,48395
6961760,99996,16891
13144050,189571,40396
9491310,136847,42701


In [28]:
for _, row in some_unhealthy_orders.iterrows():
    try:
        print('For product_id= ', row.product_id)
        print('Path from', prod_lit.at[row.product_id, 'product_name'], 'with healthiness', product_scores.at[row.product_id, 'healthiness'])
        path = find_recommendation_path(row.product_id, aisle=prod_lit.at[row.product_id, 'aisle'], product_scores=product_scores)
        target = path[0]
        print('To', prod_lit.at[target, 'product_name'], 'with healthiness', product_scores.at[target, 'healthiness'])
        print()
    except:
        print('============================= Echec: For user ', row.user_id)
        print('Path from', prod_lit.at[row.product_id, 'product_name'], 'with healthiness', product_scores.at[row.product_id, 'healthiness'])
        print()
        

For product_id=  46735
Path from Cherry Caffeine Free Unsweetened Soft Drink Mix with healthiness 1.4
Nb of targets: 42
To Pink Lemonade Unsweetened Soft Drink Mix with healthiness 3.0

For product_id=  8736
Path from Synergy Organic & Raw Cosmic Cranberry with healthiness 1.880787037037037
Nb of targets: 254
To Organic Raw Kombucha Gingerade with healthiness 2.34447523380672

For product_id=  21938
Path from Green Bell Pepper with healthiness 3.494960073308025
Nb of targets: 519
To Organic Garlic with healthiness 3.51486216932082

For product_id=  34222
Path from Squeeze Grape Jelly with healthiness 0.5653846153846154
Nb of targets: 250
To Organic Creamy Peanut Butter with healthiness 2.4247073072264835

For product_id=  28199
Path from Clementines, Bag with healthiness 1.7839971295299606
Nb of targets: 326
To Organic Baby Spinach with healthiness 2.8187180907607314

For product_id=  36929
Path from Milk, Vitamin D with healthiness 2.4545454545454546
Nb of targets: 96
To Organic Whole

To Sharp Cheddar Cheese Shredded with healthiness 2.050301810865191

For product_id=  29145
Path from Green Tea Blueberry Pomegranate with healthiness 1.5781990521327014
Nb of targets: 323
To Organic Unsweetened Lemon Iced Green Tea with healthiness 2.169811320754717

For product_id=  12341
Path from Hass Avocados with healthiness 1.9221343591935598
Nb of targets: 4
To Butter Lettuce with healthiness 2.1941747572815533

For product_id=  4232
Path from Ambrosia, Medium Roast with healthiness 1.9681159420289855
Nb of targets: 158
To Jacob's Wonderbar Dark Roast with healthiness 2.187258687258687

For product_id=  12384
Path from Organic Lactose Free 1% Lowfat Milk with healthiness 2.4063386944181646
Nb of targets: 175
To Organic Lactose Free 1% Lowfat Milk with healthiness 2.4063386944181646

For product_id=  44422
Path from Organic Old Fashioned Rolled Oats with healthiness 2.800900900900901
Nb of targets: 142
To Gluten Free Quick Cooking Oats with healthiness 2.8597972972972974

For pr

To Ice Cream Chocolate Mocha Chip with healthiness 2.3899676375404533

For product_id=  25146
Path from Original Orange Juice with healthiness 1.6700911590218492
Nb of targets: 254
To 100% Raw Coconut Water with healthiness 2.296306926903942

For product_id=  2054
Path from Belgium Beer with healthiness 0.6
Nb of targets: 44
To 1500 Pale Ale with healthiness 3.2222222222222223



In [39]:
# Prints some good and bad examples of predictions

good_product_ids = [8923,16419,4313,12218,516]
bad_product_ids = [28199,6184, 14809]

print("Good recommendation examples : ")
for p in good_product_ids :
    target = find_recommendation_path(row.product_id, aisle=prod_lit.at[row.product_id, 'aisle'], product_scores=product_scores)[0]
    s = prod_lit.at[p, 'product_name']+' should be replaced by '+ prod_lit.at[target, 'product_name']+ ' (healthiness gain = '+str (round(product_scores.at[target, 'healthiness']-product_scores.at[p, 'healthiness'],2))+")"
    print(s)
    
print("\nBad or strange recommendation examples : ")
for p in bad_product_ids :
    target = find_recommendation_path(row.product_id, aisle=prod_lit.at[row.product_id, 'aisle'], product_scores=product_scores)[0]
    s = prod_lit.at[p, 'product_name']+' should be replaced by '+ prod_lit.at[target, 'product_name']+ ' (healthiness gain = '+str (round(product_scores.at[target, 'healthiness']-product_scores.at[p, 'healthiness'],2))+")"
    print(s)

Good recommendation examples : 
Nb of targets: 44
Chocolate Milk Cartons 2% Reduced Fat should be replaced by 1500 Pale Ale (healthiness gain = 2.89)
Nb of targets: 44
Sweet Sliced Bread should be replaced by 1500 Pale Ale (healthiness gain = 2.58)
Nb of targets: 44
Star Wars Italian Sweet Creme Creamer should be replaced by 1500 Pale Ale (healthiness gain = 2.53)
Nb of targets: 44
Classic Wheat Bread should be replaced by 1500 Pale Ale (healthiness gain = 2.34)
Nb of targets: 44
American Cheese Slices should be replaced by 1500 Pale Ale (healthiness gain = 2.26)

Bad or strange recommendation examples : 
Nb of targets: 44
Clementines, Bag should be replaced by 1500 Pale Ale (healthiness gain = 1.44)
Nb of targets: 44
Clementines should be replaced by 1500 Pale Ale (healthiness gain = 2.46)
Nb of targets: 44
Organic Earl Grey Black Tea Bags 15 Count should be replaced by 1500 Pale Ale (healthiness gain = 1.58)


In [20]:
# mean healthiness per aisle
healthiness_per_aisle = {}
for aisle in food_aisles:
    healthiness_per_aisle[aisle] = pd.merge(by_aisle[aisle], product_scores, left_on='product_id', right_on='product_id')

In [23]:
for aisle in food_aisles:
    print(aisle, healthiness_per_aisle[aisle].mean()['healthiness'])
    print("====================")

butter 1.683022965208091
energy sports drinks 0.5959973106994256
soy lactosefree 2.049678544066623
ice cream ice 1.372019258082887
breakfast bakery 1.3105598454455978
coffee 1.23477358262605
marinades meat preparation 1.8484857537876438
kosher foods 2.2725794806356587
beers coolers 0.7434820461027672
canned meat seafood 1.8672391934007981
tortillas flat bread 2.0540074158785098
frozen meals 0.26754207134166325
cookies cakes 0.7943586097177386
chips pretzels 1.326160376033867
packaged vegetables fruits 1.8773111512029206
popcorn jerky 0.9440996708025281
fresh vegetables 3.175854541779337
candy chocolate 0.7567353956430564
salad dressing toppings 1.8100482857846063
grains rice dried goods 2.5794504312590965
red wines 1.2536254665546902
packaged produce 1.4934894972637078
specialty cheeses 2.026884802584048
packaged seafood 2.2347788901513415
spices seasonings 2.4253572657093
frozen vegan vegetarian 1.8708081295539487
yogurt 1.6868528758265835
canned meals beans 1.5207193160268189
instant

In [24]:
# pour voir la healthiness d'un produit en particulier:
healthiness_per_aisle['packaged produce'][healthiness_per_aisle['packaged produce'].product_id==21385]

Unnamed: 0,product_id,product_name,aisle,department,healthiness,count,fidelity
18,21385,Butter Lettuce,packaged produce,produce,2.194175,618,0.807941


In [25]:
# print proportion of products above the healthiness threshold for each aisle
for aisle in food_aisles:
    t = 0
    g = 0
    lst = []
    for ind in by_aisle[aisle]['product_id']:
        if ind in product_scores.index:
            lst.append(product_scores.at[ind, 'healthiness'])
        if ind in product_scores.index and product_scores.at[ind, 'healthiness']>=2:
            g += 1
        t+=1
    #print(by_aisle[aisle]['product_id'])
    print(aisle, float(g)/t, g, t)
    #print(sorted(lst))

butter 0.44666666666666666 67 150
energy sports drinks 0.09863945578231292 29 294
soy lactosefree 0.6109215017064846 179 293
ice cream ice 0.3290559120073327 359 1091
breakfast bakery 0.29646017699115046 67 226
coffee 0.26029411764705884 177 680
marinades meat preparation 0.45965770171149145 188 409
kosher foods 0.591715976331361 100 169
beers coolers 0.13246753246753246 51 385
canned meat seafood 0.48056537102473496 136 283
tortillas flat bread 0.5601659751037344 135 241
frozen meals 0.05909090909090909 52 880
cookies cakes 0.16590389016018306 145 874
chips pretzels 0.32659251769464104 323 989
packaged vegetables fruits 0.5349593495934959 329 615
popcorn jerky 0.1930379746835443 61 316
fresh vegetables 0.9332161687170475 531 569
candy chocolate 0.19743178170144463 246 1246
salad dressing toppings 0.45 252 560
grains rice dried goods 0.7232142857142857 243 336
red wines 0.24568965517241378 57 232
packaged produce 0.125 4 32
specialty cheeses 0.5608856088560885 152 271
packaged seafood 

In [28]:
product_scores.sort_values(by="healthiness").head()

Unnamed: 0_level_0,healthiness,count,fidelity
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
32114,-1.0,1,0.0
30980,-1.0,1,0.0
30993,-1.0,1,0.0
30998,-1.0,2,0.0
31011,-1.0,3,0.821429


The very first idea coming to mind when one wants to improve people's consumption habits is to make them consume healthy products. The problem of this approach is the requirement of a "healthiness score" that is not easy to evaluate.
A second approach would be to advise products that consumers usually like. Again, this will bring us to define an "appreciation score" reflecting how likely a consumer is to re-buy this product.


# I - Healthiness score

No information concerning the products' healthiness is provided in the datasets. By clustering the users according to their consumptions in each aisle, we can separate them into more or less healthy groups. Now, we define the healthiness of a production as a function of the distribution of people who consumed this product.
For instance, if a "healthy" user consumes a certain product, it gives 4 points to this product. An "average" consumer would give 1 point, while an "unhealthy" consumer would remove 1 point to the product. Then, it is easy to adapt this (arbitrary) scores making a weighted average taking the number of consumptions into account. The bigger the score, the healthier the product.
In particular, if a product has a score of -1, it means that it's only consumed by people of the unhealthy cluster, and a product rated 4 is only conumed by people of the healthy cluster.
The healthiness score may be irrelevant when a product is only consumed by very few people, because it introduces a great variance in score evaluation.

## II - Appreciation score

We can decude from our dataset the average number of times a product is re-bought. Nevertheless, we have to take the consumer's consumption habits into account. For instance, imagine a consumer, Ada, buying everytime the same set of products, and bought each of them 10 times, except one product, let's say "chicory and ham", which wasn't so good, and she bought it only 4 times. Now imagine  a second consumer Robert, who doesn't like consuming the same products, and tries to have a very varied consumption. He bought once each product, except one, say "goat cheese", which he loved and bought 3 times. If Ada and Robert are the only consumers of the dataset, "chicory and ham" would have a better appreciation score (that is 4) than "goat cheese" (score of 3). We don't want this to happen ! 
This is why we normalize the contribution of each user by the average time he or she consumes each product. Then, we avarege the scores obtained for each user, and obtain a general appreciation score for the product. The greater, the more the product is likely to be re-bought. The score itself is difficult to  interpret, but we can compare different scores quite safely.

## III - Limitations

Some products have been consumed by very few people, sometimes 0, so we have no (reliable) information about their scores. A quick exploration of these products show that they are not products we would want to advise, because they are too specific. As an example, we can mention and . Our decision for these products is to give them a score of 0.