In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import gzip
import copy
import pickle as pk
import heapq as hp

import networkx as nx
from networkx.algorithms import bipartite, components, shortest_paths

# Importing the datasets, cleaning, transforming

In [2]:
# Load the pickled product catalogue; it is re-indexed by `product_id` further down
# to look up product names and aisles.
products_litteral = pd.read_pickle('products_litteral.pk')

# Making graphs

In [3]:
# Load the pickled orders/products table.
# NOTE(review): `orders_products` is not referenced in the cells visible here — confirm it is still needed.
orders_products = pd.read_pickle('orders_products.pk')

In [4]:
# Load the collection of food aisle names; it is iterated below to locate the per-aisle files.
# pickle.load can execute arbitrary code: only load pickles that come from a trusted source.
with open('food_aisles.pk', 'rb') as file:
    food_aisles = pk.load(file)

In [5]:
# Map every aisle name to its pickled per-aisle product table ('by_aisle_<aisle>.pk').
by_aisle = {}
for aisle in food_aisles:
    by_aisle[aisle] = pd.read_pickle('by_aisle_'+aisle+'.pk')

In [6]:
def find_recommendation_path(source_id, G=None, aisle=None, health_thresh=2, product_scores=None, hscores=None, fscores=None, alpha=.3, beta=.2, max_candidates=10):
    """Recommend a healthier product reachable from ``source_id`` in an aisle graph.

    Targets are the nodes of ``G`` whose healthiness is at least ``health_thresh``.
    Shortest paths (edge attribute ``'weight'``) are computed from ``source_id`` on
    the subgraph restricted to products at least as healthy as the source, so a
    path never goes through a less healthy product.  The ``max_candidates`` closest
    reachable targets are then ranked with

        (1 + health)**alpha * (1 + fidelity)**beta * (1 + closeness)**(1 - alpha - beta)

    where each term is min-max normalised over the shortlisted candidates
    (closeness is 1 for the nearest candidate and 0 for the farthest).

    Note that the source itself is a valid target (at distance 0) when its own
    healthiness reaches ``health_thresh``; the result may then be ``[source_id]``.

    Parameters
    ----------
    source_id : node id of the product to start from; must be scored in ``product_scores``.
    G : graph, optional.  When omitted, it is loaded from
        ``'thresholded_inv_' + aisle + '.gpk'``.
    aisle : str, optional.  Aisle name used to locate the graph file when ``G`` is None.
    health_thresh : minimum healthiness for a node to be a recommendation target.
    product_scores : DataFrame indexed by product id with ``'healthiness'`` and
        ``'fidelity'`` columns.  Loaded from the pickle ``'product_scores'`` when omitted.
    hscores, fscores : unused; kept so existing callers keep working.
    alpha, beta : exponents of the healthiness and fidelity terms.
    max_candidates : number of closest reachable targets that get ranked.

    Returns
    -------
    list or None
        Node ids from the recommended product back to the source (``path[0]`` is the
        recommendation, ``path[-1]`` is ``source_id``), or ``None`` (after printing
        the reason) when nothing can be recommended.

    Raises
    ------
    TypeError : neither ``G`` nor ``aisle`` was given.
    KeyError : ``source_id`` has no row in ``product_scores``.
    """
    if G is None:
        if aisle is None:
            raise TypeError("find_recommendation_path() needs either a graph `G` or an `aisle` name")
        graph_path = 'thresholded_inv_'+aisle+'.gpk'
        if hasattr(nx, 'read_gpickle'):
            G = nx.read_gpickle(graph_path)
        else:
            # networkx >= 3.0 removed read_gpickle; the files are plain pickles.
            # Pickles can execute arbitrary code: only load trusted graph files.
            with open(graph_path, 'rb') as graph_file:
                G = pk.load(graph_file)
    if product_scores is None:
        product_scores = pd.read_pickle('product_scores')
    if source_id not in product_scores.index:
        raise KeyError("source_id %r has no entry in product_scores" % (source_id,))

    # Looked up once instead of once per graph node.
    source_health = product_scores.at[source_id, "healthiness"]

    targets = set()   # healthy enough to be recommended
    allowed = set()   # at least as healthy as the source: usable on a path
    for node in G.nodes:
        if node not in product_scores.index:
            continue
        node_health = product_scores.at[node, "healthiness"]
        if node_health >= health_thresh:
            targets.add(node)
        if node_health >= source_health:
            allowed.add(node)
    print('Nb of targets:', len(targets))

    # do not allow paths through even less healthy products:
    G_ok = G.subgraph(allowed)
    pred, lengths = nx.dijkstra_predecessor_and_distance(G_ok, source_id, weight='weight')

    # Heap of (-distance, node): its root is the farthest shortlisted candidate,
    # so it is the one evicted when a closer target shows up.  The (-inf, None)
    # sentinel lets the first real candidates in unconditionally.
    candidates = [(-np.inf, None)]
    reachable = 0
    for candidate in targets:
        if candidate not in lengths:
            continue
        reachable += 1
        if - lengths[candidate] > candidates[0][0]:
            if len(candidates) >= max_candidates:
                hp.heappop(candidates)
            hp.heappush(candidates, (- lengths[candidate], candidate))
    if reachable == 0:
        print("not connected to a healthy product")
        return None

    shortlist = [(- mdist, candidate) for (mdist, candidate) in candidates if mdist != -np.inf]
    if not shortlist:
        print('No path to healthy products')
        return None

    fids = [product_scores.at[candidate, 'fidelity'] for (_, candidate) in shortlist]
    dists = [distance for (distance, _) in shortlist]
    healths = [product_scores.at[candidate, 'healthiness'] for (_, candidate) in shortlist]
    min_fid, max_fid = min(fids), max(fids)
    min_dist, max_dist = min(dists), max(dists)
    min_health, max_health = min(healths), max(healths)

    best_score, best = 0, None
    for (distance, candidate), raw_fid, raw_health in zip(shortlist, fids, healths):
        fid = ((raw_fid - min_fid) / (max_fid - min_fid)) if max_fid != min_fid else 0
        # inverted on purpose: the closest candidate gets 1, the farthest 0
        dist = ((max_dist - distance) / (max_dist - min_dist)) if max_dist != min_dist else 0
        health = ((raw_health - min_health) / (max_health - min_health)) if max_health != min_health else 0
        score = (1+health)**alpha * (1+fid)**beta * (1+dist)**(1-alpha-beta)
        if score > best_score:
            best_score, best = score, candidate
    if best is None:
        # Only possible when every score is NaN (missing fidelity/healthiness values).
        print('No candidate could be scored')
        return None

    # Walk the predecessor map back from the recommendation to the source.
    path = [best]
    while path[-1] != source_id:
        path.append(pred[path[-1]][0])
    return path

In [7]:
# Load the per-product scores (indexed by product_id; healthiness / count / fidelity columns,
# see the preview below).  This frame is passed explicitly to find_recommendation_path later on.
product_scores = pd.read_pickle('GOOD_product_scores.pk')


In [8]:
# Preview the first rows of the score table.
product_scores.head()

Unnamed: 0_level_0,healthiness,count,fidelity
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,-0.09073,507,0.679629
2,2.47619,21,0.093217
3,1.306452,62,1.630632
4,-0.166667,90,0.975122
5,3.25,4,1.829932


In [9]:
# Find real recommendations
# Load the pickled user clusters (pickle.load can execute arbitrary code: trusted files only).
# NOTE(review): `users_cluster` is not used in the cells visible here — confirm it is still needed.
with open('THE_GOOD_users_cluster.pk', 'rb') as f:
    users_cluster = pk.load(f)

In [10]:
# Index the product catalogue by product_id so that names and aisles can be looked up
# with prod_lit.at[product_id, <column>].
prod_lit = products_litteral.set_index('product_id')
prod_lit.head()

Unnamed: 0_level_0,product_name,aisle,department
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Chocolate Sandwich Cookies,cookies cakes,snacks
78,Nutter Butter Cookie Bites Go-Pak,cookies cakes,snacks
102,Danish Butter Cookies,cookies cakes,snacks
172,Gluten Free All Natural Chocolate Chip Cookies,cookies cakes,snacks
285,Mini Nilla Wafers Munch Pack,cookies cakes,snacks


In [11]:
# Load the pickled (user_id, product_id) order rows.
# NOTE(review): presumably restricted to products judged unhealthy; the selection criterion is not shown here.
unhealthy_food_orders = pd.read_pickle('unhealthy_food_orders.pk')

In [12]:
# Preview the first rows of the order table.
unhealthy_food_orders.head()

Unnamed: 0,user_id,product_id
150,4,1200
151,4,17769
152,4,43704
153,4,37646
154,4,11865


In [13]:
# Draw a reproducible sample of at most 100 order rows to try the recommender on.
# A fixed random_state keeps the rows (and therefore the recommendations printed
# below) identical across "Restart & Run All"; min() avoids the ValueError that
# sample() raises when asked for more rows than the frame holds.
some_unhealthy_orders = unhealthy_food_orders.sample(
    n=min(100, len(unhealthy_food_orders)),
    random_state=42,
)

In [14]:
# Preview the sampled rows.
some_unhealthy_orders.head()

Unnamed: 0,user_id,product_id
6991621,100412,25706
732687,10722,28199
12847343,185260,6878
9967001,143675,46226
12681324,182891,18479


In [15]:
# For every sampled order, print the purchased product and the healthier product
# recommended for it.  Failures are reported per row so that one bad product does
# not abort the whole loop.
for _, row in some_unhealthy_orders.iterrows():
    print('For user ', row.user_id)
    try:
        print('Path from', prod_lit.at[row.product_id, 'product_name'], 'with healthiness', product_scores.at[row.product_id, 'healthiness'])
        path = find_recommendation_path(row.product_id, aisle=prod_lit.at[row.product_id, 'aisle'], product_scores=product_scores)
        if path is None:
            # find_recommendation_path already printed why nothing could be recommended.
            print('============================= Echec: For user ', row.user_id)
            print()
            continue
        # The path runs from the recommended product back to the source.
        target = path[0]
        print('To', prod_lit.at[target, 'product_name'], 'with healthiness', product_scores.at[target, 'healthiness'])
        print()
    except Exception as err:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through,
        # and the handler does no lookups of its own, so it cannot fail a second time.
        print('============================= Echec: For user ', row.user_id, '-', repr(err))
        print()
        

For user  100412
Path from Hazelnut Bromstad Design Liquid Coffee Creamer with healthiness 0.6894164193867458
Nb of targets: 35
To Organic Heavy Whipping Cream with healthiness 2.9727683122178536

For user  10722
Path from Clementines, Bag with healthiness 1.7839971295299606
Nb of targets: 326
To Organic Baby Spinach with healthiness 2.8187180907607314

For user  185260
Path from Red Blend with healthiness 0.7757575757575758
Nb of targets: 52
To Pinot Noir, California, 2010 with healthiness 3.2285714285714286

For user  143675
Path from Thick & Crispy Tortilla Chips with healthiness 2.219419324131366
Nb of targets: 318
To Thick & Crispy Tortilla Chips with healthiness 2.219419324131366

For user  182891
Path from Organic Low Sodium Vegetable Broth with healthiness 3.2069206571128976
Nb of targets: 236
To Vegetable Broth Low Sodium with healthiness 3.3223300970873786

For user  13388
Path from Italian Pizza Crust Original 12oz with healthiness 1.1197604790419162
Nb of targets: 131
To Ye

To Pyramid Black Tea Bags with healthiness 2.343243243243243

For user  46054
Path from Organic Low Fat Cottage Cheese with healthiness 2.238878143133462
Nb of targets: 76
To Organic Low Fat Cottage Cheese with healthiness 2.238878143133462

For user  152991
Path from Uncured Bacon + Cranberry Bison Bar with healthiness 1.8709677419354838
Nb of targets: 56
To Beef Steak with Cranberry & Sriracha with healthiness 2.0344827586206895

For user  172979
Path from Cherrios Honey Nut with healthiness 0.4354995150339476
Nb of targets: 179
To Organic Honey Nut O's Cereal with healthiness 2.310344827586207

For user  115074
Path from Unsweetened Original Almond Breeze Almond Milk with healthiness 2.1137014314928426
Nb of targets: 175
To Unsweetened Original Almond Breeze Almond Milk with healthiness 2.1137014314928426

For user  9939
Path from Oyster Crackers with healthiness 2.33605220228385
Nb of targets: 272
To Crackers Harvest Whole Wheat with healthiness 2.490731204943357

For user  160884


Nb of targets: 449
To Organic YoKids Smoothie Strawberry Banana Lowfat Yogurt with healthiness 2.245877788554801

For user  184502
Path from Small Curd Lowfat 2% Milkfat Cottage Cheese with healthiness 1.4439592430858808
Nb of targets: 76
To Organic Sour Cream with healthiness 2.948473282442748



In [16]:
# Join each aisle's product table with the product scores on product_id;
# the per-aisle means are computed from these merged frames in the next cell.
healthiness_per_aisle = dict()
for aisle in food_aisles:
    healthiness_per_aisle[aisle] = by_aisle[aisle].merge(
        product_scores,
        left_on='product_id',
        right_on='product_id',
    )

In [17]:
# Print the mean healthiness of each aisle.
# Only the 'healthiness' column is averaged: calling DataFrame.mean() on the whole
# merged frame also tries to average the text columns (product_name, aisle,
# department), which raises a TypeError on pandas >= 2.0.
for aisle in food_aisles:
    print(aisle, healthiness_per_aisle[aisle]['healthiness'].mean())
    print("====================")

condiments 2.004907583297024
trail mix snack mix 0.9634255243411282
frozen vegan vegetarian 1.8708081295539487
packaged poultry 1.870988241007237
refrigerated 1.6168735790431676
ice cream ice 1.372019258082887
frozen meat seafood 1.1495443508086283
frozen dessert 0.9712701079368742
refrigerated pudding desserts 1.0508712509881477
bakery desserts 1.172374385646769
yogurt 1.6868528758265835
ice cream toppings 1.0968692866158418
frozen breakfast 1.1212457073295234
dry pasta 2.3914223590514916
cream 1.0683572142703215
fresh herbs 2.9310565291669697
cookies cakes 0.7943586097177386
soy lactosefree 2.049678544066623
hot cereal pancake mixes 1.801588915528622
butter 1.683022965208091
bulk dried fruits vegetables 2.2577091884944296
pasta sauce 1.9266203231247623
frozen pizza 0.7212447554510947
red wines 1.2536254665546902
frozen meals 0.26754207134166325
canned meals beans 1.5207193160268189
nuts seeds dried fruit 1.6247456278442978
eggs 2.1004976545045837
breakfast bakery 1.3105598454455978
b

In [18]:
# Inspect the healthiness of one particular product:
healthiness_per_aisle['packaged produce'].loc[lambda products: products.product_id == 21385]

Unnamed: 0,product_id,product_name,aisle,department,healthiness,count,fidelity
18,21385,Butter Lettuce,packaged produce,produce,2.194175,618,0.807941


In [19]:
# Print, for each aisle: the proportion of its products whose healthiness reaches the
# threshold (2), the number of such products, and the total number of products.
# Products without a score count in the total but never as healthy.
for aisle in food_aisles:
    aisle_product_ids = by_aisle[aisle]['product_id']
    scored_ids = aisle_product_ids[aisle_product_ids.isin(product_scores.index)]
    g = int((product_scores.loc[scored_ids, 'healthiness'] >= 2).sum())
    t = len(aisle_product_ids)
    # An empty aisle has no meaningful proportion; report NaN instead of dividing by zero.
    print(aisle, g / t if t else float('nan'), g, t)

condiments 0.5364806866952789 250 466
trail mix snack mix 0.2028985507246377 14 69
frozen vegan vegetarian 0.5185185185185185 98 189
packaged poultry 0.494949494949495 49 99
refrigerated 0.3851851851851852 260 675
ice cream ice 0.3290559120073327 359 1091
frozen meat seafood 0.22270742358078602 51 229
frozen dessert 0.17857142857142858 20 112
refrigerated pudding desserts 0.17346938775510204 17 98
bakery desserts 0.28619528619528617 85 297
yogurt 0.4415204678362573 453 1026
ice cream toppings 0.24705882352941178 21 85
frozen breakfast 0.27364864864864863 81 296
dry pasta 0.7242888402625821 331 457
cream 0.2360248447204969 38 161
fresh herbs 0.8488372093023255 73 86
cookies cakes 0.16590389016018306 145 874
soy lactosefree 0.6109215017064846 179 293
hot cereal pancake mixes 0.4884488448844885 148 303
butter 0.44666666666666666 67 150
bulk dried fruits vegetables 0.6666666666666666 8 12
pasta sauce 0.5413533834586466 216 399
frozen pizza 0.15223880597014924 51 335
red wines 0.24568965517

In [20]:
# Preview the score table again (same frame as displayed earlier in the notebook).
product_scores.head()

Unnamed: 0_level_0,healthiness,count,fidelity
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,-0.09073,507,0.679629
2,2.47619,21,0.093217
3,1.306452,62,1.630632
4,-0.166667,90,0.975122
5,3.25,4,1.829932
