In [1]:
# import packages
import random
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity 

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline 
plt.style.use('seaborn')

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
# import data
order_products_prior = pd.read_csv('C:/Users/mwamb/Desktop/Desktop/instacart/data/order_products__prior.csv')
order_products_train = pd.read_csv('C:/Users/mwamb/Desktop/Desktop/instacart/data/order_products__train.csv')
orders = pd.read_csv('C:/Users/mwamb/Desktop/Desktop/instacart/data/orders.csv')
products = pd.read_csv('C:/Users/mwamb/Desktop/Desktop/instacart/data/products.csv')
departments = pd.read_csv('C:/Users/mwamb/Desktop/Desktop/instacart/data/departments.csv')
aisles = pd.read_csv('C:/Users/mwamb/Desktop/Desktop/instacart/data/aisles.csv')

In [4]:
print(order_products_prior.shape)
print(order_products_train.shape)
print(orders.shape)
print(aisles.shape)
print(departments.shape)
print(products.shape)

(603635, 4)
(1384617, 4)
(3421083, 7)
(134, 2)
(21, 2)
(49688, 4)


In [5]:
# concat order_products_prior and train together to get a total list of ordered products
order_products_total = pd.concat([order_products_prior, order_products_train])

print('there are', order_products_total.shape[0], 'number of products have been ordered')

there are 1988252 number of products have been ordered


In [6]:
# merge order_products_total with products to get product names
order_products_total = order_products_total.drop('add_to_cart_order', axis = 1)
order_products_total = order_products_total.merge(products[['product_id', 'product_name']],how='left', on='product_id')
order_products_total.head()

Unnamed: 0,order_id,product_id,reordered,product_name
0,2,33120,1,Organic Egg Whites
1,2,28985,1,Michigan Organic Kale
2,2,9327,0,Garlic Powder
3,2,45918,1,Coconut Butter
4,2,30035,0,Natural Sweetener


In [9]:
order_products_total['reordered'].unique()

array([1, 0], dtype=int64)

. To build my recommeder, I decide to focus on products that have been reordered before for users.  
. Pipeline

### Find products that have reordered before

In [7]:
# get the list of orders that have been reordered before
reorders = order_products_total[order_products_total['reordered'] == 1]
reorders.shape

(1185417, 4)

In [8]:
reorders.head()

Unnamed: 0,order_id,product_id,reordered,product_name
0,2,33120,1,Organic Egg Whites
1,2,28985,1,Michigan Organic Kale
3,2,45918,1,Coconut Butter
5,2,17794,1,Carrots
6,2,40141,1,Original Unflavored Gelatine Mix


In [10]:
orders2 = orders[['order_id', 'user_id']]

In [16]:
orders2.head()

Unnamed: 0,order_id,user_id
0,2539329,1
1,2398795,1
2,473747,1
3,2254736,1
4,431534,1


In [11]:
# merge to get user_id and product_id
user_orders = reorders.merge(orders2, on='order_id')

In [13]:
user_orders.head()

Unnamed: 0,order_id,product_id,reordered,product_name,user_id,high_volume
0,2,33120,1,Organic Egg Whites,202279,
1,2,28985,1,Michigan Organic Kale,202279,True
2,2,45918,1,Coconut Butter,202279,False
3,2,17794,1,Carrots,202279,True
4,2,40141,1,Original Unflavored Gelatine Mix,202279,True


In [12]:
# filtering out the high volumn products that user reordered more than once
user_orders['high_volume'] = (user_orders['product_id'].value_counts().sort_values(ascending=False)>1)
high_volume = user_orders[user_orders['high_volume'] == True]

In [17]:
high_volume.head()

Unnamed: 0,order_id,product_id,reordered,product_name,user_id,high_volume
1,2,28985,1,Michigan Organic Kale,202279,True
3,2,17794,1,Carrots,202279,True
4,2,40141,1,Original Unflavored Gelatine Mix,202279,True
8,3,17704,1,Lemons,205970,True
9,3,21903,1,Organic Baby Spinach,205970,True


In [14]:

# get a matrix of different high volume items that particular user purchased
high_volume_users = high_volume.groupby(['user_id', 'product_name']).size().sort_values(ascending=False).unstack().fillna(0)

In [18]:
high_volume_users.head()

product_name,0% Fat Blueberry Greek Yogurt,0% Fat Free Organic Milk,0% Greek Strained Yogurt,0% Greek Yogurt Black Cherry on the Bottom,"0% Greek, Blueberry on the Bottom Yogurt",1 Apple + 1 Mango Fruit Bar,1 Apple + 1 Pear Fruit Bar,1 Liter,1 Ply Paper Towels,1% Low Fat Chocolate Milk,...,Zesty Pizza Sprouted Flax Snax,Zinfandel,Zucchini Banana & Amaranth Organic Baby Food,Zucchini Gingerbread Carrot Smart Cookies,"Zucchini, Spinach & Banana Blend Veggies On-The-Go Stage 2 (6 Months and Up)",of Hanover 100 Calorie Pretzels Mini,smartwater® Electrolyte Enhanced Water,vitaminwater® XXX Acai Blueberry Pomegranate,with Crispy Almonds Cereal,with Olive Oil Mayonnaise Dressing
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
90,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# calculate similarity between each user
cosine_dists = pd.DataFrame(cosine_similarity(high_volume_users),index=high_volume_users.index, columns=high_volume_users.index)

In [20]:

cosine_dists.head()

user_id,27,66,90,150,155,206,208,214,222,382,...,205794,205908,205943,205970,205990,206043,206082,206105,206162,206206
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
90,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
155,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
def Recommender_System(user_id):
    
    '''
    enter user_id and return a list of 5 recommendations.
    '''
    
    u = high_volume.groupby(['user_id','product_name']).size().sort_values(ascending=False).unstack().fillna(0)
    u_sim = pd.DataFrame(cosine_similarity(u), index=u.index, columns=u.index)

    p = high_volume.groupby(['product_name','user_id']).size().sort_values(ascending=False).unstack().fillna(0)
    
    recommendations = pd.Series(np.dot(p.values,cosine_dists[user_id]), index=p.index)
    return recommendations.sort_values(ascending=False).head()

In [22]:
#gives a random user ID to for input to the recommender system
random.sample(high_volume['user_id'].tolist(),1)

[88292]

In [23]:
Recommender_System(91397)

product_name
Bag of Organic Bananas    153.933381
Organic Strawberries        8.236779
Organic Baby Spinach        8.224785
Organic Hass Avocado        8.008435
Organic Whole Milk          5.207724
dtype: float64

In [24]:
_ = order_products_total.merge(orders, on='order_id', how='left')

In [25]:

_.head()

Unnamed: 0,order_id,product_id,reordered,product_name,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,Organic Egg Whites,202279,prior,3,5,9,8.0
1,2,28985,1,Michigan Organic Kale,202279,prior,3,5,9,8.0
2,2,9327,0,Garlic Powder,202279,prior,3,5,9,8.0
3,2,45918,1,Coconut Butter,202279,prior,3,5,9,8.0
4,2,30035,0,Natural Sweetener,202279,prior,3,5,9,8.0


In [26]:
_[_.user_id == 91397].product_name.value_counts().head(20)

Maple & Pecan Granola Gluten Free                                                2
Organic Chicken Strips                                                           2
Organic Whole Milk                                                               2
Organic Plain Whole Milk Yogurt                                                  2
Limes                                                                            2
Organic Vanilla Whole Milk Yogurt                                                1
Mild Sliced Cheddar Cheese                                                       1
Bag of Organic Bananas                                                           1
Organic Large Extra Fancy Fuji Apple                                             1
Shredded Mild Cheddar Cheese                                                     1
Organic Seasoned Yukon Select Potatoes Hashed Browns                             1
Red Vine Tomato                                                                  1
Orga

In [27]:

random.sample(high_volume['user_id'].tolist(),1)

[96576]

In [28]:
Recommender_System(175965)

product_name
Unsweetened Vanilla Almond Milk                                  6.781353
Holler Mountain Organic Coffee                                   2.054484
Organic Bagged Mini Dark Peanut Butter                           1.294378
Mint Chip Almond Milk Non-Dairy Frozen Dessert                   1.000000
Raw Probiotics Women probiotic 85 Billion 32 Strains capsules    1.000000
dtype: float64

In [29]:
_[_.user_id == 175965].product_name.value_counts().head(20)

Unsweetened Vanilla Almond Milk                                  5
Organic SprouTofu Silken Tofu                                    4
Organic Granny Smith Apple                                       4
Raw Probiotics Women probiotic 85 Billion 32 Strains capsules    3
Organic Roasted Turkey Breast                                    3
Organic Large Green Asparagus                                    3
Organic Hot Italian Chicken Sausage                              2
Organic Strawberries                                             2
Organic Carrot Bunch                                             2
Blue Chips Corn Tortilla Chips                                   2
Sweet Onion                                                      2
Organic Blackberries                                             2
Mint Chip Almond Milk Non-Dairy Frozen Dessert                   2
Gummy Bears                                                      2
Organic Bagged Mini Dark Peanut Butter                        

### Metric

In [30]:
users = high_volume.user_id.unique().tolist()

In [31]:
def how_match():
    res = []
    for user in sorted(users)[:1000]:
        recommendations = Recommender_System(user)
        top_20_itmes = _[_.user_id == user].product_name.value_counts().head(20)
    
        recommendations_list = recommendations.index.tolist()
        top_20_items_list = top_20_itmes.index.tolist()
    
        res.append((len(set(recommendations_list) & set(top_20_items_list)))/5)
    return np.mean(res)

In [None]:

# get metric for the :1000 users
how_match()