In [29]:
import pandas as pd
import os
import numpy as np


In [30]:
# define current working folder
# curr_dir = os.path.dirname(os.path.realpath(__file__))
curr_dir = os.getcwd()

In [31]:
# import tables from Instacart Kaggle dataset
aisles = pd.read_csv(os.path.join(curr_dir, 'aisles.csv'))
departments = pd.read_csv(os.path.join(curr_dir, 'departments.csv'))
order_products_prior = pd.read_csv(os.path.join(curr_dir, 'order_products__prior.csv'))
order_products_train = pd.read_csv(os.path.join(curr_dir, 'order_products__train.csv'))
orders = pd.read_csv(os.path.join(curr_dir, 'orders.csv'))
products = pd.read_csv(os.path.join(curr_dir, 'products.csv'))

In [32]:
# stack the previous train/"prior" datasets used for training/test
# we want to redo the train/test groups and not rely on the Kaggle 
# dataset creator's groups since we're going to drop data
order_products = order_products_train
order_products = order_products.append(order_products_prior)

In [33]:
# check for duplicated primary keys
aisles_dups = aisles[aisles.duplicated(['aisle_id'])]
departments_dups = departments[departments.duplicated(['department_id'])]
order_products_prior_dups = order_products_prior[order_products_prior.duplicated(['order_id','product_id'])]
order_products_train_dups = order_products_train[order_products_train.duplicated(['order_id','product_id'])]
orders_dups = orders[orders.duplicated(['order_id'])]
products_dups = products[products.duplicated(['product_id'])]
order_products_dups = order_products[order_products.duplicated(['order_id','product_id'])]

print('Check for duplicates in primary key of dataframes before merging:')
print('\nAISLES')
print(aisles_dups)
print('\nDEPARTMENTS')
print(departments_dups)
print('\nORDER_PRODUCTS_PRIOR')
print(order_products_prior_dups)
print('\nORDER_PRODUCTS_TRAIN')
print(order_products_train_dups)
print('\nORDER_DUPS')
print(orders_dups)
print('\nPRODUCT_DUPS')
print(products_dups)
print('\nORDER_PRODUCT_DUPS')
print(order_products_dups)


Check for duplicates in primary key of dataframes before merging:

AISLES
Empty DataFrame
Columns: [aisle_id, aisle]
Index: []

DEPARTMENTS
Empty DataFrame
Columns: [department_id, department]
Index: []

ORDER_PRODUCTS_PRIOR
Empty DataFrame
Columns: [order_id, product_id, add_to_cart_order, reordered]
Index: []

ORDER_PRODUCTS_TRAIN
Empty DataFrame
Columns: [order_id, product_id, add_to_cart_order, reordered]
Index: []

ORDER_DUPS
Empty DataFrame
Columns: [order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order]
Index: []

PRODUCT_DUPS
Empty DataFrame
Columns: [product_id, product_name, aisle_id, department_id]
Index: []

ORDER_PRODUCT_DUPS
Empty DataFrame
Columns: [order_id, product_id, add_to_cart_order, reordered]
Index: []


In [34]:
# merge the tables together because there are no duplicates on primary keys 
all_merged_data = pd.merge(order_products, products, how='left', on=['product_id', 'product_id'])
all_merged_data = pd.merge(all_merged_data, orders, how='left', on=['order_id', 'order_id'])
all_merged_data = pd.merge(all_merged_data, aisles, how='left', on=['aisle_id', 'aisle_id'])
all_merged_data = pd.merge(all_merged_data, departments, how='left', on=['department_id', 'department_id'])


In [35]:
# CUT DOWN ON PRODUCTS
# Need to reduce the number of products in the user/products matrix for recommendations (123k users x 49k products = ~110GB of RAM required for float distance calculations)

# get the products and the number of orders they were sold in
product_dist = all_merged_data.groupby(['product_id','product_name']).size().sort_values(ascending=False).reset_index()
product_dist.columns = ['product_id','product_name','count']

# loop through the count of products by order they were sold in to see
# information about products with very low sales (like % of volume) 
for i in range(1,11):
    threshold = 20 * i

    pct_of_prod_ids_under_threshold = 100 * product_dist['product_id'].loc[product_dist['count'] <= threshold].count() / product_dist['product_id'].count()
    
    pct_of_prod_volume_sold_under_threshold = 100 * product_dist['count'].loc[product_dist['count'] <= threshold].sum() / product_dist['count'].sum()

    print(str(round(pct_of_prod_ids_under_threshold,2)) + '% of product codes have ' + str(threshold) + \
            ' or fewer sales accounting for ' + str(round(pct_of_prod_volume_sold_under_threshold,2)) + '% of all sales')

# choose product sales volume threshold for inclusion in model
threshold = 200

# drop low-sale-count products from all_merged data (likely not enough purchases to make good recommendations, )
product_dist.drop(product_dist['product_id'].loc[product_dist['count'] <= threshold].index, inplace=True)

print('\n\nTOP PRODUCTS (BY ORDERS CONTAINING PRODUCT)')
product_dist

27.66% of product codes have 20 or fewer sales accounting for 0.42% of all sales
41.28% of product codes have 40 or fewer sales accounting for 1.01% of all sales
49.35% of product codes have 60 or fewer sales accounting for 1.6% of all sales
54.97% of product codes have 80 or fewer sales accounting for 2.18% of all sales
58.98% of product codes have 100 or fewer sales accounting for 2.71% of all sales
62.22% of product codes have 120 or fewer sales accounting for 3.23% of all sales
64.88% of product codes have 140 or fewer sales accounting for 3.74% of all sales
67.15% of product codes have 160 or fewer sales accounting for 4.24% of all sales
68.94% of product codes have 180 or fewer sales accounting for 4.69% of all sales
70.53% of product codes have 200 or fewer sales accounting for 5.14% of all sales


TOP PRODUCTS (BY ORDERS CONTAINING PRODUCT)


Unnamed: 0,product_id,product_name,count
0,24852,Banana,491291
1,13176,Bag of Organic Bananas,394930
2,21137,Organic Strawberries,275577
3,21903,Organic Baby Spinach,251705
4,47209,Organic Hass Avocado,220877
...,...,...,...
14637,43831,Extra Dry Champagne,201
14638,5247,Zero Cola,201
14639,41900,Bagel Chips Toasted Garlic,201
14640,17142,Tandoori Inspired Spiced Chicken,201


In [36]:
# CUT DOWN ON USERS
# Need to reduce the number of users in the user/products matrix for recommendations (123k users x 49k products = ~110GB of RAM required for float distance calculations)

# step 1 - get the number of products by order and user
user_order_dist = all_merged_data.groupby(['user_id','order_id']).agg(count=('product_id','count')).sort_values(by='count',ascending=False).reset_index()
user_order_dist.columns = ['user_id','order_id','count']

# step 2 - get the number of products (count them for each order) by user
user_order_prod_count_dist = user_order_dist.groupby(['user_id']).agg(count=('count','sum')).reset_index().sort_values(by='count',ascending=False)
user_order_prod_count_dist

# step 2 - get the number of orders by user (there are between 3 and 100 orders per user in this dataset)
user_dist = user_order_dist.groupby(['user_id']).size().sort_values(ascending=False).reset_index()
user_dist.columns = ['user_id','count']

# loop through order counts by user (3 to 10) and show product sold (count them for each order)
# to see where to cut the data off
for i in range(1,10):
    threshold = i

    pct_of_user_ids_under_threshold = 100 * user_dist['user_id'].loc[user_dist['count'] <= threshold].count() / user_dist['user_id'].count()
    pct_of_user_volume_sold_under_threshold = 100 * user_dist['count'].loc[user_dist['count'] <= threshold].sum() / user_dist['count'].sum()
    pct_of_sales_volume_sold_under_threshold = 100 * user_order_prod_count_dist['count'].loc[user_dist['count'] <= threshold].sum() / user_order_prod_count_dist['count'].sum()

    print(str(round(pct_of_user_ids_under_threshold,2)) + '% of users have ' + str(threshold) + \
            ' or fewer orders, or ' + str(round(pct_of_user_volume_sold_under_threshold,2)) + \
            '% of all orders (' + str(round(pct_of_sales_volume_sold_under_threshold,2)) +'% of all product order volume).')

# choose product sales volume threshold for inclusion in model
threshold = 5

# drop low-sale-count products from all_merged data (likely not enough purchases to make good recommendations, )
user_dist.drop(user_dist['user_id'].loc[user_dist['count'] <= threshold].index, inplace=True)
user_dist


0.0% of users have 1 or fewer orders, or 0.0% of all orders (0.0% of all product order volume).
0.0% of users have 2 or fewer orders, or 0.0% of all orders (0.0% of all product order volume).
4.21% of users have 3 or fewer orders, or 0.78% of all orders (4.13% of all product order volume).
15.1% of users have 4 or fewer orders, or 3.46% of all orders (15.08% of all product order volume).
23.96% of users have 5 or fewer orders, or 6.19% of all orders (23.85% of all product order volume).
31.39% of users have 6 or fewer orders, or 8.94% of all orders (31.34% of all product order volume).
37.79% of users have 7 or fewer orders, or 11.7% of all orders (37.67% of all product order volume).
43.16% of users have 8 or fewer orders, or 14.35% of all orders (42.89% of all product order volume).
47.9% of users have 9 or fewer orders, or 16.98% of all orders (47.69% of all product order volume).


Unnamed: 0,user_id,count
0,170771,100
1,205483,100
2,96192,100
3,137255,100
4,65141,100
...,...,...
156800,169628,6
156801,58070,6
156802,124533,6
156803,100531,6


In [37]:
# REMOVE USERS AND PRODUCTS

# inner join the all_merged_data with the product_dist and user_dist to keep the desired users and products
all_merged_data = pd.merge(all_merged_data, product_dist, how='inner', on='product_id')
all_merged_data = pd.merge(all_merged_data, user_dist, how='inner', on='user_id')

all_merged_data

# save the merged data as a csv (so that it can be imported later without needing to re-merge)
all_merged_data.to_csv(os.path.join(curr_dir,'all_merged_data.csv'), sep=',', encoding='utf-8', index=False)


In [38]:
# create train/validate/test datasets

# randomly select 60%, 20% and 20% of user_ids for train/validate/test datasets
user_ids = all_merged_data[['user_id']].drop_duplicates()

# use a random seed (111) so that the random numbers generated are the same even if this code is re-run
user_ids_train, user_ids_validate, user_ids_test = np.split(user_ids.sample(frac=1, random_state=111), 
                                                            [int(.6*len(user_ids)), int(.8*len(user_ids))])

# delete user_ids
del user_ids

# create the train, test and validation datasets
train = user_ids_train.merge(all_merged_data, how='inner', on='user_id')
test = user_ids_test.merge(all_merged_data, how='inner', on='user_id')
validate = user_ids_validate.merge(all_merged_data, how='inner', on='user_id')

# save the test and validate dataframes to csv (will delete the dataframes later in this code and import for future use - keep memory usage low)
test.to_csv(os.path.join(curr_dir,'model_test.csv'), sep=',', encoding='utf-8', index=False)
validate.to_csv(os.path.join(curr_dir,'model_validate.csv'), sep=',', encoding='utf-8', index=False)

# test and validate datasets are deleted later...

In [39]:
train

Unnamed: 0,user_id,order_id,product_id,add_to_cart_order,reordered,product_name_x,aisle_id,department_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle,department,product_name_y,count_x,count_y
0,121770,37902,11109,26,0,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,prior,1,0,8,,other creams cheeses,dairy eggs,Organic 4% Milk Fat Whole Milk Cottage Cheese,4616,43
1,121770,255629,11109,5,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,prior,3,1,11,8.0,other creams cheeses,dairy eggs,Organic 4% Milk Fat Whole Milk Cottage Cheese,4616,43
2,121770,545631,11109,7,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,prior,43,6,11,13.0,other creams cheeses,dairy eggs,Organic 4% Milk Fat Whole Milk Cottage Cheese,4616,43
3,121770,881845,11109,7,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,prior,33,5,9,28.0,other creams cheeses,dairy eggs,Organic 4% Milk Fat Whole Milk Cottage Cheese,4616,43
4,121770,900923,11109,12,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,prior,42,0,6,14.0,other creams cheeses,dairy eggs,Organic 4% Milk Fat Whole Milk Cottage Cheese,4616,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18155823,130415,2533989,44971,10,0,1st Foods Baby Food- Pears,92,18,prior,8,1,22,18.0,baby food formula,babies,1st Foods Baby Food- Pears,238,13
18155824,130415,1780990,47301,16,1,Very Young Small Sweet Peas,81,15,prior,10,1,15,17.0,canned jarred vegetables,canned goods,Very Young Small Sweet Peas,266,13
18155825,130415,1860568,47301,4,1,Very Young Small Sweet Peas,81,15,prior,5,3,22,5.0,canned jarred vegetables,canned goods,Very Young Small Sweet Peas,266,13
18155826,130415,2533989,47301,1,1,Very Young Small Sweet Peas,81,15,prior,8,1,22,18.0,canned jarred vegetables,canned goods,Very Young Small Sweet Peas,266,13


In [40]:
# There are gaps in the user_ids and product_ids due to the 
# sampling so create a mapping table to assign sequential ids,
# but be able to map back to the original users/products later

train_users_map = train[['user_id']].drop_duplicates().reset_index(drop=True)
train_users_map['model_user_id'] = train_users_map.index

train_products_map = train[['product_id']].drop_duplicates().reset_index(drop=True)
train_products_map['model_product_id'] = train_products_map.index

# update train_users
train = train.merge(train_users_map, how='inner', on='user_id')
train = train.merge(train_products_map, how='inner', on='product_id')
train.drop(['user_id','product_id'], axis=1, inplace=True)

# export the train_users_map and train_products_map to csv (need this later to get the product information)
train_users_map.to_csv(os.path.join(curr_dir,'train_users_map.csv'), sep=',', encoding='utf-8', index=False)
train_products_map.to_csv(os.path.join(curr_dir,'train_products_map.csv'), sep=',', encoding='utf-8', index=False)

# users have multiple orders, so need to collapse the orders 
# and sum the reordered binary indicator field by grouping train
# on user_id and product_id and summing reordered 
# (reordered + 1 is the total number of times a user ordered a product)
train = train.groupby(['model_user_id','model_product_id']).agg(reordered_count=('reordered','sum')).reset_index()

# save the train_user_maps and train_product_maps to csv (will delete the dataframes later in this code and import for future use - keep memory usage low)
train_users_map.to_csv(os.path.join(curr_dir,'model_train_user_map.csv'), sep=',', encoding='utf-8', index=False)
train_products_map.to_csv(os.path.join(curr_dir,'model_train_product_map.csv'), sep=',', encoding='utf-8', index=False)


In [41]:
train

Unnamed: 0,model_user_id,model_product_id,reordered_count
0,0,0,19
1,0,1,7
2,0,2,25
3,0,3,9
4,0,4,10
...,...,...,...
6877928,94078,9277,6
6877929,94078,10043,0
6877930,94078,10523,2
6877931,94078,10850,0


In [42]:
# 2 recommendation engines can be created from the user/product data

# User-user collaborative recommendation
#    A product is recommended to a user based on how similar all their purchases are to other users
#         e.g. user A buys frozen berries, lactose-free milk, and lactose-free yoghurt, and flax seeds
#              user B buys frozen berries, lactose-free milk, and lactose-free yoghurt 
#                     >>>> flax seeds will be recommended
#    this is looking for combinations of products across users and matching similar users to recommend products

# Product-product collaborative recommendation
#    A product is recommended to a user based on a commonality of items they've purchased/in their cart
#         e.g. user A buys bacon, ham and eggs;
#              user B buys bacon, eggs; 
#              user C buys eggs 
#                     >>>> then bacon will be the item recommended
#    this is looking for combinations of products (regardless of other user purchases) 
#    and matching products often bought together


In [43]:
# Build the user-product matrix for both item-item and user-iser recommendations

# get the dimensions (users by products)
n_users = train.model_user_id.unique().shape[0]
n_products = train.model_product_id.unique().shape[0]

# matrix of ordered products
# data_matrix = np.zeros((n_users, n_products), dtype=int)
data_matrix = np.zeros((n_users, n_products), dtype=int)


# iterate through the tuples of the dataframe
for line in train.itertuples():
    # line[0] is the index, line[1] is the model_user_ids, line[2] are the model_product_ids, line[3] is the reordered_count (add one for total orders)
    data_matrix[line[1]-1, line[2]-1] = line[3]+1
    

In [130]:
# save the numpy array as a binary file (can be loaded into python directly)
np.save(os.path.join(curr_dir, 'train_user_product_matrix.npy'), data_matrix=data_matrix)


(94079, 14642)

In [26]:
# DROP ALL THE PAST DATAFRAMES THAT AREN'T NEEDED
# NEED TO DO THIS TO LOWER MEMORY USAGE AS MUCH AS POSSIBLE
try:
    del aisles
except:
    print('dataframe *aisles* does not exist.')

try:
    del departments
except:
    print('dataframe *departments* does not exist.')

try:
    del order_products_prior
except:
    print('dataframe *order_products_prior* does not exist.')

try:
    del order_products_train
except:
    print('dataframe *order_products_train* does not exist.')

try:
    del orders
except:
    print('dataframe *orders* does not exist.')

try:
    del products
except:
    print('dataframe *products* does not exist.')

try:
    del aisles_dups
except:
    print('dataframe *aisles_dups* does not exist.')

try:
    del departments_dups
except:
    print('dataframe *departments_dups* does not exist.')

try:
    del order_products_prior_dups
except:
    print('dataframe *order_products_prior_dups* does not exist.')

try:
    del order_products_train_dups
except:
    print('dataframe *order_products_train_dups* does not exist.')

try:
    del orders_dups
except:
    print('dataframe *orders_dups* does not exist.')

try:
    del products_dups
except:
    print('dataframe *products_dups* does not exist.')

try:
    del order_products_dups
except:
    print('dataframe *order_products_dups* does not exist.')

try:
    del product_dist
except:
    print('dataframe *product_dist* does not exist.')

try:
    del user_dist
except:
    print('dataframe *user_dist* does not exist.')

try:
    del user_order_prod_count_dist
except:
    print('dataframe *user_order_prod_count_dist* does not exist.')

try:
    del all_merged_data
except:
    print('dataframe *all_merged_data* does not exist.')

try:
    del user_order_prod_count_dist
except:
    print('dataframe *user_order_prod_count_dist* does not exist.')



dataframe *aisles* does not exist.
dataframe *departments* does not exist.
dataframe *order_products_prior* does not exist.
dataframe *order_products_train* does not exist.
dataframe *orders* does not exist.
dataframe *products* does not exist.
dataframe *aisles_dups* does not exist.
dataframe *departments_dups* does not exist.
dataframe *order_products_prior_dups* does not exist.
dataframe *order_products_train_dups* does not exist.
dataframe *orders_dups* does not exist.
dataframe *products_dups* does not exist.
dataframe *order_products_dups* does not exist.
dataframe *product_dist* does not exist.
dataframe *user_dist* does not exist.
dataframe *user_order_prod_count_dist* does not exist.
dataframe *all_merged_data* does not exist.
dataframe *user_order_prod_count_dist* does not exist.


**************** ONCE YOU HAVE THE MATRIX SAVED, RUN CODE FROM HERE ****************************************************************************************

In [None]:
# # TO LOAD FROM THIS POINT
import numpy as np
from sklearn.metrics import pairwise_distances

curr_dir = os.getcwd()
# # curr_dir = os.path.dirname(os.path.realpath(__file__))

# load the binary numpy array back into python 
with np.load(os.path.join(curr_dir, 'train_user_product_matrix.npy')) as data:
    data_matrix = data['data_matrix']


In [132]:
# Create the user-user distances between products (to match users and recommend the best product)
user_similarity = pairwise_distances(data_matrix, metric='cosine')


MemoryError: Unable to allocate 65.9 GiB for an array with shape (94079, 94079) and data type float64

In [133]:
# from sklearn.metrics.pairwise import pairwise_distances
product_similarity = pairwise_distances(data_matrix.T, metric='cosine')


MemoryError: Unable to allocate 10.3 GiB for an array with shape (14642, 94079) and data type float64

In [None]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred


In [None]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')