In [2]:
# import libraries required for this notebook
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import mlxtend as mlx

import seaborn as sns
sns.set(style='whitegrid')

import random
import datetime


from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import apriori

import implicit
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
import scipy.sparse as sparse

In [4]:
# Compact column dtypes for each CSV, so the large tables use less memory once loaded
order_dtypes = dict(
    order_id=np.int32,
    user_id=np.int64,
    eval_set='category',
    order_number=np.int16,
    order_dow=np.int8,
    order_hour_of_day=np.int8,
    # float rather than int: the value is missing for a user's first order
    days_since_prior_order=np.float32,
)

product_dtypes = dict(
    product_id=np.uint16,
    aisle_id=np.int16,
    department_id=np.int16,
)

order_details_dtypes = dict(
    order_id=np.int32,
    product_id=np.uint16,
    add_to_cart_order=np.int32,
    reordered=np.int8,
)

In [5]:
# Load the three large tables, applying the compact dtypes declared for each
orders = pd.read_csv(
    'orders.csv',
    dtype=order_dtypes,
)
products = pd.read_csv(
    'products.csv',
    dtype=product_dtypes,
)
order_details_prior = pd.read_csv(
    'order_products__prior.csv',
    dtype=order_details_dtypes,
)

In [6]:
# Load the lookup tables and the train-split order lines.
# The key columns reuse the compact dtypes from `product_dtypes`, so the merge keys
# have the same dtype on both sides instead of the default 64-bit integers.
aisles = pd.read_csv('aisles.csv', dtype={'aisle_id': np.int16})
departments = pd.read_csv('departments.csv', dtype={'department_id': np.int16})

# NOTE(review): assumes order_products__train.csv has the same column layout as
# order_products__prior.csv, so the same dtype mapping applies - confirm.
order_details_train = pd.read_csv('order_products__train.csv', dtype=order_details_dtypes)

In [7]:
# Build one denormalised table: every prior order line joined with its order,
# product, aisle and department attributes (inner joins on the shared key columns)
order_details_all = (
    orders
    .merge(order_details_prior, on='order_id')
    .merge(products, on='product_id')
    .merge(aisles, on='aisle_id')
    .merge(departments, on='department_id')
)

In [20]:
# peek at the first 30 rows of the merged table
order_details_all.iloc[:30]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,2539329,1,prior,1,2,8,,196,1,0,Soda,77,7,soft drinks,beverages
1,2398795,1,prior,2,3,7,15.0,196,1,1,Soda,77,7,soft drinks,beverages
2,473747,1,prior,3,3,12,21.0,196,1,1,Soda,77,7,soft drinks,beverages
3,2254736,1,prior,4,4,7,29.0,196,1,1,Soda,77,7,soft drinks,beverages
4,431534,1,prior,5,4,15,28.0,196,1,1,Soda,77,7,soft drinks,beverages
5,3367565,1,prior,6,2,7,19.0,196,1,1,Soda,77,7,soft drinks,beverages
6,550135,1,prior,7,1,9,20.0,196,1,1,Soda,77,7,soft drinks,beverages
7,3108588,1,prior,8,1,14,14.0,196,2,1,Soda,77,7,soft drinks,beverages
8,2295261,1,prior,9,1,16,0.0,196,4,1,Soda,77,7,soft drinks,beverages
9,2550362,1,prior,10,4,8,30.0,196,1,1,Soda,77,7,soft drinks,beverages


In [9]:
# Persist the merged table to disk; the DataFrame index is written as the first,
# unnamed column. This write can be slow on the full dataset - the recorded run
# of this cell was interrupted before it finished.
order_details_all.to_csv(path_or_buf='order_details.csv', encoding='utf-8')

KeyboardInterrupt: 

### 1. Global Recommendations

#### What top two products were purchased with item A?

These recommendations will be derived from a product contingency matrix. A product contingency matrix identifies how many times a product pairing was purchased. For example, how many times were bananas purchased with avocados?

The problem with this method is that these recommendations are not customer specific (hence the global identifier). Recommendations are basically a popularity contest whereby the most popular product pairing wins. It also does not determine whether purchasing item A results in a higher likelihood that item B is also purchased.

#### Product Contingency Matrix

A product contingency matrix requires us to have a list of all product pairings for every order. This product pairing will then be summarized using the pandas crosstab function to create the product contingency matrix.

We have over 3,000,000 order transactions within this dataset - this is a great wealth of data, but is too large for us to process a crosstab. We will reduce the number of users included in the dataset which will decrease the number of transactions overall.

In [10]:
# Number of users kept in the working subset. The product contingency matrix is
# built from every product pair in every order, so keep this small.
N_USERS = 1500

# the first N_USERS distinct user ids, in order of first appearance in `orders`
user_subset = orders['user_id'].unique()[:N_USERS]

In [11]:
# keep only the order lines that belong to the selected users
in_subset = order_details_all['user_id'].isin(user_subset)
orders_subset = order_details_all.loc[in_subset]

In [12]:
# number of order-line rows left after restricting to the user subset
orders_subset.shape[0]

219257

The dataset was reduced down to 1,500 customers, accounting for 219,257 order line items (one row per product purchased in an order). The following contingency matrix will be built from this data.

In [13]:
# contingency matrix for products 
from itertools import combinations, permutations, product

def get_contingency_matrix(order_details):
    '''
        Takes an Instacart order dataset and returns a product contingency matrix

        Every order contributes one count to each (a, b) pair of distinct product
        names it contains, including the pair of a product with itself. The matrix
        is therefore symmetric, and its diagonal holds the number of orders that
        contain each product.

        Parameters
        ----------
            order_details: DataFrame of order lines with at least the columns
                `order_id` and `product_name`

        Returns
        -------
            product contingency matrix (rows `a`, columns `b`), with rows and columns
            both ordered by total pair count in descending order
    '''

    # Collect the product pairs of every order in one flat list. Building a single
    # DataFrame at the end avoids re-copying an ever-growing frame once per order,
    # which is what concatenating inside the loop would do.
    product_pairs = []

    # loop through order history by order ID
    for _, order_products in order_details.groupby('order_id'):

        # each product counts once per order, however many lines mention it
        p_names = list(order_products.product_name.unique())

        # cartesian product of the basket with itself: (a, b), (b, a) and (a, a)
        product_pairs.extend(product(p_names, p_names))

    # one row per product pair, across all orders
    item_list = pd.DataFrame(product_pairs, columns=['a', 'b'])

    # count how often each (a, b) pair occurs
    matrix = pd.crosstab(item_list.a, item_list.b)

    # order product names by their column totals (in descending order)
    sorted_names = matrix.sum().sort_values(ascending=False).index.tolist()

    # apply the same ordering to rows and columns
    return matrix.loc[sorted_names, sorted_names]

In [15]:
# build the product contingency matrix from the reduced set of order lines
cont_matrix = get_contingency_matrix(order_details=orders_subset)


In [17]:
# show the 15 x 15 top-left corner of the sorted matrix
cont_matrix.iloc[:15].iloc[:, :15]

b,Banana,Bag of Organic Bananas,Organic Strawberries,Organic Baby Spinach,Organic Hass Avocado,Organic Avocado,Limes,Organic Raspberries,Large Lemon,Organic Garlic,Strawberries,Organic Zucchini,Organic Yellow Onion,Organic Whole Milk,Cucumber Kirby
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Banana,2910,7,310,293,149,340,192,125,245,99,219,116,83,177,206
Bag of Organic Bananas,7,2661,480,327,466,173,176,344,146,148,127,198,177,239,112
Organic Strawberries,310,480,1870,228,297,152,148,299,84,145,7,109,104,151,77
Organic Baby Spinach,293,327,228,1564,221,179,137,148,186,144,96,148,118,129,158
Organic Hass Avocado,149,466,297,221,1458,7,151,191,78,165,46,118,143,89,48
Organic Avocado,340,173,152,179,7,1181,179,78,161,91,74,81,78,123,88
Limes,192,176,148,137,151,179,1040,84,177,109,57,92,94,64,84
Organic Raspberries,125,344,299,148,191,78,84,989,71,44,65,96,55,112,56
Large Lemon,245,146,84,186,78,161,177,71,1039,85,90,84,55,49,172
Organic Garlic,99,148,145,144,165,91,109,44,85,786,27,69,156,63,56


The contingency matrix is comprised of product names spanning both rows & columns. The inner cells represent the purchase quantity of the product pairing.

For example, Bananas & Limes were purchased together in 192 orders within our dataset. We can now build recommendations using this matrix.

In [18]:
def get_global_recommendations(product, matrix):
    '''
        Given a product contingency matrix (matrix), find the top 2 products to be recommended
        for purchase given a selected item (product)

        parameters
        ----------
            product: The product name for which recommendations should be generated;
                must be one of the matrix's column labels

            matrix: product contingency matrix

        returns
        -------
            pandas Series indexed by the 2 recommended product names, holding their
            pair counts with the selected product, highest count first

        raises
        ------
            KeyError: if `product` is not a column of `matrix`
    '''

    # pair counts of every product with the requested one; this must read the
    # `product` argument, not a variable that happens to exist in the notebook
    product_series = matrix[product]

    # remove the product's own entry (the diagonal) so it cannot recommend itself
    product_series = product_series[product_series.index != product]

    # get top 2 products most frequently purchased
    recommendations = product_series.sort_values(ascending=False)[:2]

    # return recommendations
    return recommendations

Pass the top 5 products to our recommender and print the top 2 recommendations for each product.

In [19]:
print('Recommendations based on global products contingency matrix', end='\n\n')

# walk through the first 5 columns of the matrix
for p in cont_matrix.columns[:5]:

    # header line naming the product we are recommending for
    heading = 'Top 2 recommended items to buy with {} are:'.format(p)
    print(heading)

    # look up the two products most often bought alongside it
    recommendations = get_global_recommendations(p, cont_matrix)

    # show just the product names, followed by a blank separator line
    print(recommendations.index.tolist(), end='\n\n')

Recommendations based on global products contingency matrix

Top 2 recommended items to buy with Banana are:
['Organic Avocado', 'Organic Strawberries']

Top 2 recommended items to buy with Bag of Organic Bananas are:
['Organic Strawberries', 'Organic Hass Avocado']

Top 2 recommended items to buy with Organic Strawberries are:
['Bag of Organic Bananas', 'Banana']

Top 2 recommended items to buy with Organic Baby Spinach are:
['Bag of Organic Bananas', 'Banana']

Top 2 recommended items to buy with Organic Hass Avocado are:
['Bag of Organic Bananas', 'Organic Strawberries']



In this section we demonstrated how to build a simple recommender using a product contingency matrix. Recommendations are provided based on purchase frequency - so in this case, we are recommending the most popular items paired with each product.

In the next sections we will build on this to include further intelligence.

### 2. Association Rules

Association rules provide us with a simple equation, where the right hand side of the equation (consequents) represents products a customer is likely to purchase as a result of having the items on the left hand side (antecedents) already in their basket.

Grocery stores utilize association rules to know which products they should place together in order to foster additional purchases. We can also use association rules to recommend products based on what products have currently been added to a shopping cart. Our next recommender will return a set of products based on association rules that we create.

Association rules are created from itemsets (which are simply products that are frequently purchased together). We will create itemsets using the apriori algorithm. The frequent itemsets are then fed into the association_rules algorithm to generate our final association rules to be used by our recommender.

### 4. Conclusion


In this document we explored 2 different approaches for building product recommendations.

Global recommendations give us insight into the most popular product pairings. They are not useful for providing similar product recommendations or personalized product recommendations.

Association rules provide us with insight into how often product itemsets are purchased together, as well as the influence that an itemset has on purchasing related itemsets. Association rules are extremely expensive to create and still suffer from being unable to produce personalized product recommendations.


Why are recommenders such as the one we created important to companies like Instacart? It allows users to discover more products that they could truly be interested in, but would never find on their own. This increases sales and improves the customer experience. As customers begin to trust the recommendations, they are more likely to discover additional items and increase basket size.