In [1]:
# import libraries
import pandas as pd
import numpy as np
import random 
from apyori import apriori

In [2]:
orders = pd.read_csv('orders.csv')

In [3]:
orders.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [4]:
products = pd.read_csv('products.csv')

In [5]:
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [6]:
orders.shape

(32434489, 4)

In [7]:
products.shape

(49688, 4)

In [8]:
# merge both datasets into one
product_orders = pd.merge(products, orders, on='product_id').sort_values(by='order_id')
product_orders.shape

(32434489, 7)

In [9]:
product_orders.tail()

Unnamed: 0,product_id,product_name,aisle_id,department_id,order_id,add_to_cart_order,reordered
12835237,21162,Organic Mixed Berry Yogurt & Fruit Snack,92,18,3421083,3,0
15933432,24852,Banana,24,4,3421083,9,1
10779617,18176,Organic Strawberry Yogurt & Fruit Snack,92,18,3421083,4,1
25491843,39678,Free & Clear Natural Dishwasher Detergent,74,17,3421083,6,1
29338254,45309,Purple Carrot & blueberry Puffs,92,18,3421083,2,0


In [10]:
# Reservoir sampling (used a set for faster runtime)
product_order_ids = set(product_orders['order_id'])
order_product_names = []

while len(order_product_names) < 10000:
    random_order_id = random.choice(list(product_order_ids))
    if random_order_id in product_order_ids:
        order = product_orders.loc[product_orders['order_id'] == random_order_id, 'product_name'].tolist()
        order_product_names.append(order)
        product_order_ids.remove(random_order_id)

In [11]:
print(order_product_names)

[['Organic Creamy Chocolate Fudge Nutritional Shake', 'Organic Sweet Vanilla Bean Nutrition Complete Protein Shake', 'Frozen Greek Yogurt Bars Blueberry'], ['Lowfat Cottage Cheese', 'Bananas', 'Wakame, Pacific, Silver Grade, Ready-to-Use', 'Qualita Rossa Ground Coffee', 'Fat Free Milk', 'Penne Rigate #41 Pasta', 'Greek Yogurt Style Nonfat Yogurt', 'Cheese Alternative, American Style, Slices', 'Cage Free Large White Eggs', 'Parsley, Italian (Flat), New England Grown', 'Kelp, Wild Atlantic Kombu', 'Organic Tomato Sauce', 'Sauerkraut Salad, Arame (Sea Vegetable) & Ginger', 'Spaghetti  No 12', 'Premium Green Tea Bags', 'Organic Watercress', 'Dandruff Relief Shampoo'], ['Chunk Light Tuna In Water', 'Intense & Smoky French Roast Dark Roast Ground Coffee', 'Shredded Parmesan Cheese', 'Pepper Gournay Cheese', 'Traditional Chicken & Wild Rice Soup', 'Bathroom Tissue Softness & Strength, Double Rolls', 'Romaine Hearts', 'Roma Tomato', 'Garlic & Fine Herbs Gournay Cheese', 'Multi Grain Crispbread

In [12]:
# apply Apriori algorithm and convert to a list
association_rules = apriori(order_product_names, min_support=0.001, min_confidence=0.2, length=2)
association_results = list(association_rules)
print(association_results)

[RelationRecord(items=frozenset({'Banana', '1% Low Fat Milk'}), support=0.001, ordered_statistics=[OrderedStatistic(items_base=frozenset({'1% Low Fat Milk'}), items_add=frozenset({'Banana'}), confidence=0.3448275862068966, lift=2.2611644997173546)]), RelationRecord(items=frozenset({'1% Lowfat Milk', 'Banana'}), support=0.002, ordered_statistics=[OrderedStatistic(items_base=frozenset({'1% Lowfat Milk'}), items_add=frozenset({'Banana'}), confidence=0.37735849056603776, lift=2.4744819053510674)]), RelationRecord(items=frozenset({'100% Raw Coconut Water', 'Bag of Organic Bananas'}), support=0.003, ordered_statistics=[OrderedStatistic(items_base=frozenset({'100% Raw Coconut Water'}), items_add=frozenset({'Bag of Organic Bananas'}), confidence=0.24390243902439024, lift=2.14325517596125)]), RelationRecord(items=frozenset({'Banana', '100% Whole Wheat Bread'}), support=0.0043, ordered_statistics=[OrderedStatistic(items_base=frozenset({'100% Whole Wheat Bread'}), items_add=frozenset({'Banana'}),

In [13]:
# extract the association rule + the corresponding confidence and append to rules list
rules_list = []

for item in association_results:
    rules = item[0]
    pairs = [x for x in rules]
    confidence = item[2][0][2]
    rules_list.append([pairs[0] + ' ' + '->' + ' ' + pairs[1],confidence])

In [36]:
# convert rules list to a dataframe sorted by confidence in descending order and list first 50 rows
rules_df = pd.DataFrame(rules_list,columns=['Association Rule', 'Confidence']).sort_values(by='Confidence', ascending=False)
rules_df.head(50)

Unnamed: 0,Association Rule,Confidence
898,Sparkling Water Grapefruit -> Sparkling Lemon ...,0.833333
453,Oh My Yog! Organic Wild Quebec Blueberry Cream...,0.777778
899,Sparkling Water Grapefruit -> Sparkling Water ...,0.625
403,Strawberry on the Bottom Nonfat Greek Yogurt -...,0.592593
401,Blueberry Whole Milk Yogurt Pouch -> Organic W...,0.583333
322,Banana -> Raspberry on the Bottom Nonfat Greek...,0.578947
719,Large Lemon -> Michigan Organic Kale,0.571429
811,Sparkling Water Grapefruit -> Sparkling Lemon ...,0.565217
866,Organic Garlic -> Organic Carrot Bunch,0.555556
647,Bag of Organic Bananas -> Organic Cucumber,0.555556
