# Market Basket Analysis - Apriori Algorithm

In [1]:
# Imports

import numpy as np
import scipy
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from apyori import apriori
from collections import Counter
from datetime import datetime
from itertools import combinations


from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [13]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
# Read in  data

orders = pd.read_csv('./data/orders.csv') 
orders_train = pd.read_csv('./data/order_products__train.csv')
orders_prior = pd.read_csv('./data/order_products__prior.csv')
products = pd.read_csv('./data/products.csv')
aisles = pd.read_csv('./data/aisles.csv')
depts = pd.read_csv('./data/departments.csv')

### Advantages of Market Basket Analysis
There are many advantages to implementing Market Basket Analysis in marketing. Market basket Analysis(MBA) can be applied to data of customers from the point of sale (PoS) systems.

It helps retailers with:

- Increases customer engagement
- Boosting sales and increasing RoI
- Improving customer experience
- Optimize marketing strategies and campaigns
- Help to understand customers better
- Identifies customer behavior and pattern

### Step 1: Data Integration

In [4]:
# Combining dataframes 

data = orders_prior.merge(products, on = 'product_id')

In [5]:
data .info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 7 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   order_id           int64 
 1   product_id         int64 
 2   add_to_cart_order  int64 
 3   reordered          int64 
 4   product_name       object
 5   aisle_id           int64 
 6   department_id      int64 
dtypes: int64(6), object(1)
memory usage: 1.9+ GB


In [6]:
# Checking for nulls 

data.isna().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
product_name         0
aisle_id             0
department_id        0
dtype: int64

### Step 2: Basket Creation

In [7]:
# Create a frequency table of product_ids

product_counts = data.groupby('product_id')['order_id'].count().reset_index().rename(columns = {'order_id'\
                                                                                                    :'frequency'})

product_counts = product_counts.sort_values('frequency', ascending=False)[0:100].reset_index(drop=True)

product_counts.head(10)

Unnamed: 0,product_id,frequency
0,24852,472565
1,13176,379450
2,21137,264683
3,21903,241921
4,47209,213584
5,47766,176815
6,47626,152657
7,16797,142951
8,26209,140627
9,27845,137905


In [8]:
freq_products = list(product_counts.product_id)
freq_products[1:10]

[13176, 21137, 21903, 47209, 47766, 47626, 16797, 26209, 27845]

In [9]:
order_products = data[data.product_id.isin(freq_products)]
order_products.shape

(7483881, 7)

In [10]:
basket = order_products.pivot_table(columns='product_name', values='reordered', \
                                    index='order_id').reset_index().fillna(0).set_index('order_id')

In [11]:
basket

product_name,100% Raw Coconut Water,100% Whole Wheat Bread,2% Reduced Fat Milk,Apple Honeycrisp Organic,Asparagus,Bag of Organic Bananas,Banana,Bartlett Pears,Blueberries,Boneless Skinless Chicken Breasts,...,Sparkling Natural Mineral Water,Sparkling Water Grapefruit,Spring Water,Strawberries,Uncured Genoa Salami,Unsalted Butter,Unsweetened Almondmilk,Unsweetened Original Almond Breeze Almond Milk,Whole Milk,Yellow Onions
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3421078,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3421080,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3421081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3421082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Create encoder 

def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1
    
basket = basket.applymap(encode_units)
basket.head()

product_name,100% Raw Coconut Water,100% Whole Wheat Bread,2% Reduced Fat Milk,Apple Honeycrisp Organic,Asparagus,Bag of Organic Bananas,Banana,Bartlett Pears,Blueberries,Boneless Skinless Chicken Breasts,...,Sparkling Natural Mineral Water,Sparkling Water Grapefruit,Spring Water,Strawberries,Uncured Genoa Salami,Unsalted Butter,Unsweetened Almondmilk,Unsweetened Original Almond Breeze Almond Milk,Whole Milk,Yellow Onions
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### Step 3: Apply Apriori Algorithm

In [14]:
# Only looking at first 100K due to memory constraints

basket_subset = basket[:100000]

In [15]:
# Frequency with support 

frequent_items = apriori(basket_subset, min_support=0.01, use_colnames=True)

frequent_items.head()

Unnamed: 0,support,itemsets
0,0.0127,(100% Raw Coconut Water)
1,0.01917,(100% Whole Wheat Bread)
2,0.01235,(2% Reduced Fat Milk)
3,0.02678,(Apple Honeycrisp Organic)
4,0.01738,(Asparagus)


In [16]:
# Getting Association rules 

rules = association_rules(frequent_items, metric='lift', min_threshold=1)

rules.sort_values('lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
29,(Organic Raspberries),(Organic Strawberries),0.04394,0.0881,0.01032,0.234866,2.665899,0.006449,1.191817
28,(Organic Strawberries),(Organic Raspberries),0.0881,0.04394,0.01032,0.11714,2.665899,0.006449,1.082912
14,(Banana),(Organic Fuji Apple),0.17149,0.0271,0.01078,0.062861,2.319587,0.006133,1.03816
15,(Organic Fuji Apple),(Banana),0.0271,0.17149,0.01078,0.397786,2.319587,0.006133,1.375773
4,(Organic Raspberries),(Bag of Organic Bananas),0.04394,0.13489,0.01322,0.300865,2.230446,0.007293,1.2374
5,(Bag of Organic Bananas),(Organic Raspberries),0.13489,0.04394,0.01322,0.098006,2.230446,0.007293,1.05994
2,(Organic Hass Avocado),(Bag of Organic Bananas),0.07202,0.13489,0.0211,0.292974,2.171949,0.011385,1.22359
3,(Bag of Organic Bananas),(Organic Hass Avocado),0.13489,0.07202,0.0211,0.156424,2.171949,0.011385,1.100055
26,(Organic Strawberries),(Organic Hass Avocado),0.0881,0.07202,0.01282,0.145516,2.020501,0.006475,1.086013
27,(Organic Hass Avocado),(Organic Strawberries),0.07202,0.0881,0.01282,0.178006,2.020501,0.006475,1.109376
