In [1]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

root = 'C:/Users/cabre/PycharmProjects/Market-Basket-Analysis/Data/'

In [2]:
orders = pd.read_csv(root + 'orders.csv')
order_products_prior = pd.read_csv(root + 'order_products__prior.csv')
order_products_train = pd.read_csv(root + 'order_products__train.csv')
products = pd.read_csv(root + 'products.csv')

In [3]:
order_products = pd.concat([order_products_prior, order_products_train])
order_products.shape

(33819106, 4)

In [4]:
order_products.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [5]:
order_products.product_id.nunique()

49685

Out of 49685 keeping top 100 most frequent products.

In [6]:
product_counts = order_products.groupby('product_id')['order_id'].count().reset_index().rename(columns = {'order_id':'frequency'})
product_counts = product_counts.sort_values('frequency', ascending=False)[0:100].reset_index(drop = True)
product_counts = product_counts.merge(products, on = 'product_id', how = 'left')
product_counts.head(10)

Unnamed: 0,product_id,frequency,product_name,aisle_id,department_id
0,24852,491291,Banana,24,4
1,13176,394930,Bag of Organic Bananas,24,4
2,21137,275577,Organic Strawberries,24,4
3,21903,251705,Organic Baby Spinach,123,4
4,47209,220877,Organic Hass Avocado,24,4
5,47766,184224,Organic Avocado,24,4
6,47626,160792,Large Lemon,24,4
7,16797,149445,Strawberries,24,4
8,26209,146660,Limes,24,4
9,27845,142813,Organic Whole Milk,84,16


Keeping 100 most frequent items in order_products dataframe

In [7]:
freq_products = list(product_counts.product_id)
freq_products[1:10]

[13176, 21137, 21903, 47209, 47766, 47626, 16797, 26209, 27845]

In [8]:
order_products = order_products[order_products.product_id.isin(freq_products)]
order_products.shape

(7795471, 4)

In [9]:
order_products.order_id.nunique()

2444982

In [10]:
order_products = order_products.merge(products, on = 'product_id', how='left')
order_products.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2,28985,2,1,Michigan Organic Kale,83,4
1,2,17794,6,1,Carrots,83,4
2,3,24838,2,1,Unsweetened Almondmilk,91,16
3,3,21903,4,1,Organic Baby Spinach,123,4
4,3,46667,6,1,Organic Ginger Root,83,4


In [11]:
transactions_list = order_products.groupby('order_id')['product_name'].apply(list).tolist()
transactions_list

[['Cucumber Kirby',
  'Bag of Organic Bananas',
  'Organic Hass Avocado',
  'Organic Whole String Cheese'],
 ['Michigan Organic Kale', 'Carrots'],
 ['Unsweetened Almondmilk', 'Organic Baby Spinach', 'Organic Ginger Root'],
 ['Bag of Organic Bananas',
  'Organic Raspberries',
  '2% Reduced Fat Milk',
  'Organic Hass Avocado'],
 ['Extra Virgin Olive Oil'],
 ['Banana',
  'Organic Cilantro',
  'Organic Avocado',
  'Yellow Onions',
  'Organic Strawberries',
  'Organic Black Beans',
  'Organic Half & Half'],
 ['Extra Virgin Olive Oil'],
 ['Soda', 'Sparkling Natural Mineral Water'],
 ['Organic Whole Milk', 'Organic Broccoli Florets', 'Honeycrisp Apple'],
 ['Spring Water'],
 ['Red Vine Tomato',
  'Sparkling Water Grapefruit',
  'Organic Raspberries',
  'Asparagus',
  'Organic Avocado'],
 ['Organic Baby Carrots', 'Unsweetened Almondmilk'],
 ['Banana'],
 ['2% Reduced Fat Milk', 'Banana', 'Asparagus'],
 ['Organic Avocado', 'Organic Red Onion'],
 ['Organic Unsweetened Almond Milk',
  'Banana',
  '

In [12]:
transaction_encoder = TransactionEncoder()
transaction_array = transaction_encoder.fit(transactions_list).transform(transactions_list)
transaction_df = pd.DataFrame(transaction_array, columns=transaction_encoder.columns_)
transaction_df

Unnamed: 0,100% Raw Coconut Water,100% Whole Wheat Bread,2% Reduced Fat Milk,Apple Honeycrisp Organic,Asparagus,Bag of Organic Bananas,Banana,Bartlett Pears,Blueberries,Boneless Skinless Chicken Breasts,...,Sparkling Natural Mineral Water,Sparkling Water Grapefruit,Spring Water,Strawberries,Uncured Genoa Salami,Unsalted Butter,Unsweetened Almondmilk,Unsweetened Original Almond Breeze Almond Milk,Whole Milk,Yellow Onions
0,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2444977,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2444978,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2444979,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2444980,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


Creating frequent sets and rules

In [14]:
frequent_items = apriori(transaction_df, min_support=0.01, use_colnames=True, low_memory=True)
frequent_items.head()

Unnamed: 0,support,itemsets
0,0.016062,(100% Raw Coconut Water)
1,0.025814,(100% Whole Wheat Bread)
2,0.0158,(2% Reduced Fat Milk)
3,0.035694,(Apple Honeycrisp Organic)
4,0.029101,(Asparagus)


In [15]:
rules = association_rules(frequent_items, metric="lift", min_threshold=1)
rules.sort_values('lift', ascending=True)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
27,(Organic Strawberries),(Banana),0.112711,0.200938,0.023857,0.211665,1.053382,1.0,0.001209,1.013607,0.057114,0.082324,0.013424,0.165196
26,(Banana),(Organic Strawberries),0.200938,0.112711,0.023857,0.118728,1.053382,1.0,0.001209,1.006827,0.06342,0.082324,0.006781,0.165196
23,(Organic Baby Spinach),(Banana),0.102948,0.200938,0.021839,0.212133,1.055712,1.0,0.001152,1.014209,0.058829,0.077429,0.01401,0.160408
22,(Banana),(Organic Baby Spinach),0.200938,0.102948,0.021839,0.108683,1.055712,1.0,0.001152,1.006435,0.066043,0.077429,0.006394,0.160408
19,(Limes),(Banana),0.059984,0.200938,0.013539,0.225713,1.123292,1.0,0.001486,1.031996,0.116763,0.054729,0.031004,0.146546
18,(Banana),(Limes),0.200938,0.059984,0.013539,0.06738,1.123292,1.0,0.001486,1.00793,0.13736,0.054729,0.007867,0.146546
29,(Organic Whole Milk),(Banana),0.058411,0.200938,0.013368,0.228866,1.138984,1.0,0.001631,1.036216,0.129594,0.054346,0.03495,0.147697
28,(Banana),(Organic Whole Milk),0.200938,0.058411,0.013368,0.066529,1.138984,1.0,0.001631,1.008697,0.15271,0.054346,0.008622,0.147697
8,(Bag of Organic Bananas),(Organic Whole Milk),0.161527,0.058411,0.011288,0.069883,1.196413,1.0,0.001853,1.012335,0.195794,0.0541,0.012184,0.131568
9,(Organic Whole Milk),(Bag of Organic Bananas),0.058411,0.161527,0.011288,0.193253,1.196413,1.0,0.001853,1.039326,0.174352,0.0541,0.037838,0.131568
