In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

In [2]:
# Load the product table from CSV and show it
products = pd.read_csv(
    filepath_or_buffer='/Users/kookhan/Downloads/archive/products.csv',
)
products

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13
...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5
49684,49685,En Croute Roast Hazelnut Cranberry,42,1
49685,49686,Artisan Baguette,112,3
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8


In [3]:
# Load the order/product rows from CSV and show them
orders = pd.read_csv(
    filepath_or_buffer='/Users/kookhan/Downloads/archive/order_products__prior.csv',
)
orders

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0
...,...,...,...,...
32434484,3421083,39678,6,1
32434485,3421083,11352,7,0
32434486,3421083,4600,8,0
32434487,3421083,24852,9,1


In [4]:
# Left-join the product table onto the order rows by product_id,
# then keep just the order_id / product_id pair.

order_products = pd.merge(orders, products, how='left', on='product_id')
data = order_products.loc[:, ['order_id', 'product_id']]
data

Unnamed: 0,order_id,product_id
0,2,33120
1,2,28985
2,2,9327
3,2,45918
4,2,30035
...,...,...
32434484,3421083,39678
32434485,3421083,11352
32434486,3421083,4600
32434487,3421083,24852


In [5]:
# Sampling 10000 orders and listing product ids for each order

# Draw the sample exactly as before (same seed) so the same order ids are picked.
order_id = pd.DataFrame(orders['order_id'].unique())
order_id = order_id.sample(n=10000, random_state=5)[0]

# Filter the order rows once and group them, rather than building a boolean
# mask over the whole table for every sampled order.
sampled_rows = data[data['order_id'].isin(order_id)]
products_by_order = (
    sampled_rows
    .groupby('order_id', sort=False)['product_id']
    .apply(list)  # rows keep their original relative order within each group
)

# Re-align to the sampled sequence so order_list[i] belongs to the i-th sampled
# order id; an id with no rows yields an empty list, as the per-order filter did.
order_list = [
    items if isinstance(items, list) else []
    for items in products_by_order.reindex(order_id.to_numpy())
]


In [6]:
# Number of per-order product lists collected
len(order_list)

10000

In [7]:
# Turn the per-order product lists into a boolean table
# (columns taken from the encoder's learned column labels) for Apriori.

te = TransactionEncoder()
te.fit(order_list)
te_ary = te.transform(order_list)
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df

Unnamed: 0,1,2,3,4,9,10,11,23,25,26,...,49667,49668,49673,49674,49677,49680,49682,49683,49686,49687
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9996,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9997,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9998,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# Keep the itemsets whose support is at least 0.005

frequent_itemsets = apriori(
    df,
    min_support=0.005,
    use_colnames=True,
)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.0066,(45)
1,0.0102,(196)
2,0.0063,(260)
3,0.0095,(432)
4,0.0070,(890)
...,...,...
359,0.0052,"(47209, 39275)"
360,0.0052,"(47209, 40706)"
361,0.0056,"(47209, 45007)"
362,0.0051,"(47209, 47626)"


In [9]:
# Build association rules from the frequent itemsets,
# keeping rules whose confidence is at least 0.001

from mlxtend.frequent_patterns import association_rules

association_data = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.001,
)
association_data

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(24852),(4605),0.1452,0.0213,0.0075,0.051653,2.425018,0.004407,1.032006
1,(4605),(24852),0.0213,0.1452,0.0075,0.352113,2.425018,0.004407,1.319365
2,(4920),(24852),0.0255,0.1452,0.0069,0.270588,1.863555,0.003197,1.171903
3,(24852),(4920),0.1452,0.0255,0.0069,0.047521,1.863555,0.003197,1.023119
4,(13176),(5077),0.1227,0.0218,0.0053,0.043195,1.981412,0.002625,1.022361
...,...,...,...,...,...,...,...,...,...
189,(45007),(47209),0.0333,0.0680,0.0056,0.168168,2.473061,0.003336,1.120419
190,(47209),(47626),0.0680,0.0505,0.0051,0.075000,1.485149,0.001666,1.026486
191,(47626),(47209),0.0505,0.0680,0.0051,0.100990,1.485149,0.001666,1.036696
192,(47626),(47766),0.0505,0.0520,0.0079,0.156436,3.008378,0.005274,1.123803


In [10]:
# Lookup table: product_id -> product_name
product_dict = dict(zip(products['product_id'], products['product_name']))


def _unwrap_itemset(itemset):
    """Convert a rule side from a set of product ids to a plain value.

    A one-item set becomes that single product id. A larger set becomes a
    sorted tuple of all its ids, so no item is silently dropped. A value that
    is not a set (already unwrapped by an earlier run of this cell) is
    returned unchanged, which keeps the cell safe to re-run.
    """
    if not isinstance(itemset, (set, frozenset)):
        return itemset
    members = sorted(itemset)
    return members[0] if len(members) == 1 else tuple(members)


def _itemset_names(item):
    """Return the product name for a single id, or a comma-joined string of
    names for a tuple of ids."""
    if not isinstance(item, tuple):
        return product_dict[item]
    return ', '.join(product_dict[i] for i in item)


# Replace each rule side with its unwrapped id(s) and add a matching *_name column.
for side in ('antecedents', 'consequents'):
    association_data[side] = association_data[side].apply(_unwrap_itemset)
for side in ('antecedents', 'consequents'):
    association_data[side + '_name'] = association_data[side].apply(_itemset_names)

In [11]:
# Show the rules ordered from highest to lowest lift

association_data.sort_values(by='lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedents_name,consequents_name
170,26209,28842,0.0467,0.0151,0.0050,0.107066,7.090489,0.004295,1.102994,Limes,Bunched Cilantro
171,28842,26209,0.0151,0.0467,0.0050,0.331126,7.090489,0.004295,1.425231,Bunched Cilantro,Limes
126,24964,22935,0.0324,0.0351,0.0074,0.228395,6.506982,0.006263,1.250510,Organic Garlic,Organic Yellow Onion
127,22935,24964,0.0351,0.0324,0.0074,0.210826,6.506982,0.006263,1.226092,Organic Yellow Onion,Organic Garlic
172,26209,31717,0.0467,0.0214,0.0055,0.117773,5.503412,0.004501,1.109238,Limes,Organic Cilantro
...,...,...,...,...,...,...,...,...,...,...,...
152,24852,45007,0.1452,0.0333,0.0050,0.034435,1.034092,0.000165,1.001176,Banana,Organic Zucchini
156,47209,24852,0.0680,0.1452,0.0094,0.138235,0.952034,-0.000474,0.991918,Organic Hass Avocado,Banana
157,24852,47209,0.1452,0.0680,0.0094,0.064738,0.952034,-0.000474,0.996513,Banana,Organic Hass Avocado
64,13176,47766,0.1227,0.0520,0.0059,0.048085,0.924707,-0.000480,0.995887,Bag of Organic Bananas,Organic Avocado


In [12]:
# Save the rules, ordered from highest to lowest lift, as a CSV file
(
    association_data
    .sort_values(by='lift', ascending=False)
    .to_csv('/Users/kookhan/Downloads/archive/result.csv')
)