#### Modeling and Analysis 

In [2]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import random

In [7]:
# reading aisles dataset
aisles = pd.read_csv('data/aisles.csv')

In [8]:
# reading department dataset
department = pd.read_csv('data/departments.csv')

In [19]:
p = 0.01
all_orders = pd.read_csv('data/orders_train_prior.csv', header = 0, skiprows = lambda i: i>0 and random.random() > p)

In [32]:
all_orders.head(2)

Unnamed: 0,product_id,product_name,aisle_id,department_id,order_id,add_to_cart_order,reordered,quantity
0,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,50978,12,0,1.0
1,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,1790235,12,1,1.0


In [33]:
all_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30037 entries, 0 to 30036
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   product_id         30037 non-null  int64  
 1   product_name       30037 non-null  object 
 2   aisle_id           30037 non-null  int64  
 3   department_id      30037 non-null  int64  
 4   order_id           30037 non-null  int64  
 5   add_to_cart_order  30037 non-null  int64  
 6   reordered          30037 non-null  int64  
 7   quantity           30037 non-null  float64
dtypes: float64(1), int64(6), object(1)
memory usage: 1.8+ MB


In [34]:
# create basket of products per order_id
basket = (all_orders
        .groupby(['order_id', 'product_name'])['quantity']
        .sum().unstack().reset_index().fillna(0)
        .set_index('order_id'))

In [23]:
basket.shape

(28941, 9141)

In [24]:
# show a subset of columns
basket.iloc[:, [0, 1, 2, 3,4, 5, 6, 7]].head()

product_name,0 Calorie Fuji Apple Pear Water Beverage,0% Fat Free Organic Milk,0% Fat Greek Yogurt Black Cherry on the Bottom,0% Fat Organic Greek Vanilla Yogurt,0% Greek Strained Yogurt,0% Greek Yogurt Black Cherry on the Bottom,"0% Greek, Blueberry on the Bottom Yogurt",1 % Lowfat Milk
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# function to encode negative values
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

In [26]:
# apply function
basket_sets = basket.applymap(encode_units)

In [27]:
basket_sets.head()

product_name,0 Calorie Fuji Apple Pear Water Beverage,0% Fat Free Organic Milk,0% Fat Greek Yogurt Black Cherry on the Bottom,0% Fat Organic Greek Vanilla Yogurt,0% Greek Strained Yogurt,0% Greek Yogurt Black Cherry on the Bottom,"0% Greek, Blueberry on the Bottom Yogurt",1 % Lowfat Milk,1 Apple + 1 Mango Fruit Bar,1 Apple + 1 Pear Fruit Bar,...,Zucchini Squash,"\""Darn Good\"" Chili Mix","\""Mokaccino\"" Milk + Blue Bottle Coffee Chocolate",by Mennen Power Antiperspirant/Deodorant Fresh,for Tots Apple White Grape Juice,gel hand wash sea minerals,smartwater® Electrolyte Enhanced Water,vitaminwater® XXX Acai Blueberry Pomegranate,with Crispy Almonds Cereal,with Dawn Action Pacs Fresh Scent Dishwasher Detergent Pacs
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
112,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
125,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
344,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
359,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
frequent_itemsets = apriori(basket_sets, min_support = 0.01, use_colnames = True)

In [36]:
frequent_itemsets.head()

Unnamed: 0,support,itemsets
0,0.001831,(100% Whole Wheat Bread)
1,0.001175,(2% Reduced Fat Milk)
2,0.002142,(Apple Honeycrisp Organic)
3,0.002591,(Asparagus)
4,0.011403,(Bag of Organic Bananas)


In [40]:
frequent_itemsets = apriori(basket_sets, min_support = 0.001, use_colnames = True)
top_items = frequent_itemsets.sort_values('support', ascending = False)[:20]
for i in range(len(top_items.itemsets)):
    top_items.itemsets.iloc[i] = str(list(top_items.itemsets.iloc[i]))
fig = plt.figure(figsize = (10,10))
ax = fig.add_subplot(111)
ax.bar(top_items.itemsets, top_items.support)
for label in ax.xaxis.get_ticklabels():
    label.set_rotation(90)
plt.xlabel('Item')
plt.ylabel('Support');

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


NameError: name 'plt' is not defined

In [38]:
# Create the rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.001)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


In [39]:
rules[ (rules['lift'] >= 6) &
       (rules['confidence'] >= 0.8) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
