In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import fpgrowth, apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [2]:
instacart = pd.read_csv('../data/instacart.csv')
instacart.drop('Unnamed: 0', axis = 1, inplace=True)
print('Dataset shape:', instacart.shape)
instacart.head()

Dataset shape: (21407642, 14)


Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,department_id,aisle_id,aisle,department
0,2,202279,3,5,9,8.0,28985,2,1,Michigan Organic Kale,4,83.0,fresh vegetables,produce
1,2,202279,3,5,9,8.0,9327,3,0,Garlic Powder,13,104.0,spices seasonings,pantry
2,2,202279,3,5,9,8.0,30035,5,0,Natural Sweetener,13,17.0,baking ingredients,pantry
3,2,202279,3,5,9,8.0,17794,6,1,Carrots,4,83.0,fresh vegetables,produce
4,2,202279,3,5,9,8.0,1819,8,1,All Natural No Stir Creamy Almond Butter,13,88.0,spreads,pantry


There are more than 21 millions of rows. I am going to select the first two millions of rows to reduce the size of the data.

In [3]:
order_set = instacart.loc[instacart['order_id'] <= 320000]
print('Dataset shape:', order_set.shape)
order_set.tail()

Dataset shape: (2000398, 14)


Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,department_id,aisle_id,aisle,department
2000393,320000,134478,9,5,12,3.0,18362,5,0,Organic Bread with 21 Whole Grains,3,112.0,bread,bakery
2000394,320000,134478,9,5,12,3.0,22683,6,1,Original Real Live Chocolate,19,45.0,candy chocolate,snacks
2000395,320000,134478,9,5,12,3.0,7781,7,1,Organic Sticks Low Moisture Part Skim Mozzarel...,16,21.0,packaged cheese,dairy eggs
2000396,320000,134478,9,5,12,3.0,8518,2,1,Organic Red Onion,4,83.0,fresh vegetables,produce
2000397,320000,134478,9,5,12,3.0,16521,11,1,Walnut Halves & Pieces,19,117.0,nuts seeds dried fruit,snacks


Now we have almost 320.000 orders.

In [4]:
prods = order_set[['order_id', 'product_name']].reset_index(drop=True)
prods.sort_values(by='order_id')

Unnamed: 0,order_id,product_name
0,2,Michigan Organic Kale
1,2,Garlic Powder
2,2,Natural Sweetener
3,2,Carrots
4,2,All Natural No Stir Creamy Almond Butter
...,...,...
2000392,320000,Organic Whole Grain Wheat English Muffins
2000393,320000,Organic Bread with 21 Whole Grains
2000394,320000,Original Real Live Chocolate
2000395,320000,Organic Sticks Low Moisture Part Skim Mozzarel...


## What are we going to do?

- Check product numbers
- Reduce number of products with the most purchased prods
- Apply Apriori algorithm
- See if it makes sense to standarize the products 
- Apply Apriori algorithm again

In [5]:
print('There are {} different products.'.format(prods['product_name'].nunique()))

There are 28668 different products.


In [6]:
pd.options.display.max_rows = 200

prods_count = prods['product_name'].value_counts().head(200)

print('Most purchased products:')
display(prods_count)


Most purchased products:


Banana                                                            44419
Bag of Organic Bananas                                            35411
Organic Strawberries                                              24684
Organic Baby Spinach                                              22614
Strawberries                                                      13439
Limes                                                             13136
Organic Whole Milk                                                12864
Organic Raspberries                                               12636
Organic Yellow Onion                                              10551
Organic Garlic                                                    10323
Organic Fuji Apple                                                 8364
Organic Lemon                                                      8218
Apple Honeycrisp Organic                                           7972
Seedless Red Grapes                                             

In [7]:
# prods['product_name'] = prods['product_name'].apply(lambda x: x.lower())
# prods['product_name'].head()

### Now we need to create the new DF to store purchased products in one row per user.

In [8]:
order_num = prods['order_id'].unique()
prod_lst = []

for v,num in enumerate(order_num):
    products = prods.loc[prods['order_id'] == num]['product_name'].tolist()
    prod_lst.append(products) 

In [9]:
basket = pd.DataFrame(order_num, columns = ['transaction'])
basket.head()

Unnamed: 0,transaction
0,2
1,3
2,4
3,5
4,6


In [10]:
basket['products_ordered'] = [lst for lst in prod_lst]
basket.head()

Unnamed: 0,transaction,products_ordered
0,2,"[Michigan Organic Kale, Garlic Powder, Natural..."
1,3,[Unsweetened Chocolate Almond Breeze Almond Mi...
2,4,"[Original Orange Juice, Kellogg's Nutri-Grain ..."
3,5,"[Organic Raspberries, Clementines, Natural Art..."
4,6,[Dryer Sheets Geranium Scent]


#### Get list of prods

In [11]:
prods_for_model = basket['products_ordered'].tolist()
filtered_prods_for_model = [ele for ele in prods_for_model if len(ele) > 1]
filtered_prods_for_model

[['Michigan Organic Kale',
  'Garlic Powder',
  'Natural Sweetener',
  'Carrots',
  'All Natural No Stir Creamy Almond Butter'],
 ['Unsweetened Chocolate Almond Breeze Almond Milk',
  'Unsweetened Almondmilk',
  'Organic Ezekiel 49 Bread Cinnamon Raisin',
  'Lemons',
  'Air Chilled Organic Boneless Skinless Chicken Breasts',
  'Organic Baby Spinach'],
 ['Original Orange Juice',
  "Kellogg's Nutri-Grain Apple Cinnamon Cereal",
  'Goldfish Cheddar Baked Snack Crackers',
  'Honey/Lemon Cough Drops',
  'Sugarfree Energy Drink',
  "Kellogg's Nutri-Grain Blueberry Cereal",
  'Oats & Chocolate Chewy Bars',
  'Nutri-Grain Soft Baked Strawberry Cereal Breakfast Bars'],
 ['Organic Raspberries',
  'Clementines',
  'Natural Artesian Water, Mini & Mobile',
  'Mini Original Babybel Cheese',
  'Dairy Milk Fruit & Nut Chocolate Bar',
  'Boneless Skinless Chicken Breast Fillets',
  'Wafer, Chocolate',
  'Matzos, Thin, Tea',
  'Original Black Box Tablewater Cracker',
  'French Lavender Hand Wash',
  'Ju

#### Testing APRIORI algorithm

1) Encode data.

In [12]:
te = TransactionEncoder()
te_ary = te.fit(filtered_prods_for_model).transform(filtered_prods_for_model)
te_ary

MemoryError: Unable to allocate 7.13 GiB for an array with shape (267740, 28596) and data type bool

In [None]:
encoded_df = pd.DataFrame(te_ary, columns = te.columns_)
encoded_df.head()

In [None]:
freq_items = apriori(encoded_df, min_support=0.01, use_colnames=True)

In [None]:
freq_items

In [None]:
rules = association_rules(freq_items, metric='confidence', min_threshold=0.01)
rules

### Testing FP Growth

In [None]:
freq_items_fp = fpgrowth(encoded_df, min_support=0.01, use_colnames=True)

In [None]:
freq_items_fp

In [None]:
rules_fp = association_rules(freq_items_fp, metric='confidence', min_threshold=0.2)
rules_fp

#### Compute element frequency

## Could we try tro look for associations between aisles?

In [None]:
# aisle_df = order_set[['order_id','aisle']]
# aisle_df = aisle_df.loc[aisles_df['orders'] 

In [None]:
# aisle_lst = []

# for v,num in enumerate(order_num):
#     aisle = aisle_df.loc[aisle_df['order_id'] == num]['aisle'].tolist()
#     aisle_lst.append(aisle) 

In [None]:
# aisle_df_fp = pd.DataFrame(order_num, columns = ['num_order'])
# aisle_df_fp['aisles'] = [lst for lst in aisle_lst]
# aisle_df_fp.head()

In [None]:
# te_ary_2 = te.fit(aisle_lst).transform(aisle_lst)
# te_ary_2

In [None]:
# aisle_enc = pd.DataFrame(te_ary_2, columns = te.columns_)
# aisle_enc.head()

#### Testing FP Growth

In [None]:
# freq_items_3 = fpgrowth(aisle_enc, min_support=0.1, use_colnames=True)
# freq_items_3

In [None]:
# rules_fq3 = association_rules(freq_items_3, metric='confidence', min_threshold=0.3)
# rules_fq3

# How can we visualize these rules?