# Instacart 

### Market Basket Analysis

Libraries.

In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import fpgrowth, apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [2]:
# Load the Instacart sample and report its raw dimensions before cleanup.
instacart = pd.read_csv('../data/instacart_sample.csv')
print(instacart.shape)

# 'Unnamed: 0' is presumably a row index saved with the CSV (TODO confirm);
# it is not used by the analysis, so drop it. Rebinding the name instead of
# mutating in place leaves `instacart` in the same final state.
instacart = instacart.drop(columns=['Unnamed: 0'])
instacart.head()

(5204393, 15)


Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,department_id,aisle_id,aisle,department
0,6,22352,4,1,12,30.0,15873,2,0,Dryer Sheets Geranium Scent,17,75.0,laundry,household
1,8,3107,5,4,6,17.0,23423,1,1,Original Hawaiian Sweet Rolls,3,43.0,buns rolls,bakery
2,13,45082,2,6,17,1.0,3800,12,0,Hampshire 100% Natural Sour Cream,16,108.0,other creams cheeses,dairy eggs
3,13,45082,2,6,17,1.0,25783,7,0,Lemon Lime Thirst Quencher,7,64.0,energy sports drinks,beverages
4,13,45082,2,6,17,1.0,23020,10,0,Diet Tonic Water,7,77.0,soft drinks,beverages


In [3]:
# Basket analysis only needs to know which product belongs to which order.
basket_columns = ['order_id', 'product_name']
prods = instacart.loc[:, basket_columns].reset_index(drop=True)

# Preview ordered by order_id; the sorted frame is only displayed, so `prods`
# itself keeps its original row order.
prods.sort_values('order_id')

Unnamed: 0,order_id,product_name
0,6,Dryer Sheets Geranium Scent
1,8,Original Hawaiian Sweet Rolls
2,13,Hampshire 100% Natural Sour Cream
3,13,Lemon Lime Thirst Quencher
4,13,Diet Tonic Water
...,...,...
5204391,3421083,All Natural French Toast Sticks
5204387,3421083,Organic Mixed Berry Yogurt & Fruit Snack
5204386,3421083,Banana
5204388,3421083,Freeze Dried Mango Slices


## What are we going to do?

- Check product numbers
- Reduce the number of products by keeping only the most purchased ones
- Apply Apriori algorithm
- See if it makes sense to standardize the products
- Apply Apriori algorithm again

In [4]:
# Size of the problem: distinct orders (transactions) and distinct products.
n_transactions = prods['order_id'].nunique()
n_products = prods['product_name'].nunique()
print('There are {} transactions and {} different products.'.format(n_transactions, n_products))

There are 761900 transactions and 31148 different products.


In [5]:
# Raise the row display limit so all 200 entries below are rendered.
pd.set_option('display.max_rows', 200)

# value_counts() is already sorted by descending frequency, so the first
# 200 entries are the 200 most purchased products.
product_frequencies = prods['product_name'].value_counts()
prods_count = product_frequencies.iloc[:200]

print('Most purchased products:')
display(prods_count)


Most purchased products:


Banana                                                            115372
Bag of Organic Bananas                                             92540
Organic Strawberries                                               63994
Organic Baby Spinach                                               59788
Strawberries                                                       34878
Limes                                                              34038
Organic Raspberries                                                34000
Organic Whole Milk                                                 33409
Organic Yellow Onion                                               28293
Organic Garlic                                                     27278
Organic Lemon                                                      21363
Organic Fuji Apple                                                 21147
Apple Honeycrisp Organic                                           20397
Seedless Red Grapes                                

In [6]:
# prods['product_name'] = prods['product_name'].apply(lambda x: x.lower())
# prods['product_name'].head()

### Now we need to create the new DF to store purchased products in one row per transaction.

In [7]:
# Build one list of product names per order.
#
# A single groupby pass replaces the previous loop, which re-filtered the
# whole frame once per order (rows x orders comparisons).
#   * sort=False keeps orders in order of first appearance, i.e. the same
#     order as prods['order_id'].unique().
#   * Rows inside each group keep their original relative order, so every
#     list holds the products in the order they appear in `prods`.
# NOTE: groupby drops rows whose order_id is NaN; the sample output shows
# integer order ids, so this is assumed not to occur here.
products_by_order = prods.groupby('order_id', sort=False)['product_name'].agg(list)

# order_num[i] is the order id whose products are stored in prod_lst[i].
order_num = products_by_order.index.to_numpy()
prod_lst = products_by_order.tolist()

In [8]:
# One row per transaction, keyed by its order id.
basket = pd.DataFrame({'transaction': order_num})
basket.head()

Unnamed: 0,transaction
0,6
1,8
2,13
3,14
4,22


In [9]:
# Preview only the first few baskets; displaying the full list would dump
# one entry per transaction into the notebook output.
prod_lst[:5]

[['Dryer Sheets Geranium Scent'],
 ['Original Hawaiian Sweet Rolls'],
 ['Hampshire 100% Natural Sour Cream',
  'Lemon Lime Thirst Quencher',
  'Diet Tonic Water',
  'Chunky Salsa Medium',
  'Light',
  'Soda',
  'Original Potato Chips',
  'G Series Lime Cucumber Sports Drink'],
 ['Unprocessed American Singles Colby-Style Cheese',
  'Organic Mini Homestyle Waffles',
  'Organic Broccoli Florets',
  'Naturals Chicken Nuggets',
  'Sriracha Chili Sauce',
  'Organic Whole Milk',
  'Corn Meal Pizza Crust',
  'Hair Bender Whole Bean Coffee'],
 ['2% Reduced Fat Milk',
  'Iceberg Lettuce',
  'Large Grade AA Eggs',
  'Banana',
  'Sandwich Bags',
  'Boneless And Skinless Chicken Breast',
  'Deli Fresh Honey Smoked Turkey Breast, 98% Fat Free, Gluten Free',
  'Presliced Everything Bagels',
  'Butter Top White Bread'],
 ['Organic 1% Low Fat Milk'],
 ['White Cheddar Semisoft Cheese',
  'Lowfat 2% Milkfat Cottage Cheese',
  'Mozzarella Light Low Moisture Part Skim String Cheese',
  'Original Denture Ad

In [10]:
# Attach each transaction's product list; prod_lst is aligned with order_num,
# which is what the `transaction` column was built from.
basket['products_ordered'] = list(prod_lst)
basket.head()

Unnamed: 0,transaction,products_ordered
0,6,[Dryer Sheets Geranium Scent]
1,8,[Original Hawaiian Sweet Rolls]
2,13,"[Hampshire 100% Natural Sour Cream, Lemon Lime..."
3,14,[Unprocessed American Singles Colby-Style Chee...
4,22,"[2% Reduced Fat Milk, Iceberg Lettuce, Large G..."


#### Get list of prods

In [11]:
# All baskets as plain Python lists.
prods_for_model = basket['products_ordered'].tolist()

# Keep only baskets with at least two products: a single-item basket cannot
# contribute to any multi-item association.
# NOTE(review): the encoding step further down is fitted on `prod_lst`, not on
# this filtered list - confirm which of the two is intended.
filtered_prods_for_model = [ele for ele in prods_for_model if len(ele) > 1]

# Show a short preview instead of dumping every basket into the output.
filtered_prods_for_model[:5]

[['Hampshire 100% Natural Sour Cream',
  'Lemon Lime Thirst Quencher',
  'Diet Tonic Water',
  'Chunky Salsa Medium',
  'Light',
  'Soda',
  'Original Potato Chips',
  'G Series Lime Cucumber Sports Drink'],
 ['Unprocessed American Singles Colby-Style Cheese',
  'Organic Mini Homestyle Waffles',
  'Organic Broccoli Florets',
  'Naturals Chicken Nuggets',
  'Sriracha Chili Sauce',
  'Organic Whole Milk',
  'Corn Meal Pizza Crust',
  'Hair Bender Whole Bean Coffee'],
 ['2% Reduced Fat Milk',
  'Iceberg Lettuce',
  'Large Grade AA Eggs',
  'Banana',
  'Sandwich Bags',
  'Boneless And Skinless Chicken Breast',
  'Deli Fresh Honey Smoked Turkey Breast, 98% Fat Free, Gluten Free',
  'Presliced Everything Bagels',
  'Butter Top White Bread'],
 ['White Cheddar Semisoft Cheese',
  'Lowfat 2% Milkfat Cottage Cheese',
  'Mozzarella Light Low Moisture Part Skim String Cheese',
  'Original Denture Adhesive Cream',
  '13 Gallon Kitchen Drawstring Trash Bags'],
 ['Reduced Fat Original Potato Crisps',

#### Testing APRIORI algorithm

1) Encode data.

In [12]:
# One-hot encode the baskets: one row per transaction, one boolean column per
# distinct product seen during fit.
te = TransactionEncoder()
te.fit(prod_lst)
te_ary = te.transform(prod_lst)
te_ary

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [13]:
# Wrap the encoded matrix in a DataFrame, labelling each column with the
# product name the encoder learned for it.
product_columns = te.columns_
encoded_df = pd.DataFrame(data=te_ary, columns=product_columns)
encoded_df.head()

Unnamed: 0,#2 Coffee Filters,#2 Cone White Coffee Filters,#2 Mechanical Pencils,#4 Natural Brown Coffee Filters,& Go! Hazelnut Spread + Pretzel Sticks,+Energy Black Cherry Vegetable & Fruit Juice,0 Calorie Acai Raspberry Water Beverage,0 Calorie Fuji Apple Pear Water Beverage,0 Calorie Strawberry Dragonfruit Water Beverage,0% Fat Black Cherry Greek Yogurt y,...,with Olive Oil Mayonnaise Dressing,with Pump Rebalancing Shampoo,with Sweet & Smoky BBQ Sauce Cheeseburger Sliders,with Sweet Cinnamon Bunches Cereal,with Xylitol Cinnamon 18 Sticks Sugar Free Gum,with Xylitol Minty Sweet Twist 18 Sticks Sugar Free Gum,with Xylitol Original Flavor 18 Sticks Sugar Free Gum,with Xylitol Unwrapped Original Flavor 50 Sticks Sugar Free Gum,with a Splash of Mango Coconut Water,with a Splash of Pineapple Coconut Water
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [14]:
# freq_items_apriori = apriori(encoded_df, min_support=0.01, use_colnames=True)

In [15]:
# print('Frequet items associations:')
# print(freq_items_apriori)

In [16]:
# rules = association_rules(freq_items_apriori, metric='confidence', min_threshold=0.01)
# for rule in rules:
#     print(rule)

### Testing FP Growth

In [17]:
# Mine frequent itemsets with FP-Growth, keeping those present in at least
# 1% of transactions and reporting product names instead of column indices.
freq_items_fp = fpgrowth(
    encoded_df,
    min_support=0.01,
    use_colnames=True,
)

In [18]:
# Itemsets returned by FP-Growth with their support (min_support=0.01 above).
freq_items_fp

Unnamed: 0,support,itemsets
0,0.012168,(Soda)
1,0.04385,(Organic Whole Milk)
2,0.010637,(Organic Broccoli Florets)
3,0.151427,(Banana)
4,0.011008,(2% Reduced Fat Milk)
5,0.018789,(100% Whole Wheat Bread)
6,0.078472,(Organic Baby Spinach)
7,0.035803,(Organic Garlic)
8,0.018433,(Blueberries)
9,0.012683,(Large Alfresco Eggs)


In [27]:
# Derive association rules from the FP-Growth itemsets, keeping rules whose
# confidence is at least 0.1.
rules_fp = association_rules(
    freq_items_fp,
    metric='confidence',
    min_threshold=0.1,
)
rules_fp

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Organic Strawberries),(Organic Baby Spinach),0.083993,0.078472,0.012322,0.146701,1.869467,0.005731,1.079959
1,(Organic Baby Spinach),(Organic Strawberries),0.078472,0.083993,0.012322,0.157021,1.869467,0.005731,1.086632
2,(Banana),(Organic Baby Spinach),0.151427,0.078472,0.016753,0.110633,1.409842,0.00487,1.036162
3,(Organic Baby Spinach),(Banana),0.078472,0.151427,0.016753,0.213488,1.409842,0.00487,1.078907
4,(Organic Baby Spinach),(Bag of Organic Bananas),0.078472,0.12146,0.016469,0.209875,1.727941,0.006938,1.1119
5,(Bag of Organic Bananas),(Organic Baby Spinach),0.12146,0.078472,0.016469,0.135595,1.727941,0.006938,1.066084
6,(Organic Strawberries),(Bag of Organic Bananas),0.083993,0.12146,0.019176,0.228303,1.879661,0.008974,1.138452
7,(Bag of Organic Bananas),(Organic Strawberries),0.12146,0.083993,0.019176,0.157878,1.879661,0.008974,1.087737
8,(Banana),(Organic Strawberries),0.151427,0.083993,0.017665,0.116657,1.3889,0.004946,1.036979
9,(Organic Strawberries),(Banana),0.083993,0.151427,0.017665,0.210317,1.3889,0.004946,1.074574


## Could we try to look for associations between aisles?

In [20]:
# aisle_df = order_set[['order_id','aisle']]
# aisle_df = aisle_df.loc[aisles_df['orders'] 

In [21]:
# aisle_lst = []

# for v,num in enumerate(order_num):
#     aisle = aisle_df.loc[aisle_df['order_id'] == num]['aisle'].tolist()
#     aisle_lst.append(aisle) 

In [22]:
# aisle_df_fp = pd.DataFrame(order_num, columns = ['num_order'])
# aisle_df_fp['aisles'] = [lst for lst in aisle_lst]
# aisle_df_fp.head()

In [23]:
# te_ary_2 = te.fit(aisle_lst).transform(aisle_lst)
# te_ary_2

In [24]:
# aisle_enc = pd.DataFrame(te_ary_2, columns = te.columns_)
# aisle_enc.head()

#### Testing FP Growth

In [25]:
# freq_items_3 = fpgrowth(aisle_enc, min_support=0.1, use_colnames=True)
# freq_items_3

In [26]:
# rules_fq3 = association_rules(freq_items_3, metric='confidence', min_threshold=0.3)
# rules_fq3

# How can we visualize these rules?