#### Modeling and Analysis 

In [19]:
import pandas as pd
import numpy as np
from fpgrowth_py import fpgrowth
import pyfpgrowth
import random
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth

In [20]:
# reading aisles dataset
aisles = pd.read_csv('data/aisles.csv')

In [21]:
# reading department dataset
department = pd.read_csv('data/departments.csv')

In [22]:
p = 0.10
all_orders = pd.read_csv('data/orders_train_prior.csv', header = 0, skiprows = lambda i: i>0 and random.random() > p)

In [23]:
all_orders_full = pd.read_csv('data/orders_train_prior.csv')

In [24]:
morning_orders = pd.read_csv('data/morning_orders.csv')

In [25]:
daytime_orders = pd.read_csv('data/day_orders.csv')

In [26]:
evening_orders = pd.read_csv('data/evening_orders.csv')

**Morning Orders**

In [27]:
# group products by order_id into a df
df_morning = morning_orders.groupby('order_id')['product_name'].agg([('product_name', ','.join)]).reset_index()

In [28]:
# change order_id column into an index
df_morning.set_index('order_id', inplace = True)

In [29]:
df_morning_split = df_morning.product_name.apply(lambda x: x.split(','))

In [30]:
df_morning_split.head()

order_id
174              [organic garlic, island vanilla cereal]
217                     [organic short grain brown rice]
521                                       [spring water]
598                                [breaded fish sticks]
750    [multi-seeds cracker, wheat gluten free waffle...
Name: product_name, dtype: object

In [31]:
df_morning_orders = pd.DataFrame(df_morning_split)

In [32]:
df_morning_orders.head()

Unnamed: 0_level_0,product_name
order_id,Unnamed: 1_level_1
174,"[organic garlic, island vanilla cereal]"
217,[organic short grain brown rice]
521,[spring water]
598,[breaded fish sticks]
750,"[multi-seeds cracker, wheat gluten free waffle..."


In [33]:
df_morning.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16460 entries, 174 to 3421049
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  16460 non-null  object
dtypes: object(1)
memory usage: 257.2+ KB


In [34]:
df_morning_orders.shape

(16460, 1)

In [35]:
patterns = pyfpgrowth.find_frequent_patterns(df_morning_orders['product_name'], 50)

In [36]:
patterns

{('organic broccoli',): 50,
 ('clementines bag',): 50,
 ('unsalted butter',): 51,
 ('clementines',): 51,
 ('uncured genoa salami',): 52,
 ('sparkling natural mineral water',): 53,
 ('extra virgin olive oil',): 54,
 ('boneless skinless chicken breasts',): 54,
 ('bunched cilantro',): 55,
 ('red vine tomato',): 58,
 ('organic garnet sweet potato (yam)',): 58,
 ("organic d'anjou pears",): 58,
 ('organic reduced fat 2% milk',): 61,
 ('organic kiwi',): 62,
 ('lime sparkling water',): 62,
 ('organic red bell pepper',): 62,
 ('hass avocados',): 63,
 ('unsweetened almondmilk',): 64,
 ('organic italian parsley bunch',): 64,
 ('organic navel orange',): 65,
 ('small hass avocado',): 65,
 ('blueberries',): 66,
 ('carrots',): 67,
 ('organic ginger root',): 67,
 ('organic peeled whole baby carrots',): 67,
 ('michigan organic kale',): 67,
 ('organic small bunch celery',): 68,
 ('100% whole wheat bread',): 68,
 ('organic unsweetened almond milk',): 69,
 ('organic granny smith apple',): 69,
 ('green bel

In [37]:
rules = pyfpgrowth.generate_association_rules(patterns, 0.01)

In [38]:
rules

{('bag of organic bananas',): (('organic strawberries',), 0.10441767068273092),
 ('organic strawberries',): (('bag of organic bananas',), 0.18181818181818182)}

**Daytime orders**

In [39]:
# group products by order_id into a df
df_daytime = daytime_orders.groupby('order_id')['product_name'].agg([('product_name', ','.join)]).reset_index()

In [40]:
# change order_id column into an index
df_daytime.set_index('order_id', inplace = True)

In [41]:
df_daytime_split = df_daytime.product_name.apply(lambda x: x.split(','))

In [42]:
df_daytime_split.head()

order_id
254                               [organic strawberries]
360                                  [shiitake mushroom]
470                         [sparkling water grapefruit]
729             [special k chocolatey strawberry cereal]
762    [organic strawberries, celery hearts, organic ...
Name: product_name, dtype: object

In [43]:
df_daytime_orders = pd.DataFrame(df_daytime_split)

In [44]:
df_daytime_orders.shape

(29208, 1)

In [45]:
patterns_daytime = pyfpgrowth.find_frequent_patterns(df_daytime_orders['product_name'], 50)

In [46]:
patterns_daytime

{('organic medium salsa',): 50,
 ('baby spinach',): 50,
 ('creamy peanut butter',): 50,
 ('organic creamy peanut butter',): 50,
 ('ground turkey breast',): 50,
 ('organic coconut milk',): 51,
 ('organic heavy whipping cream',): 51,
 ('organic white onions',): 51,
 ('hass avocado',): 51,
 ('sour cream',): 53,
 ('cantaloupe',): 53,
 ('organic bread with 21 whole grains',): 53,
 ('total 0% greek yogurt',): 54,
 ('organic broccoli crowns',): 54,
 ('gala apples',): 55,
 ('unsweetened vanilla almond milk',): 55,
 ('frozen organic wild blueberries',): 55,
 ('fat free milk',): 55,
 ('boneless skinless chicken breast',): 55,
 ('uncured hickory smoked sunday bacon',): 55,
 ('100% raw coconut water',): 57,
 ('pineapple chunks',): 57,
 ('shredded mild cheddar cheese',): 57,
 ('total 2% with strawberry lowfat greek strained yogurt',): 57,
 ('organic red radish bunch',): 57,
 ('organic reduced fat milk',): 58,
 ('lightly salted baked snap pea crisps',): 58,
 ('2% reduced fat milk',): 58,
 ('organic 

In [47]:
rules = pyfpgrowth.generate_association_rules(patterns_daytime, 0.01)

In [48]:
rules

{('bag of organic bananas',): (('organic strawberries',), 0.09838107098381071),
 ('organic raspberries',): (('bag of organic bananas',), 0.1529051987767584),
 ('banana',): (('organic strawberries',), 0.057435897435897436),
 ('large lemon',): (('banana',), 0.16),
 ('organic hass avocado',): (('bag of organic bananas',), 0.17048346055979643),
 ('organic avocado',): (('banana',), 0.1488833746898263),
 ('organic baby spinach',): (('bag of organic bananas',), 0.12015503875968993)}

**Evening Orders**

In [49]:
# group products by order_id into a df
df_evening = evening_orders.groupby('order_id')['product_name'].agg([('product_name', ','.join)]).reset_index()

In [50]:
# change order_id column into an index
df_evening.set_index('order_id', inplace = True)

In [51]:
df_evening_split = df_evening.product_name.apply(lambda x: x.split(','))

In [52]:
df_evening_split.head()

order_id
527     [organic jalapeno pepper, flat fillets anchovies]
1700    [organic classic rich crackers, strawberry fla...
2032               [raspberry blackberry sparkling water]
2458    [frozen organic wild blueberries, organic lemo...
2468    [original/ranch/bbq/chile picante con limon va...
Name: product_name, dtype: object

In [53]:
df_evening_orders = pd.DataFrame(df_evening_split)

In [54]:
df_evening_orders.shape

(7704, 1)

In [55]:
patterns_evening = pyfpgrowth.find_frequent_patterns(df_evening_orders['product_name'], 30)

In [56]:
patterns_daytime

{('organic medium salsa',): 50,
 ('baby spinach',): 50,
 ('creamy peanut butter',): 50,
 ('organic creamy peanut butter',): 50,
 ('ground turkey breast',): 50,
 ('organic coconut milk',): 51,
 ('organic heavy whipping cream',): 51,
 ('organic white onions',): 51,
 ('hass avocado',): 51,
 ('sour cream',): 53,
 ('cantaloupe',): 53,
 ('organic bread with 21 whole grains',): 53,
 ('total 0% greek yogurt',): 54,
 ('organic broccoli crowns',): 54,
 ('gala apples',): 55,
 ('unsweetened vanilla almond milk',): 55,
 ('frozen organic wild blueberries',): 55,
 ('fat free milk',): 55,
 ('boneless skinless chicken breast',): 55,
 ('uncured hickory smoked sunday bacon',): 55,
 ('100% raw coconut water',): 57,
 ('pineapple chunks',): 57,
 ('shredded mild cheddar cheese',): 57,
 ('total 2% with strawberry lowfat greek strained yogurt',): 57,
 ('organic red radish bunch',): 57,
 ('organic reduced fat milk',): 58,
 ('lightly salted baked snap pea crisps',): 58,
 ('2% reduced fat milk',): 58,
 ('organic 

In [57]:
rules = pyfpgrowth.generate_association_rules(patterns_evening, 0.001)

In [58]:
rules

{}

**All Orders**

In [59]:
# group products by order_id into a df
df_all_orders = all_orders_full.groupby('order_id')['product_name'].agg([('product_name', ','.join)]).reset_index()

In [60]:
# reset order_id as index
df_all_orders.set_index('order_id', inplace=True)

In [61]:
# split results product sets by comma
df_orders_split = df_all_orders.product_name.apply(lambda x: x.split(','))

In [62]:
# create a dataframe
df_orders = pd.DataFrame(df_orders_split) 

In [63]:
df_orders.head(2)

Unnamed: 0_level_0,product_name
order_id,Unnamed: 1_level_1
1,"[organic celery hearts, organic 4% milk fat wh..."
4,"[tiny twists pretzels, chewy 25% low sugar cho..."


In [64]:
df_orders.shape

(1305846, 1)

In [65]:
# Identify products with minimum support level
patterns_all = pyfpgrowth.find_frequent_patterns(df_orders['product_name'], 500)

In [66]:
patterns_all

{('original instant oatmeal',): 500,
 ('chocolate sea salt',): 500,
 ('sweet potato fries with sea salt',): 501,
 ('black beans no salt added',): 502,
 ('classic mix variety',): 502,
 ('organic ezekiel 4:9 sesame bread',): 502,
 ('organic yokids lemonade/blueberry variety pack yogurt squeezers tubes',): 502,
 ('smartwater® electrolyte enhanced water',): 504,
 ('hommus classic original',): 504,
 ('uncured beef hot dogs',): 504,
 ('vitamin water zero squeezed lemonade',): 505,
 ('beer',): 505,
 ('soft pretzel mini buns',): 505,
 ('frozen peaches',): 505,
 ('100% recycled bath tissue rolls',): 505,
 ('mexican finely shredded cheese',): 505,
 ('total 0% blueberry acai greek yogurt',): 506,
 ('organic super fruit punch juice drink',): 506,
 ('iced oatmeal cookie kid z bar',): 507,
 ('ground buffalo',): 507,
 ('hardwood smokedcenter cut original bacon',): 507,
 ('organic string cheese',): 508,
 ('tortillas corn organic',): 508,
 ('red grapefruit',): 508,
 ('chardonnay',): 509,
 ('blueberry m

In [67]:
# save patterns to a dataframe and create column names
patterns_df = pd.DataFrame(patterns_all.items(), columns=['product_name', 'support_level'])

In [68]:
# take out brackets and commas from the product_name column
patterns_df['product_name'] = patterns_df['product_name'].astype(str).str.replace(r'\(|\)|,', '')

  patterns_df['product_name'] = patterns_df['product_name'].astype(str).str.replace(r'\(|\)|,', '')


In [87]:
# patterns_df = patterns_df.strip("\''", "")

In [71]:
# save patterns to a .csv file in a data folder
patterns_df.to_csv('data/patterns.csv', index = False)

In [72]:
# Identify itemsets with minimum confidence level
rules = pyfpgrowth.generate_association_rules(patterns_all, 0.1)

In [73]:
rules

{('clementines bag',): (('banana',), 0.144961016459717),
 ('organic broccoli',): (('bag of organic bananas',), 0.15602443087173792),
 ('orange bell pepper',): (('banana',), 0.1340042372881356),
 ('roma tomato',): (('banana',), 0.15268329554043839),
 ('jalapeno peppers',): (('limes',), 0.14230019493177387),
 ('bunched cilantro',): (('limes',), 0.14061757719714965),
 ('organic navel orange',): (('bag of organic bananas',), 0.17908682280577243),
 ('lime sparkling water',): (('sparkling water grapefruit',),
  0.1264177693761815),
 ('boneless skinless chicken breasts',): (('banana',), 0.13367499466154176),
 ('hass avocados',): (('bag of organic bananas',), 0.12300587741393787),
 ('red vine tomato',): (('banana',), 0.12559713375796178),
 ('organic unsweetened almond milk',): (('bag of organic bananas',),
  0.12398765908214424),
 ('blueberries',): (('banana',), 0.14547206165703275),
 ('100% whole wheat bread',): (('banana',), 0.12448674878686077),
 ('red peppers',): (('banana',), 0.1526771514

In [132]:
# Create a dataframe of frequent patterns and transpose the dataframe
df_rules = pd.DataFrame(rules).T

In [133]:
# sort a dataframe in descending orders
df_rules_sorted = df_rules.sort_values(ascending = False, by = 1)

In [134]:
# reset index to display pair products
df_rules_sorted.reset_index(level = 0, inplace = True)
df_rules_sorted.reset_index(level = 0, inplace = True)

In [135]:
# rename columns to antecedent, consequent, and calculated confidence level
df_rules_sorted.rename(columns = {'index': 'consequent_2', 'level_0':'consequent_1', 0:'antecedent', 1: 'confidence_level'}, inplace = True)

In [144]:
# take out brackets and commas in the consequent column
df_rules_sorted['antecedent'] = df_rules_sorted['antecedent'].astype(str).str.replace(r'\(|\)|,', '')
# reorder columns
cols = ['antecedent', 'consequent_1', 'consequent_2', 'confidence_level']
df_rules_sorted = df_rules_sorted.reindex(columns=cols)

  df_rules_sorted['antecedent'] = df_rules_sorted['antecedent'].astype(str).str.replace(r'\(|\)|,', '')


In [145]:
df_rules_sorted

Unnamed: 0,antecedent,consequent_1,consequent_2,confidence_level
0,'bag of organic bananas',organic hass avocado,organic raspberries,0.49353
1,'bag of organic bananas',organic hass avocado,organic strawberries,0.434307
2,'bag of organic bananas',organic raspberries,organic strawberries,0.37037
3,'organic strawberries',bag of organic bananas,organic raspberries,0.343733
4,'bag of organic bananas',organic baby spinach,organic strawberries,0.339503
5,'organic strawberries',bag of organic bananas,organic hass avocado,0.276637
6,'organic strawberries',bag of organic bananas,organic baby spinach,0.249575
7,'organic baby spinach',bag of organic bananas,organic strawberries,0.181959
8,'bag of organic bananas',organic navel orange,,0.179087
9,'banana',organic fuji apple,,0.166195


In [146]:
# save results to a .csv file in data folder
df_rules_sorted.to_csv('data/rules.csv', index = False)

In [81]:
all_orders_full.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,order_id,add_to_cart_order,reordered,quantity
0,1,chocolate sandwich cookies,61,19,6695,7,1,1.0
1,1,chocolate sandwich cookies,61,19,48361,9,0,1.0
2,1,chocolate sandwich cookies,61,19,63770,4,0,1.0
3,1,chocolate sandwich cookies,61,19,75339,9,0,1.0
4,1,chocolate sandwich cookies,61,19,240996,3,1,1.0


In [141]:
# create lists of items and concatenate into one
antecedents = df_rules_sorted['antecedent'].to_list()
consequent_1 = df_rules_sorted['consequent_1'].to_list()
consequent_2 = df_rules_sorted['consequent_2'].to_list()
full_item_list = antecedents + consequent_1 + consequent_2
clean_list = [x for x in full_item_list if str(x) != 'nan']
used = set()
unique_items = [x for x in clean_list if x not in used and (used.add(x) or True)]

In [142]:
# loop through the list and find order in which they were added to a cart
cart_order = []

for item in unique_items:
    if item in all_orders_full['product_name']:
        print(True)
    else:
        print(False)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [118]:
check = [all_orders_full.loc[lambda all_orders_full: all_orders_full['product_name'] == 'bananas']]

In [143]:
unique_items

[('bag of organic bananas',),
 ('organic strawberries',),
 ('organic baby spinach',),
 ('banana',),
 ('limes',),
 ('sparkling water grapefruit',),
 'organic hass avocado',
 'organic raspberries',
 'bag of organic bananas',
 'organic baby spinach',
 'organic navel orange',
 'organic fuji apple',
 'organic broccoli',
 'roma tomato',
 'red peppers',
 'blueberries',
 'clementines bag',
 'jalapeno peppers',
 'bunched cilantro',
 'orange bell pepper',
 'boneless skinless chicken breasts',
 'lime sparkling water',
 'red vine tomato',
 '100% whole wheat bread',
 'organic unsweetened almond milk',
 'hass avocados',
 'cucumber kirby',
 'organic strawberries']