#### Modeling and Analysis 

In [None]:
import pandas as pd
import numpy as np
from fpgrowth_py import fpgrowth
import pyfpgrowth
import random
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth

In [7]:
# reading aisles dataset
aisles = pd.read_csv('data/aisles.csv')

In [8]:
# reading department dataset
department = pd.read_csv('data/departments.csv')

In [162]:
p = 0.10
all_orders = pd.read_csv('data/orders_train_prior.csv', header = 0, skiprows = lambda i: i>0 and random.random() > p)

In [96]:
all_orders_full = pd.read_csv('data/orders_train_prior.csv')

In [80]:
morning_orders = pd.read_csv('data/morning_orders.csv')

**Morning Orders: preprocessing, onehotencode, find patterns**

In [81]:
# group products by order_id into a df
df_morning = morning_orders.groupby('order_id')['product_name'].agg([('product_name', ', '.join)]).reset_index()

In [82]:
df_morning.set_index('order_id', inplace = True)

In [83]:
df_morning.head(2)

Unnamed: 0_level_0,product_name
order_id,Unnamed: 1_level_1
183,Black Tea Variety Pack
275,"Grated Parmesan, Organic Broccoli Florets"


In [84]:
df_morning.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16354 entries, 183 to 3420948
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  16354 non-null  object
dtypes: object(1)
memory usage: 255.5+ KB


In [55]:
df_morning.shape

(16354, 1)

In [85]:
morning_list = df_morning.values.tolist()

In [128]:
encoder = TransactionEncoder()
encoder_set = encoder.fit(morning_list).transform(morning_list)
df_morning_orders = pd.DataFrame(encoder_set, columns=encoder.columns_)
df_morning_orders.head(2)

Unnamed: 0,0 Calorie Strawberry Dragonfruit Water Beverage,0% Fat Blueberry Greek Yogurt,0% Fat Free Organic Milk,0% Fat Organic Greek Vanilla Yogurt,"0% Fat Organic Greek Vanilla Yogurt, Organic Beef Uncured Hot Dogs",0% Fat Strawberry Greek Yogurt,0% Greek Strained Yogurt,0% Milkfat Greek Plain Yogurt,1 Apple + 1 Mango Fruit Bar,"1 Apple + 1 Mango Fruit Bar, Nonfat Icelandic Style Strawberry Yogurt",...,"ZzzQuil Nighttime Sleep-Aid LiquiCaps, Slow Kettle Kickin' Crab & Sweet Corn Chowder Soup","\""Mokaccino\"" Milk + Blue Bottle Coffee Chocolate, Organic Hass Avocado, Chopped Onions, Organic Raspberries, Diced Red Onions, Organic Garlic, Organic Shredded Parmesan, Organic Large Grade AA Brown Eggs, Crunchy Almond Butter, Organic Sweetened Condensed Milk",for Tots Apple Juice,gelato Coffee Toffee,of Hanover 100 Calorie Pretzels Mini,"of Hanover 100 Calorie Pretzels Mini, Organic Cinnamon Apple Chips, Organic Lightly Salted Brown Rice Cakes, Crisp 'n Light Wholesome Wheat Crackerbread, Multi Grain Crispbread, Peppermint Sugar Free Gum, Organic Mango Acai Fruit Leather, 12 Ct, Green Seedless Grapes, Banana, Organic Blackberries, Organic Fuji Apple, Organic Granny Smith Apple, Multi Grain Cheerios Cereal, Mozzarella Ciliegine Fresh, Grade A Large White Eggs, Organic Grade A Free Range Large Brown Eggs, Original Whole Grain English Muffins, Original Nooks & Crannies English Muffins, Large Burrito Flour Tortillas, Pure Coconut Water, No Pulp Calcium & Vitamin D Pure Premium 100% Pure Orange Juice, Uncured Genoa Salami, Fusilli No. 34, Organic Turkey Bacon, White Giant Paper Towel Rolls","of Hanover 100 Calorie Pretzels Mini, Original No Pulp 100% Florida Orange Juice",smartwater® Electrolyte Enhanced Water,vitaminwater® XXX Acai Blueberry Pomegranate,with Crispy Almonds Cereal
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [93]:
fpgrowth(df_morning_orders, min_support=0.01)

Unnamed: 0,support,itemsets


**All Orders**

In [163]:
# group products by order_id into a df
df_all_orders = all_orders.groupby('order_id')['product_name'].agg([('product_name', ', '.join)]).reset_index()

In [164]:
# reset order_id as index
df_all_orders.set_index('order_id', inplace=True)

In [165]:
df_all_orders.head(10)

Unnamed: 0_level_0,product_name
order_id,Unnamed: 1_level_1
1,Bulgarian Yogurt
36,Spring Water
38,Organic Hot House Tomato
67,Thick & Crispy Tortilla Chips
95,Organic Avocado
98,"Pinto Beans No Salt Added, Organic Garlic, Org..."
125,Organic Whole String Cheese
129,Bag of Organic Bananas
170,Carrots
187,Tiny Twists Pretzels


In [166]:
df_full_orders.shape

(128723, 1)

In [None]:
patterns = pyfpgrowth.find_frequent_patterns(df_all_orders['product_name'],1000)

In [151]:
patterns

{('Spring',): 1006,
 ('Mini',): 1006,
 ('Beef',): 1011,
 ('Bunch',): 1017,
 ('Cucumber',): 1020,
 ('Baked',): 1021,
 ('Light',): 1035,
 ('Ginger',): 1065,
 ('Oil',): 1076,
 ('Tea',): 1081,
 ('Italian',): 1084,
 ('Food',): 1091,
 ('Sliced',): 1096,
 ('Coffee',): 1099,
 ('Ground',): 1101,
 ('Classic',): 1134,
 ('and',): 1146,
 ('Bell',): 1155,
 ('Frozen',): 1208,
 ('Pasta',): 1210,
 ('Kale',): 1215,
 ('Style',): 1216,
 ('Breast',): 1227,
 ('Lowfat',): 1231,
 ('With',): 1236,
 ('Low',): 1239,
 ('Cereal',): 1256,
 ('Sea',): 1266,
 ('Salt', 'Sea'): 1196,
 ('Seedless',): 1276,
 ('Dark',): 1299,
 ('Chocolate', 'Dark'): 1003,
 ('Shredded',): 1306,
 ('Onion',): 1313,
 ('Total', 'Yogurt'): 1223,
 ('Greek', 'Total', 'Yogurt'): 1158,
 ('Greek', 'Total'): 1226,
 ('Bar',): 1344,
 ('Mix',): 1353,
 ('Blueberry',): 1355,
 ('Eggs', 'Grade'): 1071,
 ('Eggs', 'Grade', 'Large'): 1038,
 ('Grade', 'Large'): 1189,
 ('Garlic',): 1384,
 ('Broccoli',): 1401,
 ('Crackers',): 1406,
 ('Ice',): 1411,
 ('Cream', 'Ice