In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list
from pyspark.ml.fpm import FPGrowth

spark = SparkSession.builder.master("local").appName("MIC").getOrCreate()

## Load Data
### Orders

In [2]:
path = "D:/datamic_instacart-market-basket-analysis/order_products__train.csv/order_products__train.csv"

orders = spark.read.option('header', 'true').csv(path)
orders.show()

+--------+----------+-----------------+---------+
|order_id|product_id|add_to_cart_order|reordered|
+--------+----------+-----------------+---------+
|       1|     49302|                1|        1|
|       1|     11109|                2|        1|
|       1|     10246|                3|        0|
|       1|     49683|                4|        0|
|       1|     43633|                5|        1|
|       1|     13176|                6|        0|
|       1|     47209|                7|        0|
|       1|     22035|                8|        1|
|      36|     39612|                1|        0|
|      36|     19660|                2|        1|
|      36|     49235|                3|        0|
|      36|     43086|                4|        1|
|      36|     46620|                5|        1|
|      36|     34497|                6|        1|
|      36|     48679|                7|        1|
|      36|     46979|                8|        1|
|      38|     11913|                1|        0|


### Products

In [3]:
path = "D:/datamic_instacart-market-basket-analysis/products.csv/products.csv"

products = spark.read.option('header', 'true').csv(path)
products.show()

+----------+--------------------+--------+-------------+
|product_id|        product_name|aisle_id|department_id|
+----------+--------------------+--------+-------------+
|         1|Chocolate Sandwic...|      61|           19|
|         2|    All-Seasons Salt|     104|           13|
|         3|Robust Golden Uns...|      94|            7|
|         4|Smart Ones Classi...|      38|            1|
|         5|Green Chile Anyti...|       5|           13|
|         6|        Dry Nose Oil|      11|           11|
|         7|Pure Coconut Wate...|      98|            7|
|         8|Cut Russet Potato...|     116|            1|
|         9|Light Strawberry ...|     120|           16|
|        10|Sparkling Orange ...|     115|            7|
|        11|   Peach Mango Juice|      31|            7|
|        12|Chocolate Fudge L...|     119|            1|
|        13|   Saline Nasal Mist|      11|           11|
|        14|Fresh Scent Dishw...|      74|           17|
|        15|Overnight Diapers..

## Preprocessing
Prepare dataframe to be used with algortihm

In [4]:
orders_products = orders.join(products, orders.product_id == products.product_id, 'inner')
orders_products.show()

+--------+----------+-----------------+---------+----------+--------------------+--------+-------------+
|order_id|product_id|add_to_cart_order|reordered|product_id|        product_name|aisle_id|department_id|
+--------+----------+-----------------+---------+----------+--------------------+--------+-------------+
|       1|     49302|                1|        1|     49302|    Bulgarian Yogurt|     120|           16|
|       1|     11109|                2|        1|     11109|Organic 4% Milk F...|     108|           16|
|       1|     10246|                3|        0|     10246|Organic Celery He...|      83|            4|
|       1|     49683|                4|        0|     49683|      Cucumber Kirby|      83|            4|
|       1|     43633|                5|        1|     43633|Lightly Smoked Sa...|      95|           15|
|       1|     13176|                6|        0|     13176|Bag of Organic Ba...|      24|            4|
|       1|     47209|                7|        0|     4

In [5]:
#group products in same order into one array column
grouped_df = orders_products.groupBy('order_id').agg(collect_list('product_name').alias('products'))

grouped_df.show(truncate=70)

+--------+----------------------------------------------------------------------+
|order_id|                                                              products|
+--------+----------------------------------------------------------------------+
|       1|[Bulgarian Yogurt, Organic 4% Milk Fat Whole Milk Cottage Cheese, O...|
|  100000|[I Heart Baby Kale, Gala Apples, Organic Yellow Onion, Organic Baby...|
| 1000008|[ZzzQuil Liquid Warming Berry Flavor Sleep-Aid, G Series Perform Fr...|
| 1000029|[Almond Chia Granola Clusters, Fresh Cauliflower, Orange Bell Peppe...|
|  100003|                                       [Organic Salted Butter, Cereal]|
| 1000046|[Organic Red Bell Pepper, Organic Avocado, Parsley, Italian (Flat),...|
| 1000080|[Oatneal Cookie Ice Cream, Raw Cheddar Chips, Full Head Medium Toot...|
| 1000162|[Mexican Style Rice & Pasta, Special K Sausage, Egg & Cheese Flatbr...|
| 1000197|    [Sesame Topped Hamburger Buns, ProteinPLUS Multigrain Penne Pasta]|
| 1000209|[Organ

split data into train and test datasets
Will do an 80% - 20% split

In [6]:
train, test = grouped_df.randomSplit([0.8, 0.2], seed=289)

#### Set Model Parameters
our dataset has **49688** items
while testing diffenrt parameters, I found that a
minimum support of **0.01** and a minimum confidence of **0.10**
will provide us with good number of association rules

In [7]:
fpGrowth = FPGrowth(itemsCol="products", minSupport=0.01, minConfidence=0.10)

## Train Model

In [8]:
model = fpGrowth.fit(train)

In [9]:
frequent_items = model.freqItemsets
frequent_items.show(10)



+--------------------+----+
|               items|freq|
+--------------------+----+
|   [Unsalted Butter]|1152|
|        [Whole Milk]|1061|
|[Organic Reduced ...|1232|
|     [Russet Potato]|1128|
|     [Organic Lemon]|2798|
|[Organic Lacinato...|1403|
|[Grape White/Gree...|1199|
|       [Blueberries]|1854|
|[Extra Virgin Oli...|1664|
| [Green Bell Pepper]|2016|
+--------------------+----+
only showing top 10 rows



In [10]:
association_rules = model.associationRules
association_rules.show(10)

+--------------------+--------------------+-------------------+------------------+--------------------+
|          antecedent|          consequent|         confidence|              lift|             support|
+--------------------+--------------------+-------------------+------------------+--------------------+
|       [Large Lemon]|             [Limes]|0.19681421350896003|  4.29177994332976| 0.01227667908665329|
|       [Large Lemon]|   [Organic Avocado]|0.16342472047786796| 2.903201882623632|0.010193942868061526|
|       [Large Lemon]|            [Banana]| 0.2655843161280441|1.8575817152771383| 0.01656635139008312|
|      [Strawberries]|            [Banana]|  0.302759424795958| 2.117596324316266|0.014884876277825547|
|[Organic Strawber...|[Bag of Organic B...| 0.2797873569860164| 2.378215255459342|0.023129836629406708|
|[Organic Strawber...|            [Banana]|0.20073962787472552|1.4040372101334795|0.016595012897678417|
|[Organic Strawber...|[Organic Raspberr...|0.15081474633075234| 

In [11]:
print("number of frequent items and associations rules")
frequent_items.count(), association_rules.count()

number of frequent items and associations rules


(118, 31)

## Evaluate Model

In [12]:
predictions = model.transform(test)
predictions.show(40, truncate=50)

+--------+--------------------------------------------------+--------------------------------------------------+
|order_id|                                          products|                                        prediction|
+--------+--------------------------------------------------+--------------------------------------------------+
|       1|[Bulgarian Yogurt, Organic 4% Milk Fat Whole Mi...|[Organic Strawberries, Organic Raspberries, Org...|
| 1000008|[ZzzQuil Liquid Warming Berry Flavor Sleep-Aid,...|                                                []|
| 1000080|[Oatneal Cookie Ice Cream, Raw Cheddar Chips, F...|                                                []|
| 1000264|[Organic 2% Reduced Fat Milk, Organic Lowfat 1%...|[Organic Strawberries, Organic Raspberries, Org...|
| 1000277|[Bag of Jumbo Yellow Onions, Rainbow Bell Peppe...|                                                []|
| 1000520|[Choose-A-Size White Paper Towels 8 Giant Rolls...|                                   