In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list, countDistinct
from pyspark.ml.fpm import FPGrowth

spark = SparkSession.builder.master("local").appName("MIC").getOrCreate()

## Load Data
### Orders

In [2]:
path = "D:/datamic_instacart-market-basket-analysis/order_products__train.csv/order_products__train.csv"

orders = spark.read.option('header', 'true').csv(path)
display(orders.toPandas())

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
...,...,...,...,...
1384612,3421063,14233,3,1
1384613,3421063,35548,4,1
1384614,3421070,35951,1,1
1384615,3421070,16953,2,1


### Products

In [3]:
path = "D:/datamic_instacart-market-basket-analysis/products.csv/products.csv"

products = spark.read.option('header', 'true').csv(path)
display(products.toPandas())

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13
...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5
49684,49685,En Croute Roast Hazelnut Cranberry,42,1
49685,49686,Artisan Baguette,112,3
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8


## Preprocessing
Prepare dataframe to be used with algortihm

In [4]:
orders_products = orders.join(products, orders.product_id == products.product_id, 'inner')
display(orders_products.toPandas())

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_id.1,product_name,aisle_id,department_id
0,1,49302,1,1,49302,Bulgarian Yogurt,120,16
1,1,11109,2,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16
2,1,10246,3,0,10246,Organic Celery Hearts,83,4
3,1,49683,4,0,49683,Cucumber Kirby,83,4
4,1,43633,5,1,43633,Lightly Smoked Sardines in Olive Oil,95,15
...,...,...,...,...,...,...,...,...
1384612,3421063,14233,3,1,14233,Natural Artesian Water,115,7
1384613,3421063,35548,4,1,35548,Twice Baked Potatoes,13,20
1384614,3421070,35951,1,1,35951,Organic Unsweetened Almond Milk,91,16
1384615,3421070,16953,2,1,16953,Creamy Peanut Butter,88,13


In [5]:
#group products in same order into one array column
grouped_df = orders_products.groupBy('order_id').agg(collect_list('product_name').alias('products'))

display(grouped_df.toPandas())

Unnamed: 0,order_id,products
0,1,"[Bulgarian Yogurt, Organic 4% Milk Fat Whole M..."
1,100000,"[I Heart Baby Kale, Gala Apples, Organic Yello..."
2,1000008,[ZzzQuil Liquid Warming Berry Flavor Sleep-Aid...
3,1000029,"[Almond Chia Granola Clusters, Fresh Cauliflow..."
4,100003,"[Organic Salted Butter, Cereal]"
...,...,...
131204,999879,"[Organic Gala Apples, Seedless Red Grapes, Ban..."
131205,999926,"[Total 0% Greek Yogurt, Whole Strawberries, Li..."
131206,999964,[Original Almondmilk]
131207,999971,"[Organic Baby Spinach, Frozen Organic Blueberr..."


split data into train and test datasets
Will do an 80% - 20% split

In [6]:
train, test = grouped_df.randomSplit([0.8, 0.2], seed=289)

### Model Parameters

[FP-growth](https://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html) implementation takes the following parameters:<br>

**minSupport**: the minimum support for an itemset to be identified as frequent. For example, if an item appears 3 out of 5 transactions, it has a support of 3/5=0.6. <br>

**minConfidence**: minimum confidence for generating Association Rule. Confidence is an indication of how often an association rule has been found to be true. For example, if in the transactions itemset X appears 4 times, X and Y co-occur only 2 times, the confidence for the rule X => Y is then 2/4 = 0.5. The parameter will not affect the mining for frequent itemsets, but specify the minimum confidence for generating association rules from frequent itemsets. <br>


# Train and Evaluate models

## Testing different support values
---
#### minimum support = 0.01 and minimum confidence = 0.2

In [8]:
fpGrowth = FPGrowth(itemsCol="products", minSupport=0.01, minConfidence=0.2)

model = fpGrowth.fit(train)

In [9]:
cart = spark.createDataFrame([
    (1, ["Compostable Forks"]),
    (2, ["Skim Milk"]),
    (3, ["Organic Milk"]),
    (4, ["Half & Half"]),
    (5, ["Blackberries"]),
    (6, ["Organic Egg Whites"])
], ["order_id", "products"])

In [10]:
pred = model.transform(cart)
pred.show(truncate=False)

+--------+--------------------+----------+
|order_id|products            |prediction|
+--------+--------------------+----------+
|1       |[Compostable Forks] |[]        |
|2       |[Skim Milk]         |[]        |
|3       |[Organic Milk]      |[]        |
|4       |[Half & Half]       |[]        |
|5       |[Blackberries]      |[]        |
|6       |[Organic Egg Whites]|[]        |
+--------+--------------------+----------+



### No predictions for any of the products

---
#### minimum support = 0.001 and minimum confidence = 0.2

In [11]:
fpGrowth = FPGrowth(itemsCol="products", minSupport=0.001, minConfidence=0.2)
model = fpGrowth.fit(train)

In [12]:
pred = model.transform(cart)
pred.show(truncate=False)

+--------+--------------------+----------------------------------------------+
|order_id|products            |prediction                                    |
+--------+--------------------+----------------------------------------------+
|1       |[Compostable Forks] |[]                                            |
|2       |[Skim Milk]         |[]                                            |
|3       |[Organic Milk]      |[Bag of Organic Bananas]                      |
|4       |[Half & Half]       |[Banana]                                      |
|5       |[Blackberries]      |[Raspberries, Strawberries]                   |
|6       |[Organic Egg Whites]|[Bag of Organic Bananas, Organic Baby Spinach]|
+--------+--------------------+----------------------------------------------+



### Some predictions - Popular products covered but not some of the least popular items

---
#### minimum support = 0.0001 and minimum confidence = 0.2

In [13]:
fpGrowth = FPGrowth(itemsCol="products", minSupport=0.0001, minConfidence=0.2)
model = fpGrowth.fit(train)

In [14]:
pred = model.transform(cart)
pred.show(truncate=False)

+--------+--------------------+----------------------------------------------+
|order_id|products            |prediction                                    |
+--------+--------------------+----------------------------------------------+
|1       |[Compostable Forks] |[Plastic Spoons, Plastic Knives]              |
|2       |[Skim Milk]         |[Banana]                                      |
|3       |[Organic Milk]      |[Bag of Organic Bananas]                      |
|4       |[Half & Half]       |[Banana]                                      |
|5       |[Blackberries]      |[Raspberries, Strawberries]                   |
|6       |[Organic Egg Whites]|[Bag of Organic Bananas, Organic Baby Spinach]|
+--------+--------------------+----------------------------------------------+



### All transactions get predictions even not common items

---
## Testing different confidence values

#### minimum support = 0.0001 and minimum confidence = 0.5

In [15]:
fpGrowth = FPGrowth(itemsCol="products", minSupport=0.0001, minConfidence=0.5)
model = fpGrowth.fit(train)

In [16]:
pred = model.transform(cart)
pred.show(truncate=False)

+--------+--------------------+----------+
|order_id|products            |prediction|
+--------+--------------------+----------+
|1       |[Compostable Forks] |[]        |
|2       |[Skim Milk]         |[]        |
|3       |[Organic Milk]      |[]        |
|4       |[Half & Half]       |[]        |
|5       |[Blackberries]      |[]        |
|6       |[Organic Egg Whites]|[]        |
+--------+--------------------+----------+



### with confidence of 50% none of the items in the cart give a prediction
---
#### minimum support = 0.0001 and minimum confidence = 0.3

In [17]:
fpGrowth = FPGrowth(itemsCol="products", minSupport=0.0001, minConfidence=0.3)
model = fpGrowth.fit(train)

In [18]:
pred = model.transform(cart)
pred.show(truncate=False)

+--------+--------------------+--------------------------------+
|order_id|products            |prediction                      |
+--------+--------------------+--------------------------------+
|1       |[Compostable Forks] |[Plastic Spoons, Plastic Knives]|
|2       |[Skim Milk]         |[]                              |
|3       |[Organic Milk]      |[]                              |
|4       |[Half & Half]       |[]                              |
|5       |[Blackberries]      |[]                              |
|6       |[Organic Egg Whites]|[]                              |
+--------+--------------------+--------------------------------+



### only a few predictions not good enough
---
#### minimum support = 0.0001 and minimum confidence = 0.1

In [19]:
fpGrowth = FPGrowth(itemsCol="products", minSupport=0.0001, minConfidence=0.1)
model = fpGrowth.fit(train)

In [20]:
pred = model.transform(cart)
pred.show(truncate=90)

+--------+--------------------+------------------------------------------------------------------------------------------+
|order_id|            products|                                                                                prediction|
+--------+--------------------+------------------------------------------------------------------------------------------+
|       1| [Compostable Forks]|                                                          [Plastic Spoons, Plastic Knives]|
|       2|         [Skim Milk]|              [Organic Strawberries, Bag of Organic Bananas, Organic Baby Spinach, Banana]|
|       3|      [Organic Milk]|[Organic Strawberries, Bag of Organic Bananas, Organic Raspberries, Organic Baby Spinac...|
|       4|       [Half & Half]|                                    [Organic Strawberries, Bag of Organic Bananas, Banana]|
|       5|      [Blackberries]|[Raspberries, Organic Strawberries, Strawberries, Organic Blueberries, Bag of Organic B...|
|       6|[Organ

### All items get predictions, but there is overlap between most of the items. The pattern is less reliable
---
#### minimum support = 0.0001 and minimum confidence = 0.2

In [21]:
fpGrowth = FPGrowth(itemsCol="products", minSupport=0.0001, minConfidence=0.2)
model = fpGrowth.fit(train)

In [22]:
pred = model.transform(cart)
pred.show(truncate=False)

+--------+--------------------+----------------------------------------------+
|order_id|products            |prediction                                    |
+--------+--------------------+----------------------------------------------+
|1       |[Compostable Forks] |[Plastic Spoons, Plastic Knives]              |
|2       |[Skim Milk]         |[Banana]                                      |
|3       |[Organic Milk]      |[Bag of Organic Bananas]                      |
|4       |[Half & Half]       |[Banana]                                      |
|5       |[Blackberries]      |[Raspberries, Strawberries]                   |
|6       |[Organic Egg Whites]|[Bag of Organic Bananas, Organic Baby Spinach]|
+--------+--------------------+----------------------------------------------+



### With a 20% confidence there is a clearer pattern between the items
### For example: organic products associate with other organic products and blackberries with other types of berries

### After testing the different parameters we can conclude that minimum support of 0.0001 and a minimum confidence of 20% give us a very reliable set association rules
# Evaluate Model

In [23]:
predictions = model.transform(test)

predictions.show()

+--------+--------------------+--------------------+
|order_id|            products|          prediction|
+--------+--------------------+--------------------+
|       1|[Bulgarian Yogurt...|[Organic Cucumber...|
| 1000008|[ZzzQuil Liquid W...|            [Banana]|
| 1000080|[Oatneal Cookie I...|[Peanut Butter Ic...|
| 1000264|[Organic 2% Reduc...|[Organic Strawber...|
| 1000277|[Bag of Jumbo Yel...|[Strawberries, Ba...|
| 1000520|[Choose-A-Size Wh...|                  []|
| 1000834|[Organic & Raw St...|[Organic Strawber...|
| 1000872|[Naked Green Mach...|            [Banana]|
| 1001005|[Organic Grade A ...|[Bag of Organic B...|
| 1001030|[Variety Pack, Ta...|                  []|
| 1001087|[Organic Unsweete...|[Organic Yellow O...|
| 1001139|[Fig Newmans Frui...|[Organic Strawber...|
| 1001169|[Coppertop AA Alk...|                  []|
| 1001225|[Organic Blueberr...|[Organic Strawber...|
|   10015|[Unsweetened Orga...|[Banana, Bag of O...|
| 1001510|[Zero Calorie Gra...|[Bag of Organic