## Import initial required libraries and packages

In [1]:
# Import all required libraries and packages

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp, month, year, date_format, date_sub, to_date, date_add, lag,  dayofweek, dayofmonth, dayofyear, dayofweek
import pandas as pd
import numpy as np
from pyspark.context import SparkContext
from pyspark.sql.functions import approxCountDistinct
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.types import DateType, IntegerType,NumericType
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.ml.feature import Bucketizer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder,CrossValidatorModel

import matplotlib.pyplot as plt
%matplotlib inline

## Load in the clean data 

Here is where we load the cleaned parquet file and take a few initial looks at it

In [2]:
# Load the latest clean dataset

sc = SparkContext('local')
spark = SparkSession(sc)

df = spark.read.load("hdfs://ca4022-m/user/adam/parquet/cleaned2")

In [3]:
# Quick look at how many alcohol transactions we have

df.count()

12591077

In [4]:
# Quick look at the schema features and data types

df.printSchema()

root
 |-- Category: integer (nullable = true)
 |-- Item_Number: integer (nullable = true)
 |-- Vendor_Number: integer (nullable = true)
 |-- County_Number: integer (nullable = true)
 |-- Zip_Code: string (nullable = true)
 |-- Store_Number: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Pack: integer (nullable = true)
 |-- Bottle_Volume_ml: integer (nullable = true)
 |-- State_Bottle_Cost: float (nullable = true)
 |-- State_Bottle_Retail: float (nullable = true)
 |-- Bottles_Sold: integer (nullable = true)
 |-- Sale_Dollars: float (nullable = true)
 |-- Volume_Sold_Liters: double (nullable = true)
 |-- Volume_Sold_Gallons: double (nullable = true)
 |-- invoice_number: string (nullable = true)
 |-- line_number: string (nullable = true)
 |-- Store_Name: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- median_lat: float (nullable = true)
 |-- median_lon: float (nullable = true)
 |-- City: string (nullable = true)
 |-- County: string (nullable = true)


In [6]:
# Example of what a row looks like in this set, having been cleaned and preprocessed

df.show(1)

+--------+-----------+-------------+-------------+--------+------------+----------+----+----------------+-----------------+-------------------+------------+------------+------------------+-------------------+--------------+-----------+---------------+-------------+----------+----------+---------------+------+---------------+--------------------+-----------------+
|Category|Item_Number|Vendor_Number|County_Number|Zip_Code|Store_Number|      Date|Pack|Bottle_Volume_ml|State_Bottle_Cost|State_Bottle_Retail|Bottles_Sold|Sale_Dollars|Volume_Sold_Liters|Volume_Sold_Gallons|invoice_number|line_number|     Store_Name|      Address|median_lat|median_lon|           City|County|    Vendor_Name|    Item_Description|    Category_Name|
+--------+-----------+-------------+-------------+--------+------------+----------+----+----------------+-----------------+-------------------+------------+------------+------------------+-------------------+--------------+-----------+---------------+-------------+---

In [7]:
# Quick look at some store numbers alongside the store names

df.select(["Store_Number","Store_Name"]).dropDuplicates().show(5)

+------------+--------------------+
|Store_Number|          Store_Name|
+------------+--------------------+
|        3162|Nash Finch / Whol...|
|        4909|Pump N Pak Rock V...|
|        3056|Clarion Super Val...|
|        4180|Smokin' Joe's #10...|
|        9002|Mississippi River...|
+------------+--------------------+
only showing top 5 rows



## Create derived sales features

We need some derived, aggregated, sales values to work with.

First we calculate how much sales in USD each item generates in each store.

We also want to get the total sales in USD that each store has throughout the whole period within the dataset.

Then we join these together in a table, sale_by_store_by_item

In [3]:
# Calculate the most popular liquor items in terms of sales for each store

sale_by_store_item = df.groupBy("Store_Number", "Item_Number").sum("Sale_Dollars")

In [4]:
# Calculate total sales in US dollars for each store

sales_by_store = df.groupBy("Store_Number").sum("Sale_Dollars")

In [5]:
# rename columns

sales_by_store = sales_by_store.withColumnRenamed("sum(Sale_Dollars)", "store_total_sale_dollars")

In [6]:
# Rename columns

sales_by_store = sales_by_store.withColumnRenamed("Store_Number", "store_num")

In [7]:
# join two tables

sale_by_store_by_item = sale_by_store_item.alias("a").join(sales_by_store\
                     .alias("b"),sale_by_store_item['Store_Number'] == sales_by_store['store_num'],how='left')

In [8]:
sale_by_store_by_item.show(5)

+------------+-----------+-----------------+---------+------------------------+
|Store_Number|Item_Number|Sale_Dollars     |store_num|store_total_sale_dollars|
+------------+-----------+-----------------+---------+------------------------+
|        2659|      28866|215.8800048828125|     2659|      131108.44009304047|
|        2659|      87408|298.8999938964844|     2659|      131108.44009304047|
|        2659|      81206|933.8399658203125|     2659|      131108.44009304047|
|        2659|      12888|2262.480010986328|     2659|      131108.44009304047|
|        2659|      43197|            38.25|     2659|      131108.44009304047|
+------------+-----------+-----------------+---------+------------------------+
only showing top 5 rows



Because we now have this table containing the total sales generated by each product within each store, and the total sales that the store generates,  we can now derive what percentage of a store's total sales are generated by each product.

We now do this and add that to the table.

In [9]:
# add a "percentage of total store sales for each item" column

sale_by_store_by_item = sale_by_store_by_item.withColumn("perc_of_store_total_sale", 
                                                         sale_by_store_by_item["sum(Sale_Dollars)"]/ sale_by_store_by_item["store_total_sale_dollars"] )


In [15]:
# Quick look at what the table now contains

sale_by_store_by_item.show(5)

+------------+-----------+-----------------+---------+------------------------+------------------------+
|Store_Number|Item_Number|Sale_Dollars     |store_num|store_total_sale_dollars|perc_of_store_total_sale|
+------------+-----------+-----------------+---------+------------------------+------------------------+
|        2659|      28866|215.8800048828125|     2659|      131108.44009304047|    0.001646575954451...|
|        2659|      87408|298.8999938964844|     2659|      131108.44009304047|    0.002279792160477...|
|        2659|      81206|933.8399658203125|     2659|      131108.44009304047|    0.007122653317800269|
|        2659|      12888|2262.480010986328|     2659|      131108.44009304047|    0.017256555027126936|
|        2659|      43197|            38.25|     2659|      131108.44009304047|    2.917432315788066E-4|
+------------+-----------+-----------------+---------+------------------------+------------------------+
only showing top 5 rows



The next step is take a look at each store, and rank the items it sells from most popular to least popular.

This is done based on the total sales for each product, in USD

In [10]:
# import some needed pyspark.sql functions

from pyspark.sql.functions import *
from pyspark.sql.window import Window

# For each store, we can rank all items from most popular to least popular based on sales in US dollars 

ranked =  sale_by_store_by_item.withColumn("rank", dense_rank().over(Window.partitionBy("store_num").orderBy(desc("perc_of_store_total_sale"))))

In [20]:
# quick look at what this ranked list looks like

ranked.show(10)

+------------+-----------+------------------+---------+------------------------+------------------------+----+
|Store_Number|Item_Number| Sale_Dollars     |store_num|store_total_sale_dollars|perc_of_store_total_sale|rank|
+------------+-----------+------------------+---------+------------------------+------------------------+----+
|        2659|      11776| 6132.840026855469|     2659|      131108.44009304047|     0.04677685145596522|   1|
|        2659|      34433| 4931.760009765625|     2659|      131108.44009304047|    0.037615885035820924|   2|
|        2659|      11788| 4882.559829711914|     2659|      131108.44009304047|     0.03724062178031429|   3|
|        2659|      25608|  4727.40007019043|     2659|      131108.44009304047|     0.03605717577629368|   4|
|        2659|      37998| 4516.200050354004|     2659|      131108.44009304047|     0.03444629535023912|   5|
|        2659|      23828| 4490.279945373535|     2659|      131108.44009304047|     0.03424859560671326|   6|
|

"ranked" now contains an ordered ranking for each item, for each store in Iowa, just like what we wanted.

We now need to convert this to a ranking based ratings system. To do this we create 10 buckets in which items get placed.

The top 10% of items in each store in terms of total sales in USD will receive a rating of 9, the next highest 10% receive a rating of 8 and so on.

If we compare the "ranked" table above to the "ratings" one after this process, we can see that the top 10 have all received this max rating of 9, as we would expect.

In [11]:
# use QuantileDiscretizer to place all of the ranked items into ten ratings buckets, 
# equally distributed based on their position in the ranking 

store_item_rated = QuantileDiscretizer(numBuckets=10, inputCol="perc_of_store_total_sale",outputCol="rating")\
.fit(sale_by_store_by_item).transform(sale_by_store_by_item)

In [23]:
# take a look at what these ratings looks like 

store_item_rated.filter(store_item_rated.Store_Number == 2659).sort('perc_of_store_total_sale',ascending=False)\
.show(50)

+------------+-----------+------------------+---------+------------------------+------------------------+------+
|Store_Number|Item_Number| Sale_Dollars     |store_num|store_total_sale_dollars|perc_of_store_total_sale|rating|
+------------+-----------+------------------+---------+------------------------+------------------------+------+
|        2659|      11776| 6132.840026855469|     2659|      131108.44009304047|     0.04677685145596522|   9.0|
|        2659|      34433| 4931.760009765625|     2659|      131108.44009304047|    0.037615885035820924|   9.0|
|        2659|      11788| 4882.559829711914|     2659|      131108.44009304047|     0.03724062178031429|   9.0|
|        2659|      25608|  4727.40007019043|     2659|      131108.44009304047|     0.03605717577629368|   9.0|
|        2659|      37998| 4516.200050354004|     2659|      131108.44009304047|     0.03444629535023912|   9.0|
|        2659|      23828| 4490.279945373535|     2659|      131108.44009304047|     0.034248595

We showed 50 above so we can see when the ratings start to drop under 9. We can see that towards the end of the 50, we start to have some 8s

Next we want to take the important columns that the ALS recommender model needs and store them in "ratings". 

We take the store number, the item number, the rating it received and the percentage of total sales it generates for the store.

In [12]:
# carrying on with the important features, stored in "ratings"

ratings = store_item_rated.select("Store_Number","Item_Number", "rating","perc_of_store_total_sale")

In [23]:
# quick look at what the "ratings" table looks like

ratings.show(5)

+------------+-----------+------+------------------------+
|Store_Number|Item_Number|rating|perc_of_store_total_sale|
+------------+-----------+------+------------------------+
|        2659|      28866|   7.0|    0.001646575954451...|
|        2659|      87408|   7.0|    0.002279792160477...|
|        2659|      81206|   9.0|    0.007122653317800269|
|        2659|      12888|   9.0|    0.017256555027126936|
|        2659|      43197|   4.0|    2.917432315788066E-4|
+------------+-----------+------+------------------------+
only showing top 5 rows



The next few lines are required to fix the Item Number column, casting it to INT so ALS can read it properly.

We also drop any nulls here.

In [13]:
df = df.withColumn("Item_Number", regexp_extract("Item_Number", "\\d+", 0))

In [14]:
ratings = ratings.withColumn("Item_Number", regexp_extract("Item_Number", "\\d+", 0))

In [15]:
ratings = ratings.withColumn("Item_Number", ratings["Item_Number"].cast(IntegerType()))

In [16]:
ratings = ratings.withColumnRenamed("Item_Number", "item_num")

In [17]:
ratings = ratings.dropna()

## Training and test split

It's time to split the ratings data into a training and test split of 0.8/0.2

In [80]:
# split the data into training and test

training, test = ratings.randomSplit([0.8,0.2], seed = 36)

Now that we have our training and test split, we can create and train our ALS model. 

Alternating Least Square (ALS) runs itself in a parallel fashion. ALS is implemented in Apache Spark ML and built for a large-scale collaborative filtering problems.

Our "User" we want to recommend items to are our liquor stores and the "Items" we want to recommend to the user are liquor products. We do not implement a "cold start" strategy as we know a lot about each store based on how the products within them currently rank

In [81]:
# create our model

als = ALS(maxIter=10, regParam=0.01, userCol="Store_Number", itemCol="item_num", ratingCol="rating",
          coldStartStrategy="drop", nonnegative = True)

In [82]:
# train out model by fitting it to the training data

model = als.fit(training)

In [83]:
# Apply model on the test set to predict 

predictions = model.transform(test)

In [84]:
# quick sample of predictions for sanity check

predictions.show(50)

+------------+--------+------+------------------------+------------+
|Store_Number|item_num|rating|perc_of_store_total_sale|  prediction|
+------------+--------+------+------------------------+------------+
|        4174|   10623|   7.0|    0.001464883876655...|   4.5173454|
|        2627|   10623|   0.0|    4.761695292000214E-6|   1.0178543|
|        4129|   10623|   0.0|    1.676564976479242...|   0.2742364|
|        2448|   10623|   0.0|    2.621142842165380...|0.0092287455|
|        2622|   21220|   1.0|    4.289953009793906...|  0.17363909|
|        4773|   21220|   2.0|    1.217239627645284E-4|   2.1917303|
|        4327|   21220|   4.0|    2.430985772381577...|   5.1853323|
|        2633|   21220|   0.0|    1.004612157969494...| 0.039267782|
|        4288|   21220|   1.0|    5.051777354313654...|   1.9180535|
|        2560|   21220|   0.0|    1.353293021502036...|   0.2180132|
|        2190|   21220|   0.0|    1.834473321788793...|  0.39269286|
|        2552|   21220|   0.0|    

In [85]:
df_alt = df.withColumnRenamed("Store_Number", "store_num_b")

We now want to create a table of predictions so we can evaluate how the model performed on the test data

In [86]:
# create table of predictions

predictions = predictions.join(df_alt,df_alt.store_num_b == predictions.Store_Number)

In [61]:
# evaluate model performance

evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating')
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.4468538985486474


We can see that our RMSE is 1.447. With the range of our ratings system being between 1 and 10, we can see that this isn't too bad and it's likely that we can get some useful results. If a product actually had a rating of 8 and it received a 6.663, this seems pretty good at a glance.

Next we can see that we could take two approaches using this ALS model. We can recommend products to stores as directly below:

In [88]:
# get top 10 liquor recommendations for 3 different stores

users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)

In [89]:
userSubsetRecs.show(truncate=False)

+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Store_Number|recommendations                                                                                                                                                                                                   |
+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|3918        |[[902767, 24.220943], [904827, 19.688126], [904984, 18.685905], [901610, 15.277443], [904605, 14.816101], [988063, 14.655827], [903032, 14.406505], [902551, 14.207243], [973448, 13.987808], [902585, 13.973974]]|
|2659        |[[989640, 21.285685], [902576, 20.985794], [965269, 20.293386], [965221, 20.22396]

The other approach that's possible is a reverse if the previous, in that we can actually take the liquor product itself and recommend a list of stores that it would probably sell well in 

In [90]:
# get top 10 store recommendation for 3 different liquor items

liquorRecs = model.recommendForAllItems(10)

In [91]:
liquors = ratings.select(als.getItemCol()).distinct().limit(3)
liquorSubsetRecs = model.recommendForItemSubset(liquors, 10)

In [92]:
liquorSubsetRecs.show(truncate=False)

+--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|item_num|recommendations                                                                                                                                                                              |
+--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|65251   |[[5304, 10.073424], [3569, 9.652113], [5424, 9.438894], [4338, 9.427406], [4693, 9.412919], [5208, 9.393775], [5433, 9.390767], [4857, 9.263572], [4834, 9.244173], [5410, 9.237768]]        |
|43302   |[[4990, 9.792597], [5304, 9.735309], [5285, 9.4549465], [4832, 9.453925], [5492, 9.427067], [5256, 9.3855095], [4465, 9.202315], [5461, 9.163044], [5365, 9.162868], [5310, 9.162399]]    

We can see how both approaches can be useful depending on the use case. 

If we take the perspective of the store who is purchasing the liquor, it's of course useful to know what liquor is likely to sell well in your store.

If you look from the perspective of the company who makes the liquor however, it would be very useful to identify which stores you should be selling it to.

Now we want to get a list of the top stores in terms of total sales so we can use one of them for a small case study. 

In [93]:
# get list of top stores

top_stores = df.groupBy(df['Store_Number']).agg({'Sale_Dollars':"sum"}).sort("sum(Sale_Dollars)", ascending=False).dropna().limit(20)

In [95]:
top20_store_list = [row.Store_Number for row in top_stores.select("Store_Number").collect()]

In [96]:
# display list of top 20 stores

top20_store_list

[2633,
 4829,
 2512,
 3385,
 3420,
 3952,
 3814,
 2190,
 3354,
 3773,
 2625,
 3494,
 3447,
 2616,
 3524,
 3820,
 2619,
 2629,
 2593,
 2648]

In [97]:
# collect all of the recommendations for these top 20 stores

rec_liquor_top20_store = {}
for store in top20_store_list:
    rec_liquor = userRecs.where(userRecs.Store_Number == store).select("recommendations").collect()
    rec_liquor_top10_store[store] = [i.item_num for i in rec_liquor[0]["recommendations"]]

In [98]:
# create list of all individual liquor products

liquor_product_list = df.select('Item_Number','Item_Description','Category_Name').dropDuplicates()
liquor_product_list = liquor_product_list.dropDuplicates(['Item_Number'])

Now that we have our list of top 20 stores, we want to grab one at random to use in our small case study

In [113]:
# Find the name of a store to use in our example

df_alt.filter(df_alt.store_num_b==3494).select('store_num_b', "Store_Name")\
.show(1, truncate = False)

+-----------+--------------------------+
|store_num_b|Store_Name                |
+-----------+--------------------------+
|3494       |Sam's Club 6514 / Waterloo|
+-----------+--------------------------+
only showing top 1 row



We went with 3494 which turns out to be "Sam's Club" in Waterloo

Next we want to get a list of the top 10 products that our model recommends for this store to stock, along with the top 10 products currently sell the best in the store.

In [114]:
sam_recommend = liquor_product_list.filter(liquor_product_list["Item_Number"].isin(rec_liquor_top10_store[3494]))

In [115]:
# Take a quick look at the recommended items for Sam's Club, Waterloo

sam_recommend.show(10)

+-----------+--------------------+--------------------+
|Item_Number|    Item_Description|       Category_Name|
+-----------+--------------------+--------------------+
|     904476|8 Seconds Canadia...| Special Order Items|
|     967286|Ice Hole Salty Ca...|   American Schnapps|
|     903037|     Agavero Liqueur| Special Order Items|
|       1016|Glenlivet w/2 Gla...|DECANTERS & SPECI...|
|     977256|  Revel Stoke Spiced|   CANADIAN WHISKIES|
|     902585|     Belvedere Vodka|      IMPORTED VODKA|
|     904605|      Sobieski Vodka|      IMPORTED VODKA|
|     902120|       Fernet Branca|MISC. IMPORTED CO...|
|     904884|Ice Hole Exotic S...| Special Order Items|
|      80504|Tres Leches Cream...|      CREAM LIQUEURS|
+-----------+--------------------+--------------------+



In [116]:
sam_actual = sale_by_store_by_item.join(liquor_product_list ,sale_by_store_by_item["Item_Number"] == liquor_product_list["Item_Number"], how='inner')\
.filter(sale_by_store_by_item.store_num==3494)\
.select("store_num", sale_by_store_by_item.Item_Number, "Item_Description","Category_Name",'sum(Sale_Dollars)')\
.dropDuplicates(["Item_Number"])\
.sort("sum(Sale_Dollars)", ascending=False)

In [117]:
# Take a quick look at the actual best selling items for Sam's Club, Waterloo

sam_actual.show(10)

+---------+-----------+--------------------+--------------------+------------------+
|store_num|Item_Number|    Item_Description|       Category_Name|      Sale_Dollars|
+---------+-----------+--------------------+--------------------+------------------+
|     3494|      43337|Captain Morgan Sp...|          SPICED RUM|         371714.27|
|     3494|      77487|   Tortilla Gold Dss|DISTILLED SPIRITS...|         325368.19|
|     3494|      11788|        Black Velvet|   CANADIAN WHISKIES|         308781.77|
|     3494|      26827|Jack Daniels Old ...|  TENNESSEE WHISKIES|         302438.88|
|     3494|      11297|Crown Royal Canad...|   CANADIAN WHISKIES|         274505.39|
|     3494|      43338|Captain Morgan Sp...|          SPICED RUM|         221117.39|
|     3494|      65257|Jagermeister Liqueur|MISC. IMPORTED CO...|         212291.51|
|     3494|      34007|Absolut Swedish V...|     Imported Vodkas|         208327.44|
|     3494|      88296|Patron Tequila Si...|             TEQUILA|

In [135]:
# convert to pandas for output

sam_recommend_pd = sam_recommend.toPandas()
sam_recommend_pd['Category_Name'] = sam_recommend_pd['Category_Name'].str.title()


In [136]:
# convert to pandas for output

sam_actual_pd = sam_actual.toPandas()
sam_actual_pd['Category_Name'] = sam_actual_pd['Category_Name'].str.title()


## List of 10 highest recommended new items for Sam's Club, Waterloo

In [137]:
sam_recommend_pd

Unnamed: 0,Item_Number,Item_Description,Category_Name
0,904476,8 Seconds Canadian Whiskey,Special Order Items
1,967286,Ice Hole Salty Caramel Schnapps,American Schnapps
2,903037,Agavero Liqueur,Special Order Items
3,1016,Glenlivet w/2 Glasses,Decanters & Specialty Packages
4,977256,Revel Stoke Spiced,Canadian Whiskies
5,902585,Belvedere Vodka,Imported Vodka
6,904605,Sobieski Vodka,Imported Vodka
7,902120,Fernet Branca,Misc. Imported Cordials & Liqueurs
8,904884,Ice Hole Exotic Schnapps,Special Order Items
9,80504,Tres Leches Cream Liqueur,Cream Liqueurs


## List of actual top 10 best selling items in Sam's Club, Waterloo

In [138]:
sam_actual_pd = sam_actual_pd.rename(columns={"store_num":"Store_Num", "sum(Sale_Dollars)":"Total_Sales_USD"})
sam_actual_pd.round(2).head(10)

Unnamed: 0,Store_Num,Item_Number,Item_Description,Category_Name,Total_Sales_USD
0,3494,43337,Captain Morgan Spiced Rum,Spiced Rum,371714.28
1,3494,77487,Tortilla Gold Dss,Distilled Spirits Specialty,325368.2
2,3494,11788,Black Velvet,Canadian Whiskies,308781.78
3,3494,26827,Jack Daniels Old #7 Black Lbl,Tennessee Whiskies,302438.88
4,3494,11297,Crown Royal Canadian Whisky,Canadian Whiskies,274505.4
5,3494,43338,Captain Morgan Spiced Rum,Spiced Rum,221117.4
6,3494,65257,Jagermeister Liqueur,Misc. Imported Cordials & Liqueurs,212291.52
7,3494,34007,Absolut Swedish Vodka 80 Prf,Imported Vodkas,208327.44
8,3494,88296,Patron Tequila Silver,Tequila,159508.68
9,3494,69637,Dr. Mcgillicuddy's Cherry Schnapps,Imported Schnapps,157408.56


## Thoughts on this recommendation example

It's clear to see why certain products may have been recommended to Sam's Club in this case.

Two of the recommneded products are Canadian Whiskeys, "8 Seconds Canadian Whiskey" and "Revel Stoke Spiced".

We can see that two of Sam's Club's biggest current sellers are also Canadian Whiskeys, "Black Velvet" and "Crown Royal".

This makes intuitive sense and serves as a nice sanity check for our system. It's likely that if certain types of liqour already sell well, than other products of that same type may also sell well.

A second cofirmation that the recommender is doing its job is the presence of two sweet schnapps in the recommendations, "Ice Hole Salty Caramel Schnapps" and "Ice Hole Exotic Schnapps", and the presence of another sweet schnapps in the actual best selling list - "Dr. Mcgillicuddy's Cherry Schnapps"