## Introduction

- This is a concise notebook on running the implicit library
- Data can be found from the following link: https://archive.ics.uci.edu/ml/datasets/online+retail
- More details can be found in the medium article: https://medium.com/@zaishanweng/quick-start-guide-to-build-a-collaborative-filtering-recommendation-system-with-implicit-library-in-c5e79e35dfb8

In [1]:
import pandas as pd
import numpy as np
import implicit
import scipy.sparse as sparse

In [2]:
# Read in dataset
df_raw = pd.read_excel("./data/Online Retail.xlsx")

In [3]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [4]:
# Dropping records with no CustomerID
df_raw.dropna(subset=["CustomerID"], inplace=True)

In [5]:
df_raw["CustomerID"] = df_raw["CustomerID"].astype("int64")
df_raw["StockCode"] = df_raw["StockCode"].astype("str")

In [6]:
df_raw["Sales"] = df_raw["Quantity"] * df_raw["UnitPrice"]

In [7]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    406829 non-null  object        
 1   StockCode    406829 non-null  object        
 2   Description  406829 non-null  object        
 3   Quantity     406829 non-null  int64         
 4   InvoiceDate  406829 non-null  datetime64[ns]
 5   UnitPrice    406829 non-null  float64       
 6   CustomerID   406829 non-null  int64         
 7   Country      406829 non-null  object        
 8   Sales        406829 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(4)
memory usage: 31.0+ MB


## Filter out records on for Customers who bought more than n items

In [8]:
df_items_per_cust = (
    df_raw.groupby(["CustomerID"]).agg({"StockCode": "nunique"}).reset_index()
)

In [9]:
df_items_per_cust.columns = ["CustomerID", "Count_item_cust"]

In [10]:
# Setting of Threshold
item_in_cust_threshold = 6

In [11]:
# Filtering Results
mask = df_items_per_cust["Count_item_cust"] >= item_in_cust_threshold
valid_cust = set(df_items_per_cust.loc[mask, "CustomerID"].tolist())

In [12]:
df_filter_cust = df_raw[df_raw["CustomerID"].isin(valid_cust)].copy()

In [13]:
invoiceno_filter_cust = set(df_filter_cust["InvoiceNo"].tolist())

## Filter out items only for those that are bought by multiple customers

In [14]:
df_custs_per_item = (
    df_raw.groupby(["StockCode"]).agg({"CustomerID": "nunique"}).reset_index()
)

In [15]:
df_custs_per_item.columns = ["StockCode", "Count_cust_item"]

In [16]:
df_custs_per_item["Count_cust_item"].value_counts()

1      189
2      145
3      116
6       83
5       81
      ... 
407      1
274      1
319      1
256      1
379      1
Name: Count_cust_item, Length: 379, dtype: int64

In [17]:
# Set threshold
cust_in_item_threshold = 6

In [18]:
# Filter Results
mask = df_custs_per_item["Count_cust_item"] >= cust_in_item_threshold
valid_stockcode = set(df_custs_per_item.loc[mask, "StockCode"].tolist())

In [19]:
df_filter_item = df_raw[df_raw["StockCode"].isin(valid_stockcode)].copy()

In [20]:
invoiceno_filter_item = set(df_filter_item["InvoiceNo"].tolist())

In [21]:
invoiceno_intersect = set.intersection(invoiceno_filter_item, invoiceno_filter_cust)

In [22]:
print(f"No. of invoice after filtering customer: {len(invoiceno_filter_cust)}")
print(f"No. of invoice after filtering item: {len(invoiceno_filter_item)}")
print(f"No. of invoice from intersect: {len(invoiceno_intersect)}")

No. of invoice after filtering customer: 21604
No. of invoice after filtering item: 22136
No. of invoice from intersect: 21557


In [23]:
df_filter_cust_item = df_raw[df_raw["InvoiceNo"].isin(invoiceno_intersect)].copy()

In [24]:
df_filter_cust_item.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 405521 entries, 0 to 541908
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    405521 non-null  object        
 1   StockCode    405521 non-null  object        
 2   Description  405521 non-null  object        
 3   Quantity     405521 non-null  int64         
 4   InvoiceDate  405521 non-null  datetime64[ns]
 5   UnitPrice    405521 non-null  float64       
 6   CustomerID   405521 non-null  int64         
 7   Country      405521 non-null  object        
 8   Sales        405521 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(4)
memory usage: 30.9+ MB


## Apply Collaborative Filtering with Implicit LIbrary

In [25]:
unique_customers = df_filter_cust_item["CustomerID"].unique()
cust_ids = dict(
    zip(unique_customers, np.arange(unique_customers.shape[0], dtype=np.int32))
)

unique_items = df_filter_cust_item["StockCode"].unique()
item_ids = dict(zip(unique_items, np.arange(unique_items.shape[0], dtype=np.int32)))

df_filter_cust_item["cust_id"] = df_filter_cust_item["CustomerID"].apply(
    lambda i: cust_ids[i]
)
df_filter_cust_item["item_id"] = df_filter_cust_item["StockCode"].apply(
    lambda i: item_ids[i]
)

In [26]:
print(f"{len(cust_ids)}, {len(item_ids)}")

4017, 3671


In [27]:
df_filter_cust_item.sort_values(by=["StockCode"], inplace=True)
df_filter_cust_item.head(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Sales,cust_id,item_id
143930,548714,10002,INFLATABLE POLITICAL GLOBE,2,2011-04-03 15:07:00,0.85,17337,United Kingdom,1.7,1329,31
160128,550452,10002,INFLATABLE POLITICAL GLOBE,1,2011-04-18 12:56:00,0.85,14525,United Kingdom,0.85,679,31
5466,536863,10002,INFLATABLE POLITICAL GLOBE,1,2010-12-03 11:19:00,0.85,17967,United Kingdom,0.85,202,31
75792,542610,10002,INFLATABLE POLITICAL GLOBE,14,2011-01-30 14:05:00,0.85,13148,United Kingdom,11.9,602,31
20617,538069,10002,INFLATABLE POLITICAL GLOBE,8,2010-12-09 14:08:00,0.85,16795,United Kingdom,6.8,544,31
21507,538086,10002,INFLATABLE POLITICAL GLOBE,10,2010-12-09 14:44:00,0.85,12872,United Kingdom,8.5,553,31
21551,538093,10002,INFLATABLE POLITICAL GLOBE,12,2010-12-09 14:49:00,0.85,12682,France,10.2,217,31
77513,542735,10002,INFLATABLE POLITICAL GLOBE,12,2011-01-31 15:36:00,0.85,12681,France,10.2,380,31
22380,538167,10002,INFLATABLE POLITICAL GLOBE,12,2010-12-09 18:58:00,0.85,14713,United Kingdom,10.2,575,31
23275,538196,10002,INFLATABLE POLITICAL GLOBE,36,2010-12-10 10:56:00,0.85,12731,France,30.6,588,31


In [28]:
df_cust_item_qty = (
    df_filter_cust_item.groupby(["cust_id", "item_id"])
    .agg({"Quantity": "sum"})
    .reset_index()
)

In [29]:
df_cust_item_qty.head()

Unnamed: 0,cust_id,item_id,Quantity
0,0,0,122
1,0,1,122
2,0,2,108
3,0,3,110
4,0,4,104


In [30]:
# Create Sparse Matrix

sparse_customer_item = sparse.csr_matrix(
    (
        df_cust_item_qty["Quantity"].astype(float),
        (df_cust_item_qty["cust_id"], df_cust_item_qty["item_id"]),
    )
)

In [31]:
sparse_customer_item

<4017x3671 sparse matrix of type '<class 'numpy.float64'>'
	with 266596 stored elements in Compressed Sparse Row format>

In [32]:
model = implicit.als.AlternatingLeastSquares(num_threads=1)

In [33]:
model.fit(sparse_customer_item)

  0%|          | 0/15 [00:00<?, ?it/s]

## Generate Similar Items

In [34]:
ref_item_id = df_filter_cust_item["item_id"].unique()

In [35]:
type(ref_item_id)

numpy.ndarray

In [36]:
item_arr, score_arr = model.similar_items(ref_item_id, N=10)

In [37]:
df_item_temp = pd.DataFrame(item_arr)

In [38]:
df_item_temp["Ref Item ID"] = ref_item_id

In [39]:
df_item_temp.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,Ref Item ID
0,31,2830,2915,372,2692,2543,3188,802,2685,2279,31
1,2761,2296,3143,1002,2309,1810,1659,220,1238,1660,2761
2,1334,1890,2814,2105,316,2876,1085,2033,3382,2068,1334
3,1333,2736,2733,2794,2904,3500,2452,3496,3637,2433,1333
4,1790,1125,2752,1674,1157,1673,944,2546,2708,3423,1790


In [40]:
df_item_rank = pd.melt(
    df_item_temp,
    id_vars=["Ref Item ID"],
    var_name=["Item Rank"],
    value_name="Related Item ID",
)

In [41]:
df_item_rank.sort_values(["Ref Item ID", "Item Rank"]).head(20)

Unnamed: 0,Ref Item ID,Item Rank,Related Item ID
3239,0,0,0
6910,0,1,57
10581,0,2,260
14252,0,3,2807
17923,0,4,127
21594,0,5,130
25265,0,6,1286
28936,0,7,126
32607,0,8,2950
36278,0,9,125


In [42]:
df_score_temp = pd.DataFrame(score_arr)

In [43]:
df_score_temp["Ref Item ID"] = ref_item_id

In [44]:
df_score_rank = pd.melt(
    df_score_temp, id_vars=["Ref Item ID"], var_name=["Item Rank"], value_name="Score"
)

In [45]:
df_score_rank.sort_values(["Ref Item ID", "Item Rank"]).head(20)

Unnamed: 0,Ref Item ID,Item Rank,Score
3239,0,0,1.0
6910,0,1,0.711047
10581,0,2,0.628523
14252,0,3,0.451945
17923,0,4,0.438483
21594,0,5,0.431154
25265,0,6,0.407431
28936,0,7,0.406546
32607,0,8,0.389352
36278,0,9,0.386939


In [46]:
df_item_score = df_item_rank.merge(
    df_score_rank, how="inner", on=["Ref Item ID", "Item Rank"]
)

In [47]:
df_item_score.sort_values(["Ref Item ID", "Item Rank"], inplace=True)

In [48]:
df_item_score.head(20)

Unnamed: 0,Ref Item ID,Item Rank,Related Item ID,Score
3239,0,0,0,1.0
6910,0,1,57,0.711047
10581,0,2,260,0.628523
14252,0,3,2807,0.451945
17923,0,4,127,0.438483
21594,0,5,130,0.431154
25265,0,6,1286,0.407431
28936,0,7,126,0.406546
32607,0,8,2950,0.389352
36278,0,9,125,0.386939


In [49]:
df_item_desc = df_filter_cust_item[
    ["item_id", "StockCode", "Description"]
].drop_duplicates(subset=["item_id"])

In [50]:
df_similar_item_temp = df_item_score.merge(
    df_item_desc, how="left", left_on="Ref Item ID", right_on="item_id"
)

In [51]:
df_similar_item_temp.drop(columns=["item_id"], inplace=True)

In [52]:
df_similar_item_temp.columns = [
    "Ref Item ID",
    "Item Rank",
    "Related Item ID",
    "Score",
    "Ref_StockCode",
    "Ref_Description",
]

In [53]:
df_similar_item_temp.shape

(36710, 6)

In [54]:
df_similar_item_temp.head()

Unnamed: 0,Ref Item ID,Item Rank,Related Item ID,Score,Ref_StockCode,Ref_Description
0,0,0,0,1.0,85123A,WHITE HANGING HEART T-LIGHT HOLDER
1,0,1,57,0.711047,85123A,WHITE HANGING HEART T-LIGHT HOLDER
2,0,2,260,0.628523,85123A,WHITE HANGING HEART T-LIGHT HOLDER
3,0,3,2807,0.451945,85123A,WHITE HANGING HEART T-LIGHT HOLDER
4,0,4,127,0.438483,85123A,WHITE HANGING HEART T-LIGHT HOLDER


In [55]:
df_similar_item_final = df_similar_item_temp.merge(
    df_item_desc, how="left", left_on="Related Item ID", right_on="item_id"
)

In [56]:
df_similar_item_final.drop(columns=["item_id"], inplace=True)

In [57]:
df_similar_item_final.columns = [
    "Ref Item ID",
    "Item Rank",
    "Related Item ID",
    "Score",
    "Ref_StockCode",
    "Ref_Description",
    "Related_StockCode",
    "Related_Description",
]

In [58]:
df_similar_item_final.head()

Unnamed: 0,Ref Item ID,Item Rank,Related Item ID,Score,Ref_StockCode,Ref_Description,Related_StockCode,Related_Description
0,0,0,0,1.0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,85123A,WHITE HANGING HEART T-LIGHT HOLDER
1,0,1,57,0.711047,85123A,WHITE HANGING HEART T-LIGHT HOLDER,21733,RED HANGING HEART T-LIGHT HOLDER
2,0,2,260,0.628523,85123A,WHITE HANGING HEART T-LIGHT HOLDER,22804,CANDLEHOLDER PINK HANGING HEART
3,0,3,2807,0.451945,85123A,WHITE HANGING HEART T-LIGHT HOLDER,84798B,PURPLE FOXGLOVE ARTIIFCIAL FLOWER
4,0,4,127,0.438483,85123A,WHITE HANGING HEART T-LIGHT HOLDER,22470,HEART OF WICKER LARGE


## Generate recommendation for a given user

In [59]:
# Generate user library
df_cust_id_map = df_filter_cust_item[["CustomerID", "cust_id"]].drop_duplicates(
    subset="cust_id"
)

In [60]:
df_cust_id_map.head()

Unnamed: 0,CustomerID,cust_id
143930,17337,1329
160128,14525,679
5466,17967,202
75792,13148,602
20617,16795,544


In [61]:
CustomerID = 17337
cust_id = df_cust_id_map[df_cust_id_map["CustomerID"] == CustomerID]["cust_id"].item()
print(cust_id)

1329


In [62]:
df_filter_cust_item[df_filter_cust_item["cust_id"] == cust_id]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Sales,cust_id,item_id
143930,548714,10002,INFLATABLE POLITICAL GLOBE,2,2011-04-03 15:07:00,0.85,17337,United Kingdom,1.70,1329,31
143834,548714,11001,ASSTD DESIGN RACING CAR PEN,16,2011-04-03 15:07:00,1.69,17337,United Kingdom,27.04,1329,1489
143835,548714,15044B,BLUE PAPER PARASOL,1,2011-04-03 15:07:00,2.95,17337,United Kingdom,2.95,1329,631
143836,548714,15044D,RED PAPER PARASOL,1,2011-04-03 15:07:00,2.95,17337,United Kingdom,2.95,1329,2555
80348,543040,16010,FOLDING CAMPING SCISSOR W/KNIF & S,12,2011-02-02 17:06:00,0.12,17337,United Kingdom,1.44,1329,1712
...,...,...,...,...,...,...,...,...,...,...,...
80437,543040,85176,SEWING SUSAN 21 NEEDLE SET,2,2011-02-02 17:06:00,0.85,17337,United Kingdom,1.70,1329,1866
188411,553035,85184C,S/4 VALENTINE DECOUPAGE HEART BOX,1,2011-05-12 20:13:00,2.95,17337,United Kingdom,2.95,1329,388
143935,548714,85194S,HANGING SPRING FLOWER EGG SMALL,2,2011-04-03 15:07:00,0.65,17337,United Kingdom,1.30,1329,2419
143885,548714,85202,HANGING WOOD AND FELT HEART,4,2011-04-03 15:07:00,0.42,17337,United Kingdom,1.68,1329,2601


In [63]:
ids, scores = model.recommend(
    cust_id, sparse_customer_item[cust_id], N=20, filter_already_liked_items=False
)

In [64]:
list_stockcode = df_item_desc[df_item_desc["item_id"].isin(ids)]["StockCode"].tolist()
list_desc = df_item_desc[df_item_desc["item_id"].isin(ids)]["Description"].tolist()
df_recommendations = pd.DataFrame(
    {
        "Stockcode": list_stockcode,
        "Description": list_desc,
        "score": scores,
        "already_liked": np.in1d(ids, sparse_customer_item[cust_id].indices),
    }
)

In [65]:
df_recommendations

Unnamed: 0,Stockcode,Description,score,already_liked
0,20914,SET/5 RED RETROSPOT LID GLASS BOWLS,1.344945,True
1,20983,12 PENCILS TALL TUBE RED RETROSPOT,1.301106,True
2,21080,SET/20 RED RETROSPOT PAPER NAPKINS,1.245094,True
3,21212,PACK OF 72 RETROSPOT CAKE CASES,1.188693,True
4,21877,HOME SWEET HOME MUG,1.105531,True
5,21891,TRADITIONAL WOODEN SKIPPING ROPE,1.062009,True
6,21914,BLUE HARMONICA IN BOX,1.053307,True
7,21967,PACK OF 12 SKULL TISSUES,1.048815,True
8,21980,PACK OF 12 RED RETROSPOT TISSUES,1.020712,True
9,21983,PACK OF 12 BLUE PAISLEY TISSUES,1.009332,True
