Install the necessary libraries 

In [None]:
!pip install implicit

In [None]:
import pandas as pd 
import numpy as np 
import implicit
import scipy.sparse as sparse

In [None]:
#Read in dataset
df_raw = pd.read_excel("./data/Online Retail.xlsx")

In [None]:
df_raw.info()

In [None]:
#Dropping records with no CustomerID
df_raw.dropna(subset=['CustomerID'],
                      inplace=True)

In [None]:
df_raw['CustomerID'] = df_raw['CustomerID'].astype('int64')
df_raw['StockCode'] = df_raw['StockCode'].astype('str')

In [None]:
df_raw['Sales'] = df_raw['Quantity'] * df_raw['UnitPrice']

In [None]:
df_raw.info()

## Keep only records for customers who bought at least n distinct items

In [None]:
df_items_per_cust = df_raw.groupby(['CustomerID'])\
.agg({'StockCode': 'nunique'}).reset_index()

In [None]:
df_items_per_cust.columns = ['CustomerID', 'Count_item_cust']

In [None]:
#Setting of THreshold
item_in_cust_threshold = 6

In [None]:
#Filtering Results
mask = df_items_per_cust['Count_item_cust'] >= item_in_cust_threshold
valid_cust= set(df_items_per_cust.loc[mask, 'CustomerID'].tolist())

In [None]:
df_filter_cust = df_raw[df_raw['CustomerID'].isin(valid_cust)].copy()

In [None]:
invoiceno_filter_cust = set(df_filter_cust['InvoiceNo'].tolist())

## Keep only items that were bought by multiple customers

In [None]:
df_custs_per_item = df_raw.groupby(['StockCode'])\
.agg({'CustomerID': 'nunique'}).reset_index()

In [None]:
df_custs_per_item.columns = ['StockCode', 'Count_cust_item']

In [None]:
df_custs_per_item['Count_cust_item'].value_counts()

In [None]:
# Set threshold 
cust_in_item_threshold = 6

In [None]:
mask = df_custs_per_item['Count_cust_item'] >= cust_in_item_threshold
valid_stockcode = set(df_custs_per_item.loc[mask, 'StockCode'].tolist())

In [None]:
df_filter_item = df_raw[df_raw['StockCode'].isin(valid_stockcode)].copy()

In [None]:
invoiceno_filter_item = set(df_filter_item['InvoiceNo'].tolist())

In [None]:
invoiceno_intersect = set.intersection(invoiceno_filter_item,invoiceno_filter_cust)

In [None]:
print(f"No. of invoice after filtering customer: {len(invoiceno_filter_cust)}")
print(f"No. of invoice after filtering item: {len(invoiceno_filter_item)}")
print(f"No. of invoice from intersect: {len(invoiceno_intersect)}")

In [None]:
df_filter_cust_item = df_raw[df_raw['InvoiceNo'].isin(invoiceno_intersect)].copy()

In [None]:
df_filter_cust_item.info()

## Apply Collaborative Filtering with the Implicit Library

In [None]:
unique_customers = df_filter_cust_item['CustomerID'].unique()
cust_ids = dict(zip(unique_customers, np.arange(unique_customers.shape[0], dtype=np.int32)))

unique_items = df_filter_cust_item['StockCode'].unique()
item_ids = dict(zip(unique_items, np.arange(unique_items.shape[0], dtype=np.int32)))

df_filter_cust_item['cust_id'] = df_filter_cust_item['CustomerID'].apply(lambda i : cust_ids[i])
df_filter_cust_item['item_id'] = df_filter_cust_item['StockCode'].apply(lambda i : item_ids[i])

In [None]:
print(f"{len(cust_ids)}, {len(item_ids)}")

In [None]:
df_filter_cust_item.sort_values(by=['StockCode'], inplace=True)
df_filter_cust_item.head(10)

In [None]:
df_cust_item_qty = df_filter_cust_item.groupby(['cust_id','item_id'])\
.agg({'Quantity':'sum'}).reset_index()

In [None]:
df_cust_item_qty.head()

In [None]:
#Create Sparse Matrix

sparse_customer_item = \
sparse.csr_matrix((df_cust_item_qty['Quantity'].astype(float), (df_cust_item_qty['cust_id'],df_cust_item_qty['item_id'])))

In [None]:
sparse_customer_item

In [None]:
model = implicit.als.AlternatingLeastSquares(num_threads=1)

In [None]:
model.fit(sparse_customer_item)

## Generate Similar Items

In [None]:
ref_item_id = df_filter_cust_item['item_id'].unique()

In [None]:
item_arr, score_arr = model.similar_items(ref_item_id, N=10)

In [None]:
df_item_temp = pd.DataFrame(item_arr)

In [None]:
df_item_temp['Ref Item ID'] = ref_item_id

In [None]:
df_item_temp.head()

In [None]:
# Reshape wide -> long: one row per (reference item, rank) holding the id of
# the related item at that rank.
# var_name must be a scalar here: the frame has ordinary (non-MultiIndex)
# columns, and recent pandas releases reject a list such as ['Item Rank'].
df_item_rank = pd.melt(
    df_item_temp,
    id_vars=['Ref Item ID'],
    var_name='Item Rank',
    value_name='Related Item ID',
)

In [None]:
df_item_rank.sort_values(['Ref Item ID','Item Rank']).head(20)

In [None]:
 df_score_temp = pd.DataFrame(score_arr)

In [None]:
df_score_temp['Ref Item ID'] = ref_item_id

In [None]:
df_score_rank= pd.melt(df_score_temp,
                  id_vars = ['Ref Item ID'],
                  var_name = ['Item Rank'],
                  value_name = 'Score')

In [None]:
df_score_rank.sort_values(['Ref Item ID','Item Rank']).head(20)

In [None]:
df_item_score = df_item_rank.merge(df_score_rank,
                                   how='inner',
                                   on=['Ref Item ID','Item Rank'])

In [None]:
df_item_score.sort_values(['Ref Item ID','Item Rank'],
                          inplace=True)

In [None]:
df_item_score.head(20)

In [None]:
# Lookup table: item_id -> StockCode / Description, one row per item.
# df_filter_cust_item holds one row per transaction line, so without
# de-duplication every left merge on item_id would repeat each similarity row
# once per transaction of that item. The first description seen for an item
# is kept.
df_item_desc = (
    df_filter_cust_item[['item_id', 'StockCode', 'Description']]
    .drop_duplicates(subset='item_id')
    .copy()
)

In [None]:
df_similar_item_temp = df_item_score.merge(df_item_desc,
                    how="left",
                    left_on="Ref Item ID",
                    right_on="item_id")

In [None]:
df_similar_item_temp.drop(columns=['item_id'], inplace=True)

In [None]:
df_similar_item_temp.columns = ['Ref Item ID', 'Item Rank', 
                                'Related Item ID', 'Score', 
                                'Ref_StockCode','Ref_Description']

In [None]:
df_similar_item_temp.shape

In [None]:
df_similar_item_temp.head()

In [None]:
df_similar_item = df_similar_item_temp.merge(df_item_desc,
                    how="left",
                    left_on="Related Item ID",
                    right_on="item_id")

In [None]:
df_similar_item.drop(columns=['item_id'], inplace=True)


In [None]:
df_similar_item_temp.columns