In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from tabulate import tabulate
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

References: 


Collaborative Filtering for Implicit Feedback DataSets by Hu, Koren, and Volinsky - http://yifanhu.net/PUB/cf.pdf

Alternating Least Squares Method for Collaborative Filtering by Bugra Akyildiz - http://bugra.github.io/work/notes/2014-04-19/alternating-least-squares-method-for-collaborative-filtering/


The incredible Jesse Steinweg - https://jessesw.com/ and his blog that guided me through this https://jessesw.com/Rec-System/

In [2]:
# Read the data Online Retail from the UCI Machine Learning Repo
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx'
raw_data = pd.read_excel(url)

In [3]:
# Preview the ratings dataframe
print(raw_data.head())
print(raw_data.shape)

  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  
0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  
1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  
3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
(541909, 8)


In [4]:
# Let's see how many items and customers there are in the dataset
num_cust = len(raw_data.CustomerID.unique())
num_items = len(raw_data.StockCode.unique())
print('Number of customers: ' + str(num_cust))
print('Number of items bought: ' + str(num_items))

Number of customers: 4373
Number of items bought: 4070


In [5]:
# Let's check for missing data
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      541909 non-null object
StockCode      541909 non-null object
Description    540455 non-null object
Quantity       541909 non-null int64
InvoiceDate    541909 non-null datetime64[ns]
UnitPrice      541909 non-null float64
CustomerID     406829 non-null float64
Country        541909 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [6]:
# Since some records have no cusotmer ID, let's drop those columns
retail_data = raw_data.loc[pd.isnull(raw_data.CustomerID) == False]

#Convert CustomerID to int
retail_data.CustomerID = retail_data.CustomerID.astype(int)

#Convert StockCode to string
retail_data.StockCode = retail_data.StockCode.astype(str)

# Check if data doesn't contain any nulls now
retail_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      406829 non-null object
StockCode      406829 non-null object
Description    406829 non-null object
Quantity       406829 non-null int64
InvoiceDate    406829 non-null datetime64[ns]
UnitPrice      406829 non-null float64
CustomerID     406829 non-null int64
Country        406829 non-null object
dtypes: datetime64[ns](1), float64(1), int64(2), object(4)
memory usage: 27.9+ MB


In [7]:
# Let's make a dictionary of Stock Code to Description
item_desc = retail_data[['StockCode', 'Description']]
item_desc = item_desc.drop_duplicates().dropna(how='all').reset_index(drop=True)
itemDescDict = item_desc.set_index('StockCode').to_dict()['Description']

# Preview first 5 elements of dictionary
print(dict(list(itemDescDict.items())[0:5]))

{'85123A': 'CREAM HANGING HEART T-LIGHT HOLDER', '71053': 'WHITE MOROCCAN METAL LANTERN', '84406B': 'CREAM CUPID HEARTS COAT HANGER', '84029G': 'KNITTED UNION FLAG HOT WATER BOTTLE', '84029E': 'RED WOOLLY HOTTIE WHITE HEART.'}


In [8]:
# Let's explore the quantity feature
print(retail_data.Quantity.describe())
print ('Lowest 10 values for Quantity' + str(np.sort(retail_data.Quantity)[0:10]))

# Let's assume the negative quantities indicate that the customer returned the item. 

count    406829.000000
mean         12.061303
std         248.693370
min      -80995.000000
25%           2.000000
50%           5.000000
75%          12.000000
max       80995.000000
Name: Quantity, dtype: float64
Lowest 10 values for Quantity[-80995 -74215  -9360  -3114  -2000  -1930  -1515  -1350  -1300  -1296]


In [9]:
# Let's group purchase quantities by Stock Code and CustomerID
retail_data = retail_data[['StockCode', 'Quantity', 'CustomerID']]
retail_grouped = retail_data.groupby(['CustomerID', 'StockCode']).sum().reset_index()
print(retail_grouped.head())

# If the quantity sum is 0, replace with 1 to indicate that there was a purchase of that item atleast
retail_grouped.Quantity.loc[retail_grouped.Quantity == 0] = 1

# Filter out all negative quantities so that we can focus the recommendation of items that the customer purchased and liked
retail_grouped_final = retail_grouped[retail_grouped.Quantity > 0]
print ('\nFinal Matrix of grouped purchases')
print (retail_grouped_final.head())

   CustomerID StockCode  Quantity
0       12346     23166         0
1       12347     16008        24
2       12347     17021        36
3       12347     20665         6
4       12347     20719        40

Final Matrix of grouped purchases
   CustomerID StockCode  Quantity
0       12346     23166         1
1       12347     16008        24
2       12347     17021        36
3       12347     20665         6
4       12347     20719        40


In [10]:
'''
Let's build the sparse customer-stockID matrix where customers are on rows, items on columns
and the values in the matrix is the quantity purchased
      item1  item2   item3 ....
cust1  1            
cust2         24      36
cust3         12
'''


# Get list of unique customers
cust_list = list(np.sort(retail_grouped_final.CustomerID.unique()))
# Get list of unique items bought
item_list = list(np.sort(retail_grouped_final.StockCode.unique()))
# Get list of all the purchase quantities
quantity_list = list(retail_grouped_final.Quantity)


# Building the matrix....
mat_rows = retail_grouped_final.CustomerID.astype('category', categories = cust_list).cat.codes
mat_cols = retail_grouped_final.StockCode.astype('category', categories = item_list).cat.codes

purchases_mat = sparse.csr_matrix((quantity_list, (mat_rows, mat_cols)), shape = (len(cust_list), len(item_list)))

"\nLet's build the sparse customer-stockID matrix where customers are on rows, items on columns\nand the values in the matrix is the quantity purchased\n      item1  item2   item3 ....\ncust1  1            \ncust2         24      36\ncust3         12\n"

In [11]:
print ("Shape of sparse matrix " + str(purchases_mat.shape))

purchases_mat
'''
4338 customers with 3664 items and of those 266723 have a purchase 
quantity associated with that customer/item combination
'''


Shape of sparse matrix (4338, 3664)


<4338x3664 sparse matrix of type '<class 'numpy.int64'>'
	with 266723 stored elements in Compressed Sparse Row format>

'\n4338 customers with 3664 items and of those 266723 have a purchase \nquantity associated with that customer/item combination\n'

In [12]:
# Let's check how sparse the matrix is 
# Get all possible combination of purchases
purchase_mat_size = purchases_mat.shape[0]*purchases_mat.shape[1]
# Get actual number of item purchased
num_purchases = len(purchases_mat.nonzero()[0])
sparse_per = 100*(1 - (num_purchases/purchase_mat_size))
print ('Sparsity Percentage in Ratings Matrix is ' + str(round(sparse_per,2)))

Sparsity Percentage in Ratings Matrix is 98.32


In [13]:
'''
Split into test and train by masking some values of the dataset in the training set with 0s indicating the customer did not purchase.
Then run the model on the complete matrix and see if the predicted values are equal to the original values before masking

'''

def create_train(matrix_data, mask_pct = 0.2):
    '''
    This function will take in the complete customer-item matrix and "mask" a percentage of the original purchases where a
    user-item interaction has taken place for use as a test set. The test set will contain all of the original purchases, 
    while the training set replaces the specified percentage of them with a zero in the original purchases matrix. 
    
    args:
    matrix_data - the original purchases matrix from which you want to generate a train/test set. Takes sparse csr_matrix form.
    
    mask_pct - The percentage of customer-item interactions where an interaction took place that you want to mask in the 
    training set for later comparison to the test set. 
    
    returns:
    
    training_set - The altered version of the original data with a certain percentage of the customer-item pairs 
    that originally had interaction set back to zero.
    
    test_set - A copy of the original purchase matrix converted to binary - 1 indicates purchase and 0 indicates no purchase.
    
    user_inds - From the randomly selected customer-item indices, which customer rows were altered in the training data.
    '''
    # Prepare the test set
    test_mat = matrix_data.copy()
    test_mat[test_mat != 0] = 1
    # Prepare the training set
    training_mat = matrix_data.copy()
    # Get indices of purchases in the matrix
    purchase_idx = training_mat.nonzero()
    # Get corresponding user-item indices of the purchase
    purchase_pairs = list(zip(purchase_idx[0], purchase_idx[1]))
    random.seed(0)
    # Number of samples to mask
    num_samples = int(np.ceil(mask_pct*len(purchase_pairs)))
    # Randomly sample from the purchases
    samples = random.sample(purchase_pairs, num_samples)
    customer_idx = [index[0] for index in samples]
    item_idx = [index[1] for index in samples]
    # Mask the items in the above indentified indices as 0
    training_mat[customer_idx, item_idx] = 0 
    # To save space, eliminate the zeros in the sparse matrix
    training_mat.eliminate_zeros()
    return training_mat, test_mat, list(set(customer_idx))

'\nSplit into test and train by masking some values of the dataset in the training set with 0s indicating the customer did not purchase.\nThen run the model on the complete matrix and see if the predicted values are equal to the original values before masking\n\n'

In [14]:
# Call the create_train function

train_mat, test_mat, customer_idx = create_train(purchases_mat)
train_mat
test_mat

<4338x3664 sparse matrix of type '<class 'numpy.int64'>'
	with 213378 stored elements in Compressed Sparse Row format>

<4338x3664 sparse matrix of type '<class 'numpy.int64'>'
	with 266723 stored elements in Compressed Sparse Row format>

In [15]:
##### ALS Matrix Factorization
### Reference : http://yifanhu.net/PUB/cf.pdf


def implicit_weighted_ALS(training_mat, lambda_val = 0.1, alpha = 40, iterations = 10, rank_size = 20, seed = 0):
    '''
    args:
    training_mat - Matrix with shape m x n; m = number of customers, n = number of items
    
    lambda_val - Regularization constraint for bias-variance trade-off. Increasing lambda_val increases bias but reduced variance
    
    alpha - Parameter describing the confidence of the matrix. The paper identified 40 as most effective. Descreasing this value will decrease the confidence between various purchases.
    
    iterations - Number of times to alternate between the customer feature vector (U) and item feature vector (V) in ALS. More iterations will give better convergence but increase computation.
    
    rank_size - Number of latent features in the customer/item feature vectors. Paper recommends between 20-200. Increasing may overfit but reduce bias.
    
    seed - internal state of random number generator.
    
    returns:
    U (feature vector for customers) and V (feature vector for item.)
    U.dot(V) would give us the predicted purchases matrix.
    '''
    
    # Create confidence Matrix of size m x n
    conf = (alpha*training_mat)
    num_cust = conf.shape[0]
    num_item = conf.shape[1]
    
    # Initial U/V feature vectors randomly
    state = np.random.RandomState(seed)
    # Create the customer feature vector with random numbers of size m x rank_size (number of latent features)
    U = sparse.csr_matrix(state.normal(size = (num_cust, rank_size)))
    # Create the item feature vector with random numbers of size n x rank_size (number of latent features). Will transpose it later
    V = sparse.csr_matrix(state.normal(size = (num_item, rank_size)))
    
    # Create a sparse matrix with 1s along the diagonal for U
    U_diag = sparse.eye(num_cust)
    # Create a sparse matrix with 1s along the diagonal for V
    V_diag = sparse.eye(num_item)
    # Create a sparse matrix of 1s along the diagonal of the latent feature vector and the regularitzation term
    lambda_diag = lambda_val * sparse.eye(rank_size)
    
    # Set up iterations
    # Iterate between solving for U with V fixed and vice versa
    for step in range(iterations):
        # Compute vTv and uTu before to save computing time
        vTv = V.T.dot(V)
        uTu = U.T.dot(U)
        # Begin iteration to solve for U on fixed V
        for u in range(num_cust):
            # Convert customer row from confidence matrix to dense vector
            conf_samp = conf[u,:].toarray()
            pref = conf_samp.copy()
            # Create a binary preference vector
            pref[pref != 0] = 1
            # Beging solving through the equations defined in the paper
            # Cu -I term
            CuI = sparse.diags(conf_samp, [0])
            # yT(Cu-I)Y term
            vTCuIV = V.T.dot(CuI).dot(V)
            # yTCuPu term where we add the diagonal back in
            vTCupu = V.T.dot(CuI + V_diag).dot(pref.T)
            # Solve for Xu = ((yTy + yT(Cu-I)Y + lambda*I)^-1)yTCuPu
            U[u] = spsolve(vTv + vTCuIV + lambda_diag, vTCupu)
            
        # Begin iteration to solve for V on fixed U
        for v in range(num_item):
            # Transpose item row from confidence matrix to dense vector
            conf_samp = conf[:,v].T.toarray()
            pref = conf_samp.copy()
            # Create a binary preference vector
            pref[pref != 0] = 1
            # Beging solving through the equations defined in the paper
            # Cu -I term
            CvI = sparse.diags(conf_samp, [0])
            # xT(Cv-I)X term
            uTCvIU = U.T.dot(CvI).dot(U)
            # xTCuPu term where we add the diagonal back in
            uTCvpv = U.T.dot(CvI + U_diag).dot(pref.T)
            # Solve for Yv = ((xTx + xT(Cu-I)X) + lambda*I)^-1)xTCvPv
            V[v] = spsolve(uTu + uTCvIU + lambda_diag, uTCvpv)
            
    return U, V.T
            


In [16]:
# Call the function with lambda_val 0.1, alpha 40, 30 iterations and 10 latent features
cust_vecs, item_vecs = implicit_weighted_ALS(train_mat, lambda_val=0.1, alpha = 40, iterations = 30, rank_size = 10)

In [17]:
# Predictions for the first customer across the first 5 items
cust_vecs[0:,].dot(item_vecs).toarray()[0,:5]

array([-0.05921392,  0.00722905, -0.01314641, -0.00457686, -0.01961767])

In [18]:
# Evaluating the recommendation
# Compare the predictions of those customer-item combinations that we intentionally set to zero against the most popular items

def calc_mean_auc(training_mat, altered_custs, predictions, test_mat):
    '''
    args:
    training_mat - The orginial training_mat where we masked some customers' purchases to zero. 
    
    altered_custs - The indices of the customers where atleast one customer/item pair was altered to 0.
    
    predictions - The matrix of predicted purchases. These should be stored in a list, with customer vectors as item zero and item vectors as item one.
    
    test_mat - The test matrix constructed from the create_train function
    
    returns:
    
    The mean AUC of the test set only on customer-item iteractions that were originally zero to test ranking ability in addition to most popular items as a benchmark.
    '''
    
    store_auc = []
    popularity_auc = []
    # Get sum of item interactions to get most popular items
    popular_items = np.array(test_mat.sum(axis =0)).reshape(-1)
    item_vecs = predictions[1]
    for cust in altered_custs:
        # Get the training matrix row where the interactions were zero
        training_row = training_mat[cust, :].toarray().reshape(-1)
        zero_inds = np.where(training_row == 0)
        # Get the predicted values based on our customer/item vectors
        cust_vec = predictions[0][cust,:]
        pred = cust_vec.dot(item_vecs).toarray()[0, zero_inds].reshape(-1)
        # Get only items that were originally zero
        actual = test_mat[cust,:].toarray()[0, zero_inds].reshape(-1)
        popular = popular_items[zero_inds]
        # Calculate AUC for predicted vs actual
        fpr, tpr, thresholds = metrics.roc_curve(actual, pred)
        store_auc.append(metrics.auc(fpr, tpr))
        # Calculate AUC for popular vs actual
        fpr_pop, tpr_pop, thresholds_pop = metrics.roc_curve(actual, popular)
        popularity_auc.append(metrics.auc(fpr_pop, tpr_pop))
        
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))
    

In [19]:
# Calculate AUC for the recommender system we built

calc_mean_auc(train_mat, customer_idx, [sparse.csr_matrix(cust_vecs), sparse.csr_matrix(item_vecs)], test_mat)

(0.853, 0.814)

Mean AUC Score of 0.853 shows that our recommendation engine did better than the popular item recommender which had a mean 
AUC of 0.814. An AUC of 0.853 also menas that the system is recommending items the customer did purchase in the test set more frequently than the items they did not purchase.

Increasing the number of iterations/latent features might improve our score but we must also be careful to not overfit. 
Cross-validation is one technique that might help to prevent over-fitting.


There are in-built libraries that parallelize the ALS matrix factorization but the above function is to understand each step of the factorization and find areas for tuning.

In [20]:
# Sampling the Recommendations provide using the item:desc dictionary we had created earlier

customers_arr = np.array(cust_list)
items_arr = np.array(item_list)

In [21]:
def get_items_purchased(customer_id, train_mat, customers_arr, items_arr, item_lookup):
    '''
    Returns the items purchased by a specific customer in the training set
    
    args:
    customer_id - ID of a customer whose made atleast one purchase
    train_mat - The initial purchase matrix that we masked a percentage of
    customers_arr - Array of customers in the purchase matrix
    items_arr - Array of items in the purchase matrix
    item_lookup - Dictionary of unique item ID to description
    
    returns:
    A dictionary of stock_cd and description of those items already purchased
    '''
    # Get the index of the row where that customer ID is present
    cust_ind = np.where(customers_arr == customer_id)[0][0]
    # Get all the indices of the purchases made
    purchase_ind = train_mat[cust_ind,:].nonzero()[1]
    # Retrieve the product codes for the purchase indices
    stock_codes = items_arr[purchase_ind]
    # Look up the description for the stock code from itemDescDict
    subdict = {x: item_lookup[x] for x in stock_codes if x in item_lookup}
    return subdict

In [22]:
# Let's look at what the first 5 customers purchased
for cust in customers_arr[6:9]:
    print('Customer ID: ', cust)
    print(get_items_purchased(cust, train_mat, customers_arr, items_arr, itemDescDict))
    print('--------------------------------------------------')

Customer ID:  12353
{'22890': 'NOVELTY BISCUITS CAKE STAND 3 TIER', '37446': 'MINI CAKE STAND WITH HANGING CAKES', '37449': 'CERAMIC CAKE STAND + HANGING CAKES', '37450': 'CERAMIC CAKE BOWL + HANGING CAKES'}
--------------------------------------------------
Customer ID:  12354
{'20675': 'BLUE POLKADOT BOWL', '20676': 'RED RETROSPOT BOWL', '20677': 'PINK POLKADOT BOWL', '20725': 'LUNCH BAG RED SPOTTY', '20749': 'ASSORTED COLOUR MINI CASES', '21080': 'SET/20 RED RETROSPOT PAPER NAPKINS ', '21156': 'RETROSPOT CHILDRENS APRON', '21217': 'RED RETROSPOT ROUND CAKE TINS', '21238': 'RED RETROSPOT CUP', '21239': 'PINK  POLKADOT CUP', '21240': 'BLUE POLKADOT CUP', '21242': 'RED RETROSPOT PLATE ', '21243': 'PINK POLKADOT PLATE ', '21244': 'BLUE POLKADOT PLATE ', '21246': 'RED RETROSPOT BIG BOWL', '21380': 'WOODEN HAPPY BIRTHDAY GARLAND', '21533': 'RETROSPOT LARGE MILK JUG', '21731': 'RED TOADSTOOL LED NIGHT LIGHT', '21890': 'S/6 WOODEN SKITTLES IN COTTON BAG', '21891': 'TRADITIONAL WOODEN SKIPPI

In [23]:
# Now let's write a function to get the recommended items for each of these customers using our recommendation engine

def get_rec_item(customer_id, train_mat, customer_vecs, items_vecs, customer_arr, item_arr, item_lookup, num_items = 10):
    '''
    This function will return the top num_items recommended items to the customers
    
    args:
    customer_id - ID of a customer who we want to see the recommendations for
    train_mat - The initial purchase matrix that we masked a percentage of
    customers_arr - Array of customers in the purchase matrix
    items_arr - Array of items in the purchase matrix
    item_lookup - Dictionary of unique item ID to description
    num_items - The number of recommended items in order of best recommendation to lowest.
    
    returns:
    The top n recommendations based on the U/V vectors for items never purchased/interacted with before
    '''
    # Get index of customerID
    cust_ind = np.where(customer_arr == customer_id)[0][0]
    # Get purchases made by that customer
    pref_vec = train_mat[cust_ind,:].toarray()
    # Add 1 to all purchases so that items not purchased yet become equal to 1
    pref_vec = pref_vec.reshape(-1) + 1
    # Make items that were already purchased 0 (so that they don't get included in the recommendation)
    pref_vec[pref_vec > 1] = 0
    # Get dot product of customer vector across all items in the item vector
    rec_vector = customer_vecs[cust_ind,:].dot(item_vecs).toarray()
    # Scale the recommendations between 0 and 1 using MinMax
    scaler = MinMaxScaler()
    rec_vector_scaled = scaler.fit_transform(rec_vector.reshape(-1,1))[:,0]
    # Multiply by the purchased vector so that items already purchased are set to 0
    recommend_vector = pref_vec*rec_vector_scaled
    # Sort the indices in order of recommendations
    item_idx = np.argsort(recommend_vector)[::-1][:num_items]
    # Get the list of recommended items
    rec_list = []
    for index in item_idx:
        stock_code = item_arr[index]
        rec_list.append([stock_code, item_lookup[stock_code]])
    return rec_list

In [24]:
# Let's look at what was recommended for the 3 customers
for cust in customers_arr[6:9]:
    print('Customer ID: ', cust)
    print(get_rec_item(cust, train_mat, cust_vecs, item_vecs, customers_arr, items_arr, itemDescDict))
    print('--------------------------------------------------')

Customer ID:  12353
[['84568', 'GIRLS ALPHABET IRON ON PATCHES '], ['47567B', 'TEA TIME KITCHEN APRON'], ['21626', 'VINTAGE UNION JACK PENNANT'], ['22057', 'CERAMIC PLATE STRAWBERRY DESIGN'], ['37448', 'CERAMIC CAKE DESIGN SPOTTED MUG'], ['37447', 'CERAMIC CAKE DESIGN SPOTTED PLATE'], ['22893', 'MINI CAKE STAND T-LIGHT HOLDER'], ['22063', 'CERAMIC BOWL WITH STRAWBERRY DESIGN'], ['22059', 'CERAMIC STRAWBERRY DESIGN MUG'], ['22649', 'STRAWBERRY FAIRY CAKE TEAPOT']]
--------------------------------------------------
Customer ID:  12354
[['21746', 'SMALL RED RETROSPOT WINDMILL'], ['21700', 'BIG DOUGHNUT FRIDGE MAGNETS'], ['23155', 'KNICKERBOCKERGLORY MAGNET ASSORTED '], ['22898', 'CHILDRENS APRON APPLES DESIGN'], ['22243', '5 HOOK HANGER RED MAGIC TOADSTOOL'], ['22432', 'WATERING CAN PINK BUNNY'], ['22956', '36 FOIL HEART CAKE CASES'], ['21716', 'BOYS VINTAGE TIN SEASIDE BUCKET'], ['21749', 'LARGE RED RETROSPOT WINDMILL'], ['21878', 'PACK OF 6 SANDCASTLE FLAGS ASSORTED']]
-----------------

In [25]:
'''
Let's compare the items bought vs the items recommended by customer in a dataframe so 
that we can see more clearly how the recommendation engine did
'''

def compare_purchase_rec(customer_id, purchase_dict, rec_list):
    '''
    This function returns a dataframe with the 10 purchased items and top 10 recommended items for each customer
    
    args:
    customer_id - The customer ID in the purchase matrix
    purchase_dict - The output of the get_items_purchased function which is a dictionary of stock_cd:description of items purchased
    rec_list - The output of the get_rec_item function which is a list of the top n stock_cd and description pairs
    
    returns:
    A dataframe with all purchased items and top n recommended items by customer
    '''
    # Create dataframes of one column each - CustomerID, Purchased items, Recommended Items
    cust_df = pd.DataFrame({'CustID': [customer_id]})
    purchase_df = pd.DataFrame({'PurchasedItem': list(purchase_dict.values())})
    rec_df = pd.DataFrame({'RecommendedItem': [pair[1] for pair in rec_list]})

    # Column wise concatenate the dataframes
    final_df = pd.concat([cust_df, purchase_df, rec_df], ignore_index=True, axis=1)
    # Format the final dataframe
    final_df.columns = ['CustID', 'PurchasedItem', 'RecommendedItem']
    final_df['PurchasedItem'] = final_df.PurchasedItem.astype(str)
    final_df['RecommendedItem'] = final_df.RecommendedItem.astype(str)
    final_df = final_df.fillna('')
    final_df = final_df.replace('nan', '', regex=True)
    return final_df
    
    
    

"\nLet's compare the items bought vs the items recommended by customer in a dataframe so \nthat we can see more clearly how the recommendation engine did\n"

In [26]:
# Let's compare the same 3 customers
for cust in customers_arr[6:9]:
    print(tabulate(compare_purchase_rec(cust, 
                               get_items_purchased(cust, train_mat, customers_arr, items_arr, itemDescDict),
                               get_rec_item(cust, train_mat, cust_vecs, item_vecs, customers_arr, items_arr, itemDescDict)),
                  headers= ['CustID', 'PurchasedItem', 'RecommendedItem']))
    

    CustID    PurchasedItem                       RecommendedItem
--  --------  ----------------------------------  -----------------------------------
 0  12353.0   NOVELTY BISCUITS CAKE STAND 3 TIER  GIRLS ALPHABET IRON ON PATCHES
 1            MINI CAKE STAND WITH HANGING CAKES  TEA TIME KITCHEN APRON
 2            CERAMIC CAKE STAND + HANGING CAKES  VINTAGE UNION JACK PENNANT
 3            CERAMIC CAKE BOWL + HANGING CAKES   CERAMIC PLATE STRAWBERRY DESIGN
 4                                                CERAMIC CAKE DESIGN SPOTTED MUG
 5                                                CERAMIC CAKE DESIGN SPOTTED PLATE
 6                                                MINI CAKE STAND T-LIGHT HOLDER
 7                                                CERAMIC BOWL WITH STRAWBERRY DESIGN
 8                                                CERAMIC STRAWBERRY DESIGN MUG
 9                                                STRAWBERRY FAIRY CAKE TEAPOT
    CustID    PurchasedItem                


Looking at the recommendations, we can see that the engine did pretty well. 
For customer 12353, all his purchases are cake related so it seems like he's planning for a birthday party.
All the recommendations include design patterns for the cake or utensils/cutlery that he/she could use for making/serving the cake. 

Similarly for customer 12355, we can see that all his purchases indicate that he's hosting a high tea apart from the lip gloss. 
His recommendations include more variants of candles and other ornaments. 
