In [13]:
import scipy.sparse.linalg as linalg
import scipy.sparse as sparse
import implicit
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import joblib

# Read original dataset from csv files

In [14]:
# Directory holding the raw CSV exports; change it here to relocate the dataset.
DATA_DIR = 'capstone-dataset'

# Load every raw table used by the notebook.
aisles_df = pd.read_csv(f'{DATA_DIR}/aisles.csv')
departments_df = pd.read_csv(f'{DATA_DIR}/departments.csv')
products_df = pd.read_csv(f'{DATA_DIR}/products.csv')
orders_df = pd.read_csv(f'{DATA_DIR}/orders.csv')
order_products_prior_df = pd.read_csv(f'{DATA_DIR}/order_products__prior.csv')
order_products_train_df = pd.read_csv(f'{DATA_DIR}/order_products__train.csv')

In [15]:
# Attach the aisle and department columns to every product row.
products_df = (
    products_df
    .merge(aisles_df, on="aisle_id")
    .merge(departments_df, on="department_id")
)
products_df

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,0,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks
1,1,All-Seasons Salt,104,13,spices seasonings,pantry
2,2,Robust Golden Unsweetened Oolong Tea,94,7,tea,beverages
3,3,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,frozen meals,frozen
4,4,Green Chile Anytime Sauce,5,13,marinades meat preparation,pantry
...,...,...,...,...,...,...
49683,49683,"Vodka, Triple Distilled, Twist of Vanilla",124,5,spirits,alcohol
49684,49684,En Croute Roast Hazelnut Cranberry,42,1,frozen vegan vegetarian,frozen
49685,49685,Artisan Baguette,112,3,bread,bakery
49686,49686,Smartblend Healthy Metabolism Dry Cat Food,41,8,cat food care,pets


In [16]:
# Join the product metadata onto both order-line tables via product_id.
order_products_prior_df = order_products_prior_df.merge(products_df, on="product_id")
order_products_train_df = order_products_train_df.merge(products_df, on="product_id")

In [17]:
# Inspect the order_products__train rows now that the products_df columns are joined in.
order_products_train_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,1,49301,1,1,Bulgarian Yogurt,120,16,yogurt,dairy eggs
1,1,11108,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,other creams cheeses,dairy eggs
2,1,10245,3,0,Organic Celery Hearts,83,4,fresh vegetables,produce
3,1,49682,4,0,Cucumber Kirby,83,4,fresh vegetables,produce
4,1,43632,5,1,Lightly Smoked Sardines in Olive Oil,95,15,canned meat seafood,canned goods
...,...,...,...,...,...,...,...,...,...
1384612,3421063,14232,3,1,Natural Artesian Water,115,7,water seltzer sparkling water,beverages
1384613,3421063,35547,4,1,Twice Baked Potatoes,13,20,prepared meals,deli
1384614,3421070,35950,1,1,Organic Unsweetened Almond Milk,91,16,soy lactosefree,dairy eggs
1384615,3421070,16952,2,1,Creamy Peanut Butter,88,13,spreads,pantry


# Prepare the training set from orders with eval_set='prior' and the test set from orders with eval_set='train'

In [18]:
# Split the orders table by its eval_set label.
order_user_prior_df = orders_df.loc[orders_df["eval_set"].eq('prior')]
order_user_train_df = orders_df.loc[orders_df["eval_set"].eq('train')]
order_user_train_df

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
10,1187899,0,train,11,4,8,14.0
25,1492625,1,train,15,1,11,30.0
49,2196797,4,train,5,0,11,6.0
74,525192,6,train,21,2,11,6.0
78,880375,7,train,4,1,14,10.0
...,...,...,...,...,...,...,...
3420838,2585586,206198,train,20,2,16,30.0
3420862,943915,206199,train,24,6,19,6.0
3420924,2371631,206202,train,6,4,19,30.0
3420933,1716008,206204,train,4,1,16,10.0


In [19]:
# Combine each order header (user, timing) with its order lines via order_id.
order_user_product_prior_df = order_user_prior_df.merge(order_products_prior_df, on="order_id")
order_user_product_train_df = order_user_train_df.merge(order_products_train_df, on="order_id")
order_user_product_train_df

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,1187899,0,train,11,4,8,14.0,195,1,1,Soda,77,7,soft drinks,beverages
1,1187899,0,train,11,4,8,14.0,25132,2,1,Organic String Cheese,21,16,packaged cheese,dairy eggs
2,1187899,0,train,11,4,8,14.0,38927,3,1,0% Greek Strained Yogurt,120,16,yogurt,dairy eggs
3,1187899,0,train,11,4,8,14.0,26404,4,1,XL Pick-A-Size Paper Towel Rolls,54,17,paper goods,household
4,1187899,0,train,11,4,8,14.0,39656,5,1,Milk Chocolate Almonds,45,19,candy chocolate,snacks
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1384612,272231,206208,train,14,6,14,30.0,40602,4,0,Fabric Softener Sheets,75,17,laundry,household
1384613,272231,206208,train,14,6,14,30.0,15654,5,0,Dark Chocolate Mint Snacking Chocolate,45,19,candy chocolate,snacks
1384614,272231,206208,train,14,6,14,30.0,42605,6,0,Phish Food Frozen Yogurt,37,1,ice cream ice,frozen
1384615,272231,206208,train,14,6,14,30.0,37965,7,0,French Baguette Bread,112,3,bread,bakery


In [20]:
# 'prior' order lines become the training interactions, 'train' order lines the test interactions.
train_df = order_user_product_prior_df
test_df = order_user_product_train_df
train_df

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,2539329,0,prior,1,2,8,,195,1,0,Soda,77,7,soft drinks,beverages
1,2539329,0,prior,1,2,8,,14083,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,soy lactosefree,dairy eggs
2,2539329,0,prior,1,2,8,,12426,3,0,Original Beef Jerky,23,19,popcorn jerky,snacks
3,2539329,0,prior,1,2,8,,26087,4,0,Aged White Cheddar Popcorn,23,19,popcorn jerky,snacks
4,2539329,0,prior,1,2,8,,26404,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,paper goods,household
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32434484,2977660,206208,prior,13,1,12,7.0,14196,5,1,Tomato Paste,9,9,pasta sauce,dry goods pasta
32434485,2977660,206208,prior,13,1,12,7.0,38729,6,0,Brownie Crunch High Protein Bar,3,19,energy granola bars,snacks
32434486,2977660,206208,prior,13,1,12,7.0,31476,7,0,High Protein Bar Chunky Peanut Butter,3,19,energy granola bars,snacks
32434487,2977660,206208,prior,13,1,12,7.0,6566,8,0,Chocolate Peanut Butter Protein Bar,3,19,energy granola bars,snacks


# Aggregate purchase counts per (user, product) pair; these become the entries of the user-product matrix

In [21]:
# Count how many times each user bought each product in the prior orders.
# The source is order_user_product_prior_df (the frame train_df was bound to)
# rather than train_df itself, which keeps this cell idempotent: regrouping an
# already aggregated train_df would reset every purchase_count to 1.
train_df = order_user_product_prior_df.groupby(['user_id', 'product_id']).size().reset_index(name='purchase_count')

In [22]:
# Inspect the aggregated (user_id, product_id, purchase_count) training frame.
train_df

Unnamed: 0,user_id,product_id,purchase_count
0,0,195,10
1,0,10257,9
2,0,10325,1
3,0,12426,10
4,0,13031,3
...,...,...,...
13307948,206208,43960,3
13307949,206208,44324,1
13307950,206208,48369,1
13307951,206208,48696,1


In [23]:
# Count how many times each user bought each product in the held-out orders.
# The source is order_user_product_train_df (the frame test_df was bound to)
# rather than test_df itself, so re-running the cell gives the same result
# instead of regrouping an already aggregated frame.
test_df = order_user_product_train_df.groupby(['user_id', 'product_id']).size().reset_index(name='purchase_count')

In [24]:
# Inspect the aggregated (user_id, product_id, purchase_count) test frame.
test_df

Unnamed: 0,user_id,product_id,purchase_count
0,0,195,1
1,0,10257,1
2,0,13031,1
3,0,25132,1
4,0,26087,1
...,...,...,...
1384612,206208,24851,1
1384613,206208,37965,1
1384614,206208,39215,1
1384615,206208,40602,1


# build user-product sparse matrix

In [33]:
# Rows are indexed by user_id, columns by product_id, values are purchase counts.
# Built in COO form, then converted to CSR and cast to float32.
user_product_matrix = (
    sparse.coo_matrix(
        (train_df["purchase_count"], (train_df["user_id"], train_df["product_id"]))
    )
    .tocsr()
    .astype(np.float32)
)

In [34]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
user_product_matrix

<206209x49688 sparse matrix of type '<class 'numpy.float32'>'
	with 13307953 stored elements in Compressed Sparse Row format>

In [75]:
def get_user_product_embeddings(sparse_matrix, embedding_dimension=50):
    """Factorise a user-product matrix into user and product embeddings.

    Runs a truncated SVD (``scipy.sparse.linalg.svds``) on ``sparse_matrix``
    and folds the singular values into the user factors, so that
    ``user_embeddings @ product_embeddings.T`` is the rank-``embedding_dimension``
    approximation of the input.

    Args:
        sparse_matrix: matrix to factorise, one row per user and one column
            per product.
        embedding_dimension: number of singular triplets to keep; ``svds``
            requires it to be smaller than both matrix dimensions.

    Returns:
        Tuple ``(user_embeddings, product_embeddings)`` with shapes
        ``(n_users, embedding_dimension)`` and
        ``(n_products, embedding_dimension)``.
    """
    # Factorise the argument, not the notebook-level global matrix.
    user_embeddings, S, product_embeddings = linalg.svds(sparse_matrix, embedding_dimension)
    # Scaling column j by S[j] equals multiplying by diag(S), without building the diagonal matrix.
    user_embeddings = user_embeddings * S
    return user_embeddings, product_embeddings.T
    
def recommend_product_df(product_ids):
    """Return the rows of ``products_df`` whose ``product_id`` is in ``product_ids``."""
    is_selected = products_df["product_id"].isin(product_ids)
    return products_df[is_selected]
    
def actual_product_df(user_id):
    """Return the ``products_df`` rows for every product ``user_id`` has in ``test_df``."""
    # Product ids this user has in the test frame.
    bought_ids = test_df.loc[test_df["user_id"] == user_id, "product_id"]
    return products_df[products_df["product_id"].isin(bought_ids)]
    

In [76]:
# Factorise the user-product matrix into 50-dimensional user and product embeddings.
user_embeddings, product_embeddings = get_user_product_embeddings(
    user_product_matrix, embedding_dimension=50
)

In [99]:
def recommend(user_id, N=10):
    """Return the indices of the N highest-scoring products for a user.

    A product's score is the dot product of the user's row in the
    notebook-level ``user_embeddings`` with that product's row in
    ``product_embeddings``.

    Args:
        user_id: row index into ``user_embeddings``.
        N: number of products to return; values above the number of products
            are clamped, and ``N <= 0`` yields an empty result.

    Returns:
        1-D integer array of product indices, best score first.
    """
    scores = user_embeddings[user_id].dot(product_embeddings.T)
    # Clamp N: argpartition raises when N exceeds the number of products, and
    # slicing with [-0:] would return every product instead of none.
    N = min(int(N), scores.shape[0])
    if N <= 0:
        return np.array([], dtype=np.intp)
    # argpartition isolates the top N in linear time; only those N are then sorted.
    top_N_indices = np.argpartition(scores, -N)[-N:]
    top_N_indices = top_N_indices[np.argsort(scores[top_N_indices])[::-1]]
    return top_N_indices

In [100]:
# Spot-check the recommender on a single user (default N=10 recommendations).
user_id = 4
recommended_items = recommend(user_id)
recommended_items

array([21615, 26603, 40705, 39274, 43351,  9075, 39927, 41949, 31716,
       30390])

In [101]:
# Products this user has in test_df, ordered by aisle for easier comparison.
actual_product_df(user_id).sort_values(by="aisle_id")

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
16184,16184,Sharp Cheddar Cheese,21,16,packaged cheese,dairy eggs
19056,19056,Organic Large Extra Fancy Fuji Apple,24,4,fresh fruits,produce
15348,15348,Organic Raw Agave Nectar,29,13,honeys syrups nectars,pantry
21412,21412,Organic Soba,66,6,asian foods,international
48203,48203,Tamari Gluten Free Soy Sauce,66,6,asian foods,international
20113,20113,Jalapeno Peppers,83,4,fresh vegetables,produce
20842,20842,100% Pure Eucalyptus Essential Oil,101,17,air fresheners candles,household
21615,21615,Organic Baby Arugula,123,4,packaged vegetables fruits,produce
40705,40705,Organic Grape Tomatoes,123,4,packaged vegetables fruits,produce


In [102]:
# Product details for the recommended ids, ordered by aisle like the cell above.
recommend_product_df(recommended_items).sort_values(by="aisle_id")

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
31716,31716,Organic Cilantro,16,4,fresh herbs,produce
26603,26603,Organic Blackberries,24,4,fresh fruits,produce
39927,39927,Organic Kiwi,24,4,fresh fruits,produce
43351,43351,Raspberries,32,4,packaged produce,produce
30390,30390,Organic Cucumber,83,4,fresh vegetables,produce
41949,41949,Organic Tomato Cluster,83,4,fresh vegetables,produce
9075,9075,Blueberries,116,1,frozen produce,frozen
21615,21615,Organic Baby Arugula,123,4,packaged vegetables fruits,produce
39274,39274,Organic Blueberries,123,4,packaged vegetables fruits,produce
40705,40705,Organic Grape Tomatoes,123,4,packaged vegetables fruits,produce


In [21]:
def get_product_neighbors(model, product_id, num_neighbors=10):
    """Ask ``model`` for the ``num_neighbors`` items most similar to ``product_id``.

    Thin wrapper: forwards to ``model.similar_items(product_id, N=num_neighbors)``
    and returns whatever that call returns.
    """
    return model.similar_items(product_id, N=num_neighbors)

In [22]:
# NOTE(review): `model` is not assigned in any cell visible above this one; it
# presumably comes from an earlier kernel session (an `implicit` model?) — confirm
# and add the cell that builds it, otherwise this fails on Restart & Run All.
items, scores = get_product_neighbors(model, 1)

In [109]:
def calculate_baseline_evaluation(train_df, test_df, N=10):
    """Score a "most popular products" recommender with precision, recall and F1.

    Every user is recommended the same N products: those with the largest
    total ``purchase_count`` in ``train_df``. A recommendation is relevant for
    a user only if the user has that product in ``test_df`` and does not have
    it in ``train_df``.

    Args:
        train_df: DataFrame with ``user_id``, ``product_id`` and
            ``purchase_count`` columns.
        test_df: DataFrame with ``user_id`` and ``product_id`` columns.
        N: number of products recommended to each user.

    Returns:
        Tuple ``(average_precision, average_recall, average_f1_score)``; each
        is the unweighted mean over the users that appear in ``test_df``.
    """
    # The popularity ranking is the same for every user, so compute it once.
    popular_items = train_df.groupby('product_id')['purchase_count'].sum().sort_values(ascending=False).index.tolist()
    recommended_items = popular_items[:N]
    recommended_set = set(recommended_items)

    # Anti-join: keep the (user, product) test pairs that never occur in training.
    # One merge replaces a full boolean scan of both frames for every user.
    test_pairs = test_df[['user_id', 'product_id']].drop_duplicates()
    train_pairs = train_df[['user_id', 'product_id']].drop_duplicates()
    merged = test_pairs.merge(train_pairs, on=['user_id', 'product_id'], how='left', indicator=True)
    new_pairs = merged.loc[merged['_merge'] == 'left_only']
    new_items_by_user = {
        uid: set(products)
        for uid, products in new_pairs.groupby('user_id')['product_id']
    }

    user_precision = []
    user_recall = []
    user_f1_score = []
    for user_id in test_df['user_id'].unique():
        # products in the test set that the user never bought in the training set
        actual_items = new_items_by_user.get(user_id, set())

        # number of relevant items among the recommendations
        relevant_items_recommended = len(recommended_set & actual_items)

        precision = relevant_items_recommended / len(recommended_items) if recommended_items else 0
        user_precision.append(precision)

        # users with no new products contribute a recall of 0
        recall = relevant_items_recommended / len(actual_items) if actual_items else 0
        user_recall.append(recall)

        f1_score = 0
        if precision + recall > 0:
            f1_score = 2 * (precision * recall) / (precision + recall)
        user_f1_score.append(f1_score)

    # average each metric across all users
    average_precision = np.mean(user_precision)
    average_recall = np.mean(user_recall)
    average_f1_score = np.mean(user_f1_score)

    return average_precision, average_recall, average_f1_score

In [110]:
# Evaluate the popularity baseline with the default N=10 recommendations per user.
baseline_precision, baseline_recall, baseline_f1_score = calculate_baseline_evaluation(train_df, test_df)

In [105]:
def calculate_model_evaluation(train_df, test_df, N=10):
    """Score the notebook's ``recommend`` function with precision, recall and F1.

    For every user in ``test_df`` the top-N products from ``recommend`` are
    compared with the products the user has in ``test_df`` but not in
    ``train_df``.

    Args:
        train_df: DataFrame with ``user_id`` and ``product_id`` columns.
        test_df: DataFrame with ``user_id`` and ``product_id`` columns.
        N: number of recommendations requested per user.

    Returns:
        Tuple ``(average_precision, average_recall, average_f1_score)``; each
        is the unweighted mean over the users that appear in ``test_df``.
    """
    # Anti-join: keep the (user, product) test pairs that never occur in training.
    # One merge replaces a full boolean scan of both frames for every user.
    test_pairs = test_df[['user_id', 'product_id']].drop_duplicates()
    train_pairs = train_df[['user_id', 'product_id']].drop_duplicates()
    merged = test_pairs.merge(train_pairs, on=['user_id', 'product_id'], how='left', indicator=True)
    new_pairs = merged.loc[merged['_merge'] == 'left_only']
    new_items_by_user = {
        uid: set(products)
        for uid, products in new_pairs.groupby('user_id')['product_id']
    }

    user_precision = []
    user_recall = []
    user_f1_score = []
    for user_id in test_df['user_id'].unique():
        # products in the test set that the user never bought in the training set
        actual_items = new_items_by_user.get(user_id, set())

        # NOTE(review): recommend() is called with only the user id and N, so nothing
        # here removes products the user already bought in training, while relevance
        # only counts never-before-bought products — consider filtering them out.
        recommended_items = recommend(user_id, N=N)

        # number of relevant items among the recommendations
        relevant_items_recommended = len(set(recommended_items) & actual_items)

        precision = relevant_items_recommended / len(recommended_items) if len(recommended_items) > 0 else 0
        user_precision.append(precision)

        # users with no new products contribute a recall of 0
        recall = relevant_items_recommended / len(actual_items) if actual_items else 0
        user_recall.append(recall)

        f1_score = 0
        if precision + recall > 0:
            f1_score = 2 * (precision * recall) / (precision + recall)
        user_f1_score.append(f1_score)

    # average each metric across all users
    average_precision = np.mean(user_precision)
    average_recall = np.mean(user_recall)
    average_f1_score = np.mean(user_f1_score)

    return average_precision, average_recall, average_f1_score

In [106]:
# Evaluate the SVD-based recommend() with the default N=10 recommendations per user.
model_precision, model_recall, model_f1_score = calculate_model_evaluation(train_df, test_df)

In [111]:
# Report baseline vs. model metrics as percentages.
print("baseline precision: {:.2f}%, model precision: {:.2f}% ".format(baseline_precision * 100, model_precision * 100))
print("baseline recall: {:.2f}%, model recall: {:.2f}% ".format(baseline_recall * 100, model_recall * 100))
# The f1 line now follows the same "<value>%" pattern as the two lines above
# (it used to omit the % after the baseline value and end with a stray " %").
print("baseline f1 score: {:.2f}%, model f1 score: {:.2f}% ".format(baseline_f1_score * 100, model_f1_score * 100))

baseline precision: 1.31%, model precision: 0.55% 
baseline recall: 2.64%, model recall: 1.32% 
baseline f1 score: 1.52, model f1 score: 0.67% %
