# Recommendation System PART 1 - Use of Collaborative Filtering in Retail using LightFM library on Instacart Dataset

In [1]:
import pandas as pd # pandas for data manipulation
import numpy as np # numpy for sure
from scipy.sparse import coo_matrix # for constructing sparse matrix
# lightfm 
from lightfm import LightFM # model
from lightfm.evaluation import auc_score

# timing
import time



In [60]:
# Load the Instacart CSV files (paths are relative to the working directory).
# orders holds one row per order together with its user and its eval_set flag;
# the two order_products__* files list the products contained in each order.
aisles = pd.read_csv('aisles.csv')
departments = pd.read_csv('departments.csv')
orders = pd.read_csv('orders.csv')
order_products__prior = pd.read_csv('order_products__prior.csv')
order_products__train = pd.read_csv('order_products__train.csv')
products = pd.read_csv('products.csv')

In [61]:
aisles.head()

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [62]:
departments.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [63]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [64]:
order_products__prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [65]:
order_products__train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [66]:
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [9]:
# Drop the placeholder categories "missing" and "other" from both lookup
# tables so they are not turned into item features later on.
# isin() performs the membership test in one vectorised pass instead of
# calling a Python lambda once per row.
placeholder_names = ["missing", "other"]

aisles = aisles[~aisles["aisle"].isin(placeholder_names)]
departments = departments[~departments["department"].isin(placeholder_names)]

In [75]:
# orders["eval_set"].value_counts()
orders.groupby('eval_set')['user_id'].nunique()

eval_set
prior    206209
test      75000
train    131209
Name: user_id, dtype: int64

In [10]:
def get_user_list(df, user_column):
    """

    Return the distinct users of a dataframe in ascending order.

    df          : dataframe to read the users from
    user_column : name of the column of df that holds the users

    """

    distinct_users = df[user_column].unique()
    distinct_users = np.sort(distinct_users)

    return distinct_users

def get_item_list(df, item_name_column):
    """

    Return the distinct items of a dataframe.

    df               : dataframe to read the items from
    item_name_column : name of the column of df that holds the items

    A single array of unique values is returned, in the order in which the
    values first appear in the column (no sorting is applied).

    """

    return df[item_name_column].unique()

def get_feature_list(aisle_df, department_df, aisle_name_column, department_name_column):
    """

    Return the distinct feature names: aisle names followed by department names.

    aisle_df               : dataframe holding the aisles
    department_df          : dataframe holding the departments
    aisle_name_column      : column of aisle_df that holds the aisle names
    department_name_column : column of department_df that holds the department names

    A name that occurs both as an aisle and as a department appears only once.

    """

    feature_sources = [aisle_df[aisle_name_column], department_df[department_name_column]]
    all_features = pd.concat(feature_sources, ignore_index = True)

    return all_features.unique()

# creating user_id, item_id, and features_id

def id_mappings(user_list, item_list, feature_list):
    """

    Build forward and reverse index lookups for users, items and features.

    Every entry of a list is numbered by its position in that list. For each
    of the three lists two dictionaries are produced: value -> position and
    position -> value.

    Returns the six dictionaries in the order
    (user -> index, index -> user, item -> index, index -> item,
     feature -> index, index -> feature).

    """
    user_to_index_mapping = {user_id: position for position, user_id in enumerate(user_list)}
    index_to_user_mapping = {position: user_id for position, user_id in enumerate(user_list)}

    item_to_index_mapping = {item_id: position for position, item_id in enumerate(item_list)}
    index_to_item_mapping = {position: item_id for position, item_id in enumerate(item_list)}

    feature_to_index_mapping = {feature_id: position for position, feature_id in enumerate(feature_list)}
    index_to_feature_mapping = {position: feature_id for position, feature_id in enumerate(feature_list)}

    return (user_to_index_mapping, index_to_user_mapping,
            item_to_index_mapping, index_to_item_mapping,
            feature_to_index_mapping, index_to_feature_mapping)


def _count_user_product_purchases(orders_df, order_products_df, products_df, eval_set):
    """
    Count how often each user bought each product in the orders of one eval_set.

    Returns a dataframe with the columns user_id, product_name and
    product_count, one row per (user, product) pair.
    """
    # keep only the orders that carry the requested eval_set flag
    user_orders = orders_df.loc[orders_df["eval_set"] == eval_set, ["user_id", "order_id"]]

    # attach the purchased products, then their names; the join keys are spelled
    # out so that an unrelated column shared by two frames cannot become a key
    purchases = user_orders.merge(order_products_df[["order_id", "product_id"]], on = "order_id")
    purchases = purchases.merge(products_df[["product_id", "product_name"]], on = "product_id")

    # one row per purchase, so the group size is the purchase count
    return purchases.groupby(["user_id", "product_name"]).size().reset_index(name = "product_count")


def get_user_product_interaction(orders_df, order_products_train_df, order_products_test_df, products_df):
    """
    Build the user/product rating tables used for training and testing.

    The rating of a (user, product) pair is the number of times the user
    bought the product.

    orders_df               : orders with the columns user_id, order_id, eval_set
    order_products_train_df : products of the orders flagged eval_set == "prior"
                              (these orders form the training data)
    order_products_test_df  : products of the orders flagged eval_set == "train"
                              (these orders form the test data)
    products_df             : products with the columns product_id, product_name

    Returns (user_to_product_rating_train, user_to_product_rating_test), both
    with the columns user_id, product_name, product_count. The test counts
    include the purchases already recorded for the same pair in the training
    table.
    """
    user_to_product_rating_train = _count_user_product_purchases(
        orders_df, order_products_train_df, products_df, "prior")
    user_to_product_rating_test = _count_user_product_purchases(
        orders_df, order_products_test_df, products_df, "train")

    # bring in the training count of every test pair; pairs never seen in
    # training have no match and therefore contribute 0
    previous_counts = user_to_product_rating_train.rename(columns = {"product_count" : "previous_product_count"})
    user_to_product_rating_test = user_to_product_rating_test.merge(
        previous_counts, how = "left", on = ["user_id", "product_name"])

    # vectorised column addition: same result as a row-wise apply, but it does
    # not call Python once per row and it also works on an empty table
    user_to_product_rating_test["product_count"] = (
        user_to_product_rating_test["product_count"]
        + user_to_product_rating_test["previous_product_count"].fillna(0)
    )
    user_to_product_rating_test = user_to_product_rating_test.drop(columns = ["previous_product_count"])

    return user_to_product_rating_train, user_to_product_rating_test

def get_interaction_matrix(df, df_column_as_row, df_column_as_col, df_column_as_value, row_indexing_map, 
                          col_indexing_map):
    """

    Turn a long-format dataframe into a sparse COO matrix.

    df                 : dataframe with one row per matrix entry
    df_column_as_row   : column of df whose values select the matrix row
    df_column_as_col   : column of df whose values select the matrix column
    df_column_as_value : column of df that holds the entry values
    row_indexing_map   : lookup from a row value to its row position
    col_indexing_map   : lookup from a column value to its column position

    The matrix has one row per entry of row_indexing_map and one column per
    entry of col_indexing_map. A value that is absent from its lookup raises
    a KeyError.

    """

    n_rows, n_cols = len(row_indexing_map), len(col_indexing_map)

    # translate the raw identifiers into matrix positions
    row_positions = df[df_column_as_row].map(row_indexing_map.__getitem__).values
    col_positions = df[df_column_as_col].map(col_indexing_map.__getitem__).values
    cell_values = df[df_column_as_value].values

    return coo_matrix((cell_values, (row_positions, col_positions)), shape = (n_rows, n_cols))

def get_product_feature_interaction(product_df, aisle_df, department_df, aisle_weight = 1, department_weight = 1):
    """
    Build the long-format product/feature table.

    Every product receives two features: the name of its aisle and the name
    of its department.

    product_df        : products with the columns product_name, aisle_id, department_id
    aisle_df          : aisles with the columns aisle_id, aisle
    department_df     : departments with the columns department_id, department
    aisle_weight      : value stored for a (product, aisle) pair
    department_weight : value stored for a (product, department) pair

    Returns a dataframe with the columns product_name, feature, feature_count.
    Products whose aisle or department is absent from aisle_df / department_df
    are dropped by the inner joins. If the aisle and the department of a
    product share the same name, their weights are summed into a single row.
    """
    # join keys are named explicitly so that an unrelated column shared by
    # two of the frames cannot silently become part of the join
    item_feature_df = product_df.merge(aisle_df, on = "aisle_id").merge(department_df, on = "department_id")
    item_feature_df = item_feature_df[["product_name", "aisle", "department"]]

    # put aisle and department into one common "feature" column, each with its weight
    product_aisle_df = item_feature_df[["product_name", "aisle"]]\
        .rename(columns = {"aisle" : "feature"})\
        .assign(feature_count = aisle_weight)
    product_department_df = item_feature_df[["product_name", "department"]]\
        .rename(columns = {"department" : "feature"})\
        .assign(feature_count = department_weight)

    product_feature_df = pd.concat([product_aisle_df, product_department_df], ignore_index = True)

    # collapse duplicate (product, feature) pairs by summing their weights
    return product_feature_df.groupby(["product_name", "feature"], as_index = False)["feature_count"].sum()


#=======================
    # converting to coo_matrix
    
#     row = product_feature_df["product_name"].values
#     col = product_feature_df["feature"].values
#     value = product_feature_df["feature_count"].values
    
#     return coo_matrix((value, (row, col)))
    

### Start

In [49]:
# create the user, item, feature lists
users = get_user_list(orders, "user_id")
items = get_item_list(products, "product_name")
features = get_feature_list(aisles, departments, "aisle", "department")


In [12]:
users

array([     1,      2,      3, ..., 206207, 206208, 206209])

In [13]:
items

array(['Chocolate Sandwich Cookies', 'All-Seasons Salt',
       'Robust Golden Unsweetened Oolong Tea', ..., 'Artisan Baguette',
       'Smartblend Healthy Metabolism Dry Cat Food',
       'Fresh Foaming Cleanser'], dtype=object)

In [14]:
features

array(['prepared soups salads', 'specialty cheeses',
       'energy granola bars', 'instant foods',
       'marinades meat preparation', 'packaged meat', 'bakery desserts',
       'pasta sauce', 'kitchen supplies', 'cold flu allergy',
       'fresh pasta', 'prepared meals', 'tofu meat alternatives',
       'packaged seafood', 'fresh herbs', 'baking ingredients',
       'bulk dried fruits vegetables', 'oils vinegars', 'oral hygiene',
       'packaged cheese', 'hair care', 'popcorn jerky', 'fresh fruits',
       'soap', 'coffee', 'beers coolers', 'red wines',
       'honeys syrups nectars', 'latino foods', 'refrigerated',
       'packaged produce', 'kosher foods', 'frozen meat seafood',
       'poultry counter', 'butter', 'ice cream ice', 'frozen meals',
       'seafood counter', 'dog food care', 'cat food care',
       'frozen vegan vegetarian', 'buns rolls', 'eye ear care',
       'candy chocolate', 'mint gum', 'vitamins supplements',
       'breakfast bars pastries', 'packaged poultry

In [15]:
# generate mapping, LightFM library can't read other than (integer) index
user_to_index_mapping, index_to_user_mapping, \
           item_to_index_mapping, index_to_item_mapping, \
           feature_to_index_mapping, index_to_feature_mapping = id_mappings(users, items, features)

In [16]:
# build the user/product rating tables: how many times each user bought each product.
# orders flagged "prior" feed the training table (their products come from order_products__prior),
# orders flagged "train" feed the test table (their products come from order_products__train).
user_to_product_rating_train, user_to_product_rating_test = get_user_product_interaction(orders, order_products__prior, 
                                                                                        order_products__train, products)

In [17]:
# create product and feature interaction matrix
product_to_feature = get_product_feature_interaction(product_df = products, 
                                                     aisle_df = aisles, 
                                                     department_df = departments,
                                                     aisle_weight=1, 
                                                     department_weight=1)

In [18]:
user_to_product_rating_train.head()

Unnamed: 0,user_id,product_name,product_count
0,1,0% Greek Strained Yogurt,1
1,1,Aged White Cheddar Popcorn,2
2,1,Bag of Organic Bananas,2
3,1,Bartlett Pears,1
4,1,Cinnamon Toast Crunch,3


In [19]:
user_to_product_rating_test.head()

Unnamed: 0,user_id,product_name,product_count
0,1,0% Greek Strained Yogurt,2.0
1,1,Aged White Cheddar Popcorn,3.0
2,1,Cinnamon Toast Crunch,4.0
3,1,Milk Chocolate Almonds,2.0
4,1,Organic Half & Half,3.0


In [20]:
product_to_feature.head()

Unnamed: 0,product_name,feature,feature_count
0,#2 Coffee Filters,beverages,1
1,#2 Coffee Filters,coffee,1
2,#2 Cone White Coffee Filters,beverages,1
3,#2 Cone White Coffee Filters,coffee,1
4,#2 Mechanical Pencils,household,1


In [21]:
del aisles 
del departments 
del orders 
del order_products__prior 
del order_products__train 
del products 

In [22]:
# generate user_item_interaction_matrix for train data
user_to_product_interaction_train = get_interaction_matrix(user_to_product_rating_train, "user_id", 
                                                    "product_name", "product_count", user_to_index_mapping, item_to_index_mapping)

# generate user_item_interaction_matrix for test data
user_to_product_interaction_test = get_interaction_matrix(user_to_product_rating_test, "user_id", 
                                                    "product_name", "product_count", user_to_index_mapping, item_to_index_mapping)

# generate item_to_feature interaction
product_to_feature_interaction = get_interaction_matrix(product_to_feature, "product_name", "feature",  "feature_count", 
                                                        item_to_index_mapping, feature_to_index_mapping)

In [23]:
user_to_product_interaction_train

<206209x49688 sparse matrix of type '<class 'numpy.int64'>'
	with 13307953 stored elements in COOrdinate format>

In [24]:
user_to_product_interaction_test

<206209x49688 sparse matrix of type '<class 'numpy.float64'>'
	with 1384617 stored elements in COOrdinate format>

In [25]:
product_to_feature_interaction

<49688x151 sparse matrix of type '<class 'numpy.int64'>'
	with 95764 stored elements in COOrdinate format>

### Applying LightFM Cross Validation

Using pure collaborative filtering: only the user-product interactions are used, no item features are taken into account.

In [26]:
# initialising model with warp loss function
model_without_features = LightFM(loss = "warp")

In [27]:
# fitting into user to product interaction matrix only / pure collaborative filtering factor

start = time.time()
#===================

model_without_features.fit(user_to_product_interaction_train,
          user_features=None, 
          item_features=None, 
          sample_weight=None, 
          epochs=1, 
          num_threads=4,
          verbose=False)

#===================
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

time taken = 20.55 seconds


In [28]:
# auc metric score (ranging from 0 to 1), computed per user on the test interactions
# for the model that was trained without item features.
# NOTE(review): no train_interactions are passed here, whereas the evaluation of the
# feature-based model further down does pass them. The two AUC figures are therefore
# not computed under the same conditions - confirm which setup is intended before
# comparing them.

start = time.time()
#===================

auc_without_features = auc_score(model = model_without_features, 
                        test_interactions = user_to_product_interaction_test,
                        num_threads = 4, check_intersections = False)
#===================
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

time taken = 250.76 seconds


In [29]:
print("average AUC without adding item-feature interaction = {0:.{1}f}".format(auc_without_features.mean(), 2))

average AUC without adding item-feature interaction = 0.95


Adding item features: a hybrid model that combines collaborative filtering with the aisle and department of each product.

In [30]:
# initialising model with warp loss function
model_with_features = LightFM(loss = "warp")

In [31]:
# fitting the model with hybrid collaborative filtering + content based (product + features)
start = time.time()
#===================


model_with_features.fit(user_to_product_interaction_train,
          user_features=None, 
          item_features=product_to_feature_interaction, 
          sample_weight=None, 
          epochs=1, 
          num_threads=4,
          verbose=False)

#===================
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

time taken = 20.37 seconds


In [32]:
start = time.time()
#===================
auc_with_features = auc_score(model = model_with_features, 
                        test_interactions = user_to_product_interaction_test,
                        train_interactions = user_to_product_interaction_train, 
                        item_features = product_to_feature_interaction,
                        num_threads = 4, check_intersections=False)
#===================
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))



# model, test_interactions, train_interactions=None, 
# user_features=None, item_features=None, preserve_rows=False, num_threads=1, check_intersections=True

time taken = 385.54 seconds


In [33]:
# report the mean AUC of the model that was trained WITH item features
print("average AUC with item-feature interaction = {0:.{1}f}".format(auc_with_features.mean(), 2))

average AUC without adding item-feature interaction = 0.80


In [58]:
user_to_product_interaction_train.shape

(206209, 49688)

### Requesting Products / Items Recommendation

To produce recommendations, we combine the training and test sets and retrain the model on all available data.

In [34]:
def combined_train_test(train, test):
    """

    Merge the train and test interaction matrices into one matrix.

    train : COO matrix with the earlier rating/number_of_order of the users
    test  : COO matrix with the more recent rating/number_of_order of the users

    Every cell of the result holds the larger of the train value and the
    test value; a cell that is stored in only one of the two matrices keeps
    that value. The result is a COO matrix with the shape of train.

    The inputs are expected to hold non-negative counts with at most one
    stored entry per cell. Stored entries that share a cell are summed
    before the comparison, and negative values are compared against the
    implicit zeros of the other matrix.

    """
    # re-declare test with the shape of train so both operands line up
    test_aligned = coo_matrix((test.data, (test.row, test.col)), shape = train.shape)

    # element-wise maximum done by scipy instead of a Python loop over
    # every stored entry
    combined = train.tocsr().maximum(test_aligned.tocsr())

    return combined.tocoo()

In [35]:
user_to_product_interaction = combined_train_test(user_to_product_interaction_train, 
                                                 user_to_product_interaction_test)

In [36]:
user_to_product_interaction

<206209x49688 sparse matrix of type '<class 'numpy.float64'>'
	with 13863746 stored elements in COOrdinate format>

In [37]:
# retraining the final model with combined dataset

final_model = LightFM(loss = "warp")

# fitting to combined dataset with pure collaborative filtering result

start = time.time()
#===================

final_model.fit(user_to_product_interaction,
          user_features=None, 
          item_features=None, 
          sample_weight=None, 
          epochs=1, 
          num_threads=4,
          verbose=False)

#===================
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

time taken = 22.66 seconds


### Printing Out Recommendations for user 2 and user 10 (sample)

In [38]:
class recommendation_sampling:
    """
    Print sample recommendations of a fitted model for single users.

    model                              : fitted model exposing predict(user_ids, item_ids)
    items                              : array of item names, positioned by item index
    user_to_product_interaction_matrix : sparse user x item interaction matrix
    user2index_map                     : lookup from a user id to its row index

    NOTE: the default values of items, user_to_product_interaction_matrix and
    user2index_map are the notebook-level objects as they exist at the moment
    this class is defined; rebinding those names afterwards does not change
    the defaults.
    """
    
    def __init__(self, model, items = items, user_to_product_interaction_matrix = user_to_product_interaction, 
                user2index_map = user_to_index_mapping):
        
        self.user_to_product_interaction_matrix = user_to_product_interaction_matrix
        self.model = model
        self.items = items
        self.user2index_map = user2index_map
    
    def recommendation_for_user(self, user, top_n = 3):
        """
        Print known purchases and recommended items for one user.

        user  : user id as used in user2index_map
        top_n : how many known positives and how many recommendations to print

        Nothing is printed and None is returned when the user is unknown.
        Items the user already bought are not removed from the recommendations.
        """
        
        # getting the userindex
        userindex = self.user2index_map.get(user)
        
        # identity check: only a missing user is rejected (index 0 is valid)
        if userindex is None:
            return None
        
        n_items = self.user_to_product_interaction_matrix.shape[1]
        
        # products already bought: the stored columns of the user's row
        user_row = self.user_to_product_interaction_matrix.tocsr()[userindex]
        known_positives = self.items[user_row.indices]
        
        # scores from model prediction, one per item
        scores = self.model.predict(user_ids = [userindex], item_ids = np.arange(n_items))
        
        # items ordered from the highest to the lowest score
        top_items = self.items[np.argsort(-scores)]
        
        # printing out the result
        print("User %s" % user)
        print("     Known positives:")
        
        for x in known_positives[:top_n]:
            print("                  %s" % x)
            
        print("     Recommended:")
        
        for x in top_items[:top_n]:
            print("                  %s" % x)

In [39]:
# giving recommendations
recom = recommendation_sampling(model = final_model)

In [40]:
recom.recommendation_for_user(2)

User 2
     Known positives:
                  Organic Turkey Burgers
                  Wild Albacore Tuna No Salt Added
                  Cherry Pomegranate Greek Yogurt
     Recommended:
                  Organic Strawberries
                  Organic Garlic
                  Organic Baby Spinach


In [41]:
recom.recommendation_for_user(10)

User 10
     Known positives:
                  Cantaloupe
                  Parsley, Italian (Flat), New England Grown
                  Seedless Red Grapes
     Recommended:
                  Bag of Organic Bananas
                  Organic Strawberries
                  Organic Baby Spinach
