In [1]:
# This notebook was created to do validation on collaboration filtering models
# It relies on a data extract in certain formats to operate, although there is a mock data generator
#
# There are 2 main functions:
#
# ** collab_recall_validation_tester()
# Input: customer purchase data, testing prarameters
# K-fold will be used for validation, using the (k-1)/k of the customer purcahse data to build a co-occurrence
# matrix and validating with 1/k of the data. The 1/k of the data will have X number of purchases removed
# for each customer and try to recommend the missing purchase using the co-occurrence matrix. The limitation
# is that any users with less than X purchases will not be used to calculate the accurracy.
# Output: Recall Accurracy
# **
#
# ** collab_catalog_validation_tester()
# Input: customer purchase data, testing prarameters
# K-fold will be used for validation, using the (k-1)/k of the customer purcahse data to build a co-occurrence
# matrix and validating with 1/k of the data. The 1/k of the data will keep all the original customer purchases
# and use it to recommend products using the co-occurrence matrix. Taking all the recommended products and 
# dividing it by the total amount of products offered will give us catalog coverage.
# Output: Catalog Coverage
# **
#
# Testing with categories is not implemented yet

In [80]:
import numpy as np
import random
import time

In [3]:
# Create Mock Data

NUM_OF_CUSTOMERS = 150000
NUM_OF_PRODUCTS = 3990
NUM_OF_CATEGORIES = 8000 #level 1 + 1 general
NUM_OF_CATEGORY_MAXMAPPINGS = 5
NUM_OF_DEMOREGION = (9+1+1)  # 9 us regions + 1 international + 1 unknown, 
NUM_OF_DEMOGENDER = (2+1) # 2 genders + 1 unknown
    
# create symmetric matrix
def random_symmetric_matrix(n,low,high):
    _R = np.random.randint(low,high,size=n*(n-1)/2)
    P = np.zeros((n,n))
    P[np.triu_indices(n, 1)] = _R
    P[np.tril_indices(n, -1)] = P.T[np.tril_indices(n, -1)]
    return P

# create product to category array
def create_mock_prod_cat(num_prod,num_cat,max_mappings):
    list_cat_mappings = []
    for i in range(max_mappings):
        list_cat_mappings.append(np.c_[np.random.randint(0,num_cat,num_prod),\
                                       np.random.randint(0,num_cat,num_prod),\
                                       np.random.randint(0,num_cat,num_prod),\
                                       np.random.randint(0,num_cat,num_prod),\
                                       np.random.randint(0,num_cat,num_prod)])
    return np.array(list_cat_mappings)

# create customer to demographics array
def create_mock_cust_demo(num_cust,num_demoregion, num_demogender):
    return np.c_[np.random.randint(0,num_demoregion,num_cust),\
                np.random.randint(0,num_demogender,num_cust)]

# create customer x orders
def create_mock_cust_orders(num_cust,num_prod,order_percentage):
    R = np.random.choice([0, 1], size=num_cust*num_prod, p=[1-order_percentage, order_percentage])
    return np.reshape(R,(-1,num_prod))


In [4]:
# Create co-occurrence matrix

def create_cocmatrix(subset_matrix_cust_order):
    rows, cols = subset_matrix_cust_order.shape
    m = np.zeros((cols,cols))
    for i in range(cols):
        t = np.sum(subset_matrix_cust_order[subset_matrix_cust_order[:,i] > 0],axis=0)
        t[i] = 0
        m[i,:] = t
        if i % (cols/5) == 0:
            print "created rows in cooccurrence matrix at row",i
    return m

In [5]:
# initialize all data for collaborative

# product to categories lookup
arr_pxc = create_mock_prod_cat(NUM_OF_PRODUCTS,NUM_OF_CATEGORIES,NUM_OF_CATEGORY_MAXMAPPINGS)
#arr_pxc = np.load('')

print "created product to categories lookup"

# customer to demographic lookup
#arr_cxd = create_mock_cust_demo(NUM_OF_CUSTOMERS,NUM_OF_DEMOREGION,NUM_OF_DEMOGENDER)
arr_cxd = np.load('demo_matrix.npy')

print "created customer to demographic lookup"

# customer to order lookup
#matrix_co = create_mock_cust_orders(NUM_OF_CUSTOMERS,NUM_OF_PRODUCTS,0.01)
matrix_co = np.load('cust_item_matrix.npy')

print "created customer to order lookup"


created product to categories lookup
created customer to demographic lookup
created customer to order lookup


In [57]:
# print statistics
print "\nproduct x lookup shape:",arr_pxc.shape

print "\ncustomer x demographic shape:",arr_cxd.shape
print "customer regions + unknown:",len(np.unique(arr_cxd[:,0]))
for num, name in enumerate(np.unique(arr_cxd[:,0])):
    print "\tregion",name,"% of pop",len(np.where(arr_cxd[:,0] == name)[0])/float(arr_cxd.shape[0])*100
print "customer genders + unknown:",len(np.unique(arr_cxd[:,1]))
for num, name in enumerate(np.unique(arr_cxd[:,1])):
    print "\tgender",name,"% of pop",len(np.where(arr_cxd[:,1] == name)[0])/float(arr_cxd.shape[0])*100
    
print "\ncustomer x purchases shape:",matrix_co.shape
print "if already normalized the stats should be the same"
print "not normalized"
stat_sum_not_normal = np.sum(matrix_co,axis=1)
print "customer average purchases (multiples of item allowed):",np.mean(stat_sum_not_normal)
print "customer variance purchases (multiples of item allowed):",np.var(stat_sum_not_normal)
print "customer min purchases (multiples of item allowed):",np.min(stat_sum_not_normal)
print "customer max purchases (multiples of item allowed):",np.max(stat_sum_not_normal)
print "normalized"
stat_sum_normal = np.count_nonzero(matrix_co,axis=1)
print "customer average purchases (multiples of item set to 1):",np.mean(stat_sum_normal)
print "customer variance purchases (multiples of item set to 1):",np.var(stat_sum_normal)
print "customer min purchases (multiples of item set to 1):",np.min(stat_sum_normal)
print "customer max purchases (multiples of item set to 1):",np.max(stat_sum_normal)


product x lookup shape: (5, 3990, 5)

customer x demographic shape: (189559, 2)
customer regions + unknown: 22
	region 0.0 % of pop 0.193079727156
	region 1.0 % of pop 10.3403162076
	region 2.0 % of pop 42.9655146946
	region 3.0 % of pop 8.69280804393
	region 4.0 % of pop 2.49262762517
	region 5.0 % of pop 14.3607003624
	region 6.0 % of pop 1.45653859748
	region 7.0 % of pop 4.01194351099
	region 8.0 % of pop 3.41318534071
	region 9.0 % of pop 11.9308500256
	region 10.0 % of pop 0.0949572428637
	region 11.0 % of pop 0.00474786214318
	region 12.0 % of pop 0.0290147130972
	region 13.0 % of pop 0.00896818404824
	region 15.0 % of pop 0.000527540238132
	region 18.0 % of pop 0.00105508047626
	region 20.0 % of pop 0.000527540238132
	region 21.0 % of pop 0.000527540238132
	region 22.0 % of pop 0.000527540238132
	region 28.0 % of pop 0.000527540238132
	region 30.0 % of pop 0.000527540238132
	region 35.0 % of pop 0.000527540238132
customer genders + unknown: 3
	gender 0.0 % of pop 8.54826201869

In [95]:
# get category filter
# matrix_cat = list of product x category matrixes
# l_catid = categories interested in in list [lvl1,lvl2,lvl3,lvl4,lvl5]
def get_catarray(matrix_cat,l_catid):
    catarray = np.zeros(matrix_cat.shape[1])
    # iterate through different levels
    for i in range(len(matrix_cat)):
        #print "layer",i
        sub_matrix = matrix_cat[i]
        array_holder = np.arange(matrix_cat.shape[1])
        for l in range(len(l_catid)):
            array_holder = np.intersect1d(np.where(sub_matrix[:,l] == l_catid[l]),array_holder)
            #print np.where(sub_matrix[:,i] == l_catid[l])
        catarray[array_holder] = 1
    return catarray

In [96]:
# generate recommendations
# purchase_vec = customer purchases (vector)
# cocm = co occurrence matrix [items x items]
# num_rec = number of recommendations
# custid = customer id you are recommending for
# list_catid = category customer is interested in
# mpxc = matrix of products to categories [level x products x categories]
def gen_recom(purchase_vec,cocm,num_rec,list_catid,mpxc):
    #print "generating",num_rec,"recommendations for customer in category",list_catid
    
    # find purchase asins/indexes
    #print "purchase indexes/values:"
    purchase_prodid = np.where(purchase_vec > 0)[0] 
    #print purchase_prodid
    
    # add up co occurrence rows 
    #print "summed co-occurence vector:"
    rowsum = np.zeros(purchase_vec.shape[0])
    for p in purchase_prodid:
        rowsum += cocm[p]
    #print rowsum
    
    # remove not in category
    #print "remove not in category"
    # do nothing if category 0 which is general
    #if len(list_catid) != 0:
    #    catarray = np.zeros(purchase_vec.shape[0])
    #    catarray[get_catarray(mpxc,list_catid)] = 1
    #    rowsum = np.multiply(rowsum,catarray)
    #else:
    #    print "no category removed general"
    #print rowsum
    
    # remove already purchased items
    #print "removed already purchased items:"
    rowsum = np.multiply(rowsum,1-np.where(purchase_vec > 0, 1,0))
    #print rowsum
    
    # find largest indexes
    #toprec = np.argsort(rowsum)[-1 * num_rec:][::-1]
    indices = np.nonzero(rowsum)[0]
    toprec = indices[np.argsort(rowsum[indices])][-1 * num_rec:][::-1]
    #print "top",num_rec,"recommendation indexes:"
    #print toprec
    #print "top",num_rec,"recommendation indexes scores:"
    #print rowsum[toprec]
    
    return toprec


In [45]:
from sklearn.model_selection import KFold

In [99]:
# put matrixes in to get cross validation recall
# generate recommendations
# mpxc = matrix of products to categories [level x products x categories]
# mco = matrix of customer to order [customer x items]
# num_rec = number of recommendations
# num_folds = k number of folds for cross validation
# recall_remove = removed number from purchase history
# list_catid = category customer is interested in
def collab_recall_validation_tester(mpxc,mco,num_rec,num_folds,recall_remove,list_catid):
    # cross validation n-folds
    start_time = time.time()
    kf = KFold(n_splits=num_folds)
    list_total_acc=[]
    sum = 0
    k_index = 0
    for train, test in kf.split(mco):
        print "------------------------------------start new k=",k_index,\
        ",train set=",mco[train].shape[0],\
        ",validation set=",mco[test].shape[0]
        list_sub_acc = []
        # build co-occurrence matrix
        coo_matrix = create_cocmatrix(mco[train])

        # loop through test set
        # for each customer in test pool
        for i in range(mco[test].shape[0]):
            
            #print "customer",i
            # purchase vector
            #print "purchase vector"
            p_vec = mco[test][i,:]
            #print p_vec
            # determine purchase indexes
            #print "purchase indexes"
            list_original_purchases = np.where(p_vec > 0)[0]
            #print list_original_purchases
            
            # only run tests on customers with > recall_remove purchases for prediction
            if len(list_original_purchases) > recall_remove:
                # randomly select indexes to leave out
                #print "remove purchase indexes"
                #list_removed_purchases = np.random.choice(list_original_purchases,size=recall_remove, replace=False)
                list_removed_purchases = random.sample(list_original_purchases,recall_remove)
                #print list_removed_purchases
                # remove
                list_modified_purchases = list(set(list_original_purchases) - set(list_removed_purchases))
                #print "modified purchase vector"
                p_vec_mod = np.zeros(p_vec.shape[0])
                p_vec_mod[list_modified_purchases] = 1
                #print p_vec_mod
                # get sum all purchases except ones left out
                list_summed_coo_vec = gen_recom(p_vec_mod,coo_matrix,num_rec,list_catid,mpxc)
                #print list_summed_coo_vec
                # check if return recommendations are in list
                list_recommended_match = set(list_removed_purchases) & set(list_summed_coo_vec)
                acc = len(list_recommended_match)/float(len(list_removed_purchases))
                #print "predicted ratio",accuracy
                list_sub_acc.append(acc)
                #print "end customer",i

                #if i % 500 == 0:
                # sanity check to make sure not all recommendations are #1
                #if (list_removed_purchases[0] != list_summed_coo_vec[0]) & (acc == 1):
                #    print "\t--customer example index",i
                #    print "\toriginal purchased indexes",list_original_purchases
                #    print "\tremoved index",list_removed_purchases
                #    print "\toriginal modified purchase indexes",np.where(p_vec_mod > 0)[0]
                #    print "\tpredicted indexes",list_summed_coo_vec     
                #    print "\tpredicted accuracy ratio",acc
                #    print "\t--customer end example index",i
            #else:
                #print "skipped due to length of purchases",list_original_purchases,\
                #"less than recall remove",recall_remove
        mean_sub_acc = np.mean(list_sub_acc)
        print "** number of elements in calculation",len(list_sub_acc)
        #print list_sub_acc
        print "** average fold accuracy",mean_sub_acc
        list_total_acc.append(mean_sub_acc)
        k_index += 1
        print "**time elapsed",(time.time() - start_time)
        print "------------------------------------end"
    print "list of total accuracy for each fold",list_total_acc
    average = np.mean(list_total_acc)
    print num_folds,"-fold total average",average
    print "time elapsed",(time.time() - start_time)
    return average

In [100]:
#collab_recall_validation_tester(arr_pxc,matrix_co[0:5000,:],10,10,1,[])
collab_recall_validation_tester(arr_pxc,matrix_co[np.random.randint(0,matrix_co.shape[0],25000)],10,10,1,[])


------------------------------------start new k= 0 ,train set= 22500 ,validation set= 2500
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 688
** average fold accuracy 0.640988372093
**time elapsed 90.3330118656
------------------------------------end
------------------------------------start new k= 1 ,train set= 22500 ,validation set= 2500
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 725
** average fold accuracy 0.608275862069
**time elapsed 187.806768894
------------------------------------end
------------------------------------start new 

0.62478323062231755

In [101]:
# put matrixes in to get cross validation coverage
# generate recommendations
# mpxc = matrix of products to categories [level x products x categories]
# mco = matrix of customer to order [customer x items]
# num_rec = number of recommendations
# num_folds = k number of folds for cross validation
# recall_remove = removed number from purchase history
# list_catid = category customer is interested in
def collab_catalog_validation_tester(mpxc,mco,num_rec,num_folds,recall_remove,list_catid):
    # cross validation n-folds
    start_time = time.time()
    kf = KFold(n_splits=num_folds)
    list_total_coverage=[]
    sum = 0
    k_index = 0
    for train, test in kf.split(mco):
        set_products = set()
        # build co-occurrence matrix
        print "------------------------------------start new k=",k_index,\
        ",train set=",mco[train].shape[0],\
        ",validation set=",mco[test].shape[0]
        #coo_matrix = random_symmetric_matrix(mco.shape[1],0,10)
        coo_matrix = create_cocmatrix(mco[train])

        # loop through test set
        # for each customer in test pool
        for i in range(mco[test].shape[0]):
            #print "customer",i
            # purchase vector
            #print "purchase vector"
            p_vec = mco[test][i,:]
            # get sum all purchases
            list_summed_coo_vec = gen_recom(p_vec,coo_matrix,p_vec.shape[0],list_catid,mpxc)    
            set_products = set_products | set(list_summed_coo_vec)
            
            if i % 500 == 0:
                print "\t--customer example index",i
                print "\treturn length",len(list_summed_coo_vec)
                print "\t--customer end example index",i
            
        mean_sub_cov = float(len(set_products))/p_vec.shape[0]
        print "**length fold set",float(len(set_products))
        print "**average fold coverage",mean_sub_cov
        list_total_coverage.append(mean_sub_cov)
        k_index += 1
        print "**time elapsed",(time.time() - start_time)
        print "------------------------------------end"
    
    print "list of total coverage for each fold", list_total_coverage
    average = np.mean(list_total_coverage)
    print num_folds,"-fold total average",average
    print "time elapsed",(time.time() - start_time)
    return average
        

In [102]:
#collab_catalog_validation_tester(arr_pxc,matrix_co[0:5000,:],10,10,1,[])
collab_catalog_validation_tester(arr_pxc,matrix_co[np.random.randint(0,matrix_co.shape[0],25000)],10,10,1,[])

------------------------------------start new k= 0 ,train set= 22500 ,validation set= 2500
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
	--customer example index 0
	return length 107
	--customer end example index 0
	--customer example index 500
	return length 85
	--customer end example index 500
	--customer example index 1000
	return length 8
	--customer end example index 1000
	--customer example index 1500
	return length 66
	--customer end example index 1500
	--customer example index 2000
	return length 69
	--customer end example index 2000
**length fold set 980.0
**average fold coverage 0.245614035088
**time elapsed 92.9679238796
------------------------------------end
------------------------------------start new k= 1 ,train set= 22500 ,validation set= 2500
created rows in cooccurrence matrix at

	--customer example index 500
	return length 41
	--customer end example index 500
	--customer example index 1000
	return length 23
	--customer end example index 1000
	--customer example index 1500
	return length 68
	--customer end example index 1500
	--customer example index 2000
	return length 14
	--customer end example index 2000
**length fold set 1002.0
**average fold coverage 0.251127819549
**time elapsed 888.252238989
------------------------------------end
list of total coverage for each fold [0.24561403508771928, 0.25012531328320803, 0.24686716791979949, 0.24937343358395989, 0.24887218045112783, 0.24862155388471177, 0.24285714285714285, 0.24461152882205514, 0.24711779448621554, 0.2511278195488722]
10 -fold total average 0.247518796992
time elapsed 888.259826899


0.24751879699248119