In [5]:
# This notebook runs hold-out tests (recall and catalog coverage) on the hybrid recommendation combiner.
# It expects pre-extracted .npy data files to operate; mock data generators are provided below as a stand-in.


In [6]:
import numpy as np
import random
import time

In [7]:
# Create Mock Data

# Dimensions used by the mock data generators below.
NUM_OF_CUSTOMERS = 150000
NUM_OF_PRODUCTS = 3990
# original note: "level 1 + 1 general"
NUM_OF_CATEGORIES = 8000
NUM_OF_CATEGORY_MAXMAPPINGS = 7
# Demographic cardinalities; each one includes an "unknown" bucket.
NUM_OF_DEMOREGION = 9 + 1 + 1  # 9 US regions + 1 international + 1 unknown
NUM_OF_DEMOGENDER = 2 + 1      # 2 genders + 1 unknown
    
# create symmetric matrix
def random_symmetric_matrix(n,low,high):
    """Return an n x n symmetric float matrix with a zero diagonal.

    The strict upper triangle is filled with random integers drawn from
    [low, high) via np.random.randint and then mirrored into the lower
    triangle.

    Parameters
    ----------
    n : int
        Number of rows/columns.
    low, high : int
        Bounds passed to np.random.randint (high is exclusive).

    Returns
    -------
    numpy.ndarray
        Array of shape (n, n) with P[i, j] == P[j, i] and P[i, i] == 0.
    """
    # n*(n-1) is always even, so floor division is exact; unlike "/", it
    # also yields an int under true division, which randint requires for size.
    num_pairs = n * (n - 1) // 2
    _R = np.random.randint(low, high, size=num_pairs)
    P = np.zeros((n, n))
    P[np.triu_indices(n, 1)] = _R
    # copy the upper triangle into the lower one through the transpose
    lower = np.tril_indices(n, -1)
    P[lower] = P.T[lower]
    return P

# create product to category array
def create_mock_prod_cat(num_prod,num_cat,max_mappings):
    """Build a random product-to-category array.

    For each of the max_mappings slots, five columns of random category ids
    in [0, num_cat) are drawn, one row per product. The slots are stacked
    into a single array (shape (max_mappings, num_prod, 5) when
    max_mappings >= 1).
    """
    columns_per_mapping = 5
    mappings = [
        np.column_stack([np.random.randint(0, num_cat, num_prod)
                         for _ in range(columns_per_mapping)])
        for _ in range(max_mappings)
    ]
    return np.array(mappings)

# create customer to demographics array
def create_mock_cust_demo(num_cust,num_demoregion, num_demogender):
    """Return a (num_cust, 2) array of random demographics.

    Column 0 holds a region code in [0, num_demoregion), column 1 a gender
    code in [0, num_demogender).
    """
    region_codes = np.random.randint(0, num_demoregion, num_cust)
    gender_codes = np.random.randint(0, num_demogender, num_cust)
    return np.column_stack((region_codes, gender_codes))

# create customer x orders
def create_mock_cust_orders(num_cust,num_prod,order_percentage):
    """Return a random 0/1 customer-by-product order matrix.

    Each of the num_cust*num_prod cells is 1 with probability
    order_percentage and 0 otherwise; the flat draw is reshaped so that
    every row has num_prod columns.
    """
    probabilities = [1-order_percentage, order_percentage]
    flat_orders = np.random.choice([0, 1], size=num_cust*num_prod, p=probabilities)
    return flat_orders.reshape((-1, num_prod))


In [8]:
# Create co-occurrence matrix

def create_cocmatrix(subset_matrix_cust_order):
    """Build an item-by-item co-occurrence matrix from a customer-by-item matrix.

    Row i of the result is the column-wise sum over every customer row whose
    entry in column i is positive, with the diagonal entry m[i, i] forced to
    zero. Progress is printed roughly five times over the run.

    Parameters
    ----------
    subset_matrix_cust_order : 2-D numpy.ndarray
        Customer-by-item matrix (rows are customers, columns are items).

    Returns
    -------
    numpy.ndarray
        Float array of shape (cols, cols).
    """
    rows, cols = subset_matrix_cust_order.shape
    m = np.zeros((cols,cols))
    # Report progress about five times. The step is clamped to 1 so matrices
    # with fewer than five columns do not trigger a modulo-by-zero.
    progress_step = max(1, cols // 5)
    for i in range(cols):
        bought_item_i = subset_matrix_cust_order[:,i] > 0
        t = np.sum(subset_matrix_cust_order[bought_item_i],axis=0)
        t[i] = 0  # an item does not co-occur with itself
        m[i,:] = t
        if i % progress_step == 0:
            # single pre-formatted string: same text from the Python 2 print
            # statement and the Python 3 print function
            print("created rows in cooccurrence matrix at row %d" % i)
    return m

In [9]:
def create_seed(rand):
    """Seed the random number generators this notebook draws from.

    Both numpy's global generator and the standard-library `random` module
    are seeded: the recall tester picks held-out purchases with
    random.sample, so seeding numpy alone does not make a run repeatable.

    Parameters
    ----------
    rand : int
        Seed value handed to both generators.
    """
    np.random.seed(rand)
    random.seed(rand)

In [10]:
# initialize all data for collaborative

# --- customer -> demographic lookup ---
# alternative sources kept for reference:
#   arr_cxd = create_mock_cust_demo(NUM_OF_CUSTOMERS,NUM_OF_DEMOREGION,NUM_OF_DEMOGENDER)
#   arr_cxd = np.load('demo_matrix.npy')
arr_cxd = np.load('../../data/extracts/cluster1CustmeridsMapping.npy')
print("created customer to demographic lookup")

# --- customer -> order lookup ---
# mock alternative:
#   matrix_co = create_mock_cust_orders(NUM_OF_CUSTOMERS,NUM_OF_PRODUCTS,0.01)
matrix_co = np.load('../../data/extracts/cust_item_matrix.npy')
print("created customer to order lookup")

# --- content rating lookup ---
arr_conrate = np.load('../../data/derived/rating_indexed.npy')
print("created content lookup")


created customer to demographic lookup
created customer to order lookup
created content lookup


In [11]:
# print statistics
# Every message is formatted into a single string before printing, so the
# cell emits the same text under the Python 2 print statement and the
# Python 3 print function.

print("\ncustomer x demographic shape: %s" % (arr_cxd.shape,))
stat_regions = np.unique(arr_cxd[:,0])
print("customer regions + unknown: %s" % (len(stat_regions),))
for name in stat_regions:
    stat_region_share = np.count_nonzero(arr_cxd[:,0] == name)/float(arr_cxd.shape[0])*100
    print("\tregion %s %% of pop %s" % (name, stat_region_share))
# Gender breakdown left disabled: it reads column 1 of arr_cxd, and the lookup
# shape printed by this cell in the saved output has a single column.
#print "customer genders + unknown:",len(np.unique(arr_cxd[:,1]))
#for num, name in enumerate(np.unique(arr_cxd[:,1])):
#    print "\tgender",name,"% of pop",len(np.where(arr_cxd[:,1] == name)[0])/float(arr_cxd.shape[0])*100

print("\ncustomer x purchases shape: %s" % (matrix_co.shape,))
print("if already normalized the stats should be the same")
print("not normalized")
# per-customer totals, counting repeated purchases of the same item
stat_sum_not_normal = np.sum(matrix_co,axis=1)
print("customer average purchases (multiples of item allowed): %s" % (np.mean(stat_sum_not_normal),))
print("customer variance purchases (multiples of item allowed): %s" % (np.var(stat_sum_not_normal),))
print("customer min purchases (multiples of item allowed): %s" % (np.min(stat_sum_not_normal),))
print("customer max purchases (multiples of item allowed): %s" % (np.max(stat_sum_not_normal),))
print("normalized")
# per-customer number of distinct items bought
stat_sum_normal = np.count_nonzero(matrix_co,axis=1)
print("customer average purchases (multiples of item set to 1): %s" % (np.mean(stat_sum_normal),))
print("customer variance purchases (multiples of item set to 1): %s" % (np.var(stat_sum_normal),))
print("customer min purchases (multiples of item set to 1): %s" % (np.min(stat_sum_normal),))
print("customer max purchases (multiples of item set to 1): %s" % (np.max(stat_sum_normal),))
# Reuse the per-customer totals computed above instead of summing the whole
# matrix again for every threshold.
for stat_min_items in range(1, 6):
    stat_num_customers = np.count_nonzero(stat_sum_not_normal > stat_min_items - 1)
    print("customers that bought at least %d item: %s" % (stat_min_items, stat_num_customers))


customer x demographic shape: (189559, 1)
customer regions + unknown: 8
	region 0.0 % of pop 2.59708059232
	region 1.0 % of pop 23.4512737459
	region 2.0 % of pop 3.5403225381
	region 3.0 % of pop 17.3581839955
	region 4.0 % of pop 19.4113706023
	region 5.0 % of pop 9.34695793922
	region 6.0 % of pop 1.4955765751
	region 7.0 % of pop 22.7992340116

customer x purchases shape: (189559, 3990)
if already normalized the stats should be the same
not normalized
customer average purchases (multiples of item allowed): 1.44386180556
customer variance purchases (multiples of item allowed): 1.28894093873
customer min purchases (multiples of item allowed): 1.0
customer max purchases (multiples of item allowed): 150.0
normalized
customer average purchases (multiples of item set to 1): 1.44386180556
customer variance purchases (multiples of item set to 1): 1.28894093873
customer min purchases (multiples of item set to 1): 1
customer max purchases (multiples of item set to 1): 150
customers that bou

In [12]:
# generate collaborative (co-occurrence based) recommendations
def gen_recom_collab(purchase_list,cocm,num_rec):
    """Recommend the items that co-occur most with a customer's purchases.

    The co-occurrence rows of all purchased items are summed, the purchased
    items themselves are zeroed out, and the remaining items with a non-zero
    score are ranked from highest to lowest.

    Parameters
    ----------
    purchase_list : sequence of int
        Indices of the items the customer bought (row indices into cocm).
    cocm : 2-D numpy.ndarray
        Co-occurrence matrix [items x items].
    num_rec : int
        Maximum number of recommendations to return.

    Returns
    -------
    list
        At most num_rec item indices, best score first. Empty when num_rec
        is not positive, when there are no purchases, or when no item has a
        non-zero score.
    """
    # num_rec == 0 would otherwise slice [-0:], i.e. return every candidate.
    if num_rec <= 0 or len(purchase_list) == 0:
        return []

    rowsum = np.zeros(cocm.shape[0])
    for p in purchase_list:
        rowsum += cocm[p,:]

    # never recommend something the customer already owns
    rowsum[purchase_list] = 0
    indices = np.nonzero(rowsum)[0]
    ranked_ascending = indices[np.argsort(rowsum[indices])]
    toprec = ranked_ascending[-num_rec:][::-1]
    return list(toprec)

In [13]:
# generate content (rating based) recommendations
# purchase_list = indices of the items the customer already bought
# list_rating = content rating per item
# num_rec = number of recommendations
def gen_recom_content(purchase_list,list_rating,num_rec):
    """Sample num_rec items from the best-rated items not yet purchased.

    Works on a copy of list_rating: purchased items are zeroed, the (up to)
    40 highest non-zero ratings form the candidate pool, the pool is
    shuffled and cut to num_rec entries, and the survivors are returned
    ordered from highest to lowest rating.
    """
    pool_size = 40
    scores = np.copy(list_rating)
    scores[purchase_list] = 0

    candidates = np.nonzero(scores)[0]
    ascending = candidates[np.argsort(scores[candidates])]
    pool = ascending[-pool_size:][::-1]

    # random draw from the pool, then re-rank the draw by rating
    sampled = np.random.permutation(pool)[:num_rec]
    descending = np.argsort(scores[sampled])[::-1]
    return list(sampled[descending])

In [14]:
from sklearn.model_selection import KFold

In [34]:
# hold-out recall test for the hybrid (collaborative + content) recommender
def hybrid_recall_holdout_tester(mco_train,mco_test,num_rec_collab,num_rec_content,recall_remove,important_c,coc_train,coc_test,c_rating):
    """Measure hold-out recall of the hybrid recommender on a test set.

    For every test customer, recall_remove purchased items are hidden at
    random, recommendations are produced from the remaining purchases, and
    the fraction of hidden items found among the recommendations is
    recorded. The mean of those fractions is returned.

    The hybrid list takes the first num_rec_collab collaborative
    recommendations and is then topped up with content recommendations
    (skipping duplicates) until it holds num_rec_collab + num_rec_content
    items. Customers with exactly recall_remove purchases have nothing left
    to drive the collaborative step, so their list is content-only.

    Parameters
    ----------
    mco_train : 2-D numpy.ndarray
        Training customer-to-order matrix [customers x items].
    mco_test : 2-D numpy.ndarray
        Test customer-to-order matrix [customers x items].
    num_rec_collab : int
        Number of collaborative recommendations placed at the head of the list.
    num_rec_content : int
        Number of extra slots filled from the content recommender.
    recall_remove : int
        Number of purchases hidden per customer; must be at least 1.
    important_c : iterable
        Cluster ids that get their own co-occurrence matrix. Customers of any
        other cluster use the matrix built from all of mco_train, stored
        under key 0 (a cluster id of 0 in important_c would replace it).
    coc_train : numpy.ndarray
        Cluster ids of the training customers, row-aligned with mco_train.
    coc_test : numpy.ndarray
        Cluster ids of the test customers; coc_test[i][0] is read for row i.
    c_rating : numpy.ndarray
        Content rating array handed to gen_recom_content.

    Returns
    -------
    float
        Mean recall over the evaluated customers; nan if none were evaluated.

    Raises
    ------
    ValueError
        If recall_remove is smaller than 1.
    """
    if recall_remove < 1:
        # recall is matched / removed, so nothing removed means dividing by zero
        raise ValueError("recall_remove must be at least 1")

    num_rec_total = num_rec_collab + num_rec_content

    # build co-occurrence matrixes: key 0 is the general (all customers) one
    list_coo_matrix = {0: create_cocmatrix(mco_train)}
    for c in important_c:
        list_sub = list(np.where(coc_train == c)[0])
        list_coo_matrix[c] = create_cocmatrix(mco_train[list_sub])

    list_sub_acc = []
    # for each customer in test pool
    for i in range(mco_test.shape[0]):
        p_vec = mco_test[i,:]
        list_original_purchases = np.where(p_vec > 0)[0]

        # random.sample raises when asked for more items than exist, so
        # customers with too few purchases cannot be evaluated and are skipped
        if len(list_original_purchases) < recall_remove:
            continue

        # randomly select indexes to leave out
        list_removed_purchases = random.sample(list(list_original_purchases),recall_remove)
        list_modified_purchases = list(set(list_original_purchases) - set(list_removed_purchases))

        # content recommendation
        list_content_vec = gen_recom_content(list_modified_purchases,c_rating,num_rec_total)

        # collab recommendation needs at least one purchase left after the hold-out
        list_summed_coo_vec = []
        if len(list_original_purchases) > recall_remove:
            cust_cluster = coc_test[i][0]
            if cust_cluster in important_c:
                coo_matrix = list_coo_matrix[cust_cluster]
            else:
                coo_matrix = list_coo_matrix[0]
            list_summed_coo_vec = gen_recom_collab(list_modified_purchases,coo_matrix,num_rec_total)

        # Combiner: collaborative picks first, then unseen content picks
        list_hybrid_vec = list_summed_coo_vec[:num_rec_collab]
        for r in list_content_vec:
            if len(list_hybrid_vec) >= num_rec_total:
                break
            if r not in list_hybrid_vec:
                list_hybrid_vec.append(r)

        list_recommended_match = set(list_removed_purchases) & set(list_hybrid_vec)
        list_sub_acc.append(len(list_recommended_match)/float(len(list_removed_purchases)))

    if not list_sub_acc:
        return np.nan
    return np.mean(list_sub_acc)

In [45]:
# catalog coverage test for the hybrid (collaborative + content) recommender
def hybrid_catalog_holdout_tester(mco_train,mco_test,num_rec_collab,num_rec_content,recall_remove,important_c,coc_train,coc_test,c_rating):
    """Measure catalog coverage of the hybrid recommender on a test set.

    Recommendations are generated for every test customer from the full
    purchase history. Coverage is the number of distinct items recommended
    to at least one customer divided by the number of item columns in
    mco_test.

    The hybrid list takes the first num_rec_collab collaborative
    recommendations and is then topped up with content recommendations
    (skipping duplicates) until it holds num_rec_collab + num_rec_content
    items.

    Parameters
    ----------
    mco_train : 2-D numpy.ndarray
        Training customer-to-order matrix [customers x items].
    mco_test : 2-D numpy.ndarray
        Test customer-to-order matrix [customers x items].
    num_rec_collab : int
        Number of collaborative recommendations placed at the head of the list.
    num_rec_content : int
        Number of extra slots filled from the content recommender.
    recall_remove : int
        Not used by this function; present so the signature matches
        hybrid_recall_holdout_tester.
    important_c : iterable
        Cluster ids that get their own co-occurrence matrix. Customers of any
        other cluster use the matrix built from all of mco_train, stored
        under key 0 (a cluster id of 0 in important_c would replace it).
    coc_train : numpy.ndarray
        Cluster ids of the training customers, row-aligned with mco_train.
    coc_test : numpy.ndarray
        Cluster ids of the test customers; coc_test[i][0] is read for row i.
    c_rating : numpy.ndarray
        Content rating array handed to gen_recom_content.

    Returns
    -------
    numpy.float64
        Fraction of the catalog that was recommended at least once.
    """
    num_rec_total = num_rec_collab + num_rec_content

    # build co-occurrence matrixes: key 0 is the general (all customers) one
    list_coo_matrix = {0: create_cocmatrix(mco_train)}
    for c in important_c:
        list_sub = list(np.where(coc_train == c)[0])
        list_coo_matrix[c] = create_cocmatrix(mco_train[list_sub])

    set_products = set()
    # for each customer in test pool
    for i in range(mco_test.shape[0]):
        p_vec = mco_test[i,:]
        list_original_purchases = np.where(p_vec > 0)[0]

        cust_cluster = coc_test[i][0]
        if cust_cluster in important_c:
            coo_matrix = list_coo_matrix[cust_cluster]
        else:
            coo_matrix = list_coo_matrix[0]
        list_summed_coo_vec = gen_recom_collab(list_original_purchases,coo_matrix,num_rec_total)

        # content recommendation
        list_content_vec = gen_recom_content(list_original_purchases,c_rating,num_rec_total)

        # Combiner: collaborative picks first, then unseen content picks
        list_hybrid_vec = list_summed_coo_vec[:num_rec_collab]
        for r in list_content_vec:
            if len(list_hybrid_vec) >= num_rec_total:
                break
            if r not in list_hybrid_vec:
                list_hybrid_vec.append(r)

        set_products.update(list_hybrid_vec)

    # Catalog size is the number of item columns. It is read from the matrix
    # rather than from the last loop variable so that an empty test set does
    # not fail with a NameError.
    coverage = float(len(set_products))/mco_test.shape[1]
    return np.float64(coverage)

In [17]:
# All distinct cluster ids found in column 0 of the customer lookup.
# Original note: "0 is unknown and all" - presumably id 0 marks unknown
# customers and is also the key of the general co-occurrence matrix used by
# the testers; TODO confirm.
unique_clusters = np.unique(arr_cxd[:,0])
# Clusters that are passed to the hold-out testers as important_c, i.e. that
# get their own co-occurrence matrix. In the saved statistics output these
# four are the largest groups (roughly 17-23% of customers each).
list_useful_clusters = [1,3,4,7]



In [18]:
# 80/20 split of the customers: 80% training (original note: "for 10-fold
# cross validation"), 20% held out as the test set
create_seed(0)
indices = np.random.permutation(matrix_co.shape[0])
tsize = int(matrix_co.shape[0]*.8)

# customer to order matrix: first tsize shuffled rows train, the rest test
matrix_co_training_idx = indices[:tsize]
matrix_co_testing_idx = indices[tsize:]
matrix_co_training = matrix_co[matrix_co_training_idx,:]
matrix_co_testing = matrix_co[matrix_co_testing_idx,:]

# demo matrix: split with the same permutation so rows stay aligned with
# the order matrix
arr_cxd_training_idx = indices[:tsize]
arr_cxd_testing_idx = indices[tsize:]
arr_cxd_training = arr_cxd[arr_cxd_training_idx,:]
arr_cxd_testing = arr_cxd[arr_cxd_testing_idx,:]

In [19]:
# Sanity check: the training subsets should have matching row counts and the
# same column counts as the full arrays they were cut from.
matrix_co_training.shape, arr_cxd_training.shape, matrix_co.shape, arr_cxd.shape

((151647, 3990), (151647, 1), (189559, 3990), (189559, 1))

In [49]:
# General test
seed = 0
create_seed(seed)

# test set: 9 collaborative slots + 1 content slot per customer,
# 1 purchase held out per customer
accuracy = hybrid_recall_holdout_tester(
    matrix_co_training, matrix_co_testing,
    num_rec_collab=9, num_rec_content=1, recall_remove=1,
    important_c=list_useful_clusters,
    coc_train=arr_cxd_training, coc_test=arr_cxd_testing,
    c_rating=arr_conrate)

coverage = hybrid_catalog_holdout_tester(
    matrix_co_training, matrix_co_testing,
    num_rec_collab=9, num_rec_content=1, recall_remove=1,
    important_c=list_useful_clusters,
    coc_train=arr_cxd_training, coc_test=arr_cxd_testing,
    c_rating=arr_conrate)



# small test
#randidx_train = np.random.choice(matrix_co_training.shape[0], 100000, replace=False)
#randidx_test = np.random.choice(matrix_co_testing.shape[0], 10000, replace=False)
#accuracy = hybrid_recall_holdout_tester(matrix_co_training[randidx_train],matrix_co_testing[randidx_test],\
#                                        9,1,1,list_useful_clusters,\
#                                        arr_cxd_training[randidx_train],arr_cxd_testing[randidx_test],arr_conrate)

#coverage = hybrid_catalog_holdout_tester(matrix_co_training[randidx_train],matrix_co_testing[randidx_test],\
#                                        9,1,1,list_useful_clusters,\
#                                        arr_cxd_training[randidx_train],arr_cxd_testing[randidx_test],arr_conrate)


created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
created rows in cooccurrence matrix at row 0
created row

In [50]:
# mean hold-out recall returned by hybrid_recall_holdout_tester
accuracy

0.20949999999999999

In [51]:
# catalog coverage returned by hybrid_catalog_holdout_tester
coverage

0.3275689223057644