In [1]:
#Import API
from dora.api import DataExplorer

import pandas as pd
from datetime import date, timedelta, datetime
from matplotlib import pyplot
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from uszipcode import ZipcodeSearchEngine
import numpy as np

import random
import time
from sklearn.model_selection import KFold
import json
import sys

explorer = DataExplorer()

In [2]:
def plot3d(X,x,y,z,points,cluster_col=12):
    """Draw a 3-D scatter plot of clustered rows, coloured by cluster label.

    X           -- object exposing ``results``, an indexable sequence of rows
    x, y, z     -- positions within each row of the values plotted on each axis
    points      -- number of leading rows to plot (capped at the rows available)
    cluster_col -- position within each row of the cluster label; defaults to 12,
                   the position previously hard-coded here

    Returns the matplotlib figure holding the plot.
    """
    fig = pyplot.figure()
    # Axes3D(fig) does not attach the axes to the figure on recent matplotlib
    # releases, which leaves the returned figure blank; add_subplot always does.
    ax = fig.add_subplot(111, projection='3d')
    colors = ['green','red','blue','black','salmon','indigo','plum','seagreen','grey','yellow']
    # Never index past the end of the result set when fewer rows are available.
    n_points = min(points, len(X.results))
    for i in range(n_points):
        row = X.results[i]
        # Wrap around the palette so more than len(colors) clusters cannot raise IndexError.
        color = colors[int(row[cluster_col]) % len(colors)]
        ax.scatter(row[x], row[y], row[z], c=color)
    return fig

In [3]:
def custPerCluster(df):
    """Plot a histogram of how many rows of ``df`` fall in each cluster.

    df -- DataFrame with a 'cluster' column of non-negative integer labels.

    Returns the matplotlib figure holding the histogram.
    """
    fig=plt.figure()
    maxn=int(df['cluster'].max())
    # One bin per integer label, centred on that label. The default of 10
    # equal-width bins puts bars off the ticks and merges neighbouring labels
    # as soon as there are more than 10 clusters.
    bin_edges=np.arange(maxn+2)-0.5
    plt.hist(df['cluster'].values, bins=bin_edges)
    plt.xlabel('Clusters')
    plt.ylabel('# of Customers')
    plt.xticks(range(0,maxn+1))
    plt.title('Number of Customers per Cluster')
    return fig

In [4]:
def clusterDist(df):
    """Summarise each cluster of ``df`` as one row of statistics.

    df -- DataFrame with the columns 'cluster', 'numorders', 'totalspent',
          'householdid', 'zipcode', 'totalmales', 'totalfemales', 'totalpop'
          and 'medianage'.

    Returns a float DataFrame indexed by cluster label 0..max(cluster). A label
    in that range with no rows gets NaN for the averages/median and 0 for the
    distinct counts. An empty ``df`` yields an empty frame with the same columns.
    """
    # NOTE: 'numZipcoes' is a long-standing misspelling of 'numZipcodes'; it is
    # kept because the column name is part of what callers receive.
    stat_columns=['avgNumOrders','avgTotalSpent','numHouseholds','numZipcoes',
                  'avgTotalPopMales','avgTotalPopFemales','avgTotalPop','medianAge']
    if len(df) == 0:
        return pd.DataFrame(columns=stat_columns, dtype=float)

    maxn=int(df['cluster'].max())
    # Collect one record per cluster and build the frame once, instead of
    # enlarging a DataFrame one cell at a time through .loc.
    records=[]
    for i in range(maxn+1):
        t=df.loc[df['cluster']==i]
        records.append({
            'avgNumOrders': t['numorders'].mean(),
            'avgTotalSpent': t['totalspent'].mean(),
            'numHouseholds': t['householdid'].nunique(),
            'numZipcoes': t['zipcode'].nunique(),
            'avgTotalPopMales': t['totalmales'].mean(),
            'avgTotalPopFemales': t['totalfemales'].mean(),
            'avgTotalPop': t['totalpop'].mean(),
            'medianAge': t['medianage'].median(),
        })
    clusterStats=pd.DataFrame(records, index=range(maxn+1), columns=stat_columns)
    # The cell-by-cell construction used previously produced float columns throughout.
    return clusterStats.astype(float)

In [5]:
def clusterZips(df):
    """Collect the states and cities covered by each cluster's zip codes.

    df -- DataFrame with 'zipcode' and 'cluster' columns.

    Returns ``(zipstates, zipcities)``: two dicts keyed by cluster label
    0..max(cluster), each mapping to the set of 'State' / 'City' values that
    the zip-code search engine reports for the cluster's distinct zip codes.
    """
    maxn=int(df['cluster'].max())
    search = ZipcodeSearchEngine()
    zipstates={}
    zipcities={}
    for i in range(maxn+1):
        states=set()
        cities=set()
        zipcodes=df.loc[df['cluster']==i,'zipcode'].unique()
        for zipcode in zipcodes:
            # Zip codes stored as integers lose their leading zeros
            # (e.g. 2134 for 02134); pad back to five digits before the lookup.
            city=search.by_zipcode(str(zipcode).zfill(5))
            cities.add(city['City'])
            states.add(city['State'])
        zipstates[i]=states
        zipcities[i]=cities
    return zipstates, zipcities

statsByCustomer Query

In [6]:
statsByCust=explorer.customers.statsByCustomer()
statsByCustomer=pd.DataFrame(statsByCust.results, columns=statsByCust.columns)
statsByCustomer.head()

Unnamed: 0,customermatchedid,numorders,gender,zipcode,totalpop,medianage,totalmales,totalfemales,totalspent,householdid,firstname,numcustomerid
0,37917,412,0,10036,22413,38.8,12687,9726,606.25,19885296,MIKE,412
1,37907,189,1,10036,22413,38.8,12687,9726,651.59,19885296,HILDA,189
2,140299,99,0,10036,22413,38.8,12687,9726,0.0,49927024,MIKE,99
3,37901,99,2,10036,22413,38.8,12687,9726,2226.32,19885296,,99
4,140298,70,0,10036,22413,38.8,12687,9726,0.0,49927024,JIM,70


Use the API to get 10 clusters of customers. Bad zipcodes have been removed from the dataset. Customers with no gender assigned are included in the dataset of customers to be clustered. Neither a feature_set nor the features to cluster on are specified by the user here. In this case, the feature_set defaults to the data from statsByCustomer and the features default to [numOrders, gender, totalpop, totalspent]. 

In [7]:
#Get 10 clusters of customers (no feature_set or cluster_on is passed in this call)
cCluster=explorer.customers.clusterCustomers(n_clusters=10)

In [8]:
#Lists the columns of the cCluster
cCluster.columns

array(['numorders', 'gender', 'totalspent', 'zipcode', 'customermatchedid',
       'totalpop', 'medianage', 'totalmales', 'totalfemales',
       'householdid', 'firstname', 'numcustomerid', 'cluster',
       'customerids'], dtype=object)

In [9]:
#3-D Plot of the customer clusters (gender, totalpop, totalspent)
#plot=plot3d(cCluster, 1,5, 2, 1000)
#plot

In [10]:
#Histogram of the number of customers in each cluster
#df=pd.DataFrame(cCluster.results, columns=cCluster.columns)
#dist=custPerCluster(df)
#dist

In [11]:
#Stats by cluster
#stats=clusterDist(df)
#stats

In [12]:
#Get the states and cities that are clustered together
#zipdata=clusterZips(df)
#zipstates=zipdata[0]
#zipcities=zipdata[1]

Cluster the customers, but first remove certain householdids. Specify the features that will be used in the clustering. 

In [13]:
#Eliminate householdids from the data that will be clustered
response=explorer.customers.statsByCustomer(householdid=['19885296','49927024'])

In [14]:
#Show results from the query
#response.results

In [15]:
#Cluster the results by customer
cCluster_rmHID=explorer.customers.clusterCustomers(feature_set=response, n_clusters=7, 
                                                   cluster_on=['gender','totalmales','totalfemales','totalpop'])

In [16]:
#Get cluster stats
df=pd.DataFrame(cCluster_rmHID.results, columns=cCluster_rmHID.columns )
stats=clusterDist(df)
stats

Unnamed: 0,avgNumOrders,avgTotalSpent,numHouseholds,numZipcoes,avgTotalPopMales,avgTotalPopFemales,avgTotalPop,medianAge
0,1.190962,82.721845,35682.0,2079.0,13651.024379,14547.523659,28198.548038,40.15
1,1.212001,65.252829,3010.0,66.0,42834.013848,48141.761292,90975.77514,37.7
2,1.180092,91.478685,28322.0,4899.0,3540.93551,3658.76702,7199.702529,43.2
3,1.194058,79.746778,14369.0,628.0,24411.302771,26117.666896,50528.969668,36.3
4,1.190426,81.02133,25212.0,1287.0,18880.354152,20286.82016,39167.174312,39.0
5,1.188336,84.460504,36553.0,2548.0,8807.290947,9304.880179,18112.171126,41.3
6,1.200572,83.46654,10049.0,285.0,30493.876381,33517.923244,64011.799625,38.3


In [17]:
cCluster_rmHID.columns

array(['gender', 'totalmales', 'totalfemales', 'totalpop',
       'customermatchedid', 'numorders', 'zipcode', 'medianage',
       'totalspent', 'householdid', 'firstname', 'numcustomerid',
       'cluster', 'customerids'], dtype=object)

In [18]:
#3-D Plot of the customer clusters (gender, totalmales , totalfemales)
#plot=plot3d(cCluster_rmHID, 0,1, 2, 1000)
#plot

In [19]:
df.head()

Unnamed: 0,gender,totalmales,totalfemales,totalpop,customermatchedid,numorders,zipcode,medianage,totalspent,householdid,firstname,numcustomerid,cluster,customerids
0,2,26770,28773,55543,72872,28,11354,42.1,1218.96,22269801,PO-WEN,28,3,"(164055, 164066, 164065, 164080, 164076, 16406..."
1,1,0,0,0,108804,26,10168,,0.0,36201520,NANCY,26,2,"(113229, 113228, 113247, 113246, 113253, 11325..."
2,0,12687,9726,22413,28252,24,10036,38.8,814.56,19440306,PETER,24,5,"(4118, 4131, 4130, 4129, 4117, 4138, 4137, 413..."
3,2,21660,23317,44977,116601,21,11590,36.7,3312.75,36209331,,21,3,"(119154, 119155, 119157, 119160, 119156, 11916..."
4,0,15295,18367,33662,32345,17,44122,46.4,3757.26,19626230,JEROME,17,4,"(78833, 78839, 78837, 78838, 78836, 78844, 788..."


In [20]:
# Create co-occurrence matrix

def create_cocmatrix(subset_matrix_cust_order):
    """Build an item-by-item co-occurrence matrix from a customer-by-item matrix.

    Row ``k`` of the result is the column-wise sum over every input row whose
    entry for item ``k`` is positive, with position ``k`` itself set to zero.
    """
    _, n_items = subset_matrix_cust_order.shape
    cooc = np.zeros((n_items, n_items))

    for item in range(n_items):
        # Rows (customers) that have a positive entry for this item.
        has_item = subset_matrix_cust_order[:, item] > 0
        totals = subset_matrix_cust_order[has_item].sum(axis=0)
        # An item is not counted as co-occurring with itself.
        totals[item] = 0
        cooc[item, :] = totals

    return cooc

In [21]:
def create_seed(rand):
    """Seed every random generator this notebook draws from, for repeatable results.

    rand -- seed value passed to both generators.
    """
    np.random.seed(rand) # drives np.random.permutation / np.random.choice
    # The recall tester picks held-out purchases with the stdlib ``random``
    # module, so it has to be seeded too or runs are not repeatable.
    random.seed(rand)

In [22]:
def gen_recom(purchase_list,cocm,num_rec):
    """Generate item recommendations from a purchase history.

    purchase_list -- indices of the items already purchased
    cocm          -- co-occurrence matrix [items x items]
    num_rec       -- maximum number of recommendations to return

    Returns an array of at most ``num_rec`` item indices, ordered from the
    highest to the lowest summed co-occurrence with the purchased items.
    Purchased items and items with a zero score are never recommended.
    """
    purchased = np.asarray(purchase_list, dtype=int)
    rowsum = np.zeros(cocm.shape[0])

    for p in purchased:
        rowsum += cocm[p,:]

    # Never recommend something the customer already bought.
    rowsum[purchased] = 0
    indices = np.nonzero(rowsum)[0]
    if num_rec <= 0:
        # A slice of [-0:] would select every candidate instead of none.
        return indices[:0]
    ranked = indices[np.argsort(rowsum[indices])]
    return ranked[-num_rec:][::-1]

In [23]:
def get_indices(dataset, col, var=None):
    """Return the row positions of ``dataset`` whose column ``col`` equals ``var``.

    With ``var`` left as None, every row position is returned.
    """
    column = dataset[:,col]
    if var is not None:
        # positions where the column matches the requested value
        return np.flatnonzero(column == var).tolist()

    return list(range(len(column))) # all

In [24]:
def check_idx_size(idxlist,maximum_rows):
    """Cap ``idxlist`` at ``maximum_rows`` entries.

    A list that already fits is returned unchanged; a longer one is replaced
    by ``maximum_rows`` entries drawn from it without replacement using the
    global numpy random state.
    """
    if len(idxlist) <= maximum_rows:
        return idxlist

    return np.random.choice(idxlist, maximum_rows, replace=False)

In [25]:
# customer to order lookup
matrix_co = np.load('cust_item_matrix.npy')
print ("created customer to order lookup")

matrix_co.shape

created customer to order lookup


(189559, 3990)

In [26]:
custlist = np.load('custids.npy')
print ("created customer id lookup")

custlist.shape

created customer id lookup


(189559,)

In [27]:
np.where(custlist==164055)

(array([164054]),)

In [28]:
np.where(custlist==113229)

(array([113228]),)

In [29]:
unique_clusters = sorted(df['cluster'].unique())
print (unique_clusters)

[0, 1, 2, 3, 4, 5, 6]


In [30]:
# bring in customerids to map with cluster ids
arr_cxd = np.zeros((custlist.shape[0],1))

In [31]:
arr_cxd.shape

(189559, 1)

In [32]:
# mapping cluster ids
# Index the positions of every customer id once, so each lookup below is a
# dictionary access instead of a full np.where scan over custlist.
positions_by_custid = {}
for pos, custid in enumerate(custlist.tolist()):
    positions_by_custid.setdefault(custid, []).append(pos)

# Walk the rows directly rather than through df.loc[i, ...], which only works
# when df carries a default 0..n-1 index.
for cid, customer_ids in zip(df['cluster'], df['customerids']):
    for c in customer_ids:
        # increment by 1 so we can reserve 0 for customers with no cluster id
        arr_cxd[positions_by_custid.get(c, []), 0] = cid+1
    

In [33]:
maxn=df['cluster'].max()
print (maxn)

6


In [34]:
# arr_cxd stores cluster id + 1 (0 = customer with no cluster id), so its labels
# run from 0 to maxn+1; extend the list by one so the shifted top cluster is included.
# NOTE: re-running this cell appends maxn+1 again and duplicates the last label.
unique_clusters = unique_clusters + [maxn+1]
print (unique_clusters)

[0, 1, 2, 3, 4, 5, 6, 7]


In [35]:
arr_cxd[113228]

array([ 3.])

In [36]:
#80/20, 80 for 10-fold cross validation, 20 for test set holdout
# Seed first so the shuffled row order below is the same on every run.
create_seed(0)
# Random ordering of all customer row positions.
indices = np.random.permutation(matrix_co.shape[0])
# Number of rows that go into the 80% training portion.
tsize = int(matrix_co.shape[0]*.8)

In [37]:
# customer to order matrix
matrix_co_training_idx, matrix_co_testing_idx  = indices[:tsize], indices[tsize:]
matrix_co_training, matrix_co_testing = matrix_co[matrix_co_training_idx,:], matrix_co[matrix_co_testing_idx,:]

In [38]:
# demo matrix (cluster label per customer)
# Split with the same shuffled indices as matrix_co so that row k of the
# training/testing label arrays describes row k of the matching purchase matrix.
arr_cxd_training_idx, arr_cxd_testing_idx = indices[:tsize], indices[tsize:]
arr_cxd_training, arr_cxd_testing = arr_cxd[arr_cxd_training_idx,:], arr_cxd[arr_cxd_testing_idx,:]

In [39]:
matrix_co_training.shape, arr_cxd_training.shape, matrix_co.shape, arr_cxd.shape

((151647, 3990), (151647, 1), (189559, 3990), (189559, 1))

In [52]:
def collab_recall_validation_tester(mco,num_rec,num_folds,recall_remove):
    """Cross-validate the co-occurrence recommender with a leave-some-out recall test.

    mco           -- matrix of customer to order [customer x items]
    num_rec       -- number of recommendations generated per customer
    num_folds     -- k, the number of folds for cross validation
    recall_remove -- number of purchases hidden from each customer's history

    For every fold a co-occurrence matrix is built from the training rows. Each
    validation customer with more than ``recall_remove`` purchases has
    ``recall_remove`` of them hidden at random (via the stdlib ``random``
    module); the score is the fraction of hidden items that reappear in the
    recommendations. Progress is printed per fold.

    Returns the mean of the per-fold mean scores (NaN if any fold had no
    customer with enough purchases).
    """
    # cross validation n-folds
    start_time = time.time()
    kf = KFold(n_splits=num_folds)
    list_total_acc=[]
    k_index = 0

    for train, test in kf.split(mco):
        # Materialise each fold once: fancy indexing copies the selected rows,
        # so doing it inside the customer loop copied the whole fold per customer.
        train_rows = mco[train]
        test_rows = mco[test]
        print("start new k={0},train set={1},validation set={2}".format(k_index,train_rows.shape[0],test_rows.shape[0]))

        list_sub_acc = []
        # build co-occurrence matrix
        coo_matrix = create_cocmatrix(train_rows)

        # for each customer in the validation fold
        for i in range(test_rows.shape[0]):

            p_vec = test_rows[i,:]
            list_original_purchases = np.where(p_vec > 0)[0]
            # only run tests on customers with > recall_remove purchases for prediction
            if len(list_original_purchases) > recall_remove:
                # randomly select indexes to leave out
                list_removed_purchases = random.sample(list(list_original_purchases),recall_remove)
                # remove
                list_modified_purchases = list(set(list_original_purchases) - set(list_removed_purchases))
                # recommend from all purchases except the ones left out
                list_summed_coo_vec = gen_recom(list_modified_purchases,coo_matrix,num_rec)
                # check if the left-out items come back as recommendations
                list_recommended_match = set(list_removed_purchases) & set(list_summed_coo_vec)
                acc = len(list_recommended_match)/float(len(list_removed_purchases))
                list_sub_acc.append(acc)

        # A fold with no eligible customer has no score; report NaN explicitly
        # instead of taking the mean of an empty list.
        mean_sub_acc = np.mean(list_sub_acc) if list_sub_acc else float('nan')
        print ("** number of elements in calculation {0}".format(len(list_sub_acc)))
        print ("** average fold accuracy {0}".format(mean_sub_acc))
        list_total_acc.append(mean_sub_acc)
        k_index += 1
        print ("**time elapsed {0}".format((time.time() - start_time)))
        print ("------------------------------------end")

    print ("list of total accuracy for each fold {0}".format(list_total_acc))
    average = np.mean(list_total_acc)
    print ("{0}-fold total average {1}".format(num_folds,average))
    print ("time elapsed {0}".format((time.time() - start_time)))
    return average

In [56]:
def collab_catalog_validation_tester(mco,num_rec,num_folds,recall_remove):
    """Cross-validate the co-occurrence recommender for catalog coverage.

    mco           -- matrix of customer to order [customer x items]
    num_rec       -- number of recommendations generated per customer
    num_folds     -- k, the number of folds for cross validation
    recall_remove -- not used by this function; present so the signature
                     matches collab_recall_validation_tester

    For every fold a co-occurrence matrix is built from the training rows and
    recommendations are generated for each validation customer from their full
    purchase history. Fold coverage is the number of distinct items recommended
    divided by the number of items (columns of ``mco``). Progress is printed
    per fold.

    Returns the mean of the per-fold coverage values.
    """
    # cross validation n-folds
    start_time = time.time()
    kf = KFold(n_splits=num_folds)
    list_total_coverage=[]
    k_index = 0
    # Catalog size; taken from the matrix itself rather than from the last
    # purchase vector seen in the loop, which is undefined for an empty fold.
    num_items = mco.shape[1]

    for train, test in kf.split(mco):
        set_products = set()
        # Materialise each fold once: fancy indexing copies the selected rows,
        # so doing it inside the customer loop copied the whole fold per customer.
        train_rows = mco[train]
        test_rows = mco[test]
        # build co-occurrence matrix
        print("start new k={0},train set={1},validation set={2}".format(k_index,train_rows.shape[0],test_rows.shape[0]))
        coo_matrix = create_cocmatrix(train_rows)

        # for each customer in the validation fold
        for i in range(test_rows.shape[0]):
            # purchase vector
            p_vec = test_rows[i,:]
            list_original_purchases = np.where(p_vec > 0)[0]
            # recommend from all purchases
            list_summed_coo_vec = gen_recom(list_original_purchases,coo_matrix,num_rec)
            set_products.update(list_summed_coo_vec)

        mean_sub_cov = float(len(set_products))/num_items
        print ("**length fold set {0}".format(float(len(set_products))))
        print ("**average fold coverage {0}".format(mean_sub_cov))
        list_total_coverage.append(mean_sub_cov)
        k_index += 1
        print ("**time elapsed {0}".format((time.time() - start_time)))
        print ("------------------------------------end")

    print ("list of total coverage for each fold {0}".format(list_total_coverage))
    average = np.mean(list_total_coverage)
    print ("{0}-fold total average {1}".format(num_folds,average))
    print ("time elapsed {0}".format((time.time() - start_time)))
    return average

In [57]:
def demo_filter(mco,mdo,clusters,maximum_rows,minimum_rows,num_rec,num_folds,recall_remove):
    """Run the recall and coverage cross-validations separately for each cluster.

    mco           -- matrix of customer to order [customer x items]
    mdo           -- 2-D array whose column 0 holds the cluster label of the
                     matching row of ``mco``
    clusters      -- iterable of cluster labels to evaluate
    maximum_rows  -- clusters with more rows are randomly down-sampled to this size
    minimum_rows  -- clusters with fewer rows are skipped
    num_rec, num_folds, recall_remove -- passed through to both validation testers

    Returns ``(accuracy, coverage)``: two lists of
    ``(cluster label, rows evaluated, score)`` tuples, one entry per cluster
    that was not skipped.
    """
    accuracy = []
    coverage = []

    for c in clusters:
        cidx = get_indices(mdo, 0, c) # look for cluster in col 0

        if len(cidx) == 0:
            print ("cluster: {0} not found".format(c))
            continue

        if len(cidx) < minimum_rows:
            print ("too little data with cluster {0} found".format(c))
            continue

        cidx = check_idx_size(cidx,maximum_rows) # limit size
        print ("\ncluster: {0}, size: {1}, {2}".format(c, len(cidx), len(mco)))

        # Select the cluster's rows once; every mco[cidx] makes a fresh copy of
        # up to maximum_rows full rows.
        cluster_rows = mco[cidx]
        total_accuracy = collab_recall_validation_tester(cluster_rows,num_rec,num_folds,recall_remove)
        accuracy.append((c,cluster_rows.shape[0],total_accuracy))
        total_coverage = collab_catalog_validation_tester(cluster_rows,num_rec,num_folds,recall_remove)
        coverage.append((c,cluster_rows.shape[0],total_coverage))

    return accuracy, coverage

In [None]:
# General test
seed = 0
create_seed(seed)
maximum_rows = 25000
minimum_rows = 1000

# test set
#randidx = np.random.choice(matrix_co_training.shape[0], 10000, replace=False)
#demo_results = demo_filter(matrix_co_training[randidx],arr_cxd_training[randidx],\
#                           regions,maximum_rows,minimum_rows,10,10,1)

# big set
demo_accuracy, demo_coverage = demo_filter(matrix_co_training,arr_cxd_training,\
                                   unique_clusters,maximum_rows,minimum_rows,10,10,1)


cluster: 0, size: 3942, 151647
start new k=0,train set=3547,validation set=395
** number of elements in calculation 120
** average fold accuracy 0.5416666666666666
**time elapsed 2.3083529472351074
------------------------------------end
start new k=1,train set=3547,validation set=395
** number of elements in calculation 118
** average fold accuracy 0.4491525423728814
**time elapsed 4.687896966934204
------------------------------------end
start new k=2,train set=3548,validation set=394
** number of elements in calculation 112
** average fold accuracy 0.48214285714285715
**time elapsed 7.1698689460754395
------------------------------------end
start new k=3,train set=3548,validation set=394
** number of elements in calculation 124
** average fold accuracy 0.47580645161290325
**time elapsed 9.436200141906738
------------------------------------end
start new k=4,train set=3548,validation set=394
** number of elements in calculation 113
** average fold accuracy 0.48672566371681414
**time

In [None]:
demo_accuracy

In [None]:
# The first entry is a 4-field run header (maximum_rows, minimum_rows, seed, timestamp)
# and the remaining entries are 3-field result tuples, so the list is ragged.
# Recent NumPy refuses to build an array from ragged input unless dtype=object is
# given explicitly. Read the file back with np.load(..., allow_pickle=True).
np.save('demo_clustering_cluster_demo_accuracy.npy',
        np.array([(maximum_rows,minimum_rows,seed,datetime.now().strftime("%Y-%m-%d_%H:%M:%S"))]+demo_accuracy,
                 dtype=object))

In [None]:
demo_coverage

In [None]:
# Coverage gets its own file; writing it to the accuracy filename overwrote the
# accuracy results saved earlier in the notebook.
# The first entry is a 4-field run header (maximum_rows, minimum_rows, seed, timestamp)
# and the remaining entries are 3-field result tuples, so the list is ragged and needs
# an explicit object array. Read the file back with np.load(..., allow_pickle=True).
np.save('demo_clustering_cluster_demo_coverage.npy',
        np.array([(maximum_rows,minimum_rows,seed,datetime.now().strftime("%Y-%m-%d_%H:%M:%S"))]+demo_coverage,
                 dtype=object))