In [None]:
# This notebook was created to do validation on collaboration filtering models
# It relies on a data extract in certain formats to operate, although there is a mock data generator
#
# There are 2 main functions:
#
# ** collab_recall_validation_tester()
# Input: customer purchase data, testing prarameters
# K-fold will be used for validation, using the (k-1)/k of the customer purcahse data to build a co-occurrence
# matrix and validating with 1/k of the data. The 1/k of the data will have X number of purchases removed
# for each customer and try to recommend the missing purchase using the co-occurrence matrix. The limitation
# is that any users with less than X purchases will not be used to calculate the accurracy.
# Output: Recall Accurracy
# **
#
# ** collab_catalog_validation_tester()
# Input: customer purchase data, testing prarameters
# K-fold will be used for validation, using the (k-1)/k of the customer purcahse data to build a co-occurrence
# matrix and validating with 1/k of the data. The 1/k of the data will keep all the original customer purchases
# and use it to recommend products using the co-occurrence matrix. Taking all the recommended products and 
# dividing it by the total amount of products offered will give us catalog coverage.
# Output: Catalog Coverage
# **
#
# Testing with categories is not implemented yet

In [1]:
import numpy as np
import random
import time
from sklearn.model_selection import KFold
import json
import sys
from datetime import date, datetime

In [2]:
# Create co-occurrence matrix

def create_cocmatrix(subset_matrix_cust_order):
    rows, cols = subset_matrix_cust_order.shape
    m = np.zeros((cols,cols))
    
    for i in range(cols):
        t = np.sum(subset_matrix_cust_order[subset_matrix_cust_order[:,i] > 0],axis=0)
        t[i] = 0
        m[i,:] = t
        
        if i % (cols/5) == 0:
            print "created rows in cooccurrence matrix at row",i
            
    return m

In [3]:
def create_seed(rand):
    np.random.seed(rand) # create seed for repeatable results

In [4]:
# initialize all data for collaborative
# customer to demographic lookup
arr_cxd = np.load('demo_matrix.npy')

print "created customer to demographic lookup"

# customer to order lookup
matrix_co = np.load('cust_item_matrix.npy')

print "created customer to order lookup"

regions = []

with open("regions.txt", 'r') as regioninputs:
    for r in regioninputs:
        r = r.strip()
        try:
            rname, rid = r.split('\t')
        except:
            continue
            
        regions.append(float(rid))
        
regions = list(set(regions))

created customer to demographic lookup
created customer to order lookup


In [5]:
# print statistics
print "\ncustomer x demographic shape:",arr_cxd.shape
print "customer regions + unknown:",len(np.unique(arr_cxd[:,0]))

for num, name in enumerate(np.unique(arr_cxd[:,0])):
    print "\tregion",name,"% of pop",len(np.where(arr_cxd[:,0] == name)[0])/float(arr_cxd.shape[0])*100
    
print "customer genders + unknown:",len(np.unique(arr_cxd[:,1]))

for num, name in enumerate(np.unique(arr_cxd[:,1])):
    print "\tgender",name,"% of pop",len(np.where(arr_cxd[:,1] == name)[0])/float(arr_cxd.shape[0])*100
    
print "\ncustomer x purchases shape:",matrix_co.shape # customer and purchases
print "if already normalized the stats should be the same"
print "not normalized"

stat_sum_not_normal = np.sum(matrix_co,axis=1)

print "customer average purchases (multiples of item allowed):",np.mean(stat_sum_not_normal)
print "customer variance purchases (multiples of item allowed):",np.var(stat_sum_not_normal)
print "customer min purchases (multiples of item allowed):",np.min(stat_sum_not_normal)
print "customer max purchases (multiples of item allowed):",np.max(stat_sum_not_normal)
print "normalized"

stat_sum_normal = np.count_nonzero(matrix_co,axis=1)

print "customer average purchases (multiples of item set to 1):",np.mean(stat_sum_normal)
print "customer variance purchases (multiples of item set to 1):",np.var(stat_sum_normal)
print "customer min purchases (multiples of item set to 1):",np.min(stat_sum_normal)
print "customer max purchases (multiples of item set to 1):",np.max(stat_sum_normal)
print "customers that bought at least 1 item:",len(np.where(np.sum(matrix_co,axis=1) > 0)[0])
print "customers that bought at least 2 item:",len(np.where(np.sum(matrix_co,axis=1) > 1)[0])
print "customers that bought at least 3 item:",len(np.where(np.sum(matrix_co,axis=1) > 2)[0])
print "customers that bought at least 4 item:",len(np.where(np.sum(matrix_co,axis=1) > 3)[0])
print "customers that bought at least 5 item:",len(np.where(np.sum(matrix_co,axis=1) > 4)[0])


customer x demographic shape: (189559, 2)
customer regions + unknown: 22
	region 0.0 % of pop 0.193079727156
	region 1.0 % of pop 10.3403162076
	region 2.0 % of pop 42.9655146946
	region 3.0 % of pop 8.69280804393
	region 4.0 % of pop 2.49262762517
	region 5.0 % of pop 14.3607003624
	region 6.0 % of pop 1.45653859748
	region 7.0 % of pop 4.01194351099
	region 8.0 % of pop 3.41318534071
	region 9.0 % of pop 11.9308500256
	region 10.0 % of pop 0.0949572428637
	region 11.0 % of pop 0.00474786214318
	region 12.0 % of pop 0.0290147130972
	region 13.0 % of pop 0.00896818404824
	region 15.0 % of pop 0.000527540238132
	region 18.0 % of pop 0.00105508047626
	region 20.0 % of pop 0.000527540238132
	region 21.0 % of pop 0.000527540238132
	region 22.0 % of pop 0.000527540238132
	region 28.0 % of pop 0.000527540238132
	region 30.0 % of pop 0.000527540238132
	region 35.0 % of pop 0.000527540238132
customer genders + unknown: 3
	gender 0.0 % of pop 8.54826201869
	gender 1.0 % of pop 50.8976097152
	g

In [6]:
# generate recommendations
# purchase_vec = customer purchases (vector)
# cocm = co occurrence matrix [items x items]
# num_rec = number of recommendations
def gen_recom(purchase_list,cocm,num_rec):
    rowsum = np.zeros(cocm.shape[0])
    
    for p in purchase_list:
        rowsum += cocm[p,:]
        
    rowsum[purchase_list] = 0
    indices = np.nonzero(rowsum)[0]
    toprec = indices[np.argsort(rowsum[indices])][-1 * num_rec:][::-1]
    return toprec

In [7]:
def get_indices(dataset, col, var=None):
    if var is None:
        return [y for y, x in enumerate(dataset[:,col])] # all
    
    return [y for y, x in enumerate(dataset[:,col]) if x == var]

In [8]:
def check_idx_size(idxlist,maximum_rows):
    if len(idxlist) > maximum_rows:
        return np.random.choice(idxlist, maximum_rows, replace=False)
    
    return idxlist

In [9]:
#80/20, 80 for 10-fold cross validation, 20 for test set holdout
create_seed(0)
indices = np.random.permutation(matrix_co.shape[0])
tsize = int(matrix_co.shape[0]*.8)

# customer to order matrix
matrix_co_training_idx, matrix_co_testing_idx  = indices[:tsize], indices[tsize:]
matrix_co_training, matrix_co_testing = matrix_co[matrix_co_training_idx,:], matrix_co[matrix_co_testing_idx,:]

# demo matrix
arr_cxd_training_idx, arr_cxd_testing_idx = indices[:tsize], indices[tsize:]
arr_cxd_training, arr_cxd_testing = arr_cxd[arr_cxd_training_idx,:], arr_cxd[arr_cxd_testing_idx,:]

In [10]:
matrix_co_training.shape, arr_cxd_training.shape, matrix_co.shape, arr_cxd.shape

((151647, 3990), (151647, 2), (189559, 3990), (189559, 2))

In [11]:
# put matrixes in to get cross validation recall
# generate recommendations
# mco = matrix of customer to order [customer x items]
# num_rec = number of recommendations
# num_folds = k number of folds for cross validation
# recall_remove = removed number from purchase history
def collab_recall_validation_tester(mco,num_rec,num_folds,recall_remove):
    # cross validation n-folds
    start_time = time.time()
    kf = KFold(n_splits=num_folds)
    list_total_acc=[]
    k_index = 0
    
    for train, test in kf.split(mco):
        print "------------------------------------start new k=",k_index,\
        ",train set=",mco[train].shape[0],\
        ",validation set=",mco[test].shape[0]
        list_sub_acc = []
        # build co-occurrence matrix
        coo_matrix = create_cocmatrix(mco[train])

        # loop through test set
        # for each customer in test pool
        for i in range(mco[test].shape[0]):
            
            #print "customer",i
            # purchase vector
            #print "purchase vector"
            p_vec = mco[test][i,:]
            #print p_vec
            # determine purchase indexes
            #print "purchase indexes"
            list_original_purchases = np.where(p_vec > 0)[0]
            #print list_original_purchases
            
            # only run tests on customers with > recall_remove purchases for prediction
            if len(list_original_purchases) > recall_remove:
                # randomly select indexes to leave out
                #print "remove purchase indexes"
                #list_removed_purchases = np.random.choice(list_original_purchases,size=recall_remove, replace=False)
                list_removed_purchases = random.sample(list_original_purchases,recall_remove)
                #print list_removed_purchases
                # remove
                list_modified_purchases = list(set(list_original_purchases) - set(list_removed_purchases))
                #print "modified purchase vector"
                #p_vec_mod = np.zeros(p_vec.shape[0])
                #p_vec_mod[list_modified_purchases] = 1
                #print p_vec_mod
                # get sum all purchases except ones left out
                list_summed_coo_vec = gen_recom(list_modified_purchases,coo_matrix,num_rec)
                #print list_summed_coo_vec
                # check if return recommendations are in list
                list_recommended_match = set(list_removed_purchases) & set(list_summed_coo_vec)
                acc = len(list_recommended_match)/float(len(list_removed_purchases))
                #print "predicted ratio",accuracy
                list_sub_acc.append(acc)
                #print "end customer",i

                #if i % 500 == 0:
                # sanity check to make sure not all recommendations are #1
                #if (list_removed_purchases[0] != list_summed_coo_vec[0]) & (acc == 1):
                #    print "\t--customer example index",i
                #    print "\toriginal purchased indexes",list_original_purchases
                #    print "\tremoved index",list_removed_purchases
                #    print "\toriginal modified purchase indexes",np.where(p_vec_mod > 0)[0]
                #    print "\tpredicted indexes",list_summed_coo_vec     
                #    print "\tpredicted accuracy ratio",acc
                #    print "\t--customer end example index",i
            #else:
                #print "skipped due to length of purchases",list_original_purchases,\
                #"less than recall remove",recall_remove
        mean_sub_acc = np.mean(list_sub_acc)
        print "** number of elements in calculation",len(list_sub_acc)
        #print list_sub_acc
        print "** average fold accuracy",mean_sub_acc
        list_total_acc.append(mean_sub_acc)
        k_index += 1
        print "**time elapsed",(time.time() - start_time)
        print "------------------------------------end"
    print "list of total accuracy for each fold",list_total_acc
    average = np.mean(list_total_acc)
    print num_folds,"-fold total average",average
    print "time elapsed",(time.time() - start_time)
    return average

def demo_filter(mco,mdo,regions,maximum_rows,minimum_rows,num_rec,num_folds,recall_remove):
    list_info = []
    
    for r in regions:
        if r == 0.0:
            ridx = get_indices(mdo, 0) # use all regions
        else:   
            ridx = get_indices(mdo, 0, r) # look for region r in col 0

            if len(ridx) == 0:
                print "region: {0} not found".format(r)
                continue

        for g in [1.0,2.0,0.0]: #gender_list:
            if g == 0.0:
                gidx = get_indices(mdo, 1) # all, col 1 is gender
            else:
                gidx = get_indices(mdo, 1, g)

                if len(gidx) == 0:
                    print "gender: {0} not found".format(g)
                    continue

            interidx = list(set(ridx).intersection(set(gidx)))

            if len(interidx) < minimum_rows:
                print "too little data with region {0} and gender {1} found".format(r, g)
                continue

            interidx = check_idx_size(interidx,maximum_rows) # limit size
            print "\nregion: {0}, gender: {1}, intersect: {2}, {3}".format(r, g, len(interidx), len(mco))
                             
            total_accuracy = collab_recall_validation_tester(mco[interidx],num_rec,num_folds,recall_remove)
            list_info.append((r,g,mco[interidx].shape[0],total_accuracy))
    
    return list_info

In [12]:
# General test
seed = 0
create_seed(seed)
maximum_rows = 25000
minimum_rows = 5000

# test set
randidx = np.random.choice(matrix_co_training.shape[0], 10000, replace=False)
demo_results = demo_filter(matrix_co_training[randidx],arr_cxd_training[randidx],\
                           regions,maximum_rows,minimum_rows,10,10,1)

# big set
demo_results = demo_filter(matrix_co_training,arr_cxd_training,\
                           regions,maximum_rows,minimum_rows,10,10,1)


region: 0.0, gender: 1.0, intersect: 5113, 10000
------------------------------------start new k= 0 ,train set= 4601 ,validation set= 512
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 139
** average fold accuracy 0.532374100719
**time elapsed 3.56005215645
------------------------------------end
------------------------------------start new k= 1 ,train set= 4601 ,validation set= 512
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 130
** average fold accuracy 0.484615384615
**time elapsed 7.41186499596
------------------------------------end


created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 256
** average fold accuracy 0.55078125
**time elapsed 90.6736779213
------------------------------------end
------------------------------------start new k= 7 ,train set= 9000 ,validation set= 1000
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 265
** average fold accuracy 0.6
**time elapsed 103.982406855
------------------------------------end
------------------------------------start new k= 8 ,train set= 9000 ,validation set= 1000
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in

created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 693
** average fold accuracy 0.620490620491
**time elapsed 666.912633896
------------------------------------end
------------------------------------start new k= 8 ,train set= 22500 ,validation set= 2500
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 671
** average fold accuracy 0.618479880775
**time elapsed 759.730753899
------------------------------------end
------------------------------------start new k= 9 ,train set= 22500 ,validation set= 2500
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 239

created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 657
** average fold accuracy 0.62404870624
**time elapsed 366.152621984
------------------------------------end
------------------------------------start new k= 4 ,train set= 22500 ,validation set= 2500
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 678
** average fold accuracy 0.629793510324
**time elapsed 458.179766178
------------------------------------end
------------------------------------start new k= 5 ,train set= 22500 ,validation set= 2500
created rows in cooccurrence matrix at row 0
cr

** number of elements in calculation 203
** average fold accuracy 0.591133004926
**time elapsed 95.1950309277
------------------------------------end
list of total accuracy for each fold [0.57281553398058249, 0.5665236051502146, 0.5535714285714286, 0.5847457627118644, 0.6339285714285714, 0.61352657004830913, 0.59113300492610843, 0.6280193236714976, 0.5213675213675214, 0.59113300492610843]
10 -fold total average 0.585676432678
time elapsed 95.1958520412

region: 1.0, gender: 2.0, intersect: 6579, 151647
------------------------------------start new k= 0 ,train set= 5921 ,validation set= 658
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 176
** average fold accuracy 0.573863636364
**time elapsed 7.00156807899
------------------------------------end
-----------------

created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 424
** average fold accuracy 0.662735849057
**time elapsed 242.067199945
------------------------------------end
------------------------------------start new k= 6 ,train set= 14182 ,validation set= 1576
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 421
** average fold accuracy 0.612826603325
**time elapsed 282.199629068
------------------------------------end
------------------------------------start new k= 7 ,train set= 14182 ,validation set= 1576
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 239

created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 644
** average fold accuracy 0.611801242236
**time elapsed 154.67070508
------------------------------------end
------------------------------------start new k= 2 ,train set= 22500 ,validation set= 2500
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 637
** average fold accuracy 0.610675039246
**time elapsed 234.207154036
------------------------------------end
------------------------------------start new k= 3 ,train set= 22500 ,validation set= 2500
created rows in cooccurrence matrix at row 0
cr

** number of elements in calculation 653
** average fold accuracy 0.621745788668
**time elapsed 618.644371986
------------------------------------end
------------------------------------start new k= 8 ,train set= 22500 ,validation set= 2500
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 671
** average fold accuracy 0.639344262295
**time elapsed 688.584228039
------------------------------------end
------------------------------------start new k= 9 ,train set= 22500 ,validation set= 2500
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 626
** av

created rows in cooccurrence matrix at row 3192
** number of elements in calculation 130
** average fold accuracy 0.484615384615
**time elapsed 14.5678880215
------------------------------------end
------------------------------------start new k= 4 ,train set= 4609 ,validation set= 512
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 133
** average fold accuracy 0.578947368421
**time elapsed 18.1580569744
------------------------------------end
------------------------------------start new k= 5 ,train set= 4609 ,validation set= 512
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
**

too little data with region 4.0 and gender 1.0 found
too little data with region 4.0 and gender 2.0 found
too little data with region 4.0 and gender 0.0 found

region: 5.0, gender: 1.0, intersect: 11463, 151647
------------------------------------start new k= 0 ,train set= 10316 ,validation set= 1147
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 310
** average fold accuracy 0.587096774194
**time elapsed 17.042386055
------------------------------------end
------------------------------------start new k= 1 ,train set= 10316 ,validation set= 1147
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matri

created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 230
** average fold accuracy 0.517391304348
**time elapsed 69.7559978962
------------------------------------end
------------------------------------start new k= 7 ,train set= 7551 ,validation set= 839
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 223
** average fold accuracy 0.529147982063
**time elapsed 80.2187929153
------------------------------------end
------------------------------------start new k= 8 ,train set= 7551 ,validation set= 839
created rows in cooccurrence matrix at row 0
creat

created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 167
** average fold accuracy 0.562874251497
**time elapsed 9.55113601685
------------------------------------end
------------------------------------start new k= 2 ,train set= 5503 ,validation set= 612
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 180
** average fold accuracy 0.538888888889
**time elapsed 14.3318638802
------------------------------------end
------------------------------------start new k= 3 ,train set= 5503 ,validation set= 612
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
cr

created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 146
** average fold accuracy 0.527397260274
**time elapsed 36.3366160393
------------------------------------end
------------------------------------start new k= 9 ,train set= 4663 ,validation set= 518
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 129
** average fold accuracy 0.542635658915
**time elapsed 40.3027749062
------------------------------------end
list of total accuracy for each fold [0.58450704225352113, 0.56081081081081086, 0.53333333333333333, 0.55118110236220474, 0.527027027027026

created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 213
** average fold accuracy 0.511737089202
**time elapsed 35.2926609516
------------------------------------end
------------------------------------start new k= 5 ,train set= 6400 ,validation set= 711
created rows in cooccurrence matrix at row 0
created rows in cooccurrence matrix at row 798
created rows in cooccurrence matrix at row 1596
created rows in cooccurrence matrix at row 2394
created rows in cooccurrence matrix at row 3192
** number of elements in calculation 212
** average fold accuracy 0.514150943396
**time elapsed 42.5687558651
------------------------------------end
------------------------------------start new k= 6 ,train set= 6400 ,validation set= 711
created rows in cooccurrence matrix at row 0
creat

too little data with region 11.0 and gender 1.0 found
too little data with region 11.0 and gender 2.0 found
too little data with region 11.0 and gender 0.0 found
too little data with region 12.0 and gender 1.0 found
too little data with region 12.0 and gender 2.0 found
too little data with region 12.0 and gender 0.0 found
too little data with region 13.0 and gender 1.0 found
too little data with region 13.0 and gender 2.0 found
too little data with region 13.0 and gender 0.0 found
region: 14.0 not found
too little data with region 15.0 and gender 1.0 found
too little data with region 15.0 and gender 2.0 found
too little data with region 15.0 and gender 0.0 found
region: 16.0 not found
region: 17.0 not found
too little data with region 18.0 and gender 1.0 found
too little data with region 18.0 and gender 2.0 found
too little data with region 18.0 and gender 0.0 found
region: 19.0 not found
too little data with region 20.0 and gender 1.0 found
too little data with region 20.0 and gender 

In [13]:
demo_results

[(0.0, 1.0, 25000, 0.61674962672589118),
 (0.0, 2.0, 25000, 0.58887166897047094),
 (0.0, 0.0, 25000, 0.6029222995850132),
 (1.0, 1.0, 7924, 0.58567643267822067),
 (1.0, 2.0, 6579, 0.53312319778180806),
 (1.0, 0.0, 15758, 0.59859156067508024),
 (2.0, 1.0, 25000, 0.65228295008667403),
 (2.0, 2.0, 25000, 0.61803797500265034),
 (2.0, 0.0, 25000, 0.6354467041963705),
 (3.0, 1.0, 6837, 0.54412036108839845),
 (3.0, 2.0, 5121, 0.51562635388704203),
 (3.0, 0.0, 13182, 0.57804080512589651),
 (5.0, 1.0, 11463, 0.57150386603229086),
 (5.0, 2.0, 8390, 0.53139917276999749),
 (5.0, 0.0, 21856, 0.58809641163132587),
 (7.0, 0.0, 6115, 0.531402564058575),
 (8.0, 0.0, 5181, 0.53822911489271441),
 (9.0, 1.0, 9336, 0.53932598428930434),
 (9.0, 2.0, 7111, 0.51475332668662965),
 (9.0, 0.0, 18033, 0.55716884900478492)]

In [14]:
np.save('demo_results_big.npy',[(maximum_rows,minimum_rows,seed,datetime.now().strftime("%Y-%m-%d_%H:%M:%S"))]+demo_results)