In [81]:
import numpy as np
import scipy.sparse
import math



def BinSketch(X, k):
    ''''Documentation of BinSketch:
        Parameters:
           X: is a sparse array (numpy array or can be scipy sparse matrix) of shape (n, d),
              where n is number of samples and d is feature dimension
           k: reduced dimension value (must be postive integer)
            
        Returns:
          scipy sparse csr_matrix of shape (n, k)
    '''

    d = X.shape[1]
    random_map = np.array([], int)
    M = np.arange(k)
    for i in range(d//k):
        np.random.shuffle(M)
        random_map = np.concatenate((random_map, M))
    np.random.shuffle(M)
    random_map = np.concatenate((random_map, M[0:d%k]))
    np.random.shuffle(random_map)

    new_X = np.zeros((X.shape[0], k))
    for i in range(k):
        index = np.where(random_map == i)[0]
        new_X[:,i] = np.max(X[:,index].A, axis = 1)
        
    return scipy.sparse.csr_matrix(new_X)


def Hamming_distance(a,b):
    'Funtion to calculate hamming distance between array a and b'
    ham = 0
    for i in range(a.shape[1]):
        if a[:,i] != b[:,i]:
            ham += 1
    return ham



def BinSketch_Similarity_Estimate(a, b):
    ''''Documentation:
        Parameters:
          a: scipy sparse scr array
          b: sipy sparse scr  array 

        Returns:
          integer values
    '''
    N = a.shape[1]
    spar_a = a.nnz   #nnz is scipy sparse matrix method to find number of nonzero entries
    spar_b = b.nnz
    
    if spar_a/N < 1:
        spar_est_a = round(math.log(1-(spar_a/N))/math.log(1- 1/N))
    else:
        spar_est_a = spar_a
        
    if spar_b/N < 1:
        spar_est_b = round(math.log(1-(spar_b/N))/math.log(1- 1/N))
    else:
        spar_est_b = spar_b
        
    IP = int(a.dot(b.T)[0,0])
    val = (1 - 1/N)**(spar_est_a) + (1 - 1/N)**(spar_est_b) + (IP/N) - 1
    if  val > 0:
        Bin_IP_Est  = round(spar_est_a + spar_est_b - (math.log(val) / math.log(1 - 1/N)))
        if Bin_IP_Est < 0:
            Bin_IP_Est = 0
        Bin_Ham_Est = spar_est_a + spar_est_b - 2*Bin_IP_Est
        Bin_JS_Est  = Bin_IP_Est/(Bin_Ham_Est + Bin_IP_Est)
        Bin_CS_Est  = Bin_IP_Est/(spar_est_a * spar_est_b)**(0.5)
    else:
        spar_est_a = spar_a
        spar_est_b = spar_b
        Bin_IP_Est = IP
        Bin_Ham_Est = spar_a + spar_b - 2*IP
        Bin_JS_Est  = Bin_IP_Est/(Bin_Ham_Est + Bin_IP_Est)
        Bin_CS_Est  = Bin_IP_Est/(spar_a * spar_b)**(0.5)
    return spar_est_a, spar_est_b, Bin_IP_Est, Bin_Ham_Est, Bin_JS_Est, Bin_CS_Est
   

In [82]:
#Example
X = scipy.sparse.load_npz('Sample.npz')
print('Shape of actual matrix:', X.shape)


new_X = BinSketch(X, 1000)
print('Shape of compressed sktech matrix:', new_X.shape)

Shape of actual matrix: (100, 102660)
Shape of compressed sktech matrix: (100, 1000)


In [83]:
'''Hamming estimate'''
a = X[0,:]
b = X[1,:]

spar_a = a.nnz  #nnz is scipy sparse  array method to find number of non-zero entries
spar_b = b.nnz

#Finding Inner produt(IP), Hamming distance, Cosine similiarity, Jaccard Similarity between a and b
Actual_IP      = a.dot(b.T)[0,0]
Actual_Ham     = spar_a + spar_b - 2*Actual_IP
Actual_Jaccard = Actual_IP/(Actual_Ham + Actual_IP)
Actual_Cosine  = Actual_IP/(spar_a + spar_b)*(0.5)

print('Sparsity of a is:', spar_a)
print('Sparsity of b is:', spar_b)
print('Inner product of a and b is :', Actual_IP)
print('Hamming distance between a and b is :', Actual_Ham)
print('Jaccard similarity between a and b is :', Actual_Jaccard)
print('Cosine similarity between a and b is:', Actual_Cosine)

print('\nEstimation of similarity measures for a and b using BinSketch\n')
# a_new and b_new corresponds to compressed sketch of a and b
a_new = new_X[0,:]
b_new = new_X[1,:]

spar_est_a, spar_est_b, IP_Est, Ham_Est, JS_Est, CS_Est = BinSketch_Similarity_Estimate(a_new, b_new)

print('Sparsity estimate of a:', spar_est_a)
print('Sparsity estimate of a:', spar_est_b)
print('IP estimate of a and b using BinSketch is :', IP_Est)
print('Hamming estimate of a and b  using BinSketch is :', Ham_Est)
print('Jaccard similarity estimate of a and b  using BinSketch is :', JS_Est)
print('Cosine similarity estimate of a and b  using BinSketch is :', CS_Est)

Sparsity of a is: 128
Sparsity of b is: 108
Inner product of a and b is : 6
Hamming distance between a and b is : 224
Jaccard similarity between a and b is : 0.02608695652173913
Cosine similarity between a and b is: 0.012711864406779662

Estimation of similarity measures for a and b using BinSketch

Sparsity estimate of a: 128
Sparsity estimate of a: 108
IP estimate of a and b using BinSketch is : 7
Hamming estimate of a and b  using BinSketch is : 222
Jaccard similarity estimate of a and b  using BinSketch is : 0.03056768558951965
Cosine similarity estimate of a and b  using BinSketch is : 0.05953620902598002
