In [8]:
import numpy as np
import scipy.sparse
import math
from scipy.sparse import  find

def BinEm(X):
    '''Categorical-to-binary conversion.

    Every non-zero entry of X is independently replaced by 1 with
    probability 0.5 and dropped (set to 0) otherwise; zero entries stay zero.

        Parameters:
           X  is a scipy sparse matrix (anything accepted by
              scipy.sparse.find that also exposes .shape) of shape (n, d)

        Returns:
           scipy sparse csr_matrix of shape (n, d) and dtype uint32 whose
           entries are 0 or 1

    The draws come from NumPy's global legacy random state, so call
    np.random.seed(...) beforehand for a reproducible embedding.

    NOTE(review): the bit is drawn per entry, not per (column, value) pair, so
    equal category values in the same column of two rows can receive different
    bits -- confirm this is the intended embedding.
    '''
    rows, cols, _ = find(X)
    # One uniform draw per non-zero entry, taken in the order find() reports
    # them, so the global random stream is consumed one draw per entry.
    keep = np.random.rand(len(rows)) < 0.5
    ones = np.ones(int(np.count_nonzero(keep)), dtype=np.uint32)
    # Build the result directly in sparse form; no dense (n, d) buffer needed.
    return scipy.sparse.csr_matrix(
        (ones, (rows[keep], cols[keep])), shape=X.shape, dtype=np.uint32
    )

def BCS(X, k):
    '''Sketch generation using BCS: takes a binary matrix X of size nxd and
    generates its sketch matrix of size nxk, where typically k << d.

    Each of the d columns is assigned to one of k buckets by a random,
    near-balanced map (every bucket receives d//k or d//k + 1 columns).
    Sketch entry (r, j) is the sum of row r over the columns in bucket j,
    taken modulo 2.

        Parameters:
           X  is a 2-D NumPy array or a scipy sparse matrix of shape (n, d),
              where n is the number of samples and d is feature dimension
           k is the reduced dimension value (must be a positive integer)

        Returns:
          Sketch matrix of X -- scipy sparse csr_matrix of shape (n, k) with
          float64 entries

        Raises:
          ValueError if k is not a positive integer

    The column-to-bucket map is drawn from NumPy's global legacy random state;
    call np.random.seed(...) beforehand for a reproducible sketch.
    '''
    if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k <= 0:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    # Normalise the input once so dense arrays and every sparse format take
    # the same code path (dense arrays have no `.A` attribute, and that
    # shortcut is no longer available on sparse containers in newer SciPy).
    X = scipy.sparse.csr_matrix(X)
    d = X.shape[1]

    # Build the column -> bucket map: d//k full permutations of the buckets,
    # plus the first d%k entries of one more permutation, then shuffle so the
    # assignment does not depend on column position.
    buckets = np.arange(k)
    pieces = []
    for _ in range(d // k):
        np.random.shuffle(buckets)
        pieces.append(buckets.copy())  # shuffle works in place; keep a snapshot
    np.random.shuffle(buckets)
    pieces.append(buckets[:d % k].copy())
    random_map = np.concatenate(pieces)
    np.random.shuffle(random_map)

    # d x k 0/1 projection with exactly one 1 per row: column j of X is added
    # into bucket random_map[j]. A single sparse product replaces the
    # per-bucket column scan.
    projection = scipy.sparse.csr_matrix(
        (np.ones(d, dtype=np.int64), (np.arange(d), random_map)), shape=(d, k)
    )
    sketch = scipy.sparse.csr_matrix(X @ projection)

    # Reduce the bucket sums modulo 2 on the stored entries only, then drop
    # the entries that became zero.
    sketch.data = sketch.data % 2
    sketch.eliminate_zeros()
    sketch.sort_indices()
    return sketch.astype(np.float64)

def BCS_Ham(a,b):
    '''Count the positions at which two BCS sketches differ.

    The count is used as the estimate of the Hamming distance between the
    full-dimensional vectors that the two sketches were generated from.

        Parameters:
            a, b  low-dimensional (sketch) binary vectors produced by BCS,
                  given as scipy sparse matrices of identical shape
        Returns:
            The number of stored entries (nnz) of a - b, i.e. how many sketch
            coordinates differ between a and b

    '''
    sketch_difference = a - b
    return sketch_difference.nnz

def Hamming_distance(a,b):
    '''Function to calculate the Hamming distance between arrays a and b.

        Parameters:
            a, b  row vectors of identical shape (1, d), given either as
                  scipy sparse matrices or as NumPy arrays
        Returns:
            The number of positions at which a and b hold different values,
            as a Python int. For inputs with more than one row the count runs
            over all entries.
        Raises:
            ValueError if a and b do not have the same shape
    '''
    if scipy.sparse.issparse(a) or scipy.sparse.issparse(b):
        a = scipy.sparse.csr_matrix(a)
        b = scipy.sparse.csr_matrix(b)
        if a.shape != b.shape:
            raise ValueError("a and b must have the same shape, got %r and %r"
                             % (a.shape, b.shape))
        # A single sparse comparison stores exactly the differing positions,
        # replacing one 1x1 slice-and-compare per column.
        return int((a != b).nnz)

    a = np.asarray(a)
    b = np.asarray(b)
    if a.shape != b.shape:
        raise ValueError("a and b must have the same shape, got %r and %r"
                         % (a.shape, b.shape))
    return int(np.count_nonzero(a != b))


In [9]:
# Example: load the sample matrix, binarise it, then sketch it with BCS
X = scipy.sparse.load_npz('Sample.npz')
print('Shape of actual matrix:', X.shape)

X_Bin = BinEm(X)  # categorical to binary conversion
# Report the shape of the embedding computed on the line above (X_Bin);
# BinEm keeps the shape of its input and only changes the values.
print('Shape of binary embeding matrix:', X_Bin.shape)

# 1000 is the sketch dimension k passed to BCS.
new_X = BCS(X_Bin, 1000)
print('Shape of BCS Sketch: ', new_X.shape)

Shape of actual matrix: (100, 102660)
Shape of binary embeding matrix: (100, 102660)
Shape of BCS Sketch:  (100, 1000)


In [10]:
# Hamming estimate: compare the exact distance with the sketch-based estimate
a = X[0,:]
b = X[1,:]
# Hamming distance between a and b, taken from the actual (uncompressed) matrix X
print('Hamming distance between a and b is :', Hamming_distance(a,b))

a_new = new_X[0,:]
b_new = new_X[1,:]
# a_new and b_new are the compressed BCS sketches corresponding to a and b.
# NOTE(review): the factor 2 presumably compensates for BinEm keeping each
# non-zero entry as 1 with probability 0.5 -- confirm against the method.
print('Hamming estimate of a and b  using BCS sketch is :', 2*BCS_Ham(a_new, b_new))

Hamming distance between a and b is : 225
Hamming estimate of a and b  using BCS sketch is : 212
