## Create some data

In [49]:
import numpy as np
import time as t
from sklearn import random_projection
n = 10000   # number of rows of X
d = 100000  # number of columns of X

# Seed NumPy's global RNG so the benchmark matrix (and every later draw that
# relies on the global np.random state) is the same on each Restart & Run All.
np.random.seed(0)

# Dense matrix of uniform [0, 1) floats with n * d = 1e9 entries; at 8 bytes
# per float64 that is roughly 8 GB of RAM, so shrink n or d on small machines.
X = np.random.rand(n, d)

## Random Gaussian Transform: approx O(nd)

In [57]:
# Row subsets of X: first 500 rows, first 1000 rows, all rows.
Xn1 = X[:500]
Xn2 = X[:1000]
Xn3 = X

# Column subsets of X: first 500 columns, first 1000 columns, all columns.
Xd1 = X[:, :500]
Xd2 = X[:, :1000]
Xd3 = X

#create random projection transformer
transformer = random_projection.GaussianRandomProjection(eps=0.99)


def time_projection(transformer, data):
    """Time one ``fit_transform`` of ``data.T`` and return the result.

    The input is transposed before the call and the output is transposed
    back, so the returned array has the same number of columns as ``data``
    and a reduced number of rows.

    Parameters
    ----------
    transformer : object with a ``fit_transform`` method
        The random projection to benchmark.
    data : 2-D array
        Matrix whose row dimension is to be reduced.

    Returns
    -------
    (elapsed, projected) : tuple
        Wall-clock seconds spent inside ``fit_transform`` (including both
        transposes) and the projected matrix.
    """
    start = t.time()
    projected = transformer.fit_transform(data.T).T
    end = t.time()
    return end - start, projected


# n = 500, n = 1000, n = full
(t1, X_new1), (t2, X_new2), (t3, X_new3) = (
    time_projection(transformer, subset) for subset in (Xn1, Xn2, Xn3)
)
print(t1, X_new1.shape, '\n', t2, X_new2.shape, '\n', t3, X_new3.shape)

# p = 500, p = 1000, p = full
(t1, X_new1), (t2, X_new2), (t3, X_new3) = (
    time_projection(transformer, subset) for subset in (Xd1, Xd2, Xd3)
)
print(t1, X_new1.shape, '\n', t2, X_new2.shape, '\n', t3, X_new3.shape)


0.2568662166595459 (276, 100000) 
 0.40973591804504395 (276, 100000) 
 3.140739679336548 (276, 100000)
0.1384449005126953 (149, 500) 
 0.22084927558898926 (165, 1000) 
 3.160515069961548 (276, 100000)


## Random Sparse Transform: O(nd)

In [56]:
# Row-wise slices of X: 500 rows, 1000 rows, everything.
Xn1, Xn2, Xn3 = X[:500], X[:1000], X

# Column-wise slices of X: 500 columns, 1000 columns, everything.
Xd1, Xd2, Xd3 = X[:, :500], X[:, :1000], X

# Sparse random projection with the same eps as the Gaussian benchmark.
transformer = random_projection.SparseRandomProjection(eps=0.99)

# n = 500, n = 1000, n = full
elapsed, projected = [], []
for X_sub in (Xn1, Xn2, Xn3):
    # Transpose in, transpose out: the row dimension of X_sub is the one reduced.
    start = t.time()
    X_proj = transformer.fit_transform(X_sub.T).T
    end = t.time()
    elapsed.append(end - start)
    projected.append(X_proj)
t1, t2, t3 = elapsed
X_new1, X_new2, X_new3 = projected
print(t1, X_new1.shape, '\n', t2, X_new2.shape, '\n', t3, X_new3.shape)

# p = 500, p = 1000, p = full
elapsed, projected = [], []
for X_sub in (Xd1, Xd2, Xd3):
    start = t.time()
    X_proj = transformer.fit_transform(X_sub.T).T
    end = t.time()
    elapsed.append(end - start)
    projected.append(X_proj)
t1, t2, t3 = elapsed
X_new1, X_new2, X_new3 = projected
print(t1, X_new1.shape, '\n', t2, X_new2.shape, '\n', t3, X_new3.shape)

0.505361795425415 (276, 100000) 
 0.6847708225250244 (276, 100000) 
 2.434791088104248 (276, 100000)
0.07642889022827148 (149, 500) 
 0.1138160228729248 (165, 1000) 
 2.395220994949341 (276, 100000)


## Implement FJLT (Fast Johnson-Lindenstrauss Transform) via fast Hadamard transform: slow??

In [10]:
import numpy as np
import numpy.random as npr
import math
import scipy.sparse as sparse
import FWHT



def nextPow(d_act):
    """Return the smallest power of two that is greater than or equal to ``d_act``.

    Works for integers of any size. The previous fixed shift ladder
    (1, 2, 4, 8, 16) only propagated 32 bits and returned wrong values for
    inputs above 2**32.

    Parameters
    ----------
    d_act : int
        Value to round up. Any object supporting ``__index__`` (e.g. a NumPy
        integer) is accepted; floats raise ``TypeError``.

    Returns
    -------
    int
        ``d_act`` itself if it is already a power of two, otherwise the next
        larger power of two. Non-positive input returns 0.
    """
    import operator  # local import so the block does not depend on other cells

    d_act = operator.index(d_act)
    if d_act <= 0:
        # No power of two is <= 0; keep the historical return value of 0.
        return 0
    # (d_act - 1).bit_length() is the number of bits needed to represent
    # d_act - 1, so shifting 1 by that amount is the first power of two
    # that is not below d_act.
    return 1 << (d_act - 1).bit_length()

def fast_sample(n, sample_size):
    """Draw ``sample_size`` distinct integers from ``range(n)``.

    Performs the first ``sample_size`` steps of a Fisher-Yates shuffle of
    ``0 .. n-1`` without materialising the full array: only positions whose
    content differs from their own index are kept in a dictionary.

    Parameters
    ----------
    n : int
        Size of the population to sample from.
    sample_size : int
        Number of distinct values to draw.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``sample_size`` holding the drawn values.
    """
    displaced = {}  # position -> value currently stored there (if not itself)
    picks = np.empty(sample_size, dtype=int)
    for pos in range(sample_size):
        target = npr.randint(pos, n)

        # Position `pos` is never visited again, so its entry can be removed
        # as soon as its current value has been read.
        held = displaced.pop(pos, pos)

        if target == pos:
            # Swapping a position with itself: the held value is the pick.
            picks[pos] = held
        else:
            # Take whatever sits at `target` and leave the held value there.
            picks[pos] = displaced.get(target, target)
            displaced[target] = held
    return picks

def fjlt(A, k, q):
    """Apply a fast Johnson-Lindenstrauss style transform to the rows of ``A``.

    The transform is the product ``P . H . D`` applied to ``A``:
    ``D`` flips the sign of each input row at random (and applies a scale
    factor), ``H`` is the Hadamard transform supplied by the external
    ``FWHT`` module, and ``P`` is a sparse ``k``-row matrix with Gaussian
    entries.

    Parameters
    ----------
    A : 2-D array of shape (d, n)
        Input matrix; the first axis (length ``d``) is the one being reduced.
    k : int
        Number of rows of the projection matrix ``P``.
    q : float
        Sparsity parameter of ``P``: the number of non-zero entries is drawn
        from Binomial(k * d, q), and their values from N(0, 1/q).

    Returns
    -------
    The product ``P.dot(hda)``; presumably a dense (k, n) array, which
    depends on the shape returned by ``FWHT.FWHT`` -- TODO confirm.
    """
    n_rows, n_cols = A.shape

    # The Hadamard transform needs a power-of-two length, so pad up to it.
    padded_rows = nextPow(n_rows)
    scale = np.sqrt(padded_rows / float(n_rows * k))

    # D: one random value per input row, either +scale or -scale.
    coin = npr.randint(0, 2, size=(n_rows, 1))
    D = coin * 2 * scale - scale

    # Zero-padded copy of D * A with padded_rows rows.
    padded = np.zeros((padded_rows, n_cols))
    padded[:n_rows, :] = A * D

    # Hadamard transform of each column of `padded` (each row of its transpose).
    # NOTE(review): the [:, :, 0] index implies FWHT.FWHT returns more than the
    # transformed vector and that component 0 is the one wanted -- confirm
    # against the FWHT module.
    transformed = np.apply_along_axis(FWHT.FWHT, 1, padded.T)
    hda = transformed[:, :, 0].T

    # P: choose the non-zero cells of a k x n_rows grid without replacement,
    # then fill them with Gaussian values. Columns at index n_rows and beyond
    # (the padding region) are never selected.
    n_cells = k * n_rows
    n_nonzero = npr.binomial(n_cells, q)
    flat_positions = fast_sample(n_cells, n_nonzero)
    rows, cols = np.unravel_index(flat_positions, (k, n_rows))
    values = npr.normal(loc=0, scale=math.sqrt(1 / q), size=len(rows))
    P = sparse.csr_matrix((values, (rows, cols)), shape=(k, padded_rows))

    return P.dot(hda)

In [45]:
# Time one FJLT projection of X with k = 100 output rows and q = 1.
start = t.time()
b = fjlt(X, k=100, q=1)
end = t.time()
elapsed = end - start
print(elapsed)
b.shape

28.226892709732056


(100, 10000)

## Random Sampling: sooo much faster

In [44]:
from numpy.random import choice

sample_size = 442  # number of rows of X to draw in each experiment

#unweighted
start = t.time()
# Passing the integer n samples from arange(n) directly, without first
# converting a Python range object into an array.
draw = choice(n, sample_size, replace=False)
X[draw, :]  # the row gather is part of the timed work; the copy is discarded
end = t.time()
print(end - start)

#weighted by row norm
norms = np.linalg.norm(X, axis=1) #only do this once
# ndarray.sum() stays in NumPy; the built-in sum() would iterate over the
# array element by element in Python.
weights = norms / norms.sum()
start = t.time()
draw = choice(n, sample_size, p=weights, replace=False)
X[draw, :]  # the row gather is part of the timed work; the copy is discarded
end = t.time()
print(end - start)

0.010050058364868164
0.007711887359619141
