
Cleaned up code and documentation. Added new TestLSHCode.m to run through all the tests.
MalcolmSlaney committed Nov 10, 2011
1 parent e7fadce commit 00c99e42fd61b596374b61be6bbec1abd63abf1b
Showing with 211 additions and 60 deletions.
  1. +48 −56 CalculateLSHParameters.m
  2. +150 −0 TestLSHCode.m
  3. +9 −0 doc/index.html
  4. +4 −4 lsh.py
CalculateLSHParameters.m
@@ -322,75 +322,67 @@
%%%%%%%%%%%%%%%%%%% EXACT PARAMETER ESTIMATION %%%%%%%%%%%%%%%%%%%%%%
% Now do it with the full optimal calculations
% Eq. 41 for all values of w
-alpha = log((binNnProb-binAnyProb)./(1-binNnProb)) + ...
+wAlpha = log((binNnProb-binAnyProb)./(1-binNnProb)) + ...
r * log(binAnyProb2./binAnyProb) + log(uCheck/uHash) - log(factorial(r));
-% Eq. 40 fir all values of w
-binK = (log(N)+alpha)./(-log(binAnyProb)) + ...
- r*log((log(N)+alpha)./(-log(binAnyProb)));
-binK(imag(binK) ~= 0.0 | binK < 1) = 1; % Set bad values to at least 1
-if 0 % Hopefully not necessary
- for iTemp = 1:length(binK)
- if isreal(binK(iTemp)) ==0 | binK(iTemp) < 0
- binK(iTemp) = 1;
- end
- end
-end
+% Eq. 40 for all values of w
+wBestK = (log(N)+wAlpha)./(-log(binAnyProb)) + ...
+ r*log((log(N)+wAlpha)./(-log(binAnyProb)));
+wBestK(imag(wBestK) ~= 0.0 | wBestK < 1) = 1; % Set bad values to at least 1
% Now we want to find the total cost for all values of w. We will argmin
% this to find the best w. (This is the inside of Eq. 39.) We first compute
% the inside of x^(log/log).
-temp = ((binK.^r).*(binNnProb-binAnyProb)*N*uCheck.*((binAnyProb2./binAnyProb).^r))./ ...
- ((1-binNnProb).*uHash*factorial(r));
-wFullCost = (-log(deltaTarget))* uHash*factorial(r)*((binNnProb./binNnProb2).^r)./ ...
- (binK.^r).*(1-binAnyProb)./(binNnProb-binAnyProb).* ...
- temp.^(log(binNnProb)./log(binAnyProb));
-results.wExactCost = wFullCost;
+% Equations (48), (49) and (50) for optimalBin estimate.
+Ch = uHash * (-log(deltaTarget)) * ...
+ factorial(r) * (binNnProb./ binNnProb2).^r;
+Cc = uCheck * (-log(deltaTarget))*N*((binNnProb./binNnProb2)).^r .* ...
+ (binAnyProb2./binAnyProb).^r;
+wFullCost = Ch ./ ( (wBestK.^r) .* (binNnProb.^wBestK)) + ...
+ Cc .* (binAnyProb./binNnProb).^wBestK;
+results.wExpExactCost = wFullCost;
+
+% Now compute the integer values of k and l.
+% Don't let r get bigger than k (vs. W), which can happen for very small w.
+kVsW = floor(wBestK);
+% We compute L via Equation (42)
+lVsW = ceil(-log(deltaTarget)./ ...
+ ( choose(kVsW, min(r, kVsW)) .* (binNnProb.^(max(1,wBestK-r))) .* ...
+ (binNnProb2.^r)));
[optimalMin, optimalBin] = min(wFullCost);
optimalW = wList(optimalBin)/dScale;
-optimalK = floor(binK(optimalBin));
-% optimalL = ceil(-log(deltaTarget)/ ...
-% ( (optimalK^r)/factorial(r) * (binNnProb(optimalBin)^(optimalK-r)) * ...
-% (binNnProb2(optimalBin)^r))); % Wrong expression for C^r_k -
-% Malcolm 9/8/2011
- % Equation (42)
-optimalL = ceil(-log(deltaTarget)/ ...
- ( choose(optimalK,r) * (binNnProb(optimalBin)^(optimalK-r)) * ...
- (binNnProb2(optimalBin)^r)));
-
-kVsW = floor(binK);
-% Don't let r get bigger than k (vs. W), which can happen for very small w.
-binL = ceil(-log(deltaTarget)./ ...
- ( choose(kVsW, min(r, kVsW)) .* (binNnProb.^(binK-r)) .* ...
- (binNnProb2.^r)));
-
-% Equations (48), (49) and (50) for optimalBin estimate.
-Ch = uHash * (-log(deltaTarget)) * ...
- factorial(r) * (binNnProb(optimalBin)./ binNnProb2(optimalBin))^r;
-Cc = uCheck * (-log(deltaTarget))*N*((binNnProb(optimalBin)./binNnProb2(optimalBin)))^r * ...
- (binAnyProb2(optimalBin)/binAnyProb(optimalBin))^r;
-optimalCost = Ch /( (optimalK^r) * (binNnProb(optimalBin)^optimalK)) + ...
- Cc * (binAnyProb(optimalBin)/binNnProb(optimalBin))^optimalK;
+optimalK = kVsW(optimalBin);
+optimalL = lVsW(optimalBin);
-% results.exactCostVsW = wFullCost;
results.exactW = optimalW;
results.exactK = optimalK;
results.exactL = optimalL;
results.exactBin = optimalBin;
-results.exactCost = optimalCost;
-results.wExactK = binK;
-results.wExactL = binL;
-results.wCandidateCount = N*optimalL.*choose(floor(binK),r).* ...
- binAnyProb.^(floor(binK)-r) .* binAnyProb2.^r;
+results.exactCost = wFullCost(optimalBin);
+results.wExactK = wBestK;
+results.wExactL = lVsW;
+results.wExactCost = wFullCost;
+
+% Now figure out the best possible estimate of the total number of
+% candidates we will check. Sum over all distances <= r.
+probSum = 0;
+for ri = 0:r
+ candidatesPerBucket = choose(kVsW, min(ri, kVsW)) .* ...
+ results.binAnyProb.^max(0,kVsW-ri) .* ...
+ results.binAnyProb2.^ri;
+ probSum = probSum + candidatesPerBucket;
+end
+results.wCandidateCount = N * (1 - (1-probSum).^lVsW);
+results.wCandidateCount2 = N*probSum.*lVsW;
fprintf('Exact Optimization:\n');
fprintf('\tFor %d points of data use: ', N);
fprintf('w=%g and get %g hits per bin and %g nn.\n', ...
optimalW, binAnyProb(optimalBin), ...
binNnProb(optimalBin));
fprintf('\t\tK=%g L=%g cost is %g\n', ...
- optimalK, optimalL, optimalCost);
+ optimalK, optimalL, wFullCost(optimalBin));
% And print the statistics for L=1 for simple parameters.
desiredOptimalK = round(optimalK);
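
For readers without the paper at hand, the formulas this rewritten block implements can be reconstructed from the code itself. The equation numbers refer to the "Optimal Locality-Sensitive Hashing" manuscript cited in doc/index.html; the notation below mirrors the MATLAB variable names (p_nn = binNnProb, p_any = binAnyProb, with q_nn, q_any their multiprobe "2" variants), so read this as a sketch of what the code computes rather than the paper's own typesetting:

\begin{align*}
\alpha(w) &= \ln\frac{p_{nn}-p_{any}}{1-p_{nn}}
           + r\,\ln\frac{q_{any}}{p_{any}}
           + \ln\frac{U_{check}}{U_{hash}} - \ln r!
  && \text{(Eq. 41, \texttt{wAlpha})}\\
k^*(w) &= \frac{\ln N + \alpha(w)}{-\ln p_{any}}
        + r\,\ln\frac{\ln N + \alpha(w)}{-\ln p_{any}}
  && \text{(Eq. 40, \texttt{wBestK})}\\
L(w) &= \left\lceil \frac{-\ln\delta}
        {\binom{k}{r}\, p_{nn}^{\,k-r}\, q_{nn}^{\,r}} \right\rceil
  && \text{(Eq. 42, \texttt{lVsW})}\\
\mathrm{cost}(w) &= \frac{C_h}{k^{r}\, p_{nn}^{\,k}}
        + C_c\left(\frac{p_{any}}{p_{nn}}\right)^{k}
  && \text{(Eqs. 48--50, \texttt{wFullCost})}
\end{align*}

The best w is the argmin of cost(w); optimalK and optimalL are the integer-valued k and L at that bin.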
@@ -647,8 +639,8 @@
% F=SIMPLEPDF(X,MU,SIG,'boxcar') computes a uniform pdf with mean MU
% and standard deviation SIG.
%
-% F=SIMPLEPDF(X,XO,ALPHA,'cauchy') computes a Cauchy pdf with location
-% parameter XO and scale parameter ALPHA.
+% F=SIMPLEPDF(X,XO,wAlpha,'cauchy') computes a Cauchy pdf with location
+% parameter XO and scale parameter wAlpha.
%
% F=SIMPLEPDF(X,BETA,'exponential') computes an exponential pdf with
% scale parameter, hence mean and standard deviation, equal to BETA.
@@ -657,7 +649,7 @@
%
% Usage: f=simplepdf(x,mu,sig,'gaussian');
% f=simplepdf(x,mu,sig,'boxcar');
-% f=simplepdf(x,xo,alpha,'cauchy');
+% f=simplepdf(x,xo,wAlpha,'cauchy');
% f=simplepdf(x,beta,'exponential');
% __________________________________________________________________
% This is part of JLAB --- type 'help jlab' for more information
@@ -688,8 +680,8 @@
f(ia:ib)=1;
f=f./vsum(f*dx,1);
elseif strcmp(flag,'cauchy')
- alpha=sig;
- f=frac(alpha./pi,(x-mu).^2 + alpha.^2);
+ wAlpha=sig;
+ f=frac(wAlpha./pi,(x-mu).^2 + wAlpha.^2);
elseif strcmp(flag,'exponential')
f=frac(1,mu).*exp(-abs(x)./mu);
end
@@ -1000,12 +992,12 @@
function[]=pdfinv_test
-alpha=2;
+wAlpha=2;
dy=0.01;
yi=(-40:dy:40)';
-fx=simplepdf(yi,0,alpha,'cauchy');
-fy=simplepdf(yi,0,1./alpha,'cauchy');
+fx=simplepdf(yi,0,wAlpha,'cauchy');
+fy=simplepdf(yi,0,1./wAlpha,'cauchy');
fy2=pdfinv(yi,fx);
tol=1e-3;
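
The pdfinv_test above exercises a standard reciprocal property of the Cauchy distribution. As a sketch in the renamed notation (wAlpha written as alpha): the density with location x_0 and scale alpha is

f(x;\,x_0,\alpha) = \frac{\alpha/\pi}{(x-x_0)^2 + \alpha^2},
\qquad
X \sim \mathrm{Cauchy}(0,\alpha) \;\Rightarrow\; \frac{1}{X} \sim \mathrm{Cauchy}\!\left(0,\tfrac{1}{\alpha}\right),

which is exactly the fx/fy pair the test feeds to pdfinv and compares to tolerance tol.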
TestLSHCode.m
@@ -0,0 +1,150 @@
+% Create some data. First pick out the test dimensions, and the locations
+% of some files.
+D = 4;
+PythonCmd = 'python2.6';
+LSHCmd = 'lsh.py';
+tmpFile = '/tmp/lshtest.out';
+tmpFile2 = '/tmp/lshtest.out2';
+subPlots = 1;
+
+%%
+% Now run the python command to create the data.
+cmd = sprintf('%s %s -d %d -create', PythonCmd, LSHCmd, D);
+fprintf('Running the command: %s\n', cmd);
+system(cmd);
+%%
+% Now run the python command to measure the distance data
+cmd = sprintf('%s %s -d %d -histogram', PythonCmd, LSHCmd, D);
+fprintf('Running the command: %s\n', cmd);
+system(cmd);
+%%
+% Load in the distance data and calculate the distance histograms.
+testData = load(sprintf('testData%03d.distances', D));
+nBins = 40;
+[dnnHist, dnnBins] = hist(testData(:,1), nBins);
+[danyHist, danyBins] = hist(testData(:,2), nBins);
+if subPlots
+ subplot(2,2,1);
+end
+plot(dnnBins, dnnHist, danyBins, danyHist);
+legend('Nearest Neighbor', 'Any Neighbor');
+xlabel('Distance')
+ylabel('Frequency of Occurrence');
+title(sprintf('Distance Histogram for %d-D data', D));
+%%
+% Now calculate the optimum LSH parameters.
+N=100000;
+deltaTarget = 0.5;
+r = 0;
+uHash = 1;
+uCheck = 1;
+results = CalculateMPLSHParameters(N, ...
+ dnnHist, dnnBins, danyHist, danyBins, deltaTarget, r, uHash, uCheck);
+%%
+% Now let's run the W test.
+w = results.exactW;
+cmd = sprintf('%s %s -d %d -w %g -wTest > %s', PythonCmd, LSHCmd, ...
+ D, w, tmpFile);
+fprintf('Running the command: %s\n', cmd);
+system(cmd);
+
+%%
+% And load the W-test results.
+cmd = sprintf('egrep -v "[a-zA-Z]" %s | tail -20 > %s', tmpFile, tmpFile2);
+[s,res] = system(cmd);
+wTest = load(tmpFile2);
+
+
+%%
+% And now create the w-test plot.
+if subPlots
+ subplot(2,2,2);
+end
+
+semilogx(results.wList/results.dScale, results.binNnProb, ...
+ wTest(:,1), wTest(:,2), 'bx', ...
+ results.wList/results.dScale, results.binAnyProb, ...
+ wTest(:,1), wTest(:,3), 'gx', ...
+ [results.exactW, results.exactW], [0 1], 'r--');
+
+xlabel('W');
+ylabel('Probability of Collision');
+legend('p_{nn}', 'p_{nn} experimental', ...
+ 'p_{any}', 'p_{any} experimental', ...
+ 'Optimum W', ...
+ 'Location', 'SouthEast');
+title(sprintf('wTest for %d-D data', D));
+
+%%
+% Now let's run the K test
+cmd = sprintf('%s %s -d %d -w %g -kTest > %s', PythonCmd, LSHCmd, ...
+ D, w, tmpFile);
+fprintf('Running the command: %s\n', cmd);
+system(cmd);
+
+%%
+% And grab the K-test results.
+cmd = sprintf('grep " 10 " %s > %s', tmpFile, tmpFile2);
+system(cmd);
+kTest = load(tmpFile2);
+
+%%
+% Now plot the k-test results.
+if subPlots
+ subplot(2,2,3);
+end
+
+semilogy(kTest(:,2), kTest(:,4), 'bx', ...
+ kTest(:,2), kTest(1,4).^kTest(:,2), 'g-', ...
+ kTest(:,2), ...
+ results.binNnProb(results.exactBin).^kTest(:,2), 'r--');
+xlabel('Number of Projections (K)');
+ylabel('Probability');
+legend('Experimental Data', 'Extrapolated Prediction', ...
+ 'Theoretical Prediction', 'Location', 'SouthWest');
+title(sprintf('kTest for %d-D data', D));
+
+%%
+% Now let's run the L test.
+cmd = sprintf('%s %s -d %d -w %g -lTest > %s', PythonCmd, LSHCmd, ...
+ D, w, tmpFile);
+fprintf('Running the command: %s\n', cmd);
+system(cmd);
+
+%%
+% And grab the L-test results
+cmd = sprintf('grep " 10 " %s > %s', tmpFile, tmpFile2);
+system(cmd);
+lTest = load(tmpFile2);
+
+%%
+% Now plot the results.
+if subPlots
+ subplot(2,2,4);
+end
+
+baseNN = results.binNnProb(results.exactBin).^lTest(1,2);
+baseAny = results.binAnyProb(results.exactBin).^lTest(1,2);
+semilogy(lTest(:,3), lTest(:,4), 'rx', ...
+ lTest(:,3), lTest(:,5), 'kx', ...
+ lTest(:,3), (1-(1-baseNN).^lTest(:,3)), ...
+ lTest(:,3), (1-(1-baseAny).^lTest(:,3)), ...
+ lTest(:,3), (1-(1-kTest(8,4)).^lTest(:,3)), ...
+ lTest(:,3), (1-(1-kTest(8,5)).^lTest(:,3)));
+
+legend('p_{nn} Experimental', 'p_{any} Experimental', ...
+ 'p_{nn} Theory', 'p_{any} Theory',...
+ 'p_{nn} k-Prediction', 'p_{any} k-Prediction', ...
+ 'Location', 'SouthEast');
+xlabel('Number of Tables (L)');
+ylabel('Probability');
+title(sprintf('lTest for %d-D data', D));
+
+%%
+save TestLSHCode wTest lTest kTest results D subPlots ...
+ dnnHist dnnBins danyHist danyBins
+
+%%
+% set(gcf,'Position', [100 100 900 800])
+pictureFile = sprintf('TestLSHCode-%s.eps', date);
+print('-depsc', pictureFile);
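
The theory curves in both the k-test and l-test panels above come from the standard LSH collision model; as a sketch, with p the probability that a single projection keeps two points in the same bucket:

P[\text{collide on all } k \text{ projections}] = p^{k},
\qquad
P[\text{hit in at least one of } L \text{ tables}] = 1 - \left(1 - p^{k}\right)^{L}.

The first expression is the prediction line in the k-test plot; the second generates the prediction curves in the l-test plot, and the wCandidateCount estimate added to CalculateLSHParameters.m above is the same union-over-tables form with the multiprobe sum in place of p^k.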
doc/index.html
@@ -11,8 +11,17 @@
</head>
<body>
<H1>Yahoo! Optimal LSH</H1>
+Malcolm Slaney, Yahoo! Research, November 2011
<H3>Introduction</H3>
The Python and Matlab code in this distribution implement a locality-sensitive hash (LSH) for high-dimensional data. This task is important because web-scale multimedia data is both large and high-dimensional, and thus suffers from the curse of dimensionality. Yet users want answers within milliseconds.</p>
+<p>
+My favorite introduction to LSH is this article:
+<a href="http://www.slaney.org/malcolm/yahoo/Slaney2008-LSHTutorial.pdf">
+Slaney, M. and Casey, M.,
+"Locality Sensitive Hashing for Finding Nearest Neighbours,"
+<em>IEEE Signal Processing Magazine,</em> March 2008.
+</a>
+
<p>
The LSH parameter optimization algorithm is described in this article, which is currently under review:
Malcolm Slaney, Yury Lifshits, Junfeng He, "Optimal Locality-Sensitive Hashing," Submitted to <em>Proceedings of the IEEE,</em> Special Issue on Web-Scale Multimedia, Summer 2012.</p>
lsh.py
@@ -213,18 +213,18 @@ def CalculateHashIterator(self, data, multiprobeRadius=0):
if multiprobeRadius > 0:
# print "Multiprobe points:", points
# print "Multiprobe bin:", bins
- # print "Multiprobe direct:", direct
+ # print "Multiprobe direct:", directVector
dimensions = range(self.k)
deltaVector = numpy.zeros((self.k, 1), 'int') # Preallocate
- for r in range(1, multiprobeRadius):
+ for r in range(1, multiprobeRadius+1):
# http://docs.python.org/library/itertools.html
for candidates in itertools.combinations(dimensions, r):
deltaVector *= 0 # Start Empty
deltaVector[list(candidates), 0] = 1 # Set some bits
newProbe[:] = bins + deltaVector*directVector # New probe
t1 = self.ListHash(newProbe)
t2 = self.ListHash(newProbe[::-1]) # Reverse data for second hash
- # print "Multiprobe probe:",p, t1, t2
+ # print "Multiprobe probe:",newProbe, t1, t2
yield (t1,t2)
# Put some data into the hash bucket for this LSH projection
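
The range(1, multiprobeRadius+1) fix matters because Python's range excludes its upper bound, so the old loop never emitted the outermost perturbation shell. A minimal standalone sketch of what the corrected loop enumerates (the function name and simplified delta handling here are illustrative assumptions, not lsh.py's API):

import itertools
import numpy

def perturbation_vectors(k, radius):
    # Yield every 0/1 delta vector with 1..radius bits set, mirroring the
    # corrected loop: range(1, radius + 1) includes Hamming weight == radius,
    # where range(1, radius) silently dropped it.
    dimensions = range(k)
    for r in range(1, radius + 1):
        for candidates in itertools.combinations(dimensions, r):
            delta = numpy.zeros(k, 'int')
            delta[list(candidates)] = 1
            yield delta

# Example: k=3 projections, radius 2 gives C(3,1) + C(3,2) = 6 extra probes
# beyond the home bucket.
print(len(list(perturbation_vectors(3, 2))))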
@@ -707,7 +707,7 @@ def ComputeDistanceHistogram(self, fp = sys.stdout):
the input for the parameter optimization routine. Enhanced
to also print the NN binary projections.'''
numPoints = self.NumPoints()
- medians = self.FindMedian()
+ # medians = self.FindMedian() # Not used now, but useful for binary quantization
print "Pulling %d items from the NearestNeighbors list for ComputeDistanceHistogram" % \
len(self.nearestNeighbors.items())
for (queryKey,(nnKey,nnDist)) in self.nearestNeighbors.items():
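
As a rough sketch of the kind of output ComputeDistanceHistogram feeds to the Matlab optimizer (the two-column layout, one nearest-neighbor and one any-neighbor distance per line, is inferred from TestLSHCode.m's load of testData%03d.distances; treat the details as assumptions):

import numpy

def distance_pairs(data, queries):
    # Hypothetical sketch: emit (d_nn, d_any) pairs for each query point,
    # matching the two-column distances file that TestLSHCode.m histograms.
    for q in queries:
        d = numpy.sqrt(((data - q) ** 2).sum(axis=1))  # Euclidean distances
        d.sort()
        # d[0] is the query itself if it is drawn from data; d[1] is its NN.
        d_nn = d[1]
        d_any = d[numpy.random.randint(2, len(d))]  # a random other point
        yield d_nn, d_any

data = numpy.random.randn(1000, 4)
for d_nn, d_any in distance_pairs(data, data[:10]):
    print("%g %g" % (d_nn, d_any))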
