In [1]:
# Copyright 2016-present, Facebook, Inc.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE-examples file in the root directory of this source tree.

## Study Sparse vs Dense Matrix Implementations
Pysparnn defaults to sparse matrices, but you may also use a dense matrix to improve performance.

This is typically the case when the number of dimensions is small.

In [2]:
import numpy as np
import time

In [3]:
# make sure you run 'python setup.py install' first!
import pysparnn.cluster_index as ci
import pysparnn.matrix_distance

# Get data

In [4]:
# Synthetic binary data: feature vectors are ~10% full and there are only 100 dimensions.
# Each entry is drawn independently as 1 with probability FILL_RATE.
# A dedicated, seeded RandomState makes the generated data identical on every
# Restart & Run All, without touching NumPy's global random state.
N_RECORDS = 100000
N_DIMENSIONS = 100
FILL_RATE = 0.1
SEED = 0

features = np.random.RandomState(SEED).binomial(1, FILL_RATE, size=(N_RECORDS, N_DIMENSIONS))

In [5]:
# Hold out the first 5,000 rows as search queries; all remaining rows are indexed.
n_test = 5000
test_features, train_features = features[:n_test], features[n_test:]

# One integer per training row (its position in train_features); this is what
# gets handed to the indexes as the second constructor argument.
data_to_return = range(len(train_features))

## Build models to compare

In [6]:
# Index over the training rows using the default distance type
# (the sparse implementation, per the introduction of this notebook).
cp = ci.MultiClusterIndex(train_features, data_to_return)

  magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square)


In [7]:
# Same training rows and payload as the default index, but with the distance
# implementation switched to DenseCosineDistance.
dense_distance = pysparnn.matrix_distance.DenseCosineDistance
dense_cp = ci.MultiClusterIndex(train_features, data_to_return, distance_type=dense_distance)

  magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square)
  return 1 - (dprod * magnitude)
  dist_filter = (dist_matrix <= max_distance)


## Answer Key

In [8]:
import pysparnn_utils

In [9]:
from sklearn.neighbors import NearestNeighbors 
# Reference model: scikit-learn NearestNeighbors with its default settings,
# fitted on the same training rows the pysparnn indexes were built from.
# NOTE(review): this uses sklearn's default metric while the indexes use cosine
# distance -- confirm that the two rankings are meant to be compared.
knn = NearestNeighbors().fit(train_features)

# Indices of the 3 nearest training rows for each test document (no distances).
answers = knn.kneighbors(test_features, n_neighbors=3, return_distance=False)

## Compare Performance
Don't worry too much about the recall numbers. This space is congested (there are many items close together), so these methods should return close matches even if they aren't the absolute closest ones.

In [13]:
# Time the sparse index. The clock starts before the search and is read only
# after the recall has been computed and printed, so cp_time covers all of it.
t0 = time.time()
results = cp.search(test_features, return_distance=False)

sparse_recall = pysparnn_utils.recall(answers, results).mean()
print('Percent of time sparse returns a top 3 result: {}'.format(sparse_recall))

cp_time = time.time() - t0

Percent of time sparse returns a top 3 result: 0.2498


In [14]:
# Time the dense index. The clock starts before the search and is read only
# after the recall has been computed and printed, so dense_cp_time covers all of it.
t0 = time.time()
results = dense_cp.search(test_features, return_distance=False)

dense_recall = pysparnn_utils.recall(answers, results).mean()
print('Percent of time dense returns a top 3 result: {}'.format(dense_recall))

dense_cp_time = time.time() - t0

Percent of time dense returns a top 3 result: 0.2458


In [15]:
# Ratio of sparse to dense elapsed time: the sparse index is this many times
# slower than the dense one (a value above 1 means dense was faster).
cp_time / dense_cp_time

4.979948311566905

**Analysis:** Recall is equivalent (the indexes use random seeds for construction, so small differences are expected) and the dense version is ~5x faster in this case.