In [1]:
# Copyright 2016-present, Facebook, Inc.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE-examples file in the root directory of this source tree.

# Evaluate pysparnn on 20 Newsgroups data

In [2]:
import numpy as np
import time
from scipy.sparse import csr_matrix
from sklearn.datasets import fetch_20newsgroups

In [3]:
# make sure you run 'python setup.py install' first!
import pysparnn.cluster_index as ci

# Get data

In [4]:
dataset = fetch_20newsgroups(subset='all', shuffle=True)

In [5]:
print('Num docs: {}'.format(len(dataset.data)))
print('Avg doc length: {}'.format(np.mean([len(x.split()) for x in dataset.data])))
words = set()
for doc in dataset.data:
    words.update(doc.split())
print('Num unique words: {}'.format(len(words)))

Num docs: 18846
Avg doc length: 283.656001273
Num unique words: 386410


## Turn documents into vectors

In [6]:
from sklearn.neighbors import LSHForest, NearestNeighbors 
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(decode_error='ignore')

features = csr_matrix(tv.fit_transform(dataset.data))

doc_index = np.array(range(len(dataset.data)))

In [7]:
test_features = features[:200]
train_features = features[200:]

## Create an answer key

In [8]:
knn = NearestNeighbors()
        
knn.fit(train_features)

# get top 3 nearest neighbors for each document
answers = knn.kneighbors(test_features, 3, return_distance=False)

## Build models to compare

In [9]:
snn = ci.MultiClusterIndex(train_features, doc_index, num_indexes=2)

In [10]:
lshf = LSHForest()
        
lshf.fit(train_features)

LSHForest(min_hash_match=4, n_candidates=50, n_estimators=10, n_neighbors=5,
     radius=1.0, radius_cutoff_ratio=0.9, random_state=None)

## Compare results

In [11]:
import pysparnn_utils

In [12]:
t0 = time.time()

results = snn.search(test_features, return_distance=False, num_indexes=1)

print('Percent of time snn returns a top 3 result: {}'.format(pysparnn_utils.recall(answers, results).mean()))

snn_time = time.time() - t0

Percent of time snn returns a top 3 result: 0.66


In [13]:
t0 = time.time()

results = lshf.kneighbors(test_features, return_distance=False)

print('Percent of time lsh returns a top 3 result: {}'.format(pysparnn_utils.recall(answers, results).mean()))

lsh_time = time.time() - t0

Percent of time lsh returns a top 3 result: 0.143


In [14]:
# LSH is x times slower than snn
lsh_time / snn_time

5.112146987324278