In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.neighbors import LSHForest

In [3]:
# Minimal worked example: index five 3-feature training points with an
# LSHForest and look up the two nearest neighbors of each test point.
X_train = [[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1], [6, 10, 2]]
X_test = [[9, 1, 6], [3, 1, 10], [7, 10, 3], [1, 1, 1]]

# A fixed random_state keeps the run repeatable.
lshf = LSHForest(random_state=42)
lshf.fit(X_train)

# The doctest prompts and the pasted estimator repr were removed: the repr
# was being executed as code and built a second, discarded LSHForest.
distances, indices = lshf.kneighbors(X_test, n_neighbors=2)

# Single-argument print(...) produces the same text on Python 2 and 3.
print(distances)
print(indices)


[[  6.93930993e-02   1.49609594e-01]
 [  2.29325364e-01   4.81000704e-01]
 [  4.89443707e-03   1.48191329e-02]
 [ -2.22044605e-16   5.71909584e-02]]
[[1 2]
 [2 0]
 [4 0]
 [2 0]]


In [4]:
import time
import numpy as np
from sklearn.datasets.samples_generator import make_blobs
from sklearn.neighbors import LSHForest
from sklearn.neighbors import NearestNeighbors

# Initialize size of the database, iterations and required neighbors.
n_samples = 10000
n_features = 100
n_iter = 30
n_neighbors = 100
rng = np.random.RandomState(42)

# Generate sample data
X, _ = make_blobs(n_samples=n_samples, n_features=n_features,
                  centers=10, cluster_std=5, random_state=0)

# Set `n_estimators` values.
# The builtin `int` replaces the `np.int` alias, which was only ever an alias
# for it and has been removed from recent NumPy releases.
n_estimators_values = np.linspace(1, 30, 5).astype(int)
accuracies_trees = np.zeros(n_estimators_values.shape[0], dtype=float)

# The exact (brute-force) baseline does not depend on `n_estimators`, so it
# is built and fitted once instead of once per outer iteration.
nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute')
nbrs.fit(X)

# Calculate average accuracy for each value of `n_estimators`.
# Accuracy here is the fraction of the exact neighbors that the approximate
# search also returned, averaged over `n_iter` randomly drawn query points.
for i, n_estimators in enumerate(n_estimators_values):
    lshf = LSHForest(n_candidates=500, n_estimators=n_estimators,
                     n_neighbors=n_neighbors)
    lshf.fit(X)

    for j in range(n_iter):
        # kneighbors expects a 2-D (n_queries, n_features) array; passing a
        # bare 1-D sample is deprecated, so reshape to a single-row matrix.
        query = X[rng.randint(0, n_samples)].reshape(1, -1)
        neighbors_approx = lshf.kneighbors(query, return_distance=False)
        neighbors_exact = nbrs.kneighbors(query, return_distance=False)

        intersection = np.intersect1d(neighbors_approx,
                                      neighbors_exact).shape[0]
        ratio = intersection/float(n_neighbors)
        accuracies_trees[i] += ratio

    accuracies_trees[i] = accuracies_trees[i]/float(n_iter)



In [5]:
# Set `n_candidates` values.
# The builtin `int` replaces the `np.int` alias, which was only ever an alias
# for it and has been removed from recent NumPy releases.
n_candidates_values = np.linspace(10, 500, 5).astype(int)
accuracies_c = np.zeros(n_candidates_values.shape[0], dtype=float)

# The exact (brute-force) baseline does not depend on `n_candidates`, so it
# is built and fitted once instead of once per outer iteration.
nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute')
nbrs.fit(X)

# Calculate average accuracy for each value of `n_candidates`.
# Accuracy here is the fraction of the exact neighbors that the approximate
# search also returned, averaged over `n_iter` randomly drawn query points.
for i, n_candidates in enumerate(n_candidates_values):
    lshf = LSHForest(n_candidates=n_candidates, n_neighbors=n_neighbors)
    # Fit the approximate nearest neighbor model
    lshf.fit(X)

    for j in range(n_iter):
        # kneighbors expects a 2-D (n_queries, n_features) array; passing a
        # bare 1-D sample is deprecated, so reshape to a single-row matrix.
        query = X[rng.randint(0, n_samples)].reshape(1, -1)
        # Get neighbors
        neighbors_approx = lshf.kneighbors(query, return_distance=False)
        neighbors_exact = nbrs.kneighbors(query, return_distance=False)

        intersection = np.intersect1d(neighbors_approx,
                                      neighbors_exact).shape[0]
        ratio = intersection/float(n_neighbors)
        accuracies_c[i] += ratio

    accuracies_c[i] = accuracies_c[i]/float(n_iter)



In [6]:
from annoy import AnnoyIndex
import random

# Annoy round trip: build an index of random vectors, save it to disk,
# load it back into a fresh index object and query it.
f = 40  # Length of item vector that will be indexed

# The metric is spelled out because newer annoy releases warn when it is
# omitted; 'angular' is the documented implicit default, so results match.
t = AnnoyIndex(f, 'angular')

# `range` works on both Python 2 and 3 (`xrange` is Python 2 only).
for i in range(1000):
    v = [random.gauss(0, 1) for z in range(f)]
    t.add_item(i, v)

t.build(10) # 10 trees
t.save('test.ann')

# ...

# The loading index must be created with the same dimensionality and metric.
u = AnnoyIndex(f, 'angular')
u.load('test.ann') # super fast, will just mmap the file
print(u.get_nns_by_item(0, 1000)) # will find the 1000 nearest neighbors

[0, 575, 327, 201, 554, 950, 483, 411, 454, 829, 51, 14, 92, 895, 997, 880, 857, 779, 881, 451, 654, 352, 652, 171, 418, 843, 265, 400, 487, 78, 197, 36, 748, 951, 299, 651, 31, 876, 193, 5, 316, 144, 49, 109, 8, 392, 774, 362, 723, 99, 813, 384, 731, 837, 523, 608, 101, 923, 278, 975, 69, 938, 390, 931, 252, 40, 126, 462, 968, 134, 818, 110, 805, 515, 54, 617, 735, 851, 616, 279, 817, 194, 808, 18, 726, 369, 577, 196, 769, 623, 539, 560, 492, 221, 349, 839, 580, 34, 74, 714, 664, 816, 604, 792, 288, 693, 264, 381, 732, 822, 484, 441, 88, 307, 720, 474, 929, 712, 96, 138, 199, 541, 743, 900, 932, 553, 133, 855, 448, 200, 146, 283, 498, 191, 189, 128, 460, 289, 961, 226, 33, 518, 597, 249, 781, 373, 629, 391, 413, 23, 671, 751, 386, 190, 612, 335, 310, 835, 848, 131, 710, 776, 704, 412, 925, 906, 980, 907, 729, 878, 68, 559, 500, 257, 470, 468, 528, 174, 543, 9, 93, 385, 398, 317, 48, 95, 408, 877, 161, 708, 902, 254, 607, 229, 583, 650, 910, 853, 84, 800, 107, 945, 524, 469, 676, 198, 

In [7]:
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer 

# Sanity check: vectorize a corpus of five identical one-word documents and
# inspect the resulting TF-IDF matrix as a dense array.
corpus = ['blah'] * 5

tfidf = TfidfVectorizer()
arr = tfidf.fit_transform(corpus)

# Bare last expression so the notebook displays the dense matrix.
arr.toarray()

array([[ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.]])

In [8]:
# Shell escape: print the kernel's current working directory.
# NOTE(review): the saved output exposes an absolute local path - consider
# clearing this cell's output before sharing the notebook.
!pwd

/Users/ajay/Documents/take_homes/NLP ML Engineering ASAPP Challenge/markov_model
