<a href="https://colab.research.google.com/github/adamMcneil/data-management-project/blob/main/598_mp4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import sys
!{sys.executable} -m pip install h5py faiss-cpu requests scann
import faiss
import h5py
import numpy as np
import time
import matplotlib.pyplot as plt
import os
import requests
import scann





In [7]:
SIFT1M_URL = "http://ann-benchmarks.com/sift-128-euclidean.hdf5"
SIFT1M_FILENAME = "sift-128-euclidean.hdf5"

GLOVE_URL = "http://ann-benchmarks.com/glove-100-angular.hdf5"
GLOVE_FILENAME = "glove-100-angular.hdf5"

def download_sift1m(url, file_name):
    """Downloads the SIFT1M dataset if it's not already present."""
    if not os.path.exists(file_name):
        print("Downloading ", file_name, "dataset...")
        response = requests.get(url, stream=True)
        with open(file_name, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print("Download complete.")
    else:
        print("SIFT1M dataset already exists.")

In [8]:
def load_sift1m(file_name):
    with h5py.File(file_name, "r") as f:
        print(f)
        train_data = np.array(f["train"], dtype=np.float32)
        test_queries = np.array(f["test"], dtype=np.float32)
        ground_truth = np.array(f["neighbors"], dtype=np.int64)[:, 0]
    print("Dataset loaded.")
    return train_data, test_queries, ground_truth

In [4]:
def evaluate_hnsw(train_data, test_queries, ground_truth, M=32, efSearch_vals=[10, 50, 100, 200]):
    d = train_data.shape[1]
    index = faiss.IndexHNSWFlat(d, M, faiss.METRIC_L2)
    index.hnsw.efConstruction = 200
    index.add(train_data)

    results = []
    for ef in efSearch_vals:
        print("Testing ef value:", ef)
        index.hnsw.efSearch = ef
        start_time = time.time()
        _, indices = index.search(test_queries, 1)
        elapsed_time = time.time() - start_time

        recall = np.mean(indices[:, 0] == ground_truth)
        qps = len(test_queries) / elapsed_time
        results.append((ef, recall, qps, 0))
        print("Took:", elapsed_time, "seconds")

    print("HNSW evaluated.")

    return results

In [10]:
def evaluate_scann(train_data, test_queries, ground_truth, num_neighbors=1, num_search_trees=30, num_leaves=1000, num_leaves_to_search_vals=[10, 50, 100, 200], quantize=True):
    train_data = train_data.astype(np.float32)
    test_queries = test_queries.astype(np.float32)

    results = []

    for num_leaves_to_search in num_leaves_to_search_vals:
        print("Building index")
        index = scann.scann_ops_pybind.builder(train_data, num_neighbors, "dot_product") \
        .tree(num_leaves=num_leaves, num_leaves_to_search=num_leaves_to_search, training_sample_size=len(train_data)) \
        .score_ah(2, anisotropic_quantization_threshold=0.2).reorder(100).build()

        print("Testing num_leaves_to_search:", num_leaves_to_search)
        start_time = time.time()
        indices = []
        for query in test_queries:
            i, _ = index.search(query, 1)
            indices.append(i)
        elapsed_time = time.time() - start_time

        print("Took:", elapsed_time, "seconds")

        indices = np.array(indices)
        recall = np.mean(indices[:, 0] == ground_truth)
        print("Recall:", recall)
        qps = len(test_queries) / elapsed_time
        print("QPS:", qps)

        results.append((num_leaves_to_search, recall, qps, 0))

    print("ScaNN evaluated.")

    return results

In [11]:
def evaluate_lsh(train_data, test_queries, ground_truth, nbits_vals=[32, 64, 512, 768]):
    d = train_data.shape[1]
    results = []

    for nbits in nbits_vals:
        index = faiss.IndexLSH(d, nbits)
        index.train(train_data)
        index.add(train_data)

        start_time = time.time()
        _, indices = index.search(test_queries, 1)
        elapsed_time = time.time() - start_time

        recall = np.mean(indices[:, 0] == ground_truth)
        qps = len(test_queries) / elapsed_time
        results.append((nbits, recall, qps, 0))

    print("LSH evaluated.")

    return results

In [10]:
def eval_fiass(train_data, test_queries, ground_truth, index_type="ivfflat", batches=[1, 200, 500, 1000, 5000, 10000]):
    results = []
    total = train_data.shape[0]
    for batch_size in batches:
        dim = train_data.shape[1]

        nlist = 100
        m = 8
        n_bit = 4   # 4 specifies that each sub-vector is encoded as 4 bits
        bbs = 64    # build block size ( bbs % 32 == 0 ) for PQ
        quantizer = faiss.IndexFlatL2(dim)
        if index_type == "ivfflat":
            index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_L2)
        elif index_type == "ivfpq":
            index = faiss.IndexIVFPQ(quantizer, dim, nlist, m, 8)
        elif index_type == "pqfastscan":
            index = faiss.IndexPQFastScan(dim, m, n_bit, faiss.METRIC_L2, bbs)
        else:
          index = quantizer

        print("Building Index")
        batch_len = total // batch_size
        start_time = time.time()
        for i in range(batch_size):
            start = i * batch_len
            end = (i + 1) * batch_len if i != batch_size - 1 else total
            index.train(train_data[start:end])
            index.add(train_data[start:end])
        build_time = time.time() - start_time
        print("Build time:", build_time)

        print("Testing Search")
        start_time = time.time()
        distances, indices = index.search(test_queries, 1)
        elapsed_time = time.time() - start_time

        recall = np.mean(indices[:, 0] == ground_truth)
        print("Recall:", recall)
        qps = len(test_queries) / elapsed_time
        print("QPS:", qps)
        results.append((batch_size, recall, qps, build_time, "batch_size"))
    return results

In [11]:
def eval_fiass_percent(train_data, test_queries, ground_truth, index_type="ivfflat", percents=[.50, .60, .70, .80, .90, 1.00]):
    results = []
    total = train_data.shape[0]
    for percent in percents:
        dim = train_data.shape[1]

        nlist = 100
        m = 8
        n_bit = 4   # 4 specifies that each sub-vector is encoded as 4 bits
        bbs = 64    # build block size ( bbs % 32 == 0 ) for PQ
        quantizer = faiss.IndexFlatL2(dim)
        if index_type == "ivfflat":
            index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_L2)
        elif index_type == "ivfpq":
            index = faiss.IndexIVFPQ(quantizer, dim, nlist, m, 8)
        elif index_type == "pqfastscan":
            index = faiss.IndexPQFastScan(dim, m, n_bit, faiss.METRIC_L2, bbs)
        else:
          index = quantizer

        print("Building Index")
        start_time = time.time()
        break_point = int(total * percent)
        print(break_point)
        index.train(train_data[0:break_point])
        index.add(train_data)
        build_time = time.time() - start_time
        print("Build time:", build_time)

        print("Testing Search")
        start_time = time.time()
        distances, indices = index.search(test_queries, 1)
        elapsed_time = time.time() - start_time

        recall = np.mean(indices[:, 0] == ground_truth)
        print("Recall:", recall)
        qps = len(test_queries) / elapsed_time
        print("QPS:", qps)
        results.append((percent, recall, qps, build_time, "batch_size"))
    return results

In [19]:
def plot_results(results):
    plt.figure(figsize=(8, 6))

    for result in results:
        for metric, recall, qps, build_time, label in result:
            plt.scatter(qps, recall, label=f'{label}={metric}', marker='o')
    plt.xlabel("Queries Per Second (QPS)")
    plt.ylabel("1-Recall@1")
    plt.title("QPS vs Recall")
    plt.legend()
    plt.grid()
    plt.show()

    for result in results:
        for metric, recall, qps, build_time, label in result:
            plt.scatter(build_time, recall, label=f'{label}={metric}', marker='o')
    plt.xlabel("Build Time (second)")
    plt.ylabel("1-Recall@1")
    plt.title("Build Time vs Recall")
    plt.legend()
    plt.grid()
    plt.show()

In [12]:
download_sift1m(SIFT1M_URL, SIFT1M_FILENAME)
train_data, test_queries, ground_truth = load_sift1m(SIFT1M_FILENAME)

print("Train data shape:", train_data.shape)
print(train_data)
print("Test queries shape:", test_queries.shape)
print(test_queries)
print("Ground truth shape:", ground_truth.shape)
print(ground_truth)

SIFT1M dataset already exists.
<HDF5 file "sift-128-euclidean.hdf5" (mode r)>
Dataset loaded.
Train data shape: (1000000, 128)
[[  0.  16.  35. ...  25.  23.   1.]
 [ 14.  35.  19. ...  11.  21.  33.]
 [  0.   1.   5. ...   4.  23.  10.]
 ...
 [ 30.  12.  12. ...  50.  10.   0.]
 [  0.   5.  12. ...   1.   2.  13.]
 [114.  31.   0. ...  25.  16.   0.]]
Test queries shape: (10000, 128)
[[  1.   3.  11. ...  42.  48.  11.]
 [ 40.  25.  11. ...   3.  19.  13.]
 [ 28.   4.   3. ...   2.  54.  47.]
 ...
 [  0.  15.  64. ...   3.  62. 118.]
 [131.   2.   0. ...   7.   0.   0.]
 [ 23.   0.   0. ...  79.  16.   4.]]
Ground truth shape: (10000,)
[932085 413247 669835 ... 123855 755327 874343]


In [16]:
hnsw_results = evaluate_hnsw(train_data, test_queries, ground_truth)
print(hnsw_results)

Testing ef value: 10
Took: 1.0034310817718506 seconds
Testing ef value: 50
Took: 2.15854811668396 seconds
Testing ef value: 100
Took: 3.562847375869751 seconds
Testing ef value: 200
Took: 8.163212299346924 seconds
HNSW evaluated.
[(10, np.float64(0.854), 9965.806502965883, 0), (50, np.float64(0.9783), 4632.743612573419, 0), (100, np.float64(0.9906), 2806.7438610273985, 0), (200, np.float64(0.9929), 1225.0079543809027, 0)]


In [None]:
scann_results = evaluate_scann(train_data, test_queries, ground_truth)
print(scann_results)

Building index


In [None]:
# fiass_results = eval_fiass(train_data, test_queries, ground_truth, index_type="ivfpq", batches=[1, 10, 20, 50, 100])
fiass_results = eval_fiass(train_data, test_queries, ground_truth, batches=[1, 10, 20, 50, 100])
print(fiass_results)

Building Index


In [14]:
fiass_results_percent = eval_fiass_percent(train_data, test_queries, ground_truth)
print(fiass_results_percent)

Building Index
500000
Build time: 1.6443662643432617
Testing Search
Recall: 0.6138
QPS: 1831.3078996171953
Building Index
600000
Build time: 1.7050743103027344
Testing Search
Recall: 0.6138
QPS: 2086.423194648479
Building Index
700000
Build time: 1.5279006958007812
Testing Search
Recall: 0.6113
QPS: 1864.5663645938375
Building Index
800000
Build time: 1.635122537612915
Testing Search
Recall: 0.6131
QPS: 2206.0716010327287
Building Index
900000
Build time: 1.639585018157959
Testing Search
Recall: 0.6054
QPS: 1774.27742247564
Building Index
1000000
Build time: 1.6969890594482422
Testing Search
Recall: 0.6144
QPS: 2179.210837862982
[(0.5, np.float64(0.6138), 1831.3078996171953, 1.6443662643432617, 'batch_size'), (0.6, np.float64(0.6138), 2086.423194648479, 1.7050743103027344, 'batch_size'), (0.7, np.float64(0.6113), 1864.5663645938375, 1.5279006958007812, 'batch_size'), (0.8, np.float64(0.6131), 2206.0716010327287, 1.635122537612915, 'batch_size'), (0.9, np.float64(0.6054), 1774.277422475

In [2]:
print(ground_truth)
plot_results([fiass_results_percent])

NameError: name 'plot_results' is not defined