<a href="https://colab.research.google.com/github/adamMcneil/hot-topics-data-management-project/blob/main/598_mp4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
!{sys.executable} -m pip install h5py faiss-cpu requests scann
import faiss
import h5py
import numpy as np
import time
import matplotlib.pyplot as plt
import os
import requests
import scann

SIFT1M_URL = "http://ann-benchmarks.com/sift-128-euclidean.hdf5"
SIFT1M_FILENAME = "sift-128-euclidean.hdf5"

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting scann
  Downloading scann-1.4.0-cp311-cp311-manylinux_2_27_x86_64.whl.metadata (5.8 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scann-1.4.0-cp311-cp311-manylinux_2_27_x86_64.whl (11.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scann, faiss-cpu
Successfully installed faiss-cpu-1.10.0 scann-1.4.0


In [2]:
def download_sift1m():
    """Downloads the SIFT1M dataset if it's not already present."""
    if not os.path.exists(SIFT1M_FILENAME):
        print("Downloading SIFT1M dataset...")
        response = requests.get(SIFT1M_URL, stream=True)
        with open(SIFT1M_FILENAME, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print("Download complete.")
    else:
        print("SIFT1M dataset already exists.")

In [3]:
def load_sift1m():
    with h5py.File("sift-128-euclidean.hdf5", "r") as f:
        train_data = np.array(f["train"], dtype=np.float32)
        test_queries = np.array(f["test"], dtype=np.float32)
        ground_truth = np.array(f["neighbors"], dtype=np.int64)[:, 0]
    print("Dataset loaded.")
    return train_data, test_queries, ground_truth

In [4]:
def evaluate_hnsw(train_data, test_queries, ground_truth, M=32, efSearch_vals=[10, 50, 100, 200]):
    d = train_data.shape[1]
    index = faiss.IndexHNSWFlat(d, M, faiss.METRIC_L2)
    index.hnsw.efConstruction = 200
    index.add(train_data)

    results = []
    for ef in efSearch_vals:
        index.hnsw.efSearch = ef
        start_time = time.time()
        _, indices = index.search(test_queries, 1)
        elapsed_time = time.time() - start_time

        recall = np.mean(indices[:, 0] == ground_truth)
        qps = len(test_queries) / elapsed_time
        results.append((ef, recall, qps))

    print("HNSW evaluated.")

    return results

In [5]:
def evaluate_scann(train_data, test_queries, ground_truth, num_neighbors=1, num_search_trees=30, num_leaves=2000, num_leaves_to_search_vals=[10, 50, 100, 200], quantize=True):
    train_data = train_data.astype(np.float32)
    test_queries = test_queries.astype(np.float32)

    results = []

    for num_leaves_to_search in num_leaves_to_search_vals:
        index = scann.scann_ops_pybind.builder(train_data, num_neighbors, "dot_product") \
        .tree(num_leaves=num_leaves, num_leaves_to_search=num_leaves_to_search).score_ah(2, anisotropic_quantization_threshold=0.2).build()

        start_time = time.time()
        _, indices = index.search(test_queries, num_neighbors)
        elapsed_time = time.time() - start_time

        recall = np.mean(indices[:, 0] == ground_truth)
        qps = len(test_queries) / elapsed_time

        results.append((num_leaves_to_search, recall, qps))

    print("ScaNN evaluated.")

    return results


In [6]:
def evaluate_lsh(train_data, test_queries, ground_truth, nbits_vals=[32, 64, 512, 768]):
    d = train_data.shape[1]
    results = []

    for nbits in nbits_vals:
        index = faiss.IndexLSH(d, nbits)
        index.train(train_data)
        index.add(train_data)

        start_time = time.time()
        _, indices = index.search(test_queries, 1)
        elapsed_time = time.time() - start_time

        recall = np.mean(indices[:, 0] == ground_truth)
        qps = len(test_queries) / elapsed_time
        results.append((nbits, recall, qps))

    print("LSH evaluated.")

    return results

In [7]:
def plot_results(hnsw_results, lsh_results):
    plt.figure(figsize=(8, 6))

    for ef, recall, qps in hnsw_results:
        plt.scatter(qps, recall, label=f'HNSW ef={ef}', marker='o')
    for nbits, recall, qps in lsh_results:
        plt.scatter(qps, recall, label=f'LSH nbits={nbits}', marker='x')

    plt.xlabel("Queries Per Second (QPS)")
    plt.ylabel("1-Recall@1")
    plt.title("HNSW vs LSH: QPS vs Recall")
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
download_sift1m()
train_data, test_queries, ground_truth = load_sift1m()
hnsw_results = evaluate_hnsw(train_data, test_queries, ground_truth)
scann_results = evaluate_scann(train_data, test_queries, ground_truth)
plot_results(hnsw_results, scann_results)

Downloading SIFT1M dataset...
Download complete.
Dataset loaded.


In [None]:
scann_results = evaluate_scann(train_data, test_queries, ground_truth)


ValueError: Query must be one-dimensional