<a href="https://colab.research.google.com/github/a6iyyu/MachineLearning_3F_23/blob/main/JS6/P3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Praktikum 3**

Instal `hnswlib` terlebih dahulu.

In [4]:
!pip install hnswlib



Percobaan berikut akan membandingkan exact NN dengan HNSW pada 1000 data 2D.

In [5]:
import hnswlib
import numpy as np
import time
from sklearn.neighbors import NearestNeighbors

# ===========================
# 1. Buat data 2D acak
# ===========================
num_elements = 1000
dim = 2
data = np.random.random((num_elements, dim)).astype(np.float32)

# Query point
query = np.array([[0.5, 0.5]], dtype=np.float32)
k = 5  # cari 5 tetangga terdekat

# ===========================
# 2. Exact NN (Brute Force)
# ===========================
nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
nn.fit(data)

start = time.time()
distances, indices = nn.kneighbors(query)
end = time.time()

print("=== Exact NN ===")
print("Indices:", indices)
print("Distances:", distances)
print("Waktu:", end - start, "detik")

# ===========================
# 3. HNSW
# ===========================
# Inisialisasi index HNSW
p = hnswlib.Index(space='l2', dim=dim)

# Ukuran maksimum elemen yang bisa ditampung
p.init_index(max_elements=num_elements, ef_construction=100, M=16)

# Tambahkan data
p.add_items(data)

# Set parameter pencarian
p.set_ef(50)   # tradeoff speed vs accuracy

start = time.time()
labels, distances = p.knn_query(query, k=k)
end = time.time()

print("\n=== HNSW ===")
print("Indices:", labels)
print("Distances:", distances)
print("Waktu:", end - start, "detik")

=== Exact NN ===
Indices: [[124 805  96 890 212]]
Distances: [[0.03437702 0.03632919 0.03853166 0.04725471 0.05015854]]
Waktu: 0.0016150474548339844 detik

=== HNSW ===
Indices: [[124 805  96 890 212]]
Distances: [[0.00118178 0.00131981 0.00148469 0.00223301 0.00251588]]
Waktu: 0.00021004676818847656 detik


Lakukan percobaan pada metric distance yang berbeda, 1000 vs 1jt data, 2D vs 5D data. catat hasilnya pada tabel yang anda buat sendiri seperti pada praktikum 1.

In [6]:
def run_experiment(num_elements, dim, metric_type, k=10):
    """Fungsi untuk menjalankan satu skenario eksperimen."""
    print(f"--- Menjalankan: {num_elements} data, {dim}D, Metrik {metric_type.upper()} ---")

    # 1. Buat data acak
    data = np.random.random((num_elements, dim)).astype(np.float32)
    query = np.random.random((1, dim)).astype(np.float32)

    # Scikit-learn butuh mapping metrik
    metric_map_sklearn = {'l2': 'euclidean', 'ip': 'cosine'}

    if metric_type == 'ip':
        # Normalisasi diperlukan agar Inner Product setara dengan Cosine Similarity
        # Ini adalah praktik standar.
        data_normalized = data / np.linalg.norm(data, axis=1, keepdims=True)
        query_normalized = query / np.linalg.norm(query, axis=1, keepdims=True)
    else:
        data_normalized = data
        query_normalized = query

    # 2. Exact NN (Brute Force)
    nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric=metric_map_sklearn[metric_type])
    nn.fit(data_normalized)

    start_time = time.time()
    distances_exact, indices_exact = nn.kneighbors(query_normalized)
    time_exact = time.time() - start_time

    # 3. HNSW
    # Inisialisasi index
    p = hnswlib.Index(space=metric_type, dim=dim)

    # Waktu Build Index
    start_build = time.time()
    p.init_index(max_elements=num_elements, ef_construction=200, M=16)
    p.add_items(data)
    time_build = time.time() - start_build

    # Waktu Query
    p.set_ef(50)  # Parameter pencarian, semakin tinggi semakin akurat tapi lambat
    start_query = time.time()
    indices_hnsw, distances_hnsw = p.knn_query(query, k=k)
    time_hnsw = time.time() - start_query

    # 4. Hitung metrik performa
    exact_set = set(indices_exact[0])
    hnsw_set = set(indices_hnsw[0])
    recall = len(exact_set.intersection(hnsw_set)) / k
    speedup = time_exact / time_hnsw if time_hnsw > 0 else float('inf')

    return {
        "Jumlah Data": num_elements,
        "Dimensi": dim,
        "Metrik Jarak": metric_type.upper(),
        "Waktu Build HNSW (s)": time_build,
        "Waktu Exact (ms)": time_exact * 1000,
        "Waktu HNSW (ms)": time_hnsw * 1000,
        "Speedup": speedup,
        "Recall@10": recall
    }

In [7]:
experiments = [
    {'num_elements': 1000, 'dim': 2, 'metric_type': 'l2'},
    {'num_elements': 1000, 'dim': 2, 'metric_type': 'ip'},
    {'num_elements': 1000, 'dim': 5, 'metric_type': 'l2'},
    {'num_elements': 1000, 'dim': 5, 'metric_type': 'ip'},
    {'num_elements': 1_000_000, 'dim': 2, 'metric_type': 'l2'},
    {'num_elements': 1_000_000, 'dim': 2, 'metric_type': 'ip'},
    {'num_elements': 1_000_000, 'dim': 5, 'metric_type': 'l2'},
    {'num_elements': 1_000_000, 'dim': 5, 'metric_type': 'ip'},
]

results = []
for params in experiments:
    results.append(run_experiment(**params))

--- Menjalankan: 1000 data, 2D, Metrik L2 ---
--- Menjalankan: 1000 data, 2D, Metrik IP ---
--- Menjalankan: 1000 data, 5D, Metrik L2 ---
--- Menjalankan: 1000 data, 5D, Metrik IP ---
--- Menjalankan: 1000000 data, 2D, Metrik L2 ---
--- Menjalankan: 1000000 data, 2D, Metrik IP ---
--- Menjalankan: 1000000 data, 5D, Metrik L2 ---
--- Menjalankan: 1000000 data, 5D, Metrik IP ---


In [8]:
# Tampilkan hasil dalam tabel yang rapi
import pandas as pd
df = pd.DataFrame(results)
print("\n--- 📊 HASIL EKSPERIMEN ---")
# Mengatur format tampilan float
pd.options.display.float_format = '{:,.4f}'.format
print(df.to_string())


--- 📊 HASIL EKSPERIMEN ---
   Jumlah Data  Dimensi Metrik Jarak  Waktu Build HNSW (s)  Waktu Exact (ms)  Waktu HNSW (ms)  Speedup  Recall@10
0         1000        2           L2                0.0485            0.9229           0.0598  15.4223     1.0000
1         1000        2           IP                0.0353           14.5750           0.0563 259.0339     0.0000
2         1000        5           L2                0.0568            0.5891           0.0904   6.5198     1.0000
3         1000        5           IP                0.0579            1.4615           0.0596  24.5200     0.0000
4      1000000        2           L2              121.0759           47.3881           0.0877 540.1087     1.0000
5      1000000        2           IP               76.6119           34.2424           0.0522 655.8128     0.0000
6      1000000        5           L2              180.9975           35.5079           0.1228 289.1864     1.0000
7      1000000        5           IP              127.0769  