<a href="https://colab.research.google.com/github/a6iyyu/MachineLearning_3F_23/blob/main/JS6/P4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Praktikum 4**

Pada percobaan kali ini, kita akan melihat perbedaan ketiga model yang telah kita bahas dan membandingkan hasilnya.

In [17]:
!pip install annoy faiss-cpu hnswlib



In [18]:
# Import setelah instalasi selesai
import numpy as np
import pandas as pd
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

In [19]:
# ===============================
# 1. Build a dataset of 1 million 5-D points
# ===============================
# Fixed seed so that every Restart & Run All benchmarks the same data and query.
SEED = 42
rng = np.random.default_rng(SEED)

n_data = 1_000_000
dim = 5
print(f"Membuat dataset dengan {n_data} titik data berdimensi {dim}...\n")
# Draw float32 directly instead of creating a float64 array and casting it.
X = rng.random((n_data, dim), dtype=np.float32)

# Query point: a single row, kept 2-D (shape (1, dim)).
query = rng.random((1, dim), dtype=np.float32)
# Number of nearest neighbours requested by the queries in the following cells.
k = 10

Membuat dataset dengan 1000000 titik data berdimensi 5...



In [20]:
# ===============================
# 2. Annoy
# ===============================
print("=== Annoy ===")
ann_index = AnnoyIndex(dim, 'euclidean')

# Build phase: the timed region covers both the per-item inserts and build().
# perf_counter() is used instead of time.time() because it is monotonic and
# high-resolution, which matters for the sub-millisecond query timing below.
start = time.perf_counter()
for i in range(n_data):
    ann_index.add_item(i, X[i])
ann_index.build(10)  # 10 trees
build_time = time.perf_counter() - start

# Query phase: with include_distances=True the result is a pair (ids, distances).
start = time.perf_counter()
neighbors = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.perf_counter() - start

print(f"Build time: {build_time:.4f} detik")
print(f"Query time: {query_time*1000:.4f} ms")
# neighbors[0] holds the ids; only the first five are shown.
print("Neighbors:", neighbors[0][:5], "...")

=== Annoy ===
Build time: 21.3802 detik
Query time: 0.3221 ms
Neighbors: [191144, 640339, 985012, 807627, 592975] ...


In [21]:
# ===============================
# 3. FAISS (IndexFlatL2 - Brute Force)
# ===============================
print("\n=== FAISS (IndexFlatL2) ===")
faiss_index = faiss.IndexFlatL2(dim)

# Build phase: only the add() call is timed.
# perf_counter() is used instead of time.time() because it is monotonic and
# high-resolution, so short intervals are measured reliably.
start = time.perf_counter()
faiss_index.add(X)
build_time = time.perf_counter() - start

# Query phase: search() takes the 2-D query and returns (distances, indices).
start = time.perf_counter()
distances, indices = faiss_index.search(query, k)
query_time = time.perf_counter() - start

print(f"Build time: {build_time:.4f} detik")
print(f"Query time: {query_time*1000:.4f} ms")
# indices[0] is the result row for the single query; only the first five are shown.
print("Neighbors:", indices[0][:5], "...")


=== FAISS (IndexFlatL2) ===
Build time: 0.0164 detik
Query time: 14.8003 ms
Neighbors: [191144 640339 169804 985012 807627] ...


In [22]:
# ===============================
# 4. HNSW (hnswlib)
# ===============================
print("\n=== HNSW (hnswlib) ===")
hnsw_index = hnswlib.Index(space='l2', dim=dim)

# Build phase: the timed region covers index initialisation and the inserts.
# perf_counter() is used instead of time.time() because it is monotonic and
# high-resolution, which matters for the sub-millisecond query timing below.
start = time.perf_counter()
hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index.add_items(X)
build_time = time.perf_counter() - start

# Query-time ef is configured outside both timed regions.
hnsw_index.set_ef(50)

# Query phase: knn_query() returns (labels, distances).
start = time.perf_counter()
labels, distances = hnsw_index.knn_query(query, k=k)
query_time = time.perf_counter() - start

print(f"Build time: {build_time:.4f} detik")
print(f"Query time: {query_time*1000:.4f} ms")
# labels[0] is the result row for the single query; only the first five are shown.
print("Neighbors:", labels[0][:5], "...")


=== HNSW (hnswlib) ===
Build time: 197.0543 detik
Query time: 0.2759 ms
Neighbors: [191144 640339 169804 985012 807627] ...


Lakukan percobaan dengan metrik jarak (distance metric) yang berbeda. Catat hasilnya pada tabel yang Anda buat sendiri seperti pada Praktikum 1.

In [23]:
def run_experiment(metric_type, n_data=1_000_000, dim=5, k=10, seed=None):
    """
    Benchmark FAISS (exact), Annoy and HNSW on one random dataset for one metric.

    A fresh uniform random dataset and a single query point are generated, then
    each library builds an index over the data and answers the same k-NN query.
    The FAISS flat index result is used as ground truth for the recall of the
    other two libraries.

    Parameters
    ----------
    metric_type : str
        'l2' for Euclidean distance or 'ip' for inner product. For 'ip' the
        data and the query are L2-normalised first, and Annoy uses its
        'angular' metric.
    n_data : int, optional
        Number of data points to generate (default 1_000_000).
    dim : int, optional
        Dimensionality of each point (default 5).
    k : int, optional
        Number of neighbours to retrieve (default 10).
    seed : int or None, optional
        When given, data is drawn from a private ``RandomState(seed)`` so the
        run is repeatable. When None (default) the global NumPy random state
        is used.

    Returns
    -------
    list of dict
        One record per library, in the order FAISS, Annoy, HNSW, with keys
        "Library", "Metrik", "Waktu Build (s)", "Waktu Query (ms)" and
        "Recall@<k>".

    Raises
    ------
    ValueError
        If ``metric_type`` is not 'l2' or 'ip', or if ``k`` is not in the
        range 1..n_data.
    """
    # Fail fast: any other value would be treated as L2 by the FAISS/Annoy
    # branches below while being passed verbatim to hnswlib, so recall would be
    # measured against a ground truth computed with a different metric.
    if metric_type not in ('l2', 'ip'):
        raise ValueError(
            f"Unsupported metric_type {metric_type!r}; expected 'l2' or 'ip'."
        )
    if not 0 < k <= n_data:
        raise ValueError(
            f"k must satisfy 1 <= k <= n_data, got k={k}, n_data={n_data}."
        )

    use_inner_product = metric_type == 'ip'
    metric_label = metric_type.upper()
    recall_key = f"Recall@{k}"

    # ===============================
    # 1. Configuration & data
    # ===============================
    # np.random.random is an alias of random_sample, so the unseeded path draws
    # from the global state exactly as before.
    rng = np.random if seed is None else np.random.RandomState(seed)

    print(f"--- MEMULAI EKSPERIMEN: METRIK {metric_label} ---")
    print(f"Membuat dataset dengan {n_data} data, {dim} dimensi...")
    X = rng.random_sample((n_data, dim)).astype(np.float32)
    query = rng.random_sample((1, dim)).astype(np.float32)

    # Normalise for inner product so that it is equivalent to cosine similarity.
    if use_inner_product:
        faiss.normalize_L2(X)
        faiss.normalize_L2(query)
        annoy_metric = 'angular'  # Annoy's name for the cosine-style metric
    else:
        annoy_metric = 'euclidean'

    results = []

    def record(library, build_seconds, query_seconds, recall):
        """Append one result row; query time is stored in milliseconds."""
        results.append({
            "Library": library, "Metrik": metric_label,
            "Waktu Build (s)": build_seconds,
            "Waktu Query (ms)": query_seconds * 1000,
            recall_key: recall,
        })

    # ===============================
    # 2. FAISS (ground truth - exact search)
    # ===============================
    print("Menjalankan FAISS (Exact)...")
    index_exact = faiss.IndexFlatIP(dim) if use_inner_product else faiss.IndexFlatL2(dim)

    # perf_counter() is monotonic and high-resolution, unlike time.time().
    start = time.perf_counter()
    index_exact.add(X)
    build_time = time.perf_counter() - start

    start = time.perf_counter()
    _, indices_exact = index_exact.search(query, k)
    query_time = time.perf_counter() - start

    # Plain ints so membership tests do not depend on each library's id dtype.
    ground_truth = {int(idx) for idx in indices_exact[0]}
    record("FAISS (Exact)", build_time, query_time, 1.0)  # ground truth by definition

    # ===============================
    # 3. Annoy
    # ===============================
    print("Menjalankan Annoy...")
    ann_index = AnnoyIndex(dim, annoy_metric)
    start = time.perf_counter()
    for i in range(n_data):
        ann_index.add_item(i, X[i])
    ann_index.build(10)  # 10 trees
    build_time = time.perf_counter() - start

    start = time.perf_counter()
    indices_ann, _ = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
    query_time = time.perf_counter() - start
    recall_ann = len(ground_truth & {int(idx) for idx in indices_ann}) / k
    record("Annoy", build_time, query_time, recall_ann)

    # ===============================
    # 4. HNSW (hnswlib)
    # ===============================
    print("Menjalankan HNSW...")
    hnsw_index = hnswlib.Index(space=metric_type, dim=dim)
    start = time.perf_counter()
    hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
    hnsw_index.add_items(X)
    build_time = time.perf_counter() - start

    hnsw_index.set_ef(50)
    start = time.perf_counter()
    indices_hnsw, _ = hnsw_index.knn_query(query, k=k)
    query_time = time.perf_counter() - start
    # knn_query returns one row per query; take row 0 before building the set.
    recall_hnsw = len(ground_truth & {int(idx) for idx in indices_hnsw[0]}) / k
    record("HNSW", build_time, query_time, recall_hnsw)

    print(f"--- EKSPERIMEN {metric_label} SELESAI ---\n")
    return results

In [24]:
# Run the experiment once per metric (L2 first, then IP) and flatten the
# per-library records into a single list.
all_results = [
    record
    for metric in ('l2', 'ip')
    for record in run_experiment(metric)
]

# Render every float with four decimals and a thousands separator, then print
# the combined results as a plain-text table.
pd.set_option('display.float_format', '{:,.4f}'.format)
df = pd.DataFrame(all_results)
print("--- HASIL AKHIR PERBANDINGAN ---")
print(df.to_string())

--- MEMULAI EKSPERIMEN: METRIK L2 ---
Membuat dataset dengan 1000000 data, 5 dimensi...
Menjalankan FAISS (Exact)...
Menjalankan Annoy...
Menjalankan HNSW...
--- EKSPERIMEN L2 SELESAI ---

--- MEMULAI EKSPERIMEN: METRIK IP ---
Membuat dataset dengan 1000000 data, 5 dimensi...
Menjalankan FAISS (Exact)...
Menjalankan Annoy...
Menjalankan HNSW...
--- EKSPERIMEN IP SELESAI ---

--- HASIL AKHIR PERBANDINGAN ---
         Library Metrik  Waktu Build (s)  Waktu Query (ms)  Recall@10
0  FAISS (Exact)     L2           0.0176            6.6075     1.0000
1          Annoy     L2          22.6033            0.1636     1.0000
2           HNSW     L2         215.6296            0.1335     1.0000
3  FAISS (Exact)     IP           0.0176            7.2770     1.0000
4          Annoy     IP          37.0943            0.1454     1.0000
5           HNSW     IP         209.8552            0.1717     1.0000
