In [32]:
import numpy as np
from sklearn.neighbors import KDTree

def cosine_similarity(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    The value is ``dot(a, b) / (||a|| * ||b||)``. Zero-norm inputs are not
    guarded against.
    """
    magnitude_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / magnitude_product

def get_top10_nearest(vector_input, vector_compare, num_neighbors=10):
    """Find the rows of ``vector_compare`` most cosine-similar to ``vector_input``.

    Both the query and the candidate rows are scaled to unit length and a
    KDTree (Euclidean distance) is queried on the scaled rows. For unit
    vectors ``||u - v||^2 = 2 - 2 * cos(u, v)``, so the nearest Euclidean
    neighbours are exactly the rows with the highest cosine similarity.

    Args:
        vector_input: 1-D query vector with one entry per column of
            ``vector_compare``.
        vector_compare: 2-D array-like with one candidate vector per row.
        num_neighbors: Maximum number of neighbours to return (default 10).
            When ``vector_compare`` has fewer rows, all rows are returned
            instead of letting the KDTree query fail.

    Returns:
        A tuple ``(similarities, indices)``: ``similarities`` is a list of
        cosine similarities ordered from most to least similar, and
        ``indices`` is the array of matching row positions in
        ``vector_compare``.

    Raises:
        ValueError: If ``num_neighbors`` is negative.

    Note:
        Zero-norm rows or a zero-norm query are not handled; they lead to a
        division by zero during normalization.
    """
    if num_neighbors < 0:
        raise ValueError("num_neighbors must be non-negative")

    vector_compare = np.asarray(vector_compare, dtype=float)
    vector_input = np.asarray(vector_input, dtype=float)

    # KDTree.query rejects k larger than the number of indexed points, so cap it.
    k = min(num_neighbors, vector_compare.shape[0])
    if k == 0:
        return [], np.empty(0, dtype=np.int64)

    # Scale every candidate row and the query to unit length.
    row_norms = np.linalg.norm(vector_compare, axis=1, keepdims=True)
    normalized_compare = vector_compare / row_norms
    normalized_input = vector_input / np.linalg.norm(vector_input)

    # The tree is rebuilt on every call, so its construction cost is part of each query.
    kdtree = KDTree(normalized_compare)
    _, neighbor_indices = kdtree.query([normalized_input], k=k, return_distance=True)
    nearest = neighbor_indices[0]

    # On unit vectors the dot product is the cosine similarity; compute all k at once.
    similarities = list(np.dot(normalized_compare[nearest], normalized_input))

    return similarities, nearest

In [23]:
import faiss
import numpy as np
import pandas as pd

def cosine_similarity(a, b):
    """Cosine similarity of two vectors: ``dot(a, b) / (||a|| * ||b||)``.

    NOTE: this notebook defines a function with the same name and formula in
    the KDTree cell as well; whichever cell ran last is the one in effect.
    """
    lengths = [np.linalg.norm(vec) for vec in (a, b)]
    return np.dot(a, b) / (lengths[0] * lengths[1])


def get_closest(vector_input: np.ndarray, df: pd.DataFrame, num_neighbors: int = 10):
    """Find the rows of ``df`` most cosine-similar to ``vector_input`` using Faiss.

    The query and every row of ``df`` are scaled to unit length and searched
    with an exact inner-product index (``faiss.IndexFlatIP``); on unit vectors
    the inner product equals the cosine similarity.

    Args:
        vector_input: 1-D query vector with one entry per column of ``df``.
        df: DataFrame whose rows are the candidate vectors.
        num_neighbors: Maximum number of neighbours to return (default 10).
            It is capped at the number of rows in ``df``: Faiss pads missing
            results with label ``-1``, which would otherwise be used as a
            NumPy index and silently select the last row.

    Returns:
        A tuple ``(similarities, indices)``: ``similarities`` is a list of
        cosine similarities (recomputed in float64) in the order returned by
        Faiss, and ``indices`` is the array of matching row positions in ``df``.

    Raises:
        ValueError: If ``num_neighbors`` is negative.

    Note:
        Zero-norm rows or a zero-norm query are not handled; they lead to a
        division by zero during normalization.
    """
    if num_neighbors < 0:
        raise ValueError("num_neighbors must be non-negative")

    # Materialize the frame once instead of calling ``df.values`` repeatedly.
    compare = np.asarray(df.values, dtype=np.float64)
    query = np.asarray(vector_input, dtype=np.float64)

    k = min(num_neighbors, compare.shape[0])
    if k == 0:
        return [], np.empty(0, dtype=np.int64)

    # Scale the query and every candidate row to unit length.
    normalized_input = query / np.linalg.norm(query)
    normalized_compare = compare / np.linalg.norm(compare, axis=1, keepdims=True)

    # Faiss operates on C-contiguous float32 matrices; convert explicitly
    # rather than relying on the Python wrapper to do it.
    index = faiss.IndexFlatIP(compare.shape[1])
    index.add(np.ascontiguousarray(normalized_compare, dtype=np.float32))

    _, indices = index.search(
        np.ascontiguousarray(normalized_input[np.newaxis, :], dtype=np.float32),
        k=k,
    )
    nearest = indices[0]

    # Recompute the similarities in float64 for the selected rows in one shot.
    similarities = list(np.dot(normalized_compare[nearest], normalized_input))

    return similarities, nearest

In [24]:
# Load the preprocessed CSV, remove the columns that are not used as
# similarity features, and discard rows with missing values.
data = pd.read_csv('./processed_data.csv')
df = (
    data
    .drop(
        columns=[
            "ID",
            "artist_name",
            "track_name",
            "audio_feats_key",
            "audio_feats_mode",
            "audio_feats_time_signature",
            "label",
        ]
    )
    .dropna()
)

In [25]:
# Shape of the NumPy array behind the frame: (rows, columns).
df.values.shape

(2149378, 10)

In [26]:
# Shape of the DataFrame itself: (rows, columns).
df.shape

(2149378, 10)

In [27]:
# Random integer query vector with one component per column of ``df``,
# drawn from [-10, 10) (the upper bound is exclusive).
# NOTE: no seed is set, so re-running this cell produces a different vector.
input_vector = np.random.randint(low=-10, high=10, size=df.shape[1])
input_vector

array([-9,  9, -3, -1, -1,  0,  6,  8, -4,  6])

In [28]:
# Top-10 most cosine-similar rows found with the Faiss-based search.
get_closest(input_vector, df)

([0.34218933381745636,
  0.3376605064460142,
  0.33705913130868465,
  0.3358309466584777,
  0.33527855190250794,
  0.33527855190250794,
  0.33527855190250794,
  0.33527855190250794,
  0.33527855190250794,
  0.33523866689620824],
 array([1573409, 1051996, 1280782, 2031104,  542759,  413075,  257965,
         257964,  108135, 1066246], dtype=int64))

In [33]:
# Same query through the KDTree-based search, for comparison with the Faiss result.
get_top10_nearest(input_vector, df.values)

([0.34218933381745636,
  0.3376605064460142,
  0.33705913130868465,
  0.3358309466584777,
  0.33527855190250794,
  0.33527855190250794,
  0.33527855190250794,
  0.33527855190250794,
  0.33527855190250794,
  0.33523866689620824],
 array([1573409, 1051996, 1280782, 2031104,  257964,  542759,  108135,
         257965,  413075, 1066246], dtype=int64))

In [30]:
# Brute-force reference: cosine similarity between the query and every row of
# ``df``, computed in a single vectorized pass instead of a Python loop over
# all rows (the query norm is evaluated once rather than once per row).
compare_matrix = df.values
all_similarities = np.dot(compare_matrix, input_vector) / (
    np.linalg.norm(compare_matrix, axis=1) * np.linalg.norm(input_vector)
)

# Keep the ten largest similarities, in descending order.
result = sorted(all_similarities.tolist(), reverse=True)[:10]

In [31]:
# Show the ten highest brute-force cosine similarities.
result

[0.3421893338174563,
 0.3376605064460141,
 0.3370591313086846,
 0.3358309466584777,
 0.33527855190250794,
 0.33527855190250794,
 0.33527855190250794,
 0.33527855190250794,
 0.33527855190250794,
 0.33523866689620824]

In [35]:
# Time the KDTree approach; every call rebuilds the tree, so construction cost is included.
%timeit get_top10_nearest(input_vector, df.values)

19.2 s ± 3.51 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [36]:
# Time the Faiss approach; every call rebuilds the index, so construction cost is included.
%timeit get_closest(input_vector, df)

670 ms ± 37.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


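Kết quả đo thời gian trên 2.149.378 vector 10 chiều (mỗi lần gọi bao gồm cả bước chuẩn hóa dữ liệu và xây dựng chỉ mục):

- Sử dụng KDTree mất khoảng 19.2 s cho mỗi lần gọi.
- Sử dụng Faiss mất khoảng 0.67 s cho mỗi lần gọi, tức là nhanh hơn khoảng 29 lần.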