In [3]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KDTree
from sklearn.model_selection import train_test_split
from pathlib import Path
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import pickle
import faiss
from scipy.spatial.distance import cdist

## Tải các mô hình đã huấn luyện sẵn (pre-trained)

In [4]:
# The pre-trained artifacts are read from ./models relative to the kernel's working directory.
cwd = Path.cwd()
model_path = cwd / "models"


def _load_pickle(file_name):
    """Unpickle ``model_path / file_name`` and return the stored object.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserializing,
    so only point this at model files that come from a trusted source.
    """
    with open(model_path / file_name, "rb") as f:
        return pickle.load(f)


pca: PCA = _load_pickle("pca.pickle")
scaler: StandardScaler = _load_pickle("scaler.pickle")
k_mean: KMeans = _load_pickle("kmean.pickle")

## Các cách tính toán độ giống nhau

- FAISS 

In [5]:
def get_closest(vector_input, df, num_neighbors=10):
    """Return the rows of ``df`` most cosine-similar to ``vector_input`` using FAISS.

    Both the query and the candidate rows are L2-normalized, so the inner
    product computed by ``faiss.IndexFlatIP`` equals the cosine similarity.

    Parameters
    ----------
    vector_input : array-like, shape (n_features,)
        Query vector.
    df : array-like, shape (n_rows, n_features)
        Candidate vectors (NumPy array or DataFrame of numeric columns).
    num_neighbors : int, default 10
        Maximum number of neighbours to return. It is clamped to ``n_rows`` so
        the result never contains FAISS's ``-1`` padding indices.

    Returns
    -------
    distances : ndarray, shape (1, k)
        Cosine similarities, best match first.
    indices : ndarray, shape (k,)
        Row positions in ``df`` of the matches.
    """
    query = np.asarray(vector_input, dtype=np.float64).ravel()
    candidates = np.asarray(df, dtype=np.float64)

    k = min(num_neighbors, candidates.shape[0])
    if k <= 0:
        # Nothing to search; FAISS rejects k == 0, so return empty results directly.
        return np.empty((1, 0), dtype=np.float32), np.empty((0,), dtype=np.int64)

    # Normalize every vector to unit length; zero vectors are left as zeros
    # (norm replaced by 1) instead of turning into NaN.
    query_norm = np.linalg.norm(query)
    if query_norm == 0:
        query_norm = 1.0
    row_norms = np.linalg.norm(candidates, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1.0

    # FAISS operates on C-contiguous float32 matrices, so convert explicitly
    # rather than relying on the wrapper to coerce float64 / DataFrame inputs.
    normalized_input = np.ascontiguousarray((query / query_norm)[np.newaxis, :], dtype=np.float32)
    normalized_compare = np.ascontiguousarray(candidates / row_norms, dtype=np.float32)

    # Build an exact inner-product index over the candidates.
    index = faiss.IndexFlatIP(normalized_compare.shape[1])
    index.add(normalized_compare)

    # Search for the k nearest neighbours of the single query row.
    distances, indices = index.search(normalized_input, k)

    return distances, indices[0]

- KDTree

In [6]:
def get_top10_neighborest(vector_input, vector_compare, num_neighbors=10):
    """Find the rows of ``vector_compare`` nearest to ``vector_input`` with a KD-tree.

    The tree is built with ``KDTree``'s default metric, so the returned values
    are distances (smaller is closer), not cosine similarities.

    Parameters
    ----------
    vector_input : array-like, shape (n_features,)
        Query vector.
    vector_compare : array-like, shape (n_rows, n_features)
        Candidate vectors.
    num_neighbors : int, default 10
        Maximum number of neighbours to return. It is clamped to ``n_rows``
        because ``KDTree.query`` raises when asked for more neighbours than
        there are points.

    Returns
    -------
    distances : ndarray, shape (1, k)
        Distances to the neighbours, nearest first.
    indices : ndarray, shape (1, k)
        Row positions in ``vector_compare`` of the neighbours.
    """
    kdtree = KDTree(vector_compare)
    k = min(num_neighbors, len(vector_compare))

    # The query is wrapped in a list because KDTree.query expects a 2-D batch.
    distances, indices = kdtree.query([vector_input], k=k, return_distance=True)

    return distances, indices

- Công thức cosine sim bình thường

In [7]:
def cosine_similarity(a, b):
    """Cosine similarity between every row of ``a`` and the single vector ``b``.

    ``a`` is a 2-D array of shape (n_rows, n_features) and ``b`` a 1-D array of
    shape (n_features,); the result is a 1-D array with one similarity per row.
    """
    row_lengths = np.linalg.norm(a, axis=1)
    query_length = np.linalg.norm(b)
    # Dot product of each row with b, divided by the product of the two lengths.
    return np.dot(a, b) / (row_lengths * query_length)

## Dữ liệu

### Đọc dữ liệu

In [8]:
# Load the processed track table; .iloc[:, 1:] drops the first CSV column
# (presumably a saved row index -- confirm against the file).
df = pd.read_csv("./processed_data.csv").iloc[:, 1:]
df

Unnamed: 0,ID,artist_name,track_name,audio_feats_danceability,audio_feats_energy,audio_feats_key,audio_feats_loudness,audio_feats_mode,audio_feats_speechiness,audio_feats_acousticness,audio_feats_instrumentalness,audio_feats_liveness,audio_feats_valence,audio_feats_tempo,audio_feats_duration_ms,audio_feats_time_signature,label
0,0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0.9040,0.8130,4.0,-7.105,0.0,0.1210,0.03110,0.006970,0.0471,0.8100,125.461,226864.0,4.0,30
1,6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0.7740,0.8380,5.0,-3.914,0.0,0.1140,0.02490,0.025000,0.2420,0.9240,143.040,198800.0,4.0,24
2,0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0.6640,0.7590,2.0,-6.583,0.0,0.2090,0.00238,0.000000,0.0598,0.7010,99.252,235933.0,4.0,24
3,1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0.8920,0.7140,4.0,-6.055,0.0,0.1410,0.20100,0.000234,0.0521,0.8170,100.972,267267.0,4.0,30
4,1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0.8530,0.6060,0.0,-4.596,1.0,0.0713,0.05610,0.000000,0.3130,0.6540,94.759,227600.0,4.0,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2149373,6cVHCeUEJOz8aW6pgbrSXT,"Father Kari, the Gameboy","Trial II. Welcome to Taimagakure, the Wav Village",0.8220,0.6230,10.0,-8.357,1.0,0.3510,0.26900,0.000000,0.1070,0.5730,109.964,280425.0,4.0,49
2149374,2YpdaJQSPHM1MsMITng93V,Dean Evenson,Let Go,0.0712,0.0310,5.0,-26.889,1.0,0.0531,0.99400,0.982000,0.0995,0.0297,76.160,398707.0,5.0,25
2149375,7s9RepEmLkr0xx2KeETO9e,Savasana,Ambient Music for Relaxation,0.2040,0.1480,6.0,-22.488,1.0,0.0437,0.97600,0.940000,0.1210,0.0733,89.825,144292.0,3.0,7
2149376,7DneV7iDFHmuc8wt8sDqzG,Savasana,R-E-M Phase,0.2290,0.0921,9.0,-21.673,1.0,0.0453,0.95900,0.957000,0.1050,0.0351,62.058,203923.0,4.0,25


### Tiền xử lý

- Lấy ra các trường liên tục

In [9]:
# Keep only the audio-feature columns: skip the first three columns and the trailing label column.
X = df.iloc[:, 3:-1]
X

Unnamed: 0,audio_feats_danceability,audio_feats_energy,audio_feats_key,audio_feats_loudness,audio_feats_mode,audio_feats_speechiness,audio_feats_acousticness,audio_feats_instrumentalness,audio_feats_liveness,audio_feats_valence,audio_feats_tempo,audio_feats_duration_ms,audio_feats_time_signature
0,0.9040,0.8130,4.0,-7.105,0.0,0.1210,0.03110,0.006970,0.0471,0.8100,125.461,226864.0,4.0
1,0.7740,0.8380,5.0,-3.914,0.0,0.1140,0.02490,0.025000,0.2420,0.9240,143.040,198800.0,4.0
2,0.6640,0.7590,2.0,-6.583,0.0,0.2090,0.00238,0.000000,0.0598,0.7010,99.252,235933.0,4.0
3,0.8920,0.7140,4.0,-6.055,0.0,0.1410,0.20100,0.000234,0.0521,0.8170,100.972,267267.0,4.0
4,0.8530,0.6060,0.0,-4.596,1.0,0.0713,0.05610,0.000000,0.3130,0.6540,94.759,227600.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2149373,0.8220,0.6230,10.0,-8.357,1.0,0.3510,0.26900,0.000000,0.1070,0.5730,109.964,280425.0,4.0
2149374,0.0712,0.0310,5.0,-26.889,1.0,0.0531,0.99400,0.982000,0.0995,0.0297,76.160,398707.0,5.0
2149375,0.2040,0.1480,6.0,-22.488,1.0,0.0437,0.97600,0.940000,0.1210,0.0733,89.825,144292.0,3.0
2149376,0.2290,0.0921,9.0,-21.673,1.0,0.0453,0.95900,0.957000,0.1050,0.0351,62.058,203923.0,4.0


- Lấy ra các labels

In [10]:
# The last column holds the label of each track.
Y = df.iloc[:, -1]
Y

0          30
1          24
2          24
3          30
4          24
           ..
2149373    49
2149374    25
2149375     7
2149376    25
2149377    44
Name: label, Length: 2149378, dtype: int64

- Drop một số trường phân loại đi

In [11]:
# Remove the categorical features (key, mode, time signature) so only continuous ones remain.
continuous_data = X.drop(columns=["audio_feats_key", "audio_feats_mode", "audio_feats_time_signature"])
continuous_data

Unnamed: 0,audio_feats_danceability,audio_feats_energy,audio_feats_loudness,audio_feats_speechiness,audio_feats_acousticness,audio_feats_instrumentalness,audio_feats_liveness,audio_feats_valence,audio_feats_tempo,audio_feats_duration_ms
0,0.9040,0.8130,-7.105,0.1210,0.03110,0.006970,0.0471,0.8100,125.461,226864.0
1,0.7740,0.8380,-3.914,0.1140,0.02490,0.025000,0.2420,0.9240,143.040,198800.0
2,0.6640,0.7590,-6.583,0.2090,0.00238,0.000000,0.0598,0.7010,99.252,235933.0
3,0.8920,0.7140,-6.055,0.1410,0.20100,0.000234,0.0521,0.8170,100.972,267267.0
4,0.8530,0.6060,-4.596,0.0713,0.05610,0.000000,0.3130,0.6540,94.759,227600.0
...,...,...,...,...,...,...,...,...,...,...
2149373,0.8220,0.6230,-8.357,0.3510,0.26900,0.000000,0.1070,0.5730,109.964,280425.0
2149374,0.0712,0.0310,-26.889,0.0531,0.99400,0.982000,0.0995,0.0297,76.160,398707.0
2149375,0.2040,0.1480,-22.488,0.0437,0.97600,0.940000,0.1210,0.0733,89.825,144292.0
2149376,0.2290,0.0921,-21.673,0.0453,0.95900,0.957000,0.1050,0.0351,62.058,203923.0


In [12]:
# Show the remaining feature names; their order is the order expected for input vectors below.
continuous_data.columns

Index(['audio_feats_danceability', 'audio_feats_energy',
       'audio_feats_loudness', 'audio_feats_speechiness',
       'audio_feats_acousticness', 'audio_feats_instrumentalness',
       'audio_feats_liveness', 'audio_feats_valence', 'audio_feats_tempo',
       'audio_feats_duration_ms'],
      dtype='object')

- Chuẩn hóa dữ liệu bằng StandardScaler (trung bình 0, phương sai 1) rồi áp dụng PCA để giảm còn 3 chiều (giúp việc trực quan hóa, phân cụm và tính toán dễ dàng, nhanh hơn)

In [13]:
# Apply the pre-fitted scaler, then the pre-fitted PCA projection, to every track.
processed_train = pca.transform(scaler.transform(continuous_data))
processed_train

array([[-2.19658248e+00, -1.29361192e+00, -8.87683980e-01],
       [-2.63215089e+00, -6.70602015e-01, -3.01267420e-01],
       [-1.59941039e+00, -8.51452029e-01,  1.53082773e-02],
       ...,
       [ 4.54106112e+00,  1.32775440e-01,  9.52485594e-03],
       [ 4.75713131e+00,  6.57297456e-04,  7.25920036e-02],
       [ 5.36735658e+00,  1.08679701e+00,  3.64164611e-03]])

## Test đầu vào

- Chuẩn bị dữ liệu input

In [14]:
# Raw feature values of the query track (ten values, one per column of `continuous_data`).
# Alternative query kept for reference:
# input_vector = np.array([0.601, 0.789, -6.822, 0.0328, 0.0395, 0.812, 0.114, 0.315, 137.06, 205926.0])
input_vector= [ 6.89000e-01,  2.68000e-01, -1.57220e+01,  5.05000e-01,9.23000e-01,  1.69000e-06,  3.10000e-01,  5.69000e-01,1.21202e+02,  9.10630e+04]
input_id = "4S0zagxseJHOL5ZVnTQA6X"
input_id

'4S0zagxseJHOL5ZVnTQA6X'

- Chuẩn hóa dữ liệu input bằng StandardScaler (trung bình 0, phương sai 1) và áp dụng PCA để giảm về 3 chiều

In [15]:
# Push the single query through the same scaler + PCA pipeline; flatten() turns the (1, n) result into a 1-D vector.
temp = pca.transform(scaler.transform([input_vector])).flatten()
temp



array([ 0.71516298, -2.53901098,  2.94671557])

- Sử dụng model k mean dự đoán label của bài input

In [16]:
# Predict the K-means cluster of the query; the result is a one-element array.
pred = k_mean.predict([temp])
pred

array([20])

- Để giảm thiểu thời gian chạy của thuật toán cosine similarity thì việc clustering là rất cần thiết <br/>
=> Lấy ra tất cả các bài có cùng label với bài input

In [17]:
# Boolean mask over all tracks: True where the stored label equals the predicted cluster.
same_cluster = Y.to_numpy() == pred
# Projected feature vectors and track IDs of the tracks in that cluster (same row order).
group = processed_train[same_cluster]
group_ID = df["ID"].to_numpy()[same_cluster]
group_ID

array(['0hkxAENNrJ18AzHqync5n3', '3eze1OsZ1rqeXkKStNfTmi',
       '5tsI3xxDHDgzHhn30LTQNz', ..., '2Ttgv5kpXX4dKvtyhKXNbv',
       '61OLOsYMI9PuUftr0muz28', '18o37lczunBF67BRPMBO64'], dtype=object)

## So sánh kết quả

In [18]:
# Run the three lookups against the query's cluster.
# KD-tree neighbours:
dist, indx = get_top10_neighborest(temp, group)
# FAISS inner-product search on normalized vectors:
dist2, indx2 = get_closest(temp, group)
# SciPy cosine *distance* (smaller means more similar) to every row of the cluster:
dist3 = cdist([temp], group, metric="cosine")
# Row positions ordered from most to least similar.
indx3 = np.argsort(dist3).flatten()


### Kết quả sử dụng công thức cosine similarity bình thường

- Kết quả chạy

In [19]:
# Sorted cosine distances together with the corresponding row ordering.
np.sort(dist3), indx3

(array([[1.11022302e-16, 4.00426661e-06, 1.06983968e-05, ...,
         2.12818119e-01, 2.17459897e-01, 2.19018252e-01]]),
 array([   0, 5296, 4451, ..., 4481, 4800,  258], dtype=int64))

- Thời gian chạy khi không sử dụng K-means để phân cụm

In [20]:
# Baseline: cosine distance against the whole dataset (no cluster filtering).
%timeit cdist([temp], processed_train, metric="cosine")

25.7 ms ± 865 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


- Thời gian chạy sử dụng K mean để phân cụm

In [21]:
# Same computation restricted to the query's cluster.
%timeit cdist([temp], group, metric="cosine")

87.7 µs ± 1.06 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


### Kết quả sử dụng FAISS

- Kết quả chạy

In [22]:
# FAISS similarities (highest first) and the matching row positions inside `group`.
dist2, indx2

(array([[1.        , 0.999996  , 0.99998933, 0.99998784, 0.9999727 ,
         0.99996924, 0.9999629 , 0.9999503 , 0.9999465 , 0.9999452 ]],
       dtype=float32),
 array([    0,  5296,  4451, 16124, 12167, 11685, 13896,  3301,  5027,
        11728], dtype=int64))

- Thời gian chạy khi không sử dụng K-means để phân cụm

In [23]:
# FAISS against the whole dataset; the timing includes normalizing the data and building the index on each call.
%timeit get_closest(temp, processed_train)

123 ms ± 1.77 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


- Thời gian chạy khi sử dụng K mean để phân cụm

In [24]:
# FAISS restricted to the query's cluster (index is still rebuilt on each call).
%timeit get_closest(temp, group)

472 µs ± 4.69 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### Matrix multiply

In [25]:
# Plain NumPy cosine similarity against the whole dataset.
%timeit cosine_similarity(processed_train, temp)

79.8 ms ± 399 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [26]:
# Plain NumPy cosine similarity restricted to the query's cluster.
%timeit cosine_similarity(group, temp)

332 µs ± 3.55 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## Lấy ra các ID từ kết quả chạy được

- FAISS

In [23]:
# Translate the FAISS result positions (indices into `group`) back into track IDs.
# NOTE(review): the top hit has similarity 1.0, i.e. the query's own feature vector;
# filter it out here if self-recommendations are unwanted.
faiss_positions = indx2.flatten()
rcm_ID2 = [group_ID[faiss_positions[rank]] for rank in range(dist2.size)]

rcm_ID2

['0hkxAENNrJ18AzHqync5n3',
 '6I0VTz6ZM9baTiF7tmQiSo',
 '0c2JAT3kHR10OtP6Fo6Gsp',
 '2WEkcN2px48oY4H6hsKkWm',
 '0FKzmqTBCJpFLeHexdUrf5',
 '7H5gjcPW0sM6LnwRl0wW12',
 '2EMQMhtArm8keUpQw81KrX',
 '0I0X4crb43LYAeSEIN9sSA',
 '3zxqeUdFVsmDQymkkN96Im',
 '54Ax1ooNyhTER5bR2JmzqG']

- Công thức cosine similarity

In [24]:
# Track IDs of the whole cluster ordered by ascending cosine distance; show the ten closest.
rcm_ID3 = list(group_ID[indx3])
rcm_ID3[:10]

['0hkxAENNrJ18AzHqync5n3',
 '6I0VTz6ZM9baTiF7tmQiSo',
 '0c2JAT3kHR10OtP6Fo6Gsp',
 '2WEkcN2px48oY4H6hsKkWm',
 '0FKzmqTBCJpFLeHexdUrf5',
 '7H5gjcPW0sM6LnwRl0wW12',
 '2EMQMhtArm8keUpQw81KrX',
 '0I0X4crb43LYAeSEIN9sSA',
 '3zxqeUdFVsmDQymkkN96Im',
 '54Ax1ooNyhTER5bR2JmzqG']

## So sánh feature của input và kết quả recommend

- input

In [23]:
# Raw features of the query track, for comparison with the top recommendation shown next.
input_vector

[0.689, 0.268, -15.722, 0.505, 0.923, 1.69e-06, 0.31, 0.569, 121.202, 91063.0]

- recommend

In [27]:
# Raw continuous features of the first recommended track, looked up by its ID.
continuous_data[df["ID"] == "0hkxAENNrJ18AzHqync5n3"].to_numpy()

array([[ 6.89000e-01,  2.68000e-01, -1.57220e+01,  5.05000e-01,
         9.23000e-01,  1.69000e-06,  3.10000e-01,  5.69000e-01,
         1.21202e+02,  9.10630e+04]])