In [51]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import pandas as pd
from typing import List, Dict, Set
import string
from tqdm import tqdm
import numpy as np
import gensim
from gensim.models import Word2Vec
from numpy.typing import NDArray, ArrayLike
from collections import defaultdict
import hdbscan
from collections import Counter
stop_words = set(stopwords.words('english'))
import seaborn as sns
from sklearn.decomposition import PCA
from random import sample

In [3]:
playlists_names_path = "/home/adamgorski/Desktop/inzynierka/conseillify/conseillify-research/data/processed/playlist_names.csv"
playlists_content_path = "/home/adamgorski/Desktop/inzynierka/conseillify/conseillify-research/data/processed/playlists.csv"
model_path = "/home/adamgorski/Desktop/inzynierka/conseillify/conseillify-research/models/coldstart"

In [4]:
def process_playlist_names(playlist_names: List[str]) -> List[List[str]]:
    printable = set(string.printable)
    playlist_names = [''.join(filter(lambda sign: sign in printable, name)) for name in playlist_names]
    playlist_names = [word_tokenize(sentence) for sentence in tqdm(playlist_names)]
    playlist_names = [list(filter(lambda word: word not in stop_words, sentence)) for sentence in playlist_names]
    return playlist_names

In [5]:
playlist_names = pd.read_csv(playlists_names_path)["0"].to_numpy(dtype=str)
playlist_names = process_playlist_names(playlist_names)

100%|██████████| 1000000/1000000 [00:43<00:00, 22897.98it/s]


In [6]:
print(Counter(list(map(len, playlist_names))))
default_playlist_len = 10

Counter({1: 622472, 2: 297594, 3: 43999, 0: 17694, 4: 10584, 5: 4462, 6: 1470, 7: 658, 8: 322, 9: 279, 11: 139, 10: 132, 12: 90, 14: 24, 13: 23, 15: 12, 18: 7, 19: 6, 20: 5, 17: 4, 16: 4, 21: 4, 23: 2, 26: 2, 33: 2, 29: 2, 92: 1, 61: 1, 40: 1, 22: 1, 31: 1, 30: 1, 24: 1, 28: 1})


In [7]:
model = gensim.models.Word2Vec(playlist_names, min_count=1, vector_size=10, window=5)

In [8]:
def prepare_embedding_dict(model: Word2Vec) -> Dict[str, ArrayLike]:
    def missing():
        return ""
    embeddings = defaultdict(missing, {word: np.array(embedding) for word, embedding in zip(model.wv.index_to_key, model.wv)})
    return embeddings
embeddings_dict = prepare_embedding_dict(model)

In [9]:
def embed_playlists(embeddings_dict: Dict[str, ArrayLike], playlist_names: List, playlist_len=10):
    def embed_playlist(name):
        return embeddings_dict[name]
    embedded_playlists = [np.array(list(map(embed_playlist, name[:playlist_len]))).flatten() for name in playlist_names]
    max_name_len = max(list(map(len, embedded_playlists)))
    embedded_playlists = np.array(list(map(lambda embedding: np.pad(embedding, (0, max_name_len - len(embedding)), 'constant', constant_values=(0,0)), embedded_playlists)))
    return embedded_playlists

In [10]:
embedded_playlists = embed_playlists(embeddings_dict, playlist_names)

In [11]:
pca = PCA(n_components=10)

In [12]:
rd_names = pca.fit_transform(embedded_playlists)

In [13]:
print(rd_names.shape)

(1000000, 10)


In [39]:
clusterer = hdbscan.HDBSCAN(cluster_selection_epsilon=0.5, prediction_data=True)

In [40]:
clusterer.fit(rd_names[:1000, :])

In [42]:
def recommend_coldstart_cluster(clusterer: hdbscan.HDBSCAN, pca: PCA, playlist_name: str, embeddigns_dict: Dict, playlist_len: int = 10, max_name_len=100):
    playlist_names = process_playlist_names([playlist_name,  ])[0]
    playlist_names = playlist_names[:10]
    playlist_embedding = np.array(list(map(lambda name: embeddings_dict[name], playlist_names))).flatten()
    playlist_embedding = playlist_embedding[:max_name_len]
    playlist_embedding = np.pad(playlist_embedding, (0, max_name_len - len(playlist_embedding)), 'constant', constant_values=(0,0)).reshape(1, -1)
    playlist_embedding = pca.transform(playlist_embedding)
    probs = hdbscan.membership_vector(clusterer, playlist_embedding)
    return np.argmax(probs)
    


In [43]:
playlist_name = "Music"
best_cluster = recommend_coldstart_cluster(clusterer, pca, playlist_name, embeddings_dict)
print(best_cluster)

100%|██████████| 1/1 [00:00<00:00, 3862.16it/s]

8





In [44]:
playlists = pd.read_csv(playlists_content_path).to_numpy()

In [45]:
labels = clusterer.labels_
def cluster_tracks(labels: List[int], playlists: List[List[int]]) -> Dict[int, Set[int]]:
    clustered_playlists = {}
    for label, playlist in zip(labels, playlists):
        if label == -1:
            continue
        if label not in clustered_playlists:
            clustered_playlists[label] = set()
        for track in playlist:
            if track == -1:
                continue
            clustered_playlists[label].add(track)
    return clustered_playlists


In [46]:
clustered_tracks = cluster_tracks(labels, playlists)
print(list(map(len, list(clustered_tracks.values()))))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, -1}
[607, 462, 502, 14109, 1155, 474, 279, 215, 380, 493, 125, 573, 631, 208, 165, 495, 413, 1034, 472, 595]


In [54]:
def select_n_from_cluster(clustered_tracks: Dict[int, Set[int]], cluster_id: int, n: int= 50):
    tracks_set = clustered_tracks[cluster_id]
    return sample(tracks_set, n)

In [55]:
tracks = select_n_from_cluster(clustered_tracks, best_cluster, 100)
print(tracks)

[987773, 676176, 1939272, 675136, 953736, 1538475, 2067133, 1106812, 732167, 722890, 1625901, 1102992, 613229, 1863806, 1918775, 582371, 1499968, 1159763, 1746837, 84929, 473714, 2139062, 2142326, 2093239, 1330007, 353917, 2161195, 847267, 1072016, 740451, 2176113, 1465694, 317446, 203409, 1733184, 575192, 1238937, 1645661, 1086735, 2199623, 831958, 72455, 580455, 1568336, 1479534, 958651, 409452, 1659903, 2027543, 1474887, 807555, 440767, 230671, 2155488, 2245255, 908345, 2127845, 1328248, 1301548, 2045374, 917345, 1827503, 774841, 2008960, 2153143, 1599427, 213693, 380179, 903818, 645458, 2077095, 282241, 144638, 1607091, 2130560, 1112955, 2086712, 1384601, 473614, 424818, 1138221, 1384251, 1199888, 402772, 2001633, 1451744, 357029, 1588378, 67979, 1348628, 596553, 33272, 1288827, 66835, 118057, 155883, 424904, 629415, 56374, 752125]


since Python 3.9 and will be removed in a subsequent version.
  return sample(tracks_set, n)
