In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import pandas as pd
from typing import List, Dict, Set, Optional
import string
from tqdm import tqdm
import numpy as np
import gensim
from gensim.models import Word2Vec
from numpy.typing import NDArray, ArrayLike
from collections import defaultdict
from sklearn.decomposition import PCA
import click
import hdbscan
import os
import logging
from random import sample
import dill

def missing_character(character):
    return  ""


def process_playlist_names(playlist_names: List[str]) -> List[List[str]]:
    stop_words = set(stopwords.words('english'))

    printable = set(string.printable)
    playlist_names = [''.join(filter(lambda sign: sign in printable, name)) for name in playlist_names]
    playlist_names = [word_tokenize(sentence) for sentence in tqdm(playlist_names)]
    playlist_names = [list(filter(lambda word: word.lower() not in stop_words, sentence)) for sentence in playlist_names]
    return playlist_names


def prepare_embedding_dict(model: Word2Vec) -> Dict[str, ArrayLike]:
    embeddings = {word: np.array(embedding) for word, embedding in zip(model.wv.index_to_key, model.wv)}
    return embeddings

def embed_playlists(embeddings_dict: Dict[str, ArrayLike], playlist_names: List, playlist_len=10):
    def embed_playlist(name):
        return embeddings_dict[name]
    embedded_playlists = [np.array(list(map(embed_playlist, name[:playlist_len]))).flatten() for name in playlist_names]
    max_name_len = max(list(map(len, embedded_playlists)))
    embedded_playlists = np.array(list(map(lambda embedding: np.pad(embedding, (0, max_name_len - len(embedding)), 'constant', constant_values=(0,0)), embedded_playlists)))
    return embedded_playlists

def recommend_coldstart_cluster(clusterer: hdbscan.HDBSCAN, pca: PCA, playlist_name: str, embeddings_dict: Dict, max_name_len=100) -> int:
    playlist_names = process_playlist_names([playlist_name,  ])[0]
    print(f"playlist_names 1 {playlist_names}")
    playlist_names = playlist_names[:10]
    print(f"playlist_names 2 {playlist_names}")
    playlist_embedding = np.array(list(map(lambda name: embeddings_dict[name], playlist_names))).flatten()
    print(f"playlist_embedding 1 {playlist_embedding}")
    playlist_embedding = playlist_embedding[:max_name_len]
    print(f"playlist_embedding 2 {playlist_embedding}")
    playlist_embedding = np.pad(playlist_embedding, (0, max_name_len - len(playlist_embedding)), 'constant', constant_values=(0,0)).reshape(1, -1)
    print(f"playlist_embedding 3 {playlist_embedding}")
    playlist_embedding = pca.transform(playlist_embedding)
    print(f"playlist_embedding 4 {playlist_embedding}")
    print(type(playlist_embedding))
    labels, probs = hdbscan.approximate_predict(clusterer, playlist_embedding)
    print(labels, probs)
    probs = hdbscan.membership_vector(clusterer, playlist_embedding)
    return np.argmax(probs)

def select_n_from_cluster(clustered_tracks: Dict[int, Set[int]], cluster_id: int, n: int= 50):
    tracks_set = clustered_tracks[cluster_id]
    return sample(tracks_set, n)

def cluster_tracks(labels: List[int], playlists: List[List[int]]) -> Dict[int, Set[int]]:
    clustered_playlists = {}
    for label, playlist in tqdm(zip(labels, playlists)):
        if label == -1:
            continue
        if label not in clustered_playlists:
            clustered_playlists[label] = set()
        for track in playlist:
            if track == -1:
                continue
            clustered_playlists[label].add(track)
    return clustered_playlists

class ColdstartRecommender:
    def __init__(self, embeddings_dict: Dict[str, List[int]], pca: PCA, clusterer: hdbscan.HDBSCAN, clustered_tracks: Dict[int, Set[int]], songs_encodings: Dict[int, str]):
        self._embeddings_dict = defaultdict(lambda _: np.zeros(10), embeddings_dict)
        self._pca = pca
        self._clusterer = clusterer
        self._clustered_tracks = defaultdict(lambda x: np.zeros(100), clustered_tracks)
        self._songs_encodings = defaultdict(lambda x: -1, songs_encodings)
    
    def recommend_n_tracks(self, playlist_name: str, n: Optional[int]=50):
        cluster_id = recommend_coldstart_cluster(self._clusterer, self._pca, playlist_name, self._embeddings_dict)
        logging.info(f"Cluster_id: {cluster_id}")
        track_ids = select_n_from_cluster(self._clustered_tracks, cluster_id)
        logging.info(f"track_ids: {track_ids}")
        track_uris = [self._songs_encodings[track_id] for track_id in track_ids]
        logging.info(f"Track_uris: {track_uris}")
        return track_uris

def missing_song_encoding(track):
    return -1

def empty_track():
    return -1

def load_songs_encodings(input_path: str) -> Dict[str, int]:
    songs_encodings_df = pd.read_csv(input_path, index_col=False)
    songs_encodings = defaultdict(empty_track)
    for track_uri, track_encoding in zip(songs_encodings_df.track_uris, songs_encodings_df.track_encoding):
        songs_encodings[track_uri] = track_encoding
    return songs_encodings

In [2]:
input_filepath = "/home/adamgorski/Desktop/inzynierka/conseillify/conseillify-research/data/processed"
playlist_names = pd.read_csv(os.path.join(input_filepath, "playlist_names.csv"))["0"].to_numpy(dtype=str)
playlists = pd.read_csv(os.path.join(input_filepath, "playlists.csv")).to_numpy()

In [3]:
songs_encodings = load_songs_encodings(os.path.join(input_filepath, "songs_encodings.csv"))
songs_encodings = {track_id: track_uri for track_uri, track_id in songs_encodings.items()}

In [4]:
playlist_names = process_playlist_names(playlist_names)


100%|██████████| 1000000/1000000 [00:41<00:00, 24096.47it/s]


In [5]:
model = gensim.models.Word2Vec(playlist_names, min_count=1, vector_size=10, window=5)
embeddings_dict = prepare_embedding_dict(model)
embedded_playlists = embed_playlists(embeddings_dict, playlist_names)

In [7]:
pca = PCA(n_components=10)
reduced_pn = pca.fit_transform(embedded_playlists)[:200000, :]
print(reduced_pn.shape)

(100000, 10)


In [8]:
clusterer = hdbscan.HDBSCAN(cluster_selection_epsilon=1, prediction_data=True)
clusterer.fit(reduced_pn)
labels = clusterer.labels_
clustered_tracks = cluster_tracks(labels, playlists)

100000it [00:05, 17281.00it/s]


In [9]:
coldstart_model = ColdstartRecommender(embeddings_dict, pca, clusterer, clustered_tracks, songs_encodings)
print(coldstart_model.recommend_n_tracks("rock and roll"))

100%|██████████| 1/1 [00:00<00:00, 4696.87it/s]


playlist_names 1 ['rock', 'roll']
playlist_names 2 ['rock', 'roll']
playlist_embedding 1 [-1.4188551   0.5192034   0.7722586  -2.0165107  -0.48576212  0.6706748
  2.921494    5.236725   -2.1955607   0.17957498  0.8782166   1.4520342
  1.5706425  -1.3946004   3.0239232   0.2817061   0.28467435  2.5714927
 -0.03775601 -0.66250724]
playlist_embedding 2 [-1.4188551   0.5192034   0.7722586  -2.0165107  -0.48576212  0.6706748
  2.921494    5.236725   -2.1955607   0.17957498  0.8782166   1.4520342
  1.5706425  -1.3946004   3.0239232   0.2817061   0.28467435  2.5714927
 -0.03775601 -0.66250724]
playlist_embedding 3 [[-1.4188551   0.5192034   0.7722586  -2.0165107  -0.48576212  0.6706748
   2.921494    5.236725   -2.1955607   0.17957498  0.8782166   1.4520342
   1.5706425  -1.3946004   3.0239232   0.2817061   0.28467435  2.5714927
  -0.03775601 -0.66250724  0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.     

  outlier_vec = outlier_membership_vector(


['spotify:track:629Cjw0fUyZUMkBjnjttDR', 'spotify:track:79XrkTOfV1AqySNjVlygpW', 'spotify:track:5iSEsR6NKjlC9SrIJkyL3k', 'spotify:track:2NRRrr8ylDK38KD3Ffbw4K', 'spotify:track:3qX5utZrFzl2bgQnO8PlhJ', 'spotify:track:57kozn0j4DL3toKrqKQY0U', 'spotify:track:2LQrzHAiBCk2PUgxmYzfDQ', 'spotify:track:68oUQRwGJGExtkpaSvSbgb', 'spotify:track:75kV29N3NsJIOkfuIy0113', 'spotify:track:0of4x5P2ASi3xTvPMQlVQX', 'spotify:track:1JdPcAnG1GfiPzRDNplizS', 'spotify:track:1RUTIdTnFs8lHSc0Zr4UJB', 'spotify:track:7sg9ToL94GAiksETi5GZGz', 'spotify:track:6ynvA6fa4Xk3oV1GNqIN9x', 'spotify:track:3DmNkgOHMlCvPZnuC5fFkT', 'spotify:track:1GaYqv2NMMlVbG3ewJQ4A6', 'spotify:track:29p4HuJyGOzJgvJ9hVwDhD', 'spotify:track:48bzk8bCl0uEemd6Zbc7ct', 'spotify:track:6exNsuGkhyvjyBIiu3eOOz', 'spotify:track:0xjkgYSzHjBZNvyUaC9cXX', 'spotify:track:7b8YWIjK1JEtZXNjSf2ZU1', 'spotify:track:1TtvKn8PytKPzfIh1MGS4e', 'spotify:track:2Sb8qO1M5pafAjKtBXmRpQ', 'spotify:track:3up4BXUfgvKzTRzjpdByko', 'spotify:track:4HsOnvLjiugAuhmmlJDEeQ',

since Python 3.9 and will be removed in a subsequent version.
  return sample(tracks_set, n)


In [10]:
from collections import Counter
d = dict(Counter(labels))
d = {k: v for k, v in d.items() if v > 1000}
print(d)

{-1: 5459, 1394: 46134, 948: 1010, 1109: 1270}


In [12]:
import pickle
from typing import Any
def load_pickle(input_path) -> Any:
    with open(input_path, 'rb+') as file:
        return pickle.load(file)

def save_pickle(object: Any, output_path: str):
    with open(output_path, 'wb+') as file:
        pickle.dump(object, file)

model_filepath = "/home/adamgorski/Desktop/inzynierka/conseillify/conseillify-research/models/coldstart"
save_pickle(embeddings_dict, os.path.join(model_filepath, "embedding.pkl"))
save_pickle(pca, os.path.join(model_filepath, "pca.pkl"))
save_pickle(clusterer, os.path.join(model_filepath, "clusterer.pkl"))
save_pickle(clustered_tracks, os.path.join(model_filepath, "clustered_tracks.pkl"))
save_pickle(songs_encodings, os.path.join(model_filepath, "songs_encodings.pkl"))