In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import pandas as pd
from typing import List, Dict, Set, Optional
import string
from tqdm import tqdm
import numpy as np
import gensim
from gensim.models import Word2Vec
from numpy.typing import NDArray, ArrayLike
from collections import defaultdict
from sklearn.decomposition import PCA
import click
import hdbscan
import os
import logging
from random import sample
import dill
import time
from sklearn.cluster import Birch
from collections import Counter
import seaborn as sns
import random

# from ...features.build_features import load_pickle, save_pickle
# from ...data.make_dataset import load_songs_encodings

def missing_character(character):
    """Return the replacement text for ``character``.

    The argument is ignored: the replacement is always the empty string.
    """
    return str()

def process_playlist_names(playlist_names: List[str]) -> List[List[str]]:
    """Tokenize playlist names and drop English stop words.

    Steps, applied to every name:
      1. keep only characters contained in ``string.printable``;
      2. split the cleaned name with NLTK's ``word_tokenize`` (a tqdm progress
         bar is shown for this step);
      3. discard tokens whose lowercase form is an NLTK English stop word.

    Surviving tokens keep their original casing.

    Args:
        playlist_names: Raw playlist names.

    Returns:
        One list of tokens per input name, in input order.
    """
    english_stop_words = set(stopwords.words('english'))
    allowed_chars = set(string.printable)

    cleaned_names = [
        ''.join(char for char in name if char in allowed_chars)
        for name in playlist_names
    ]
    tokenized_names = [word_tokenize(name) for name in tqdm(cleaned_names)]
    return [
        [token for token in tokens if token.lower() not in english_stop_words]
        for tokens in tokenized_names
    ]


def prepare_embedding_dict(model: Word2Vec) -> Dict[str, ArrayLike]:
    """Build a plain ``{word: vector}`` lookup from a Word2Vec model.

    Args:
        model: Model whose ``wv`` attribute exposes ``index_to_key`` and
            per-key vector lookup.

    Returns:
        Dictionary mapping every vocabulary word to a new NumPy array holding
        its embedding.
    """
    keyed_vectors = model.wv
    embeddings = {}
    for word in keyed_vectors.index_to_key:
        # np.array copies, so the dictionary does not alias the model's storage.
        embeddings[word] = np.array(keyed_vectors[word])
    return embeddings

def embed_playlists(embeddings_dict: Dict[str, ArrayLike], playlist_names: List, playlist_len=10):
    """Turn tokenized playlist names into fixed-width embedding rows.

    For each playlist, the vectors of its first ``playlist_len`` tokens are
    concatenated into one flat vector. Rows shorter than the longest one are
    right-padded with zeros.

    Tokens that are missing from ``embeddings_dict`` contribute an all-zero
    vector with the same shape and dtype as the known embeddings. (Previously
    an empty string was used as the placeholder, which cannot be stacked with
    numeric vectors and made the function fail for any unseen word.)

    Args:
        embeddings_dict: Mapping from token to its embedding vector.
        playlist_names: One list of tokens per playlist.
        playlist_len: Maximum number of leading tokens used per playlist.

    Returns:
        2-D ``numpy.ndarray`` with one row per playlist. An empty
        ``playlist_names`` yields an array of shape ``(0, 0)``.
    """
    if len(playlist_names) == 0:
        return np.zeros((0, 0))

    # Derive the placeholder from an existing embedding so that its size and
    # dtype match whatever model produced the dictionary.
    reference = next(iter(embeddings_dict.values()), None)
    if reference is None:
        zero_vector = np.zeros(0)
    else:
        zero_vector = np.zeros_like(np.asarray(reference))

    def embed_playlist(tokens):
        vectors = [embeddings_dict.get(token, zero_vector) for token in tokens[:playlist_len]]
        return np.array(vectors).flatten()

    flat_rows = [embed_playlist(tokens) for tokens in playlist_names]
    max_name_len = max(len(row) for row in flat_rows)
    embedded_playlists = np.array([
        np.pad(row, (0, max_name_len - len(row)), 'constant', constant_values=(0, 0))
        for row in flat_rows
    ])
    return embedded_playlists

def recommend_coldstart_cluster(clusterer: hdbscan.HDBSCAN, pca: PCA, playlist_name: str, embeddings_dict: Dict, max_name_len=100) -> int:
    """Pick the cluster that best matches a single playlist name.

    The name is tokenized with ``process_playlist_names``, its first 10 tokens
    are embedded and concatenated, the result is truncated / zero-padded to
    ``max_name_len`` values, projected with ``pca`` and assigned to a cluster
    with ``hdbscan.approximate_predict``. When that yields no label or the
    noise label ``-1``, the cluster with the highest score from
    ``hdbscan.membership_vector`` is returned instead.

    Args:
        clusterer: Fitted HDBSCAN model.
            NOTE(review): hdbscan's prediction helpers are documented to need
            prediction data on the clusterer — confirm it was fitted with it.
        pca: Fitted transformer exposing ``transform``.
        playlist_name: Raw playlist name.
        embeddings_dict: Mapping from token to embedding vector.
        max_name_len: Length of the flat embedding passed to ``pca``.

    Returns:
        Cluster index as a Python ``int``.
    """
    tokens = process_playlist_names([playlist_name, ])[0][:10]

    # Unknown tokens get a zero vector shaped like the known embeddings
    # instead of aborting the recommendation with a KeyError.
    reference = next(iter(embeddings_dict.values()), None)
    if reference is None:
        zero_vector = np.zeros(0)
    else:
        zero_vector = np.zeros_like(np.asarray(reference))

    vectors = []
    for token in tokens:
        try:
            # Plain indexing keeps any default-factory behaviour of the mapping.
            vectors.append(embeddings_dict[token])
        except KeyError:
            vectors.append(zero_vector)

    playlist_embedding = np.array(vectors).flatten()[:max_name_len]
    playlist_embedding = np.pad(
        playlist_embedding,
        (0, max_name_len - len(playlist_embedding)),
        'constant',
        constant_values=(0, 0),
    ).reshape(1, -1)
    playlist_embedding = pca.transform(playlist_embedding)

    approx_labels, _ = hdbscan.approximate_predict(clusterer, playlist_embedding)
    # Compare explicitly: the truthiness of a one-element array is the
    # truthiness of its element, so ``not approx_labels`` would wrongly treat
    # a valid prediction of cluster 0 as "no label".
    if len(approx_labels) > 0 and approx_labels[0] != -1:
        return int(approx_labels[0])

    # Fallback only: soft membership is not needed when a hard label exists.
    probs = hdbscan.membership_vector(clusterer, playlist_embedding)
    return int(np.argmax(probs))

def select_n_from_cluster(clustered_tracks: Dict[int, Set[int]], cluster_id: int, n: int= 50):
    """Draw up to ``n`` distinct random tracks from one cluster.

    Args:
        clustered_tracks: Mapping from cluster id to the set of its tracks.
        cluster_id: Cluster to draw from; must be a key of ``clustered_tracks``.
        n: Number of tracks wanted.

    Returns:
        List of distinct tracks. It holds fewer than ``n`` items when the
        cluster itself is smaller than ``n``.

    Raises:
        KeyError: If ``cluster_id`` is not present in ``clustered_tracks``.
    """
    # random.sample needs a sequence: passing a set is deprecated since
    # Python 3.9 and rejected with TypeError from 3.11 on.
    candidates = tuple(clustered_tracks[cluster_id])
    # Clamp so that a small cluster returns everything it has instead of
    # raising "sample larger than population".
    return sample(candidates, min(n, len(candidates)))

def cluster_tracks(labels: List[int], playlists: List[List[int]]) -> Dict[int, Set[int]]:
    """Collect the tracks of all playlists that share a cluster label.

    Playlists labelled ``-1`` are skipped, and track entries equal to ``-1``
    are ignored. A label whose playlists contain no usable track still gets
    an (empty) entry.

    Args:
        labels: Cluster label of each playlist.
        playlists: Track entries of each playlist, aligned with ``labels``.

    Returns:
        Mapping from cluster label to the set of tracks seen in that cluster.
    """
    tracks_by_cluster = {}
    for cluster_label, playlist_tracks in tqdm(zip(labels, playlists)):
        if cluster_label != -1:
            bucket = tracks_by_cluster.setdefault(cluster_label, set())
            bucket.update(track for track in playlist_tracks if track != -1)
    return tracks_by_cluster

def missing_embedding():
    """Return a new all-zero vector of length 10."""
    return np.zeros((10,))

def missing_track():
    """Return a new all-zero vector of length 30."""
    return np.zeros((30,))

def missing_encoding():
    """Return the empty string."""
    return str()

def prepare_playlist(embeddings_dict, pca, user_playlist) -> NDArray:
    """Convert raw playlist names into PCA-reduced embedding rows.

    The names are tokenized with ``process_playlist_names``, embedded with
    ``embed_playlists``, cut or zero-padded to exactly 100 values per row and
    finally projected with ``pca.transform``. The intermediate values and
    shapes are printed along the way.

    Args:
        embeddings_dict: Mapping from token to embedding vector.
        pca: Fitted transformer exposing ``transform``.
        user_playlist: Iterable of raw playlist names.

    Returns:
        The output of ``pca.transform`` for the prepared rows.
    """
    row_width = 100

    print(user_playlist)
    tokenized_names = process_playlist_names(user_playlist)
    print(tokenized_names)

    embedded_rows = embed_playlists(embeddings_dict, tokenized_names)
    fixed_width_rows = []
    for row in embedded_rows:
        clipped = row[:row_width]
        fixed_width_rows.append(
            np.pad(clipped, (0, row_width - len(clipped)), 'constant', constant_values=(0, 0))
        )
    fixed_width = np.array(fixed_width_rows)
    print(fixed_width.shape)

    reduced = pca.transform(fixed_width)
    print(reduced.shape)
    return reduced


def empty_track():
    """Return ``-1``, the default value used for unknown track URIs.

    Serves as the ``defaultdict`` factory in ``load_songs_encodings``.
    """
    unknown_track = -1
    return unknown_track

def load_songs_encodings(input_path: str) -> Dict[str, int]:
    """Load the track-URI to encoding table from a CSV file.

    The CSV must provide the columns ``track_uris`` and ``track_encoding``.

    Args:
        input_path: Path of the CSV file.

    Returns:
        ``defaultdict`` mapping each track URI to its encoding; looking up an
        unknown URI yields the value of ``empty_track()``.
    """
    encodings_frame = pd.read_csv(input_path, index_col=False)
    uri_to_encoding = defaultdict(empty_track)
    uri_to_encoding.update(
        zip(encodings_frame.track_uris, encodings_frame.track_encoding)
    )
    return uri_to_encoding


# @click.command()
# @click.argument('input_filepath', type=click.Path(exists=True))
# @click.argument('model_filepath', type=click.Path(exists=True))




In [3]:
# Directory with the processed CSV inputs read by the data-loading cell
# (playlist_names.csv, playlists.csv, songs_encodings.csv).
# NOTE(review): both paths are absolute and machine-specific; the notebook
# cannot run on another machine without editing them.
input_filepath = "/home/adamgorski/Desktop/inzynierka/conseillify/conseillify-research/data/processed"
# Directory that receives the pickled artifacts written in the last cell.
model_filepath = "/home/adamgorski/Desktop/inzynierka/conseillify/conseillify-research/models/coldstart_birch"

In [4]:
# Load the data, train a Word2Vec model on the playlist names and embed every
# playlist name as one fixed-width row.
# NOTE(review): no logging configuration is visible in this notebook; with the
# default root-logger level the logging.info messages below are likely not
# shown (none appear in the recorded output) — confirm.
data_processing_start = time.time()
logging.info("Loading playlists data")
# Playlist names are read from the CSV column labelled "0" as strings.
playlist_names = pd.read_csv(os.path.join(input_filepath, "playlist_names.csv"))["0"].to_numpy(dtype=str)
# One row per playlist; presumably track encodings with -1 as filler, since
# cluster_labels skips entries equal to -1 — verify against the data file.
playlists = pd.read_csv(os.path.join(input_filepath, "playlists.csv")).to_numpy()

logging.info("Loading songs encodings")
songs_encodings = load_songs_encodings(os.path.join(input_filepath, "songs_encodings.csv"))
# Invert the mapping: from here on songs_encodings maps encoding -> track URI
# (a plain dict, so the defaultdict fallback of the loader is gone).
songs_encodings = {track_id: track_uri for track_uri, track_id in songs_encodings.items()}
logging.info("Processing playlist info")
# Rebinds playlist_names from raw strings to lists of tokens.
playlist_names = process_playlist_names(playlist_names)

logging.info("Training word2vec model")
# min_count=1 keeps every token in the vocabulary; each vector has 10 values.
# NOTE(review): no seed is passed, so embeddings may differ between runs.
model = gensim.models.Word2Vec(playlist_names, min_count=1, vector_size=10, window=5)

logging.info("Building embeddings")
embeddings_dict = prepare_embedding_dict(model)
# embed_playlists uses at most 10 tokens per name by default; with 10 values
# per token that gives the 100 columns seen in the printed shape.
embedded_playlists = embed_playlists(embeddings_dict, playlist_names)
logging.info(f"Data processing execution time {time.time() - data_processing_start}")
print(type(embedded_playlists))
print(embedded_playlists.shape)


100%|██████████| 1000000/1000000 [00:42<00:00, 23383.76it/s]


<class 'numpy.ndarray'>
(1000000, 100)


In [5]:
# Reduce each playlist-name embedding row to 10 principal components.
# NOTE(review): no random_state is given; depending on the solver scikit-learn
# selects, the projection may not be bit-for-bit reproducible — confirm.
pca = PCA(n_components=10)
reduced_pn = pca.fit_transform(embedded_playlists)

In [6]:
# Cluster the reduced embeddings with HDBSCAN; the number of distinct labels
# is reused below as the cluster count for Birch.
# NOTE(review): the clusterer is created without prediction data, which
# hdbscan's approximate_predict / membership_vector (used by
# recommend_coldstart_cluster) are documented to require — confirm.
clusterer = hdbscan.HDBSCAN(cluster_selection_epsilon=1)
clusterer.fit(reduced_pn)
print(clusterer.labels_[:100])
# Counts every distinct value in labels_, including the noise label -1 that
# appears in the printed sample, so this is one more than the number of real
# clusters whenever noise points exist.
n_labels = len(set(clusterer.labels_))
print(n_labels)

[2270 4075 4681 2156 4681 2639 4681 3591  961 4681 4681 3020 4681 1615
 1946 4681  585 3497 3323 2745 4681 4171 4681 4681 2377 4502 4681 4681
 4681 4681 1008 2481 1119 4681 4681 4681 4681 3647 3020 4681 2133 4681
 4681 4681 4681 2801 4681 4681 4681 4681 4179 3311 1904 4681  168 4681
 4681 4681   -1   28 4681 2885 4681 4681 4681 1788 4681 4681  452 2970
 3709 1526 2246 4681 4681 4681 2270 4681  430  290 3337 1506  799 3871
 4681 4681 4566 4681 4681 2147 4681 4681 1717 2055 4681 4681 3349 2147
 4167 2451]
4683


In [7]:
# Fit Birch on the reduced embeddings, asking for as many clusters as HDBSCAN
# produced distinct labels; `labels` holds one cluster index per playlist.
brc= Birch(n_clusters=n_labels)
labels = brc.fit_predict(reduced_pn)

In [8]:
def cluster_labels(labels: List[int], playlists: NDArray, songs_encodings: Dict[int, str]) -> Dict[int, Set[str]]:
    """Group decoded tracks by the cluster label of their playlist.

    Every playlist entry different from ``-1`` is looked up in
    ``songs_encodings`` and the result is added to the set of the playlist's
    cluster. A tqdm progress bar sized by ``playlists.shape[0]`` is shown.

    Args:
        labels: Cluster label of each playlist.
        playlists: 2-D array with one row of track entries per playlist,
            aligned with ``labels``.
        songs_encodings: Mapping from track entry to the value stored in the
            result.

    Returns:
        Mapping from cluster label to the set of looked-up tracks.

    Raises:
        KeyError: If a track entry other than ``-1`` is missing from
            ``songs_encodings``.
    """
    tracks_by_cluster = {}
    labelled_playlists = zip(labels, playlists)
    for cluster_index, playlist in tqdm(labelled_playlists, total=playlists.shape[0]):
        bucket = tracks_by_cluster.setdefault(cluster_index, set())
        bucket.update(songs_encodings[track] for track in playlist if track != -1)
    return tracks_by_cluster


In [9]:
# Build the cluster -> track-URI sets from the Birch labels
# (songs_encodings maps encoding -> URI at this point).
clustered_tracks = cluster_labels(labels, playlists, songs_encodings)

100%|██████████| 1000000/1000000 [01:27<00:00, 11471.46it/s]


In [10]:
# Sanity check: number of distinct tracks collected for cluster 0.
print(len(clustered_tracks[0]))

37132


In [11]:
# Smoke test: run one playlist name through the preprocessing pipeline and
# print the Birch cluster predicted for it.
user_playlist = ["rock and roll"]
rd_user_playlist = prepare_playlist(embeddings_dict, pca, user_playlist)
print(brc.predict(rd_user_playlist))

['rock and roll']


100%|██████████| 1/1 [00:00<00:00, 4466.78it/s]


[['rock', 'roll']]
(1, 100)
(1, 10)
[3878]


In [12]:
def recommend_n_tracks(brc: Birch, clustered_tracks: Dict[int, Set], processed_playlist: List[float], n_recommendations: Optional[int]=100):
    """Recommend random tracks from the cluster predicted for a playlist.

    Args:
        brc: Fitted Birch model; ``brc.predict`` is called on
            ``processed_playlist`` and its first result selects the cluster.
        clustered_tracks: Mapping from cluster id to the set of its tracks.
        processed_playlist: Model input, e.g. the output of
            ``prepare_playlist``.
        n_recommendations: Number of tracks wanted. Despite the ``Optional``
            annotation an integer is required; ``None`` raises ``TypeError``.

    Returns:
        List of distinct tracks from the predicted cluster. It holds fewer
        than ``n_recommendations`` items when the cluster is smaller.

    Raises:
        KeyError: If the predicted cluster has no entry in
            ``clustered_tracks``.
    """
    cluster_id = brc.predict(processed_playlist)[0]
    candidates = tuple(clustered_tracks[cluster_id])
    # Clamp so that a small cluster yields all of its tracks instead of
    # failing with "sample larger than population".
    return random.sample(candidates, min(n_recommendations, len(candidates)))


In [13]:
# Smoke test: request the default number of recommendations and check that
# the returned tracks are all distinct (both printed counts should match).
recs = recommend_n_tracks(brc, clustered_tracks, rd_user_playlist)
print(len(recs))
print(len(set(recs)))

100
100


In [19]:
import pickle
from typing import Any
def save_pickle(object: Any, output_path: str):
    """Serialize ``object`` with pickle into the file at ``output_path``.

    An existing file is overwritten. The parent directory must already exist.

    Args:
        object: Any picklable value.
        output_path: Destination file path.
    """
    with open(output_path, 'wb+') as target:
        pickler = pickle.Pickler(target)
        pickler.dump(object)
# NOTE(review): this import sits in the middle of the notebook; it belongs in
# the import cell at the top.
import sys
# Raises the interpreter-wide recursion limit for the rest of the session;
# presumably needed so that pickling the fitted Birch model does not hit the
# default limit — confirm.
sys.setrecursionlimit(10000)

# Persist everything needed to serve recommendations: the cluster model, the
# PCA projection, the word embeddings and the per-cluster track sets.
# model_filepath must already exist; nothing here creates it.
save_pickle(brc, os.path.join(model_filepath, "brc.pkl"))
save_pickle(pca, os.path.join(model_filepath, "pca.pkl"))
save_pickle(embeddings_dict, os.path.join(model_filepath, "embeddings_dict.pkl"))
save_pickle(clustered_tracks, os.path.join(model_filepath, "clustered_tracks.pkl"))
