In [63]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
from typing import List, Dict, Set, Optional
import string
from tqdm import tqdm
import numpy as np
import gensim
from gensim.models import Word2Vec
from numpy.typing import NDArray, ArrayLike
from collections import defaultdict
from sklearn.decomposition import PCA
import click
import hdbscan
import os
import logging
from random import sample
import time
from sklearn.cluster import Birch
import random
import sys
from sklearn.model_selection import train_test_split

def empty_track():
    return -1
def load_songs_features(input_path: str) -> Dict[str, NDArray]:
    songs_features_df = pd.read_csv(input_path)
    songs_features_dict = defaultdict(empty_track)
    for _, row in tqdm(songs_features_df.iterrows(), total=songs_features_df.shape[0]):
        songs_features_dict[row[1]] = np.array(row[2:-2].to_list())
    return songs_features_dict
def missing_character(character):
    return  ""

def process_playlist_names(playlist_names: List[str]) -> List[List[str]]:
    stop_words = set(stopwords.words('english'))

    printable = set(string.printable)
    playlist_names = [''.join(filter(lambda sign: sign in printable, name)) for name in playlist_names]
    playlist_names = [word_tokenize(sentence) for sentence in tqdm(playlist_names)]
    playlist_names = [list(filter(lambda word: word.lower() not in stop_words, sentence)) for sentence in playlist_names]
    return playlist_names


def prepare_embedding_dict(model: Word2Vec) -> Dict[str, ArrayLike]:
    embeddings = {word: np.array(embedding) for word, embedding in zip(model.wv.index_to_key, model.wv)}
    return embeddings

def embed_playlists(embeddings_dict: Dict[str, ArrayLike], playlist_names: List, playlist_len=10):
    def embed_playlist(name):
        if name in embeddings_dict:
            return embeddings_dict[name]
        else:
            return ""
    embedded_playlists = [np.array(list(map(embed_playlist, name[:playlist_len]))).flatten() for name in playlist_names]
    max_name_len = max(list(map(len, embedded_playlists)))
    embedded_playlists = np.array(list(map(lambda embedding: np.pad(embedding, (0, max_name_len - len(embedding)), 'constant', constant_values=(0,0)), embedded_playlists)))
    return embedded_playlists

def cluster_tracks(labels: List[int], playlists: List[List[int]]) -> Dict[int, Set[int]]:
    clustered_playlists = {}
    for label, playlist in tqdm(zip(labels, playlists)):
        if label == -1:
            continue
        if label not in clustered_playlists:
            clustered_playlists[label] = set()
        for track in playlist:
            if track == -1:
                continue
            clustered_playlists[label].add(track)
    return clustered_playlists

def missing_embedding():
    return np.zeros(10)

def missing_track():
    return np.zeros(30)

def missing_encoding():
    return ""

def prepare_playlist(embeddings_dict, pca, user_playlist) -> NDArray:
    processed_playlist = process_playlist_names(user_playlist)
    e_user_playlist = embed_playlists(embeddings_dict, processed_playlist)
    e_user_playlist = list(map(lambda p: p[:100], e_user_playlist))
    e_user_playlist = np.array(list(map(lambda p: np.pad(p, (0, 100-len(p)), 'constant', constant_values=(0,0)), e_user_playlist)))
    rd_user_playlist = pca.transform(e_user_playlist)
    return rd_user_playlist

def cluster_labels(labels: List[int], playlists: NDArray, songs_encodings: Dict[int, str]) -> Dict[int, Set[str]]:
    clustered_tracks = {}
    for playlist_index, (cluster_index, playlist) in tqdm(enumerate(zip(labels, playlists)), total=playlists.shape[0]):
        if not cluster_index in clustered_tracks:
            clustered_tracks[cluster_index] = set()
        for track in playlist:
            if track != -1:
                clustered_tracks[cluster_index].add(songs_encodings[track])
    return clustered_tracks


def recommend_n_tracks(brc: Birch, clustered_tracks: Dict[int, Set], processed_playlist: List[float], n_recommendations: Optional[int]=100):
    cluster_id = brc.predict(processed_playlist)[0]
    return random.sample(tuple(clustered_tracks[cluster_id]), min(n_recommendations, len(clustered_tracks[cluster_id])))


def load_songs_encodings(input_path: str) -> Dict[str, int]:
    songs_encodings_df = pd.read_csv(input_path, index_col=False)
    songs_encodings = defaultdict(empty_track)
    for track_uri, track_encoding in zip(songs_encodings_df.track_uris, songs_encodings_df.track_encoding):
        songs_encodings[track_uri] = track_encoding
    return songs_encodings

In [64]:
input_filepath = "/home/adamgorski/Desktop/inzynierka/conseillify/conseillify-research/data/processed"
reports_filepath = "/home/adamgorski/Desktop/inzynierka/conseillify/conseillify-research/reports/eval_coldstart"

In [65]:
playlist_names = pd.read_csv(os.path.join(input_filepath, "playlist_names.csv"))["0"].to_numpy(dtype=str)
playlists = pd.read_csv(os.path.join(input_filepath, "playlists.csv")).to_numpy()
pn_train, pn_val, p_train, p_val = train_test_split(playlist_names, playlists, test_size=0.01)
songs_encodings = load_songs_encodings(os.path.join(input_filepath, "songs_encodings.csv"))
songs_encodings = {track_id: track_uri for track_uri, track_id in songs_encodings.items()}
pn_train = process_playlist_names(pn_train)
model = gensim.models.Word2Vec(pn_train, min_count=1, vector_size=10, window=5)


a
b
c
d


 77%|███████▋  | 764626/999000 [49:14<15:05, 258.80it/s]  
100%|██████████| 990000/990000 [00:45<00:00, 21920.88it/s]


e


In [66]:
embeddings_dict = prepare_embedding_dict(model)
embedded_playlists = embed_playlists(embeddings_dict, pn_train)
pca = PCA(n_components=10)
reduced_pn = pca.fit_transform(embedded_playlists)


In [67]:
clusterer = hdbscan.HDBSCAN(cluster_selection_epsilon=1)
clusterer.fit(reduced_pn)
n_labels = len(set(clusterer.labels_))
logging.info(f"Clusters: {n_labels}")


In [68]:
brc= Birch(n_clusters=n_labels)
labels = brc.fit_predict(reduced_pn)
clustered_tracks = cluster_labels(labels, p_train, songs_encodings)

In [None]:
# from sklearn.cluster import KMeans
# km = KMeans(n_clusters=n_labels)
# labels = km.fit_predict(reduced_pn)


In [None]:
def calculate_map_at_k(ground_truths: List[List[str]], predictions: List[List[str]]) -> float:
    precisions_at_k = []
    for ground_truth, prediction in zip(ground_truths, predictions):
        k = len(set(prediction))
        tp = len(set(ground_truth) & set(prediction))
        precisions_at_k.append(tp/k)
    return np.mean(np.array(precisions_at_k))


def calculate_mar_at_k(ground_truths: List[List[str]], predictions: List[List[str]]) -> float:
    recall_at_k = []
    for ground_truth, prediction in zip(ground_truths, predictions):
        k = len(set(ground_truth))
        tp = len(set(ground_truth) & set(prediction))
        recall_at_k.append(tp/k)
    return np.mean(np.array(recall_at_k))


def coverage(all_tracks: Set[str], predictions: List[List[str]]) -> float:
    preds = [track for prediction in predictions for track in prediction]
    unique_preds = set(preds)
    covered = all_tracks & unique_preds
    return len(covered) / len(all_tracks) * 100

In [None]:
N_tracks = [50, 100, 200, 300, 450, 500, 750, 1000, 1500, 2000, 2500, 3000, 4000, 5000, 6000, 7000, 10000]
total_predictions = max(N_tracks)
ground_truths = []
predictions = []
maps_at_k = []
mars_at_k = []
coverages = []
all_tracks = set(list(songs_encodings.keys()))
for playlist_name, actual_playlist in zip(pn_val, p_val):
    rd_playlist_name = prepare_playlist(embeddings_dict, pca, playlist_name)
    preds = recommend_n_tracks(brc, clustered_tracks, rd_playlist_name, total_predictions)
    ground_truths.append(actual_playlist)
    predictions.append(preds)


100%|██████████| 11/11 [00:00<00:00, 20405.72it/s]
100%|██████████| 3/3 [00:00<00:00, 12372.58it/s]
100%|██████████| 8/8 [00:00<00:00, 14873.42it/s]
100%|██████████| 11/11 [00:00<00:00, 14476.73it/s]
100%|██████████| 15/15 [00:00<00:00, 20010.99it/s]
100%|██████████| 4/4 [00:00<00:00, 12797.27it/s]
100%|██████████| 16/16 [00:00<00:00, 11403.38it/s]
100%|██████████| 12/12 [00:00<00:00, 13386.08it/s]
100%|██████████| 10/10 [00:00<00:00, 15911.62it/s]
100%|██████████| 11/11 [00:00<00:00, 18986.56it/s]
100%|██████████| 3/3 [00:00<00:00, 12396.96it/s]
100%|██████████| 14/14 [00:00<00:00, 18310.03it/s]
100%|██████████| 3/3 [00:00<00:00, 7539.19it/s]
100%|██████████| 8/8 [00:00<00:00, 15162.42it/s]
100%|██████████| 9/9 [00:00<00:00, 18799.17it/s]
100%|██████████| 6/6 [00:00<00:00, 12470.68it/s]
100%|██████████| 7/7 [00:00<00:00, 15896.12it/s]
100%|██████████| 9/9 [00:00<00:00, 19508.39it/s]
100%|██████████| 2/2 [00:00<00:00, 9939.11it/s]
100%|██████████| 13/13 [00:00<00:00, 16598.46it/s]
100%

In [None]:
print(np.array(predictions[0]).shape)
print(np.array(ground_truths[0]).shape)
print(len(predictions))
print(len(ground_truths))
print(set(list(map(lambda x: np.array(x).shape, predictions))))

(2500,)
(376,)
1000
1000
{(2500,), (661,), (1831,), (2104,), (2235,), (582,), (1167,), (1183,)}


In [None]:
# print(np.array(predictions).shape)
# print(np.array(ground_truths).shape)

(1000,)
(1000, 376)


  print(np.array(predictions).shape)


In [None]:
for n in N_tracks:
    preds = list(map(lambda pred: pred[:n], predictions))
    maps_at_k.append(calculate_map_at_k(ground_truths, preds))
    mars_at_k.append(calculate_mar_at_k(ground_truths, preds))
    coverages.append(coverage(all_tracks, preds))
logging.info(f"Maps_at_k{maps_at_k}")
logging.info(f"Mars_at_k{mars_at_k}")
logging.info(f"N_tracks {N_tracks}")
logging.info(f"Coverage {coverages}")
print(len(maps_at_k), len(mars_at_k), len(N_tracks), len(coverages))
pd.DataFrame({"Maps_at_k": maps_at_k, "Mars_at_k": mars_at_k, "N_tracks": N_tracks,
                "Coverages": coverages}).to_csv(os.path.join(reports_filepath, "stats.csv"))

11 11 11 11


In [None]:
print(maps_at_k)
print(mars_at_k)
print(coverages)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
