In [1]:
import pickle
from pathlib import Path
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import re



In [2]:
input_path = "processed/01_filtered/"
input_dir = Path(input_path)

# NOTE: pickle.load can execute arbitrary code embedded in the file, so these
# artefacts must come from a trusted earlier stage of this pipeline.

# Playlists: each entry is a dict with a 'name' and a 'tracks' list of track URIs.
with (input_dir / "filtered_playlists.pkl").open("rb") as f:
    playlists = pickle.load(f)

# Lookup from track URI to that track's metadata dict (e.g. 'track_name').
with (input_dir / "valid_tracks.pkl").open("rb") as f:
    valid_tracks_dict = pickle.load(f)

In [3]:
def filter_valid_tracks(playlists, valid_tracks):
    """Return copies of the playlists restricted to known tracks.

    Args:
        playlists: sequence of dicts, each with a 'name' and a 'tracks' list.
        valid_tracks: container supporting ``in`` that holds the accepted
            track identifiers.

    Returns:
        A new list of ``{'name': ..., 'tracks': [...]}`` dicts, one per input
        playlist and in the same order. Track order within a playlist is
        preserved; playlists that end up with no tracks are still included.
    """
    def keep_valid(playlist):
        # Membership test against valid_tracks decides which tracks survive.
        kept = [uri for uri in playlist['tracks'] if uri in valid_tracks]
        return {'name': playlist['name'], 'tracks': kept}

    progress = tqdm(playlists, total=len(playlists), desc="Filtering playlists")
    return [keep_valid(playlist) for playlist in progress]

# Keep only the tracks that have an entry in valid_tracks_dict; every playlist
# is retained at this point, even if it ends up with an empty track list.
filtered_playlists = filter_valid_tracks(playlists, valid_tracks_dict)

Filtering playlists: 100%|██████████| 996829/996829 [00:36<00:00, 27632.62it/s]


In [4]:
# Sanity check: show the first filtered playlist (its name and track URIs).
filtered_playlists[0]

{'name': 'Baile',
 'tracks': ['spotify:track:4Uc6BcPeBKfZUlX6jhumGv',
  'spotify:track:0Hf4aIJpsN4Os2f0y0VqWl',
  'spotify:track:2eqDUxbd0JPEhNrJdPlHLs',
  'spotify:track:4LXDHgBC1mbz3uoehYxH9b',
  'spotify:track:4Y7XAxTANhu3lmnLAzhWJW',
  'spotify:track:3ZFTkvIE7kyPt6Nu3PEa7V',
  'spotify:track:1eAj7zEFiX24oYScyIP8aO',
  'spotify:track:3XVCXTxYPptsMLwO463btY',
  'spotify:track:0UGJsRE9T0r6sIMR3mQzUW',
  'spotify:track:6JFBgJECPfDjzVLJ65jqm7',
  'spotify:track:4MK30zdOpWTvpE6UoJukIp',
  'spotify:track:4fIk8LQ1hojY8ZfFQi419y',
  'spotify:track:32lm3769IRfcnrQV11LO4E']}

In [5]:
# Discard playlists left with fewer than 5 tracks after filtering.
filtered_playlists = [
    playlist
    for playlist in filtered_playlists
    if len(playlist['tracks']) >= 5
]

# Hold out 10% of the playlists for evaluation; the fixed random_state keeps
# the split identical across runs.
train_playlists, test_playlists = train_test_split(
    filtered_playlists,
    test_size=0.1,
    random_state=42,
)

In [6]:
# Parallel lists over the training playlists: titles are the model input,
# track lists are what gets recommended. Index i refers to the same playlist
# in both lists and in the rows of name_vectors.
train_names = [playlist['name'] for playlist in train_playlists]
train_tracks = [playlist['tracks'] for playlist in train_playlists]

# TF-IDF over playlist titles. The custom token_pattern also accepts
# single-character tokens, and the vocabulary is capped at 500 terms.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    token_pattern=r'\b\w+\b',
    max_features=500,
)
name_vectors = vectorizer.fit_transform(train_names)

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_tracks_from_name(name, k=10, n_neighbors=5):
    """Recommend up to ``k`` track URIs for a playlist title.

    The title is projected into the TF-IDF space fitted on the training
    playlist names, the ``n_neighbors`` most similar training playlists are
    picked by cosine similarity, and their tracks are concatenated in
    neighbour order (most similar first) with duplicates removed.

    Relies on the notebook-level ``vectorizer``, ``name_vectors`` and
    ``train_tracks``.

    Args:
        name: playlist title to recommend for.
        k: maximum number of tracks to return.
        n_neighbors: maximum number of similar training playlists to draw
            tracks from.

    Returns:
        A list of at most ``k`` unique track URIs. The list is empty when no
        training playlist name has a positive similarity to ``name``.
    """
    query_vec = vectorizer.transform([name])
    sims = cosine_similarity(query_vec, name_vectors)[0]
    top_idx = sims.argsort()[::-1][:n_neighbors]

    recommended_tracks = []
    for idx in top_idx:
        # A non-positive similarity means the playlist shares nothing with the
        # query; its rank among equally-scored rows is arbitrary, so using it
        # would yield unrelated recommendations. top_idx is sorted by
        # descending similarity, so every later neighbour is no better.
        if sims[idx] <= 0:
            break
        recommended_tracks.extend(train_tracks[idx])

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(recommended_tracks))[:k]


In [8]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
from functools import partial
import random

def precision_at_k(y_true, y_pred, k=10):
    """Fraction of the top-``k`` slots filled with relevant items.

    Args:
        y_true: iterable of relevant items.
        y_pred: ranked sequence of predicted items; only the first ``k`` count.
        k: cutoff and also the denominator, even when fewer than ``k``
            predictions are supplied.

    Returns:
        Number of distinct items shared by ``y_true`` and ``y_pred[:k]``,
        divided by ``k``.
    """
    relevant = set(y_true)
    top_k = set(y_pred[:k])
    hits = len(relevant.intersection(top_k))
    return hits / k

def _evaluate_batch(batch_indices, test_playlists, test_vectors, train_vectors, train_track_lists, k, n_neighbors):
    """Score one batch of test playlists with precision@k.

    For every test playlist in the batch, the ``n_neighbors`` training
    playlists with the highest cosine similarity are looked up, their tracks
    are merged (first occurrence wins) and truncated to ``k``, and the result
    is compared with the playlist's own tracks.

    Relies on the notebook-level ``valid_tracks_dict`` in addition to its
    arguments.

    Args:
        batch_indices: positions into ``test_playlists`` / rows of
            ``test_vectors`` to evaluate.
        test_playlists: sequence of dicts with a 'tracks' entry.
        test_vectors: vectorised test playlist names, indexable by
            ``batch_indices``.
        train_vectors: vectorised training playlist names.
        train_track_lists: track lists aligned with the rows of
            ``train_vectors``.
        k: precision cutoff.
        n_neighbors: number of similar training playlists to draw from.

    Returns:
        List of precision@k values; playlists without any valid track are
        skipped, so the list may be shorter than ``batch_indices``.
    """
    # One similarity row per test playlist in the batch.
    batch_sims = cosine_similarity(test_vectors[batch_indices], train_vectors)

    scores = []
    for sim_row, playlist_idx in zip(batch_sims, batch_indices):
        relevant = [
            uri
            for uri in test_playlists[playlist_idx]['tracks']
            if uri in valid_tracks_dict
        ]
        if not relevant:
            continue

        neighbours = sim_row.argsort()[::-1][:n_neighbors]

        # Merge neighbour tracks in rank order; dict keys drop duplicates
        # while preserving first-seen order.
        merged = dict.fromkeys(
            uri
            for neighbour in neighbours
            for uri in train_track_lists[neighbour]
        )
        predicted = list(merged)[:k]

        scores.append(precision_at_k(relevant, predicted, k))

    return scores

def parallel_sparse_evaluate(test_playlists, vectorizer, train_vectors, train_track_lists,
                             k=10, n_neighbors=5, batch_size=512, subsample=None, num_workers=None,
                             seed=None):
    """Compute mean precision@k over the test playlists using a process pool.

    Test playlist names are vectorised once, split into index batches of
    ``batch_size`` and each batch is scored by ``_evaluate_batch`` in a worker
    process.

    Args:
        test_playlists: list of dicts with 'name' and 'tracks'.
        vectorizer: fitted object whose ``transform`` turns names into vectors.
        train_vectors: vectorised training playlist names.
        train_track_lists: track lists aligned with ``train_vectors`` rows.
        k: precision cutoff.
        n_neighbors: number of similar training playlists used per query.
        batch_size: number of test playlists scored per task.
        subsample: if given, evaluate on a random sample of at most this many
            test playlists.
        num_workers: pool size; defaults to ``min(cpu_count(), 8)``.
        seed: optional seed for the subsample so repeated runs score the same
            playlists. ``None`` keeps the previous behaviour of drawing from
            the global ``random`` state.

    Returns:
        Mean precision@k over all scored playlists, or ``0.0`` when there is
        nothing to score.
    """
    if subsample is not None:
        sample_size = min(subsample, len(test_playlists))
        if seed is None:
            test_playlists = random.sample(test_playlists, sample_size)
        else:
            # A private generator makes the sample reproducible without
            # touching the global random state.
            test_playlists = random.Random(seed).sample(test_playlists, sample_size)

    # Nothing to evaluate: skip vectorising an empty list and starting a pool.
    if len(test_playlists) == 0:
        return 0.0

    test_names = [p['name'] for p in test_playlists]
    test_vectors = vectorizer.transform(test_names)

    indices = list(range(len(test_playlists)))
    batches = [indices[i:i+batch_size] for i in range(0, len(indices), batch_size)]

    num_workers = num_workers or min(cpu_count(), 8)

    # NOTE(review): the partial carries the full test/train data as bound
    # arguments, so it is presumably serialised along with the tasks sent to
    # the workers — worth confirming if evaluation is slow.
    func = partial(_evaluate_batch,
                   test_playlists=test_playlists,
                   test_vectors=test_vectors,
                   train_vectors=train_vectors,
                   train_track_lists=train_track_lists,
                   k=k,
                   n_neighbors=n_neighbors)

    with Pool(num_workers) as pool:
        # imap yields results in batch order; list() drains it inside the
        # pool context so workers are still alive while results arrive.
        all_precisions = list(tqdm(pool.imap(func, batches), total=len(batches)))

    # Flatten the per-batch lists into one list of scores.
    precisions = [p for batch in all_precisions for p in batch]

    return np.mean(precisions) if precisions else 0.0

In [9]:
# from scipy.sparse import csr_matrix

# train_vectors = name_vectors  # already sparse
# train_track_lists = np.array(train_tracks, dtype=object)

# score = parallel_sparse_evaluate(
#     test_playlists=test_playlists,
#     vectorizer=vectorizer,
#     train_vectors=train_vectors,
#     train_track_lists=train_track_lists,
#     k=10,
#     n_neighbors=3,
#     batch_size=1024,
#     subsample=10000,           # Evaluate on a random sample of 10k test playlists
#     num_workers=cpu_count()             # You can increase if needed
# )

# print(f"Parallel Precision@10: {score:.4f}")


In [14]:
def _recommended_track_names(playlist_name, k=10):
    """Return the track names recommended for a playlist title.

    URIs missing from ``valid_tracks_dict``, or whose metadata has no
    'track_name', are shown as 'unknown_name'.
    """
    return [
        valid_tracks_dict.get(uri, {}).get('track_name', 'unknown_name')
        for uri in recommend_tracks_from_name(playlist_name, k=k)
    ]

# Spot-check the recommender on a few hand-picked playlist titles.
for query in ("Fiesta Latina", "Morning Chill", "Rock", "Dark"):
    print(_recommended_track_names(query))

['La Bala', 'Vivir Mi Vida', 'Periodico De Ayer', 'El Cantante', 'La Vida Es Un Carnaval', 'Bailando - English Version', 'Fireball', 'La Tortura', "Let's Get Loud", 'I Know You Want Me (Calle Ocho)']
['29 #Strafford APTS', 'Wake Up Your Saints', 'Cruel', 'Carrie & Lowell', 'Dark Days', 'Hey Mami', 'Caught Me Thinkin', "We're on Our Way", 'Black Sun', 'Hag']
['State Of My Head', 'Gotta Get Away', 'Can You Feel My Heart', 'Sleepwalking', 'Deathbeds', 'True Friends', 'Life Is Beautiful', 'This Is Gonna Hurt', 'Call Me', 'Better Version']
['Power Trip', 'So Good', '6 Foot 7 Foot', '0 To 100 / The Catch Up', 'HYFR (Hell Ya Fucking Right)', 'Work Out', 'Crooked Smile', 'Swimming Pools (Drank) - Extended Version', 'REVOFEV', "Coastin'"]
