In [1]:
%pip install -q qdrant-client python-dotenv aiohttp nest_asyncio

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pickle
from pathlib import Path
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
import os
from dotenv import load_dotenv

In [2]:
load_dotenv()
qdrant_api_key = os.getenv("QDRANT_API_KEY")

In [3]:
input_path = "processed/01_filtered/"

with open(Path(input_path) / "filtered_playlists.pkl", "rb") as f:
    playlists = pickle.load(f)  # list of lists of track_uris

with open(Path(input_path) / "valid_tracks.pkl", "rb") as f:
    valid_tracks_dict = pickle.load(f)  # dict from track_uri -> metadata dict

In [4]:
def filter_valid_tracks(playlists, valid_tracks):
    filtered = []
    for pl in tqdm(playlists, total=len(playlists), desc="Filtering playlists"):
        filtered_tracks = [t for t in pl['tracks'] if t in valid_tracks]
        filtered.append({'name': pl['name'], 'tracks': filtered_tracks})
    return filtered

filtered_playlists = filter_valid_tracks(playlists, valid_tracks_dict)
filtered_playlists = [p for p in filtered_playlists if len(p['tracks']) >= 5]

Filtering playlists: 100%|██████████| 996829/996829 [00:20<00:00, 49833.52it/s]


In [6]:
# --- Split data ---
train_playlists, test_playlists = train_test_split(filtered_playlists, test_size=0.1, random_state=42)
train_names = [p['name'] for p in train_playlists]
train_tracks = [p['tracks'] for p in train_playlists]

# --- Fit TF-IDF Vectorizer ---
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english', lowercase=True, token_pattern=r'\b\w+\b')
name_vectors = vectorizer.fit_transform(train_names)

In [7]:
import joblib

joblib.dump(vectorizer, "vectorizer.pkl")

['vectorizer.pkl']

In [7]:
print(name_vectors.shape[1])

1000


In [None]:
QDRANT_URL = "https://ede2a3a4-b5e3-418b-8b33-c78597d0190d.europe-west3-0.gcp.cloud.qdrant.io"
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
COLLECTION_NAME = "playlists"
VECTOR_DIM = 768           # Adjust if your real vectors differ
NUM_PLAYLISTS = 1000       # For testing, reduce to avoid crashes
BATCH_SIZE = 300           # Batch size for uploads
MAX_CONCURRENCY = 4        # Simultaneous batches

client = QdrantClient(
    url="localhost",
    port=6333
)

collection_name = "playlists"


# Recreate collection
if collection_name in [c.name for c in client.get_collections().collections]:
    client.delete_collection(collection_name)

client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=name_vectors.shape[1], distance=Distance.COSINE)
)

  client.recreate_collection(


True

In [9]:
collection_info = client.get_collection(collection_name=collection_name)
vectors_config = collection_info
print(collection_info)


status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=0 segments_count=8 config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=1000, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None,

In [10]:
import nest_asyncio
import asyncio

nest_asyncio.apply()  # allow nested event loops in notebook

In [11]:
from qdrant_client import AsyncQdrantClient
from qdrant_client.http.models import PointStruct as AsyncPointStruct
from tqdm.notebook import tqdm

BATCH_SIZE = 256

async def async_upload_qdrant(name_vectors, train_names, train_tracks):
    QDRANT_URL = "https://ede2a3a4-b5e3-418b-8b33-c78597d0190d.europe-west3-0.gcp.cloud.qdrant.io"
    QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
    print(f"Qdrant API Key: {QDRANT_API_KEY}")
    # async_client = AsyncQdrantClient(    
    #     url="localhost",
    #     port=6333)
    async_client = AsyncQdrantClient(    
        url=QDRANT_URL,
        api_key=QDRANT_API_KEY)
    total = len(train_names)

    sem = asyncio.Semaphore(8)  # limit concurrency

    async def upload_batch(start_idx):
        async with sem:
            batch_names = train_names[start_idx:start_idx + BATCH_SIZE]
            batch_vectors = name_vectors[start_idx:start_idx + BATCH_SIZE]
            batch_tracks = train_tracks[start_idx:start_idx + BATCH_SIZE]

            points = []
            for j, (name, vec, tracks) in enumerate(zip(batch_names, batch_vectors, batch_tracks)):
                idx = start_idx + j
                vector = vec.toarray().flatten().tolist()
                payload = {
                    "name": name,
                    "tracks": tracks
                }
                points.append(AsyncPointStruct(id=idx, vector=vector, payload=payload))

            await async_client.upsert(collection_name=collection_name, points=points)

    tasks = [upload_batch(i) for i in range(0, total, BATCH_SIZE)]

    for f in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Uploading to Qdrant"):
        await f

    await async_client.close()

In [24]:
client = QdrantClient(
        url=QDRANT_URL,
        api_key=QDRANT_API_KEY,
    )

query_vec = name_vectors[0].toarray().flatten().tolist()
n_neighbors = 5

search_result = client.query_points(
        collection_name=COLLECTION_NAME,
        query=query_vec,
        limit=n_neighbors,
        timeout=60,
    )

print("Search results:", search_result)    

Search results: points=[ScoredPoint(id=27422, version=0, score=0.0, payload={'name': 'Pre-game', 'tracks': ['spotify:track:1djfwy9FvafpK4KhznjjeI', 'spotify:track:2Zy7XVdxyZQB8xp5xbpJdl', 'spotify:track:75Y0HwWBQAJInFaKb10Ogh', 'spotify:track:3vR6ivBSe0SWntHyEzh4X1', 'spotify:track:4EpDZxSApKaod3PVqT5ben', 'spotify:track:25rc1PFgPvtV6GFSloyOI0', 'spotify:track:2AGUFka8kBWCM47h5uTlDb', 'spotify:track:0UAmPUdpTRq8w54B3H8TzY', 'spotify:track:7i5mufbqmLZ4Ae7JnmLqp6', 'spotify:track:6QwwsJC57SitPcWuyEpuwd', 'spotify:track:2n5gVJ9fzeX2SSWlLQuyS9', 'spotify:track:32SFvZODywPBVDYSYtqxCj', 'spotify:track:7xVLFuuYdAvcTfcP3IG3dS', 'spotify:track:7rSERmjAT38lC5QhJ8hnQc', 'spotify:track:7GZ6edN5wxF9icp7wIUoR8', 'spotify:track:68LR4RQ7JLHwRodWxPOB0k', 'spotify:track:6t2eIONH4Sax3R21QWiKNp', 'spotify:track:5mB1bWU8VueYVxuvcLPn2B', 'spotify:track:4bdJHQp90ymjEIkkYhcENB', 'spotify:track:2tT0H3A0hNTPbJFRoiZc6x', 'spotify:track:6ekV6MGnPTuCOnjB4VcWGc', 'spotify:track:0B8QzDH7YWih85V5SEMnyJ', 'spotify:tra

In [36]:
print(search_result.points[0].payload['tracks'])

['spotify:track:1djfwy9FvafpK4KhznjjeI', 'spotify:track:2Zy7XVdxyZQB8xp5xbpJdl', 'spotify:track:75Y0HwWBQAJInFaKb10Ogh', 'spotify:track:3vR6ivBSe0SWntHyEzh4X1', 'spotify:track:4EpDZxSApKaod3PVqT5ben', 'spotify:track:25rc1PFgPvtV6GFSloyOI0', 'spotify:track:2AGUFka8kBWCM47h5uTlDb', 'spotify:track:0UAmPUdpTRq8w54B3H8TzY', 'spotify:track:7i5mufbqmLZ4Ae7JnmLqp6', 'spotify:track:6QwwsJC57SitPcWuyEpuwd', 'spotify:track:2n5gVJ9fzeX2SSWlLQuyS9', 'spotify:track:32SFvZODywPBVDYSYtqxCj', 'spotify:track:7xVLFuuYdAvcTfcP3IG3dS', 'spotify:track:7rSERmjAT38lC5QhJ8hnQc', 'spotify:track:7GZ6edN5wxF9icp7wIUoR8', 'spotify:track:68LR4RQ7JLHwRodWxPOB0k', 'spotify:track:6t2eIONH4Sax3R21QWiKNp', 'spotify:track:5mB1bWU8VueYVxuvcLPn2B', 'spotify:track:4bdJHQp90ymjEIkkYhcENB', 'spotify:track:2tT0H3A0hNTPbJFRoiZc6x', 'spotify:track:6ekV6MGnPTuCOnjB4VcWGc', 'spotify:track:0B8QzDH7YWih85V5SEMnyJ', 'spotify:track:1Uh36HwT2nXXw1nF0OXa34', 'spotify:track:4afMG2PlvFfX976bZJvHmb', 'spotify:track:5oyUurs28XRagLa8lgm6V3',

In [None]:
print(search_result.points.tracks)

In [None]:
from qdrant_client import QdrantClient
import os
from sklearn.feature_extraction.text import TfidfVectorizer

QDRANT_URL = "https://ede2a3a4-b5e3-418b-8b33-c78597d0190d.europe-west3-0.gcp.cloud.qdrant.io"
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
COLLECTION_NAME = "playlists"

# Assumes you have a trained vectorizer and valid_tracks_dict
# vectorizer: TfidfVectorizer = ...  # your trained TF-IDF vectorizer

def recommend_from_qdrant(playlist_name, k=10, n_neighbors=5):
    query_vec = vectorizer.transform([playlist_name]).toarray()[0]

    client = QdrantClient(
        url=QDRANT_URL,
        api_key=QDRANT_API_KEY,
    )

    search_result = client.query_points(
        collection_name=COLLECTION_NAME,
        query=query_vec,
        limit=n_neighbors,
        timeout=60,
    )

    # Aggregate recommended tracks
    recommended_tracks = []
    for point in search_result.points:
        tracks = point.payload.get("tracks", [])
        recommended_tracks.extend(tracks)

    # Deduplicate while preserving order
    return list(dict.fromkeys(recommended_tracks))[:k]


In [43]:
import asyncio

recommended_rock = recommend_from_qdrant("latin", k=10)
print([valid_tracks_dict.get(uri, {}).get('track_name', 'unknown') for uri in recommended_rock])


['Una Vaina Loca', 'Lovumba', 'Ai Se Eu Te Pego (Nossa Nossa)', 'Danza Kuduro', 'Pégate Más', 'Pura Vida', 'Limbo', 'Moviendo Caderas', "Echa Pa'lla (Manos Pa'rriba)", 'La Botella (feat. Cali Y El Dandee)']


In [42]:
import joblib
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']