In [2]:
# Install the MongoDB driver (pymongo) into the running kernel's environment; -q reduces pip output.
# NOTE(review): the version is unpinned — consider pinning (pymongo==X.Y.Z) for reproducible runs.
%pip install -q pymongo

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import pickle
from pathlib import Path

from pymongo import MongoClient, UpdateOne
from tqdm import tqdm


In [4]:
input_path = "processed/01_filtered/"
input_dir = Path(input_path)

# Playlists: list of dicts {'name':..., 'tracks':[track_uris]}
with (input_dir / "filtered_playlists.pkl").open("rb") as fh:
    playlists = pickle.load(fh)

# Container of valid tracks; used below for membership tests when filtering playlists.
with (input_dir / "valid_tracks.pkl").open("rb") as fh:
    valid_tracks_dict = pickle.load(fh)

In [5]:
# Filter playlists to keep only valid tracks
def filter_valid_tracks(playlists, valid_tracks):
    """Return new playlist dicts restricted to tracks found in ``valid_tracks``.

    Args:
        playlists: iterable of dicts, each with a 'name' and a 'tracks' entry.
        valid_tracks: any container supporting ``in`` (membership) checks.

    Returns:
        A list with one ``{'name': ..., 'tracks': [...]}`` dict per input
        playlist, in input order. Track order within a playlist is preserved;
        keys other than 'name' and 'tracks' are not carried over. The input
        playlists are not modified.
    """
    def _restrict(entry):
        # Keep only the tracks that pass the membership check, order intact.
        kept = [uri for uri in entry['tracks'] if uri in valid_tracks]
        return {'name': entry['name'], 'tracks': kept}

    return [_restrict(entry) for entry in playlists]

# Drop every track that is not present in valid_tracks_dict from each loaded playlist.
filtered_playlists = filter_valid_tracks(playlists, valid_tracks_dict)


In [None]:
# Connection string: taken from the MONGO_URI environment variable when set,
# otherwise the local development default (throwaway credentials).
# Supply real credentials through the environment, never in the notebook.
mongo_uri = os.environ.get("MONGO_URI", "mongodb://admin:secret@localhost:27017")
client = MongoClient(mongo_uri)
db = client["spotify_recommender"]
playlists_col = db["playlists"]
track_playlists_col = db["track_playlists"]

# Start from empty collections so re-running the notebook does not fail on
# duplicate _id values during insert or leave stale index entries behind.
# WARNING: this deletes every document in both collections.
playlists_col.delete_many({})
track_playlists_col.delete_many({})



DeleteResult({'n': 0, 'ok': 1.0}, acknowledged=True)

In [9]:
print("Inserting playlists...")

# A playlist's position in filtered_playlists serves as both its _id and its playlist_id.
bulk_playlists = [
    {
        "_id": position,
        "playlist_id": position,
        "name": playlist['name'],
        "tracks": playlist['tracks'],
    }
    for position, playlist in enumerate(tqdm(filtered_playlists))
]

playlists_col.insert_many(bulk_playlists)
print(f"Inserted {len(bulk_playlists)} playlists.")

Inserting playlists...


100%|██████████| 996829/996829 [00:01<00:00, 516204.29it/s] 


Inserted 996829 playlists.


In [10]:
print("Building and inserting inverted index (track_playlists)...")

# Map each track URI to the set of playlist ids that contain it.
# enumerate() yields the same ids that were used as _id / playlist_id when the
# playlists were inserted, so both collections refer to playlists consistently.
track_to_playlists = {}
for pid, pl in enumerate(tqdm(filtered_playlists)):
    for track in pl['tracks']:
        track_to_playlists.setdefault(track, set()).add(pid)

# Prepare one upsert per track, keyed by track URI, so a re-run updates
# existing documents in place instead of failing on duplicate keys.
# "playlists" is stored in ascending id order (rather than set-iteration order);
# "popularity" is the number of distinct playlists containing the track.
bulk_track_playlists = [
    UpdateOne(
        {"_id": track_uri},
        {"$set": {"playlists": sorted(pl_set), "popularity": len(pl_set)}},
        upsert=True,
    )
    for track_uri, pl_set in tqdm(track_to_playlists.items())
]

# Insert in batches (to avoid huge single write)
batch_size = 10000
for i in tqdm(range(0, len(bulk_track_playlists), batch_size)):
    batch = bulk_track_playlists[i:i + batch_size]
    track_playlists_col.bulk_write(batch)

print(f"Inserted/updated {len(bulk_track_playlists)} track_playlists documents.")

Building and inserting inverted index (track_playlists)...


100%|██████████| 996829/996829 [00:00<00:00, 6657596.46it/s]
100%|██████████| 196398/196398 [00:02<00:00, 66458.82it/s] 
100%|██████████| 20/20 [00:21<00:00,  1.09s/it]

Inserted/updated 196398 track_playlists documents.



