In [1]:
import pickle
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer



In [2]:
input_path = "processed/01_filtered/"

with open(Path(input_path) / "filtered_playlists.pkl", "rb") as f:
    playlists = pickle.load(f)  # list of lists of track_uris

with open(Path(input_path) / "valid_tracks.pkl", "rb") as f:
    valid_tracks_dict = pickle.load(f)  # dict from track_uri -> metadata dict

## Supervised recommender

In [3]:
import pickle
from pathlib import Path
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix
import pandas as pd
from tqdm import tqdm
from collections import defaultdict, Counter
from itertools import combinations
import random
import math


In [4]:
def filter_valid_tracks(playlists, valid_tracks):
    filtered = []
    for pl in tqdm(playlists, total=len(playlists), desc="Filtering playlists"):
        filtered_tracks = [t for t in pl['tracks'] if t in valid_tracks]
        filtered.append({'name': pl['name'], 'tracks': filtered_tracks})
    return filtered

filtered_playlists = filter_valid_tracks(playlists, valid_tracks_dict)

Filtering playlists:   0%|          | 0/774682 [00:00<?, ?it/s]

Filtering playlists: 100%|██████████| 774682/774682 [00:02<00:00, 332536.82it/s]


In [5]:
track_to_playlists = defaultdict(set)
for pid, pl in tqdm(enumerate(filtered_playlists), total=len(filtered_playlists), desc="Building inverted index"):
    for track in pl['tracks']:
        track_to_playlists[track].add(pid)

Building inverted index:   0%|          | 0/774682 [00:00<?, ?it/s]

Building inverted index: 100%|██████████| 774682/774682 [00:01<00:00, 400400.72it/s]


In [6]:
# Recommendation function
def find_similar_playlists(query_tracks, track_to_pl, max_neighbors=500):
    candidate_counts = Counter()
    for track in query_tracks:
        candidate_counts.update(track_to_pl.get(track, set()))
    most_common = candidate_counts.most_common(max_neighbors)
    similar_pids = [pid for pid, count in most_common]
    return similar_pids

def recommend_tracks(query_playlist, train_playlists, track_to_pl, top_k=100, max_neighbors=500):
    query_tracks = query_playlist['tracks']
    similar_pids = find_similar_playlists(query_tracks, track_to_pl, max_neighbors=max_neighbors)
    track_counter = Counter()
    for pid in similar_pids:
        pl = train_playlists[pid]
        track_counter.update(pl['tracks'])
    existing_tracks = set(query_tracks)
    recommendations = [t for t, _ in track_counter.most_common() if t not in existing_tracks]
    return recommendations[:top_k]

In [9]:
recommended_tracks = recommend_tracks(filtered_playlists[0], filtered_playlists, track_to_playlists, top_k=10, max_neighbors=500)
print("Recommended tracks:", recommended_tracks, len(recommended_tracks))

Recommended tracks: ['spotify:track:2a1o6ZejUi8U3wzzOtCOYw', 'spotify:track:7qCAVkHWZkF44OzOUKf8Cr', 'spotify:track:0S5eBnwtkuFB0TQTlVqW6C', 'spotify:track:3QHMxEOAGD51PDlbFPHLyJ', 'spotify:track:2Cd9iWfcOpGDHLz6tVA3G4', 'spotify:track:2Kj7fT9h3jRAYIpRBOjsh9', 'spotify:track:3pDhN3qB33AOPhQEkUCaWt', 'spotify:track:4aWmUDTfIPGksMNLV2rQP2', 'spotify:track:6mICuAdrwEjh6Y6lroV2Kg', 'spotify:track:5N8nNuTmIzkZOfcxXlygUw'] 10


In [10]:
def recommend_tracks_supervised(tracks, k):
    return recommend_tracks(tracks, filtered_playlists, track_to_playlists, top_k=k, max_neighbors=500)

# Recommend coldstart

In [11]:
def filter_valid_tracks(playlists, valid_tracks):
    filtered = []
    for pl in tqdm(playlists, total=len(playlists), desc="Filtering playlists"):
        filtered_tracks = [t for t in pl['tracks'] if t in valid_tracks]
        filtered.append({'name': pl['name'], 'tracks': filtered_tracks})
    return filtered

filtered_playlists_colab = filter_valid_tracks(playlists, valid_tracks_dict)

Filtering playlists: 100%|██████████| 996829/996829 [00:19<00:00, 50836.08it/s]


In [12]:
names = [p['name'] for p in filtered_playlists_colab]
tracks = [p['tracks'] for p in filtered_playlists_colab]
filtered_playlists_colab = [p for p in filtered_playlists_colab if len(p['tracks']) >= 5]
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, token_pattern=r'\b\w+\b')
name_vectors = vectorizer.fit_transform(names)

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_tracks_from_name(name, k=10, n_neighbors=5):
    query_vec = vectorizer.transform([name])
    sims = cosine_similarity(query_vec, name_vectors)[0]
    top_idx = sims.argsort()[::-1][:n_neighbors]

    recommended_tracks = []
    for idx in top_idx:
        recommended_tracks.extend(tracks[idx])

    return list(dict.fromkeys(recommended_tracks))[:k]


In [14]:
def recommend_tracks_collaborative(name, k):
    return recommend_tracks_from_name(name, k=k, n_neighbors=10)[:k]

In [15]:
recommended_tracks = recommend_tracks_collaborative(name="rock", k=10)
print("Recommended tracks:", recommended_tracks, len(recommended_tracks))

Recommended tracks: ['spotify:track:5HNCy40Ni5BZJFw1TKzRsC', 'spotify:track:2TjdnqlpwOjhijHCwHCP2d', 'spotify:track:1TKTiKp3zbNgrBH2IwSwIx', 'spotify:track:0iJfN2CqrX7O8hkzgAMMAf', 'spotify:track:5a4MgIUSf9K8wXLSm6xPEx', 'spotify:track:0R797G9o4tkDfRZj6KMnHS', 'spotify:track:1GEOSS415bZVHNuXWlCT6b', 'spotify:track:0ESdtt9cjGZUkUbaubSrv2', 'spotify:track:7FCiDC7ojdm19CJePq6QAe', 'spotify:track:5OFi9CIyD6s1oENyHapK2W'] 10


## Generate submission

In [21]:
import json
import csv
from tqdm import tqdm

# Path to your challenge set and the output submission file
challenge_set_path = "challenge_set.json"
submission_path = "sample_submission.csv"

NUM_PREDICTIONS = 10000
NUM_TRACKS = 500

ALL_TRACKS = list(valid_tracks_dict.keys())

with open(challenge_set_path, "r") as f:
    challenge_set = json.load(f)
    
def dedup_fast(lst):
    seen = set()
    return [seen.add(x) or x for x in lst if x not in seen]


with open(submission_path, "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["team_info", "a-s-gorski", "adamsebastiangorski@gmail.com"])
    for playlist in tqdm(challenge_set["playlists"], desc="Processing challenge set"):
        pid = playlist["pid"]
        track_uris = [t["track_uri"] for t in playlist["tracks"]]
        name = playlist.get("name", "")
        valid_track_uris = [uri for uri in track_uris if uri in track_to_playlists]
        if valid_track_uris and len(valid_track_uris) >= 5:
            query = {
                "name": name,
                "tracks": valid_track_uris
            }
            predicted_tracks = recommend_tracks_supervised(query, NUM_PREDICTIONS)
            if len(set(predicted_tracks)) < NUM_TRACKS:
                predicted_tracks += recommend_tracks_collaborative(name, NUM_PREDICTIONS)
        else:
            predicted_tracks = recommend_tracks_collaborative(name, NUM_PREDICTIONS)

        if len(set(predicted_tracks)) < NUM_TRACKS:
            predicted_tracks += random.sample(ALL_TRACKS, NUM_PREDICTIONS)
        
        predicted_tracks = [t for t in predicted_tracks if t not in track_uris]
        predicted_tracks = dedup_fast(predicted_tracks)
        predicted_tracks = predicted_tracks[:NUM_TRACKS]
        
        if len(predicted_tracks) < NUM_TRACKS:
            print(f"Warning: Less than {NUM_TRACKS} tracks predicted for playlist {pid}. Found {len(predicted_tracks)} tracks.")
        
        writer.writerow([pid] + predicted_tracks)


Processing challenge set: 100%|██████████| 10000/10000 [07:32<00:00, 22.11it/s]
