# Sonic Index Builder - Per-Year Mode

This version saves **one .pkl file per year** (`sonic_2012.pkl`, `sonic_2013.pkl`, etc.).
Build one year per run to stay under the Deezer API's rate limits.

## Usage:
1. Set `YEAR_TO_BUILD` to the year you want (2012, 2013, etc.)
2. Run all cells
3. Download the resulting file, `data/indices/sonic_<YEAR>.pkl` (for example, `data/indices/sonic_2013.pkl`)
4. Repeat for other years

The API will auto-merge all `sonic_*.pkl` files on startup.

In [None]:
# Quietly install the packages listed in the requirements.txt one directory
# above this notebook, using IPython's %pip magic.
%pip install -q -r ../requirements.txt

In [None]:
import sys
import os
import warnings

# Keep notebook output quiet: suppress Python warnings and set the
# TF_CPP_MIN_LOG_LEVEL environment variable (read by TensorFlow's native
# logging, if TensorFlow is used by the project modules).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings('ignore')

# The project root is taken to be the parent of the current working directory;
# it is appended to sys.path (once) so later cells can import from it.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))
already_importable = project_root in sys.path
if not already_importable:
    sys.path.append(project_root)
print(f"Project Root: {project_root}")

In [None]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import pickle
import numpy as np
import time
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

def get_session():
    """Create a ``requests.Session`` that retries failed HTTP(S) requests.

    A single adapter is mounted for both ``http://`` and ``https://``. Its
    retry policy allows up to 5 retries with ``backoff_factor=2`` and treats
    the status codes 429, 500, 502, 503 and 504 as retryable.

    Returns:
        The configured ``requests.Session``.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=5,
            backoff_factor=2,
            status_forcelist=[429, 500, 502, 503, 504],
        )
    )
    http = requests.Session()
    for scheme in ('http://', 'https://'):
        http.mount(scheme, retrying_adapter)
    return http

# One shared, retry-enabled session; the Deezer helpers defined in later cells
# send their requests through this module-level object.
api_session = get_session()

# Project-local modules. NOTE(review): these presumably resolve through the
# project_root entry appended to sys.path in the setup cell - confirm that
# `src` lives directly under the project root.
from src.audio.processor import AudioProcessor
from src.audio.semantic import SemanticEncoder
from src.ranking import nostalgia
print("Modules loaded.")

## ⚠️ SET THE YEAR HERE

In [None]:
# ---------------------------------------------------------------------------
# Per-run configuration: edit YEAR_TO_BUILD before every run. Each run writes
# exactly one index file named after the year.
# ---------------------------------------------------------------------------
YEAR_TO_BUILD = 2013  # <-- Change to 2014, 2015, etc.

# Upper bound on how many candidate tracks the crawl collects for the year.
CANDIDATE_LIMIT = 300_000

# Index files are written to <project_root>/data/indices, created if missing.
OUTPUT_DIR = os.path.join(project_root, 'data', 'indices')
OUTPUT_FILE = os.path.join(OUTPUT_DIR, f'sonic_{YEAR_TO_BUILD}.pkl')
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Genre keywords; each one is combined with the year to form a playlist
# search query in fetch_tracks_for_year.
GENRES = [
    'pop',
    'rock',
    'hip hop',
    'rap',
    'r&b',
    'soul',
    'jazz',
    'blues',
    'electronic',
    'dance',
    'edm',
    'house',
    'techno',
    'dubstep',
    'trance',
    'indie',
    'alternative',
    'metal',
    'punk',
    'country',
    'folk',
    'classical',
    'reggae',
    'latin',
    'k-pop',
    'acoustic',
    'ambient',
    'chill',
    'lo-fi',
    'trap',
]

for banner in (f"Building index for: {YEAR_TO_BUILD}", f"Output: {OUTPUT_FILE}"):
    print(banner)

In [None]:
def fetch_tracks_for_year(year, limit=CANDIDATE_LIMIT, timeout=15):
    """Collect candidate tracks for ``year`` by crawling Deezer playlists.

    Search queries are built from a few generic phrases ("Top <year>", ...)
    plus one "<year> <genre>" query per entry in ``GENRES``, then shuffled.
    For every query the first 30 matching playlists are requested, and for
    every playlist not seen before its first 100 tracks are requested. Tracks
    are de-duplicated by their ``id``.

    Args:
        year: Year used to build the search queries.
        limit: Maximum number of tracks to return; crawling stops once reached.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the crawl.

    Returns:
        A list of track dicts exactly as returned by the Deezer API, at most
        ``limit`` long.

    Failures are best-effort: a request that fails (network error, retries
    exhausted, undecodable body) contributes no items, and the crawl carries
    on with the next playlist or query.
    """
    queries = [f"Top {year}", f"Best of {year}", f"{year} Hits", f"{year} Music"]
    queries.extend(f"{year} {genre}" for genre in GENRES)
    np.random.shuffle(queries)

    all_tracks = []
    # Playlist ids and track ids are tracked separately: they are different
    # kinds of object, so a track must not be dropped merely because its id
    # equals the id of a playlist that was already visited.
    seen_playlists = set()
    seen_tracks = set()

    def fetch_items(url, params):
        """Return the dict entries of the response's ``data`` list, or ``[]``."""
        try:
            payload = api_session.get(url, params=params, timeout=timeout).json()
        except (requests.RequestException, ValueError):
            # Network failure, retries exhausted, or a body that is not JSON.
            return []
        # NOTE(review): an API-level error (e.g. a quota error reported inside
        # the JSON body) presumably arrives without a 'data' key and is treated
        # as "no items" here - confirm against the Deezer API documentation.
        items = payload.get('data') if isinstance(payload, dict) else None
        if not isinstance(items, list):
            return []
        return [item for item in items if isinstance(item, dict)]

    for query in tqdm(queries, desc=f"Crawling {year}"):
        if len(all_tracks) >= limit:
            break
        playlists = fetch_items(
            "https://api.deezer.com/search/playlist", {'q': query, 'limit': 30}
        )
        for playlist in playlists:
            if len(all_tracks) >= limit:
                break
            playlist_id = playlist.get('id')
            if playlist_id is None or playlist_id in seen_playlists:
                continue
            seen_playlists.add(playlist_id)

            playlist_tracks = fetch_items(
                f"https://api.deezer.com/playlist/{playlist_id}/tracks", {'limit': 100}
            )
            for track in playlist_tracks:
                if len(all_tracks) >= limit:
                    break
                track_id = track.get('id')
                if track_id is None or track_id in seen_tracks:
                    continue
                seen_tracks.add(track_id)
                all_tracks.append(track)
    return all_tracks

def get_release_date(track, timeout=15):
    """Return the release date of a Deezer track dict, or ``None`` if unknown.

    If the track itself carries a ``release_date`` key, that value is returned
    as-is. Otherwise the track's album is looked up through the Deezer album
    endpoint and the album's ``release_date`` is returned.

    Args:
        track: Track dict; expected to hold ``album`` -> ``id`` when it has no
            ``release_date`` of its own.
        timeout: Per-request timeout in seconds for the album lookup.

    Returns:
        The release date value from the API, or ``None`` when the track has no
        usable album id, the request fails, or the response has no
        ``release_date``.
    """
    if 'release_date' in track:
        return track['release_date']

    # Without an album id there is nothing to look up; skip the network call.
    album = track.get('album')
    album_id = album.get('id') if isinstance(album, dict) else None
    if album_id is None:
        return None

    try:
        payload = api_session.get(
            f"https://api.deezer.com/album/{album_id}", timeout=timeout
        ).json()
    except (requests.RequestException, ValueError):
        # Network failure, retries exhausted, or a body that is not JSON.
        return None
    return payload.get('release_date') if isinstance(payload, dict) else None

In [None]:
def build_year_index():
    """Build the sonic index for ``YEAR_TO_BUILD`` and pickle it to ``OUTPUT_FILE``.

    Pipeline:
      1. Crawl candidate tracks with ``fetch_tracks_for_year``.
      2. On 8 worker threads, process each track: resolve its release date,
         drop it if ``NostalgiaFilter.is_in_era`` rejects the date or it has
         no preview URL, otherwise analyse the preview audio and encode the
         "<title> by <artist>" string.
      3. Pickle the list of kept entries and print per-outcome counts.

    Each kept entry is a dict with the keys ``id``, ``title``, ``artist``,
    ``year``, ``release_date``, ``audio_vector``, ``semantic_vector`` and
    ``preview``.

    Returns:
        None. The result is written to ``OUTPUT_FILE``.
    """
    print(f"--- Building Index for {YEAR_TO_BUILD} ---")

    audio_proc = AudioProcessor()
    semantic_enc = SemanticEncoder()
    nostalgia_filter = nostalgia.NostalgiaFilter()

    tracks = fetch_tracks_for_year(YEAR_TO_BUILD)
    print(f"Found {len(tracks)} candidates.")

    sonic_data = []
    stats = {'kept': 0, 'era': 0, 'preview': 0, 'error': 0}
    # A handful of failure descriptions, so a large error count is diagnosable.
    error_samples = []
    max_error_samples = 5

    def process(track):
        """Return ``(status, entry)``; ``entry`` is ``None`` unless status is 'kept'."""
        # The whole body is guarded: an exception escaping a worker would be
        # re-raised by ``result()`` below and abort the run before the index
        # is saved, throwing away all completed work.
        try:
            r_date = get_release_date(track)
            if not r_date or not nostalgia_filter.is_in_era(r_date):
                return 'era', None
            preview = track.get('preview')
            if not preview:
                return 'preview', None
            result = audio_proc.analyze_url(preview)
            if not result:
                return 'error', None
            title = track['title']
            artist = track['artist']['name']
            return 'kept', {
                'id': track['id'],
                'title': title,
                'artist': artist,
                # NOTE(review): this is the year being built, not the year
                # parsed from release_date - confirm that is intended.
                'year': YEAR_TO_BUILD,
                'release_date': r_date,
                'audio_vector': result['vector'],
                'semantic_vector': semantic_enc.encode(f"{title} by {artist}"),
                'preview': preview
            }
        except Exception as exc:
            if len(error_samples) < max_error_samples:
                track_id = track.get('id') if isinstance(track, dict) else None
                error_samples.append(f"track {track_id}: {exc!r}")
            return 'error', None

    with ThreadPoolExecutor(max_workers=8) as ex:
        futures = [ex.submit(process, t) for t in tracks]
        for done in tqdm(as_completed(futures), total=len(tracks), desc="Processing"):
            status, data = done.result()
            stats[status] = stats.get(status, 0) + 1
            if data is not None:
                sonic_data.append(data)

    with open(OUTPUT_FILE, 'wb') as f:
        pickle.dump(sonic_data, f)

    print("\n--- Done ---")
    print(f"Kept: {stats['kept']} | Out of Era: {stats['era']} | No Preview: {stats['preview']} | Errors: {stats['error']}")
    print(f"Saved to: {OUTPUT_FILE}")
    if error_samples:
        print("Sample errors:")
        for sample in error_samples:
            print(f"  {sample}")

In [None]:
# Run the build for the YEAR_TO_BUILD configured above; the resulting index is
# written to OUTPUT_FILE.
build_year_index()