# Sonic Index Builder (Vast.ai Ready)

This notebook builds the audio and semantic index for the EraEx recommendation system.
It is configured to run in a cloud environment (like Vast.ai) or locally.

## 🚀 SPEED UPDATE: Parallel Processing
This version uses `ThreadPoolExecutor` to download and process multiple tracks simultaneously.

## Nostalgia Enforcement 🕰️
This version specifically crawls playlists from **2012-2018** and strictly enforces date checks.

## Instructions for Vast.ai Users:
1. Upload the entire `EraEx` folder (or at least `notebooks/`, `src/`, and `requirements.txt`).
2. Run the Setup & Imports cells below.
3. Set `CANDIDATE_LIMIT` to 100,000 to maximize yield after filtering.
4. When done, download `data/indices/sonic_index.pkl`.

In [None]:
%pip install -r ../requirements.txt

In [None]:
import sys
import os
import warnings

# Quiet the environment: drop all Python warnings and set TensorFlow's
# C++ log level variable to '3' before any heavy library is imported.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings('ignore')

# Path setup.
# The notebook is assumed to live in <root>/notebooks/, so the project root
# is the parent of the current working directory. Putting it on sys.path
# makes the 'src' package importable from the cells below.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))

already_importable = project_root in sys.path
if not already_importable:
    sys.path.append(project_root)

print(f"Project Root: {project_root}")
print("System Path Updated.")

In [None]:
import requests
import pickle
import numpy as np
import time
from tqdm.notebook import tqdm

# Import the project-local modules used by the build cells below. They are
# resolved through the project root that the setup cell appended to sys.path.
# On failure the cell only prints a hint and keeps going, so later cells that
# use these names will fail with NameError until the import problem is fixed.
try:
    from src.audio.processor import AudioProcessor
    from src.audio.semantic import SemanticEncoder
    from src.ranking import nostalgia  # Provides NostalgiaFilter, used for release-date (era) checks
    print("‚úÖ Modules imported successfully.")
except ImportError as e:
    # Most likely cause on a fresh cloud box: the 'src' folder was not uploaded.
    print(f"‚ùå Import Error: {e}")
    print("Make sure you uploaded the 'src' folder along with this notebook!")

## Configuration

Set the limit for how many candidates to process. We crawl playlists from 2012-2018.

In [None]:
# Maximum number of candidate tracks to collect per crawled year. Kept very
# high because the downstream filter rejects roughly 98% of candidates.
CANDIDATE_LIMIT = 100000

# Playlist years to crawl: 2012 through 2018 inclusive.
YEARS = list(range(2012, 2019))

# The finished index is written to <project_root>/data/indices/sonic_index.pkl.
OUTPUT_DIR = os.path.join(project_root, 'data', 'indices')
OUTPUT_FILE = os.path.join(OUTPUT_DIR, 'sonic_index.pkl')

# Create the output directory up front so the final save cannot fail on a
# missing folder.
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Build Logic (Playlist Crawling)

In [None]:
def fetch_tracks_from_year(year, limit=CANDIDATE_LIMIT):
    """Collect up to ``limit`` unique tracks from Deezer playlists for a year.

    Builds a shuffled list of playlist search queries (generic "Top/Best/Hits"
    queries plus one "<year> <genre>" query per genre), searches Deezer for
    playlists matching each query, and gathers the first page (max 100) of
    tracks from every playlist not crawled before.

    Args:
        year: Year used to build the playlist search queries.
        limit: Maximum number of tracks to return.

    Returns:
        A list of track dicts as returned by the Deezer API, de-duplicated by
        track ``id``, with at most ``limit`` entries.
    """
    # Without a timeout a single stalled connection would hang the whole crawl.
    request_timeout = 10  # seconds
    search_url = "https://api.deezer.com/search/playlist"

    all_tracks = []
    seen_playlist_ids = set()
    # Maintained incrementally so de-duplication is O(1) per track and also
    # catches duplicates that appear inside a single playlist.
    seen_track_ids = set()

    # 1. Get genres dynamically, falling back to a built-in list on any failure.
    try:
        from src.data.deezer import DeezerCollector
        collector = DeezerCollector()
        raw_genres = collector.get_genres()
        # NOTE(review): assumes get_genres() returns a list of genre-name
        # strings - confirm against DeezerCollector.
        if isinstance(raw_genres, list):
            genres = raw_genres
        else:
            genres = ['pop']
        print(f"Fetched {len(genres)} genres from Deezer.")
    except Exception as e:
        print(f"Could not fetch genres: {e}. Using defaults.")
        genres = ['pop', 'rock', 'rap', 'hip hop', 'jazz', 'metal', 'alternative', 'dance', 'electronic', 'r&b', 'soul', 'reggae', 'indie', 'folk', 'country', 'latin']

    # 2. Build the query list: core queries plus one query per genre.
    search_queries = [f"Top {year}", f"Best of {year}", f"{year} Hits"]
    # Skip tiny genre names (two characters or fewer).
    search_queries.extend(f"{year} {g}" for g in genres if len(g) > 2)

    # Shuffle so an early limit hit still samples a variety of queries.
    np.random.shuffle(search_queries)

    print(f"Generated {len(search_queries)} search queries for {year} (covering all genres).")

    for query in search_queries:
        if len(all_tracks) >= limit:
            break

        # Top 30 playlists for this query.
        try:
            r = requests.get(
                search_url,
                params={'q': query, 'limit': 30},
                timeout=request_timeout,
            )
            playlists = r.json().get('data', [])
        except Exception as e:
            # Best effort: a failed search only costs this one query.
            print(f"  !! Playlist search failed for '{query}': {e}")
            continue

        for pl in playlists:
            if len(all_tracks) >= limit:
                break

            # 3. Get the playlist's tracks. Errors are contained per playlist
            # so one bad playlist does not discard the rest of the query.
            try:
                pl_id = pl['id']
                if pl_id in seen_playlist_ids:
                    continue  # Avoid re-crawling the same playlist
                seen_playlist_ids.add(pl_id)

                print(f"  -> Crawling: {pl.get('title')} ({pl.get('nb_tracks')} tracks)")

                tracks_url = f"https://api.deezer.com/playlist/{pl_id}/tracks"
                r_t = requests.get(
                    tracks_url,
                    params={'limit': 100},
                    timeout=request_timeout,
                )
                pl_tracks = r_t.json().get('data', [])

                for t in pl_tracks:
                    if len(all_tracks) >= limit:
                        break
                    t_id = t.get('id')
                    if t_id is None or t_id in seen_track_ids:
                        continue
                    seen_track_ids.add(t_id)
                    all_tracks.append(t)
            except Exception as e:
                print(f"  !! Skipping playlist after error: {e}")

            # Small pause after every attempted playlist request to stay
            # gentle on the API, including after failures.
            time.sleep(0.1)

    return all_tracks[:limit]

def get_release_date(track):
    """Return the release date for a Deezer track dict, or None if unknown.

    Uses the track's own ``release_date`` field when it is present and
    non-empty; otherwise looks the date up on the track's album via the
    Deezer album endpoint.

    Args:
        track: Track dict from the Deezer API. Expected to carry either a
            ``release_date`` value or an ``album`` dict with an ``id``.

    Returns:
        The release date value as provided by Deezer, or None when the track
        has no usable album reference or the album lookup fails.
    """
    release_date = track.get('release_date')
    if release_date:
        return release_date

    # No usable album reference means there is nothing to look up.
    try:
        alb_id = track['album']['id']
    except (KeyError, TypeError):
        return None

    # Fetch the album to get an accurate date. The timeout keeps a stalled
    # connection from blocking a worker thread indefinitely.
    try:
        r = requests.get(f"https://api.deezer.com/album/{alb_id}", timeout=10)
        return r.json().get('release_date')
    except (requests.RequestException, ValueError, AttributeError):
        # Network failure, non-JSON body, or a JSON body that is not an object.
        return None

def _save_index(sonic_data, path):
    """Pickle ``sonic_data`` to ``path`` via a temp file and an atomic rename."""
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'wb') as f:
        pickle.dump(sonic_data, f)
    # os.replace swaps the file in one step, so an interrupted write never
    # leaves a truncated index at ``path``.
    os.replace(tmp_path, path)


def _process_track(track, year, audio_proc, semantic_enc, nostalgia_filter):
    """Run the era check, audio analysis and semantic encoding for one track.

    Returns:
        A ``(status, record)`` tuple. ``status`` is one of ``'kept'``,
        ``'skipped_era'``, ``'skipped_no_preview'`` or ``'skipped_error'``;
        ``record`` is the index entry dict for ``'kept'`` and None otherwise.
    """
    # Everything runs inside the try so that no exception (including one
    # raised by the era check) can escape through future.result() and abort
    # the whole build.
    try:
        # Strict nostalgia check (may cost one network call for the album).
        r_date = get_release_date(track)
        if not r_date or not nostalgia_filter.is_in_era(r_date):
            return 'skipped_era', None

        preview_url = track.get('preview')
        if not preview_url:
            return 'skipped_no_preview', None

        # A. Audio analysis (network download + CPU).
        result = audio_proc.analyze_url(preview_url)
        audio_vec = result['vector'] if result else None
        if audio_vec is None:
            # Without an audio vector the track cannot be kept, so skip the
            # semantic encoding entirely.
            return 'skipped_error', None

        # B. Semantic analysis of the track metadata.
        meta_text = f"{track['title']} by {track['artist']['name']} album {track['album']['title']}"
        semantic_vec = semantic_enc.encode(meta_text)
        if semantic_vec is None:
            return 'skipped_error', None

        # C. Index record. 'year' is the playlist-query year being crawled,
        # while 'release_date' is the date that passed the era check.
        return 'kept', {
            'id': track['id'],
            'title': track['title'],
            'artist': track['artist']['name'],
            'year': year,
            'release_date': r_date,
            'audio_vector': audio_vec,
            'semantic_vector': semantic_vec,
            'preview': preview_url
        }
    except Exception:
        return 'skipped_error', None


def build_index(limit_per_year=CANDIDATE_LIMIT):
    """Crawl every year in YEARS, analyse the tracks in parallel, save the index.

    For each year, candidate tracks are fetched with
    ``fetch_tracks_from_year``, de-duplicated across years by track id, and
    processed on a 16-thread pool. Kept records are appended to one list
    that is pickled to ``OUTPUT_FILE``. The file is rewritten after every year
    that added tracks, so an interrupted run keeps the years already done.

    Args:
        limit_per_year: Maximum number of candidate tracks fetched per year.

    Returns:
        None. Returns early (without writing anything) if the audio, semantic
        or nostalgia components cannot be initialised.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    print(f"--- Starting Sonic Index Build (Target: 2012-2018) ---")

    # 1. Initialize engines
    try:
        audio_proc = AudioProcessor()
        semantic_enc = SemanticEncoder()
        nostalgia_filter = nostalgia.NostalgiaFilter()
    except Exception as e:
        print(f"Init Error: {e}")
        return

    sonic_data = []
    seen_ids = set()  # track ids already submitted, shared across years

    # 2. Iterate years
    for year in YEARS:
        tracks = fetch_tracks_from_year(year, limit=limit_per_year)
        print(f"[{year}] Found {len(tracks)} candidates. Processing in PARALLEL...")

        # Keep only tracks not already handled in an earlier year.
        unique_candidates = []
        for t in tracks:
            if t['id'] not in seen_ids:
                unique_candidates.append(t)
                seen_ids.add(t['id'])

        counts = {
            'kept': 0,
            'skipped_era': 0,
            'skipped_no_preview': 0,
            'skipped_error': 0,
        }

        # 16 worker threads: the per-track work is dominated by network calls.
        with ThreadPoolExecutor(max_workers=16) as executor:
            futures = [
                executor.submit(
                    _process_track, t, year, audio_proc, semantic_enc, nostalgia_filter
                )
                for t in unique_candidates
            ]

            for future in tqdm(as_completed(futures), total=len(futures), desc=f"Processing {year}"):
                status, data = future.result()
                if status == 'kept':
                    sonic_data.append(data)
                # Any unrecognised status is counted as an error.
                counts[status if status in counts else 'skipped_error'] += 1

        print(f"[{year} Summary] Kept: {counts['kept']} | Out of Era: {counts['skipped_era']} | No Preview: {counts['skipped_no_preview']} | Errors: {counts['skipped_error']}")

        # Checkpoint so a crash or interruption in a later year does not
        # throw away the work done so far.
        if counts['kept']:
            _save_index(sonic_data, OUTPUT_FILE)

    # 3. Save index (always written, even when nothing was kept)
    _save_index(sonic_data, OUTPUT_FILE)

    print(f"\n--- Build Complete ---")
    print(f"Saved {len(sonic_data)} tracks to {OUTPUT_FILE}")

In [None]:
# Run the build. CANDIDATE_LIMIT is applied per year in YEARS, and the
# resulting index is pickled to OUTPUT_FILE.
build_index(limit_per_year=CANDIDATE_LIMIT)

# Optional: rclone command reminder for backing up the finished index.
# Uncomment to run it as a notebook shell command.
# !rclone copy ./data/indices/sonic_index.pkl remote:era_ex_backup/