# Sonic Index Builder

This notebook builds the audio and semantic index for the EraEx recommendation system.
It fetches tracks from Deezer, downloads previews, computes audio vectors, and encodes metadata into semantic vectors.

In [None]:
import sys
import os

# Make the parent of the current working directory importable so that
# project-local packages can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

import requests
import pickle
import numpy as np
import time
from tqdm.notebook import tqdm

from src.audio.processor import AudioProcessor
from src.audio.semantic import SemanticEncoder

## Configuration

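Set the limit for how many tracks to process. For production, set this to 100,000 or more. Note that the build below seeds from a single Deezer chart request, so the number of tracks actually fetched may be lower than the limit.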

In [None]:
# Upper bound on how many tracks a build run requests and processes.
LIMIT = 50

# The pickled index lives under <project_root>/data/indices/.
OUTPUT_DIR = os.path.join(project_root, 'data', 'indices')
OUTPUT_FILE = os.path.join(OUTPUT_DIR, 'sonic_index.pkl')

# Create the output directory up front; a pre-existing directory is fine.
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Build Logic

In [None]:
def build_index(limit=LIMIT, *, request_timeout=10, rate_limit_delay=0.5):
    """Build the sonic index from Deezer chart tracks and pickle it to OUTPUT_FILE.

    For every fetched track the preview is analyzed into an audio vector and
    the title/artist/album text is encoded into a semantic vector. Only tracks
    for which both vectors are available are stored.

    Args:
        limit: Value sent as the ``limit`` query parameter of the chart request.
        request_timeout: Seconds to wait on the chart request before giving
            up, so a stalled connection cannot hang the build.
        rate_limit_delay: Seconds to pause after each track, whether or not
            processing that track succeeded.

    Returns:
        None. On success the list of track records is written to OUTPUT_FILE;
        if the chart request fails, the error is printed and nothing is
        written.
    """
    print(f"--- Starting Sonic Index Build (Limit: {limit}) ---")

    # 1. Initialize Engines
    print("Initializing Audio Processor...")
    audio_proc = AudioProcessor()
    print("Initializing Semantic Encoder...")
    semantic_enc = SemanticEncoder()

    # 2. Fetch Tracks (Deezer Top Charts as seed)
    # In a real scenario, you might iterate over a list of ISRC codes or more diverse endpoints
    url = "https://api.deezer.com/chart/0/tracks"
    params = {'limit': limit}
    try:
        r = requests.get(url, params=params, timeout=request_timeout)
        r.raise_for_status()
        # `or []` also covers a response whose 'data' field is present but null.
        tracks = r.json().get('data') or []
    except Exception as e:
        # Top-level boundary for the fetch step: report and abort the build.
        print(f"Error fetching tracks: {e}")
        return

    print(f"Fetched {len(tracks)} tracks from Deezer Charts.")

    sonic_data = []

    for track in tqdm(tracks, desc="Processing Tracks"):
        try:
            # A. Audio Analysis
            preview_url = track.get('preview')
            audio_vec = None
            if preview_url:
                result = audio_proc.analyze_url(preview_url)
                if result:
                    audio_vec = result['vector']

            # B. Semantic Analysis
            meta_text = f"{track['title']} by {track['artist']['name']} album {track['album']['title']}"
            semantic_vec = semantic_enc.encode(meta_text)

            # C. Store (only when both vectors were produced)
            if audio_vec is not None and semantic_vec is not None:
                sonic_data.append({
                    'id': track['id'],
                    'title': track['title'],
                    'artist': track['artist']['name'],
                    'audio_vector': audio_vec,
                    'semantic_vector': semantic_vec,
                    'preview': preview_url,
                })
        except Exception as e:
            # Best-effort build: one bad track must not abort the whole run.
            print(f"Skipping track {track.get('title', 'Unknown')}: {e}")

        # Rate limit. Deliberately outside the try/except (and without a
        # `continue` above) so failed tracks are throttled as well.
        time.sleep(rate_limit_delay)

    # 3. Save Index
    with open(OUTPUT_FILE, 'wb') as f:
        pickle.dump(sonic_data, f)

    print("\n--- Build Complete ---")
    print(f"Saved {len(sonic_data)} tracks to {OUTPUT_FILE}")

In [None]:
# Kick off the index build with the configured track limit.
build_index(LIMIT)