# EraEx: Build FAISS + BM25 Indexes

This notebook builds:
1. FAISS IVF+PQ indexes for dense retrieval
2. BM25 index for sparse retrieval

**Supports**: Google Colab, Vast.ai, Local

In [15]:
from pathlib import Path
import os

try:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_DIR = Path('/content/drive/MyDrive/EraEx')
except ImportError:
    env_dir = os.environ.get('ERAEX_DIR', '')
    if env_dir and Path(env_dir).exists():
        PROJECT_DIR = Path(env_dir)
    elif Path.cwd().name == 'notebooks':
        PROJECT_DIR = Path.cwd().parent
    else:
        PROJECT_DIR = Path.cwd()

print(f"Project: {PROJECT_DIR}")

Project: c:\Users\Yabuku\Downloads\EraEx


In [16]:
%pip install -r C:\Users\Yabuku\Downloads\EraEx\requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
import numpy as np
import polars as pl
import faiss
import pickle
import gc

EMBEDDINGS_DIR = PROJECT_DIR / 'data' / 'embeddings'
READY_DIR = PROJECT_DIR / 'data' / 'processed' / 'music_ready'
INDEXES_DIR = PROJECT_DIR / 'data' / 'indexes'

INDEXES_DIR.mkdir(parents=True, exist_ok=True)

YEAR_RANGE = range(2012, 2019)

N_LIST = 256
M_PQ = 48
N_BITS = 8
ADD_CHUNK = 500_000

print(f"Embeddings: {EMBEDDINGS_DIR}")
print(f"Indexes: {INDEXES_DIR}\n")

for y in YEAR_RANGE:
    p = EMBEDDINGS_DIR / f'embeddings_{y}.npy'
    if p.exists():
        emb = np.load(p, mmap_mode='r')
        print(f"  {y}: {emb.shape}")
    else:
        print(f"  {y}: NOT FOUND")

Embeddings: c:\Users\Yabuku\Downloads\EraEx\data\embeddings
Indexes: c:\Users\Yabuku\Downloads\EraEx\data\indexes

  2012: (12537701, 768)
  2013: (10420728, 768)
  2014: (2417280, 768)
  2015: (2293643, 768)
  2016: (1948712, 768)
  2017: (1600543, 768)
  2018: (1823593, 768)


## 1. Build FAISS Indexes

In [18]:
def build_faiss_index(year):
    emb_path = EMBEDDINGS_DIR / f'embeddings_{year}.npy'
    idx_path = INDEXES_DIR / f'faiss_{year}.index'

    if not emb_path.exists():
        print(f'{year}: No embeddings')
        return

    if idx_path.exists():
        print(f'{year}: Index exists, skipping')
        return

    print(f'\nBuilding FAISS index for {year}...')
    emb_mmap = np.load(emb_path, mmap_mode='r')
    n_vectors, dim = emb_mmap.shape
    print(f'  Vectors: {n_vectors:,} x {dim}')

    n_train = min(n_vectors, 100_000)
    train_idx = np.random.choice(n_vectors, n_train, replace=False)
    train_data = emb_mmap[train_idx].astype(np.float32)

    n_list_actual = min(N_LIST, n_vectors // 50)
    m_pq_actual = min(M_PQ, dim)

    quantizer = faiss.IndexFlatIP(dim)
    index = faiss.IndexIVFPQ(quantizer, dim, n_list_actual, m_pq_actual, N_BITS)

    print(f'  Training (n_list={n_list_actual}, m_pq={m_pq_actual})...')
    index.train(train_data)
    del train_data
    gc.collect()

    print(f'  Adding vectors in chunks of {ADD_CHUNK:,}...')
    for start in range(0, n_vectors, ADD_CHUNK):
        end = min(start + ADD_CHUNK, n_vectors)
        chunk = emb_mmap[start:end].astype(np.float32)
        index.add(chunk)
        del chunk
        gc.collect()
        print(f'    {end:,} / {n_vectors:,}')

    del emb_mmap
    gc.collect()

    faiss.write_index(index, str(idx_path))
    print(f'  Saved: {idx_path}')

In [19]:
for year in YEAR_RANGE:
    build_faiss_index(year)

print('\nFAISS indexes complete!')


Building FAISS index for 2012...
  Vectors: 12,537,701 x 768
  Training (n_list=256, m_pq=48)...
  Adding vectors in chunks of 500,000...
    500,000 / 12,537,701
    1,000,000 / 12,537,701
    1,500,000 / 12,537,701
    2,000,000 / 12,537,701
    2,500,000 / 12,537,701
    3,000,000 / 12,537,701
    3,500,000 / 12,537,701
    4,000,000 / 12,537,701
    4,500,000 / 12,537,701
    5,000,000 / 12,537,701
    5,500,000 / 12,537,701
    6,000,000 / 12,537,701
    6,500,000 / 12,537,701
    7,000,000 / 12,537,701
    7,500,000 / 12,537,701
    8,000,000 / 12,537,701
    8,500,000 / 12,537,701
    9,000,000 / 12,537,701
    9,500,000 / 12,537,701
    10,000,000 / 12,537,701
    10,500,000 / 12,537,701
    11,000,000 / 12,537,701
    11,500,000 / 12,537,701
    12,000,000 / 12,537,701
    12,500,000 / 12,537,701
    12,537,701 / 12,537,701
  Saved: c:\Users\Yabuku\Downloads\EraEx\data\indexes\faiss_2012.index

Building FAISS index for 2013...
  Vectors: 10,420,728 x 768
  Training (n_list=25

## 2. Build BM25 Index

In [20]:
import bm25s

def build_bm25_index():
    bm25_path = INDEXES_DIR / 'bm25_index.pkl'
    
    if bm25_path.exists():
        print('BM25 index exists, skipping')
        return
    
    print('Loading all documents for BM25...')
    all_docs = []
    all_ids = []
    
    for year in YEAR_RANGE:
        data_path = READY_DIR / f'year={year}' / 'data.parquet'
        if not data_path.exists():
            continue
        
        df = pl.read_parquet(data_path)
        texts = df['doc_text_music'].to_list()
        id_col = 'track_id' if 'track_id' in df.columns else 'permalink_url'
        ids = df[id_col].to_list()
        
        all_docs.extend([t if t else '' for t in texts])
        all_ids.extend([str(i) for i in ids])
        
        print(f'  {year}: {len(texts):,} docs')
    
    print(f'\nTotal documents: {len(all_docs):,}')
    
    print('Building BM25 index...')
    corpus_tokens = bm25s.tokenize(all_docs)
    
    bm25 = bm25s.BM25()
    bm25.index(corpus_tokens)
    
    index_data = {
        'bm25': bm25,
        'doc_ids': all_ids,
        'corpus_tokens': corpus_tokens,
    }
    
    with open(bm25_path, 'wb') as f:
        pickle.dump(index_data, f)
    
    print(f'Saved: {bm25_path}')

In [21]:
build_bm25_index()

print('\n' + '=' * 50)
print('ALL INDEXES COMPLETE')
print('=' * 50)

for f in sorted(INDEXES_DIR.glob('*')):
    size_mb = f.stat().st_size / 1e6
    print(f'{f.name}: {size_mb:.1f} MB')

Loading all documents for BM25...
  2012: 12,537,701 docs
  2013: 10,420,728 docs
  2014: 2,417,280 docs
  2015: 2,293,643 docs
  2016: 1,948,712 docs
  2017: 1,600,543 docs
  2018: 1,823,593 docs

Total documents: 33,042,200
Building BM25 index...


Split strings:   0%|          | 0/33042200 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/33042200 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/33042200 [00:00<?, ?it/s]

Saved: c:\Users\Yabuku\Downloads\EraEx\data\indexes\bm25_index.pkl

ALL INDEXES COMPLETE
bm25_index.pkl: 8831.4 MB
faiss_2012.index: 703.7 MB
faiss_2013.index: 585.1 MB
faiss_2014.index: 136.9 MB
faiss_2015.index: 130.0 MB
faiss_2016.index: 110.7 MB
faiss_2017.index: 91.2 MB
faiss_2018.index: 103.7 MB


## 3. Test Search

In [23]:
from transformers import AutoTokenizer, AutoModel
import torch

test_query = 'i miss my ex'
print(f'Test query: "{test_query}"')

tok = AutoTokenizer.from_pretrained('colbert-ir/colbertv2.0')
mdl = AutoModel.from_pretrained('colbert-ir/colbertv2.0')
mdl.eval()

inputs = tok([test_query], return_tensors='pt', truncation=True, max_length=32)
with torch.no_grad():
    out = mdl(**inputs)
    mask = inputs['attention_mask'].unsqueeze(-1).float()
    pooled = (out.last_hidden_state * mask).sum(1) / mask.sum(1)
    pooled = pooled / (pooled.norm(dim=1, keepdim=True) + 1e-9)
q_emb = pooled.numpy().astype(np.float32)

year = 2015
idx_path = INDEXES_DIR / f'faiss_{year}.index'
if idx_path.exists():
    index = faiss.read_index(str(idx_path))
    index.nprobe = 10
    scores, indices = index.search(q_emb, 5)

    ids_df = pl.read_parquet(EMBEDDINGS_DIR / f'ids_{year}.parquet')
    track_ids = ids_df['track_id'].to_list()

    data_path = READY_DIR / f'year={year}' / 'data.parquet'
    df = pl.read_parquet(data_path)

    print(f'\nTop 5 from {year}:')
    for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
        idx = int(idx)
        title = df['title'][idx] if idx < df.height else 'N/A'
        artist = df['artist'][idx] if idx < df.height else 'N/A'
        print(f'  {i+1}. {title} - {artist} (score: {score:.4f})')

Test query: "i miss my ex"


tokenizer_config.json:   0%|          | 0.00/405 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]


Top 5 from 2015:
  1. Your Gonna Miss Me When Im Gone - None (score: 0.5763)
  2. youll miss me when im gone - None (score: 0.5768)
  3. You re Gonna Miss My Love - None (score: 0.5802)
  4. You missed me so much - None (score: 0.5802)
  5. You missed me so much - None (score: 0.5802)
