In [32]:
import os

# Delete the cached metadata
METADATA_FILE = 'calibre_metadata.json'
EMBEDDING_FILE = 'calibre_book_embeddings.pkl'

if os.path.exists(METADATA_FILE):
    os.remove(METADATA_FILE)
    print(f"✓ Deleted {METADATA_FILE}")

if os.path.exists(EMBEDDING_FILE):
    os.remove(EMBEDDING_FILE)
    print(f"✓ Deleted {EMBEDDING_FILE}")

print("\n✓ Cache cleared. Now re-run the main setup cell.")


✓ Cache cleared. Now re-run the main setup cell.


In [33]:
# Cell 1: Environment Setup (MUST RUN FIRST)
import os
import warnings

# Critical: Set BEFORE any other imports
os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
os.environ['HF_HUB_OFFLINE'] = '1'
os.environ['TRANSFORMERS_OFFLINE'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
warnings.filterwarnings("ignore")

print("✓ Environment configured")

✓ Environment configured


In [34]:

# Install dependencies (run this cell first)
 #pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
 #pip install sentence-transformers scikit-learn tqdm

In [35]:
# Cell 2: Imports
import pickle
import subprocess
import json
import torch
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display, HTML

print(f"PyTorch version: {torch.__version__}")
print(f"MPS available: {torch.backends.mps.is_available()}")

# Configuration
#CALIBRE_LIBRARY_PATH = os.path.expanduser("~/calibre_semantic_test")
CALIBRE_LIBRARY_PATH = os.path.expanduser("~/Calibre Library")
EMBEDDING_FILE = 'calibre_book_embeddings.pkl'
METADATA_FILE = 'calibre_metadata.json'

device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}\n")

PyTorch version: 2.8.0
MPS available: True
Using device: mps



In [36]:
# STEP 1: EXTRACT METADATA FROM CALIBRE
# ============================================================================

def get_calibre_metadata(library_path):
    """
    Extracts metadata from Calibre database using calibredb CLI.
    Returns a dictionary of book_id -> metadata
    """
    print(f"📚 Extracting metadata from: {library_path}")

    cmd = [
        'calibredb', 'list',
        '--library-path', library_path,
        '--for-machine'
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        raise Exception(f"Failed to read Calibre library: {result.stderr}")

    books = json.loads(result.stdout)
    print(f"✓ Found {len(books)} books in library\n")

    return books


def create_searchable_text(book):
    """
    Combines relevant metadata fields into a single searchable text.
    Fields are weighted by importance (repeated = more important).
    """
    parts = []

    # Title (most important - include twice)
    if book.get('title'):
        parts.append(f"Title: {book['title']}")
        parts.append(book['title'])

    # Authors
    if book.get('authors'):
        authors = ', '.join(book['authors']) if isinstance(book['authors'], list) else book['authors']
        parts.append(f"Authors: {authors}")

    # Series
    if book.get('series'):
        parts.append(f"Series: {book['series']}")

    # Tags/Genres (very important for semantic search)
    if book.get('tags'):
        tags = ', '.join(book['tags']) if isinstance(book['tags'], list) else book['tags']
        parts.append(f"Tags: {tags}")
        parts.append(tags)  # Include twice for emphasis

    # Publisher
    if book.get('publisher'):
        parts.append(f"Publisher: {book['publisher']}")

    # Description/Comments (most descriptive - include twice)
    if book.get('comments'):
        import re
        clean_comments = re.sub(r'<[^>]+>', '', book['comments'])
        parts.append(f"Description: {clean_comments}")
        parts.append(clean_comments)

    return " | ".join(parts)


def prepare_metadata_for_embedding(books):
    """
    Prepares book metadata for embedding.
    Returns: dict of book_id -> searchable_text and dict of book_id -> full_metadata
    """
    searchable_texts = {}
    book_metadata = {}

    print("🔄 Preparing metadata for embedding...")
    total = len(books)
    for idx, book in enumerate(books, 1):
        book_id = str(book['id'])
        searchable_texts[book_id] = create_searchable_text(book)
        book_metadata[book_id] = book

        # Simple progress indicator
        if idx % 10 == 0 or idx == total:
            print(f"   Processed {idx}/{total} books...")

    print(f"✓ Processed {len(searchable_texts)} books\n")
    return searchable_texts, book_metadata

In [37]:
# ============================================================================
# STEP 2: LOAD OR CREATE EMBEDDINGS
# ============================================================================

def load_or_create_embeddings(searchable_texts, model):
    """
    Loads existing embeddings or creates new ones using PyTorch.
    """
    if os.path.exists(EMBEDDING_FILE):
        print("📂 Loading existing embeddings from disk...")
        with open(EMBEDDING_FILE, 'rb') as f:
            embeddings_dict = pickle.load(f)
        print(f"✓ Loaded embeddings for {len(embeddings_dict)} books\n")
        return embeddings_dict

    print("🧠 Generating embeddings for book metadata...")
    print(f"   Model: {model}")
    print(f"   Device: {device}")

    book_ids = list(searchable_texts.keys())
    texts_to_embed = list(searchable_texts.values())

    # Generate embeddings with PyTorch backend
    embeddings = model.encode(
        texts_to_embed,
        batch_size=32,
        show_progress_bar=True,
        convert_to_numpy=True,
        device=device
    )

    # Map book IDs to embeddings
    embeddings_dict = {book_id: emb for book_id, emb in zip(book_ids, embeddings)}

    # Save embeddings
    with open(EMBEDDING_FILE, 'wb') as f:
        pickle.dump(embeddings_dict, f)
    print(f"✓ Embeddings saved to '{EMBEDDING_FILE}'\n")

    return embeddings_dict


In [38]:
# ============================================================================
# STEP 3: SEARCH FUNCTIONS
# ============================================================================

def semantic_search(query, embeddings_dict, book_metadata, model, top_n=10):
    """
    Performs semantic search on the book metadata using cosine similarity.
    """
    # Embed the query
    query_embedding = model.encode(query, convert_to_numpy=True, device=device).reshape(1, -1)

    # Calculate similarities
    similarities = {}
    for book_id, book_embedding in embeddings_dict.items():
        book_embedding = book_embedding.reshape(1, -1)
        similarity = cosine_similarity(query_embedding, book_embedding)[0][0]
        similarities[book_id] = similarity

    # Sort by similarity
    sorted_results = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

    # Return top N with full metadata
    results = []
    for book_id, score in sorted_results[:top_n]:
        results.append({
            'score': score,
            'metadata': book_metadata[book_id]
        })

    return results


def display_results_jupyter(results):
    """
    Displays search results in Jupyter with nice formatting.
    """
    display(HTML("<h2>🔍 Search Results</h2>"))

    for i, result in enumerate(results, 1):
        book = result['metadata']
        score = result['score']

        # Build HTML for each book
        html = f"""
        <div style="border: 1px solid #ddd; padding: 15px; margin: 10px 0; border-radius: 5px; background-color: #f9f9f9;">
            <h3 style="margin-top: 0; color: #2c3e50;">
                {i}. {book.get('title', 'Unknown Title')}
                <span style="float: right; color: #27ae60; font-size: 0.8em;">
                    Similarity: {score:.4f}
                </span>
            </h3>
        """

        # Authors
        if book.get('authors'):
            authors = ', '.join(book['authors']) if isinstance(book['authors'], list) else book['authors']
            html += f"<p><strong>Authors:</strong> {authors}</p>"

        # Series
        if book.get('series'):
            series_info = book['series']
            if book.get('series_index'):
                series_info += f" #{book['series_index']}"
            html += f"<p><strong>Series:</strong> {series_info}</p>"

        # Tags
        if book.get('tags'):
            tags = book['tags'] if isinstance(book['tags'], list) else [book['tags']]
            tag_badges = ' '.join([f'<span style="background-color: #3498db; color: white; padding: 2px 8px; border-radius: 3px; font-size: 0.85em; margin-right: 5px;">{tag}</span>' for tag in tags])
            html += f"<p><strong>Tags:</strong> {tag_badges}</p>"

        # Publisher and date
        pub_info = []
        if book.get('publisher'):
            pub_info.append(book['publisher'])
        if book.get('pubdate'):
            pub_info.append(str(book['pubdate']))
        if pub_info:
            html += f"<p><strong>Published:</strong> {', '.join(pub_info)}</p>"

        # Rating
        if book.get('rating'):
            stars = '★' * int(book['rating']) + '☆' * (5 - int(book['rating']))
            html += f"<p><strong>Rating:</strong> {stars}</p>"

        # Description preview
        if book.get('comments'):
            import re
            clean_comments = re.sub(r'<[^>]+>', '', book['comments'])
            preview = clean_comments[:300] + "..." if len(clean_comments) > 300 else clean_comments
            html += f"<p><strong>Description:</strong> <em>{preview}</em></p>"

        html += f"<p style='color: #7f8c8d; font-size: 0.9em;'><strong>Calibre ID:</strong> {book['id']}</p>"
        html += "</div>"

        display(HTML(html))


def display_results_text(results):
    """
    Displays search results in plain text (for non-Jupyter environments).
    """
    print("\n" + "="*80)
    print("🔍 SEARCH RESULTS")
    print("="*80)

    for i, result in enumerate(results, 1):
        book = result['metadata']
        score = result['score']

        print(f"\n{i}. [{score:.4f}] {book.get('title', 'Unknown Title')}")

        if book.get('authors'):
            authors = ', '.join(book['authors']) if isinstance(book['authors'], list) else book['authors']
            print(f"   Authors: {authors}")

        if book.get('series'):
            series_info = book['series']
            if book.get('series_index'):
                series_info += f" #{book['series_index']}"
            print(f"   Series: {series_info}")

        if book.get('tags'):
            tags = ', '.join(book['tags']) if isinstance(book['tags'], list) else book['tags']
            print(f"   Tags: {tags}")

        pub_info = []
        if book.get('publisher'):
            pub_info.append(book['publisher'])
        if book.get('pubdate'):
            pub_info.append(str(book['pubdate']))
        if pub_info:
            print(f"   Published: {', '.join(pub_info)}")

        if book.get('rating'):
            stars = '★' * int(book['rating']) + '☆' * (5 - int(book['rating']))
            print(f"   Rating: {stars}")

        if book.get('comments'):
            import re
            clean_comments = re.sub(r'<[^>]+>', '', book['comments'])
            preview = clean_comments[:200] + "..." if len(clean_comments) > 200 else clean_comments
            print(f"   Description: {preview}")

        print(f"   Calibre ID: {book['id']}")
        print("-" * 80)

In [39]:
# Cell 4: Main Setup
print("="*80)
print("📚 CALIBRE SEMANTIC SEARCH SYSTEM")
print("="*80)
print()

# Load metadata
if os.path.exists(METADATA_FILE):
    print("📂 Loading cached metadata...")
    with open(METADATA_FILE, 'r') as f:
        books = json.load(f)
    print(f"✓ Loaded {len(books)} books\n")
else:
    books = get_calibre_metadata(CALIBRE_LIBRARY_PATH)
    with open(METADATA_FILE, 'w') as f:
        json.dump(books, f)

# Prepare searchable texts
searchable_texts, book_metadata = prepare_metadata_for_embedding(books)

# Load model (offline mode)
print("🤖 Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
print("   ✓ Loaded on CPU")

if device == 'mps':
    model = model.to('mps')
    print("   ✓ Moved to MPS")

print("✓ Model ready\n")

# Load or create embeddings
embeddings_dict = load_or_create_embeddings(searchable_texts, model)

print("="*80)
print("✅ READY TO SEARCH!")
print("="*80)
print()

📚 CALIBRE SEMANTIC SEARCH SYSTEM

📚 Extracting metadata from: /Users/alexchilton/Calibre Library
✓ Found 14806 books in library

🔄 Preparing metadata for embedding...
   Processed 10/14806 books...
   Processed 20/14806 books...
   Processed 30/14806 books...
   Processed 40/14806 books...
   Processed 50/14806 books...
   Processed 60/14806 books...
   Processed 70/14806 books...
   Processed 80/14806 books...
   Processed 90/14806 books...
   Processed 100/14806 books...
   Processed 110/14806 books...
   Processed 120/14806 books...
   Processed 130/14806 books...
   Processed 140/14806 books...
   Processed 150/14806 books...
   Processed 160/14806 books...
   Processed 170/14806 books...
   Processed 180/14806 books...
   Processed 190/14806 books...
   Processed 200/14806 books...
   Processed 210/14806 books...
   Processed 220/14806 books...
   Processed 230/14806 books...
   Processed 240/14806 books...
   Processed 250/14806 books...
   Processed 260/14806 books...
   Process

Batches:   0%|          | 0/463 [00:00<?, ?it/s]

✓ Embeddings saved to 'calibre_book_embeddings.pkl'

✅ READY TO SEARCH!



In [40]:
# Cell 5: Search Function
def search(query, top_n=10, display_mode='jupyter'):
    """
    Main search function.

    Example:
        search("space opera with strong female lead")
        search("cozy mystery small town", top_n=5)
    """
    print(f"Searching for: '{query}'\n")
    results = semantic_search(query, embeddings_dict, book_metadata, model, top_n=top_n)

    if display_mode == 'jupyter':
        display_results_jupyter(results)
    else:
        display_results_text(results)

    return results

print("✓ Search function ready")

✓ Search function ready


In [41]:
# Cell 6: Try your first search!
search("fantasy magic adventure", top_n=5)

Searching for: 'fantasy magic adventure'



[{'score': np.float32(0.57145524),
  'metadata': {'authors': 'Steve Higgs',
   'id': 15800,
   'title': 'Untethered Magic: An Urban Fantasy Thriller'}},
 {'score': np.float32(0.5286398),
  'metadata': {'authors': 'Marian Womack',
   'id': 8326,
   'title': 'On the Nature of Magic'}},
 {'score': np.float32(0.5175878),
  'metadata': {'authors': 'Marina Montesano',
   'id': 6362,
   'title': 'Folklore, Magic, and Witchcraft: Cultural Exchanges From the Twelfth to Eighteenth Century'}},
 {'score': np.float32(0.4866191),
  'metadata': {'authors': 'Simon R. Green',
   'id': 5958,
   'title': 'For Love of Magic'}},
 {'score': np.float32(0.47863525),
  'metadata': {'authors': 'Sarah Gailey',
   'id': 6143,
   'title': 'When We Were Magic'}}]

In [42]:
search("transformers nlp", top_n=5)

Searching for: 'transformers nlp'



[{'score': np.float32(0.77226686),
  'metadata': {'authors': 'Savaş Yıldırım & Meysam Asgari- Chenaghlu',
   'id': 11292,
   'title': 'Mastering Transformers: Build State-Of-The-Art Models From Scratch With Advanced Natural Language Processing Techniques'}},
 {'score': np.float32(0.77226686),
  'metadata': {'authors': 'Savaş Yıldırım & Meysam Asgari-Chenaghlu',
   'id': 18354,
   'title': 'Mastering Transformers: Build State-Of-The-Art Models From Scratch With Advanced Natural Language Processing Techniques'}},
 {'score': np.float32(0.7668112),
  'metadata': {'authors': 'Shashank Mohan Jain',
   'id': 10150,
   'title': 'Introduction to Transformers for NLP: With the Hugging Face Library and Models to Solve Problems'}},
 {'score': np.float32(0.7668112),
  'metadata': {'authors': 'Shashank Mohan Jain',
   'id': 19499,
   'title': 'Introduction to Transformers for NLP: With the Hugging Face Library and Models to Solve Problems'}},
 {'score': np.float32(0.7632168),
  'metadata': {'authors