From e0e6dfb312045521457393a910fb001f3be45cf2 Mon Sep 17 00:00:00 2001 From: Richard Date: Tue, 19 May 2026 18:02:47 -0700 Subject: [PATCH 1/2] Cache catalog composite objects on disk to speed up startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CatalogBuilder rebuilt the full composite-object list from SQLite on every startup — ~117s of wall time for 151K objects, with ~101s of that hidden behind the deferred background loader. This adds a pickle-based cache at ~/PiFinder_data/cache/catalogs/, written after the background loader finishes and read on subsequent startups. The cache is fingerprinted on the source DB's mtime + size and a CACHE_VERSION constant; user-state `logged` is reset before pickling and re-applied from obs_db on load. On cache hit the background loader is skipped entirely. Also converts ObservationsDatabase.observed_objects_cache from list to set so the per-object check_logged() pass on cache load runs in O(n) rather than O(n*m) — drops the re-apply pass from 3.7s to 275ms. End-to-end startup time on a Pi: 117s -> 10s when the cache is warm. Co-Authored-By: Claude Opus 4.7 (1M context) --- python/PiFinder/catalog_cache.py | 125 +++++++++++++++++++++++++ python/PiFinder/catalogs.py | 90 +++++++++++++----- python/PiFinder/db/observations_db.py | 7 +- python/tests/test_catalog_cache.py | 128 ++++++++++++++++++++++++++ 4 files changed, 325 insertions(+), 25 deletions(-) create mode 100644 python/PiFinder/catalog_cache.py create mode 100644 python/tests/test_catalog_cache.py diff --git a/python/PiFinder/catalog_cache.py b/python/PiFinder/catalog_cache.py new file mode 100644 index 000000000..22759e3a7 --- /dev/null +++ b/python/PiFinder/catalog_cache.py @@ -0,0 +1,125 @@ +"""Pickle-based cache for the output of CatalogBuilder._build_composite. + +Cache layout under ~/PiFinder_data/cache/catalogs/: + composite_objects.pkl — pickled {composite_objects, catalogs_info} + composite_objects.meta.json — fingerprint for invalidation + +The `logged` flag on each CompositeObject is user state; it is reset to False +before pickling and re-applied from the observations DB after load. +""" +from __future__ import annotations + +import json +import logging +import pickle +import sys +from typing import Dict, List, Optional, Tuple + +from PiFinder.composite_object import CompositeObject +from PiFinder.utils import data_dir, pifinder_db + +logger = logging.getLogger("Catalog.Cache") + +# Bump when CompositeObject shape, _create_full_composite_object output, or +# the pickled payload structure changes. +CACHE_VERSION = 1 + +CACHE_DIR = data_dir / "cache" / "catalogs" +PICKLE_PATH = CACHE_DIR / "composite_objects.pkl" +META_PATH = CACHE_DIR / "composite_objects.meta.json" + + +def _fingerprint() -> Dict: + st = pifinder_db.stat() + return { + "cache_version": CACHE_VERSION, + "db_path": str(pifinder_db.resolve()), + "db_mtime_ns": st.st_mtime_ns, + "db_size": st.st_size, + "python_version": f"{sys.version_info.major}.{sys.version_info.minor}", + "pickle_protocol": pickle.HIGHEST_PROTOCOL, + } + + +def load() -> Optional[Tuple[List[CompositeObject], Dict[str, Dict]]]: + """Return (composite_objects, catalogs_info) if cache is valid, else None. + + Returns None on any failure (missing files, stale fingerprint, corrupt pickle). + Resets `logged=False` on returned objects — caller must re-apply from obs_db. + """ + if not PICKLE_PATH.exists() or not META_PATH.exists(): + return None + try: + with META_PATH.open() as f: + stored_meta = json.load(f) + except Exception as e: + logger.warning("Cache meta unreadable, ignoring cache: %s", e) + return None + + current_meta = _fingerprint() + if stored_meta != current_meta: + logger.info( + "Catalog cache fingerprint mismatch; will rebuild. stored=%s current=%s", + stored_meta, + current_meta, + ) + return None + + try: + with PICKLE_PATH.open("rb") as f: + data = pickle.load(f) + composite_objects = data["composite_objects"] + catalogs_info = data["catalogs_info"] + except Exception as e: + logger.warning("Cache pickle unreadable, ignoring cache: %s", e) + return None + + for obj in composite_objects: + obj.logged = False + + return composite_objects, catalogs_info + + +def save( + composite_objects: List[CompositeObject], catalogs_info: Dict[str, Dict] +) -> None: + """Write the cache. Never raises — logs errors instead. + + Strips `logged` to False so the cache is stable across sessions. + Writes the pickle atomically via tmp + rename to avoid torn writes. + """ + try: + CACHE_DIR.mkdir(parents=True, exist_ok=True) + + for obj in composite_objects: + obj.logged = False + + payload = { + "composite_objects": composite_objects, + "catalogs_info": catalogs_info, + } + + tmp_pkl = PICKLE_PATH.with_suffix(".pkl.tmp") + with tmp_pkl.open("wb") as f: + pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL) + tmp_pkl.replace(PICKLE_PATH) + + with META_PATH.open("w") as f: + json.dump(_fingerprint(), f, indent=2) + + logger.info( + "Catalog cache written: %d composite objects -> %s", + len(composite_objects), + PICKLE_PATH, + ) + except Exception as e: + logger.error("Failed to write catalog cache: %s", e, exc_info=True) + + +def clear() -> None: + """Remove cache files. Used by tests and for manual invalidation.""" + for p in (PICKLE_PATH, META_PATH): + try: + p.unlink() + except FileNotFoundError: + pass diff --git a/python/PiFinder/catalogs.py b/python/PiFinder/catalogs.py index dc2694257..9e4db04e1 100644 --- a/python/PiFinder/catalogs.py +++ b/python/PiFinder/catalogs.py @@ -24,6 +24,7 @@ TimerMixin, VirtualIDManager, ) +from PiFinder import catalog_cache logger = logging.getLogger("Catalog") @@ -859,34 +860,73 @@ def build(self, shared_state, ui_queue=None) -> Catalogs: shared_state: Shared state object ui_queue: Optional queue to signal completion (for main loop integration) """ - db: Database = ObjectsDatabase() obs_db: Database = ObservationsDatabase() - # list of dicts, one dict for each entry in the catalog_objects table - catalog_objects: List[Dict] = [dict(row) for row in db.get_catalog_objects()] - objects = db.get_objects() - common_names = Names() - catalogs_info = db.get_catalogs_dict() - objects = {row["id"]: dict(row) for row in objects} + cached = catalog_cache.load() + if cached is not None: + composite_objects, catalogs_info = cached + obs_db.load_observed_objects_cache() + for obj in composite_objects: + obj.logged = obs_db.check_logged(obj) - composite_objects: List[CompositeObject] = self._build_composite( - catalog_objects, objects, common_names, obs_db, ui_queue - ) + self.catalog_dicts = {} + logger.info( + "Loaded %i objects from catalog cache", len(composite_objects) + ) + + all_catalogs: Catalogs = self._get_catalogs( + composite_objects, catalogs_info + ) - # This is used for caching catalog dicts - # to speed up repeated searches - self.catalog_dicts = {} - logger.debug("Loaded %i objects from database", len(composite_objects)) + # No background loader on cache hit — signal completion now. + self._background_loader = None + self._pending_catalogs_ref = all_catalogs + if ui_queue: + try: + ui_queue.put("catalogs_fully_loaded") + except Exception as e: + logger.error(f"Failed to signal catalog completion: {e}") + else: + db: Database = ObjectsDatabase() - all_catalogs: Catalogs = self._get_catalogs(composite_objects, catalogs_info) + # list of dicts, one dict for each entry in the catalog_objects table + catalog_objects: List[Dict] = [ + dict(row) for row in db.get_catalog_objects() + ] + objects = db.get_objects() + common_names = Names() + catalogs_info = db.get_catalogs_dict() + objects = {row["id"]: dict(row) for row in objects} - # Store catalogs reference for background loader completion - self._pending_catalogs_ref = all_catalogs + composite_objects = self._build_composite( + catalog_objects, objects, common_names, obs_db, ui_queue + ) - # Pass background loader reference to Catalogs instance so it can check loading status - # This is set in _build_composite() if there are deferred objects - if hasattr(self, "_background_loader") and self._background_loader is not None: - all_catalogs._background_loader = self._background_loader + # Cache write context for _on_loader_complete + self._cache_priority_objects = list(composite_objects) + self._cache_catalogs_info = catalogs_info + + # This is used for caching catalog dicts + # to speed up repeated searches + self.catalog_dicts = {} + logger.debug("Loaded %i objects from database", len(composite_objects)) + + all_catalogs = self._get_catalogs(composite_objects, catalogs_info) + + # Store catalogs reference for background loader completion + self._pending_catalogs_ref = all_catalogs + + # Pass background loader reference to Catalogs instance so it can check loading status + # This is set in _build_composite() if there are deferred objects + if ( + hasattr(self, "_background_loader") + and self._background_loader is not None + ): + all_catalogs._background_loader = self._background_loader + else: + # No deferred objects — write cache immediately since + # _on_loader_complete will never fire. + catalog_cache.save(list(composite_objects), catalogs_info) # Initialize planet catalog with whatever date we have for now # This will be re-initialized on activation of Catalog ui module # if we have GPS lock @@ -1046,6 +1086,14 @@ def _on_loader_complete( if catalog.catalog_filter: catalog.filter_objects() + # Persist the full composite list (priority + deferred) for next startup. + priority_objects = getattr(self, "_cache_priority_objects", []) or [] + catalogs_info_for_cache = getattr(self, "_cache_catalogs_info", None) + if catalogs_info_for_cache is not None: + catalog_cache.save( + priority_objects + list(loaded_objects), catalogs_info_for_cache + ) + # Signal main loop that catalogs are fully loaded if ui_queue: try: diff --git a/python/PiFinder/db/observations_db.py b/python/PiFinder/db/observations_db.py index e0fc8d6a9..7b532fd7c 100644 --- a/python/PiFinder/db/observations_db.py +++ b/python/PiFinder/db/observations_db.py @@ -122,8 +122,7 @@ def log_object(self, session_uuid, obs_time, catalog, sequence, solution, notes) self.conn.commit() # Update cache so filters reflect the new observation immediately - if (catalog, sequence) not in self.observed_objects_cache: - self.observed_objects_cache.append((catalog, sequence)) + self.observed_objects_cache.add((catalog, sequence)) observation_id = self.cursor.execute( "select last_insert_rowid() as id" @@ -146,9 +145,9 @@ def load_observed_objects_cache(self) -> None: """ (re)Loads the logged object cache """ - self.observed_objects_cache: list[tuple[str, int]] = [ + self.observed_objects_cache: set[tuple[str, int]] = { (x["catalog"], x["sequence"]) for x in self.get_observed_objects() - ] + } def check_logged(self, obj_record: CompositeObject): """ diff --git a/python/tests/test_catalog_cache.py b/python/tests/test_catalog_cache.py new file mode 100644 index 000000000..03d2a4a33 --- /dev/null +++ b/python/tests/test_catalog_cache.py @@ -0,0 +1,128 @@ +"""Tests for PiFinder.catalog_cache.""" +import os +import pickle +import pytest + +from PiFinder import catalog_cache +from PiFinder.composite_object import CompositeObject, MagnitudeObject + + +def _make_obj(seq: int, catalog_code: str = "NGC", logged: bool = False): + obj = CompositeObject( + id=seq, + object_id=seq, + ra=1.0 * seq, + dec=-1.0 * seq, + catalog_code=catalog_code, + sequence=seq, + description=f"obj {seq}", + mag=MagnitudeObject([6.5]), + logged=logged, + ) + return obj + + +@pytest.fixture +def cache_paths(tmp_path, monkeypatch): + """Redirect cache files into tmp_path and provide a fake source DB to fingerprint.""" + fake_db = tmp_path / "pifinder_objects.db" + fake_db.write_bytes(b"\x00" * 128) + + pkl = tmp_path / "composite_objects.pkl" + meta = tmp_path / "composite_objects.meta.json" + + monkeypatch.setattr(catalog_cache, "CACHE_DIR", tmp_path) + monkeypatch.setattr(catalog_cache, "PICKLE_PATH", pkl) + monkeypatch.setattr(catalog_cache, "META_PATH", meta) + monkeypatch.setattr(catalog_cache, "pifinder_db", fake_db) + + return {"db": fake_db, "pkl": pkl, "meta": meta, "dir": tmp_path} + + +@pytest.mark.unit +def test_load_returns_none_when_no_cache(cache_paths): + assert catalog_cache.load() is None + + +@pytest.mark.unit +def test_roundtrip_preserves_objects_and_info(cache_paths): + objs = [_make_obj(i) for i in range(5)] + info = {"NGC": {"desc": "ngc", "max_sequence": 100}} + + catalog_cache.save(objs, info) + loaded = catalog_cache.load() + + assert loaded is not None + out_objs, out_info = loaded + assert len(out_objs) == 5 + assert out_info == info + assert [o.sequence for o in out_objs] == [0, 1, 2, 3, 4] + assert [o.catalog_code for o in out_objs] == ["NGC"] * 5 + + +@pytest.mark.unit +def test_logged_is_not_persisted(cache_paths): + """logged=True must be reset on save so user state doesn't leak across sessions.""" + objs = [_make_obj(i, logged=True) for i in range(3)] + catalog_cache.save(objs, {}) + + with cache_paths["pkl"].open("rb") as f: + payload = pickle.load(f) + + assert all(o.logged is False for o in payload["composite_objects"]) + + # And load() also returns logged=False + loaded = catalog_cache.load() + assert loaded is not None + out_objs, _ = loaded + assert all(o.logged is False for o in out_objs) + + +@pytest.mark.unit +def test_fingerprint_mismatch_invalidates(cache_paths): + """If the source DB changes (mtime or size), the cache must be rejected.""" + objs = [_make_obj(0)] + catalog_cache.save(objs, {}) + assert catalog_cache.load() is not None # sanity + + # Bump the DB mtime and rewrite it to a new size — the cache should now be stale. + cache_paths["db"].write_bytes(b"\x01" * 256) + # Touch mtime to be safe even on fast filesystems. + new_time = cache_paths["db"].stat().st_mtime + 10 + os.utime(cache_paths["db"], (new_time, new_time)) + + assert catalog_cache.load() is None + + +@pytest.mark.unit +def test_corrupt_pickle_returns_none(cache_paths): + catalog_cache.save([_make_obj(0)], {}) + # Sanity check first. + assert catalog_cache.load() is not None + + # Corrupt the pickle without touching the meta file. + cache_paths["pkl"].write_bytes(b"not a pickle") + + assert catalog_cache.load() is None + + +@pytest.mark.unit +def test_corrupt_meta_returns_none(cache_paths): + catalog_cache.save([_make_obj(0)], {}) + cache_paths["meta"].write_text("not json{{") + + assert catalog_cache.load() is None + + +@pytest.mark.unit +def test_clear_removes_files(cache_paths): + catalog_cache.save([_make_obj(0)], {}) + assert cache_paths["pkl"].exists() and cache_paths["meta"].exists() + + catalog_cache.clear() + + assert not cache_paths["pkl"].exists() + assert not cache_paths["meta"].exists() + + # Calling clear when files are already gone must not raise. + catalog_cache.clear() From 8ceb081fdf95c2a8b76333789480ce407688c300 Mon Sep 17 00:00:00 2001 From: Richard Date: Tue, 19 May 2026 18:09:25 -0700 Subject: [PATCH 2/2] Drop catalogs_fully_loaded signal on cache hit Everything is loaded synchronously inside build() when the cache is warm, so the UI never sees a "still loading" state and the "Catalogs Fully Loaded" toast is misleading. Only fire the signal when the background loader actually completes deferred work (cache-miss path). Co-Authored-By: Claude Opus 4.7 (1M context) --- python/PiFinder/catalogs.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python/PiFinder/catalogs.py b/python/PiFinder/catalogs.py index 9e4db04e1..81cf55d44 100644 --- a/python/PiFinder/catalogs.py +++ b/python/PiFinder/catalogs.py @@ -878,14 +878,11 @@ def build(self, shared_state, ui_queue=None) -> Catalogs: composite_objects, catalogs_info ) - # No background loader on cache hit — signal completion now. + # All objects loaded synchronously from cache — no background + # loader, no completion signal (there is nothing for the UI to + # transition from). self._background_loader = None self._pending_catalogs_ref = all_catalogs - if ui_queue: - try: - ui_queue.put("catalogs_fully_loaded") - except Exception as e: - logger.error(f"Failed to signal catalog completion: {e}") else: db: Database = ObjectsDatabase()