From ad244e3399a733dda031bf7703226a94b88c3c61 Mon Sep 17 00:00:00 2001 From: Donghao Ren Date: Thu, 28 Aug 2025 21:29:45 -0700 Subject: [PATCH 1/3] refactor: [breaking] use Intl.Segmenter to generate cluster labels --- package-lock.json | 581 +++++++++++++++++- packages/component/package.json | 1 + .../src/demo/EmbeddingViewDemo.svelte | 4 +- .../embedding_view/EmbeddingViewImpl.svelte | 9 +- .../embedding_view/EmbeddingViewMosaic.svelte | 74 ++- .../lib/embedding_view/embedding_view_api.ts | 4 +- .../component/src/lib/embedding_view/theme.ts | 2 +- .../worker/clustering.worker.js | 32 +- .../src/lib/embedding_view/worker/index.ts | 24 +- .../embedding_view/worker/worker_functions.ts | 29 + .../lib/text_summarizer/text_summarizer.ts | 253 ++++---- packages/density-clustering/package.json | 7 +- packages/docs/embedding-view.md | 4 +- .../src/svelte/EmbeddingViewExample.svelte | 4 +- 14 files changed, 875 insertions(+), 153 deletions(-) diff --git a/package-lock.json b/package-lock.json index e5e990a..747aa5e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -3556,6 +3556,29 @@ "dev": true, "license": "MIT" }, + "node_modules/asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true + }, + "node_modules/axios": { + "version": "1.11.0", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.11.0.tgz", + "integrity": "sha512-1Lx3WLFQWm3ooKDYZD1eXmoGO9fxYQjrycfHFC8P0sCfQVXyROp0p9PFWBehewBOdCwHc+f/b8I0fMto5eSfwA==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true, + "dependencies": { + "follow-redirects": "^1.15.6", + "form-data": "^4.0.4", + "proxy-from-env": "^1.1.0" + } + }, "node_modules/axobject-query": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/axobject-query/-/axobject-query-4.1.0.tgz", @@ -3573,6 +3596,117 @@ "dev": true, "license": "MIT" }, + "node_modules/binary-install": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/binary-install/-/binary-install-1.1.2.tgz", + "integrity": "sha512-ZS2cqFHPZOy4wLxvzqfQvDjCOifn+7uCPqNmYRIBM/03+yllON+4fNnsD0VJdW0p97y+E+dTRNPStWNqMBq+9g==", + "deprecated": "Package no longer supported. Contact Support at https://www.npmjs.com/support for more info.", + "dev": true, + "license": "MIT", + "dependencies": { + "axios": "^0.26.1", + "rimraf": "^3.0.2", + "tar": "^6.1.11" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/binary-install/node_modules/axios": { + "version": "0.26.1", + "resolved": "https://registry.npmjs.org/axios/-/axios-0.26.1.tgz", + "integrity": "sha512-fPwcX4EvnSHuInCMItEhAGnaSEXRBjtzh9fOtsE6E1G6p7vl7edEeZe11QHf18+6+9gR5PbKV/sGKNaD8YaMeA==", + "dev": true, + "license": "MIT", + "dependencies": { + "follow-redirects": "^1.14.8" + } + }, + "node_modules/binary-install/node_modules/chownr": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/chownr/-/chownr-2.0.0.tgz", + "integrity": "sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==", + "dev": true, + "license": "ISC", + "engines": { + "node": ">=10" + } + }, + "node_modules/binary-install/node_modules/minipass": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/minipass/-/minipass-5.0.0.tgz", + "integrity": "sha512-3FnjYuehv9k6ovOEbyOswadCDPX1piCfhV8ncmYtHOjuPwylVWsghTLo7rabjC3Rx5xD4HDx8Wm1xnMF7S5qFQ==", + "dev": true, + "license": "ISC", + "engines": { + "node": ">=8" + } + }, + "node_modules/binary-install/node_modules/minizlib": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/minizlib/-/minizlib-2.1.2.tgz", + "integrity": "sha512-bAxsR8BVfj60DWXHE3u30oHzfl4G7khkSuPW+qvpd7jFRHm7dLxOjUk1EHACJ/hxLY8phGJ0YhYHZo7jil7Qdg==", + "dev": true, + "license": "MIT", + "dependencies": { + "minipass": "^3.0.0", + "yallist": "^4.0.0" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/binary-install/node_modules/minizlib/node_modules/minipass": { + "version": "3.3.6", + "resolved": "https://registry.npmjs.org/minipass/-/minipass-3.3.6.tgz", + "integrity": "sha512-DxiNidxSEK+tHG6zOIklvNOwm3hvCrbUrdtzY74U6HKTJxvIDfOUL5W5P2Ghd3DTkhhKPYGqeNUIh5qcM4YBfw==", + "dev": true, + "license": "ISC", + "dependencies": { + "yallist": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/binary-install/node_modules/mkdirp": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-1.0.4.tgz", + "integrity": "sha512-vVqVZQyf3WLx2Shd0qJ9xuvqgAyKPLAiqITEtqW0oIUjzo3PePDd6fW9iFz30ef7Ysp/oiWqbhszeGWW2T6Gzw==", + "dev": true, + "license": "MIT", + "bin": { + "mkdirp": "bin/cmd.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/binary-install/node_modules/tar": { + "version": "6.2.1", + "resolved": "https://registry.npmjs.org/tar/-/tar-6.2.1.tgz", + "integrity": "sha512-DZ4yORTwrbTj/7MZYq2w+/ZFdI6OZ/f9SFHR+71gIVUZhOQPHzVCLpvRnPgyaMpfWxxk/4ONva3GQSyNIKRv6A==", + "dev": true, + "license": "ISC", + "dependencies": { + "chownr": "^2.0.0", + "fs-minipass": "^2.0.0", + "minipass": "^5.0.0", + "minizlib": "^2.1.1", + "mkdirp": "^1.0.3", + "yallist": "^4.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/binary-install/node_modules/yallist": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", + "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", + "dev": true, + "license": "ISC" + }, "node_modules/binary-search-bounds": { "version": "2.0.5", "resolved": "https://registry.npmjs.org/binary-search-bounds/-/binary-search-bounds-2.0.5.tgz", @@ -3620,6 +3754,22 @@ "node": ">=8" } }, + "node_modules/call-bind-apply-helpers": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", + "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true, + "dependencies": { + "es-errors": "^1.3.0", + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/ccount": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/ccount/-/ccount-2.0.1.tgz", @@ -3763,6 +3913,21 @@ "simple-swizzle": "^0.2.2" } }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true, + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, "node_modules/comma-separated-tokens": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/comma-separated-tokens/-/comma-separated-tokens-2.0.3.tgz", @@ -3845,6 +4010,13 @@ "dev": true, "license": "MIT" }, + "node_modules/concat-map": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", + "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", + "dev": true, + "license": "MIT" + }, "node_modules/confbox": { "version": "0.2.2", "resolved": "https://registry.npmjs.org/confbox/-/confbox-0.2.2.tgz", @@ -4380,6 +4552,18 @@ "robust-predicates": "^3.0.2" } }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true, + "engines": { + "node": ">=0.4.0" + } + }, "node_modules/dequal": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz", @@ -4438,6 +4622,23 @@ "resolved": "packages/docs", "link": true }, + "node_modules/dunder-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", + "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true, + "dependencies": { + "call-bind-apply-helpers": "^1.0.1", + "es-errors": "^1.3.0", + "gopd": "^1.2.0" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/eastasianwidth": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", @@ -4515,6 +4716,39 @@ "node": ">= 0.4" } }, + "node_modules/es-object-atoms": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", + "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true, + "dependencies": { + "es-errors": "^1.3.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-set-tostringtag": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", + "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true, + "dependencies": { + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", + "has-tostringtag": "^1.0.2", + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/es6-error": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/es6-error/-/es6-error-4.1.1.tgz", @@ -4786,6 +5020,27 @@ "tabbable": "^6.2.0" } }, + "node_modules/follow-redirects": { + "version": "1.15.11", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.11.tgz", + "integrity": "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ==", + "dev": true, + "funding": [ + { + "type": "individual", + "url": "https://github.com/sponsors/RubenVerborgh" + } + ], + "license": "MIT", + "engines": { + "node": ">=4.0" + }, + "peerDependenciesMeta": { + "debug": { + "optional": true + } + } + }, "node_modules/foreground-child": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.1.tgz", @@ -4802,6 +5057,25 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/form-data": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.4.tgz", + "integrity": "sha512-KrGhL9Q4zjj0kiUt5OO4Mr/A/jlI2jDYs5eHBpYHPcBEVSiipAvn2Ko2HnPe20rmcuuvMHNdZFp+4IlGTMF0Ow==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true, + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "es-set-tostringtag": "^2.1.0", + "hasown": "^2.0.2", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, "node_modules/fs-extra": { "version": "11.3.1", "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-11.3.1.tgz", @@ -4817,6 +5091,46 @@ "node": ">=14.14" } }, + "node_modules/fs-minipass": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/fs-minipass/-/fs-minipass-2.1.0.tgz", + "integrity": "sha512-V/JgOLFCS+R6Vcq0slCuaeWEdNC3ouDlJMNIsacH2VtALiu9mV4LPrHc5cDl8k5aw6J8jwgWWpiTo5RYhmIzvg==", + "dev": true, + "license": "ISC", + "dependencies": { + "minipass": "^3.0.0" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/fs-minipass/node_modules/minipass": { + "version": "3.3.6", + "resolved": "https://registry.npmjs.org/minipass/-/minipass-3.3.6.tgz", + "integrity": "sha512-DxiNidxSEK+tHG6zOIklvNOwm3hvCrbUrdtzY74U6HKTJxvIDfOUL5W5P2Ghd3DTkhhKPYGqeNUIh5qcM4YBfw==", + "dev": true, + "license": "ISC", + "dependencies": { + "yallist": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/fs-minipass/node_modules/yallist": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", + "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", + "dev": true, + "license": "ISC" + }, + "node_modules/fs.realpath": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", + "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", + "dev": true, + "license": "ISC" + }, "node_modules/fsevents": { "version": "2.3.3", "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", @@ -4842,6 +5156,49 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/get-intrinsic": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", + "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true, + "dependencies": { + "call-bind-apply-helpers": "^1.0.2", + "es-define-property": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", + "function-bind": "^1.1.2", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "math-intrinsics": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", + "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true, + "dependencies": { + "dunder-proto": "^1.0.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/gh-pages": { "version": "6.3.0", "resolved": "https://registry.npmjs.org/gh-pages/-/gh-pages-6.3.0.tgz", @@ -5042,6 +5399,39 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/has-symbols": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", + "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-tostringtag": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", + "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true, + "dependencies": { + "has-symbols": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/hasown": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", @@ -5163,6 +5553,25 @@ "node": ">=8" } }, + "node_modules/inflight": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", + "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==", + "deprecated": "This module is not supported, and leaks memory. Do not use it. Check out lru-cache if you want a good and tested way to coalesce async requests by a key value, which is much more comprehensive and powerful.", + "dev": true, + "license": "ISC", + "dependencies": { + "once": "^1.3.0", + "wrappy": "1" + } + }, + "node_modules/inherits": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", + "dev": true, + "license": "ISC" + }, "node_modules/internmap": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/internmap/-/internmap-2.0.3.tgz", @@ -5772,6 +6181,18 @@ "node": ">=10" } }, + "node_modules/math-intrinsics": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", + "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/mdast-util-to-hast": { "version": "13.2.0", "resolved": "https://registry.npmjs.org/mdast-util-to-hast/-/mdast-util-to-hast-13.2.0.tgz", @@ -5912,6 +6333,33 @@ "node": ">=8.6" } }, + "node_modules/mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.35", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", + "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true, + "dependencies": { + "mime-db": "1.52.0" + }, + "engines": { + "node": ">= 0.6" + } + }, "node_modules/mini-svg-data-uri": { "version": "1.4.4", "resolved": "https://registry.npmjs.org/mini-svg-data-uri/-/mini-svg-data-uri-1.4.4.tgz", @@ -6093,6 +6541,16 @@ "node": ">= 0.4" } }, + "node_modules/once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", + "dev": true, + "license": "ISC", + "dependencies": { + "wrappy": "1" + } + }, "node_modules/oniguruma-to-es": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/oniguruma-to-es/-/oniguruma-to-es-3.1.1.tgz", @@ -6241,6 +6699,16 @@ "node": ">=8" } }, + "node_modules/path-is-absolute": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", + "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/path-key": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz", @@ -6490,6 +6958,15 @@ "node": ">=12.0.0" } }, + "node_modules/proxy-from-env": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", + "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==", + "dev": true, + "license": "MIT", + "optional": true, + "peer": true + }, "node_modules/publint": { "version": "0.3.12", "resolved": "https://registry.npmjs.org/publint/-/publint-0.3.12.tgz", @@ -6717,6 +7194,69 @@ "dev": true, "license": "MIT" }, + "node_modules/rimraf": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz", + "integrity": "sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==", + "deprecated": "Rimraf versions prior to v4 are no longer supported", + "dev": true, + "license": "ISC", + "dependencies": { + "glob": "^7.1.3" + }, + "bin": { + "rimraf": "bin.js" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/rimraf/node_modules/brace-expansion": { + "version": "1.1.12", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", + "integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==", + "dev": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "node_modules/rimraf/node_modules/glob": { + "version": "7.2.3", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz", + "integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==", + "deprecated": "Glob versions prior to v9 are no longer supported", + "dev": true, + "license": "ISC", + "dependencies": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.1.1", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + }, + "engines": { + "node": "*" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/rimraf/node_modules/minimatch": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", + "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", + "dev": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^1.1.7" + }, + "engines": { + "node": "*" + } + }, "node_modules/roarr": { "version": "2.15.4", "resolved": "https://registry.npmjs.org/roarr/-/roarr-2.15.4.tgz", @@ -7103,6 +7643,20 @@ "dev": true, "license": "BSD-3-Clause" }, + "node_modules/stemmer": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/stemmer/-/stemmer-2.0.1.tgz", + "integrity": "sha512-bkWvSX2JR4nSZFfs113kd4C6X13bBBrg4fBKv2pVdzpdQI2LA5pZcWzTFNdkYsiUNl13E4EzymSRjZ0D55jBYg==", + "dev": true, + "license": "MIT", + "bin": { + "stemmer": "cli.js" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/streamlit-component-lib": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/streamlit-component-lib/-/streamlit-component-lib-2.0.0.tgz", @@ -8820,6 +9374,20 @@ } } }, + "node_modules/wasm-pack": { + "version": "0.13.1", + "resolved": "https://registry.npmjs.org/wasm-pack/-/wasm-pack-0.13.1.tgz", + "integrity": "sha512-P9exD4YkjpDbw68xUhF3MDm/CC/3eTmmthyG5bHJ56kalxOTewOunxTke4SyF8MTXV6jUtNjXggPgrGmMtczGg==", + "dev": true, + "hasInstallScript": true, + "license": "MIT OR Apache-2.0", + "dependencies": { + "binary-install": "^1.0.1" + }, + "bin": { + "wasm-pack": "run.js" + } + }, "node_modules/webpack-virtual-modules": { "version": "0.6.2", "resolved": "https://registry.npmjs.org/webpack-virtual-modules/-/webpack-virtual-modules-0.6.2.tgz", @@ -8939,6 +9507,13 @@ "url": "https://github.com/chalk/ansi-styles?sponsor=1" } }, + "node_modules/wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", + "dev": true, + "license": "ISC" + }, "node_modules/yallist": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz", @@ -8993,6 +9568,7 @@ "publint": "^0.3.12", "quickselect": "^3.0.0", "simplify-js": "^1.2.4", + "stemmer": "^2.0.1", "svelte": "^5.37.3", "typescript": "^5.9.2", "vite": "^7.0.6", @@ -9012,7 +9588,10 @@ }, "packages/density-clustering": { "name": "@embedding-atlas/density-clustering", - "version": "0.0.0" + "version": "0.0.0", + "devDependencies": { + "wasm-pack": "^0.13.1" + } }, "packages/docs": { "devDependencies": { diff --git a/packages/component/package.json b/packages/component/package.json index d0cb413..db33e40 100644 --- a/packages/component/package.json +++ b/packages/component/package.json @@ -47,6 +47,7 @@ "publint": "^0.3.12", "quickselect": "^3.0.0", "simplify-js": "^1.2.4", + "stemmer": "^2.0.1", "svelte": "^5.37.3", "typescript": "^5.9.2", "vite": "^7.0.6", diff --git a/packages/component/src/demo/EmbeddingViewDemo.svelte b/packages/component/src/demo/EmbeddingViewDemo.svelte index 4ca9409..ce5fb26 100644 --- a/packages/component/src/demo/EmbeddingViewDemo.svelte +++ b/packages/component/src/demo/EmbeddingViewDemo.svelte @@ -37,8 +37,8 @@ return { x: data.x[minIndex], y: data.y[minIndex], text: dataset[minIndex].text, fields: {} }; } - async function queryClusterLabels(rects: Rectangle[]): Promise { - return "label"; + async function queryClusterLabels(clusters: Rectangle[][]): Promise<(string | null)[]> { + return clusters.map(() => "label"); } diff --git a/packages/component/src/lib/embedding_view/EmbeddingViewImpl.svelte b/packages/component/src/lib/embedding_view/EmbeddingViewImpl.svelte index 6ce64f4..2447f11 100644 --- a/packages/component/src/lib/embedding_view/EmbeddingViewImpl.svelte +++ b/packages/component/src/lib/embedding_view/EmbeddingViewImpl.svelte @@ -18,7 +18,7 @@ totalCount: number | null; maxDensity: number | null; automaticLabels: AutomaticLabelsConfig | boolean; - queryClusterLabels: ((rects: Rectangle[]) => Promise) | null; + queryClusterLabels: ((clusters: Rectangle[][]) => Promise<(string | null)[]>) | null; tooltip: Selection | null; selection: Selection[] | null; querySelection: ((x: number, y: number, unitDistance: number) => Promise) | null; @@ -595,12 +595,11 @@ let newClusters = await generateClusters(renderer, 10, viewport); newClusters = newClusters.concat(await generateClusters(renderer, 5, viewport)); - statusMessage = "Generating labels (initializing)..."; + statusMessage = "Generating labels..."; if (queryClusterLabels) { + let labels = await queryClusterLabels(newClusters.map((x) => x.rects)); for (let i = 0; i < newClusters.length; i++) { - let label = await queryClusterLabels(newClusters[i].rects); - newClusters[i].label = label; - statusMessage = `Generating labels (${(((i + 1) / newClusters.length) * 100).toFixed(0)}%)...`; + newClusters[i].label = labels[i]; } } diff --git a/packages/component/src/lib/embedding_view/EmbeddingViewMosaic.svelte b/packages/component/src/lib/embedding_view/EmbeddingViewMosaic.svelte index 4ce76a3..0411bbd 100644 --- a/packages/component/src/lib/embedding_view/EmbeddingViewMosaic.svelte +++ b/packages/component/src/lib/embedding_view/EmbeddingViewMosaic.svelte @@ -6,7 +6,6 @@ import EmbeddingViewImpl from "./EmbeddingViewImpl.svelte"; - import { TextSummarizer } from "../text_summarizer/text_summarizer.js"; import { deepEquals, type Point, type Rectangle, type ViewportState } from "../utils.js"; import type { EmbeddingViewMosaicProps } from "./embedding_view_mosaic_api.js"; import { @@ -17,6 +16,12 @@ } from "./mosaic_client.js"; import { makeClient } from "./mosaic_helper.js"; import type { DataPoint, DataPointID } from "./types.js"; + import { + textSummarizerAdd, + textSummarizerCreate, + textSummarizerDestroy, + textSummarizerSummarize, + } from "./worker/index.js"; let { coordinator = defaultCoordinator(), @@ -328,20 +333,63 @@ } // Cluster Labels - let textSummarizer = $derived( - text != null ? new TextSummarizer({ coordinator: coordinator, table: table, x: x, y: y, text: text }) : null, - ); - - async function queryClusterLabels(rects: Rectangle[]): Promise { - if (textSummarizer == null) { - return null; + async function queryClusterLabels(clusters: Rectangle[][]): Promise<(string | null)[]> { + if (text == null) { + return clusters.map(() => null); } - let list = await textSummarizer.summarize(rects, 4); - if (list.length > 0) { - return list.slice(0, 2).join("-") + "-\n" + list.slice(2).join("-"); - } else { - return null; + // Infer binning parameters + let result: any = await coordinator.query( + SQL.Query.from(table).select({ + xMin: SQL.min(SQL.column(x)), + yMin: SQL.min(SQL.column(y)), + xDiff: SQL.sub(SQL.quantile(SQL.column(x), 0.99), SQL.quantile(SQL.column(x), 0.01)), + yDiff: SQL.sub(SQL.quantile(SQL.column(y), 0.99), SQL.quantile(SQL.column(y), 0.01)), + }), + ); + let { xMin, yMin, xDiff, yDiff } = result.get(0); + let binning = { xMin: xMin, yMin: yMin, xStep: xDiff / 200, yStep: yDiff / 200 }; + // Create text summarizer (in the worker) + let summarizer = await textSummarizerCreate({ binning: binning, regions: clusters }); + // Add text data to the summarizer + let start = 0; + let chunkSize = 10000; + let lastAdd: Promise | null = null; + while (true) { + let r: any = await coordinator.query( + SQL.Query.from(table) + .select({ x: SQL.column(x), y: SQL.column(y), text: SQL.column(text) }) + .offset(start) + .limit(chunkSize), + ); + let data = { + x: r.getChild("x").toArray(), + y: r.getChild("y").toArray(), + text: r.getChild("text").toArray(), + }; + if (lastAdd != null) { + await lastAdd; + } + lastAdd = textSummarizerAdd(summarizer, data); + if (r.getChild("text").length < chunkSize) { + break; + } + start += chunkSize; + } + if (lastAdd != null) { + await lastAdd; } + let summarizeResult = await textSummarizerSummarize(summarizer); + await textSummarizerDestroy(summarizer); + + return summarizeResult.map((words) => { + if (words.length == 0) { + return null; + } else if (words.length > 2) { + return words.slice(0, 2).join("-") + "-\n" + words.slice(2).join("-"); + } else { + return words.join("-"); + } + }); } diff --git a/packages/component/src/lib/embedding_view/embedding_view_api.ts b/packages/component/src/lib/embedding_view/embedding_view_api.ts index 2282aa1..d35a15b 100644 --- a/packages/component/src/lib/embedding_view/embedding_view_api.ts +++ b/packages/component/src/lib/embedding_view/embedding_view_api.ts @@ -66,8 +66,8 @@ export interface EmbeddingViewProps { /** A function to query selected point given (x, y) location, and a unit distance (distance of 1pt in data units). */ querySelection?: ((x: number, y: number, unitDistance: number) => Promise) | null; - /** A function that returns a summary label for points covered by the union of the given rectangles. */ - queryClusterLabels?: ((rects: Rectangle[]) => Promise) | null; + /** A function that returns summary labels for clusters. Each cluster is given by a list of rectangles that approximate its shape. */ + queryClusterLabels?: ((clusters: Rectangle[][]) => Promise<(string | null)[]>) | null; /** A callback for when viewportState changes. */ onViewportState?: ((value: ViewportState) => void) | null; diff --git a/packages/component/src/lib/embedding_view/theme.ts b/packages/component/src/lib/embedding_view/theme.ts index 08d5eee..aae2c09 100644 --- a/packages/component/src/lib/embedding_view/theme.ts +++ b/packages/component/src/lib/embedding_view/theme.ts @@ -31,7 +31,7 @@ const defaultThemeConfig: { light: ThemeConfig; dark: ThemeConfig } = { }, dark: { fontFamily: "system-ui,sans-serif", - clusterLabelColor: "#fff", + clusterLabelColor: "#ccc", clusterLabelOutlineColor: "rgba(0,0,0,0.8)", clusterLabelOpacity: 0.8, statusBar: true, diff --git a/packages/component/src/lib/embedding_view/worker/clustering.worker.js b/packages/component/src/lib/embedding_view/worker/clustering.worker.js index 4690e26..1b2c67b 100644 --- a/packages/component/src/lib/embedding_view/worker/clustering.worker.js +++ b/packages/component/src/lib/embedding_view/worker/clustering.worker.js @@ -1,16 +1,32 @@ // Copyright (c) 2025 Apple Inc. Licensed under MIT License. -import { dynamicLabelPlacement, findClusters } from "./worker_functions.js"; +import { + dynamicLabelPlacement, + findClusters, + textSummarizerAdd, + textSummarizerCreate, + textSummarizerDestroy, + textSummarizerSummarize, +} from "./worker_functions.js"; + +/** @type Record any> */ +let functions = { + dynamicLabelPlacement, + findClusters, + textSummarizerCreate, + textSummarizerAdd, + textSummarizerDestroy, + textSummarizerSummarize, +}; onmessage = async (msg) => { - if (msg.data.name == "findClusters") { - let args = msg.data.payload; - let clusters = await findClusters(args.density_map, args.width, args.height, args.options); - postMessage({ id: msg.data.id, payload: clusters }); - } - if (msg.data.name == "dynamicLabelPlacement") { + if (functions[msg.data.name]) { + let func = functions[msg.data.name]; let args = msg.data.payload; - let result = dynamicLabelPlacement(args.labels, args.options); + let result = func(...args); + if (result instanceof Promise) { + result = await result; + } postMessage({ id: msg.data.id, payload: result }); } }; diff --git a/packages/component/src/lib/embedding_view/worker/index.ts b/packages/component/src/lib/embedding_view/worker/index.ts index 7770777..aef12a4 100644 --- a/packages/component/src/lib/embedding_view/worker/index.ts +++ b/packages/component/src/lib/embedding_view/worker/index.ts @@ -40,10 +40,26 @@ function invokeWorker(name: string, payload: any, transfer: Transferable[] = []) type PromiseReturn any> = (...args: Parameters) => Promise>; -export let findClusters: PromiseReturn = (density_map, width, height, options) => { - return invokeWorker("findClusters", { density_map, width, height, options: options }, [density_map.buffer]); +export let findClusters: PromiseReturn = (densityMap, width, height, options) => { + return invokeWorker("findClusters", [densityMap, width, height, options], [densityMap.buffer]); }; -export let dynamicLabelPlacement: PromiseReturn = (labels, options) => { - return invokeWorker("dynamicLabelPlacement", { labels, options }); +export let dynamicLabelPlacement: PromiseReturn = (...args) => { + return invokeWorker("dynamicLabelPlacement", args); +}; + +export let textSummarizerCreate: PromiseReturn = (...args) => { + return invokeWorker("textSummarizerCreate", args); +}; + +export let textSummarizerDestroy: PromiseReturn = (...args) => { + return invokeWorker("textSummarizerDestroy", args); +}; + +export let textSummarizerAdd: PromiseReturn = (...args) => { + return invokeWorker("textSummarizerAdd", args); +}; + +export let textSummarizerSummarize: PromiseReturn = (...args) => { + return invokeWorker("textSummarizerSummarize", args); }; diff --git a/packages/component/src/lib/embedding_view/worker/worker_functions.ts b/packages/component/src/lib/embedding_view/worker/worker_functions.ts index b4484f3..b6fe570 100644 --- a/packages/component/src/lib/embedding_view/worker/worker_functions.ts +++ b/packages/component/src/lib/embedding_view/worker/worker_functions.ts @@ -2,5 +2,34 @@ import { findClusters } from "@embedding-atlas/density-clustering"; import { dynamicLabelPlacement } from "../../dynamic_label_placement/dynamic_label_placement.js"; +import { TFIDFSummarizer } from "../../text_summarizer/text_summarizer.js"; +import type { Rectangle } from "../../utils.js"; export { dynamicLabelPlacement, findClusters }; + +let textSummarizers = new Map(); + +export function textSummarizerCreate(options: { + binning: { xMin: number; xStep: number; yMin: number; yStep: number }; + regions: Rectangle[][]; + stopWords?: string[]; +}) { + let key = new Date().getTime() + "-" + Math.random(); + textSummarizers.set(key, new TFIDFSummarizer(options)); + return key; +} + +export function textSummarizerDestroy(key: string) { + return textSummarizers.delete(key); +} + +export function textSummarizerAdd( + key: string, + data: { x: ArrayLike; y: ArrayLike; text: ArrayLike }, +) { + textSummarizers.get(key)?.add(data); +} + +export function textSummarizerSummarize(key: string) { + return textSummarizers.get(key)?.summarize() ?? []; +} diff --git a/packages/component/src/lib/text_summarizer/text_summarizer.ts b/packages/component/src/lib/text_summarizer/text_summarizer.ts index 166944d..61f129e 100644 --- a/packages/component/src/lib/text_summarizer/text_summarizer.ts +++ b/packages/component/src/lib/text_summarizer/text_summarizer.ts @@ -1,98 +1,135 @@ // Copyright (c) 2025 Apple Inc. Licensed under MIT License. -import { column, literal, sql } from "@uwdata/mosaic-sql"; +import { stemmer } from "stemmer"; import type { Rectangle } from "../utils.js"; -import { stopWords } from "./stop_words.js"; - -/** A text summarizer based on c-TF-IDF, all implemented as SQL queries. */ -export class TextSummarizer { - private coordinator: any; - private tableName: string; - private xColumn: string; - private yColumn: string; - private textColumn: string; - private derivedTableDF: string; - private derivedTableBins: string; - private initialized: boolean; - private xBinSize: number; - private yBinSize: number; - private x0: number; - private y0: number; - - constructor(options: { coordinator: any; table: string; text: string; x: string; y: string }) { - this.coordinator = options.coordinator; - this.tableName = options.table; - this.xColumn = options.x; - this.yColumn = options.y; - this.textColumn = options.text; - - this.derivedTableDF = this.tableName + "_df"; - this.derivedTableBins = this.tableName + "_bt"; - this.initialized = false; - this.xBinSize = 1; - this.yBinSize = 1; - this.x0 = 0; - this.y0 = 0; +import { stopWords as defaultStopWords } from "./stop_words.js"; + +/** A text summarizer based on c-TF-IDF (https://arxiv.org/pdf/2203.05794) */ +export class TFIDFSummarizer { + private segmenter: Intl.Segmenter; + private binning: XYBinning; + private stopWords: Set; + private key2RegionIndices: Map; + private frequencyPerClass: Map[]; + private frequencyAll: Map; + + /** Create a new TFIDFSummarizer */ + constructor(options: { + binning: { xMin: number; xStep: number; yMin: number; yStep: number }; + regions: Rectangle[][]; + stopWords?: string[]; + }) { + this.binning = new XYBinning( + options.binning.xMin, + options.binning.yMin, + options.binning.xStep, + options.binning.yStep, + ); + this.segmenter = new Intl.Segmenter(undefined, { granularity: "word" }); + this.stopWords = new Set(options.stopWords ?? defaultStopWords); + + this.frequencyPerClass = options.regions.map(() => new Map()); + this.frequencyAll = new Map(); + + // Generate key2RegionIndices, a map from xy key to region index + this.key2RegionIndices = new Map(); + for (let i = 0; i < options.regions.length; i++) { + let keys = this.binning.keys(options.regions[i]); + for (let k of keys) { + let v = this.key2RegionIndices.get(k); + if (v != null) { + v.push(i); + } else { + this.key2RegionIndices.set(k, [i]); + } + } + } } - private async initialize(): Promise { - if (this.initialized) { - return; + /** Add data to the summarizer */ + add(data: { x: ArrayLike; y: ArrayLike; text: ArrayLike }) { + for (let i = 0; i < data.text.length; i++) { + let key = this.binning.key(data.x[i], data.y[i]); + let indices = this.key2RegionIndices.get(key); + if (indices == null) { + continue; + } + for (let s of this.segmenter.segment(data.text[i])) { + let word = s.segment.toLowerCase().trim(); + if (word.length > 1) { + for (let idx of indices) { + incrementMap(this.frequencyPerClass[idx], word); + } + incrementMap(this.frequencyAll, word); + } + } } - let xColumn = column(this.xColumn); - let yColumn = column(this.yColumn); - let textColumn = column(this.textColumn); - - let r = await this.coordinator.query(sql` - SELECT - MIN(${xColumn}) AS xMin, QUANTILE_CONT(${xColumn}, 0.99) - QUANTILE_CONT(${xColumn}, 0.01) AS xDiff, - MIN(${yColumn}) AS yMin, QUANTILE_CONT(${yColumn}, 0.99) - QUANTILE_CONT(${yColumn}, 0.01) AS yDiff, - COUNT(*) AS count - FROM ${this.tableName} - `); - let { xMin, yMin, xDiff, yDiff, count } = r.get(0); - - this.x0 = xMin; - this.y0 = yMin; - this.xBinSize = xDiff / 200; - this.yBinSize = yDiff / 200; - let minCount = count < 10000 ? 1 : 5; - await this.coordinator.exec(sql` - - `); - await this.coordinator.exec(sql` - CREATE OR REPLACE TEMP MACRO embedding_view_tokenize(s) AS - unnest(string_split_regex(regexp_replace(lower(s), '[^a-z0-9'']', ' ', 'g'), '\\s+')); - - CREATE OR REPLACE TABLE ${this.derivedTableBins} AS ( - WITH tokens_all AS ( - SELECT - floor((${xColumn} - ${this.x0}) / ${this.xBinSize})::INT + 32768 * (floor((${yColumn} - ${this.y0}) / ${this.yBinSize})::INT) as xykey, - embedding_view_tokenize(${textColumn}) AS token - FROM ${this.tableName} - ) - SELECT xykey, token, COUNT(*) AS count - FROM tokens_all - WHERE token NOT IN ('',${stopWords.map((x) => literal(x)).join(",")}) AND LENGTH(token) >= 3 - GROUP BY xykey, token - HAVING count >= ${minCount} - ); - CREATE OR REPLACE TABLE ${this.derivedTableDF} AS ( - SELECT sum(count) AS count, stem(token, 'english') AS stem_token - FROM ${this.derivedTableBins} GROUP BY stem_token + } + + isStopWord(word: string) { + // Consider words in the stop words list or pure numbers as stop words. + return this.stopWords.has(word) || /^[0-9]+$/.test(word); + } + + summarize(limit: number = 4): string[][] { + // Aggregate the frequencies by stemmed words + let frequencyAllStem = aggregateByStem(this.frequencyAll); + let frequencyPerClassStem = this.frequencyPerClass.map(aggregateByStem); + + // Average number of words per class + let averageWords = + frequencyPerClassStem.map((x) => x.values().reduce((a, b) => a + b[1], 0)).reduce((a, b) => a + b, 0) / + frequencyPerClassStem.length; + + return frequencyPerClassStem.map((wordMap) => { + // Compute TF-IDF + let entries = Array.from( + wordMap.entries().map(([key, [word, tf]]) => { + let df = frequencyAllStem.get(key)?.[1] ?? 1; + let idf = Math.log(1 + averageWords / df); + return { + word: word, + tf: tf, + df: df, + idf: idf, + tfIDF: tf * idf, + }; + }), ); - `); - this.initialized = true; + entries = entries.filter((x) => !this.isStopWord(x.word) && x.df >= 2); + entries = entries.sort((a, b) => b.tfIDF - a.tfIDF); + return entries.slice(0, limit).map((x) => x.word); + }); + } +} + +class XYBinning { + private xMin: number; + private yMin: number; + private xStep: number; + private yStep: number; + + constructor(xMin: number, yMin: number, xStep: number, yStep: number) { + this.xMin = xMin; + this.yMin = yMin; + this.xStep = xStep; + this.yStep = yStep; } - private indices(rects: Rectangle[]): number[] { + key(x: number, y: number) { + let ix = Math.floor((x - this.xMin) / this.xStep); + let iy = Math.floor((y - this.yMin) / this.yStep); + return ix + iy * 32768; + } + + keys(rects: Rectangle[]): number[] { let keys = new Set(); for (let { xMin, yMin, xMax, yMax } of rects) { - let xiLowerBound = Math.floor((xMin - this.x0) / this.xBinSize); - let xiUpperBound = Math.floor((xMax - this.x0) / this.xBinSize); - let yiLowerBound = Math.floor((yMin - this.y0) / this.yBinSize); - let yiUpperBound = Math.floor((yMax - this.y0) / this.yBinSize); + let xiLowerBound = Math.floor((xMin - this.xMin) / this.xStep); + let xiUpperBound = Math.floor((xMax - this.xMin) / this.xStep); + let yiLowerBound = Math.floor((yMin - this.yMin) / this.yStep); + let yiUpperBound = Math.floor((yMax - this.yMin) / this.yStep); for (let xi = xiLowerBound; xi <= xiUpperBound; xi++) { for (let yi = yiLowerBound; yi <= yiUpperBound; yi++) { let p = yi * 32768 + xi; @@ -102,34 +139,28 @@ export class TextSummarizer { } return Array.from(keys); } +} - async summarize(rects: Rectangle[], limit: number = 4): Promise { - await this.initialize(); - let indices = this.indices(rects); - let q = sql` - WITH tokens_tf AS ( - SELECT token, sum(count) AS count - FROM ${this.derivedTableBins} - WHERE xykey IN (${indices.join(",")}) - GROUP BY token - ), - tokens_tf_stem AS ( - SELECT sum(count) AS count, stem(token, 'english') AS stem_token, ARG_MAX(token, count) AS token - FROM tokens_tf - GROUP BY stem_token - ) - SELECT - tokens_tf_stem.count AS tf, - ${this.derivedTableDF}.count AS df, - tf * log(1 + (SELECT sum(count) FROM tokens_tf_stem) / df) AS tfidf, - tokens_tf_stem.token AS token - FROM ${this.derivedTableDF}, tokens_tf_stem - WHERE ${this.derivedTableDF}.stem_token == tokens_tf_stem.stem_token - ORDER BY tfidf DESC limit ${limit} - `; - // TODO: try to directly call the DuckDB instance to see if perf is better. - let result = await this.coordinator.query(q); - let list = result.getChild("token").toArray(); - return list; +function incrementMap(map: Map, key: K) { + let c = map.get(key) ?? 0; + map.set(key, c + 1); +} + +/** Aggregate words by their stems and track the most frequent version. + * Returns a map with stemmed words as keys, and the most frequent version and total count as values. */ +function aggregateByStem(inputMap: Map): Map { + const result = new Map(); + for (const [word, count] of inputMap.entries()) { + const s = stemmer(word); + if (result.has(s)) { + const value = result.get(s); + value[1] += count; + if ((inputMap.get(value[0]) ?? 0) < count) { + value[0] = word; + } + } else { + result.set(s, [word, count]); + } } + return result; } diff --git a/packages/density-clustering/package.json b/packages/density-clustering/package.json index fe1db41..d431977 100644 --- a/packages/density-clustering/package.json +++ b/packages/density-clustering/package.json @@ -7,11 +7,14 @@ "module": "density_clustering_wasm/js/index.js", "type": "module", "scripts": { - "build": "npx -y wasm-pack build --release --target web density_clustering_wasm && rm density_clustering_wasm/pkg/.gitignore density_clustering_wasm/pkg/package.json" + "build": "wasm-pack build --release --target web density_clustering_wasm && rm density_clustering_wasm/pkg/.gitignore density_clustering_wasm/pkg/package.json" }, "files": [ "density_clustering_wasm/js/index.js", "density_clustering_wasm/pkg/density_clustering_wasm.js", "density_clustering_wasm/pkg/density_clustering_wasm_bg.wasm" - ] + ], + "devDependencies": { + "wasm-pack": "^0.13.1" + } } diff --git a/packages/docs/embedding-view.md b/packages/docs/embedding-view.md index ca99714..df01957 100644 --- a/packages/docs/embedding-view.md +++ b/packages/docs/embedding-view.md @@ -191,8 +191,8 @@ of a single pixel in data domain. You can use this to determine the distance thr ### queryClusterLabels `Function | null` -An async function of type `(rects: Rectangle[]) => Promise`, that returns a cluster label -for the points covered by the given set of rectangles. +An async function of type `(clusters: Rectangle[][]) => Promise<(string | null)[]>`, +that returns labels for a list of clusters. Each cluster is given as a list of rectangles that approximately cover the region. ## Custom Tooltip diff --git a/packages/examples/src/svelte/EmbeddingViewExample.svelte b/packages/examples/src/svelte/EmbeddingViewExample.svelte index 1341865..195c9b9 100644 --- a/packages/examples/src/svelte/EmbeddingViewExample.svelte +++ b/packages/examples/src/svelte/EmbeddingViewExample.svelte @@ -37,8 +37,8 @@ return { x: data.x[minIndex], y: data.y[minIndex], text: dataset[minIndex].text, fields: {} }; } - async function queryClusterLabels(rects: Rectangle[]): Promise { - return "label"; + async function queryClusterLabels(clusters: Rectangle[][]): Promise<(string | null)[]> { + return clusters.map(() => "label"); } From 165a828200b78c1631cdfc912849294c308266ce Mon Sep 17 00:00:00 2001 From: Donghao Ren Date: Fri, 29 Aug 2025 15:44:45 -0700 Subject: [PATCH 2/3] Update package-lock.json --- package-lock.json | 18 +++++++++++++++++- package.json | 1 + 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/package-lock.json b/package-lock.json index edbecc2..a08b511 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,9 +1,10 @@ { - "name": "embedding-atlas", + "name": "@embedding-atlas/workspace", "lockfileVersion": 3, "requires": true, "packages": { "": { + "name": "@embedding-atlas/workspace", "workspaces": [ "packages/component", "packages/viewer", @@ -7897,6 +7898,20 @@ "dev": true, "license": "MIT" }, + "node_modules/stemmer": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/stemmer/-/stemmer-2.0.1.tgz", + "integrity": "sha512-bkWvSX2JR4nSZFfs113kd4C6X13bBBrg4fBKv2pVdzpdQI2LA5pZcWzTFNdkYsiUNl13E4EzymSRjZ0D55jBYg==", + "dev": true, + "license": "MIT", + "bin": { + "stemmer": "cli.js" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/streamlit-component-lib": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/streamlit-component-lib/-/streamlit-component-lib-2.0.0.tgz", @@ -9998,6 +10013,7 @@ "publint": "^0.3.12", "quickselect": "^3.0.0", "simplify-js": "^1.2.4", + "stemmer": "^2.0.1", "svelte": "^5.37.3", "typescript": "^5.9.2", "vite": "^7.0.6", diff --git a/package.json b/package.json index d016661..afcfdd1 100644 --- a/package.json +++ b/package.json @@ -1,4 +1,5 @@ { + "name": "@embedding-atlas/workspace", "private": true, "workspaces": [ "packages/component", From 2ba31d3696c6e83af0d41d1425c4f39b899f0210 Mon Sep 17 00:00:00 2001 From: Donghao Ren Date: Sat, 30 Aug 2025 22:54:51 -0700 Subject: [PATCH 3/3] Updates --- .../embedding_view/EmbeddingViewMosaic.svelte | 13 +-- .../embedding_view/worker/worker_functions.ts | 14 +-- .../lib/text_summarizer/text_summarizer.ts | 91 ++++++++++++------- 3 files changed, 62 insertions(+), 56 deletions(-) diff --git a/packages/component/src/lib/embedding_view/EmbeddingViewMosaic.svelte b/packages/component/src/lib/embedding_view/EmbeddingViewMosaic.svelte index 0411bbd..3919584 100644 --- a/packages/component/src/lib/embedding_view/EmbeddingViewMosaic.svelte +++ b/packages/component/src/lib/embedding_view/EmbeddingViewMosaic.svelte @@ -337,19 +337,8 @@ if (text == null) { return clusters.map(() => null); } - // Infer binning parameters - let result: any = await coordinator.query( - SQL.Query.from(table).select({ - xMin: SQL.min(SQL.column(x)), - yMin: SQL.min(SQL.column(y)), - xDiff: SQL.sub(SQL.quantile(SQL.column(x), 0.99), SQL.quantile(SQL.column(x), 0.01)), - yDiff: SQL.sub(SQL.quantile(SQL.column(y), 0.99), SQL.quantile(SQL.column(y), 0.01)), - }), - ); - let { xMin, yMin, xDiff, yDiff } = result.get(0); - let binning = { xMin: xMin, yMin: yMin, xStep: xDiff / 200, yStep: yDiff / 200 }; // Create text summarizer (in the worker) - let summarizer = await textSummarizerCreate({ binning: binning, regions: clusters }); + let summarizer = await textSummarizerCreate({ regions: clusters }); // Add text data to the summarizer let start = 0; let chunkSize = 10000; diff --git a/packages/component/src/lib/embedding_view/worker/worker_functions.ts b/packages/component/src/lib/embedding_view/worker/worker_functions.ts index b6fe570..bfa2f20 100644 --- a/packages/component/src/lib/embedding_view/worker/worker_functions.ts +++ b/packages/component/src/lib/embedding_view/worker/worker_functions.ts @@ -2,20 +2,16 @@ import { findClusters } from "@embedding-atlas/density-clustering"; import { dynamicLabelPlacement } from "../../dynamic_label_placement/dynamic_label_placement.js"; -import { TFIDFSummarizer } from "../../text_summarizer/text_summarizer.js"; +import { TextSummarizer } from "../../text_summarizer/text_summarizer.js"; import type { Rectangle } from "../../utils.js"; export { dynamicLabelPlacement, findClusters }; -let textSummarizers = new Map(); +let textSummarizers = new Map(); -export function textSummarizerCreate(options: { - binning: { xMin: number; xStep: number; yMin: number; yStep: number }; - regions: Rectangle[][]; - stopWords?: string[]; -}) { +export function textSummarizerCreate(options: { regions: Rectangle[][]; stopWords?: string[] }) { let key = new Date().getTime() + "-" + Math.random(); - textSummarizers.set(key, new TFIDFSummarizer(options)); + textSummarizers.set(key, new TextSummarizer(options)); return key; } @@ -25,7 +21,7 @@ export function textSummarizerDestroy(key: string) { export function textSummarizerAdd( key: string, - data: { x: ArrayLike; y: ArrayLike; text: ArrayLike }, + data: { x: ArrayLike; y: ArrayLike; text: ArrayLike } ) { textSummarizers.get(key)?.add(data); } diff --git a/packages/component/src/lib/text_summarizer/text_summarizer.ts b/packages/component/src/lib/text_summarizer/text_summarizer.ts index 61f129e..1320c4f 100644 --- a/packages/component/src/lib/text_summarizer/text_summarizer.ts +++ b/packages/component/src/lib/text_summarizer/text_summarizer.ts @@ -6,7 +6,7 @@ import type { Rectangle } from "../utils.js"; import { stopWords as defaultStopWords } from "./stop_words.js"; /** A text summarizer based on c-TF-IDF (https://arxiv.org/pdf/2203.05794) */ -export class TFIDFSummarizer { +export class TextSummarizer { private segmenter: Intl.Segmenter; private binning: XYBinning; private stopWords: Set; @@ -14,18 +14,9 @@ export class TFIDFSummarizer { private frequencyPerClass: Map[]; private frequencyAll: Map; - /** Create a new TFIDFSummarizer */ - constructor(options: { - binning: { xMin: number; xStep: number; yMin: number; yStep: number }; - regions: Rectangle[][]; - stopWords?: string[]; - }) { - this.binning = new XYBinning( - options.binning.xMin, - options.binning.yMin, - options.binning.xStep, - options.binning.yStep, - ); + /** Create a new TextSummarizer */ + constructor(options: { regions: Rectangle[][]; stopWords?: string[] }) { + this.binning = XYBinning.inferFromRegions(options.regions); this.segmenter = new Intl.Segmenter(undefined, { granularity: "word" }); this.stopWords = new Set(options.stopWords ?? defaultStopWords); @@ -55,27 +46,27 @@ export class TFIDFSummarizer { if (indices == null) { continue; } + let words = []; for (let s of this.segmenter.segment(data.text[i])) { - let word = s.segment.toLowerCase().trim(); + let word = s.segment.trim(); if (word.length > 1) { - for (let idx of indices) { - incrementMap(this.frequencyPerClass[idx], word); - } - incrementMap(this.frequencyAll, word); + words.push(word); } } + let inc = 1 / words.length; + for (let word of words) { + for (let idx of indices) { + incrementMap(this.frequencyPerClass[idx], word, inc); + } + incrementMap(this.frequencyAll, word, inc); + } } } - isStopWord(word: string) { - // Consider words in the stop words list or pure numbers as stop words. - return this.stopWords.has(word) || /^[0-9]+$/.test(word); - } - summarize(limit: number = 4): string[][] { // Aggregate the frequencies by stemmed words - let frequencyAllStem = aggregateByStem(this.frequencyAll); - let frequencyPerClassStem = this.frequencyPerClass.map(aggregateByStem); + let frequencyAllStem = aggregateByStem(this.frequencyAll, this.stopWords); + let frequencyPerClassStem = this.frequencyPerClass.map((m) => aggregateByStem(m, this.stopWords)); // Average number of words per class let averageWords = @@ -97,7 +88,7 @@ export class TFIDFSummarizer { }; }), ); - entries = entries.filter((x) => !this.isStopWord(x.word) && x.df >= 2); + entries = entries.filter((x) => x.df >= 2); entries = entries.sort((a, b) => b.tfIDF - a.tfIDF); return entries.slice(0, limit).map((x) => x.word); }); @@ -117,13 +108,39 @@ class XYBinning { this.yStep = yStep; } + static inferFromRegions(regions: Rectangle[][]): XYBinning { + let xMin = Number.POSITIVE_INFINITY; + let yMin = Number.POSITIVE_INFINITY; + let xMax = Number.NEGATIVE_INFINITY; + let yMax = Number.NEGATIVE_INFINITY; + for (let region of regions) { + for (let rect of region) { + if (rect.xMin < xMin) { + xMin = rect.xMin; + } else if (rect.xMax > xMax) { + xMax = rect.xMax; + } + if (rect.yMin < yMin) { + yMin = rect.yMin; + } else if (rect.yMax > yMax) { + yMax = rect.yMax; + } + } + } + if (xMin < xMax && yMin < yMax) { + return new XYBinning(xMin, yMin, (xMax - xMin) / 200, (yMax - yMin) / 200); + } else { + return new XYBinning(0, 0, 1, 1); + } + } + key(x: number, y: number) { let ix = Math.floor((x - this.xMin) / this.xStep); let iy = Math.floor((y - this.yMin) / this.yStep); return ix + iy * 32768; } - keys(rects: Rectangle[]): number[] { + keys(rects: Rectangle[]): Set { let keys = new Set(); for (let { xMin, yMin, xMax, yMax } of rects) { let xiLowerBound = Math.floor((xMin - this.xMin) / this.xStep); @@ -137,23 +154,27 @@ class XYBinning { } } } - return Array.from(keys); + return keys; } } -function incrementMap(map: Map, key: K) { +function incrementMap(map: Map, key: K, value: number) { let c = map.get(key) ?? 0; - map.set(key, c + 1); + map.set(key, c + value); } /** Aggregate words by their stems and track the most frequent version. * Returns a map with stemmed words as keys, and the most frequent version and total count as values. */ -function aggregateByStem(inputMap: Map): Map { - const result = new Map(); - for (const [word, count] of inputMap.entries()) { - const s = stemmer(word); +function aggregateByStem(inputMap: Map, stopWords: Set): Map { + let result = new Map(); + for (let [word, count] of inputMap.entries()) { + let lower = word.toLowerCase(); + if (stopWords.has(lower) || /^[0-9]+$/.test(lower)) { + continue; + } + let s = stemmer(lower); if (result.has(s)) { - const value = result.get(s); + let value = result.get(s); value[1] += count; if ((inputMap.get(value[0]) ?? 0) < count) { value[0] = word;