Added dot product everywhere cosine similarity was used #676

Open · wants to merge 11 commits into base: main
39 changes: 39 additions & 0 deletions client-python/elastiknn/api.py
@@ -17,6 +17,7 @@ class Similarity(Enum):
    L1 = 3
    L2 = 4
    Cosine = 5
    Dot = 6


class Vec:
@@ -144,7 +145,24 @@ def to_dict(self):
"k": self.k
}
}

@dataclass(frozen=True)
class DotLsh(Base):
dims: int
L: int
k: int

def to_dict(self):
return {
"type": "elastiknn_dense_float_vector",
"elastiknn": {
"model": "lsh",
"similarity": "dot",
"dims": self.dims,
"L": self.L,
"k": self.k
}
}
@dataclass(frozen=True)
class L2Lsh(Base):
dims: int
@@ -271,6 +289,27 @@ def with_vec(self, vec: Vec.Base):
            return NearestNeighborsQuery.CosineLsh(field=self.field, vec=vec, similarity=self.similarity,
                                                   candidates=self.candidates)

    @dataclass(frozen=True)
    class DotLsh(Base):
        field: str
        vec: Vec.Base
        similarity: Similarity = Similarity.Dot
        candidates: int = 1000

        def to_dict(self):
            return {
                "field": self.field,
                "model": "lsh",
                "similarity": self.similarity.name.lower(),
                "candidates": self.candidates,
                "vec": self.vec.to_dict()
            }

        def with_vec(self, vec: Vec.Base):
            return NearestNeighborsQuery.DotLsh(field=self.field, vec=vec, similarity=self.similarity,
                                                candidates=self.candidates)

    @dataclass(frozen=True)
    class L2Lsh(Base):
        field: str
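For orientation, a minimal sketch of how the new client classes compose (hypothetical usage; `Vec.DenseFloat` and the exact import layout are assumptions based on the existing client, not part of this diff):

```python
from elastiknn.api import Mapping, NearestNeighborsQuery, Vec

# Hypothetical usage of the new Dot LSH mapping and query classes.
# Assumes Vec.DenseFloat(values=...) exists, as in the Cosine examples.
mapping = Mapping.DotLsh(dims=100, L=99, k=1)
query = NearestNeighborsQuery.DotLsh(
    field="my_vec",
    vec=Vec.DenseFloat(values=[0.1] * 100),
    candidates=50,
)

print(mapping.to_dict())  # {"type": "elastiknn_dense_float_vector", ...}
print(query.to_dict())    # {"field": "my_vec", ..., "similarity": "dot", ...}
```

Both `to_dict` outputs mirror the JSON mapping and query documented on the API page.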
5 changes: 5 additions & 0 deletions client-python/elastiknn/models.py
@@ -91,6 +91,8 @@ def _mk_mapping_query(self, query_params: dict()) -> (Mapping.Base, NearestNeigh
            return Mapping.DenseFloat(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.L2)
        elif self._metric == 'cosine':
            return Mapping.DenseFloat(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Cosine)
        elif self._metric == 'dot':
            return Mapping.DenseFloat(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Dot)
        elif self._metric == 'jaccard':
            return Mapping.SparseBool(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Jaccard)
        elif self._metric == 'hamming':
@@ -103,6 +105,9 @@ def _mk_mapping_query(self, query_params: dict()) -> (Mapping.Base, NearestNeigh
        elif self._metric == 'cosine':
            return Mapping.CosineLsh(self._dims, **self._mapping_params), \
                NearestNeighborsQuery.CosineLsh(field, dummy, **query_params)
        elif self._metric == 'dot':
            return Mapping.DotLsh(self._dims, **self._mapping_params), \
                NearestNeighborsQuery.DotLsh(field, dummy, **query_params)
        elif self._metric == 'hamming':
            return Mapping.HammingLsh(self._dims, **self._mapping_params), \
                NearestNeighborsQuery.HammingLsh(field, dummy, **query_params)
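With these branches in place, the scikit-style wrapper should accept `metric='dot'`. A rough end-to-end sketch (the `ElastiknnModel` constructor and method names are assumptions based on the client's existing interface; note the vectors are normalized first, as the dot model expects):

```python
import numpy as np
from elastiknn.models import ElastiknnModel

# Hypothetical end-to-end use of the new 'dot' metric.
X = np.random.rand(1000, 100).astype(np.float32)
X /= np.linalg.norm(X, axis=1, keepdims=True)  # dot assumes ||v|| == 1

model = ElastiknnModel(
    algorithm="lsh",
    metric="dot",
    mapping_params=dict(L=99, k=1),
    query_params=dict(candidates=50),
)
model.fit(X)
neighbors = model.kneighbors(X[:5], n_neighbors=10)
```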
2 changes: 2 additions & 0 deletions client-python/elastiknn/utils.py
@@ -13,10 +13,12 @@
    ('exact', 'l1'),
    ('exact', 'l2'),
    ('exact', 'cosine'),
    ('exact', 'dot'),
    ('exact', 'hamming'),
    ('exact', 'jaccard'),
    ('lsh', 'l2'),
    ('lsh', 'cosine'),
    ('lsh', 'dot'),
    ('lsh', 'jaccard'),
    ('lsh', 'hamming'),
    ('permutation_lsh', 'cosine'),
9 changes: 5 additions & 4 deletions docs/_posts/2021-07-30-how-does-elastiknn-work.md
@@ -43,8 +43,8 @@ The name is a combination of _Elastic_ and _KNN_ (K-Nearest Neighbors).
The full list of features (copied from the home page) is as follows:

- Datatypes to efficiently store dense and sparse numerical vectors in Elasticsearch documents, including multiple vectors per document.
- Exact nearest neighbor queries for five similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance).
- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Jaccard, and Hamming similarity.
- Exact nearest neighbor queries for six similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Dot](https://en.wikipedia.org/wiki/Dot_product), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance).
- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Dot, Jaccard, and Hamming similarity.
- Integration of nearest neighbor queries with standard Elasticsearch queries.
- Incremental index updates: start with any number of vectors and incrementally create/update/delete more without ever re-building the entire index.
- Implementation based on standard Elasticsearch and Lucene primitives, entirely in the JVM. Indexing and querying scale horizontally with Elasticsearch.
@@ -88,13 +88,14 @@ So Java is used for all the CPU-bound LSH models and Lucene abstractions, and Sc

Elasticsearch requires non-negative scores, with higher scores indicating higher relevance.

Elastiknn supports five vector similarity functions (L1, L2, Cosine, Jaccard, and Hamming).
Elastiknn supports six vector similarity functions (L1, L2, Cosine, Dot, Jaccard, and Hamming).
Four of these are problematic with respect to this scoring requirement.

Specifically, L1 and L2 are generally defined as _distance_ functions, rather than similarity functions,
which means that higher relevance (i.e., lower distance) yields _lower_ scores.
Cosine similarity is defined over $$[-1, 1]$$, and we can't have negative scores.

Dot similarity is defined over $$[-1, 1]$$ when vectors are normalized to magnitude 1, in which case it is equivalent to Cosine similarity.
Elasticsearch does not allow negative scores.
To work around this, Elastiknn applies simple transformations to produce L1, L2, Cosine, and Dot _similarity_ in accordance with the Elasticsearch requirements.
The exact transformations are documented [on the API page](/api/#similarity-scoring).
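As an illustrative sketch (not the plugin's code), the transformations can be written as follows; the `max(0, ...)` guard for dot similarity reflects the change discussed in this PR:

```python
import numpy as np

def elasticsearch_score(similarity: str, u: np.ndarray, v: np.ndarray) -> float:
    """Sketch of the score transformations: shift or invert each raw
    metric so scores are non-negative and higher means more similar."""
    if similarity == "l1":
        return 1.0 / (1.0 + float(np.abs(u - v).sum()))
    if similarity == "l2":
        return 1.0 / (1.0 + float(np.linalg.norm(u - v)))
    if similarity == "cosine":
        cos = float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))
        return 1.0 + cos  # maps [-1, 1] to [0, 2]
    if similarity == "dot":
        # Guard from the PR discussion: clamps negative scores produced
        # by vectors that are not normalized to unit length.
        return max(0.0, 1.0 + float(np.dot(u, v)))
    raise ValueError(f"unsupported similarity: {similarity}")
```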

75 changes: 67 additions & 8 deletions docs/pages/api.md
@@ -292,6 +292,30 @@ PUT /my-index/_mapping
  }
}
```

### Dot LSH Mapping

Uses the [Random Projection algorithm](https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Random_projection)
to hash and store dense float vectors such that they support approximate Dot similarity queries.
This is equivalent to Cosine similarity when the vectors are normalized to unit length.

The implementation is influenced by Chapter 3 of [Mining Massive Datasets.](http://www.mmds.org/)
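As a rough sketch of the underlying idea (illustrative only; the function and parameters are hypothetical, not the plugin's implementation):

```python
import numpy as np

def random_projection_hashes(v: np.ndarray, L: int, k: int, seed: int = 0) -> list:
    """Sign-of-random-projection hashing: each of the L hash tables
    concatenates k sign bits, one per random hyperplane. Vectors pointing
    in similar directions tend to fall on the same side of each
    hyperplane, so they tend to collide in the same tables."""
    rng = np.random.default_rng(seed)
    hyperplanes = rng.standard_normal((L, k, v.shape[0]))
    bits = (hyperplanes @ v) > 0  # shape (L, k): side of each hyperplane
    return ["".join("1" if b else "0" for b in row) for row in bits]
```

In the mapping below, `L` is the number of hash tables and `k` the number of hyperplanes (bits) per table, so `"L": 99, "k": 1` corresponds to 99 single-bit tables.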

```json
PUT /my-index/_mapping
{
  "properties": {
    "my_vec": {
      "type": "elastiknn_dense_float_vector",  # 1
      "elastiknn": {
        "dims": 100,                           # 2
        "model": "lsh",                        # 3
        "similarity": "dot",                   # 4
        "L": 99,                               # 5
        "k": 1                                 # 6
      }
    }
  }
}
```

|#|Description|
|:--|:--|
@@ -425,7 +449,7 @@ GET /my-index/_search
### Compatibility of Vector Types and Similarities

Jaccard and Hamming similarity only work with sparse bool vectors.
Cosine,[^note-angular-cosine] L1, and L2 similarity only work with dense float vectors.
Cosine,[^note-angular-cosine] Dot,[^note-dot-product] L1, and L2 similarity only work with dense float vectors.
The following documentation assumes this restriction is known.

These restrictions aren't inherent to the types and algorithms, i.e., you could in theory run cosine similarity on sparse vectors.
@@ -446,9 +470,12 @@ The exact transformations are described below.
|Jaccard|N/A|0|1.0|
|Hamming|N/A|0|1.0|
|Cosine[^note-angular-cosine]|`cosine similarity + 1`|0|2|
|Dot[^note-dot-product]|`max(0, dot similarity + 1)`|0|2|
> **Owner:** I think there might be something wrong with the table formatting
> (screenshot of the broken rendering omitted).
> Also, if we end up using it, we should describe the updated transformation here: `max(0, 1 + dot product)`.

|L1|`1 / (1 + l1 distance)`|0|1|
|L2|`1 / (1 + l2 distance)`|0|1|

Dot similarity will produce negative scores if the vectors are not normalized.
> **Owner:** We should make sure to catch this and return an error in the plugin.
>
> **Author:** I added `max(0, distance)`, so it will always be positive.


If you're using the `elastiknn_nearest_neighbors` query with other queries, and the score values are inconvenient (e.g. huge values like 1e6), consider wrapping the query in a [Script Score Query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html), where you can access and transform the `_score` value.

### Query Vector
@@ -621,6 +648,36 @@ GET /my-index/_search
|5|Number of candidates per segment. See the section on LSH Search Strategy.|
|6|Set to true to use the more-like-this heuristic to pick a subset of hashes. Generally faster but still experimental.|

### Dot LSH Query

Retrieve dense float vectors based on approximate Dot similarity.[^note-dot-product]

```json
GET /my-index/_search
{
  "query": {
    "elastiknn_nearest_neighbors": {
      "field": "my_vec",                 # 1
      "vec": {                           # 2
        "values": [0.1, 0.2, 0.3, ...]
      },
      "model": "lsh",                    # 3
      "similarity": "dot",               # 4
      "candidates": 50                   # 5
    }
  }
}
```

|#|Description|
|:--|:--|
|1|Indexed field. Must use `lsh` mapping model with `dot`[^note-dot-product] similarity.|
|2|Query vector. Must be literal dense float or a pointer to an indexed dense float vector.|
|3|Model name.|
|4|Similarity function.|
|5|Number of candidates per segment. See the section on LSH Search Strategy.|

### L1 LSH Query

Not yet implemented.
@@ -707,12 +764,13 @@ The similarity functions are abbreviated (J: Jaccard, H: Hamming, C: Cosine,[^no

#### elastiknn_dense_float_vector

|Model / Query |Exact |Cosine LSH |L2 LSH |Permutation LSH|
|:-- |:-- |:-- |:-- |:-- |
|Exact (i.e. no model specified) |✔ (C, L1, L2) |x |x |x |
|Cosine LSH |✔ (C, L1, L2) |✔ |x |x |
|L2 LSH |✔ (C, L1, L2) |x |✔ |x |
|Permutation LSH |✔ (C, L1, L2) |x |x |✔ |
|Model / Query |Exact |Cosine LSH |Dot LSH|L2 LSH |Permutation LSH|
|:-- |:-- |:-- |:-- |:-- |:-- |
|Exact (i.e. no model specified) |✔ (C, D, L1, L2) |x |x |x |x |
|Cosine LSH |✔ (C, D, L1, L2) |✔ |✔ |x |x |
|Dot LSH |✔ (C, D, L1, L2) |✔ |✔ |x |x |
|L2 LSH |✔ (C, D, L1, L2) |x |x |✔ |x |
|Permutation LSH |✔ (C, D, L1, L2) |x |x |x |✔ |

### Running Nearest Neighbors Query on a Filtered Subset of Documents

@@ -860,4 +918,5 @@ PUT /my-index

See the [create index documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-create-index.html) for more details.

[^note-angular-cosine]: Cosine similarity used to be (incorrectly) called "angular" similarity. All references to "angular" were renamed to "Cosine" in 7.13.3.2. You can still use "angular" in the JSON/HTTP API; it will convert to "cosine" internally.
[^note-dot-product]: Dot product is intended to be used with normalized vectors $$v$$, meaning that $$\lVert v \rVert = 1$$.
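Since the dot model expects unit-norm vectors, callers would normalize before indexing and querying. A minimal sketch of that pre-processing step:

```python
import numpy as np

def normalize(v: np.ndarray) -> np.ndarray:
    """Scale v to unit L2 norm so dot(v, w) equals cosine similarity."""
    norm = np.linalg.norm(v)
    return v / norm if norm > 0 else v
```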
4 changes: 2 additions & 2 deletions docs/pages/index.md
@@ -15,8 +15,8 @@ This enables users to combine traditional queries (e.g., "some product") with ve
## Features

- Datatypes to efficiently store dense and sparse numerical vectors in Elasticsearch documents, including multiple vectors per document.
- Exact nearest neighbor queries for five similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance).
- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Jaccard, and Hamming similarity.
- Exact nearest neighbor queries for six similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Dot](https://en.wikipedia.org/wiki/Dot_product) (for normalized vectors), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance).
- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Dot, Jaccard, and Hamming similarity.
- Integration of nearest neighbor queries with standard Elasticsearch queries.
- Incremental index updates. Start with 1 vector or 1 million vectors and then create/update/delete documents and vectors without ever re-building the entire index.
- Implementation based on standard Elasticsearch and Lucene primitives, entirely in the JVM. Indexing and querying scale horizontally with Elasticsearch.
@@ -15,6 +15,8 @@ object Mapping {

  final case class CosineLsh(dims: Int, L: Int, k: Int) extends Mapping

  final case class DotLsh(dims: Int, L: Int, k: Int) extends Mapping

  final case class L2Lsh(dims: Int, L: Int, k: Int, w: Int) extends Mapping

  final case class PermutationLsh(dims: Int, k: Int, repeating: Boolean) extends Mapping
@@ -29,6 +29,14 @@ object NearestNeighborsQuery {
    override def similarity: Similarity = Similarity.Cosine
  }

  final case class DotLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery {
    override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)

    override def withCandidates(candidates: Int): ApproximateQuery = copy(candidates = candidates)

    override def similarity: Similarity = Similarity.Dot
  }

  final case class HammingLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery {
    override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)

@@ -5,6 +5,8 @@ sealed trait Similarity
object Similarity {
  case object Cosine extends Similarity

  case object Dot extends Similarity

  case object Hamming extends Similarity

  case object Jaccard extends Similarity
@@ -13,5 +15,5 @@ object Similarity {

  case object L2 extends Similarity

  val values: Seq[Similarity] = Vector(Cosine, Jaccard, Hamming, L1, L2)
  val values: Seq[Similarity] = Vector(Cosine, Dot, Jaccard, Hamming, L1, L2)
}