Added dot product everywhere cosine similarity was used #676

Open · wants to merge 11 commits into base: main
39 changes: 39 additions & 0 deletions client-python/elastiknn/api.py
@@ -17,6 +17,7 @@ class Similarity(Enum):
    L1 = 3
    L2 = 4
    Cosine = 5
    Dot = 6


class Vec:
@@ -144,7 +145,24 @@ def to_dict(self):
"k": self.k
}
}

@dataclass(frozen=True)
class DotLsh(Base):
dims: int
L: int
k: int

def to_dict(self):
return {
"type": "elastiknn_dense_float_vector",
"elastiknn": {
"model": "lsh",
"similarity": "dot",
"dims": self.dims,
"L": self.L,
"k": self.k
}
}
@dataclass(frozen=True)
class L2Lsh(Base):
dims: int
@@ -271,6 +289,27 @@ def with_vec(self, vec: Vec.Base):
            return NearestNeighborsQuery.CosineLsh(field=self.field, vec=vec, similarity=self.similarity,
                                                   candidates=self.candidates)

    @dataclass(frozen=True)
    class DotLsh(Base):
        field: str
        vec: Vec.Base
        similarity: Similarity = Similarity.Dot
        candidates: int = 1000

        def to_dict(self):
            return {
                "field": self.field,
                "model": "lsh",
                "similarity": self.similarity.name.lower(),
                "candidates": self.candidates,
                "vec": self.vec.to_dict()
            }

        def with_vec(self, vec: Vec.Base):
            return NearestNeighborsQuery.DotLsh(field=self.field, vec=vec, similarity=self.similarity,
                                                candidates=self.candidates)

    @dataclass(frozen=True)
    class L2Lsh(Base):
        field: str
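For orientation, a minimal sketch of how the new client classes compose (hypothetical usage; `Vec.DenseFloat` and the exact import layout are assumptions based on the existing client, not part of this diff):

```python
from elastiknn.api import Mapping, NearestNeighborsQuery, Vec

# Hypothetical usage of the new Dot LSH mapping and query classes.
# Assumes Vec.DenseFloat(values=...) exists, as in the Cosine examples.
mapping = Mapping.DotLsh(dims=100, L=99, k=1)
query = NearestNeighborsQuery.DotLsh(
    field="my_vec",
    vec=Vec.DenseFloat(values=[0.1] * 100),
    candidates=50,
)

print(mapping.to_dict())  # {"type": "elastiknn_dense_float_vector", ...}
print(query.to_dict())    # {"field": "my_vec", ..., "similarity": "dot", ...}
```

Both `to_dict` outputs mirror the JSON mapping and query documented on the API page.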
5 changes: 5 additions & 0 deletions client-python/elastiknn/models.py
@@ -91,6 +91,8 @@ def _mk_mapping_query(self, query_params: dict()) -> (Mapping.Base, NearestNeigh
            return Mapping.DenseFloat(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.L2)
        elif self._metric == 'cosine':
            return Mapping.DenseFloat(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Cosine)
        elif self._metric == 'dot':
            return Mapping.DenseFloat(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Dot)
        elif self._metric == 'jaccard':
            return Mapping.SparseBool(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Jaccard)
        elif self._metric == 'hamming':
@@ -103,6 +105,9 @@ def _mk_mapping_query(self, query_params: dict()) -> (Mapping.Base, NearestNeigh
        elif self._metric == 'cosine':
            return Mapping.CosineLsh(self._dims, **self._mapping_params), \
                NearestNeighborsQuery.CosineLsh(field, dummy, **query_params)
        elif self._metric == 'dot':
            return Mapping.DotLsh(self._dims, **self._mapping_params), \
                NearestNeighborsQuery.DotLsh(field, dummy, **query_params)
        elif self._metric == 'hamming':
            return Mapping.HammingLsh(self._dims, **self._mapping_params), \
                NearestNeighborsQuery.HammingLsh(field, dummy, **query_params)
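With these branches in place, the scikit-style wrapper should accept `metric='dot'`. A rough end-to-end sketch (the `ElastiknnModel` constructor and method names are assumptions based on the client's existing interface; note the vectors are normalized first, as the dot model expects):

```python
import numpy as np
from elastiknn.models import ElastiknnModel

# Hypothetical end-to-end use of the new 'dot' metric.
X = np.random.rand(1000, 100).astype(np.float32)
X /= np.linalg.norm(X, axis=1, keepdims=True)  # dot assumes ||v|| == 1

model = ElastiknnModel(
    algorithm="lsh",
    metric="dot",
    mapping_params=dict(L=99, k=1),
    query_params=dict(candidates=50),
)
model.fit(X)
neighbors = model.kneighbors(X[:5], n_neighbors=10)
```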
2 changes: 2 additions & 0 deletions client-python/elastiknn/utils.py
@@ -13,10 +13,12 @@
    ('exact', 'l1'),
    ('exact', 'l2'),
    ('exact', 'cosine'),
    ('exact', 'dot'),
    ('exact', 'hamming'),
    ('exact', 'jaccard'),
    ('lsh', 'l2'),
    ('lsh', 'cosine'),
    ('lsh', 'dot'),
    ('lsh', 'jaccard'),
    ('lsh', 'hamming'),
    ('permutation_lsh', 'cosine'),
9 changes: 5 additions & 4 deletions docs/_posts/2021-07-30-how-does-elastiknn-work.md
@@ -43,8 +43,8 @@ The name is a combination of _Elastic_ and _KNN_ (K-Nearest Neighbors).
The full list of features (copied from the home page) is as follows:

- Datatypes to efficiently store dense and sparse numerical vectors in Elasticsearch documents, including multiple vectors per document.
- Exact nearest neighbor queries for five similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance).
- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Jaccard, and Hamming similarity.
- Exact nearest neighbor queries for six similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Dot](https://en.wikipedia.org/wiki/Dot_product), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance).
- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Dot, Jaccard, and Hamming similarity.
- Integration of nearest neighbor queries with standard Elasticsearch queries.
- Incremental index updates: start with any number of vectors and incrementally create/update/delete more without ever re-building the entire index.
- Implementation based on standard Elasticsearch and Lucene primitives, entirely in the JVM. Indexing and querying scale horizontally with Elasticsearch.
@@ -88,13 +88,14 @@ So Java is used for all the CPU-bound LSH models and Lucene abstractions, and Sc

Elasticsearch requires non-negative scores, with higher scores indicating higher relevance.

Elastiknn supports five vector similarity functions (L1, L2, Cosine, Jaccard, and Hamming).
Elastiknn supports six vector similarity functions (L1, L2, Cosine, Dot, Jaccard, and Hamming).
Four of these are problematic with respect to this scoring requirement.

Specifically, L1 and L2 are generally defined as _distance_ functions, rather than similarity functions,
which means that higher relevance (i.e., lower distance) yields _lower_ scores.
Cosine similarity is defined over $$[-1, 1]$$, and we can't have negative scores.

Dot similarity is defined over $$[-1, 1]$$ when vectors are normalized to magnitude 1, in which case it is equivalent to Cosine similarity.
Elasticsearch does not allow negative scores.
To work around this, Elastiknn applies simple transformations to produce L1, L2, Cosine, and Dot _similarity_ in accordance with the Elasticsearch requirements.
The exact transformations are documented [on the API page](/api/#similarity-scoring).
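As an illustrative sketch (not the plugin's code), the transformations can be written as follows; the `max(0, ...)` guard for dot similarity reflects the change discussed in this PR:

```python
import numpy as np

def elasticsearch_score(similarity: str, u: np.ndarray, v: np.ndarray) -> float:
    """Sketch of the score transformations: shift or invert each raw
    metric so scores are non-negative and higher means more similar."""
    if similarity == "l1":
        return 1.0 / (1.0 + float(np.abs(u - v).sum()))
    if similarity == "l2":
        return 1.0 / (1.0 + float(np.linalg.norm(u - v)))
    if similarity == "cosine":
        cos = float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))
        return 1.0 + cos  # maps [-1, 1] to [0, 2]
    if similarity == "dot":
        # Guard from the PR discussion: clamps negative scores produced
        # by vectors that are not normalized to unit length.
        return max(0.0, 1.0 + float(np.dot(u, v)))
    raise ValueError(f"unsupported similarity: {similarity}")
```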

75 changes: 67 additions & 8 deletions docs/pages/api.md
@@ -292,6 +292,30 @@ PUT /my-index/_mapping
  }
}
```

### Dot LSH Mapping

Uses the [Random Projection algorithm](https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Random_projection)
to hash and store dense float vectors such that they support approximate Dot similarity queries.
This is equivalent to Cosine similarity when the vectors are normalized to unit length.

The implementation is influenced by Chapter 3 of [Mining Massive Datasets.](http://www.mmds.org/)
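As a rough sketch of the underlying idea (illustrative only; the function and parameters are hypothetical, not the plugin's implementation):

```python
import numpy as np

def random_projection_hashes(v: np.ndarray, L: int, k: int, seed: int = 0) -> list:
    """Sign-of-random-projection hashing: each of the L hash tables
    concatenates k sign bits, one per random hyperplane. Vectors pointing
    in similar directions tend to fall on the same side of each
    hyperplane, so they tend to collide in the same tables."""
    rng = np.random.default_rng(seed)
    hyperplanes = rng.standard_normal((L, k, v.shape[0]))
    bits = (hyperplanes @ v) > 0  # shape (L, k): side of each hyperplane
    return ["".join("1" if b else "0" for b in row) for row in bits]
```

In the mapping below, `L` is the number of hash tables and `k` the number of hyperplanes (bits) per table, so `"L": 99, "k": 1` corresponds to 99 single-bit tables.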

```json
PUT /my-index/_mapping
{
  "properties": {
    "my_vec": {
      "type": "elastiknn_dense_float_vector",  # 1
      "elastiknn": {
        "dims": 100,                           # 2
        "model": "lsh",                        # 3
        "similarity": "dot",                   # 4
        "L": 99,                               # 5
        "k": 1                                 # 6
      }
    }
  }
}
```

|#|Description|
|:--|:--|
@@ -425,7 +449,7 @@ GET /my-index/_search
### Compatibility of Vector Types and Similarities

Jaccard and Hamming similarity only work with sparse bool vectors.
Cosine,[^note-angular-cosine] L1, and L2 similarity only work with dense float vectors.
Cosine,[^note-angular-cosine] Dot,[^note-dot-product] L1, and L2 similarity only work with dense float vectors.
The following documentation assumes this restriction is known.

These restrictions aren't inherent to the types and algorithms, i.e., you could in theory run cosine similarity on sparse vectors.
@@ -446,9 +470,12 @@ The exact transformations are described below.
|Jaccard|N/A|0|1.0|
|Hamming|N/A|0|1.0|
|Cosine[^note-angular-cosine]|`cosine similarity + 1`|0|2|
|Dot[^note-dot-product]|`max(0, dot similarity + 1)`|0|2|
> **Owner:** I think there might be something wrong with the table formatting
> (screenshot of the broken rendering omitted).
> Also, if we end up using it, we should describe the updated transformation here: `max(0, 1 + dot product)`.

|L1|`1 / (1 + l1 distance)`|0|1|
|L2|`1 / (1 + l2 distance)`|0|1|

Dot similarity will produce negative scores if the vectors are not normalized.
> **Owner:** We should make sure to catch this and return an error in the plugin.
>
> **Author:** I added `max(0, distance)`, so it will always be positive.


If you're using the `elastiknn_nearest_neighbors` query with other queries, and the score values are inconvenient (e.g. huge values like 1e6), consider wrapping the query in a [Script Score Query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html), where you can access and transform the `_score` value.

### Query Vector
@@ -621,6 +648,36 @@ GET /my-index/_search
|5|Number of candidates per segment. See the section on LSH Search Strategy.|
|6|Set to true to use the more-like-this heuristic to pick a subset of hashes. Generally faster but still experimental.|

### Dot LSH Query

Retrieve dense float vectors based on approximate Dot similarity.[^note-dot-product]

```json
GET /my-index/_search
{
  "query": {
    "elastiknn_nearest_neighbors": {
      "field": "my_vec",                 # 1
      "vec": {                           # 2
        "values": [0.1, 0.2, 0.3, ...]
      },
      "model": "lsh",                    # 3
      "similarity": "dot",               # 4
      "candidates": 50                   # 5
    }
  }
}
```

|#|Description|
|:--|:--|
|1|Indexed field. Must use `lsh` mapping model with `dot`[^note-dot-product] similarity.|
|2|Query vector. Must be literal dense float or a pointer to an indexed dense float vector.|
|3|Model name.|
|4|Similarity function.|
|5|Number of candidates per segment. See the section on LSH Search Strategy.|

### L1 LSH Query

Not yet implemented.
@@ -707,12 +764,13 @@ The similarity functions are abbreviated (J: Jaccard, H: Hamming, C: Cosine,[^no

#### elastiknn_dense_float_vector

|Model / Query |Exact |Cosine LSH |L2 LSH |Permutation LSH|
|:-- |:-- |:-- |:-- |:-- |
|Exact (i.e. no model specified) |✔ (C, L1, L2) |x |x |x |
|Cosine LSH |✔ (C, L1, L2) |✔ |x |x |
|L2 LSH |✔ (C, L1, L2) |x |✔ |x |
|Permutation LSH |✔ (C, L1, L2) |x |x |✔ |
|Model / Query |Exact |Cosine LSH |Dot LSH|L2 LSH |Permutation LSH|
|:-- |:-- |:-- |:-- |:-- |:-- |
|Exact (i.e. no model specified) |✔ (C, D, L1, L2) |x |x |x |x |
|Cosine LSH |✔ (C, D, L1, L2) |✔ |✔ |x |x |
|Dot LSH |✔ (C, D, L1, L2) |✔ |✔ |x |x |
|L2 LSH |✔ (C, D, L1, L2) |x |x |✔ |x |
|Permutation LSH |✔ (C, D, L1, L2) |x |x |x |✔ |

### Running Nearest Neighbors Query on a Filtered Subset of Documents

@@ -860,4 +918,5 @@ PUT /my-index

See the [create index documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-create-index.html) for more details.

[^note-angular-cosine]: Cosine similarity used to be (incorrectly) called "angular" similarity. All references to "angular" were renamed to "Cosine" in 7.13.3.2. You can still use "angular" in the JSON/HTTP API; it will convert to "cosine" internally.
[^note-dot-product]: Dot product is intended to be used with normalized vectors $$v$$, meaning that $$\lVert v \rVert = 1$$.
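Since the dot model expects unit-norm vectors, callers would normalize before indexing and querying. A minimal sketch of that pre-processing step:

```python
import numpy as np

def normalize(v: np.ndarray) -> np.ndarray:
    """Scale v to unit L2 norm so dot(v, w) equals cosine similarity."""
    norm = np.linalg.norm(v)
    return v / norm if norm > 0 else v
```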
4 changes: 2 additions & 2 deletions docs/pages/index.md
@@ -15,8 +15,8 @@ This enables users to combine traditional queries (e.g., "some product") with ve
## Features

- Datatypes to efficiently store dense and sparse numerical vectors in Elasticsearch documents, including multiple vectors per document.
- Exact nearest neighbor queries for five similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance).
- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Jaccard, and Hamming similarity.
- Exact nearest neighbor queries for six similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Dot](https://en.wikipedia.org/wiki/Dot_product) (for normalized vectors), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance).
- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Dot, Jaccard, and Hamming similarity.
- Integration of nearest neighbor queries with standard Elasticsearch queries.
- Incremental index updates. Start with 1 vector or 1 million vectors and then create/update/delete documents and vectors without ever re-building the entire index.
- Implementation based on standard Elasticsearch and Lucene primitives, entirely in the JVM. Indexing and querying scale horizontally with Elasticsearch.
@@ -15,6 +15,8 @@ object Mapping {

  final case class CosineLsh(dims: Int, L: Int, k: Int) extends Mapping

  final case class DotLsh(dims: Int, L: Int, k: Int) extends Mapping

  final case class L2Lsh(dims: Int, L: Int, k: Int, w: Int) extends Mapping

  final case class PermutationLsh(dims: Int, k: Int, repeating: Boolean) extends Mapping
@@ -29,6 +29,14 @@ object NearestNeighborsQuery {
    override def similarity: Similarity = Similarity.Cosine
  }

  final case class DotLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery {
    override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)

    override def withCandidates(candidates: Int): ApproximateQuery = copy(candidates = candidates)

    override def similarity: Similarity = Similarity.Dot
  }

  final case class HammingLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery {
    override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)

@@ -5,6 +5,8 @@ sealed trait Similarity
object Similarity {
  case object Cosine extends Similarity

  case object Dot extends Similarity

  case object Hamming extends Similarity

  case object Jaccard extends Similarity
@@ -13,5 +15,5 @@ object Similarity {

  case object L2 extends Similarity

  val values: Seq[Similarity] = Vector(Cosine, Jaccard, Hamming, L1, L2)
  val values: Seq[Similarity] = Vector(Cosine, Dot, Jaccard, Hamming, L1, L2)
}