Plugin, clients: Remove Sparse Indexed Mapping and Query (#302)
See Issue #301 for details
alexklibisz committed Aug 18, 2021
1 parent 9201b57 commit 94afe6b
Showing 17 changed files with 60 additions and 369 deletions.
30 changes: 0 additions & 30 deletions client-python/elastiknn/api.py
@@ -79,19 +79,6 @@ def to_dict(self):
}
}

@dataclass(frozen=True)
class SparseIndexed(Base):
dims: int

def to_dict(self):
return {
"type": "elastiknn_sparse_bool_vector",
"elastiknn": {
"dims": self.dims,
"model": "sparse_indexed"
}
}

@dataclass(frozen=True)
class JaccardLsh(Base):
dims: int
@@ -224,23 +211,6 @@ def to_dict(self):
def with_vec(self, vec: Vec.Base):
return NearestNeighborsQuery.Exact(field=self.field, vec=vec, similarity=self.similarity)

@dataclass(frozen=True)
class SparseIndexed(Base):
field: str
vec: Vec.Base
similarity: Similarity

def to_dict(self):
return {
"field": self.field,
"model": "sparse_indexed",
"similarity": self.similarity.name.lower(),
"vec": self.vec.to_dict()
}

def with_vec(self, vec: Vec.Base):
return NearestNeighborsQuery.SparseIndexed(field=self.field, vec=vec, similarity=self.similarity)

@dataclass(frozen=True)
class JaccardLsh(Base):
field: str
7 changes: 0 additions & 7 deletions client-python/elastiknn/models.py
@@ -96,13 +96,6 @@ def _mk_mapping_query(self, query_params: dict()) -> (Mapping.Base, NearestNeigh
return Mapping.SparseBool(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Jaccard)
elif self._metric == 'hamming':
return Mapping.SparseBool(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Hamming)
elif self._algorithm == 'sparse_indexed':
if self._metric == 'jaccard':
return Mapping.SparseIndexed(self._dims), NearestNeighborsQuery.SparseIndexed(field, dummy,
Similarity.Jaccard)
elif self._metric == 'hamming':
return Mapping.SparseIndexed(self._dims), NearestNeighborsQuery.SparseIndexed(field, dummy,
Similarity.Hamming)
elif self._algorithm == 'lsh':
if self._metric == 'l2':
m, q = Mapping.L2Lsh(self._dims, **self._mapping_params), \
2 changes: 0 additions & 2 deletions client-python/elastiknn/utils.py
@@ -15,8 +15,6 @@
('exact', 'cosine'),
('exact', 'hamming'),
('exact', 'jaccard'),
('sparse_indexed', 'jaccard'),
('sparse_indexed', 'hamming'),
('lsh', 'l2'),
('lsh', 'cosine'),
('lsh', 'jaccard'),
59 changes: 2 additions & 57 deletions docs/pages/api.md
@@ -113,7 +113,7 @@ PUT /my-index/_mapping
"type": "elastiknn_sparse_bool_vector", # 3
"elastiknn": { # 4
"dims": 100, # 5
"model": "sparse_indexed", # 6
"model": "exact", # 6
... # 7
}
}
@@ -211,33 +211,6 @@ PUT /my-index/_mapping
|1|Vector datatype. Both dense float and sparse bool are supported|
|2|Vector dimensionality.|

### Sparse Indexed Mapping

The sparse indexed model introduces an obvious optimization for exact queries on sparse bool vectors.
It indexes each of the true indices as a Lucene term, basically treating them like [Elasticsearch keywords](https://www.elastic.co/guide/en/elasticsearch/reference/current/keyword.html). Jaccard and Hamming similarity both require computing the intersection of the query vector against all indexed vectors, and indexing the true indices makes this operation much more efficient. However, you must consider that there is an upper bound on the number of possible terms in a term query, [see the `index.max_terms_count` setting.](https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules.html#index-max-terms-count)
If the number of true indices in your vectors exceeds this limit, you'll have to adjust it or you'll encounter failed queries.

```json
PUT /my-index/_mapping
{
"properties": {
"my_vec": {
"type": "elastiknn_sparse_bool_vector", # 1
"elastiknn": {
"dims": 25000, # 2
"model": "sparse_indexed", # 3
}
}
}
}
```

|#|Description|
|:--|:--|
|1|Vector datatype. Must be sparse bool vector.|
|2|Vector dimensionality.|
|3|Model type. This model has no additional parameters.|
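For reference, the intersection-based similarities that the deleted section describes can be sketched in plain Python over sorted true-index lists. This is an illustration only, not the plugin's Lucene-based implementation:

```python
def intersection_count(a, b):
    """Count common elements of two sorted lists of true indices."""
    i = j = n = 0
    while i < len(a) and j < len(b):
        if a[i] == b[j]:
            n, i, j = n + 1, i + 1, j + 1
        elif a[i] < b[j]:
            i += 1
        else:
            j += 1
    return n

def jaccard(a, b):
    """Jaccard similarity: |A intersect B| / |A union B|."""
    inter = intersection_count(a, b)
    return inter / (len(a) + len(b) - inter)

def hamming_similarity(a, b, dims):
    """Fraction of the dims positions on which two sparse bool vectors agree."""
    inter = intersection_count(a, b)
    # Agreeing positions: shared true indices plus shared false indices.
    agree = inter + (dims - (len(a) + len(b) - inter))
    return agree / dims
```

Sorting the true indices first (as the plugin does on the query side) is what makes the linear-merge intersection possible.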

### Jaccard LSH Mapping

Uses the [Minhash algorithm](https://en.wikipedia.org/wiki/MinHash) to hash and store sparse bool vectors such that they
@@ -557,34 +530,6 @@ GET /my-index/_search
|2|Model name.|
|3|Similarity function. Must be compatible with the vector type.|

### Sparse Indexed Query

Computes the exact similarity of sparse bool vectors using a Lucene Boolean Query to compute the size of the intersection of true indices in the query vector against true indices in the indexed vectors.

```json
GET /my-index/_search
{
"query": {
"elastiknn_nearest_neighbors": {
"field": "my_vec", # 1
"vec": { # 2
"true_indices": [1, 3, 5, ...],
"total_indices": 100
},
"model": "sparse_indexed", # 3
"similarity": "(jaccard | hamming)", # 4
}
}
}
```

|#|Description|
|:--|:--|
|1|Indexed field. Must use `sparse_indexed` mapping model.|
|2|Query vector. Must be literal sparse bool or a pointer to an indexed sparse bool vector.|
|3|Model name.|
|4|Similarity function. Must be jaccard or hamming.|
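With the `sparse_indexed` model removed, the exact model documented above covers the same sparse bool vectors. A minimal sketch of the replacement request body, using a hypothetical helper name `exact_nn_query`:

```python
def exact_nn_query(field, true_indices, total_indices, similarity):
    """Build the body of an exact nearest-neighbors query on a sparse
    bool vector -- the replacement for the removed sparse_indexed model."""
    return {
        "query": {
            "elastiknn_nearest_neighbors": {
                "field": field,
                "vec": {
                    "true_indices": true_indices,
                    "total_indices": total_indices,
                },
                "model": "exact",
                "similarity": similarity,
            }
        }
    }
```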

### LSH Search Strategy

All LSH search models follow roughly the same strategy.
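That shared strategy — select candidates by counting hash matches, then rerank them with the exact similarity — can be sketched as follows. This is a simplified, assumption-laden illustration (plain dicts in place of a Lucene index), not the plugin's implementation:

```python
from collections import Counter

def lsh_search(query_hashes, hash_index, vectors, num_candidates,
               exact_sim, query_vec):
    """Count hash matches per doc, keep the top candidates, rerank exactly.

    hash_index: hash value -> iterable of doc ids holding that hash.
    vectors:    doc id -> stored vector, used for exact reranking.
    """
    counts = Counter()
    for h in query_hashes:
        for doc_id in hash_index.get(h, ()):
            counts[doc_id] += 1
    candidates = [doc_id for doc_id, _ in counts.most_common(num_candidates)]
    # Rerank the candidate set with the exact similarity function.
    return sorted(candidates,
                  key=lambda d: exact_sim(query_vec, vectors[d]),
                  reverse=True)
```

The `candidates` parameter trades recall for speed: more candidates means more exact comparisons but a better chance of finding the true nearest neighbors.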
@@ -940,4 +885,4 @@ Obviously this has an upper limit, but the general performance implications of s

---

[^note-angular-cosine]: Cosine similarity used to be (incorrectly) called "angular" similarity. All references to "angular" were renamed to "Cosine" in 7.13.3.1. You can still use "angular" in the JSON/HTTP API; it will convert to "cosine" internally.
[^note-angular-cosine]: Cosine similarity used to be (incorrectly) called "angular" similarity. All references to "angular" were renamed to "Cosine" in 7.13.3.2. You can still use "angular" in the JSON/HTTP API; it will convert to "cosine" internally.
2 changes: 1 addition & 1 deletion docs/pages/installation.md
@@ -31,7 +31,7 @@ Make a Dockerfile like below. The image version (`elasticsearch:A.B.C`) must mat

```docker
FROM docker.elastic.co/elasticsearch/elasticsearch:7.13.3
RUN elasticsearch-plugin install --batch https://github.com/alexklibisz/elastiknn/releases/download/7.13.3.1/elastiknn-7.13.3.1.zip
RUN elasticsearch-plugin install --batch https://github.com/alexklibisz/elastiknn/releases/download/7.13.3.2/elastiknn-7.13.3.2.zip
```

Build and run the Dockerfile. If you have any issues please refer to the [official docs.](https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html)
@@ -35,7 +35,6 @@ private object Keys {
val MODEL = "model"
val QUERY_OPTIONS = "query_options"
val SIMILARITY = "similarity"
val SPARSE_INDEXED = "sparse_indexed"
val TYPE = "type"
val VEC = "vec"
}
@@ -158,7 +157,6 @@ object ElasticsearchCodec { esc =>

implicit val mappingSparseBool: ESC[Mapping.SparseBool] = ElasticsearchCodec(deriveCodec)
implicit val mappingDenseFloat: ESC[Mapping.DenseFloat] = ElasticsearchCodec(deriveCodec)
implicit val mappingSparseIndexed: ESC[Mapping.SparseIndexed] = ElasticsearchCodec(deriveCodec)
implicit val mappingJaccardLsh: ESC[Mapping.JaccardLsh] = ElasticsearchCodec(deriveCodec)
implicit val mappingHammingLsh: ESC[Mapping.HammingLsh] = ElasticsearchCodec(deriveCodec)
implicit val mappingCosineLsh: ESC[Mapping.CosineLsh] = ElasticsearchCodec(deriveCodec)
@@ -170,8 +168,6 @@ object ElasticsearchCodec { esc =>
t match {
case m: Mapping.SparseBool => JsonObject(TYPE -> EKNN_SPARSE_BOOL_VECTOR, ELASTIKNN_NAME -> esc.encode(m))
case m: Mapping.DenseFloat => JsonObject(TYPE -> EKNN_DENSE_FLOAT_VECTOR, ELASTIKNN_NAME -> esc.encode(m))
case m: Mapping.SparseIndexed =>
JsonObject(TYPE -> EKNN_SPARSE_BOOL_VECTOR, ELASTIKNN_NAME -> (esc.encode(m) ++ JsonObject(MODEL -> SPARSE_INDEXED)))
case m: Mapping.JaccardLsh =>
JsonObject(TYPE -> EKNN_SPARSE_BOOL_VECTOR, ELASTIKNN_NAME -> (esc.encode(m) ++ JsonObject(MODEL -> LSH, SIMILARITY -> JACCARD)))
case m: Mapping.HammingLsh =>
@@ -195,8 +191,6 @@ object ElasticsearchCodec { esc =>
esc.decode[Mapping.SparseBool](c)
case (EKNN_DENSE_FLOAT_VECTOR, None, None) =>
esc.decode[Mapping.DenseFloat](c)
case (EKNN_SPARSE_BOOL_VECTOR, Some(SPARSE_INDEXED), None) =>
esc.decode[Mapping.SparseIndexed](c)
case (EKNN_SPARSE_BOOL_VECTOR, Some(LSH), Some(Similarity.Jaccard)) =>
esc.decode[Mapping.JaccardLsh](c)
case (EKNN_SPARSE_BOOL_VECTOR, Some(LSH), Some(Similarity.Hamming)) =>
@@ -214,7 +208,6 @@
}

implicit val queryExact: ESC[NearestNeighborsQuery.Exact] = ElasticsearchCodec(deriveCodec)
implicit val querySparseIndexed: ESC[NearestNeighborsQuery.SparseIndexed] = ElasticsearchCodec(deriveCodec)
implicit val queryJaccardLsh: ESC[NearestNeighborsQuery.JaccardLsh] = {
implicit val cfg: Configuration = Configuration.default.withDefaults
ElasticsearchCodec(deriveConfiguredCodec)
@@ -241,7 +234,6 @@ object ElasticsearchCodec { esc =>
val default = JsonObject(FIELD -> a.field, VEC -> esc.encode(a.vec), SIMILARITY -> esc.encode(a.similarity))
a match {
case q: NearestNeighborsQuery.Exact => JsonObject(MODEL -> EXACT) ++ (default ++ esc.encode(q))
case q: NearestNeighborsQuery.SparseIndexed => JsonObject(MODEL -> SPARSE_INDEXED) ++ (default ++ esc.encode(q))
case q: NearestNeighborsQuery.JaccardLsh => JsonObject(MODEL -> LSH) ++ (default ++ esc.encode(q))
case q: NearestNeighborsQuery.HammingLsh => JsonObject(MODEL -> LSH) ++ (default ++ esc.encode(q))
case q: NearestNeighborsQuery.CosineLsh => JsonObject(MODEL -> LSH) ++ (default ++ esc.encode(q))
@@ -255,7 +247,6 @@ object ElasticsearchCodec { esc =>
sim <- c.downField(SIMILARITY).as[Json].flatMap(esc.decodeJson[Similarity])
nnq <- model match {
case EXACT => esc.decode[NearestNeighborsQuery.Exact](c)
case SPARSE_INDEXED => esc.decode[NearestNeighborsQuery.SparseIndexed](c)
case PERMUTATION_LSH => esc.decode[NearestNeighborsQuery.PermutationLsh](c)
case LSH =>
sim match {
@@ -265,7 +256,7 @@
case Similarity.L2 => esc.decode[NearestNeighborsQuery.L2Lsh](c)
case other => fail(s"$SIMILARITY [$other] is not compatible with $MODEL [$LSH]")
}
case other => failTypes(MODEL, Seq(EXACT, SPARSE_INDEXED, LSH), other)
case other => failTypes(MODEL, Seq(EXACT, LSH), other)
}
} yield nnq
}
@@ -107,7 +107,6 @@ package object api {
}
object Mapping {
final case class SparseBool(dims: Int) extends Mapping
final case class SparseIndexed(dims: Int) extends Mapping
final case class JaccardLsh(dims: Int, L: Int, k: Int) extends Mapping
final case class HammingLsh(dims: Int, L: Int, k: Int) extends Mapping
final case class DenseFloat(dims: Int) extends Mapping
@@ -127,10 +126,6 @@ package object api {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
}

final case class SparseIndexed(field: String, similarity: Similarity, vec: Vec = Vec.Empty()) extends NearestNeighborsQuery {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
}

sealed trait ApproximateQuery extends NearestNeighborsQuery {
def candidates: Int
def withCandidates(candidates: Int): ApproximateQuery
@@ -33,7 +33,6 @@ package object benchmarks {
import NearestNeighborsQuery._
nnq match {
case _: Exact => "Exact"
case _: SparseIndexed => "Sparse Indexed"
case _: HammingLsh | _: JaccardLsh | _: CosineLsh | _: L2Lsh => "LSH"
case _: PermutationLsh => "Permutation LSH"
}
@@ -5,7 +5,7 @@ import com.klibisz.elastiknn._
import com.klibisz.elastiknn.api.ElasticsearchCodec._
import com.klibisz.elastiknn.api.{ElasticsearchCodec, JavaJsonMap, Mapping, Vec}
import com.klibisz.elastiknn.models.Cache
import com.klibisz.elastiknn.query.{ExactQuery, HashingQuery, SparseIndexedQuery}
import com.klibisz.elastiknn.query.{ExactQuery, HashingQuery}
import io.circe.syntax._
import io.circe.{Json, JsonObject}
import org.apache.lucene.document.{FieldType => LuceneFieldType}
@@ -34,8 +34,7 @@ object VectorMapper {
else {
val sorted = vec.sorted() // Sort for faster intersections on the query side.
mapping match {
case Mapping.SparseBool(_) => Try(ExactQuery.index(field, sorted))
case Mapping.SparseIndexed(_) => Try(SparseIndexedQuery.index(field, luceneFieldType, sorted))
case Mapping.SparseBool(_) => Try(ExactQuery.index(field, sorted))
case m: Mapping.JaccardLsh =>
Try(HashingQuery.index(field, luceneFieldType, sorted, Cache(m).hash(vec.trueIndices, vec.totalIndices)))
case m: Mapping.HammingLsh =>

This file was deleted.

@@ -4,7 +4,7 @@ import com.klibisz.elastiknn.ElastiknnException.ElastiknnRuntimeException
import com.klibisz.elastiknn.api.NearestNeighborsQuery._
import com.klibisz.elastiknn.api._
import com.klibisz.elastiknn.mapper.VectorMapper
import com.klibisz.elastiknn.models.{Cache, SparseIndexedSimilarityFunction}
import com.klibisz.elastiknn.models.Cache
import com.klibisz.elastiknn.models.{ExactSimilarityFunction => ESF}
import org.apache.lucene.index.IndexReader
import org.apache.lucene.search.Query
@@ -54,13 +54,13 @@ object ElastiknnQuery {

case (
Exact(f, Similarity.Jaccard, v: Vec.SparseBool),
_: Mapping.SparseBool | _: Mapping.SparseIndexed | _: Mapping.JaccardLsh | _: Mapping.HammingLsh
_: Mapping.SparseBool | _: Mapping.JaccardLsh | _: Mapping.HammingLsh
) =>
new ExactQuery(f, v, ESF.Jaccard)

case (
Exact(f, Similarity.Hamming, v: Vec.SparseBool),
_: Mapping.SparseBool | _: Mapping.SparseIndexed | _: Mapping.JaccardLsh | _: Mapping.HammingLsh
_: Mapping.SparseBool | _: Mapping.JaccardLsh | _: Mapping.HammingLsh
) =>
new ExactQuery(f, v, ESF.Hamming)

@@ -82,12 +82,6 @@
) =>
new ExactQuery(f, v, ESF.Cosine)

case (SparseIndexed(f, Similarity.Jaccard, sbv: Vec.SparseBool), _: Mapping.SparseIndexed) =>
new SparseIndexedQuery(f, sbv, SparseIndexedSimilarityFunction.Jaccard)

case (SparseIndexed(f, Similarity.Hamming, sbv: Vec.SparseBool), _: Mapping.SparseIndexed) =>
new SparseIndexedQuery(f, sbv, SparseIndexedSimilarityFunction.Hamming)

case (JaccardLsh(f, candidates, v: Vec.SparseBool), m: Mapping.JaccardLsh) =>
new HashingQuery(f, v, candidates, Cache(m).hash(v.trueIndices, v.totalIndices), ESF.Jaccard)

