Skip to content

Commit

Permalink
#348: Dependencies: Upgrade Elasticsearch to 8.0.0 (#347)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexklibisz committed Sep 16, 2022
1 parent 3e1fbd2 commit a760195
Show file tree
Hide file tree
Showing 24 changed files with 110 additions and 195 deletions.
11 changes: 6 additions & 5 deletions build.gradle
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
buildscript {
ext {
esVersion = '7.17.6'
luceneVersion = '8.11.1'
elastic4sVersion = '7.17.3'
esVersion = '8.0.0'
luceneVersion = '9.0.0'
elastic4sVersion = '8.0.0'
circeVersion= '0.14.1'
circeGenericExtrasVersion= '0.14.1'
scalaShortVersion = '2.13'
Expand Down Expand Up @@ -137,6 +137,7 @@ configure(plugin, List.of(scalaProjectConfig, {
implementation models
implementation "com.google.guava:guava:28.1-jre"
implementation "org.scala-lang:scala-library:${scalaFullVersion}"
implementation "org.apache.lucene:lucene-backward-codecs:${luceneVersion}"
runtimeOnly "com.google.guava:failureaccess:1.0.1"
runtimeOnly "org.scala-lang:scala-library:${scalaFullVersion}"
}
Expand Down Expand Up @@ -170,8 +171,8 @@ configure(testing, List.of(scalaProjectConfig, {
implementation "io.circe:circe-generic-extras_${scalaShortVersion}:${circeGenericExtrasVersion}"
implementation "io.circe:circe-parser_${scalaShortVersion}:${circeVersion}"
implementation 'org.apache.commons:commons-math3:3.6.1'
implementation "org.apache.lucene:lucene-analyzers-common:${luceneVersion}"
implementation "org.apache.lucene:lucene-codecs:${luceneVersion}"
implementation "org.apache.lucene:lucene-analysis-common:${luceneVersion}"
implementation "org.apache.lucene:lucene-backward-codecs:${luceneVersion}"
implementation "org.elasticsearch:elasticsearch:${esVersion}"
implementation 'org.pegdown:pegdown:1.4.2'
implementation "org.scala-lang:scala-library:${scalaFullVersion}"
Expand Down
30 changes: 15 additions & 15 deletions client-python/elastiknn/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def __init__(self, es: Elasticsearch = None):
Defaults to a client pointing at http://localhost:9200.
"""
if es is None:
self.es = Elasticsearch(["http://localhost:9200"], timeout=99)
self.es = Elasticsearch(["http://localhost:9200"], request_timeout=99)
else:
self.es = es

Expand All @@ -48,16 +48,14 @@ def put_mapping(self, index: str, vec_field: str, mapping: Mapping.Base, stored_
Dict
Json response as a dict. Successful request returns `{"acknowledged": true}`.
"""
body = {
"properties": {
vec_field: mapping.to_dict(),
stored_id_field: {
"type": "keyword",
"store": True
}
properties = {
vec_field: mapping.to_dict(),
stored_id_field: {
"type": "keyword",
"store": True
}
}
return self.es.indices.put_mapping(body, index=index)
return self.es.indices.put_mapping(properties=properties, index=index)

def index(self, index: str, vec_field: str, vecs: Iterable[Vec.Base], stored_id_field: str, ids: Iterable[str], refresh: bool = False) -> Tuple[int, List[Dict]]:
"""Index (i.e. store) the given vectors at the given index and field with the optional ids.
Expand Down Expand Up @@ -117,14 +115,16 @@ def nearest_neighbors(self, index: str, query: NearestNeighborsQuery.Base, store
Dict
Standard Elasticsearch search response parsed as a dict.
"""
body = {
"query": {
"elastiknn_nearest_neighbors": query.to_dict()
}
query = {
"elastiknn_nearest_neighbors": query.to_dict()
}
if fetch_source:
return self.es.search(index=index, body=body, size=k)
return self.es.search(index=index, query=query, size=k)
else:
return self.es.search(index=index, body=body, size=k, _source=fetch_source, docvalue_fields=stored_id_field,
return self.es.search(index=index,
query=query,
size=k,
_source=fetch_source,
docvalue_fields=[stored_id_field],
stored_fields="_none_",
filter_path=[f'hits.hits.fields.{stored_id_field}', 'hits.hits._score'])
8 changes: 4 additions & 4 deletions client-python/elastiknn/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,15 @@ def fit(self, X: Union[np.ndarray, csr_matrix, List[Vec.SparseBool], List[Vec.De
self._index = f"{ELASTIKNN_NAME}-{int(time())}"
self._logger.warning(f"index was not given, using {self._index} instead")

self._eknn.es.indices.delete(self._index, ignore=[400, 404])
body = dict(settings=dict(number_of_shards=shards, elastiknn=True, number_of_replicas=0))
self._eknn.es.indices.create(self._index, body=json.dumps(body))
self._eknn.es.indices.delete(index=self._index, ignore_unavailable=True)
# The 8.x Python client takes index settings as a keyword argument rather than a serialized JSON body.
self._eknn.es.indices.create(index=self._index, settings=dict(number_of_shards=shards, elastiknn=True, number_of_replicas=0))
self._eknn.put_mapping(self._index, self._vec_field, mapping, self._stored_id_field)

self._logger.info(f"indexing {len(X)} vectors into index {self._index}")
ids = map(lambda i: str(i + 1), range(len(X))) # Add one because 0 is an invalid id in ES.
self._eknn.index(self._index, self._vec_field, vecs, self._stored_id_field, ids, refresh=True)
self._eknn.es.indices.forcemerge(self._index, params=dict(max_num_segments=1))
self._eknn.es.indices.forcemerge(index=self._index, max_num_segments=1)
self._eknn.index(self._index, self._vec_field, [], self._stored_id_field, [], refresh=True)


Expand Down
2 changes: 1 addition & 1 deletion client-python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
elasticsearch==7.17.4
elasticsearch==8.0.0
dataclasses-json==0.3.7
tqdm==4.61.1
scipy==1.7.0
4 changes: 2 additions & 2 deletions client-python/tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ def test_exact_jaccard(self):
id_field = "id"
mapping = Mapping.SparseBool(dims=dim)

eknn.es.indices.delete(index, ignore=[400, 404])
eknn.es.indices.delete(index=index, ignore_unavailable=True)
eknn.es.indices.refresh()
eknn.es.indices.create(index)
eknn.es.indices.create(index=index)
eknn.es.indices.refresh()
m = eknn.put_mapping(index, vec_field, mapping, "id")

Expand Down
7 changes: 2 additions & 5 deletions docs/pages/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,15 @@ PUT /my-index
{
"settings": {
"index": {
"number_of_shards": 1, # 1
"elastiknn": true # 2

"number_of_shards": 1 # 1
}
}
}
```

|#|Description|
|:--|:--|
|1|The number of shards in your index. Like all Elasticsearch queries, Elastiknn queries execute once per shard in parallel. This means you can generally speed up your queries by adding more shards to the index.|
|2|Setting this to `true` (default is `false`) yields a significant performance improvement for Elastiknn on Elasticsearch versions 7.7.x and beyond. The reason is a bit involved: Elastiknn stores vectors as binary doc values. Setting this to `true` tells Elastiknn to use a non-default Lucene setting to store doc values. Specifically, it uses the `Lucene87Codec` with `BEST_SPEED` instead of `BEST_COMPRESSION`. The default `BEST_COMPRESSION` setting saves space on disk but makes reading vectors significantly slower. If you really need to save space on disk or need to [freeze](https://www.elastic.co/guide/en/elasticsearch/reference/current/freeze-index-api.html) the index, then you should set this to `false`.|
|1|The number of shards in your index. Like all Elasticsearch queries, Elastiknn queries execute in parallel across shards. This means you can generally speed up your queries by adding more shards to the index.|

## Vectors

Expand Down
4 changes: 2 additions & 2 deletions docs/pages/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ Make a Dockerfile like below. The image version (`elasticsearch:A.B.C`) must mat
`A.B.C` is the Elasticsearch version. `.x` just refers to an incremental version of Elastiknn on top of `A.B.C`.

```docker
FROM docker.elastic.co/elasticsearch/elasticsearch:7.17.6
RUN elasticsearch-plugin install --batch https://github.com/alexklibisz/elastiknn/releases/download/7.17.6.0/elastiknn-7.17.6.0.zip
FROM docker.elastic.co/elasticsearch/elasticsearch:8.0.0
RUN elasticsearch-plugin install --batch https://github.com/alexklibisz/elastiknn/releases/download/8.0.0.0/elastiknn-8.0.0.0.zip
```

Build and run the Dockerfile. If you have any issues please refer to the [official docs.](https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ trait ElastiknnClient[F[_]] extends AutoCloseable {
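* @param elastiknn Value for `index.elastiknn` setting, true by default. (Stale: the updated `createIndex` signature no longer accepts this parameter, and the `index.elastiknn` setting is now a no-op.)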
* @return CreateIndexResponse
*/
def createIndex(index: String, shards: Int = 1, replicas: Int = 0, elastiknn: Boolean = true): F[Response[CreateIndexResponse]] =
execute(ElasticDsl.createIndex(index).shards(shards).replicas(replicas).indexSetting("elastiknn", elastiknn))
def createIndex(index: String, shards: Int = 1, replicas: Int = 0): F[Response[CreateIndexResponse]] =
execute(ElasticDsl.createIndex(index).shards(shards).replicas(replicas))

/**
* Index a batch of vectors as new Elasticsearch docs, one doc per vector.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,6 @@ public long cost() {
}
}

@Override
public void extractTerms(Set<Term> terms) { }

@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
HitCounter counter = countHits(context.reader());
Expand Down Expand Up @@ -203,6 +200,11 @@ public boolean isCacheable(LeafReaderContext ctx) {
};
}

@Override
public void visit(QueryVisitor visitor) {

}

@Override
public String toString(String field) {
return String.format(
Expand Down
2 changes: 1 addition & 1 deletion elastiknn-plugin/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
FROM docker.elastic.co/elasticsearch/elasticsearch:7.17.6-amd64
FROM docker.elastic.co/elasticsearch/elasticsearch:8.0.0-amd64
COPY build/distributions/*.zip .
RUN elasticsearch-plugin install -b file:$(ls elastiknn*zip | sort | tail -n1)
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
package com.klibisz.elastiknn

import java.util
import java.util.{Collections, Optional}

import com.klibisz.elastiknn.codec.ElastiknnCodecService
import com.klibisz.elastiknn.mapper.VectorMapper
import com.klibisz.elastiknn.query._
import org.elasticsearch.common.settings.{Setting, Settings}
import org.elasticsearch.index.IndexSettings
import org.elasticsearch.index.engine.{Engine, EngineConfig, EngineFactory, InternalEngine}
import org.elasticsearch.index.engine.EngineFactory
import org.elasticsearch.index.mapper.Mapper
import org.elasticsearch.plugins.SearchPlugin.{QuerySpec, ScoreFunctionSpec}
import org.elasticsearch.plugins._

import java.util
import java.util.{Collections, Optional}

class ElastiknnPlugin(settings: Settings) extends Plugin with SearchPlugin with MapperPlugin with EnginePlugin {

override def getQueries: util.List[SearchPlugin.QuerySpec[_]] =
Expand All @@ -36,40 +35,7 @@ class ElastiknnPlugin(settings: Settings) extends Plugin with SearchPlugin with
)

override def getEngineFactory(indexSettings: IndexSettings): Optional[EngineFactory] = {
if (indexSettings.getValue(ElastiknnPlugin.Settings.elastiknn)) Optional.of {
new EngineFactory {
val codecService = new ElastiknnCodecService
override def newReadWriteEngine(config: EngineConfig): Engine = {
new InternalEngine(
new EngineConfig(
config.getShardId,
config.getThreadPool,
config.getIndexSettings,
config.getWarmer,
config.getStore,
config.getMergePolicy,
config.getAnalyzer,
config.getSimilarity,
codecService,
config.getEventListener,
config.getQueryCache,
config.getQueryCachingPolicy,
config.getTranslogConfig,
config.getFlushMergesAfter,
config.getExternalRefreshListener,
config.getInternalRefreshListener,
config.getIndexSort,
config.getCircuitBreakerService,
config.getGlobalCheckpointSupplier,
config.retentionLeasesSupplier,
config.getPrimaryTermSupplier,
config.getSnapshotCommitSupplier,
config.getLeafSorter
)
)
}
}
}
if (indexSettings.getValue(ElastiknnPlugin.Settings.elastiknn)) Optional.empty()
else Optional.empty()
}
}
Expand All @@ -79,10 +45,11 @@ object ElastiknnPlugin {
object Settings {

// Setting: index.elastiknn
// Determines whether elastiknn can control the codec used for the index.
// Highly recommended to set to true. Elastiknn will still work without it, but will be much slower.
// Previously used to determine whether elastiknn can control the codec used for the index to improve performance.
// Now it's a no-op.
// It was deprecated as part of https://github.com/alexklibisz/elastiknn/issues/254 and
// https://github.com/alexklibisz/elastiknn/issues/348.
val elastiknn: Setting[java.lang.Boolean] =
Setting.boolSetting("index.elastiknn", false, Setting.Property.IndexScope)
}

}
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
package com.klibisz.elastiknn.codec

import org.apache.lucene.codecs._
import org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat
import org.apache.lucene.codecs.lucene84.Lucene84Codec
import org.apache.lucene.backward_codecs.lucene70.Lucene70DocValuesFormat
import org.apache.lucene.backward_codecs.lucene84.Lucene84Codec

class Elastiknn84Codec extends Codec(ElastiknnCodecService.ELASTIKNN_84) {
/**
* No longer used as of Elasticsearch 8.0.0. Kept for backwards-compatibility.
*/
class Elastiknn84Codec extends Codec("Elastiknn84Codec") {
private val luceneCodec: Codec = new Lucene84Codec()
override def docValuesFormat(): DocValuesFormat = new Lucene70DocValuesFormat()
override def postingsFormat(): PostingsFormat = luceneCodec.postingsFormat()
Expand All @@ -16,4 +19,5 @@ class Elastiknn84Codec extends Codec(ElastiknnCodecService.ELASTIKNN_84) {
override def liveDocsFormat(): LiveDocsFormat = luceneCodec.liveDocsFormat()
override def compoundFormat(): CompoundFormat = luceneCodec.compoundFormat()
override def pointsFormat(): PointsFormat = luceneCodec.pointsFormat()
override def knnVectorsFormat(): KnnVectorsFormat = luceneCodec.knnVectorsFormat()
}
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
package com.klibisz.elastiknn.codec

import org.apache.lucene.backward_codecs.lucene70.Lucene70DocValuesFormat
import org.apache.lucene.backward_codecs.lucene86.Lucene86Codec
import org.apache.lucene.codecs._
import org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat
import org.apache.lucene.codecs.lucene86.Lucene86Codec

class Elastiknn86Codec extends Codec(ElastiknnCodecService.ELASTIKNN_86) {
/**
* No longer used as of Elasticsearch 8.0.0. Kept for backwards-compatibility.
*/
class Elastiknn86Codec extends Codec("Elastiknn86Codec") {
private val luceneCodec: Codec = new Lucene86Codec()
override def docValuesFormat(): DocValuesFormat = new Lucene70DocValuesFormat()
override def postingsFormat(): PostingsFormat = luceneCodec.postingsFormat()
Expand All @@ -16,4 +19,5 @@ class Elastiknn86Codec extends Codec(ElastiknnCodecService.ELASTIKNN_86) {
override def liveDocsFormat(): LiveDocsFormat = luceneCodec.liveDocsFormat()
override def compoundFormat(): CompoundFormat = luceneCodec.compoundFormat()
override def pointsFormat(): PointsFormat = luceneCodec.pointsFormat()
override def knnVectorsFormat(): KnnVectorsFormat = luceneCodec.knnVectorsFormat()
}
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
package com.klibisz.elastiknn.codec

import org.apache.lucene.backward_codecs.lucene70.Lucene70DocValuesFormat
import org.apache.lucene.backward_codecs.lucene87.Lucene87Codec
import org.apache.lucene.codecs._
import org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat
import org.apache.lucene.codecs.lucene87.Lucene87Codec

class Elastiknn87Codec extends Codec(ElastiknnCodecService.ELASTIKNN_87) {
/**
* No longer used as of Elasticsearch 8.0.0. Kept for backwards-compatibility.
*/
class Elastiknn87Codec extends Codec("Elastiknn87Codec") {
private val luceneCodec: Codec = new Lucene87Codec()
override def docValuesFormat(): DocValuesFormat = new Lucene70DocValuesFormat()
override def postingsFormat(): PostingsFormat = luceneCodec.postingsFormat()
Expand All @@ -16,4 +19,5 @@ class Elastiknn87Codec extends Codec(ElastiknnCodecService.ELASTIKNN_87) {
override def liveDocsFormat(): LiveDocsFormat = luceneCodec.liveDocsFormat()
override def compoundFormat(): CompoundFormat = luceneCodec.compoundFormat()
override def pointsFormat(): PointsFormat = luceneCodec.pointsFormat()
override def knnVectorsFormat(): KnnVectorsFormat = luceneCodec.knnVectorsFormat()
}
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
package com.klibisz.elastiknn.codec

import org.apache.lucene.backward_codecs.lucene87.Lucene87Codec
import org.apache.lucene.codecs._
import org.apache.lucene.codecs.lucene87.Lucene87Codec

class Elastiknn88Codec extends Codec(ElastiknnCodecService.ELASTIKNN_88) {
/**
* No longer used as of Elasticsearch 8.0.0. Kept for backwards-compatibility.
*/
class Elastiknn88Codec extends Codec("Elastiknn88Codec") {
private val luceneCodec: Codec = new Lucene87Codec(Lucene87Codec.Mode.BEST_SPEED)
override def docValuesFormat(): DocValuesFormat = luceneCodec.docValuesFormat()
override def postingsFormat(): PostingsFormat = luceneCodec.postingsFormat()
Expand All @@ -15,4 +18,5 @@ class Elastiknn88Codec extends Codec(ElastiknnCodecService.ELASTIKNN_88) {
override def liveDocsFormat(): LiveDocsFormat = luceneCodec.liveDocsFormat()
override def compoundFormat(): CompoundFormat = luceneCodec.compoundFormat()
override def pointsFormat(): PointsFormat = luceneCodec.pointsFormat()
override def knnVectorsFormat(): KnnVectorsFormat = luceneCodec.knnVectorsFormat()
}

This file was deleted.

Loading

0 comments on commit a760195

Please sign in to comment.