From 4480762bf85b111d58fb1737bd32117ceb1cab15 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Mon, 5 Dec 2011 13:31:59 +0200 Subject: [PATCH] first commit --- .gitignore | 7 + README.md | 15 ++ pom.xml | 128 ++++++++++++++++++ src/main/assemblies/plugin.xml | 26 ++++ .../analysis/IcuAnalysisBinderProcessor.java | 38 ++++++ .../IcuCollationTokenFilterFactory.java | 106 +++++++++++++++ .../IcuFoldingTokenFilterFactory.java | 45 ++++++ .../IcuNormalizerTokenFilterFactory.java | 52 +++++++ .../analysis/icu/AnalysisICUPlugin.java | 49 +++++++ src/main/resources/es-plugin.properties | 1 + .../analysis/SimpleIcuAnalysisTests.java | 59 ++++++++ src/test/resources/log4j.properties | 5 + 12 files changed, 531 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 pom.xml create mode 100644 src/main/assemblies/plugin.xml create mode 100644 src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java create mode 100644 src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java create mode 100644 src/main/resources/es-plugin.properties create mode 100644 src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java create mode 100644 src/test/resources/log4j.properties diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..06a1e6f --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +/data +/work +/logs +/.idea +/target +.DS_Store +*.iml diff --git a/README.md b/README.md new file mode 100644 index 0000000..50bb848 --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +ICU Analysis for ElasticSearch +================================== + +The ICU Analysis plugin integrates Lucene ICU module 
into elasticsearch, adding ICU related analysis components. + +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.0.0`. + + --------------------------------------- + | ICU Analysis Plugin | ElasticSearch | + --------------------------------------- + | master | 0.18 -> master | + --------------------------------------- + | 1.0.0 | 0.18 -> master | + --------------------------------------- + diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..da5bdfd --- /dev/null +++ b/pom.xml @@ -0,0 +1,128 @@ + + + elasticsearch-analysis-icu + 4.0.0 + org.elasticsearch + elasticsearch-analysis-icu + 1.0.0 + jar + ICU Analysis for ElasticSearch + 2009 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + scm:git:git@github.com:elasticsearch/elasticsearch-analysis-icu.git + scm:git:git@github.com:elasticsearch/elasticsearch-analysis-icu.git + + http://github.com/elasticsearch/elasticsearch-analysis-icu + + + + org.sonatype.oss + oss-parent + 7 + + + + 0.18.5 + + + + + + + + org.elasticsearch + elasticsearch + ${elasticsearch.version} + compile + + + + org.apache.lucene + lucene-icu + 3.5.0 + compile + + + + log4j + log4j + 1.2.16 + runtime + + + + org.testng + testng + 6.3.1 + test + + + + org.hamcrest + hamcrest-core + 1.3.RC2 + test + + + + org.hamcrest + hamcrest-library + 1.3.RC2 + test + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.3.2 + + 1.6 + 1.6 + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.11 + + + **/*Tests.java + + + + + org.apache.maven.plugins + maven-source-plugin + 2.1.2 + + + attach-sources + + jar + + + + + + maven-assembly-plugin + + + ${basedir}/src/main/assemblies/plugin.xml + + + + + + \ No newline at end of file diff --git a/src/main/assemblies/plugin.xml b/src/main/assemblies/plugin.xml new file mode 100644 index 0000000..c5c3a71 --- /dev/null +++ b/src/main/assemblies/plugin.xml @@ -0,0 +1,26 @@ + + 
+ + + zip + + false + + + / + true + true + + org.elasticsearch:elasticsearch + + + + / + true + true + + org.apache.lucene:lucene-icu + + + + \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java new file mode 100644 index 0000000..111607c --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java @@ -0,0 +1,38 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +/** + * Binds the ICU normalizer, folding and collation token filter factories, each under a camelCase and an underscore-separated name. + */ +public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor { + + @Override + public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { + tokenFiltersBindings.processTokenFilter("icuNormalizer", IcuNormalizerTokenFilterFactory.class); + tokenFiltersBindings.processTokenFilter("icu_normalizer", IcuNormalizerTokenFilterFactory.class); + + tokenFiltersBindings.processTokenFilter("icuFolding", IcuFoldingTokenFilterFactory.class); + tokenFiltersBindings.processTokenFilter("icu_folding", IcuFoldingTokenFilterFactory.class); + + tokenFiltersBindings.processTokenFilter("icuCollation", IcuCollationTokenFilterFactory.class); + tokenFiltersBindings.processTokenFilter("icu_collation", IcuCollationTokenFilterFactory.class); + } +} diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java new file mode 100644 index 0000000..3517c01 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java @@ -0,0 +1,106 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RuleBasedCollator; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.collation.ICUCollationKeyFilter; +import org.elasticsearch.ElasticSearchIllegalArgumentException; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.FailedToResolveConfigException; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +import java.io.IOException; +import java.util.Locale; + +/** + * An ICU based collation token filter. There are two ways to configure collation: + * <p/> + * <p>The first is simply specifying the locale (defaults to the default locale). The language + * parameter is the lowercase two-letter ISO-639 code. An additional country and variant + * can be provided. + * <p/> + * <p>The second option is to specify collation rules as defined in the + * Collation customization chapter in icu docs. The rules parameter can either embed the rules definition + * in the settings or refer to an external location (preferably located under the config location, relative to it). + * If the rules value cannot be resolved as a config location it is parsed as an inline rule definition; + * when neither rules nor language is set, {@link Collator#getInstance()} is used. + */ +public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { + + private final Collator collator; + + @Inject + public IcuCollationTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment environment, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + + Collator collator; + String rules = settings.get("rules"); + if (rules != null) { + FailedToResolveConfigException failureToResolve = null; + try { + rules = environment.resolveConfigAndLoadToString(rules); + } catch (FailedToResolveConfigException e) { + failureToResolve = e; /* not fatal yet: the value may be an inline rule definition, parsed below */ + } catch (IOException e) { + throw new ElasticSearchIllegalArgumentException("Failed to load collation rules", e); + } + try { + collator = new RuleBasedCollator(rules); + } catch (Exception e) { + if (failureToResolve != null) { + throw new ElasticSearchIllegalArgumentException("Failed to resolve collation rules location", failureToResolve); + } else { + throw new ElasticSearchIllegalArgumentException("Failed to parse collation rules", e); + } + } + } else { + String language = settings.get("language"); + if (language != null) { + Locale locale; + String country = settings.get("country"); + if (country != null) { + String variant = settings.get("variant"); + if (variant != null) { + locale = new Locale(language, country, variant); + } else { + locale = new Locale(language, country); + } + } else { + locale = new Locale(language); + } + collator = Collator.getInstance(locale); + } else { + collator = Collator.getInstance(); + } + } + this.collator = collator; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new 
ICUCollationKeyFilter(tokenStream, collator); + } +} diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java new file mode 100644 index 0000000..ed11a22 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java @@ -0,0 +1,45 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.icu.ICUFoldingFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + + +/** + * Token filter factory that wraps the incoming stream in a Lucene {@link ICUFoldingFilter}; it reads no settings of its own. + */ +public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory { + + @Inject + public IcuFoldingTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new ICUFoldingFilter(tokenStream); + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java new file mode 100644 index 0000000..b28e7c9 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java @@ -0,0 +1,52 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import com.ibm.icu.text.Normalizer2; +import org.apache.lucene.analysis.TokenStream; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + + +/** + * Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to normalize tokens. + * <p/> + * <p>The name setting selects the type of normalization to perform (defaults to nfkc_cf). + * The normalizer is looked up once, when the factory is constructed, so an unknown name is + * reported at construction time instead of when the first token stream is created. + */ +public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory { + + private final Normalizer2 normalizer; + + @Inject + public IcuNormalizerTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + this.normalizer = Normalizer2.getInstance(null, settings.get("name", "nfkc_cf"), Normalizer2.Mode.COMPOSE); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, normalizer); + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java b/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java new file mode 100644 index 0000000..f24852d --- /dev/null +++ b/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java @@ -0,0 +1,49 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.plugin.analysis.icu; + +import org.elasticsearch.common.inject.Module; +import org.elasticsearch.index.analysis.AnalysisModule; +import org.elasticsearch.index.analysis.IcuAnalysisBinderProcessor; +import org.elasticsearch.plugins.AbstractPlugin; + +/** + * Plugin named {@code analysis-icu} that adds an {@link IcuAnalysisBinderProcessor} to every {@link AnalysisModule} it is asked to process. + */ +public class AnalysisICUPlugin extends AbstractPlugin { + + @Override + public String name() { + return "analysis-icu"; + } + + @Override + public String description() { + return "UTF related ICU analysis support"; + } + + @Override + public void processModule(Module module) { + if (module instanceof AnalysisModule) { + AnalysisModule analysisModule = (AnalysisModule) module; + analysisModule.addProcessor(new IcuAnalysisBinderProcessor()); + } + } +} diff --git a/src/main/resources/es-plugin.properties b/src/main/resources/es-plugin.properties new file mode 100644 index 0000000..b694c79 --- /dev/null +++ b/src/main/resources/es-plugin.properties @@ -0,0 +1 @@ +plugin=org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java new file mode 100644 index 0000000..adcd03a --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java @@ -0,0 +1,59 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.elasticsearch.common.inject.Injector; +import org.elasticsearch.common.inject.ModulesBuilder; +import org.elasticsearch.common.settings.SettingsModule; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.EnvironmentModule; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.IndexNameModule; +import org.elasticsearch.index.settings.IndexSettingsModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisService; +import org.hamcrest.MatcherAssert; +import org.testng.annotations.Test; + +import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS; +import static org.hamcrest.Matchers.instanceOf; + +/** + * Builds an {@link AnalysisService} with the ICU binder processor added and checks that the icu_normalizer token filter resolves to {@link IcuNormalizerTokenFilterFactory}. + */ +public class SimpleIcuAnalysisTests { + + @Test + public void testDefaultsIcuAnalysis() { + Index index = new Index("test"); + + Injector parentInjector = new ModulesBuilder().add(new SettingsModule(EMPTY_SETTINGS), new EnvironmentModule(new Environment(EMPTY_SETTINGS)), new IndicesAnalysisModule()).createInjector(); + Injector injector = new ModulesBuilder().add( + new IndexSettingsModule(index, EMPTY_SETTINGS), + new IndexNameModule(index), + new AnalysisModule(EMPTY_SETTINGS, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor())) + .createChildInjector(parentInjector); + + AnalysisService analysisService = injector.getInstance(AnalysisService.class); + + TokenFilterFactory filterFactory = 
analysisService.tokenFilter("icu_normalizer"); + MatcherAssert.assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class)); + } +} diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties new file mode 100644 index 0000000..497c97f --- /dev/null +++ b/src/test/resources/log4j.properties @@ -0,0 +1,5 @@ +log4j.rootLogger=INFO, out + +log4j.appender.out=org.apache.log4j.ConsoleAppender +log4j.appender.out.layout=org.apache.log4j.PatternLayout +log4j.appender.out.layout.conversionPattern=[%d{ISO8601}][%-5p][%-25c] %m%n