diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..06a1e6f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+/data
+/work
+/logs
+/.idea
+/target
+.DS_Store
+*.iml
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..50bb848
--- /dev/null
+++ b/README.md
@@ -0,0 +1,15 @@
+ICU Analysis for ElasticSearch
+==================================
+
+The ICU Analysis plugin integrates the Lucene ICU module into elasticsearch, adding ICU-related analysis components.
+
+In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-icu/1.0.0`.
+
+ ---------------------------------------
+ | ICU Analysis Plugin | ElasticSearch |
+ ---------------------------------------
+ | master | 0.18 -> master |
+ ---------------------------------------
+ | 1.0.0 | 0.18 -> master |
+ ---------------------------------------
+
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..da5bdfd
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,128 @@
+
+
The first is simply specifying the locale (defaults to the default locale). The language + * parameter is the lowercase two-letter ISO-639 code. An additional country and variant + * can be provided. + *
+ * The second option is to specify collation rules as defined in the + * Collation customization chapter in icu docs. The rules parameter can either embed the rules definition + * in the settings or refer to an external location (preferably located under the config location, relative to it). + * + * + */ +public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory { + + private final Collator collator; + + @Inject + public IcuCollationTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment environment, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + + Collator collator; + String rules = settings.get("rules"); + if (rules != null) { + FailedToResolveConfigException failureToResolve = null; + try { + rules = environment.resolveConfigAndLoadToString(rules); /* first treat the value as a config location and load its content */ + } catch (FailedToResolveConfigException e) { + failureToResolve = e; /* not resolvable as a location: keep the raw value and try it below as an inline rules definition */ + } catch (IOException e) { + throw new ElasticSearchIllegalArgumentException("Failed to load collation rules", e); + } + try { + collator = new RuleBasedCollator(rules); + } catch (Exception e) { + if (failureToResolve != null) { /* the raw value did not parse as rules either, so report the earlier resolution failure */ + throw new ElasticSearchIllegalArgumentException("Failed to resolve collation rules location", failureToResolve); + } else { + throw new ElasticSearchIllegalArgumentException("Failed to parse collation rules", e); + } + } + } else { + String language = settings.get("language"); + if (language != null) { + Locale locale; + String country = settings.get("country"); + if (country != null) { + String variant = settings.get("variant"); + if (variant != null) { + locale = new Locale(language, country, variant); + } else { + locale = new Locale(language, country); + } + } else { + locale = new Locale(language); + } + collator = Collator.getInstance(locale); + } else { + collator = Collator.getInstance(); + } + } + this.collator = collator; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new
ICUCollationKeyFilter(tokenStream, collator); + } +} diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java new file mode 100644 index 0000000..ed11a22 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java @@ -0,0 +1,45 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.icu.ICUFoldingFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + + +/** + * Token filter factory that wraps the incoming token stream in an {@link ICUFoldingFilter}. + */ +public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory { + + @Inject + public IcuFoldingTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new ICUFoldingFilter(tokenStream); + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java new file mode 100644 index 0000000..b28e7c9 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java @@ -0,0 +1,52 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied.
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import com.ibm.icu.text.Normalizer2; +import org.apache.lucene.analysis.TokenStream; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + + +/** + * Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to normalize tokens. + *
+ * The name can be used to provide the type of normalization to perform. + * The {@code name} setting defaults to {@code nfkc_cf}; the normalizer is always obtained with {@link Normalizer2.Mode#COMPOSE}. + * + */ +public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory { + + private final String name; /* normalization name read from the {@code name} setting, not the filter name passed to the superclass */ + + @Inject + public IcuNormalizerTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + this.name = settings.get("name", "nfkc_cf"); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE)); + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java b/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java new file mode 100644 index 0000000..f24852d --- /dev/null +++ b/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java @@ -0,0 +1,49 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ + +package org.elasticsearch.plugin.analysis.icu; + +import org.elasticsearch.common.inject.Module; +import org.elasticsearch.index.analysis.AnalysisModule; +import org.elasticsearch.index.analysis.IcuAnalysisBinderProcessor; +import org.elasticsearch.plugins.AbstractPlugin; + +/** + * Plugin named {@code analysis-icu} that adds an {@link IcuAnalysisBinderProcessor} to every {@link AnalysisModule} it is asked to process. + */ +public class AnalysisICUPlugin extends AbstractPlugin { + + @Override + public String name() { + return "analysis-icu"; + } + + @Override + public String description() { + return "UTF related ICU analysis support"; + } + + @Override + public void processModule(Module module) { + if (module instanceof AnalysisModule) { + AnalysisModule analysisModule = (AnalysisModule) module; + analysisModule.addProcessor(new IcuAnalysisBinderProcessor()); + } + } +} diff --git a/src/main/resources/es-plugin.properties b/src/main/resources/es-plugin.properties new file mode 100644 index 0000000..b694c79 --- /dev/null +++ b/src/main/resources/es-plugin.properties @@ -0,0 +1 @@ +plugin=org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java new file mode 100644 index 0000000..adcd03a --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java @@ -0,0 +1,59 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.elasticsearch.common.inject.Injector; +import org.elasticsearch.common.inject.ModulesBuilder; +import org.elasticsearch.common.settings.SettingsModule; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.EnvironmentModule; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.IndexNameModule; +import org.elasticsearch.index.settings.IndexSettingsModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisService; +import org.hamcrest.MatcherAssert; +import org.testng.annotations.Test; + +import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS; +import static org.hamcrest.Matchers.instanceOf; + +/** + * Checks that, with empty settings and an {@link IcuAnalysisBinderProcessor} added to the {@link AnalysisModule}, the {@code icu_normalizer} token filter resolves to an {@link IcuNormalizerTokenFilterFactory}. + */ +public class SimpleIcuAnalysisTests { + + @Test + public void testDefaultsIcuAnalysis() { + Index index = new Index("test"); + + Injector parentInjector = new ModulesBuilder().add(new SettingsModule(EMPTY_SETTINGS), new EnvironmentModule(new Environment(EMPTY_SETTINGS)), new IndicesAnalysisModule()).createInjector(); + Injector injector = new ModulesBuilder().add( + new IndexSettingsModule(index, EMPTY_SETTINGS), + new IndexNameModule(index), + new AnalysisModule(EMPTY_SETTINGS, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor())) + .createChildInjector(parentInjector); + + AnalysisService analysisService = injector.getInstance(AnalysisService.class); + + TokenFilterFactory filterFactory =
analysisService.tokenFilter("icu_normalizer"); + MatcherAssert.assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class)); + } +} diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties new file mode 100644 index 0000000..497c97f --- /dev/null +++ b/src/test/resources/log4j.properties @@ -0,0 +1,5 @@ +log4j.rootLogger=INFO, out + +log4j.appender.out=org.apache.log4j.ConsoleAppender +log4j.appender.out.layout=org.apache.log4j.PatternLayout +log4j.appender.out.layout.conversionPattern=[%d{ISO8601}][%-5p][%-25c] %m%n