Fixed Spanish accent normalization (#1957)

Normalization of accented letters only happens if the input is longer than 5 characters, a condition that, for example, neither `guía` nor `fría` satisfies. The solution is to always run the accented-character normalization by moving it to a separate file, just as is done in the German analyzer. Fixes: #1956
blevesearch · Jan 10, 2024 · 5f1f45a · 5f1f45a
1 parent e26eace
commit 5f1f45a
Show file tree

Hide file tree

Showing 4 changed files with 184 additions and 15 deletions.
diff --git a/analysis/lang/es/analyzer_es.go b/analysis/lang/es/analyzer_es.go
@@ -34,6 +34,10 @@ func AnalyzerConstructor(config map[string]interface{},
 	if err != nil {
 		return nil, err
 	}
+	normalizeEsFilter, err := cache.TokenFilterNamed(NormalizeName)
+	if err != nil {
+		return nil, err
+	}
 	stopEsFilter, err := cache.TokenFilterNamed(StopName)
 	if err != nil {
 		return nil, err
@@ -47,6 +51,7 @@ func AnalyzerConstructor(config map[string]interface{},
 		TokenFilters: []analysis.TokenFilter{
 			toLowerFilter,
 			stopEsFilter,
+			normalizeEsFilter,
 			lightStemmerEsFilter,
 		},
 	}

diff --git a/analysis/lang/es/light_stemmer_es.go b/analysis/lang/es/light_stemmer_es.go
@@ -46,21 +46,6 @@ func stem(input []rune) []rune {
 		return input
 	}
 
-	for i, r := range input {
-		switch r {
-		case 'à', 'á', 'â', 'ä':
-			input[i] = 'a'
-		case 'ò', 'ó', 'ô', 'ö':
-			input[i] = 'o'
-		case 'è', 'é', 'ê', 'ë':
-			input[i] = 'e'
-		case 'ù', 'ú', 'û', 'ü':
-			input[i] = 'u'
-		case 'ì', 'í', 'î', 'ï':
-			input[i] = 'i'
-		}
-	}
-
 	switch input[l-1] {
 	case 'o', 'a', 'e':
 		return input[:l-1]

diff --git a/analysis/lang/es/spanish_normalize.go b/analysis/lang/es/spanish_normalize.go
@@ -0,0 +1,67 @@
+//  Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package es
+
+import (
+	"bytes"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+// NormalizeName is the name under which the Spanish normalize token filter
+// is registered with the registry.
+const NormalizeName = "normalize_es"
+
+// SpanishNormalizeFilter is a token filter that replaces accented vowels in
+// each token's term with their unaccented counterparts. It holds no state.
+type SpanishNormalizeFilter struct {
+}
+
+// NewSpanishNormalizeFilter returns a pointer to a new SpanishNormalizeFilter.
+func NewSpanishNormalizeFilter() *SpanishNormalizeFilter {
+	return &SpanishNormalizeFilter{}
+}
+
+// Filter passes the term of every token in input through normalize, storing
+// the result back on the token, and returns the same stream.
+func (s *SpanishNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	for idx := range input {
+		input[idx].Term = normalize(input[idx].Term)
+	}
+	return input
+}
+
+// normalize decodes input into runes, replaces each lowercase vowel carrying
+// a grave, acute, circumflex or diaeresis mark with the plain vowel, and
+// returns the term built from the resulting runes by
+// analysis.BuildTermFromRunes. All other runes are left untouched.
+func normalize(input []byte) []byte {
+	decoded := bytes.Runes(input)
+	for pos, r := range decoded {
+		switch r {
+		case 'à', 'á', 'â', 'ä':
+			decoded[pos] = 'a'
+		case 'è', 'é', 'ê', 'ë':
+			decoded[pos] = 'e'
+		case 'ì', 'í', 'î', 'ï':
+			decoded[pos] = 'i'
+		case 'ò', 'ó', 'ô', 'ö':
+			decoded[pos] = 'o'
+		case 'ù', 'ú', 'û', 'ü':
+			decoded[pos] = 'u'
+		}
+	}
+
+	return analysis.BuildTermFromRunes(decoded)
+}
+
+// NormalizerFilterConstructor returns a new SpanishNormalizeFilter and a nil
+// error. The config and cache arguments are not used.
+func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	return NewSpanishNormalizeFilter(), nil
+}
+
+// init registers NormalizerFilterConstructor with the registry under
+// NormalizeName.
+func init() {
+	registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
+}
diff --git a/analysis/lang/es/spanish_normalize_test.go b/analysis/lang/es/spanish_normalize_test.go
@@ -0,0 +1,112 @@
+//  Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package es
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+)
+
+// TestSpanishNormalizeFilter feeds single-token streams through the filter
+// and checks that the resulting stream deep-equals the expected one.
+func TestSpanishNormalizeFilter(t *testing.T) {
+	// Each case is the term of the one input token and the term expected back.
+	cases := []struct {
+		in   string
+		want string
+	}{
+		{in: "Guía", want: "Guia"},
+		{in: "Belcebú", want: "Belcebu"},
+		{in: "Limón", want: "Limon"},
+		{in: "agüero", want: "aguero"},
+		{in: "laúd", want: "laud"},
+		// empty
+		{in: "", want: ""},
+	}
+
+	filter := NewSpanishNormalizeFilter()
+	for _, tc := range cases {
+		input := analysis.TokenStream{
+			&analysis.Token{Term: []byte(tc.in)},
+		}
+		expected := analysis.TokenStream{
+			&analysis.Token{Term: []byte(tc.want)},
+		}
+
+		actual := filter.Filter(input)
+		if reflect.DeepEqual(actual, expected) {
+			continue
+		}
+		t.Errorf("expected %#v, got %#v", expected, actual)
+		t.Errorf("expected %s(% x), got %s(% x)", expected[0].Term, expected[0].Term, actual[0].Term, actual[0].Term)
+	}
+}