Skip to content

Commit

Permalink
Fixed spanish accents normalization (#1957)
Browse files Browse the repository at this point in the history
Normalization of accented letters only happens if the input is longer
than 5 characters, a condition that words such as `guía` and `fría`
do not meet.
The solution is to always run the accented-character normalization by
moving it into a separate file, just as is done in the German
analyzer.

Fixes: #1956
  • Loading branch information
svera committed Jan 10, 2024
1 parent e26eace commit 5f1f45a
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 15 deletions.
5 changes: 5 additions & 0 deletions analysis/lang/es/analyzer_es.go
Expand Up @@ -34,6 +34,10 @@ func AnalyzerConstructor(config map[string]interface{},
if err != nil {
return nil, err
}
normalizeEsFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
stopEsFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
Expand All @@ -47,6 +51,7 @@ func AnalyzerConstructor(config map[string]interface{},
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopEsFilter,
normalizeEsFilter,
lightStemmerEsFilter,
},
}
Expand Down
15 changes: 0 additions & 15 deletions analysis/lang/es/light_stemmer_es.go
Expand Up @@ -46,21 +46,6 @@ func stem(input []rune) []rune {
return input
}

for i, r := range input {
switch r {
case 'à', 'á', 'â', 'ä':
input[i] = 'a'
case 'ò', 'ó', 'ô', 'ö':
input[i] = 'o'
case 'è', 'é', 'ê', 'ë':
input[i] = 'e'
case 'ù', 'ú', 'û', 'ü':
input[i] = 'u'
case 'ì', 'í', 'î', 'ï':
input[i] = 'i'
}
}

switch input[l-1] {
case 'o', 'a', 'e':
return input[:l-1]
Expand Down
67 changes: 67 additions & 0 deletions analysis/lang/es/spanish_normalize.go
@@ -0,0 +1,67 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package es

import (
"bytes"

"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)

const NormalizeName = "normalize_es"

// SpanishNormalizeFilter is the token filter registered under NormalizeName.
// It has no fields; all of its work happens in Filter.
type SpanishNormalizeFilter struct{}

// NewSpanishNormalizeFilter returns a pointer to a freshly allocated
// SpanishNormalizeFilter.
func NewSpanishNormalizeFilter() *SpanishNormalizeFilter {
	return new(SpanishNormalizeFilter)
}

// Filter overwrites the Term of every token in input with the result of
// normalize and returns the same stream; tokens are modified in place and
// no new stream is allocated.
func (s *SpanishNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for i := range input {
		input[i].Term = normalize(input[i].Term)
	}
	return input
}

// normalize decodes input into runes, replaces each lower-case vowel that
// carries a grave, acute, circumflex or diaeresis mark with the plain vowel,
// and re-encodes the result with analysis.BuildTermFromRunes. Every other
// rune (upper-case letters and 'ñ' included) is passed through unchanged.
func normalize(input []byte) []byte {
	decoded := bytes.Runes(input)
	for pos, r := range decoded {
		switch r {
		case 'à', 'á', 'â', 'ä':
			decoded[pos] = 'a'
		case 'è', 'é', 'ê', 'ë':
			decoded[pos] = 'e'
		case 'ì', 'í', 'î', 'ï':
			decoded[pos] = 'i'
		case 'ò', 'ó', 'ô', 'ö':
			decoded[pos] = 'o'
		case 'ù', 'ú', 'û', 'ü':
			decoded[pos] = 'u'
		}
	}

	return analysis.BuildTermFromRunes(decoded)
}

// NormalizerFilterConstructor is the constructor registered for NormalizeName.
// It ignores both config and cache, and always returns a new
// SpanishNormalizeFilter together with a nil error.
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	filter := NewSpanishNormalizeFilter()
	return filter, nil
}

// init registers NormalizerFilterConstructor with the registry under
// NormalizeName, the name the Spanish analyzer uses to look the filter up.
func init() {
	registry.RegisterTokenFilter(
		NormalizeName,
		NormalizerFilterConstructor,
	)
}
112 changes: 112 additions & 0 deletions analysis/lang/es/spanish_normalize_test.go
@@ -0,0 +1,112 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package es

import (
"reflect"
"testing"

"github.com/blevesearch/bleve/v2/analysis"
)

func TestSpanishNormalizeFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Guía"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Guia"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Belcebú"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Belcebu"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Limón"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Limon"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("agüero"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("aguero"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("laúd"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("laud"),
},
},
},
// empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}

spanishNormalizeFilter := NewSpanishNormalizeFilter()
for _, test := range tests {
actual := spanishNormalizeFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected %s(% x), got %s(% x)", test.output[0].Term, test.output[0].Term, actual[0].Term, actual[0].Term)
}
}
}

0 comments on commit 5f1f45a

Please sign in to comment.