/
BeiderMorseEncoder.cs
163 lines (155 loc) · 8.56 KB
/
BeiderMorseEncoder.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
// commons-codec version compatibility level: 1.9
namespace Lucene.Net.Analysis.Phonetic.Language.Bm
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Encodes strings into their Beider-Morse phonetic encoding.
/// </summary>
/// <remarks>
/// Beider-Morse phonetic encodings are optimised for family names. However, they may be useful for a wide range
/// of words.
/// <para/>
/// This encoder is intentionally mutable to allow dynamic configuration through its properties. As such, it
/// may not be thread-safe. If you require a guaranteed thread-safe encoding then use
/// <see cref="PhoneticEngine"/> directly.
/// <para/>
/// <b>Encoding overview</b>
/// <para/>
/// Beider-Morse phonetic encoding is a multi-step process. Firstly, a table of rules is consulted to guess what
/// language the word comes from. For example, if it ends in "<c>ault</c>" then it infers that the word is French.
/// Next, the word is translated into a phonetic representation using a language-specific phonetics table. Some
/// runs of letters can be pronounced in multiple ways, and a single run of letters may be potentially broken up
/// into phonemes at different places, so this stage results in a set of possible language-specific phonetic
/// representations. Lastly, this language-specific phonetic representation is processed by a table of rules that
/// re-writes it phonetically taking into account systematic pronunciation differences between languages, to move
/// it towards a pan-indo-european phonetic representation. Again, sometimes there are multiple ways this could be
/// done and sometimes things that can be pronounced in several ways in the source language have only one way to
/// represent them in this average phonetic language, so the result is again a set of phonetic spellings.
/// <para/>
/// Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated.
/// In this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final
/// encoding. Secondly, some names have standard prefixes, for example, "<c>Mac/Mc</c>" in Scottish (English)
/// names. As sometimes it is ambiguous whether the prefix is intended or is an accident of the spelling, the word
/// is encoded once with the prefix and once without it. The resulting encoding contains one and then the other result.
/// <para/>
/// <b>Encoding format</b>
/// <para/>
/// Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where
/// there are multiple possible phonetic representations, these are joined with a pipe (<c>|</c>) character.
/// If multiple hyphenated words were found, or if the word may contain a name prefix, each encoded word is placed
/// in parentheses and these blocks are then joined with hyphens. For example, "<c>d'ortley</c>" has a possible
/// prefix. The form without prefix encodes to <c>ortlaj|ortlej</c>, while the form with prefix encodes to
/// <c>dortlaj|dortlej</c>. Thus, the full, combined encoding is <c>(ortlaj|ortlej)-(dortlaj|dortlej)</c>.
/// <para/>
/// The encoded forms are often quite a bit longer than the input strings. This is because a single input may have many
/// potential phonetic interpretations. For example, <c>Renault</c> encodes to
/// <c>rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult</c>. The <see cref="RuleType.APPROX"/> rules will tend to produce larger
/// encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
/// Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
/// splitting on pipe (<c>|</c>) and indexing under each of these alternatives.
/// <para/>
/// since 1.6
/// </remarks>
public class BeiderMorseEncoder : IStringEncoder
{
    // Design note: this class is a configurable (spring-friendly) front for PhoneticEngine.
    // The engine is treated as immutable: every configuration change below swaps in a newly
    // constructed PhoneticEngine, and Encode delegates to whichever instance is currently held.

    // The engine currently used for encoding; starts as GENERIC / APPROX with concatenation on.
    private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);

    // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
    //public object Encode(object source)
    //{
    //    if (!(source is string))
    //    {
    //        throw new EncoderException("BeiderMorseEncoder encode parameter is not of type String");
    //    }
    //    return encode((string)source);
    //}

    /// <summary>
    /// Encodes <paramref name="source"/> by delegating to the currently configured
    /// <see cref="PhoneticEngine"/>.
    /// </summary>
    /// <param name="source">The text to encode; may be <c>null</c>.</param>
    /// <returns>
    /// <c>null</c> when <paramref name="source"/> is <c>null</c>; otherwise whatever the
    /// engine's <c>Encode</c> returns for it.
    /// </returns>
    public virtual string Encode(string source)
    {
        return source is null ? null : engine.Encode(source);
    }

    /// <summary>
    /// Gets or sets the name type currently in operation. Use <see cref="NameType.GENERIC"/> unless
    /// you specifically want phonetic encodings optimized for Ashkenazi or Sephardic Jewish family names.
    /// </summary>
    /// <remarks>
    /// Setting this value replaces the underlying engine with a new one that carries over the
    /// current rule type, concat flag and maximum phoneme count.
    /// </remarks>
    // LUCENENET NOTE: The Java setter is exposed as a property setter because, from the
    // caller's point of view, it behaves like setting a property, even though it actually
    // replaces a related instance.
    public virtual NameType NameType
    {
        get { return engine.NameType; }
        set
        {
            // Read the remaining settings straight from the field (not the virtual properties).
            PhoneticEngine current = engine;
            engine = new PhoneticEngine(value, current.RuleType, current.IsConcat, current.MaxPhonemes);
        }
    }

    /// <summary>
    /// Gets or sets the rule type to apply. This widens or narrows the range of phonetic encodings
    /// considered: <see cref="RuleType.APPROX"/> or <see cref="RuleType.EXACT"/> for approximate or
    /// exact phonetic matches.
    /// </summary>
    /// <remarks>
    /// Setting this value replaces the underlying engine with a new one that carries over the
    /// current name type, concat flag and maximum phoneme count.
    /// </remarks>
    public virtual RuleType RuleType
    {
        get { return engine.RuleType; }
        set
        {
            PhoneticEngine current = engine;
            engine = new PhoneticEngine(current.NameType, value, current.IsConcat, current.MaxPhonemes);
        }
    }

    /// <summary>
    /// Gets or sets how multiple possible phonetic encodings are combined:
    /// <c>true</c> if multiple encodings are to be combined with a '|', <c>false</c> if just the
    /// first one is to be considered.
    /// </summary>
    /// <remarks>
    /// Setting this value replaces the underlying engine with a new one that carries over the
    /// current name type, rule type and maximum phoneme count.
    /// </remarks>
    public virtual bool IsConcat
    {
        get { return engine.IsConcat; }
        set
        {
            PhoneticEngine current = engine;
            engine = new PhoneticEngine(current.NameType, current.RuleType, value, current.MaxPhonemes);
        }
    }

    /// <summary>
    /// Sets the maximum number of phonemes that shall be considered by the engine.
    /// <para/>
    /// since 1.7
    /// </summary>
    /// <remarks>
    /// The underlying engine is replaced with a new one that carries over the current name type,
    /// rule type and concat flag.
    /// </remarks>
    /// <param name="maxPhonemes">the maximum number of phonemes returned by the engine</param>
    public virtual void SetMaxPhonemes(int maxPhonemes)
    {
        PhoneticEngine current = engine;
        engine = new PhoneticEngine(current.NameType, current.RuleType, current.IsConcat, maxPhonemes);
    }
}
}