src/Lucene.Net.Analysis.Phonetic/Language/Bm/BeiderMorseEncoder.cs

﻿// commons-codec version compatibility level: 1.9
namespace Lucene.Net.Analysis.Phonetic.Language.Bm
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Encodes strings into their Beider-Morse phonetic encoding.
    /// </summary>
    /// <remarks>
    /// Beider-Morse phonetic encodings are optimised for family names. However, they may be useful for a wide range
    /// of words.
    /// <para/>
    /// This encoder is intentionally mutable to allow dynamic configuration through its properties. As such,
    /// it may not be thread-safe. If you require a guaranteed thread-safe encoding then use
    /// <see cref="PhoneticEngine"/> directly.
    /// <para/>
    /// <b>Encoding overview</b>
    /// <para/>
    /// Beider-Morse phonetic encoding is a multi-step process. Firstly, a table of rules is consulted to guess what
    /// language the word comes from. For example, if it ends in "<c>ault</c>" then it infers that the word is French.
    /// Next, the word is translated into a phonetic representation using a language-specific phonetics table. Some
    /// runs of letters can be pronounced in multiple ways, and a single run of letters may be potentially broken up
    /// into phonemes at different places, so this stage results in a set of possible language-specific phonetic
    /// representations. Lastly, this language-specific phonetic representation is processed by a table of rules that
    /// re-writes it phonetically taking into account systematic pronunciation differences between languages, to move
    /// it towards a pan-indo-european phonetic representation. Again, sometimes there are multiple ways this could be
    /// done and sometimes things that can be pronounced in several ways in the source language have only one way to
    /// represent them in this average phonetic language, so the result is again a set of phonetic spellings.
    /// <para/>
    /// Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated.
    /// In this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final
    /// encoding. Secondly, some names have standard prefixes, for example, "<c>Mac/Mc</c>" in Scottish (English)
    /// names. As sometimes it is ambiguous whether the prefix is intended or is an accident of the spelling, the word
    /// is encoded once with the prefix and once without it. The resulting encoding contains one and then the other result.
    /// <para/>
    /// <b>Encoding format</b>
    /// <para/>
    /// Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where
    /// there are multiple possible phonetic representations, these are joined with a pipe (<c>|</c>) character.
    /// If multiple hyphenated words were found, or if the word may contain a name prefix, each encoded word is placed
    /// in parentheses and these blocks are then joined with hyphens. For example, "<c>d'ortley</c>" has a possible
    /// prefix. The form without prefix encodes to <c>ortlaj|ortlej</c>, while the form with prefix encodes to
    /// <c>dortlaj|dortlej</c>. Thus, the full, combined encoding is <c>(ortlaj|ortlej)-(dortlaj|dortlej)</c>.
    /// <para/>
    /// The encoded forms are often quite a bit longer than the input strings. This is because a single input may have many
    /// potential phonetic interpretations. For example, <c>Renault</c> encodes to
    /// <c>rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult</c>. The <see cref="RuleType.APPROX"/> rules will tend to produce larger
    /// encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
    /// Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
    /// splitting on pipe (<c>|</c>) and indexing under each of these alternatives.
    /// <para/>
    /// since 1.6
    /// </remarks>
    public class BeiderMorseEncoder : IStringEncoder
    {
        // Implementation note: this class is a configurable facade over PhoneticEngine. Settings are never
        // changed in place; each configuration change constructs a new PhoneticEngine that carries over the
        // remaining settings, and that engine performs the actual encoding.

        // The engine every encoding call is delegated to. Replaced whenever a setting changes.
        private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);

        // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
        //public object Encode(object source)
        //{
        //    if (!(source is string))
        //    {
        //        throw new EncoderException("BeiderMorseEncoder encode parameter is not of type String");
        //    }
        //    return encode((string)source);
        //}

        /// <summary>
        /// Encodes <paramref name="source"/> with the currently configured engine.
        /// </summary>
        /// <param name="source">The string to encode. May be <c>null</c>.</param>
        /// <returns>
        /// <c>null</c> when <paramref name="source"/> is <c>null</c>; otherwise the result of the
        /// underlying engine's <c>Encode</c> call.
        /// </returns>
        public virtual string Encode(string source)
            => source is null ? null : engine.Encode(source);

        /// <summary>
        /// Gets or Sets the name type currently in operation. Use <see cref="NameType.GENERIC"/> unless you specifically want phonetic encodings
        /// optimized for Ashkenazi or Sephardic Jewish family names.
        /// </summary>
        // LUCENENET NOTE: Made setter into property because
        // its behavior is similar to what would happen when
        // setting a property, even though it is actually
        // replacing a related instance.
        public virtual NameType NameType
        {
            get => engine.NameType;
            set
            {
                // Rebuild the engine with the new name type, keeping every other setting.
                var current = engine;
                engine = new PhoneticEngine(value, current.RuleType, current.IsConcat, current.MaxPhonemes);
            }
        }

        /// <summary>
        /// Gets or Sets the rule type to apply. This will widen or narrow the range of phonetic encodings considered.
        /// <see cref="RuleType.APPROX"/> or <see cref="RuleType.EXACT"/> for approximate or exact phonetic matches.
        /// </summary>
        public virtual RuleType RuleType
        {
            get => engine.RuleType;
            set
            {
                // Rebuild the engine with the new rule type, keeping every other setting.
                var current = engine;
                engine = new PhoneticEngine(current.NameType, value, current.IsConcat, current.MaxPhonemes);
            }
        }

        /// <summary>
        /// Gets or Sets how multiple possible phonetic encodings are combined.
        /// <c>true</c> if multiple encodings are to be combined with a '|', <c>false</c> if just the first one is
        /// to be considered.
        /// </summary>
        public virtual bool IsConcat
        {
            get => engine.IsConcat;
            set
            {
                // Rebuild the engine with the new concat flag, keeping every other setting.
                var current = engine;
                engine = new PhoneticEngine(current.NameType, current.RuleType, value, current.MaxPhonemes);
            }
        }

        /// <summary>
        /// Sets the maximum number of phonemes that shall be considered by the engine.
        /// <para/>
        /// since 1.7
        /// </summary>
        /// <param name="maxPhonemes">the maximum number of phonemes returned by the engine</param>
        public virtual void SetMaxPhonemes(int maxPhonemes)
        {
            // Rebuild the engine with the new phoneme limit, keeping every other setting.
            var current = engine;
            engine = new PhoneticEngine(current.NameType, current.RuleType, current.IsConcat, maxPhonemes);
        }
    }
}