Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for analysis culture sensitivity and surrogate pair support #321

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ namespace Lucene.Net.Analysis.Core
/// <summary>
/// A <see cref="LetterTokenizer"/> is a tokenizer that divides text at non-letters. That's to
/// say, it defines tokens as maximal strings of adjacent letters, as defined by
/// <see cref="char.IsLetter(char)"/> predicate.
/// <see cref="Character.IsLetter(int)"/> predicate.
/// <para>
/// Note: this does a decent job for most European languages, but does a terrible
/// job for some Asian languages, where words are not separated by spaces.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using J2N;
using Lucene.Net.Util;
using System.Globalization;
using System.IO;

namespace Lucene.Net.Analysis.Core
Expand Down Expand Up @@ -73,11 +74,11 @@ public LowerCaseTokenizer(LuceneVersion matchVersion, AttributeSource.AttributeF

/// <summary>
/// Converts char to lower case
/// <see cref="Character.ToLower(int)"/>.
/// <see cref="Character.ToLower(int, CultureInfo)"/> in the invariant culture.
/// </summary>
protected override int Normalize(int c)
{
return Character.ToLower(c);
return Character.ToLower(c, CultureInfo.InvariantCulture); // LUCENENET specific - need to use invariant culture to match Java
}
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using Lucene.Net.Analysis.Util;
using J2N;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using System.IO;

Expand Down Expand Up @@ -65,7 +66,7 @@ public WhitespaceTokenizer(LuceneVersion matchVersion, AttributeFactory factory,
/// </summary>
protected override bool IsTokenChar(int c)
{
return !char.IsWhiteSpace((char)c);
return !Character.IsWhiteSpace(c);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using System.Globalization;

namespace Lucene.Net.Analysis.El
{
Expand Down Expand Up @@ -37,6 +38,8 @@ public sealed class GreekLowerCaseFilter : TokenFilter
private readonly ICharTermAttribute termAtt;
private readonly CharacterUtils charUtils;

private static readonly CultureInfo culture = new CultureInfo("el"); // LUCENENET specific - use Greek culture when lowercasing.

/// <summary>
/// Create a <see cref="GreekLowerCaseFilter"/> that normalizes Greek token text.
/// </summary>
Expand Down Expand Up @@ -127,7 +130,7 @@ private int LowerCase(int codepoint)
return '\u03C2'; // small final sigma

default:
return Character.ToLower(codepoint);
return Character.ToLower(codepoint, culture); // LUCENENET specific - need to use specific culture to override current thread
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/Lucene.Net.Analysis.Common/Analysis/En/PorterStemmer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -841,9 +841,9 @@ public virtual bool Stem(int i0)
// ch = buffer[offset++];
// }

// if (char.IsLetter((char)ch))
// if (Character.IsLetter(ch))
// {
// s.Add(char.ToLowerInvariant((char)ch));
// s.Add(Character.ToLower(ch, CultureInfo.InvariantCulture));
// }
// else
// {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using J2N;
using Lucene.Net.Analysis.TokenAttributes;
using System.Globalization;

namespace Lucene.Net.Analysis.Ga
{
Expand Down Expand Up @@ -28,6 +29,8 @@ public sealed class IrishLowerCaseFilter : TokenFilter
{
private readonly ICharTermAttribute termAtt;

private static readonly CultureInfo culture = new CultureInfo("ga"); // LUCENENET specific - use Irish culture when lowercasing.

/// <summary>
/// Create an <see cref="IrishLowerCaseFilter"/> that normalises Irish token text.
/// </summary>
Expand Down Expand Up @@ -60,7 +63,7 @@ public override bool IncrementToken()

for (int i = idx; i < chLen;)
{
i += Character.ToChars(Character.ToLower(chArray[i]), chArray, i);
i += Character.ToChars(Character.ToLower(chArray[i], culture), chArray, i); // LUCENENET specific - use Irish culture when lowercasing
}
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using System.Collections.Generic;
using System.Globalization;
using System.IO;

namespace Lucene.Net.Analysis.Miscellaneous
Expand Down Expand Up @@ -134,7 +135,7 @@ public BytesRef Get(char[] buffer, int bufferLen, FST.Arc<BytesRef> scratchArc,
while (bufUpto < bufferLen)
{
int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint, CultureInfo.InvariantCulture) : codePoint, scratchArc, scratchArc, fstReader) == null)
{
return null;
}
Expand Down Expand Up @@ -192,7 +193,7 @@ public virtual bool Add(string input, string output)
char[] buffer = charsSpare.Chars;
for (int i = 0; i < length;)
{
i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i)), buffer, i);
i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i), CultureInfo.InvariantCulture), buffer, i);
}
UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.Globalization;
using J2N;
using System.Globalization;

namespace Lucene.Net.Analysis.Miscellaneous
{
Expand Down Expand Up @@ -84,17 +85,17 @@ private static byte[] LoadDefaultWordDelimTable() // LUCENENET: Avoid static con
for (int i = 0; i < 256; i++)
{
byte code = 0;
if (char.IsLower((char)i))
if (Character.IsLower(i))
{
code |= (byte)WordDelimiterFilter.LOWER;
code |= WordDelimiterFilter.LOWER;
}
else if (char.IsUpper((char)i))
else if (Character.IsUpper(i))
{
code |= (byte)WordDelimiterFilter.UPPER;
code |= WordDelimiterFilter.UPPER;
}
else if (char.IsDigit((char)i))
else if (Character.IsDigit(i))
{
code |= (byte)WordDelimiterFilter.DIGIT;
code |= WordDelimiterFilter.DIGIT;
}
if (code == 0)
{
Expand Down Expand Up @@ -318,7 +319,7 @@ private int CharType(int ch)
/// <returns> Type of the character </returns>
public static byte GetType(int ch)
{
switch (CharUnicodeInfo.GetUnicodeCategory((char)ch))
switch (Character.GetType(ch))
{
case UnicodeCategory.UppercaseLetter:
return WordDelimiterFilter.UPPER;
Expand Down
10 changes: 6 additions & 4 deletions src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using Lucene.Net.Util.Fst;
using System;
using System.Diagnostics;
using System.Globalization;

namespace Lucene.Net.Analysis.Synonym
{
Expand Down Expand Up @@ -252,9 +253,10 @@ public virtual void Add(char[] output, int offset, int len, int endOffset, int p

/// <param name="input"> input tokenstream </param>
/// <param name="synonyms"> synonym map </param>
/// <param name="ignoreCase"> case-folds input for matching with <see cref="Character.ToLower(int)"/>.
/// Note, if you set this to true, its your responsibility to lowercase
/// the input entries when you create the <see cref="SynonymMap"/> </param>
/// <param name="ignoreCase"> case-folds input for matching with <see cref="Character.ToLower(int, CultureInfo)"/>
/// using <see cref="CultureInfo.InvariantCulture"/>.
/// Note, if you set this to <c>true</c>, it's your responsibility to lowercase
/// the input entries when you create the <see cref="SynonymMap"/>.</param>
public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase)
: base(input)
{
Expand Down Expand Up @@ -411,7 +413,7 @@ private void Parse()
while (bufUpto < bufferLen)
{
int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint, CultureInfo.InvariantCulture) : codePoint, scratchArc, scratchArc, fstReader) == null)
{
//System.out.println(" stop");
goto byTokenBreak;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ protected override bool IncrementWord()

// find the next set of boundaries, skipping over non-tokens
int end = wordBreaker.Next();
while (end != BreakIterator.Done && !char.IsLetterOrDigit((char)Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
{
start = end;
end = wordBreaker.Next();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using J2N;
using J2N.Globalization;
using Lucene.Net.Analysis.TokenAttributes;
using System;
using System.Globalization;
Expand Down Expand Up @@ -35,12 +34,13 @@ namespace Lucene.Net.Analysis.Tr
public sealed class TurkishLowerCaseFilter : TokenFilter
{
private const int LATIN_CAPITAL_LETTER_I = '\u0049';
private const int LATIN_CAPITAL_LETTER_DOTTED_I = '\u0130';
private const int LATIN_SMALL_LETTER_I = '\u0069';
private const int LATIN_SMALL_LETTER_DOTLESS_I = '\u0131';
private const int COMBINING_DOT_ABOVE = '\u0307';
private readonly ICharTermAttribute termAtt;

private static readonly CultureInfo culture = new CultureInfo("tr"); // LUCENENET specific - we need to do a culture-sensitive lowercase operation in Turkish

/// <summary>
/// Create a new <see cref="TurkishLowerCaseFilter"/>, that normalizes Turkish token text
/// to lower case.
Expand All @@ -64,7 +64,7 @@ public override sealed bool IncrementToken()
{
int ch = Character.CodePointAt(buffer, i, length);

iOrAfter = (ch == LATIN_CAPITAL_LETTER_I || (iOrAfter && CharUnicodeInfo.GetUnicodeCategory((char)ch) == UnicodeCategory.NonSpacingMark));
iOrAfter = (ch == LATIN_CAPITAL_LETTER_I || (iOrAfter && Character.GetType(ch) == UnicodeCategory.NonSpacingMark));

if (iOrAfter) // all the special I turkish handling happens here.
{
Expand Down Expand Up @@ -93,32 +93,8 @@ public override sealed bool IncrementToken()
}
}

using (var culture = new CultureContext("tr"))
{
switch (ch)
{
// LUCENENET: The .NET char.ToLower() function works correctly in
// Turkish as long as the current thread is set to tr-TR (well, technically the
// culture change is only required for the LATIN_CAPITAL_LETTER_I case). .NET does
// not split these characters into separate letter/non-spacing mark characters,
// but the user might still input them that way so we still need the above
// block to handle that case.
//
// LUCENENET TODO: Oddly, the Character.ToLowerCase() function below does not work right
// for Turkish. Which begs the question, should this special case be there so Turkish works
// everywhere? Or should we leave it a special case here because that is the way it works in Java?
//
// References:
// http://haacked.com/archive/2012/07/05/turkish-i-problem-and-why-you-should-care.aspx/
// http://www.i18nguy.com/unicode/turkish-i18n.html
case LATIN_CAPITAL_LETTER_I:
case LATIN_CAPITAL_LETTER_DOTTED_I:
i += Character.ToChars(char.ToLower((char)ch), buffer, i);
continue;
}
}

i += Character.ToChars(Character.ToLower(ch), buffer, i);
// LUCENENET specific - need to pass Turkish culture to get the correct lowercase results
i += Character.ToChars(Character.ToLower(ch, culture), buffer, i);
}

termAtt.Length = length;
Expand All @@ -139,8 +115,7 @@ private bool IsBeforeDot(char[] s, int pos, int len)
for (int i = pos; i < len;)
{
int ch = Character.CodePointAt(s, i, len);
//if (char.getType(ch) != char.NON_SPACING_MARK)
if (CharUnicodeInfo.GetUnicodeCategory((char)ch) != UnicodeCategory.NonSpacingMark)
if (Character.GetType(ch) != UnicodeCategory.NonSpacingMark)
{
return false;
}
Expand All @@ -161,9 +136,7 @@ private bool IsBeforeDot(char[] s, int pos, int len)
private int Delete(char[] s, int pos, int len)
{
if (pos < len)
{
Array.Copy(s, pos + 1, s, pos, len - pos - 1);
}

return len - 1;
}
Expand Down
12 changes: 6 additions & 6 deletions src/Lucene.Net.Analysis.Common/Analysis/Util/CharArrayMap.cs
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ private bool Equals(char[] text1, int offset, int length, char[] text2)
for (int i = 0; i < length;)
{
var codePointAt = charUtils.CodePointAt(text1, offset + i, limit);
if (Character.ToLower(codePointAt) != charUtils.CodePointAt(text2, i, text2.Length))
if (Character.ToLower(codePointAt, CultureInfo.InvariantCulture) != charUtils.CodePointAt(text2, i, text2.Length)) // LUCENENET specific - need to use invariant culture to match Java
{
return false;
}
Expand Down Expand Up @@ -696,7 +696,7 @@ private bool Equals(ICharSequence text1, char[] text2)
for (int i = 0; i < length;)
{
int codePointAt = charUtils.CodePointAt(text1, i);
if (Character.ToLower(codePointAt) != charUtils.CodePointAt(text2, i, text2.Length))
if (Character.ToLower(codePointAt, CultureInfo.InvariantCulture) != charUtils.CodePointAt(text2, i, text2.Length)) // LUCENENET specific - need to use invariant culture to match Java
{
return false;
}
Expand Down Expand Up @@ -728,7 +728,7 @@ private bool Equals(string text1, char[] text2)
for (int i = 0; i < length;)
{
int codePointAt = charUtils.CodePointAt(text1, i);
if (Character.ToLower(codePointAt) != charUtils.CodePointAt(text2, i, text2.Length))
if (Character.ToLower(codePointAt, CultureInfo.InvariantCulture) != charUtils.CodePointAt(text2, i, text2.Length)) // LUCENENET specific - need to use invariant culture to match Java
{
return false;
}
Expand Down Expand Up @@ -811,7 +811,7 @@ private int GetHashCode(char[] text, int offset, int length)
for (int i = offset; i < stop;)
{
int codePointAt = charUtils.CodePointAt(text, i, stop);
code = code * 31 + Character.ToLower(codePointAt);
code = code * 31 + Character.ToLower(codePointAt, CultureInfo.InvariantCulture); // LUCENENET specific - need to use invariant culture to match Java
i += Character.CharCount(codePointAt);
}
}
Expand Down Expand Up @@ -839,7 +839,7 @@ private int GetHashCode(ICharSequence text)
for (int i = 0; i < length;)
{
int codePointAt = charUtils.CodePointAt(text, i);
code = code * 31 + Character.ToLower(codePointAt);
code = code * 31 + Character.ToLower(codePointAt, CultureInfo.InvariantCulture); // LUCENENET specific - need to use invariant culture to match Java
i += Character.CharCount(codePointAt);
}
}
Expand Down Expand Up @@ -867,7 +867,7 @@ private int GetHashCode(string text)
for (int i = 0; i < length;)
{
int codePointAt = charUtils.CodePointAt(text, i);
code = code * 31 + Character.ToLower(codePointAt);
code = code * 31 + Character.ToLower(codePointAt, CultureInfo.InvariantCulture); // LUCENENET specific - need to use invariant culture to match Java
i += Character.CharCount(codePointAt);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ public static CharacterBuffer NewCharacterBuffer(int bufferSize)


/// <summary>
/// Converts each unicode codepoint to lowerCase via <see cref="Character.ToLower(int)"/> starting
/// Converts each Unicode code point to lowercase via <see cref="TextInfo.ToLower(string)"/> in the invariant culture, starting
/// at the given offset. </summary>
/// <param name="buffer"> the char buffer to lowercase </param>
/// <param name="offset"> the offset to start at </param>
Expand Down Expand Up @@ -199,7 +199,7 @@ public static CharacterBuffer NewCharacterBuffer(int bufferSize)
}

/// <summary>
/// Converts each unicode codepoint to UpperCase via <see cref="Character.ToUpper(int)"/> starting
/// Converts each Unicode code point to uppercase via <see cref="TextInfo.ToUpper(string)"/> in the invariant culture, starting
/// at the given offset. </summary>
/// <param name="buffer"> the char buffer to UPPERCASE </param>
/// <param name="offset"> the offset to start at </param>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using Morfologik.Stemming.Polish;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;

Expand Down Expand Up @@ -55,6 +56,8 @@ public class MorfologikFilter : TokenFilter

private int lemmaListIndex;

private static readonly CultureInfo culture = new CultureInfo("pl"); // LUCENENET specific - do lowercasing in Polish culture

/// <summary>
/// Creates a filter with the default (Polish) dictionary.
/// </summary>
Expand Down Expand Up @@ -166,7 +169,7 @@ private string ToLowercase(string chs)
for (int i = 0; i < length;)
{
i += Character.ToChars(
Character.ToLower(Character.CodePointAt(chs, i)), buffer, i);
Character.ToLower(Character.CodePointAt(chs, i), culture), buffer, i); // LUCENENET specific - need to use explicit culture to override current thread
}

return scratch.ToString();
Expand Down