Skip to content

Commit

Permalink
Lucene.Net.Analysis.Common: Removed cast from NGramTokenizerAnonymousInnerClassHelper.IsTokenChar(int) that was causing surrogate pairs to fail in the TestUTF8FullRange() tests of NGramTokenizerTest and EdgeNGramTokenizerTest (see #269)
Browse files: browse the repository at this point in the history
  • Loading branch information
NightOwl888 committed Jul 24, 2020
1 parent e3afc98 commit 9821215
Showing 1 changed file with 16 additions and 5 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using J2N;
using J2N.Text;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Support;
using Lucene.Net.Util;
Expand Down Expand Up @@ -165,11 +166,22 @@ internal static int[] toCodePoints(string s)
return codePoints;
}

/// <summary>
/// Converts the characters of <paramref name="s"/> into an array of code points,
/// reading the sequence directly rather than through an intermediate string.
/// </summary>
/// <param name="s">The character sequence to decode.</param>
/// <returns>
/// An array sized by <c>Character.CodePointCount</c> over the whole sequence and
/// filled, in order, with the value <c>Character.CodePointAt</c> reports at each position.
/// </returns>
internal static int[] toCodePoints(ICharSequence s)
{
    int[] result = new int[Character.CodePointCount(s, 0, s.Length)];
    int charIndex = 0;
    int slot = 0;
    while (charIndex < s.Length)
    {
        int codePoint = Character.CodePointAt(s, charIndex);
        result[slot] = codePoint;
        // Advance by however many chars this code point occupies, as reported by CharCount.
        charIndex += Character.CharCount(codePoint);
        slot++;
    }
    return result;
}

internal static bool isTokenChar(string nonTokenChars, int codePoint)
{
for (int i = 0; i < nonTokenChars.Length;)
{
int cp = char.ConvertToUtf32(nonTokenChars, i);
int cp = nonTokenChars.CodePointAt(i);
if (cp == codePoint)
{
return false;
Expand Down Expand Up @@ -211,8 +223,7 @@ internal static void TestNGrams(int minGram, int maxGram, string s, string nonTo
}
}
assertTrue(grams.IncrementToken());

assertArrayEquals(Arrays.CopyOfRange(codePoints, start, end), toCodePoints(termAtt.ToString()));
assertArrayEquals(Arrays.CopyOfRange(codePoints, start, end), toCodePoints(termAtt));
assertEquals(1, posIncAtt.PositionIncrement);
assertEquals(1, posLenAtt.PositionLength);
assertEquals(offsets[start], offsetAtt.StartOffset);
Expand All @@ -229,7 +240,7 @@ internal static void TestNGrams(int minGram, int maxGram, string s, string nonTo

private class NGramTokenizerAnonymousInnerClassHelper : NGramTokenizer
{
private string nonTokenChars;
private readonly string nonTokenChars;

public NGramTokenizerAnonymousInnerClassHelper(LuceneVersion TEST_VERSION_CURRENT, StringReader java, int minGram, int maxGram, bool edgesOnly, string nonTokenChars)
: base(TEST_VERSION_CURRENT, java, minGram, maxGram, edgesOnly)
Expand All @@ -239,7 +250,7 @@ public NGramTokenizerAnonymousInnerClassHelper(LuceneVersion TEST_VERSION_CURREN

/// <summary>
/// Determines whether <paramref name="chr"/> may be part of a token. A code point
/// is accepted unless it is found in the <c>nonTokenChars</c> string.
/// </summary>
/// <param name="chr">The Unicode code point to test.</param>
/// <returns>
/// <c>true</c> if <paramref name="chr"/> is not found in <c>nonTokenChars</c>;
/// otherwise <c>false</c>.
/// </returns>
protected override bool IsTokenChar(int chr)
{
    // Search by code point. Casting chr to char would truncate supplementary
    // code points (above U+FFFF) to an unrelated 16-bit value, so characters
    // encoded as surrogate pairs would be classified incorrectly.
    return nonTokenChars.IndexOf(chr) < 0;
}
}

Expand Down

0 comments on commit 9821215

Please sign in to comment.