Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add fuzzy String matching logic to StringUtils #20

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
79 changes: 79 additions & 0 deletions src/main/java/org/apache/commons/lang3/StringUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -7072,6 +7072,85 @@ private static double score(final CharSequence first, final CharSequence second)
return dist;
}

/**
 * <p>Computes a fuzzy-match score describing how well {@code query} matches {@code term}.</p>
 *
 * <p>The scoring mimics the quick-open matching of editors such as Sublime Text, TextMate
 * and Atom. The query is walked left to right; every query character is looked up in the
 * part of the term that has not been consumed yet. Each hit is worth one point, and a hit
 * located directly after the previous hit earns two extra points. A query character that
 * does not occur in the remaining part of the term consumes the rest of the term, so no
 * later query character can score. Larger results mean greater similarity.</p>
 *
 * <pre>
 * StringUtils.getFuzzyDistance(null, null, null)                                    = IllegalArgumentException
 * StringUtils.getFuzzyDistance("", "", Locale.ENGLISH)                              = 0
 * StringUtils.getFuzzyDistance("Workshop", "b", Locale.ENGLISH)                     = 0
 * StringUtils.getFuzzyDistance("Room", "o", Locale.ENGLISH)                         = 1
 * StringUtils.getFuzzyDistance("Workshop", "w", Locale.ENGLISH)                     = 1
 * StringUtils.getFuzzyDistance("Workshop", "ws", Locale.ENGLISH)                    = 2
 * StringUtils.getFuzzyDistance("Workshop", "wo", Locale.ENGLISH)                    = 4
 * StringUtils.getFuzzyDistance("Apache Software Foundation", "asf", Locale.ENGLISH) = 3
 * </pre>
 *
 * @param term   the full term to match against, must not be null
 * @param query  the query whose characters are searched for in the term, must not be null
 * @param locale the locale used to lower-case both inputs, since matching is
 *               case insensitive; must not be null
 * @return the fuzzy score; {@code 0} when no query character matches
 * @throws IllegalArgumentException if either String input is {@code null} or the
 *                                  Locale input is {@code null}
 * @since 3.4
 */
public static int getFuzzyDistance(final CharSequence term, final CharSequence query, final Locale locale) {
    if (term == null || query == null) {
        throw new IllegalArgumentException("Strings must not be null");
    }
    if (locale == null) {
        throw new IllegalArgumentException("Locale must not be null");
    }

    // Matching is case insensitive. Both inputs are lower-cased up front with the
    // caller's locale because Character.toLowerCase(char) cannot take a locale.
    final String haystack = term.toString().toLowerCase(locale);
    final String needle = query.toString().toLowerCase(locale);

    int total = 0;
    // first position of the term that has not been consumed by an earlier lookup
    int searchFrom = 0;
    // term position of the most recent hit; the sentinel can never be "adjacent"
    int lastHit = Integer.MIN_VALUE;

    for (int n = 0; n < needle.length(); n++) {
        final int hit = haystack.indexOf(needle.charAt(n), searchFrom);
        if (hit < 0) {
            // A miss consumes the rest of the term, so none of the remaining
            // query characters could be found either.
            break;
        }

        // one point for the match itself, two more when it directly follows the last one
        total += (lastHit + 1 == hit) ? 3 : 1;
        lastHit = hit;

        // every term character can satisfy at most one query character
        searchFrom = hit + 1;
    }

    return total;
}

/**
* Gets a set of matching characters between two strings.
*
Expand Down
31 changes: 31 additions & 0 deletions src/test/java/org/apache/commons/lang3/StringUtilsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -2018,6 +2018,37 @@ public void testGetJaroWinklerDistance_NullString() throws Exception {
StringUtils.getJaroWinklerDistance(null, "clear");
}

/**
 * Checks the fuzzy score for empty input, a query with no match, single-character
 * matches, non-adjacent matches, an adjacent match (which earns the bonus) and a
 * multi-word term.
 */
@Test
public void testGetFuzzyDistance() throws Exception {
    final Locale english = Locale.ENGLISH;

    // nothing to match
    assertEquals(0, StringUtils.getFuzzyDistance("", "", english));
    assertEquals(0, StringUtils.getFuzzyDistance("Workshop", "b", english));

    // single-character queries score one point
    assertEquals(1, StringUtils.getFuzzyDistance("Room", "o", english));
    assertEquals(1, StringUtils.getFuzzyDistance("Workshop", "w", english));

    // two matches that are not adjacent in the term: one point each
    assertEquals(2, StringUtils.getFuzzyDistance("Workshop", "ws", english));

    // two adjacent matches: one point each plus the two-point bonus
    assertEquals(4, StringUtils.getFuzzyDistance("Workshop", "wo", english));

    // three scattered matches across a multi-word term
    assertEquals(3, StringUtils.getFuzzyDistance("Apache Software Foundation", "asf", english));
}

/**
* Expects an {@link IllegalArgumentException} when the term, the query and the
* locale are all {@code null}.
*/
@Test(expected = IllegalArgumentException.class)
public void testGetFuzzyDistance_NullNullNull() throws Exception {
StringUtils.getFuzzyDistance(null, null, null);
}

/**
* Expects an {@link IllegalArgumentException} when the query is {@code null}
* while the term and the locale are supplied.
*/
// NOTE(review): the method name misspells "Locale" as "Loclae"; consider renaming.
@Test(expected = IllegalArgumentException.class)
public void testGetFuzzyDistance_StringNullLoclae() throws Exception {
StringUtils.getFuzzyDistance(" ", null, Locale.ENGLISH);
}

/**
* Expects an {@link IllegalArgumentException} when the term is {@code null}
* while the query and the locale are supplied.
*/
@Test(expected = IllegalArgumentException.class)
public void testGetFuzzyDistance_NullStringLocale() throws Exception {
StringUtils.getFuzzyDistance(null, "clear", Locale.ENGLISH);
}

/**
* Expects an {@link IllegalArgumentException} when the locale is {@code null}
* while both the term and the query are supplied.
*/
@Test(expected = IllegalArgumentException.class)
public void testGetFuzzyDistance_StringStringNull() throws Exception {
StringUtils.getFuzzyDistance(" ", "clear", null);
}

/**
* A sanity check for {@link StringUtils#EMPTY}.
*/
Expand Down