Skip to content
This repository has been archived by the owner on May 1, 2023. It is now read-only.

Commit

Permalink
disregard word boundaries; add bestMatchIndex
Browse files Browse the repository at this point in the history
  • Loading branch information
aceakash committed Nov 29, 2018
1 parent 16c348e commit ccdb537
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 67 deletions.
26 changes: 17 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Finds degree of similarity between two strings, based on [Dice's Coefficient](ht
* [Examples](#examples-1)
* [Release Notes](#release-notes)
* [2.0.0](#200)
* [3.0.0](#300)


## Usage
Expand All @@ -41,7 +42,7 @@ Requiring the module gives an object with two methods:

### compareTwoStrings(string1, string2)

Returns a fraction between 0 and 1, which indicates the degree of similarity between the two strings. 0 indicates completely different strings, 1 indicates identical strings. The comparison is case-insensitive.
Returns a fraction between 0 and 1, which indicates the degree of similarity between the two strings. 0 indicates completely different strings, 1 indicates identical strings. The comparison is case-sensitive.

##### Arguments

Expand All @@ -62,15 +63,15 @@ stringSimilarity.compareTwoStrings('healed', 'sealed');

stringSimilarity.compareTwoStrings('Olive-green table for sale, in extremely good condition.',
'For sale: table in very good condition, olive green in colour.');
// → 0.7073170731707317
// → 0.6060606060606061

stringSimilarity.compareTwoStrings('Olive-green table for sale, in extremely good condition.',
'For sale: green Subaru Impreza, 210,000 miles');
// → 0.3013698630136986
// → 0.2558139534883721

stringSimilarity.compareTwoStrings('Olive-green table for sale, in extremely good condition.',
'Wanted: mountain bike with at least 21 gears.');
// → 0.11267605633802817
// → 0.1411764705882353
```

### findBestMatch(mainString, targetStrings)
Expand All @@ -83,7 +84,7 @@ Compares `mainString` against each string in `targetStrings`.
2. targetStrings (Array): Each string in this array will be matched against the main string.

##### Returns
(Object): An object with a `ratings` property, which gives a similarity rating for each target string, and a `bestMatch` property, which specifies which target string was most similar to the main string.
(Object): An object with a `ratings` property, which gives a similarity rating for each target string, a `bestMatch` property, which specifies which target string was most similar to the main string, and a `bestMatchIndex` property, which specifies the index of the bestMatch in the targetStrings array.

##### Examples
```javascript
Expand All @@ -95,14 +96,16 @@ stringSimilarity.findBestMatch('Olive-green table for sale, in extremely good co
//
{ ratings:
[ { target: 'For sale: green Subaru Impreza, 210,000 miles',
rating: 0.3013698630136986 },
rating: 0.2558139534883721 },
{ target: 'For sale: table in very good condition, olive green in colour.',
rating: 0.7073170731707317 },
rating: 0.6060606060606061 },
{ target: 'Wanted: mountain bike with at least 21 gears.',
rating: 0.11267605633802817 } ],
rating: 0.1411764705882353 } ],
bestMatch:
{ target: 'For sale: table in very good condition, olive green in colour.',
rating: 0.7073170731707317 } }
rating: 0.6060606060606061 },
bestMatchIndex: 1
}
```

## Release Notes
Expand All @@ -111,6 +114,11 @@ stringSimilarity.findBestMatch('Olive-green table for sale, in extremely good co
* Removed production dependencies
* Updated to ES6 (this breaks backward-compatibility for pre-ES6 apps)

### 3.0.0
* Performance improvement for `compareTwoStrings(..)`: now O(n) instead of O(n^2)
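* The algorithm has been tweaked slightly to disregard spaces and word boundaries. This will change the rating values slightly but not enough to make a significant difference. Note that the comparison is now also case-sensitive, so strings that differ only in letter case are no longer rated as identical (e.g. `'fRaNce'` vs `'france'` now rates 0.2 instead of 1)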
* Added a `bestMatchIndex` property to the results of `findBestMatch(..)`, which points to the best match in the supplied `targetStrings` array


![Build status](https://codeship.com/projects/2aa453d0-0959-0134-8a76-4abcb29fe9b4/status?branch=master)
[![Known Vulnerabilities](https://snyk.io/test/github/aceakash/string-similarity/badge.svg)](https://snyk.io/test/github/aceakash/string-similarity)
83 changes: 56 additions & 27 deletions compare-strings.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,53 +3,82 @@ module.exports = {
findBestMatch
};

/**
 * Rates how similar two strings are using Dice's coefficient over character
 * bigrams (adjacent two-character sequences).
 *
 * All whitespace is stripped before comparing, so bigrams may span what were
 * word boundaries. The comparison is case-sensitive.
 *
 * @param {string} first - First string to compare.
 * @param {string} second - Second string to compare.
 * @returns {number} A value between 0 (no bigrams in common) and 1 (identical
 *   once whitespace is removed).
 */
function compareTwoStrings(first, second) {
  const a = first.replace(/\s+/g, '');
  const b = second.replace(/\s+/g, '');

  // Identical strings (including two empty ones) are a perfect match.
  if (a === b) return 1;
  // A string shorter than two characters has no bigrams, so nothing can overlap.
  if (a.length < 2 || b.length < 2) return 0;

  // Count each bigram of the first string; the counts are consumed below so a
  // repeated bigram is only matched as many times as it occurs in both strings.
  const remaining = new Map();
  for (let i = 0; i < a.length - 1; i++) {
    const bigram = a.substring(i, i + 2);
    remaining.set(bigram, (remaining.get(bigram) || 0) + 1);
  }

  let intersectionSize = 0;
  for (let i = 0; i < b.length - 1; i++) {
    const bigram = b.substring(i, i + 2);
    const count = remaining.get(bigram) || 0;

    if (count > 0) {
      remaining.set(bigram, count - 1);
      intersectionSize++;
    }
  }

  // A string of length n has n - 1 bigrams, hence the "- 2" in the denominator.
  return (2.0 * intersectionSize) / (a.length + b.length - 2);
}

/**
 * Rates every target string against `mainString` and reports the best one.
 *
 * @param {string} mainString - The string to match each target against.
 * @param {string[]} targetStrings - Candidate strings; must be a non-empty array of strings.
 * @returns {{ratings: {target: string, rating: number}[], bestMatch: {target: string, rating: number}, bestMatchIndex: number}}
 *   `ratings` has one entry per target, in input order; `bestMatch` is the
 *   highest-rated entry and `bestMatchIndex` is its position in `targetStrings`.
 * @throws {Error} If the arguments fail `areArgsValid`.
 */
function findBestMatch(mainString, targetStrings) {
  if (!areArgsValid(mainString, targetStrings)) throw new Error('Bad arguments: First argument should be a string, second should be an array of strings');

  const ratings = [];
  let bestMatchIndex = 0;

  for (let i = 0; i < targetStrings.length; i++) {
    const target = targetStrings[i];
    const rating = compareTwoStrings(mainString, target);
    ratings.push({ target, rating });

    // Strict ">" keeps the earliest target when several share the top rating.
    if (rating > ratings[bestMatchIndex].rating) {
      bestMatchIndex = i;
    }
  }

  const bestMatch = ratings[bestMatchIndex];

  return { ratings, bestMatch, bestMatchIndex };
}

/**
 * Recursively flattens arbitrarily nested arrays into a single flat array.
 *
 * @param {*} arr - A (possibly nested) array, or any other value.
 * @returns {Array} The flattened elements; a non-array input is returned
 *   wrapped in a one-element array.
 */
function flattenDeep(arr) {
  if (!Array.isArray(arr)) return [arr];
  return arr.reduce((flat, item) => flat.concat(flattenDeep(item)), []);
}

/**
 * Checks the arguments passed to `findBestMatch`.
 *
 * @param {*} mainString - Must be a string.
 * @param {*} targetStrings - Must be a non-empty array containing only strings.
 * @returns {boolean} `true` when both arguments have the expected shape.
 */
function areArgsValid(mainString, targetStrings) {
  if (typeof mainString !== 'string') return false;
  if (!Array.isArray(targetStrings)) return false;
  if (!targetStrings.length) return false;

  // Test every element explicitly. Using find() here is wrong because it
  // returns the offending element itself, so falsy non-strings (null, 0,
  // undefined, false) would look like "nothing found" and pass validation.
  for (const target of targetStrings) {
    if (typeof target !== 'string') return false;
  }
  return true;
}

/**
 * Lists every pair of adjacent characters in a string, in order.
 *
 * @param {string} str - The string to split into pairs.
 * @returns {string[]} The two-character substrings; empty when `str` has
 *   fewer than two characters.
 */
function letterPairs(str) {
  const pairs = [];
  for (let i = 0; i < str.length - 1; i++) {
    pairs.push(str.substring(i, i + 2));
  }
  return pairs;
}

/**
 * Builds the upper-cased letter pairs of each space-separated word in a string.
 *
 * Pairs never span a space: the string is split on single spaces first and
 * each word is paired separately via `letterPairs`.
 *
 * @param {string} str - The string to process.
 * @returns {string[]} All pairs from all words, flattened into one array.
 */
function wordLetterPairs(str) {
  const words = str.toUpperCase().split(' ');
  const pairsPerWord = words.map(letterPairs);
  return flattenDeep(pairsPerWord);
}
16 changes: 8 additions & 8 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "string-similarity",
"version": "2.0.0",
"version": "3.0.0",
"description": "Finds degree of similarity between strings, based on Dice's Coefficient, which is mostly better than Levenshtein distance.",
"main": "compare-strings.js",
"scripts": {
Expand Down Expand Up @@ -29,6 +29,6 @@
"author": "Akash Kurdekar <npm@kurdekar.com> (http://untilfalse.com/)",
"license": "ISC",
"devDependencies": {
"jasmine": "^3.2.0"
"jasmine": "^3.3.0"
}
}
53 changes: 32 additions & 21 deletions spec/compare-strings.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,32 @@ describe('compareTwoStrings', function () {
var compareTwoStrings = stringSimilarity.compareTwoStrings;

it('is a function', function () {
expect(typeof compareTwoStrings).toBe('function');
expect(typeof compareTwoStrings).toEqual('function');
});

it('returns the correct value for different inputs:', function () {
const testData = [
{first: 'french', second: 'quebec', expected: 0},
{first: 'france', second: 'france', expected: 1},
{first: 'fRaNce', second: 'france', expected: 1},
{first: 'healed', second: 'sealed', expected: 0.8},
{first: 'web applications', second: 'applications of the web', expected: 0.896551724137931},
{first: 'this will have a typo somewhere', second: 'this will huve a typo somewhere', expected: 0.9},
{first: 'this has one extra word', second: 'this has one word', expected: 0.8333333333333334},
{first: 'a', second: 'a', expected: 1},
{first: 'a', second: 'b', expected: 0},
{first: '', second: '', expected: 1},
{first: 'a', second: '', expected: 0},
{first: '', second: 'a', expected: 0}
];
{ first: 'french', second: 'quebec', expected: 0 },
{ first: 'france', second: 'france', expected: 1 },
{ first: 'fRaNce', second: 'france', expected: 0.2 },
{ first: 'healed', second: 'sealed', expected: 0.8 },
{ first: 'web applications', second: 'applications of the web', expected: 0.7878787878787878 },
{ first: 'this will have a typo somewhere', second: 'this will huve a typo somewhere', expected: 0.92 },
{ first: 'Olive-green table for sale, in extremely good condition.', second: 'For sale: table in very good condition, olive green in colour.', expected: 0.6060606060606061 },
{ first: 'Olive-green table for sale, in extremely good condition.', second: 'For sale: green Subaru Impreza, 210,000 miles', expected: 0.2558139534883721 },
{ first: 'Olive-green table for sale, in extremely good condition.', second: 'Wanted: mountain bike with at least 21 gears.', expected: 0.1411764705882353 },
{ first: 'this has one extra word', second: 'this has one word', expected: 0.7741935483870968 },
{ first: 'a', second: 'a', expected: 1 },
{ first: 'a', second: 'b', expected: 0 },
{ first: '', second: '', expected: 1 },
{ first: 'a', second: '', expected: 0 },
{ first: '', second: 'a', expected: 0 },
{ first: 'apple event', second: 'apple event', expected: 1 },
{ first: 'iphone', second: 'iphone x', expected: 0.9090909090909091 }
];

testData.forEach(td => {
expect(compareTwoStrings(td.first, td.second)).toEqual(td.expected);
expect(compareTwoStrings(td.first, td.second)).toBe(td.expected, td);
});
});
});
Expand Down Expand Up @@ -78,16 +83,22 @@ describe('findBestMatch', function () {
var matches = findBestMatch('healed', ['mailed', 'edward', 'sealed', 'theatre']);

expect(matches.ratings).toEqual([
{target: 'mailed', rating: 0.4},
{target: 'edward', rating: 0.2},
{target: 'sealed', rating: 0.8},
{target: 'theatre', rating: 0.36363636363636365}
{ target: 'mailed', rating: 0.4 },
{ target: 'edward', rating: 0.2 },
{ target: 'sealed', rating: 0.8 },
{ target: 'theatre', rating: 0.36363636363636365 }
]);
});

it("returns the best match and it's similarity rating", function () {
it("returns the best match and its similarity rating", function () {
var matches = findBestMatch('healed', ['mailed', 'edward', 'sealed', 'theatre']);

expect(matches.bestMatch).toEqual({target: 'sealed', rating: 0.8});
expect(matches.bestMatch).toEqual({ target: 'sealed', rating: 0.8 });
});

it("returns the index of best match from the target strings", function () {
var matches = findBestMatch('healed', ['mailed', 'edward', 'sealed', 'theatre']);

expect(matches.bestMatchIndex).toBe(2);
});
});

0 comments on commit ccdb537

Please sign in to comment.