Skip to content

Commit

Permalink
added maxLength to double metaphone
Browse files Browse the repository at this point in the history
  • Loading branch information
chrisumbel committed Feb 20, 2012
2 parents 456d80f + d6ec04b commit 95ff89f
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 16 deletions.
3 changes: 2 additions & 1 deletion README.md
Expand Up @@ -170,7 +170,8 @@ the raw phonetics are obtained with phonetics()

console.log('phonetics'.phonetics());

DoubleMetaphone deals with two encodings returned in an array
DoubleMetaphone deals with two encodings returned in an array. This
feature is experimental and subject to change.

var natural = require('natural');
var dm = natural.DoubleMetaphone;
Expand Down
26 changes: 16 additions & 10 deletions lib/natural/phonetics/double_metaphone.js
Expand Up @@ -29,11 +29,18 @@ function isVowel(c) {
return c.match(/[aeiouy]/i);
}

function process(token, spyCallback) {
/**
 * Caps a string at a given number of characters.
 *
 * @param {string} string - text to shorten
 * @param {number} length - maximum number of characters to keep
 * @returns {string} the first `length` characters when the input is at least
 *   that long; otherwise the input unchanged
 */
function truncate(string, length) {
    // Negated `>=` (rather than `<`) so a missing/NaN length leaves the input as is.
    var fitsWithinLimit = !(string.length >= length);

    return fitsWithinLimit ? string : string.substring(0, length);
}

function process(token, maxLength) {
token = token.toUpperCase();
var primary = '', secondary = '';
var pos = 0;
var spy = {};
// Fall back to a 32-character cap when the caller does not supply maxLength.
// This must be an assignment: a bare `==` comparison here would be evaluated
// and discarded, leaving maxLength undefined and the cap never applied.
maxLength = maxLength || 32;

function subMatch(startOffset, stopOffset, terms) {
return subMatchAbsolute(pos + startOffset, pos + stopOffset, terms);
Expand Down Expand Up @@ -325,7 +332,7 @@ function process(token, spyCallback) {

pos += 2;
} else if(pos == token.length - 1
&& subMatch(-2, pos, ['AI', 'OI'])) {
&& subMatch(-2, 0, ['AI', 'OI'])) {
addSecondary('', 'S');
} else if(token[pos + 1] != 'L' && (
token[pos - 1] != 'A' && token[pos - 1] != 'I')) {
Expand Down Expand Up @@ -402,11 +409,9 @@ function process(token, spyCallback) {
var san = (token.substring(0, 3) == 'SAN');
var startsWithVowel = isVowel(token[0]);
var slavoGermanic = token.match(/(W|K|CZ|WITZ)/);
spy.slavoGermanic = slavoGermanic;

if(subMatch(0, 2, ['GN', 'KN', 'PN', 'WR', 'PS'])) {
pos++;
spy.initialSilentConsonantSkipped = true;
}

while(pos < token.length) {
Expand Down Expand Up @@ -478,13 +483,14 @@ function process(token, spyCallback) {
break;
}

pos++;
}
if(primary.length >= maxLength && secondary.length >= maxLength) {
break;
}

if(spyCallback)
spyCallback(spy);
pos++;
}

return [primary, secondary];
return [truncate(primary, maxLength), truncate(secondary, maxLength)];
}

function compare(stringA, stringB) {
Expand Down
6 changes: 5 additions & 1 deletion package.json
@@ -1,7 +1,7 @@
{
"name": "natural",
"description": "General natural language (tokenizing, stemming, classification, inflection, phonetics, tfidf, WordNet) facilities for node.",
"version": "0.0.67",
"version": "0.0.68",
"homepage": "https://github.com/NaturalNode/natural",
"engines": {
"node": ">=0.4.9"
Expand All @@ -17,7 +17,11 @@
"author": "Chris Umbel <chris@chrisumbel.com>",
"keywords": ["natural", "language", "porter", "lancaster", "stemmer", "bayes",
"classifier", "phonetic", "metaphone", "inflector", "wordnet", "tf-idf",
"logistic", "regression", "doublemetaphone", "double"],
"main": "./lib/natural/index.js",
"maintainers": [{
"name": "Chris Umbel",
Expand Down
29 changes: 25 additions & 4 deletions spec/double_metaphone_spec.js
Expand Up @@ -700,12 +700,33 @@ describe('double metaphone', function() {
expect(encodings[1]).toMatch(/APRPRT/);

encodings = doubleMetaphone.process('intervention');
expect(encodings[0]).toMatch(/ANTRFNXN/);
expect(encodings[1]).toMatch(/ANTRFNXN/);

encodings = doubleMetaphone.process('français');
console.log(encodings);
expect(encodings[0]).toBe('ANTRFNXN');
expect(encodings[1]).toBe('ANTRFNXN');

encodings = doubleMetaphone.process('Français');
expect(encodings[0]).toBe('FRNS');
expect(encodings[1]).toBe('FRNSS');
});

// Verifies that the second argument to process() caps the length of both
// returned encodings.
it('should truncate codes if specified', function() {
// A limit of 4 is expected to yield the 4-character code 'MTRK' for both encodings.
var encodings = doubleMetaphone.process('Matrix', 4);
expect(encodings[0]).toBe('MTRK');
expect(encodings[1]).toBe('MTRK');

// With a limit of 4 both encodings are expected to be 'FRNS'.
encodings = doubleMetaphone.process('Français', 4);
expect(encodings[0]).toBe('FRNS');
expect(encodings[1]).toBe('FRNS');
});

// Verifies that encodings no longer than the requested limit are returned
// unchanged.
it('should not truncate code is shorter than specification', function() {
// A limit of 32 is far above the 5-character code, so 'MTRKS' is expected in full.
var encodings = doubleMetaphone.process('Matrix', 32);
expect(encodings[0]).toBe('MTRKS');
expect(encodings[1]).toBe('MTRKS');

// A limit of 5 leaves both the 4-character primary and the 5-character
// secondary encoding intact.
encodings = doubleMetaphone.process('Français', 5);
expect(encodings[0]).toBe('FRNS');
expect(encodings[1]).toBe('FRNSS');
});

it('should compare', function() {
Expand Down

0 comments on commit 95ff89f

Please sign in to comment.