Skip to content

Commit

Permalink
feat(lyra): adds stemming capabilities during indexing and search
Browse files Browse the repository at this point in the history
  • Loading branch information
micheleriva committed Jun 8, 2022
1 parent cf8f2d4 commit 246bdc9
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 13 deletions.
4 changes: 3 additions & 1 deletion packages/lyra/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@
"types": "./dist/esm/lyra.d.ts",
"dependencies": {
"fastq": "^1.13.0",
"nanoid": "3.3.4"
"nanoid": "3.3.4",
"natural": "^5.2.2"
},
"devDependencies": {
"@types/jest": "^27.5.0",
"@types/natural": "^5.1.1",
"jest": "^28.1.0",
"ts-jest": "^28.0.2",
"typescript": "^4.6.4"
Expand Down
12 changes: 9 additions & 3 deletions packages/lyra/src/lyra.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { Trie } from "./prefix-tree/trie";
import * as ERRORS from "./errors";
import { tokenize } from "./tokenizer";
import { formatNanoseconds, getNanosecondsTime } from "./utils";
import { Language } from "./stemmer";

export type PropertyType = "string" | "number" | "boolean";

Expand All @@ -30,6 +31,7 @@ type LyraIndex = Map<string, Trie>;
// Shape of the object that `insert` pushes onto the queue and that `_insert`
// destructures when indexing a document.
type QueueDocParams = {
// Identifier generated for the document with `nanoid()` in `insert`.
id: string;
// The document to store and index.
doc: object;
// Language forwarded to `tokenize` when indexing the document's string fields.
language: Language;
};

type SearchResult = Promise<{
Expand Down Expand Up @@ -183,7 +185,10 @@ export class Lyra {
return ids;
}

public async insert(doc: object): Promise<{ id: string }> {
public async insert(
doc: object,
language: Language = "english"
): Promise<{ id: string }> {
const id = nanoid();

if (!(await this.checkInsertDocSchema(doc))) {
Expand All @@ -193,12 +198,13 @@ export class Lyra {
await this.queue.push({
id,
doc,
language,
});

return { id };
}

private async _insert({ doc, id }: QueueDocParams): Promise<void> {
private async _insert({ doc, id, language }: QueueDocParams): Promise<void> {
const index = this.index;
this.docs.set(id, doc);

Expand All @@ -210,7 +216,7 @@ export class Lyra {
// recursiveTrieInsertion((doc as any)[key]);
} else if (typeof (doc as any)[key] === "string") {
const requestedTrie = index.get(key);
const tokens = tokenize((doc as any)[key]);
const tokens = tokenize((doc as any)[key], language);

for (const token of tokens) {
requestedTrie?.insert(token, id);
Expand Down
57 changes: 57 additions & 0 deletions packages/lyra/src/stemmer.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import natural from "natural";

/**
 * Union of the string literals listed in `SUPPORTED_LANGUAGES`, derived from
 * the array so the type and the runtime list cannot drift apart.
 */
export type Language = typeof SUPPORTED_LANGUAGES[number];

/**
 * Languages for which `stemArray` selects a stemmer. Declared `as const` so
 * each entry keeps its literal type.
 *
 * NOTE(review): "portugese" is a misspelling of "portuguese". It is part of
 * the exported `Language` type and is matched verbatim in `stemArray`, so
 * renaming it needs a coordinated, backward-compatible change.
 */
export const SUPPORTED_LANGUAGES = [
"dutch",
"english",
"french",
"italian",
"norwegian",
"portugese",
"russian",
"spanish",
"swedish",
] as const;

/**
 * Stems every token in `input` with the stemmer that `natural` exposes for
 * `language`.
 *
 * @param input - Tokens to stem. The array itself is not modified.
 * @param language - Selects which `natural` stemmer is used.
 * @returns A new array holding the stemmed tokens in input order. When no
 * stemmer matches `language` (only reachable from untyped callers), `input`
 * is returned as-is.
 */
export function stemArray(input: string[], language: Language): string[] {
  let stemmer: { stem(token: string): string } | undefined;

  switch (language) {
    case "dutch":
      stemmer = natural.PorterStemmerNl;
      break;
    case "english":
      stemmer = natural.PorterStemmer;
      break;
    case "french":
      stemmer = natural.PorterStemmerFr;
      break;
    case "italian":
      stemmer = natural.PorterStemmerIt;
      break;
    case "norwegian":
      stemmer = natural.PorterStemmerNo;
      break;
    case "portugese":
      stemmer = natural.PorterStemmerPt;
      break;
    case "russian":
      stemmer = natural.PorterStemmerRu;
      break;
    case "spanish":
      stemmer = natural.PorterStemmerEs;
      break;
    case "swedish":
      stemmer = natural.PorterStemmerSv;
      break;
    default:
      // Unknown language: fall through and return the tokens unstemmed.
      break;
  }

  // Copy into a const so the narrowing survives inside the arrow function.
  const selected = stemmer;
  if (!selected) {
    return input;
  }

  // Call `stem` as a method rather than passing `selected.stem` to `map`:
  // a detached reference loses its `this` binding and would also receive
  // map's extra (index, array) arguments.
  return input.map((token) => selected.stem(token));
}
19 changes: 11 additions & 8 deletions packages/lyra/src/tokenizer.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
export function tokenize(input: string): Set<string> {
return new Set(
input
.toLowerCase()
.replace(/[^a-z0-9 -]/g, "")
.split(" ")
.filter(Boolean)
);
import { WordTokenizer } from "natural";
import { Language, stemArray } from "./stemmer";

const tokenizer = new WordTokenizer();

/**
 * Splits `input` into word tokens with the module-level `WordTokenizer`,
 * stems them for `language`, and collapses duplicates.
 *
 * @param input - Text to tokenize.
 * @param language - Language used for stemming; defaults to "english".
 * @returns The set of distinct stemmed tokens.
 */
export function tokenize(
  input: string,
  language: Language = "english"
): Set<string> {
  const words = tokenizer.tokenize(input);
  const stems = stemArray(words, language);

  return new Set(stems);
}
54 changes: 53 additions & 1 deletion pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 246bdc9

Please sign in to comment.