Merge pull request #2142 from botpress/fl_merge_tokens_slots

fix(nlu): merge tokens if both are made of chosen special characters in slot extraction

franklevasseur committed Jul 22, 2019
2 parents 41cbebe + 56275d6 commit 48e3171

Showing 7 changed files with 103 additions and 32 deletions.
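In short, the tokenizer can split a run of digits or punctuation (for instance a phone number or an ID) into several one-character pieces, which prevents a single entity span from lining up with a single token during slot extraction. The new mergeSpecialCharactersTokens helper folds consecutive tokens made only of a chosen character set back into one token. A minimal usage sketch, assuming the makeTokens / mergeSpecialCharactersTokens API added in token-utils.ts below (import path and sample strings are illustrative only):

import { makeTokens, mergeSpecialCharactersTokens } from 'modules/nlu/src/backend/tools/token-utils'

const SPACE = '\u2581' // SentencePiece word-boundary marker used by the tokenizer

const text = 'flight ab123'
const pieces = [SPACE + 'flight', SPACE + 'ab', '1', '2', '3']
const tokens = makeTokens(pieces, text)

// consecutive tokens made only of the chosen characters are folded into one
const merged = mergeSpecialCharactersTokens(tokens, '0123456789'.split(''))
// merged token values: [ SPACE + 'flight', SPACE + 'ab', '123' ]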
2 changes: 1 addition & 1 deletion modules/nlu/src/backend/engine.ts
@@ -18,7 +18,7 @@ import { sanitize } from './pipelines/language/sanitizer'
import CRFExtractor from './pipelines/slots/crf_extractor'
import { generateTrainingSequence } from './pipelines/slots/pre-processor'
import Storage from './storage'
-import { makeTokens } from './tools/make-tokens'
+import { makeTokens } from './tools/token-utils'
import { allInRange } from './tools/math'
import { LanguageProvider, NluMlRecommendations } from './typings'
import {
@@ -3,7 +3,7 @@ import * as sdk from 'botpress/sdk'
import _ from 'lodash'

import { initNLUStruct } from '../../pipeline-manager'
-import { makeTokens } from '../../tools/make-tokens'
+import { makeTokens } from '../../tools/token-utils'
import { LanguageProvider, NLUHealth } from '../../typings'

import PatternExtractor from './pattern_extractor'
@@ -1,7 +1,7 @@
import * as sdk from 'botpress/sdk'
import _ from 'lodash'

-import { makeTokens } from '../../tools/make-tokens'
+import { makeTokens } from '../../tools/token-utils'
import { BIO, LanguageProvider, NLUHealth } from '../../typings'

import { generatePredictionSequence, generateTrainingSequence } from './pre-processor'
6 changes: 4 additions & 2 deletions modules/nlu/src/backend/pipelines/slots/pre-processor.ts
@@ -1,7 +1,7 @@
import * as sdk from 'botpress/sdk'
import _ from 'lodash'

-import { makeTokens } from '../../tools/make-tokens'
+import { makeTokens, mergeSpecialCharactersTokens } from '../../tools/token-utils'
import { allInRange } from '../../tools/math'
import { LanguageProvider } from '../../typings'
import { BIO, Sequence, Token } from '../../typings'
@@ -48,13 +48,15 @@ const _generateTrainingTokens = languageProvider => async (
})
}

+const charactersToMerge: string[] = '"+è-_!@#$%?&*()1234567890~`/\\[]{}:;<>='.split('')

export const generatePredictionSequence = async (
input: string,
intentName: string,
entities: sdk.NLU.Entity[],
toks: Token[]
): Promise<Sequence> => {
-const tokens = toks.map(tok => {
+const tokens = mergeSpecialCharactersTokens(toks, charactersToMerge).map(tok => {
const matchedEntities = entities
.filter(e => allInRange([tok.start, tok.end], e.meta.start, e.meta.end + 1))
.map(e => e.name)
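With the lines above, prediction sequences now run the tokens through mergeSpecialCharactersTokens before entities are attached, so a slot value that the tokenizer shredded into single characters is matched as one token. A sketch of the effect under this hunk's charactersToMerge (the utterance and pieces are illustrative only):

const text = 'call 514-123-4567'
const pieces = [SPACE + 'call', SPACE + '514', '-', '123', '-', '45', '67']
const merged = mergeSpecialCharactersTokens(makeTokens(pieces, text), charactersToMerge)
// merged token values: [ SPACE + 'call', SPACE + '514-123-4567' ]
// allInRange([tok.start, tok.end], ...) can now attach one entity span to one token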
26 changes: 0 additions & 26 deletions modules/nlu/src/backend/tools/make-tokens.ts

This file was deleted.

@@ -1,4 +1,4 @@
-import { makeTokens } from './make-tokens'
+import { makeTokens, mergeSpecialCharactersTokens } from './token-utils'

const SPACE = '\u2581'

@@ -45,3 +45,36 @@ describe('Tokens generation', () => {
expect(actualEnds).toEqual(expectedEnds)
})
})

describe('Token Merging', () => {
test('Merge special Characters with numbers should merge all consecutive numbers', async () => {
// arrange
const text = '1234vanillaIce4321'
const stringTokens = [SPACE + '1', '23', '4', 'vanilla', 'ice', '43', '2', '1']
const tokens = makeTokens(stringTokens, text)

const numbers = '0123456789'.split('')

// act
const actualTokens = mergeSpecialCharactersTokens(tokens, numbers)

// assert
const expectedTokens = [SPACE + '1234', 'vanilla', 'ice', '4321']
expect(actualTokens.map(t => t.value)).toEqual(expectedTokens)
})

test('Merge special Characters with pipes should merge all consecutive pipes', async () => {
// arrange
const text = '|||yes|||yes|||yes|||'
const base = ['yes', '|', '|', '|']
const stringTokens = [SPACE + '|', '|', '|', ...base, ...base, ...base]
const tokens = makeTokens(stringTokens, text)

// act
const actualTokens = mergeSpecialCharactersTokens(tokens, ['|'])

// assert
const expectedTokens = [SPACE + '|||', 'yes', '|||', 'yes', '|||', 'yes', '|||']
expect(actualTokens.map(t => t.value)).toEqual(expectedTokens)
})
})
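One boundary case the new tests do not spell out: because of the headHasNoSpace guard in mergeSpecialCharactersTokens, a run of special characters that starts a new word (a token carrying the SPACE marker) is not folded into the previous run. A small sketch with illustrative values:

const text = '$$ $$'
const stringTokens = [SPACE + '$', '$', SPACE + '$', '$']
const tokens = mergeSpecialCharactersTokens(makeTokens(stringTokens, text), ['$'])
// token values: [ SPACE + '$$', SPACE + '$$' ]; the SPACE on the third piece blocks the merge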
62 changes: 62 additions & 0 deletions modules/nlu/src/backend/tools/token-utils.ts
@@ -0,0 +1,62 @@
import _ from 'lodash'
import { Token } from '../typings'

const SPACE = '\u2581'

export const makeTokens = (stringTokens: string[], text: string) => {
return stringTokens.reduce(reduceTokens(text), [] as Token[])
}

const reduceTokens = (text: string) => (currentTokens: Token[], token: string) => {
const trimedToken = token.replace(SPACE, '')

const previousToken = currentTokens[currentTokens.length - 1]
const cursor = previousToken ? previousToken.end : 0

const cutText = text.substring(cursor).toLowerCase()
const start = cutText.indexOf(trimedToken) + cursor
const sanitized = text.substr(start, trimedToken.length)

const newToken = {
value: token,
cannonical: sanitized,
start,
end: start + trimedToken.length,
matchedEntities: []
} as Token

return currentTokens.concat(newToken)
}

function tokenIsAllMadeOf(tok: string, chars: string[]) {
const tokenCharsLeft = _.without(tok.split(''), ...chars)
return _.isEmpty(tokenCharsLeft)
}

export const mergeSpecialCharactersTokens = (tokens: Token[], specialChars: string[]) => {
let current: Token | undefined
const final: Token[] = []

for (const head of tokens) {
if (!current) {
current = head
continue
}

const currentIsAllSpecialChars = tokenIsAllMadeOf(current!.value.replace(SPACE, ''), specialChars)

const headHasNoSpace = !head.value.includes(SPACE)
const headIsAllSpecialChars = tokenIsAllMadeOf(head.value, specialChars)

if (currentIsAllSpecialChars && headIsAllSpecialChars && headHasNoSpace) {
current.value += head.value
current.cannonical += head.cannonical
current.end = head.end
current.matchedEntities = current.matchedEntities.concat(head.matchedEntities)
} else {
final.push(current)
current = head
}
}
return current ? [...final, current] : final
}
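For reference, the offset bookkeeping in reduceTokens searches for each piece in the not-yet-consumed remainder of the lowercased text, so start and end always index into the original string and cannonical keeps the original casing. A worked example with illustrative values:

const text = 'Vanilla Ice'
const tokens = makeTokens([SPACE + 'vanilla', SPACE + 'ice'], text)
// tokens[0] -> { value: SPACE + 'vanilla', cannonical: 'Vanilla', start: 0, end: 7 }
// tokens[1] -> { value: SPACE + 'ice',     cannonical: 'Ice',     start: 8, end: 11 }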
