Merge pull request #2142 from botpress/fl_merge_tokens_slots

fix(nlu): merge tokens if both are made of chosen special characters in slot extraction

franklevasseur committed Jul 22, 2019
2 parents 41cbebe + 56275d6 commit 48e3171

Showing 7 changed files with 103 additions and 32 deletions.
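In short, the tokenizer can split a run of digits or punctuation (for instance a phone number or an ID) into several one-character pieces, which prevents a single entity span from lining up with a single token during slot extraction. The new mergeSpecialCharactersTokens helper folds consecutive tokens made only of a chosen character set back into one token. A minimal usage sketch, assuming the makeTokens / mergeSpecialCharactersTokens API added in token-utils.ts below (import path and sample strings are illustrative only):

import { makeTokens, mergeSpecialCharactersTokens } from 'modules/nlu/src/backend/tools/token-utils'

const SPACE = '\u2581' // SentencePiece word-boundary marker used by the tokenizer

const text = 'flight ab123'
const pieces = [SPACE + 'flight', SPACE + 'ab', '1', '2', '3']
const tokens = makeTokens(pieces, text)

// consecutive tokens made only of the chosen characters are folded into one
const merged = mergeSpecialCharactersTokens(tokens, '0123456789'.split(''))
// merged token values: [ SPACE + 'flight', SPACE + 'ab', '123' ]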
2 changes: 1 addition & 1 deletion modules/nlu/src/backend/engine.ts
@@ -18,7 +18,7 @@ import { sanitize } from './pipelines/language/sanitizer'
import CRFExtractor from './pipelines/slots/crf_extractor'
import { generateTrainingSequence } from './pipelines/slots/pre-processor'
import Storage from './storage'
-import { makeTokens } from './tools/make-tokens'
+import { makeTokens } from './tools/token-utils'
import { allInRange } from './tools/math'
import { LanguageProvider, NluMlRecommendations } from './typings'
import {
@@ -3,7 +3,7 @@ import * as sdk from 'botpress/sdk'
import _ from 'lodash'

import { initNLUStruct } from '../../pipeline-manager'
-import { makeTokens } from '../../tools/make-tokens'
+import { makeTokens } from '../../tools/token-utils'
import { LanguageProvider, NLUHealth } from '../../typings'

import PatternExtractor from './pattern_extractor'
@@ -1,7 +1,7 @@
import * as sdk from 'botpress/sdk'
import _ from 'lodash'

-import { makeTokens } from '../../tools/make-tokens'
+import { makeTokens } from '../../tools/token-utils'
import { BIO, LanguageProvider, NLUHealth } from '../../typings'

import { generatePredictionSequence, generateTrainingSequence } from './pre-processor'
6 changes: 4 additions & 2 deletions modules/nlu/src/backend/pipelines/slots/pre-processor.ts
@@ -1,7 +1,7 @@
import * as sdk from 'botpress/sdk'
import _ from 'lodash'

-import { makeTokens } from '../../tools/make-tokens'
+import { makeTokens, mergeSpecialCharactersTokens } from '../../tools/token-utils'
import { allInRange } from '../../tools/math'
import { LanguageProvider } from '../../typings'
import { BIO, Sequence, Token } from '../../typings'
@@ -48,13 +48,15 @@ const _generateTrainingTokens = languageProvider => async (
})
}

+const charactersToMerge: string[] = '"+è-_!@#$%?&*()1234567890~`/\\[]{}:;<>='.split('')

export const generatePredictionSequence = async (
input: string,
intentName: string,
entities: sdk.NLU.Entity[],
toks: Token[]
): Promise<Sequence> => {
-const tokens = toks.map(tok => {
+const tokens = mergeSpecialCharactersTokens(toks, charactersToMerge).map(tok => {
const matchedEntities = entities
.filter(e => allInRange([tok.start, tok.end], e.meta.start, e.meta.end + 1))
.map(e => e.name)
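With the lines above, prediction sequences now run the tokens through mergeSpecialCharactersTokens before entities are attached, so a slot value that the tokenizer shredded into single characters is matched as one token. A sketch of the effect under this hunk's charactersToMerge (the utterance and pieces are illustrative only):

const text = 'call 514-123-4567'
const pieces = [SPACE + 'call', SPACE + '514', '-', '123', '-', '45', '67']
const merged = mergeSpecialCharactersTokens(makeTokens(pieces, text), charactersToMerge)
// merged token values: [ SPACE + 'call', SPACE + '514-123-4567' ]
// allInRange([tok.start, tok.end], ...) can now attach one entity span to one token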
26 changes: 0 additions & 26 deletions modules/nlu/src/backend/tools/make-tokens.ts

This file was deleted.

@@ -1,4 +1,4 @@
-import { makeTokens } from './make-tokens'
+import { makeTokens, mergeSpecialCharactersTokens } from './token-utils'

const SPACE = '\u2581'

@@ -45,3 +45,36 @@ describe('Tokens generation', () => {
expect(actualEnds).toEqual(expectedEnds)
})
})

describe('Token Merging', () => {
test('Merge special Characters with numbers should merge all consecutive numbers', async () => {
// arrange
const text = '1234vanillaIce4321'
const stringTokens = [SPACE + '1', '23', '4', 'vanilla', 'ice', '43', '2', '1']
const tokens = makeTokens(stringTokens, text)

const numbers = '0123456789'.split('')

// act
const actualTokens = mergeSpecialCharactersTokens(tokens, numbers)

// assert
const expectedTokens = [SPACE + '1234', 'vanilla', 'ice', '4321']
expect(actualTokens.map(t => t.value)).toEqual(expectedTokens)
})

test('Merge special Characters with pipes should merge all consecutive pipes', async () => {
// arrange
const text = '|||yes|||yes|||yes|||'
const base = ['yes', '|', '|', '|']
const stringTokens = [SPACE + '|', '|', '|', ...base, ...base, ...base]
const tokens = makeTokens(stringTokens, text)

// act
const actualTokens = mergeSpecialCharactersTokens(tokens, ['|'])

// assert
const expectedTokens = [SPACE + '|||', 'yes', '|||', 'yes', '|||', 'yes', '|||']
expect(actualTokens.map(t => t.value)).toEqual(expectedTokens)
})
})
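One boundary case the new tests do not spell out: because of the headHasNoSpace guard in mergeSpecialCharactersTokens, a run of special characters that starts a new word (a token carrying the SPACE marker) is not folded into the previous run. A small sketch with illustrative values:

const text = '$$ $$'
const stringTokens = [SPACE + '$', '$', SPACE + '$', '$']
const tokens = mergeSpecialCharactersTokens(makeTokens(stringTokens, text), ['$'])
// token values: [ SPACE + '$$', SPACE + '$$' ]; the SPACE on the third piece blocks the merge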
62 changes: 62 additions & 0 deletions modules/nlu/src/backend/tools/token-utils.ts
@@ -0,0 +1,62 @@
import _ from 'lodash'
import { Token } from '../typings'

const SPACE = '\u2581'

export const makeTokens = (stringTokens: string[], text: string) => {
return stringTokens.reduce(reduceTokens(text), [] as Token[])
}

const reduceTokens = (text: string) => (currentTokens: Token[], token: string) => {
const trimedToken = token.replace(SPACE, '')

const previousToken = currentTokens[currentTokens.length - 1]
const cursor = previousToken ? previousToken.end : 0

const cutText = text.substring(cursor).toLowerCase()
const start = cutText.indexOf(trimedToken) + cursor
const sanitized = text.substr(start, trimedToken.length)

const newToken = {
value: token,
cannonical: sanitized,
start,
end: start + trimedToken.length,
matchedEntities: []
} as Token

return currentTokens.concat(newToken)
}

function tokenIsAllMadeOf(tok: string, chars: string[]) {
const tokenCharsLeft = _.without(tok.split(''), ...chars)
return _.isEmpty(tokenCharsLeft)
}

export const mergeSpecialCharactersTokens = (tokens: Token[], specialChars: string[]) => {
let current: Token | undefined
const final: Token[] = []

for (const head of tokens) {
if (!current) {
current = head
continue
}

const currentIsAllSpecialChars = tokenIsAllMadeOf(current!.value.replace(SPACE, ''), specialChars)

const headHasNoSpace = !head.value.includes(SPACE)
const headIsAllSpecialChars = tokenIsAllMadeOf(head.value, specialChars)

if (currentIsAllSpecialChars && headIsAllSpecialChars && headHasNoSpace) {
current.value += head.value
current.cannonical += head.cannonical
current.end = head.end
current.matchedEntities = current.matchedEntities.concat(head.matchedEntities)
} else {
final.push(current)
current = head
}
}
return current ? [...final, current] : final
}
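For reference, the offset bookkeeping in reduceTokens searches for each piece in the not-yet-consumed remainder of the lowercased text, so start and end always index into the original string and cannonical keeps the original casing. A worked example with illustrative values:

const text = 'Vanilla Ice'
const tokens = makeTokens([SPACE + 'vanilla', SPACE + 'ice'], text)
// tokens[0] -> { value: SPACE + 'vanilla', cannonical: 'Vanilla', start: 0, end: 7 }
// tokens[1] -> { value: SPACE + 'ice',     cannonical: 'Ice',     start: 8, end: 11 }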
