fix(nlu): intent classification w/o slots
slvnperron committed Apr 29, 2019
1 parent ebe8462 commit 00ee463
Showing 5 changed files with 45 additions and 10 deletions.
1 change: 1 addition & 0 deletions internal-modules
Submodule internal-modules added at 023138
5 changes: 4 additions & 1 deletion modules/nlu/src/backend/confusion-engine.ts
@@ -2,6 +2,7 @@ import * as sdk from 'botpress/sdk'
 import { flatten, groupBy } from 'lodash'
 
 import ScopedEngine from './engine'
+import { keepEntityValues } from './pipelines/slots/pre-processor'
 import { FiveFolder, RecordCallback, Result } from './tools/five-fold'
 
 type TrainingEntry = {
@@ -89,7 +90,9 @@ export default class ConfusionEngine extends ScopedEngine {
     const defs = this._entriesToDefinition(trainSet)
 
     await this.loadModels(defs, this.modelName)
-    const actual = await Promise.mapSeries(testSet, (__, idx) => this.extract(testSet[idx].utterance, []))
+    const actual = await Promise.mapSeries(testSet, (__, idx) =>
+      this.extract(keepEntityValues(testSet[idx].utterance), [])
+    )
 
     testSet.forEach((__, idx) => record(testSet[idx].definition.name, actual[idx].intent.name))
   }
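For context: utterances stored with an intent carry [value](slot_name) markup. During confusion-matrix evaluation, held-out utterances should be extracted the way a raw user message would be, so keepEntityValues first strips the markup down to the literal values. A minimal sketch, with a hypothetical annotated utterance:

  import { keepEntityValues } from './pipelines/slots/pre-processor'

  keepEntityValues('book a table for [four](party_size) people')
  // => 'book a table for four people'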
29 changes: 25 additions & 4 deletions modules/nlu/src/backend/engine.ts
@@ -17,7 +17,7 @@ import { FastTextLanguageId } from './pipelines/language/ft_lid'
 import { sanitize } from './pipelines/language/sanitizer'
 import { tokenize } from './pipelines/language/tokenizers'
 import CRFExtractor from './pipelines/slots/crf_extractor'
-import { generateTrainingSequence } from './pipelines/slots/pre-processor'
+import { generateTrainingSequence, keepEntityTypes } from './pipelines/slots/pre-processor'
 import Storage from './storage'
 import { Engine, EntityExtractor, LanguageIdentifier, Model, MODEL_TYPES, SlotExtractor } from './typings'
@@ -332,10 +332,31 @@ export default class ScopedEngine implements Engine {
     let ret: any = { errored: true }
     const t1 = Date.now()
     try {
+      const entities = await this._extractEntities(text, ret.language)
+
+      const entitiesToReplace = _.chain(entities)
+        .filter(x => x.type === 'pattern' || x.type === 'list')
+        .orderBy(['entity.meta.start', 'entity.meta.confidence'], ['asc', 'desc'])
+        .value()
+
+      let noEntitiesText = ''
+      let cursor = 0
+
+      for (const entity of entitiesToReplace) {
+        if (entity.meta.start < cursor) {
+          continue
+        }
+
+        noEntitiesText += text.substr(cursor, entity.meta.start - cursor) + entity.name
+        cursor = entity.meta.end
+      }
+
+      noEntitiesText += text.substr(cursor, text.length - cursor)
+
       ret.language = await this.langDetector.identify(text)
-      ret = { ...ret, ...(await this._extractIntents(text, ret.language, includedContexts)) }
-      ret.entities = await this._extractEntities(text, ret.language)
-      ret.slots = await this._extractSlots(text, ret.intent, ret.entities)
+      ret = { ...ret, ...(await this._extractIntents(noEntitiesText, ret.language, includedContexts)) }
+      ret.entities = entities
+      ret.slots = await this._extractSlots(text, ret.intent, entities)
       debugEntities('slots', { text, slots: ret.slots })
       ret.errored = false
     } catch (error) {
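This loop is the heart of the fix: every extracted pattern or list entity is substituted in the input text by its entity name before intent classification, so utterances that differ only in entity values (numbers, dates, product names) collapse onto the same surface form. Note that slot extraction still runs on the original text, so character offsets stay valid. A self-contained sketch of the substitution, using a simplified entity shape assumed for illustration:

  type Entity = { name: string; type: string; meta: { start: number; end: number; confidence: number } }

  // Rebuild the text left to right, replacing each non-overlapping
  // pattern/list entity span with the entity's name.
  function replaceEntityValues(text: string, entities: Entity[]): string {
    const toReplace = entities
      .filter(e => e.type === 'pattern' || e.type === 'list')
      .sort((a, b) => a.meta.start - b.meta.start || b.meta.confidence - a.meta.confidence)

    let out = ''
    let cursor = 0
    for (const e of toReplace) {
      if (e.meta.start < cursor) {
        continue // overlaps a span we already replaced
      }
      out += text.substr(cursor, e.meta.start - cursor) + e.name
      cursor = e.meta.end
    }
    return out + text.substr(cursor)
  }

  const found: Entity[] = [{ name: 'number', type: 'pattern', meta: { start: 7, end: 8, confidence: 1 } }]
  replaceEntityValues('I want 3 pizzas', found)
  // => 'I want number pizzas'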
3 changes: 2 additions & 1 deletion modules/nlu/src/backend/pipelines/intents/svm_classifier.ts
@@ -9,6 +9,7 @@ import { Model } from '../../typings'
 import FTWordVecFeaturizer from '../language/ft_featurizer'
 import { sanitize } from '../language/sanitizer'
 import { tokenize } from '../language/tokenizers'
+import { keepEntityTypes } from '../slots/pre-processor'
 
 import tfidf, { TfidfInput, TfidfOutput } from './tfidf'
@@ -73,7 +74,7 @@ export default class SVMClassifier {
       .value()
 
     const intentsWTokens = await Promise.map(intentDefs, async intent => {
-      const lowerUtterances = intent.utterances.map(x => sanitize(x.toLowerCase()))
+      const lowerUtterances = intent.utterances.map(x => keepEntityTypes(sanitize(x.toLowerCase())))
 
       return {
         ...intent,
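This is the training-side counterpart of the substitution in engine.ts: keepEntityTypes collapses each [value](name) annotation in a training utterance to its parenthesized name, so the classifier trains on the same placeholder tokens it will see at inference time once extracted entities have been replaced. A hypothetical example (sanitize omitted):

  keepEntityTypes('fly me to [paris](city)')
  // => 'fly me to city'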
17 changes: 13 additions & 4 deletions modules/nlu/src/backend/pipelines/slots/pre-processor.ts
@@ -1,14 +1,23 @@
 import * as sdk from 'botpress/sdk'
 import _ from 'lodash'
 
 import { BIO, Sequence, Tag, Token } from '../../typings'
 
-const SLOTS_REGEX = /\[(.+?)\]\(([\w_\.-]+)\)/gi
+export const SLOTS_REGEX = /\[(.+?)\]\(([\w_\.-]+)\)/gi
 
 // TODO replace this for appropriate tokenizer
 const _tokenize = (input: string): string[] => {
   return input.split(' ').filter(w => w.length)
 }
+
+export function keepEntityTypes(text: string): string {
+  return text.replace(SLOTS_REGEX, '$2')
+}
+
+export function keepEntityValues(text: string): string {
+  return text.replace(SLOTS_REGEX, '$1')
+}
+
 const _makeToken = (value: string, matchedEntities: string[], start: number, tag = '', slot = ''): Token => {
   const token = {
     value,
@@ -33,9 +42,9 @@ const _generateTrainingTokens = (
   slot: string = '',
   slotDefinitions: sdk.NLU.SlotDefinition[] = []
 ): Token[] => {
-  const matchedEntities = slotDefinitions
-    .filter(slotDef => slot && slotDef.name === slot)
-    .map(slotDef => slotDef.entity)
+  const matchedEntities = _.flatten(
+    slotDefinitions.filter(slotDef => slot && slotDef.name === slot).map(slotDef => slotDef.entities)
+  )
 
   return _tokenize(input).map((t, idx) => {
     let tag = BIO.OUT
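The last hunk adapts training-token generation to slot definitions that reference several entities: slotDef.entity becomes slotDef.entities, and the per-slot lists are flattened into a single array of entity names. A small sketch with hypothetical slot definitions (shapes simplified):

  import _ from 'lodash'

  const slotDefinitions = [
    { name: 'destination', entities: ['city', 'airport'] },
    { name: 'date', entities: ['time'] }
  ]
  const slot = 'destination'

  const matchedEntities = _.flatten(
    slotDefinitions.filter(slotDef => slot && slotDef.name === slot).map(slotDef => slotDef.entities)
  )
  // => ['city', 'airport']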
