fix(nlu): intent classification w/o slots
slvnperron committed Apr 29, 2019
1 parent ebe8462 commit 00ee463
Showing 5 changed files with 45 additions and 10 deletions.
1 change: 1 addition & 0 deletions internal-modules
Submodule internal-modules added at 023138
5 changes: 4 additions & 1 deletion modules/nlu/src/backend/confusion-engine.ts
@@ -2,6 +2,7 @@ import * as sdk from 'botpress/sdk'
 import { flatten, groupBy } from 'lodash'
 
 import ScopedEngine from './engine'
+import { keepEntityValues } from './pipelines/slots/pre-processor'
 import { FiveFolder, RecordCallback, Result } from './tools/five-fold'
 
 type TrainingEntry = {
@@ -89,7 +90,9 @@ export default class ConfusionEngine extends ScopedEngine {
     const defs = this._entriesToDefinition(trainSet)
 
     await this.loadModels(defs, this.modelName)
-    const actual = await Promise.mapSeries(testSet, (__, idx) => this.extract(testSet[idx].utterance, []))
+    const actual = await Promise.mapSeries(testSet, (__, idx) =>
+      this.extract(keepEntityValues(testSet[idx].utterance), [])
+    )
 
     testSet.forEach((__, idx) => record(testSet[idx].definition.name, actual[idx].intent.name))
   }
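For context: utterances stored with an intent carry [value](slot_name) markup. During confusion-matrix evaluation, held-out utterances should be extracted the way a raw user message would be, so keepEntityValues first strips the markup down to the literal values. A minimal sketch, with a hypothetical annotated utterance:

  import { keepEntityValues } from './pipelines/slots/pre-processor'

  keepEntityValues('book a table for [four](party_size) people')
  // => 'book a table for four people'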
29 changes: 25 additions & 4 deletions modules/nlu/src/backend/engine.ts
@@ -17,7 +17,7 @@ import { FastTextLanguageId } from './pipelines/language/ft_lid'
 import { sanitize } from './pipelines/language/sanitizer'
 import { tokenize } from './pipelines/language/tokenizers'
 import CRFExtractor from './pipelines/slots/crf_extractor'
-import { generateTrainingSequence } from './pipelines/slots/pre-processor'
+import { generateTrainingSequence, keepEntityTypes } from './pipelines/slots/pre-processor'
 import Storage from './storage'
 import { Engine, EntityExtractor, LanguageIdentifier, Model, MODEL_TYPES, SlotExtractor } from './typings'
@@ -332,10 +332,31 @@ export default class ScopedEngine implements Engine {
     let ret: any = { errored: true }
     const t1 = Date.now()
     try {
+      const entities = await this._extractEntities(text, ret.language)
+
+      const entitiesToReplace = _.chain(entities)
+        .filter(x => x.type === 'pattern' || x.type === 'list')
+        .orderBy(['entity.meta.start', 'entity.meta.confidence'], ['asc', 'desc'])
+        .value()
+
+      let noEntitiesText = ''
+      let cursor = 0
+
+      for (const entity of entitiesToReplace) {
+        if (entity.meta.start < cursor) {
+          continue
+        }
+
+        noEntitiesText += text.substr(cursor, entity.meta.start - cursor) + entity.name
+        cursor = entity.meta.end
+      }
+
+      noEntitiesText += text.substr(cursor, text.length - cursor)
+
       ret.language = await this.langDetector.identify(text)
-      ret = { ...ret, ...(await this._extractIntents(text, ret.language, includedContexts)) }
-      ret.entities = await this._extractEntities(text, ret.language)
-      ret.slots = await this._extractSlots(text, ret.intent, ret.entities)
+      ret = { ...ret, ...(await this._extractIntents(noEntitiesText, ret.language, includedContexts)) }
+      ret.entities = entities
+      ret.slots = await this._extractSlots(text, ret.intent, entities)
       debugEntities('slots', { text, slots: ret.slots })
       ret.errored = false
     } catch (error) {
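This loop is the heart of the fix: every extracted pattern or list entity is substituted in the input text by its entity name before intent classification, so utterances that differ only in entity values (numbers, dates, product names) collapse onto the same surface form. Note that slot extraction still runs on the original text, so character offsets stay valid. A self-contained sketch of the substitution, using a simplified entity shape assumed for illustration:

  type Entity = { name: string; type: string; meta: { start: number; end: number; confidence: number } }

  // Rebuild the text left to right, replacing each non-overlapping
  // pattern/list entity span with the entity's name.
  function replaceEntityValues(text: string, entities: Entity[]): string {
    const toReplace = entities
      .filter(e => e.type === 'pattern' || e.type === 'list')
      .sort((a, b) => a.meta.start - b.meta.start || b.meta.confidence - a.meta.confidence)

    let out = ''
    let cursor = 0
    for (const e of toReplace) {
      if (e.meta.start < cursor) {
        continue // overlaps a span we already replaced
      }
      out += text.substr(cursor, e.meta.start - cursor) + e.name
      cursor = e.meta.end
    }
    return out + text.substr(cursor)
  }

  const found: Entity[] = [{ name: 'number', type: 'pattern', meta: { start: 7, end: 8, confidence: 1 } }]
  replaceEntityValues('I want 3 pizzas', found)
  // => 'I want number pizzas'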
3 changes: 2 additions & 1 deletion modules/nlu/src/backend/pipelines/intents/svm_classifier.ts
@@ -9,6 +9,7 @@ import { Model } from '../../typings'
 import FTWordVecFeaturizer from '../language/ft_featurizer'
 import { sanitize } from '../language/sanitizer'
 import { tokenize } from '../language/tokenizers'
+import { keepEntityTypes } from '../slots/pre-processor'
 
 import tfidf, { TfidfInput, TfidfOutput } from './tfidf'
@@ -73,7 +74,7 @@ export default class SVMClassifier {
       .value()
 
     const intentsWTokens = await Promise.map(intentDefs, async intent => {
-      const lowerUtterances = intent.utterances.map(x => sanitize(x.toLowerCase()))
+      const lowerUtterances = intent.utterances.map(x => keepEntityTypes(sanitize(x.toLowerCase())))
 
       return {
         ...intent,
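This is the training-side counterpart of the substitution in engine.ts: keepEntityTypes collapses each [value](name) annotation in a training utterance to its parenthesized name, so the classifier trains on the same placeholder tokens it will see at inference time once extracted entities have been replaced. A hypothetical example (sanitize omitted):

  keepEntityTypes('fly me to [paris](city)')
  // => 'fly me to city'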
17 changes: 13 additions & 4 deletions modules/nlu/src/backend/pipelines/slots/pre-processor.ts
@@ -1,14 +1,23 @@
 import * as sdk from 'botpress/sdk'
 import _ from 'lodash'
 
 import { BIO, Sequence, Tag, Token } from '../../typings'
 
-const SLOTS_REGEX = /\[(.+?)\]\(([\w_\.-]+)\)/gi
+export const SLOTS_REGEX = /\[(.+?)\]\(([\w_\.-]+)\)/gi
 
 // TODO replace this for appropriate tokenizer
 const _tokenize = (input: string): string[] => {
   return input.split(' ').filter(w => w.length)
 }
+
+export function keepEntityTypes(text: string): string {
+  return text.replace(SLOTS_REGEX, '$2')
+}
+
+export function keepEntityValues(text: string): string {
+  return text.replace(SLOTS_REGEX, '$1')
+}
+
 const _makeToken = (value: string, matchedEntities: string[], start: number, tag = '', slot = ''): Token => {
   const token = {
     value,
@@ -33,9 +42,9 @@ const _generateTrainingTokens = (
   slot: string = '',
   slotDefinitions: sdk.NLU.SlotDefinition[] = []
 ): Token[] => {
-  const matchedEntities = slotDefinitions
-    .filter(slotDef => slot && slotDef.name === slot)
-    .map(slotDef => slotDef.entity)
+  const matchedEntities = _.flatten(
+    slotDefinitions.filter(slotDef => slot && slotDef.name === slot).map(slotDef => slotDef.entities)
+  )
 
   return _tokenize(input).map((t, idx) => {
     let tag = BIO.OUT
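The last hunk adapts training-token generation to slot definitions that reference several entities: slotDef.entity becomes slotDef.entities, and the per-slot lists are flattened into a single array of entity names. A small sketch with hypothetical slot definitions (shapes simplified):

  import _ from 'lodash'

  const slotDefinitions = [
    { name: 'destination', entities: ['city', 'airport'] },
    { name: 'date', entities: ['time'] }
  ]
  const slot = 'destination'

  const matchedEntities = _.flatten(
    slotDefinitions.filter(slotDef => slot && slotDef.name === slot).map(slotDef => slotDef.entities)
  )
  // => ['city', 'airport']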
