fix(nlu): discrete number of tokens in none utt

botpress · Oct 25, 2019 · 29b9284 · 29b9284
1 parent 25cf530
commit 29b9284
Showing 1 changed file with 24 additions and 15 deletions.
diff --git a/modules/nlu/src/backend/engine2/engine2.ts b/modules/nlu/src/backend/engine2/engine2.ts
@@ -922,7 +922,7 @@ export const AppendNoneIntents = async (input: TrainOutput, tools: Tools): Promi
   const joinChar = vocabulary.filter(x => x.startsWith(SPACE)).length >= vocabulary.length * 0.5 ? SPACE : ''
 
   const noneUtterances = _.range(0, nbOfNoneUtterances).map(() => {
-    const nbWords = _.random(avgTokens / 2, avgTokens * 2, false)
+    const nbWords = Math.round(_.random(avgTokens / 2, avgTokens * 2, false))
     return _.sampleSize(junkWords, nbWords).join(joinChar)
   })
 
@@ -960,21 +960,30 @@ const Utterances = async (raw_utterances: string[], languageCode: string, tools:
   const vectors = await tools.vectorize_tokens(uniqTokens, languageCode)
   const vectorMap = _.zipObject(uniqTokens, vectors)
 
-  return _.zip(tokens, parsed).map(([tokUtt, { utterance: utt, parsedSlots }]) => {
-    const vectors = tokUtt.map(t => vectorMap[t])
-    const utterance = new Utterance(tokUtt, vectors)
-
-    // TODO: temporary work-around
-    // covers a corner case where tokenization returns tokens that are not identical to `parsed` utterance
-    // the corner case is when there's a trailing space inside a slot at the end of the utterance, e.g. `my name is [Sylvain ](any)`
-    if (utterance.toString().length === utt.length) {
-      parsedSlots.forEach(s => {
-        utterance.tagSlot({ name: s.name, source: s.value, confidence: 1 }, s.cleanPosition.start, s.cleanPosition.end)
-      })
-    } // else we skip the slot
+  return _.zip(tokens, parsed)
+    .map(([tokUtt, { utterance: utt, parsedSlots }]) => {
+      if (tokUtt.length === 0) {
+        return
+      }
+      const vectors = tokUtt.map(t => vectorMap[t])
+      const utterance = new Utterance(tokUtt, vectors)
+
+      // TODO: temporary work-around
+      // covers a corner case where tokenization returns tokens that are not identical to `parsed` utterance
+      // the corner case is when there's a trailing space inside a slot at the end of the utterance, e.g. `my name is [Sylvain ](any)`
+      if (utterance.toString().length === utt.length) {
+        parsedSlots.forEach(s => {
+          utterance.tagSlot(
+            { name: s.name, source: s.value, confidence: 1 },
+            s.cleanPosition.start,
+            s.cleanPosition.end
+          )
+        })
+      } // else we skip the slot
 
-    return utterance
-  })
+      return utterance
+    })
+    .filter(Boolean)
 }
 
 const trainSlotTagger = async (input: TrainOutput, tools: Tools): Promise<Buffer> => {