Skip to content

Commit

Permalink
fix(nlu): discrete number of tokens in none utt
Browse files Browse the repository at this point in the history
  • Loading branch information
EFF committed Oct 25, 2019
1 parent 25cf530 commit 29b9284
Showing 1 changed file with 24 additions and 15 deletions.
39 changes: 24 additions & 15 deletions modules/nlu/src/backend/engine2/engine2.ts
Expand Up @@ -922,7 +922,7 @@ export const AppendNoneIntents = async (input: TrainOutput, tools: Tools): Promi
const joinChar = vocabulary.filter(x => x.startsWith(SPACE)).length >= vocabulary.length * 0.5 ? SPACE : ''

const noneUtterances = _.range(0, nbOfNoneUtterances).map(() => {
const nbWords = _.random(avgTokens / 2, avgTokens * 2, false)
const nbWords = Math.round(_.random(avgTokens / 2, avgTokens * 2, false))
return _.sampleSize(junkWords, nbWords).join(joinChar)
})

Expand Down Expand Up @@ -960,21 +960,30 @@ const Utterances = async (raw_utterances: string[], languageCode: string, tools:
const vectors = await tools.vectorize_tokens(uniqTokens, languageCode)
const vectorMap = _.zipObject(uniqTokens, vectors)

return _.zip(tokens, parsed).map(([tokUtt, { utterance: utt, parsedSlots }]) => {
const vectors = tokUtt.map(t => vectorMap[t])
const utterance = new Utterance(tokUtt, vectors)

// TODO: temporary work-around
// covers a corner case where tokenization returns tokens that are not identical to `parsed` utterance
// the corner case is when there's a trailing space inside a slot at the end of the utterance, e.g. `my name is [Sylvain ](any)`
if (utterance.toString().length === utt.length) {
parsedSlots.forEach(s => {
utterance.tagSlot({ name: s.name, source: s.value, confidence: 1 }, s.cleanPosition.start, s.cleanPosition.end)
})
} // else we skip the slot
return _.zip(tokens, parsed)
.map(([tokUtt, { utterance: utt, parsedSlots }]) => {
if (tokUtt.length === 0) {
return
}
const vectors = tokUtt.map(t => vectorMap[t])
const utterance = new Utterance(tokUtt, vectors)

// TODO: temporary work-around
// covers a corner case where tokenization returns tokens that are not identical to `parsed` utterance
// the corner case is when there's a trailing space inside a slot at the end of the utterance, e.g. `my name is [Sylvain ](any)`
if (utterance.toString().length === utt.length) {
parsedSlots.forEach(s => {
utterance.tagSlot(
{ name: s.name, source: s.value, confidence: 1 },
s.cleanPosition.start,
s.cleanPosition.end
)
})
} // else we skip the slot

return utterance
})
return utterance
})
.filter(Boolean)
}

const trainSlotTagger = async (input: TrainOutput, tools: Tools): Promise<Buffer> => {
Expand Down

0 comments on commit 29b9284

Please sign in to comment.