// pre-processor.ts
import * as sdk from 'botpress/sdk'
import _ from 'lodash'

import { BIO, Sequence, Tag, Token } from '../../typings'

// Matches slot annotations of the form "[value](slot_label)"
export const SLOTS_REGEX = /\[(.+?)\]\(([\w_\.-]+)\)/gi
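// Illustrative: in 'fly to [New York](city)', group 1 captures 'New York' and group 2 captures 'city'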

// TODO: replace this with a proper tokenizer
// Naive whitespace tokenizer; empty strings produced by repeated spaces are filtered out
const _tokenize = (input: string): string[] => {
  return input.split(' ').filter(w => w.length)
}
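
// Illustrative: _tokenize('book a  flight') -> ['book', 'a', 'flight']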

// Strips slot annotations, keeping only the slot labels: '[value](label)' -> 'label'
export function keepEntityTypes(text: string): string {
  return text.replace(SLOTS_REGEX, '$2')
}

// Strips slot annotations, keeping only the slot values: '[value](label)' -> 'value'
export function keepEntityValues(text: string): string {
  return text.replace(SLOTS_REGEX, '$1')
}
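
// Illustrative:
//   keepEntityTypes('fly to [New York](city)')  -> 'fly to city'
//   keepEntityValues('fly to [New York](city)') -> 'fly to New York'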

// Builds a Token spanning [start, start + value.length); tag and slot are only set when provided
const _makeToken = (value: string, matchedEntities: string[], start: number, tag = '', slot = ''): Token => {
  const token = {
    value,
    matchedEntities,
    start,
    end: start + value.length
  } as Token

  if (tag) {
    token.tag = <Tag>tag
  }
  if (slot) {
    token.slot = slot
  }
  return token
}
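
// e.g. _makeToken('York', ['city'], 11, BIO.INSIDE, 'city')
// -> { value: 'York', matchedEntities: ['city'], start: 11, end: 15, tag: BIO.INSIDE, slot: 'city' }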

// TODO: use the same algorithm as in the prediction sequence
const _generateTrainingTokens = (
  input: string,
  start: number,
  slot: string = '',
  slotDefinitions: sdk.NLU.SlotDefinition[] = []
): Token[] => {
  // Entities allowed to fill this slot, taken from its definition
  const matchedEntities = _.flatten(
    slotDefinitions.filter(slotDef => slot && slotDef.name === slot).map(slotDef => slotDef.entities)
  )

  return _tokenize(input).map((t, idx) => {
    let tag = BIO.OUT
    if (slot) {
      // First token of a slot value is tagged BEGINNING, subsequent tokens INSIDE
      tag = idx === 0 ? BIO.BEGINNING : BIO.INSIDE
    }
    const token = _makeToken(t, matchedEntities, start, tag, slot)
    start += t.length + 1 // advance past the token and one space; TODO: reuse the prediction-sequence cursor logic
    return token
  })
}
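
// Illustrative: given a hypothetical defs = [{ name: 'city', entities: ['city-entity'] }],
// _generateTrainingTokens('New York', 8, 'city', defs) yields 'New' tagged BIO.BEGINNING and
// 'York' tagged BIO.INSIDE, both carrying slot 'city'; without a slot, every token is BIO.OUT.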

export const generateTrainingSequence = (
  input: string,
  slotDefinitions: sdk.NLU.SlotDefinition[],
  intentName: string = ''
): Sequence => {
  let matches: RegExpExecArray | null
  let start = 0
  let tokens: Token[] = []

  do {
    matches = SLOTS_REGEX.exec(input)
    if (matches) {
      // Plain text between the previous annotation and this one, minus the separating space
      const sub = input.substr(start, matches.index - start - 1)
      tokens = [
        ...tokens,
        ..._generateTrainingTokens(sub, start),
        ..._generateTrainingTokens(matches[1], matches.index + 1, matches[2], slotDefinitions) // +1 skips the '['
      ]
      start = matches.index + matches[0].length
    }
  } while (matches)

  if (start !== input.length) {
    const remainingPart = input.substr(start, input.length - start)
    tokens = [...tokens, ..._generateTrainingTokens(remainingPart, start)]
  }

  return {
    intent: intentName,
    cannonical: tokens.map(t => t.value).join(' '), // spelling follows the Sequence typing
    tokens
  }
}
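
// Illustrative, with the hypothetical defs above:
//   generateTrainingSequence('fly to [New York](city)', defs, 'book-flight')
//   -> intent 'book-flight', cannonical 'fly to New York', with 'fly'/'to' tagged BIO.OUT
//      and 'New'/'York' tagged BIO.BEGINNING/BIO.INSIDE carrying slot 'city'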

export const generatePredictionSequence = (input: string, intentName: string, entities: sdk.NLU.Entity[]): Sequence => {
  const cannonical = input // keep a reference to the full input; `input` is consumed below as tokens are matched

  let currentIdx = 0
  const tokens = _tokenize(input).map(value => {
    const inputIdx = input.indexOf(value)
    currentIdx += inputIdx // skip separators longer than one char, e.g. words separated by multiple spaces
    input = input.slice(inputIdx + value.length)

    // Entities whose character span covers this whole token
    const matchedEntities = entities
      .filter(e => e.meta.start <= currentIdx && e.meta.end >= currentIdx + value.length)
      .map(e => e.name)

    const token = _makeToken(value, matchedEntities, currentIdx)
    currentIdx = token.end // move the cursor to the end of the token in the original input
    return token
  })

  return {
    intent: intentName,
    cannonical,
    tokens
  }
}
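
// Usage sketch (hypothetical caller; entity offsets follow e.meta.start / e.meta.end as used above):
//   const entities = [{ name: 'city', meta: { start: 7, end: 15 } }] as sdk.NLU.Entity[]
//   const seq = generatePredictionSequence('fly to new york', 'book-flight', entities)
//   -> 'new' and 'york' get matchedEntities ['city']; their start/end offsets index the original input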