Skip to content

Commit

Permalink
feat(nlu): load stopwords for given language
Browse files Browse the repository at this point in the history
  • Loading branch information
EFF committed Feb 13, 2020
1 parent de805e4 commit 51e98fc
Show file tree
Hide file tree
Showing 5 changed files with 322 additions and 181 deletions.
2 changes: 1 addition & 1 deletion modules/nlu/build.extras.js
@@ -1,3 +1,3 @@
module.exports = {
// NOTE(review): the two `copyFiles` lines below appear to be the removed and added sides of
// this diff hunk (its header reports "1 addition & 1 deletion"). The second line differs only
// by the extra 'src/backend/tools/stopwords/*' glob.
copyFiles: ['src/backend/tools/pretrained/*', 'src/backend/pretrained/*']
copyFiles: ['src/backend/tools/pretrained/*', 'src/backend/pretrained/*', 'src/backend/tools/stopwords/*']
}
2 changes: 1 addition & 1 deletion modules/nlu/src/backend/engine2/training-pipeline.ts
Expand Up @@ -322,7 +322,7 @@ export const AppendNoneIntent = async (input: TrainOutput, tools: Tools): Promis
const junkWords = await tools.generateSimilarJunkWords(_.uniq(vocabWithDupes), input.languageCode)
const avgTokens = _.meanBy(allUtterances, x => x.tokens.length)
const nbOfNoneUtterances = Math.max((allUtterances.length * 2) / 3, 20)
const stopWords = getStopWordsForLang(input.languageCode)
const stopWords = await getStopWordsForLang(input.languageCode)
const vocabWords = _.chain(input.tfIdf)
.toPairs()
.filter(([word, tfidf]) => tfidf <= 0.3)
Expand Down
214 changes: 35 additions & 179 deletions modules/nlu/src/backend/tools/stopWords.ts
@@ -1,183 +1,39 @@
/**
 * Built-in English stop words, all lowercase, one word (or contraction) per entry.
 *
 * Every entry must be a single token: an entry containing whitespace can never
 * match an individual word, which is why 'ours' and 'ourselves' are listed separately.
 */
const en = [
  'a',
  'about',
  'above',
  'after',
  'again',
  'against',
  'all',
  'am',
  'an',
  'and',
  'any',
  'are',
  "aren't",
  'as',
  'at',
  'be',
  'because',
  'been',
  'before',
  'being',
  'below',
  'between',
  'both',
  'but',
  'by',
  "can't",
  'cannot',
  'could',
  "couldn't",
  'did',
  "didn't",
  'do',
  'does',
  "doesn't",
  'doing',
  "don't",
  'down',
  'during',
  'each',
  'few',
  'for',
  'from',
  'further',
  'had',
  "hadn't",
  'has',
  "hasn't",
  'have',
  "haven't",
  'having',
  'he',
  "he'd",
  "he'll",
  "he's",
  'her',
  'here',
  "here's",
  'hers',
  'herself',
  'him',
  'himself',
  'his',
  'how',
  "how's",
  'i',
  "i'd",
  "i'll",
  "i'm",
  "i've",
  'if',
  'in',
  'into',
  'is',
  "isn't",
  'it',
  "it's",
  'its',
  'itself',
  "let's",
  'me',
  'more',
  'most',
  "mustn't",
  'my',
  'myself',
  'no',
  'nor',
  'not',
  'of',
  'off',
  'on',
  'once',
  'only',
  'or',
  'other',
  'ought',
  'our',
  'ours',
  'ourselves',
  'out',
  'over',
  'own',
  'same',
  "shan't",
  'she',
  "she'd",
  "she'll",
  "she's",
  'should',
  "shouldn't",
  'so',
  'some',
  'such',
  'than',
  'that',
  "that's",
  'the',
  'their',
  'theirs',
  'them',
  'themselves',
  'then',
  'there',
  "there's",
  'these',
  'they',
  "they'd",
  "they'll",
  "they're",
  "they've",
  'this',
  'those',
  'through',
  'to',
  'too',
  'under',
  'until',
  'up',
  'very',
  'was',
  "wasn't",
  'we',
  "we'd",
  "we'll",
  "we're",
  "we've",
  'were',
  "weren't",
  'what',
  "what's",
  'when',
  "when's",
  'where',
  "where's",
  'which',
  'while',
  'who',
  "who's",
  'whom',
  'why',
  "why's",
  'with',
  "won't",
  'would',
  "wouldn't",
  'you',
  "you'd",
  "you'll",
  "you're",
  "you've",
  'your',
  'yours',
  'yourself',
  'yourselves'
]
import { createReadStream } from 'fs'
import fs from 'fs-extra'
import path from 'path'
import readline from 'readline'

export function getStopWordsForLang(language: string): string[] {
// return []
if (language !== 'en') {
fs.exists

// In-memory cache of stop-word lists, keyed by the language string passed to getStopWordsForLang.
const StopWordsByLang: _.Dictionary<string[]> = {}

/**
 * Reads the stop-word list for `language` from `stopwords/<language>.txt`,
 * resolved relative to this module's directory.
 *
 * The file is read line by line; each line is trimmed and blank lines are skipped,
 * so a trailing newline or padded entry never yields an empty or unmatched stop word.
 *
 * @param language - language code used to build the file name
 * @returns the stop words found in the file, or an empty array when no file exists
 *   for that language
 * @throws rejects with the underlying stream error for any read failure other than
 *   a missing file (`ENOENT`)
 */
async function loadStopWords(language: string): Promise<string[]> {
  const fn = path.join(__dirname, `stopwords/${language}.txt`)

  return new Promise<string[]>((resolve, reject) => {
    const stopWords: string[] = []
    const stream = createReadStream(fn)
    const rl = readline.createInterface({ input: stream, crlfDelay: Infinity })

    // Open the file directly instead of checking for existence first: a missing file
    // simply means the language is unsupported, any other error is a real failure.
    stream.on('error', (err: NodeJS.ErrnoException) => {
      // Settle the promise BEFORE closing the interface: closing emits 'close', whose
      // handler would otherwise resolve first and mask the rejection.
      if (err.code === 'ENOENT') {
        resolve([])
      } else {
        reject(err)
      }
      rl.close()
    })

    rl.on('line', l => {
      const word = l.trim()
      if (word) {
        stopWords.push(word)
      }
    })

    rl.on('close', () => resolve(stopWords))
  })
}

/**
 * Returns the stop words for `language`, loading them on first use and serving
 * the list cached in `StopWordsByLang` on later calls.
 *
 * @param language - language code identifying which stop-word list to return
 * @returns the stop-word list held in the cache for that language
 */
export async function getStopWordsForLang(language: string): Promise<string[]> {
  const cached = StopWordsByLang[language]
  if (cached) {
    return cached
  }

  const loaded = await loadStopWords(language)
  StopWordsByLang[language] = loaded
  return loaded
}

0 comments on commit 51e98fc

Please sign in to comment.