From 7ecb2f04099785591d00b2438d4b65c19acb08e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 22:00:40 +0100 Subject: [PATCH 01/19] Update Babel config --- .babelrc | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/.babelrc b/.babelrc index 7a0c5a7..738dfbe 100644 --- a/.babelrc +++ b/.babelrc @@ -1,21 +1,15 @@ { - "env": { - "development": { - "presets": [ - ["@babel/env"] - ], - "plugins": [ - "add-module-exports" - ] - }, - "production": { - "presets": [ - ["@babel/env"], - "minify" - ], - "plugins": [ - "add-module-exports" - ] - } - } + "env": { + "test": { + "plugins": ["@babel/plugin-transform-modules-commonjs"] + }, + "development": { + "presets": [["@babel/env"]], + "plugins": ["add-module-exports"] + }, + "production": { + "presets": [["@babel/env"], "minify"], + "plugins": ["add-module-exports"] + } + } } From 441ee4df8524f8954fae8f4b4f1482ab8219e409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 22:01:01 +0100 Subject: [PATCH 02/19] Update ESLint config --- .eslintrc | 19 ------------------- .eslintrc.js | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 19 deletions(-) delete mode 100644 .eslintrc create mode 100755 .eslintrc.js diff --git a/.eslintrc b/.eslintrc deleted file mode 100644 index d199a23..0000000 --- a/.eslintrc +++ /dev/null @@ -1,19 +0,0 @@ -{ - "parserOptions": { - "ecmaVersion": 9, - "sourceType": "module" - }, - "rules": { - "semi": ["warn", "never"], - "no-mixed-spaces-and-tabs": "warn", - "indent": [ - "warn" - ], - "max-statements-per-line": [ - "warn", - { - "max": 2 - } - ] - } -} diff --git a/.eslintrc.js b/.eslintrc.js new file mode 100755 index 0000000..5a000bb --- /dev/null +++ b/.eslintrc.js @@ -0,0 +1,21 @@ +module.exports = { + env: { + browser: true, + node: true, + es2021: true, + jest: true + }, + extends: ['eslint:recommended'], + parserOptions: { + ecmaVersion: 12, + sourceType: 'module' + }, + rules: { + quotes: ['error', 'single', { avoidEscape: true }], + semi: ['error', 'never'], + indent: 'off', + 'no-mixed-spaces-and-tabs': ['warn', 'smart-tabs'], + 'linebreak-style': ['error', 'unix'], + 'no-unused-vars': 'warn' + } +} From 59ddb6bed88c378ff250565628975fe709dbdb81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 22:01:18 +0100 Subject: [PATCH 03/19] Update gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 670be12..d5505e5 100644 --- a/.gitignore +++ b/.gitignore @@ -14,5 +14,5 @@ jspm_packages .idea lib package-lock.json -yarn.lock .DS_Store +Thumbs.db From 5ea10314c389da26ddd9cb1eb185cbc357ffa38a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 22:12:11 +0100 Subject: [PATCH 04/19] Fix conflicting EditorConfig settings --- .editorconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.editorconfig b/.editorconfig index 9b9a53d..c1a910e 100644 --- a/.editorconfig +++ b/.editorconfig @@ -8,8 +8,7 @@ end_of_line = lf charset = utf-8 trim_trailing_whitespace = true insert_final_newline = true -indent_style = space -indent_size = 4 +indent_style = tab [*.md] trim_trailing_whitespace = false From fe6fd92ab94184626924dcf7eac2abb7b923e0d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 22:12:20 +0100 Subject: [PATCH 05/19] Update npmignore --- .npmignore | 11 ++++++++--- 1 file changed, 8 
insertions(+), 3 deletions(-) diff --git a/.npmignore b/.npmignore index 4058c52..da0fa57 100644 --- a/.npmignore +++ b/.npmignore @@ -1,15 +1,20 @@ *.log npm-debug.log* coverage +docs +.vscode .nyc_output node_modules package-lock.json yarn.lock src test -CHANGELOG.md -.travis.yml +CONTRIBUTING.md .editorconfig -.eslintrc +.eslintrc.js +.vscode .babelrc +webpack.config.js .gitignore +.DS_Store +Thumb.db From b043c27706f32b1263516f9dc8629cc2fb4d7ce5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 22:12:41 +0100 Subject: [PATCH 06/19] Add Prettier config --- .prettierrc | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100755 .prettierrc diff --git a/.prettierrc b/.prettierrc new file mode 100755 index 0000000..67d0eb5 --- /dev/null +++ b/.prettierrc @@ -0,0 +1,7 @@ +{ + "trailingComma": "none", + "tabWidth": 4, + "useTabs": true, + "semi": false, + "singleQuote": true +} From 1391769af0d7d65b32e1a0d61624e8673654b764 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 22:12:54 +0100 Subject: [PATCH 07/19] Add VSCode config --- .vscode/extensions.json | 3 +++ .vscode/settings.json | 6 ++++++ 2 files changed, 9 insertions(+) create mode 100644 .vscode/extensions.json create mode 100755 .vscode/settings.json diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..64dbfec --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,3 @@ +{ + "recommendations": ["esbenp.prettier-vscode"] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100755 index 0000000..0c4a68e --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "editor.formatOnSave": true, + "files.insertFinalNewline": true, + "editor.defaultFormatter": "esbenp.prettier-vscode", + "prettier.useTabs": true +} From 679778980d193ae4a9ad4cad7fff8c9668772ef2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 22:13:27 +0100 Subject: [PATCH 08/19] Remove unused Travis config --- .travis.yml | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 24b8730..0000000 --- a/.travis.yml +++ /dev/null @@ -1,10 +0,0 @@ -language: node_js -node_js: - - '8' - - '6' -script: - - npm run test - - npm run build -branches: - only: - - master From 706c8af0c28da4c89a811fb2a2d071fc8164c71d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 22:14:30 +0100 Subject: [PATCH 09/19] Update package dependencies and scripts --- package.json | 171 +++++++++++++++++++++++++-------------------------- 1 file changed, 83 insertions(+), 88 deletions(-) diff --git a/package.json b/package.json index 7b2c717..dff5988 100644 --- a/package.json +++ b/package.json @@ -1,90 +1,85 @@ { - "name": "ml-classify-text", - "version": "2.0.0", - "description": "Text classification using n-grams and cosine similarity", - "module": "./lib", - "main": "./lib", - "scripts": { - "clean": "rimraf lib", - "test": "npm run lint && npm run cover", - "test:prod": "cross-env BABEL_ENV=production npm run test", - "test:only": "mocha --require @babel/register --require @babel/core --recursive", - "test:watch": "npm test -- --watch", - "cover": "nyc --check-coverage npm run test:only", - "lint": "eslint src test", - "build": "webpack --mode=production --no-progress --hide-modules --config=webpack.config.js", - "prepublish": "npm run clean && npm run lint && npm run test && npm run build" - }, 
- "files": [ - "lib", - "src" - ], - "repository": { - "type": "git", - "url": "git+https://github.com/andreekeberg/ml-classify-text-js.git" - }, - "keywords": [ - "text classification", - "classification", - "classify", - "classifier", - "machine learning", - "machine", - "learning", - "ai", - "artificial intelligence", - "artificial", - "intelligence", - "n-gram", - "n-grams", - "cosine similarity", - "cosine", - "similarity", - "confidence", - "predict", - "prediction", - "model", - "train" - ], - "author": "André Ekeberg (https://andreekeberg.se/en/)", - "license": "MIT", - "bugs": { - "url": "https://github.com/andreekeberg/ml-classify-text-js/issues" - }, - "homepage": "https://github.com/andreekeberg/ml-classify-text-js", - "devDependencies": { - "@babel/core": "^7.10.4", - "@babel/plugin-transform-modules-amd": "^7.10.5", - "@babel/plugin-transform-modules-commonjs": "^7.6.0", - "@babel/plugin-transform-runtime": "^7.6.2", - "@babel/polyfill": "^7.10.4", - "@babel/preset-env": "^7.8.3", - "@babel/register": "^7.10.4", - "@babel/runtime": "^7.6.2", - "@babel/runtime-corejs3": "^7.6.2", - "babel-cli": "^6.26.0", - "babel-eslint": "^10.0.1", - "babel-loader": "^8.0.6", - "babel-plugin-add-module-exports": "^1.0.2", - "babel-polyfill": "^6.26.0", - "babel-preset-env": "^1.6.1", - "babel-preset-minify": "^0.5.0", - "babel-runtime": "^6.26.0", - "chai": "^4.1.2", - "core-js": "^3.2.1", - "cross-env": "^5.2.1", - "eslint": "^5.16.0", - "eslint-config-standard": "^14.1.1", - "eslint-plugin-node": "^11.1.0", - "jsdoc": "^3.6.5", - "jsdoc-to-markdown": "^6.0.1", - "mocha": "^6.1.3", - "nyc": "^13.3.0", - "rimraf": "^2.6.2", - "webpack": "^4.40.2", - "webpack-cli": "^3.3.9" - }, - "dependencies": { - "xregexp": "^4.3.0" - } + "name": "ml-classify-text", + "version": "2.0.0", + "description": "Text classification using n-grams and cosine similarity", + "module": "./lib", + "main": "./lib", + "scripts": { + "clean": "rimraf lib", + "test": "jest --coverage", + "test:watch": "jest --watchAll", + "test:prod": "cross-env BABEL_ENV=production npm run test", + "lint": "eslint src test", + "build": "webpack --mode=production --config=webpack.config.js", + "prepublish": "npm run clean && npm run lint && npm run test && npm run build" + }, + "files": [ + "lib" + ], + "repository": { + "type": "git", + "url": "git+https://github.com/andreekeberg/ml-classify-text-js.git" + }, + "keywords": [ + "text classification", + "classification", + "classify", + "classifier", + "machine learning", + "machine", + "learning", + "ai", + "artificial intelligence", + "artificial", + "intelligence", + "n-gram", + "n-grams", + "cosine similarity", + "cosine", + "similarity", + "confidence", + "predict", + "prediction", + "model", + "train" + ], + "author": "André Ekeberg (https://andreekeberg.se/en/)", + "license": "MIT", + "bugs": { + "url": "https://github.com/andreekeberg/ml-classify-text-js/issues" + }, + "homepage": "https://github.com/andreekeberg/ml-classify-text-js", + "devDependencies": { + "@babel/core": "^7.20.12", + "@babel/plugin-transform-modules-amd": "^7.20.11", + "@babel/plugin-transform-modules-commonjs": "^7.20.11", + "@babel/plugin-transform-runtime": "^7.19.6", + "@babel/polyfill": "^7.12.1", + "@babel/preset-env": "^7.20.2", + "@babel/register": "^7.18.9", + "@babel/runtime": "^7.20.13", + "@babel/runtime-corejs3": "^7.20.13", + "babel-cli": "^6.26.0", + "babel-eslint": "^10.1.0", + "babel-loader": "^9.1.2", + "babel-plugin-add-module-exports": "^1.0.4", + "babel-polyfill": "^6.26.0", + 
"babel-preset-env": "^1.7.0", + "babel-preset-minify": "^0.5.2", + "babel-runtime": "^6.26.0", + "core-js": "^3.27.2", + "cross-env": "^7.0.3", + "eslint": "^8.33.0", + "eslint-config-standard": "^17.0.0", + "eslint-plugin-node": "^11.1.0", + "jest": "^29.4.1", + "jsdoc": "^4.0.0", + "jsdoc-to-markdown": "^8.0.0", + "rimraf": "^4.1.2", + "webpack": "^5.75.0", + "webpack-cli": "^5.0.1" + }, + "dependencies": { + "xregexp": "^5.1.1" + } } From 758e8ef1275c6585a2c2db553fa1e0f110461d01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 22:16:06 +0100 Subject: [PATCH 10/19] Fix several bugs and migrate tests to Jest --- src/classifier.js | 612 +++++++++++++++++++++------------------- src/index.js | 6 +- src/model.js | 265 ++++++++--------- src/prediction.js | 96 +++---- src/vocabulary.js | 218 +++++++------- test/Classifier.test.js | 427 ++++++++++++++++++++++++++++ test/Model.test.js | 206 ++++++++++++++ test/Prediction.test.js | 73 +++++ test/Vocabulary.test.js | 177 ++++++++++++ test/classifier.js | 363 ------------------------ test/model.js | 186 ------------ test/prediction.js | 74 ----- test/vocabulary.js | 166 ----------- 13 files changed, 1507 insertions(+), 1362 deletions(-) create mode 100644 test/Classifier.test.js create mode 100644 test/Model.test.js create mode 100644 test/Prediction.test.js create mode 100644 test/Vocabulary.test.js delete mode 100644 test/classifier.js delete mode 100644 test/model.js delete mode 100644 test/prediction.js delete mode 100644 test/vocabulary.js diff --git a/src/classifier.js b/src/classifier.js index 073f74a..6c50012 100644 --- a/src/classifier.js +++ b/src/classifier.js @@ -1,6 +1,7 @@ import XRegExp from 'xregexp' -import Model from './model' -import Prediction from './prediction' +import Model from './Model' +import Prediction from './Prediction' +import Vocabulary from './Vocabulary' /** * @param {(Model|Object)} [model] @@ -11,290 +12,329 @@ import Prediction from './prediction' * @constructor */ class Classifier { - constructor(model = {}) { - if (!(model instanceof Model)) { - model = new Model(model) - } - - this._model = model - } - - /** - * Model instance - * - * @type {Model} - */ - get model() { - return this._model - } - - set model(model) { - if (!(model instanceof Model)) { - model = new Model(model) - } - - this._model = model - } - - /** - * Train the current model using an input string (or array of strings) and a corresponding label - * - * @param {(string|string[])} input - String, or an array of strings - * @param {string} label - Corresponding label - * @return {this} - */ - train(input, label) { - if (typeof input !== 'string' && !(input instanceof Array)) { - throw new Error('input must be either a string or Array') - } - - if (typeof label !== 'string') { - throw new Error('label must be a string') - } - - // If input isn't an array, convert to a single item array - if (!(input instanceof Array)) { - input = [input] - } - - input.forEach(string => { - // Convert the string to a tokenized object - let tokens = this.tokenize(string) - - // If we're using a vocabulary, convert the tokens to a vector where all - // indexes reference vocabulary terms (all terms not already in the - // vocabulary are automatically added) - if (this._model.vocabulary !== false) { - tokens = this.vectorize(tokens) - } - - // Set up an empty entry for the label if it does not exist - if (typeof this._model.data[label] === 'undefined') { - this._model.data[label] = {} - } - - // Add all occurrences to our model 
entry - Object.keys(tokens).forEach(index => { - let occurrences = tokens[index] - - if (typeof this._model.data[label][index] === 'undefined') { - this._model.data[label][index] = 0 - } - - this._model.data[label][index] += occurrences - }) - }) - - return this - } - - /** - * Return an array of one or more Prediction instances - * - * @param {string} input - Input string to make a prediction from - * @param {int} [maxMatches=1] Maximum number of predictions to return - * @param {float} [minimumConfidence=0.2] Minimum confidence required to include a prediction - * @return {Array} - */ - predict(input, maxMatches = 1, minimumConfidence = 0.2) { - if (typeof input !== 'string') { - throw new Error('input must be a string') - } - - if (typeof minimumConfidence !== 'number') { - throw new Error('minimumConfidence must be a number') - } - - if (minimumConfidence < 0) { - throw new Error('minimumConfidence can not be lower than 0') - } - - if (minimumConfidence > 1) { - throw new Error('minimumConfidence can not be higher than 1') - } - - let tokens = this.tokenize(input) - - if (this.vocabulary !== false) { - tokens = this.vectorize(tokens) - } - - let predictions = [] - - Object.keys(this._model.data).forEach(label => { - let entry = this._model.data[label] - - let confidence = this.cosineSimilarity(tokens, entry) - - if (confidence >= minimumConfidence) { - predictions.push(new Prediction({ - label, - confidence - })) - } - }) - - /* istanbul ignore next */ - predictions.sort((a, b) => { - if (a.confidence === b.confidence) { - return 0 - } - - return a.confidence > b.confidence ? -1 : 1 - }) - - return predictions.slice(0, Math.min(predictions.length, maxMatches)) - } - - /** - * Split a string into an array of lowercase words, with all non-letter characters removed - * - * @param {string} input - * @return {Array} - */ - splitWords(input) { - if (typeof input !== 'string') { - throw new Error('input must be a string') - } - - // Remove all apostrophes and dashes to keep words intact - input = input.replace(/'|´|’|-/g, '') - - // Lowercase all letters and replace all non-letter characters with a space - input = XRegExp.replace(input.toLocaleLowerCase(), XRegExp('\\P{L}+', 'g'), ' ').trim() - - return input.split(' ') - } - - /** - * Create an object literal of unique tokens (n-grams) as keys, and their - * respective occurrences as values based on an input string, or array of words - * - * @param {(string|string[])} input - * @return {Object} - */ - tokenize(input) { - let words = typeof input === 'string' ? this.splitWords(input) : input - - if (!(words instanceof Array)) { - throw new Error('input must be either a string or Array') - } - - if (this._model.nGramMax < this._model.nGramMin) { - throw new Error('Invalid nGramMin/nGramMax combination in model config') - } - - let tokens = {} - - // Generate a list of n-grams along with their respective occurrences - // based on the models configured min/max values - words.forEach((word, index) => { - let sequence = '' - - words.slice(index).forEach(nextWord => { - sequence += sequence ? 
(' ' + nextWord) : nextWord - let tokenCount = sequence.split(' ').length - - if (tokenCount < this._model.nGramMin || tokenCount > this._model.nGramMax) { - return - } - - if (typeof tokens[sequence] === 'undefined') { - tokens[sequence] = 0 - } - - ++tokens[sequence] - }) - }) - - return tokens - } - - /** - * Convert a tokenized object into a new object with all keys (terms) - * translated to their index in the vocabulary (adding all terms to - * the vocabulary that do not already exist) - * - * @param {Object} tokens - * @return {Object} - */ - vectorize(tokens) { - if (!(tokens instanceof Object) || tokens.constructor !== Object) { - throw new Error('tokens must be an object literal') - } - - /* istanbul ignore next */ - if (this._model.vocabulary === false) { - throw new Error('Cannot vectorize tokens when vocabulary is false') - } - - let vector = {} - - Object.keys(tokens).forEach(token => { - let vocabularyIndex = this._model.vocabulary.indexOf(token) - - if (vocabularyIndex === -1) { - this._model.vocabulary.add(token) - - vocabularyIndex = this._model.vocabulary.size - 1 - } - - vector[vocabularyIndex] = tokens[token] - }) - - return vector - } - - /** - * Return the cosine similarity between two vectors - * - * @param {Object} v1 - * @param {Object} v2 - * @return {float} - */ - cosineSimilarity(v1, v2) { - if (!(v1 instanceof Object) || v1.constructor !== Object) { - throw new Error('v1 must be an object literal') - } - if (!(v2 instanceof Object) || v2.constructor !== Object) { - throw new Error('v2 must be an object literal') - } - - let prod = 0.0 - let v1Norm = 0.0 - - Object.keys(v1).forEach(i => { - let xi = v1[i] - - if (typeof v2[i] !== 'undefined') { - prod += xi * v2[i] - } - - v1Norm += xi * xi - }) - - v1Norm = Math.sqrt(v1Norm) - - if (v1Norm === 0) { - return 0 - } - - let v2Norm = 0.0 - - Object.keys(v2).forEach(i => { - let xi = v2[i] - - v2Norm += xi * xi - }) - - v2Norm = Math.sqrt(v2Norm) - - if (v2Norm === 0) { - return 0 - } - - return prod / (v1Norm * v2Norm) - } + constructor(model = {}) { + if (!(model instanceof Model)) { + model = new Model(model) + } + + this._model = model + } + + /** + * Model instance + * + * @type {Model} + */ + get model() { + return this._model + } + + set model(model) { + if (!(model instanceof Model)) { + model = new Model(model) + } + + this._model = model + } + + /** + * Train the current model using an input string (or array of strings) and a corresponding label + * + * @param {(string|string[])} input - String, or an array of strings + * @param {string} label - Corresponding label + * @return {this} + */ + train(input, label) { + if (typeof input !== 'string' && !(input instanceof Array)) { + throw new Error('input must be either a string or Array') + } + + if (typeof label !== 'string') { + throw new Error('label must be a string') + } + + // If input isn't an array, convert to a single item array + if (!(input instanceof Array)) { + input = [input] + } + + input.forEach((string) => { + // Convert the string to a tokenized object + let tokens = this.tokenize(string) + + if (this._model.vocabulary !== false) { + // If we're using a vocabulary, convert the tokens to a vector where all + // indexes reference vocabulary terms + const { vector, vocabulary } = this.vectorize(tokens) + + // Overwrite the tokens object with our new vectorized object + tokens = vector + + // Update the model vocabulary + this._model.vocabulary = vocabulary + } + + // Set up an empty entry for the label if it does not exist + if ( + 
!Object.prototype.hasOwnProperty.call(this._model.data, label) + ) { + this._model.data[label] = {} + } + + // Add all occurrences to our model entry + Object.keys(tokens).forEach((index) => { + let occurrences = tokens[index] + + if ( + !Object.prototype.hasOwnProperty.call( + this._model.data[label], + index + ) + ) { + this._model.data[label][index] = 0 + } + + this._model.data[label][index] += occurrences + }) + }) + + return this + } + + /** + * Return an array of one or more Prediction instances + * + * @param {string} input - Input string to make a prediction from + * @param {int} [maxMatches=1] Maximum number of predictions to return + * @param {float} [minimumConfidence=0.2] Minimum confidence required to include a prediction + * @return {Array} + */ + predict(input, maxMatches = 1, minimumConfidence = 0.2) { + if (typeof input !== 'string') { + throw new Error('input must be a string') + } + + if (!['number', 'undefined'].includes(typeof maxMatches)) { + throw new Error('maxMatches must be either a number or undefined') + } + + if (!['number', 'undefined'].includes(typeof minimumConfidence)) { + throw new Error( + 'minimumConfidence must be either a number or undefined' + ) + } + + if (minimumConfidence < 0) { + throw new Error('minimumConfidence can not be lower than 0') + } + + if (minimumConfidence > 1) { + throw new Error('minimumConfidence can not be higher than 1') + } + + // Convert the string to a tokenized object + let tokens = this.tokenize(input) + + if (this.vocabulary !== false) { + // If we're using a vocabulary, convert the tokens to a vector where all + // indexes reference vocabulary terms + const { vector } = this.vectorize(tokens) + + // Overwrite the tokens object with our new vectorized object + tokens = vector + } + + const predictions = [] + + Object.keys(this._model.data).forEach((label) => { + let entry = this._model.data[label] + + let confidence = this.cosineSimilarity(tokens, entry) + + if (confidence >= minimumConfidence) { + predictions.push( + new Prediction({ + label, + confidence + }) + ) + } + }) + + /* istanbul ignore next */ + predictions.sort((a, b) => { + if (a.confidence === b.confidence) { + return 0 + } + + return a.confidence > b.confidence ? -1 : 1 + }) + + return predictions.slice(0, Math.min(predictions.length, maxMatches)) + } + + /** + * Split a string into an array of lowercase words, with all non-letter characters removed + * + * @param {string} input + * @return {Array} + */ + splitWords(input) { + if (typeof input !== 'string') { + throw new Error('input must be a string') + } + + // Remove all apostrophes and dashes to keep words intact + input = input.replace(/'|´|’|-/g, '') + + // Lowercase all letters and replace all non-letter characters with a space + input = XRegExp.replace( + input.toLocaleLowerCase(), + XRegExp('\\P{L}+', 'g'), + ' ' + ).trim() + + return input.split(' ') + } + + /** + * Create an object literal of unique tokens (n-grams) as keys, and their + * respective occurrences as values based on an input string, or array of words + * + * @param {(string|string[])} input + * @return {Object} + */ + tokenize(input) { + let words = typeof input === 'string' ? 
this.splitWords(input) : input + + if (!(words instanceof Array)) { + throw new Error('input must be either a string or Array') + } + + if (this._model.nGramMax < this._model.nGramMin) { + throw new Error( + 'Invalid nGramMin/nGramMax combination in model config' + ) + } + + let tokens = {} + + // Generate a list of n-grams along with their respective occurrences + // based on the models configured min/max values + words.forEach((word, index) => { + let sequence = '' + + words.slice(index).forEach((nextWord) => { + sequence += sequence ? ' ' + nextWord : nextWord + let tokenCount = sequence.split(' ').length + + if ( + tokenCount < this._model.nGramMin || + tokenCount > this._model.nGramMax + ) { + return + } + + if (!Object.prototype.hasOwnProperty.call(tokens, sequence)) { + tokens[sequence] = 0 + } + + ++tokens[sequence] + }) + }) + + return tokens + } + + /** + * Convert a tokenized object into a new object with all keys (terms) + * translated to their index in the returned vocabulary (which is also + * returned along with the object, with any new terms added to the end) + * + * @param {Object} tokens + * @return {Object} + */ + vectorize(tokens) { + if (Object.getPrototypeOf(tokens) !== Object.prototype) { + throw new Error('tokens must be an object literal') + } + + /* istanbul ignore next */ + if (this._model.vocabulary === false) { + throw new Error('Cannot vectorize tokens when vocabulary is false') + } + + const vector = {} + const vocabulary = new Vocabulary(this._model.vocabulary.terms) + + Object.keys(tokens).forEach((token) => { + let vocabularyIndex = vocabulary.indexOf(token) + + if (vocabularyIndex === -1) { + vocabulary.add(token) + + vocabularyIndex = vocabulary.size - 1 + } + + vector[vocabularyIndex] = tokens[token] + }) + + return { + vector, + vocabulary + } + } + + /** + * Return the cosine similarity between two vectors + * + * @param {Object} v1 + * @param {Object} v2 + * @return {float} + */ + cosineSimilarity(v1, v2) { + if (Object.getPrototypeOf(v1) !== Object.prototype) { + throw new Error('v1 must be an object literal') + } + if (Object.getPrototypeOf(v2) !== Object.prototype) { + throw new Error('v2 must be an object literal') + } + + let prod = 0.0 + let v1Norm = 0.0 + + Object.keys(v1).forEach((i) => { + let xi = v1[i] + + if (Object.prototype.hasOwnProperty.call(v2, i)) { + prod += xi * v2[i] + } + + v1Norm += xi * xi + }) + + v1Norm = Math.sqrt(v1Norm) + + if (v1Norm === 0) { + return 0 + } + + let v2Norm = 0.0 + + Object.keys(v2).forEach((i) => { + let xi = v2[i] + + v2Norm += xi * xi + }) + + v2Norm = Math.sqrt(v2Norm) + + if (v2Norm === 0) { + return 0 + } + + return prod / (v1Norm * v2Norm) + } } export default Classifier diff --git a/src/index.js b/src/index.js index 8fdb968..4332d7a 100644 --- a/src/index.js +++ b/src/index.js @@ -1,7 +1,7 @@ -import Classifier from './classifier' +import Classifier from './Classifier' -export { default as Model } from './model' -export { default as Cocabulary } from './vocabulary' +export { default as Model } from './Model' +export { default as Vocabulary } from './Vocabulary' export { Classifier as Classifier } export default Classifier diff --git a/src/model.js b/src/model.js index d9badbc..a27dbb6 100644 --- a/src/model.js +++ b/src/model.js @@ -1,4 +1,4 @@ -import Vocabulary from './vocabulary' +import Vocabulary from './Vocabulary' /** * @param {Object} [config] @@ -9,136 +9,139 @@ import Vocabulary from './vocabulary' * @constructor */ class Model { - constructor(config = {}) { - if (!(config instanceof 
Object) || config.constructor !== Object) { - throw new Error('config must be an object literal') - } - - config = { - nGramMin: 1, - nGramMax: 1, - vocabulary: [], - data: {}, - ...config - } - - if (config.nGramMin !== parseInt(config.nGramMin, 10)) { - throw new Error('Config value nGramMin must be an integer') - } - - if (config.nGramMax !== parseInt(config.nGramMax, 10)) { - throw new Error('Config value nGramMax must be an integer') - } - - if (config.nGramMin < 1) { - throw new Error('Config value nGramMin must be at least 1') - } - - if (config.nGramMax < 1) { - throw new Error('Config value nGramMax must be at least 1') - } - - if (config.nGramMax < config.nGramMin) { - throw new Error('Invalid nGramMin/nGramMax combination in config') - } - - if (config.vocabulary !== false && !(config.vocabulary instanceof Vocabulary)) { - config.vocabulary = new Vocabulary(config.vocabulary) - } - - if (!(config.data instanceof Object) || config.data.constructor !== Object) { - throw new Error('Config value data must be an object literal') - } - - this._nGramMin = config.nGramMin - this._nGramMax = config.nGramMax - this._vocabulary = config.vocabulary - this._data = {...config.data} - } - - /** - * Minimum n-gram size - * - * @type {int} - */ - get nGramMin() { - return this._nGramMin - } - - set nGramMin(size) { - if (size !== parseInt(size, 10)) { - throw new Error('nGramMin must be an integer') - } - - this._nGramMin = size - } - - /** - * Maximum n-gram size - * - * @type {int} - */ - get nGramMax() { - return this._nGramMax - } - - set nGramMax(size) { - if (size !== parseInt(size, 10)) { - throw new Error('nGramMax must be an integer') - } - - this._nGramMax = size - } - - /** - * Vocabulary instance - * - * @type {(Vocabulary|false)} - */ - get vocabulary() { - return this._vocabulary - } - - set vocabulary(vocabulary) { - if (vocabulary !== false && !(vocabulary instanceof Vocabulary)) { - vocabulary = new Vocabulary(vocabulary) - } - - this._vocabulary = vocabulary - } - - /** - * Model data - * - * @type {Object} - */ - get data() { - return this._data - } - - set data(data) { - if (!(data instanceof Object) || data.constructor !== Object) { - throw new Error('data must be an object literal') - } - - this._data = {...data} - } - - /** - * Return the model in its current state an an object literal, including the - * configured n-gram min/max values, the vocabulary as an array (if any, - * otherwise false), and an object literal with all the training data - * - * @return {Object} - */ - serialize() { - return { - nGramMin: this._nGramMin, - nGramMax: this._nGramMax, - vocabulary: Array.from(this._vocabulary.terms), - data: this._data - } - } + constructor(config = {}) { + if (Object.getPrototypeOf(config) !== Object.prototype) { + throw new Error('config must be an object literal') + } + + config = { + nGramMin: 1, + nGramMax: 1, + vocabulary: [], + data: {}, + ...config + } + + if (config.nGramMin !== parseInt(config.nGramMin, 10)) { + throw new Error('Config value nGramMin must be an integer') + } + + if (config.nGramMax !== parseInt(config.nGramMax, 10)) { + throw new Error('Config value nGramMax must be an integer') + } + + if (config.nGramMin < 1) { + throw new Error('Config value nGramMin must be at least 1') + } + + if (config.nGramMax < 1) { + throw new Error('Config value nGramMax must be at least 1') + } + + if (config.nGramMax < config.nGramMin) { + throw new Error('Invalid nGramMin/nGramMax combination in config') + } + + if ( + config.vocabulary !== false && + 
!(config.vocabulary instanceof Vocabulary) + ) { + config.vocabulary = new Vocabulary(config.vocabulary) + } + + if (Object.getPrototypeOf(config.data) !== Object.prototype) { + throw new Error('Config value data must be an object literal') + } + + this._nGramMin = config.nGramMin + this._nGramMax = config.nGramMax + this._vocabulary = config.vocabulary + this._data = { ...config.data } + } + + /** + * Minimum n-gram size + * + * @type {int} + */ + get nGramMin() { + return this._nGramMin + } + + set nGramMin(size) { + if (size !== parseInt(size, 10)) { + throw new Error('nGramMin must be an integer') + } + + this._nGramMin = size + } + + /** + * Maximum n-gram size + * + * @type {int} + */ + get nGramMax() { + return this._nGramMax + } + + set nGramMax(size) { + if (size !== parseInt(size, 10)) { + throw new Error('nGramMax must be an integer') + } + + this._nGramMax = size + } + + /** + * Vocabulary instance + * + * @type {(Vocabulary|false)} + */ + get vocabulary() { + return this._vocabulary + } + + set vocabulary(vocabulary) { + if (vocabulary !== false && !(vocabulary instanceof Vocabulary)) { + vocabulary = new Vocabulary(vocabulary) + } + + this._vocabulary = vocabulary + } + + /** + * Model data + * + * @type {Object} + */ + get data() { + return this._data + } + + set data(data) { + if (!(data instanceof Object) || data.constructor !== Object) { + throw new Error('data must be an object literal') + } + + this._data = { ...data } + } + + /** + * Return the model in its current state an an object literal, including the + * configured n-gram min/max values, the vocabulary as an array (if any, + * otherwise false), and an object literal with all the training data + * + * @return {Object} + */ + serialize() { + return { + nGramMin: this._nGramMin, + nGramMax: this._nGramMax, + vocabulary: Array.from(this._vocabulary.terms), + data: this._data + } + } } export default Model diff --git a/src/prediction.js b/src/prediction.js index f3045dc..6bd4db5 100644 --- a/src/prediction.js +++ b/src/prediction.js @@ -4,54 +4,54 @@ * @hideconstructor */ class Prediction { - constructor(prediction = {}) { - if (!(prediction instanceof Object) || prediction.constructor !== Object) { - throw new Error('prediction must be an object literal') - } - - prediction = { - label: '', - confidence: 0, - ...prediction - } - - this._label = prediction.label - this._confidence = prediction.confidence - } - - /** - * Label of the prediction - * - * @type {string} - */ - get label() { - return this._label - } - - set label(label) { - if (typeof label !== 'string') { - throw new Error('label must be a string') - } - - this._label = label - } - - /** - * Confidence of the prediction - * - * @type {number} - */ - get confidence() { - return this._confidence - } - - set confidence(confidence) { - if (typeof confidence !== 'number') { - throw new Error('confidence must be a number') - } - - this._confidence = confidence - } + constructor(prediction = {}) { + if (Object.getPrototypeOf(prediction) !== Object.prototype) { + throw new Error('prediction must be an object literal') + } + + prediction = { + label: '', + confidence: 0, + ...prediction + } + + this._label = prediction.label + this._confidence = prediction.confidence + } + + /** + * Label of the prediction + * + * @type {string} + */ + get label() { + return this._label + } + + set label(label) { + if (typeof label !== 'string') { + throw new Error('label must be a string') + } + + this._label = label + } + + /** + * Confidence of the prediction + * + * @type 
{number} + */ + get confidence() { + return this._confidence + } + + set confidence(confidence) { + if (typeof confidence !== 'number') { + throw new Error('confidence must be a number') + } + + this._confidence = confidence + } } export default Prediction diff --git a/src/vocabulary.js b/src/vocabulary.js index 10bfdfe..d322f7f 100644 --- a/src/vocabulary.js +++ b/src/vocabulary.js @@ -3,111 +3,119 @@ * @constructor */ class Vocabulary { - constructor(terms = []) { - if (!(terms instanceof Array) && !(terms instanceof Set)) { - throw new Error('terms must be either an Array or a Set') - } - - this._terms = new Set(terms) - } - - /** - * Vocabulary size - * - * @type {number} - */ - get size() { - return this._terms.size - } - - /** - * Vocabulary terms - * - * @type {(Array|Set)} - */ - get terms() { - return this._terms - } - - set terms(terms) { - if (!(terms instanceof Array) && !(terms instanceof Set)) { - throw new Error('terms must be either an Array or a Set') - } - - this._terms = new Set(terms) - } - - /** - * Add one or more terms to the vocabulary - * - * @param {(string|Array|Set)} terms - * @return {this} - */ - add(terms) { - if (typeof terms !== 'string' && !(terms instanceof Array) && !(terms instanceof Set)) { - throw new Error('terms must be either a string, Array or Set') - } - - if (typeof terms === 'string') { - terms = [terms] - } else if (terms instanceof Set) { - terms = Array.from(terms) - } - - terms.forEach(term => { - this._terms.add(term) - }) - - return this - } - - /** - * Remove one or more terms from the vocabulary - * - * @param {(string|Array|Set)} terms - * @return {this} - */ - remove(terms) { - if (typeof terms !== 'string' && !(terms instanceof Array) && !(terms instanceof Set)) { - throw new Error('terms must be either a string, Array or Set') - } - - if (typeof terms === 'string') { - terms = [terms] - } else if (terms instanceof Set) { - terms = Array.from(terms) - } - - terms.forEach(term => { - this._terms.delete(term) - }) - - return this - } - - /** - * Return whether the vocabulary contains a certain term - * - * @param {string} term - * @return {bool} - */ - has(term) { - return this._terms.has(term) - } - - /** - * Return the index of a term in the vocabulary (returns -1 if not found) - * - * @param {string} term - * @return {number} - */ - indexOf(term) { - if (!this._terms.has(term)) { - return -1 - } - - return Array.from(this._terms).indexOf(term) - } + constructor(terms = []) { + if (!(terms instanceof Array) && !(terms instanceof Set)) { + throw new Error('terms must be either an Array or a Set') + } + + this._terms = new Set(terms) + } + + /** + * Vocabulary size + * + * @type {number} + */ + get size() { + return this._terms.size + } + + /** + * Vocabulary terms + * + * @type {(Array|Set)} + */ + get terms() { + return this._terms + } + + set terms(terms) { + if (!(terms instanceof Array) && !(terms instanceof Set)) { + throw new Error('terms must be either an Array or a Set') + } + + this._terms = new Set(terms) + } + + /** + * Add one or more terms to the vocabulary + * + * @param {(string|Array|Set)} terms + * @return {this} + */ + add(terms) { + if ( + typeof terms !== 'string' && + !(terms instanceof Array) && + !(terms instanceof Set) + ) { + throw new Error('terms must be either a string, Array or Set') + } + + if (typeof terms === 'string') { + terms = [terms] + } else if (terms instanceof Set) { + terms = Array.from(terms) + } + + terms.forEach((term) => { + this._terms.add(term) + }) + + return this + } + + /** + * Remove 
one or more terms from the vocabulary + * + * @param {(string|Array|Set)} terms + * @return {this} + */ + remove(terms) { + if ( + typeof terms !== 'string' && + !(terms instanceof Array) && + !(terms instanceof Set) + ) { + throw new Error('terms must be either a string, Array or Set') + } + + if (typeof terms === 'string') { + terms = [terms] + } else if (terms instanceof Set) { + terms = Array.from(terms) + } + + terms.forEach((term) => { + this._terms.delete(term) + }) + + return this + } + + /** + * Return whether the vocabulary contains a certain term + * + * @param {string} term + * @return {bool} + */ + has(term) { + return this._terms.has(term) + } + + /** + * Return the index of a term in the vocabulary (returns -1 if not found) + * + * @param {string} term + * @return {number} + */ + indexOf(term) { + if (!this._terms.has(term)) { + return -1 + } + + return Array.from(this._terms).indexOf(term) + } } export default Vocabulary diff --git a/test/Classifier.test.js b/test/Classifier.test.js new file mode 100644 index 0000000..bb274b7 --- /dev/null +++ b/test/Classifier.test.js @@ -0,0 +1,427 @@ +import Classifier from '../src/Classifier' +import Model from '../src/Model' + +describe('Classifier', () => { + describe('constructor', () => { + test('should set the model when passed a model instance', () => { + const classifier = new Classifier( + new Model({ + nGramMax: 4 + }) + ) + + expect(classifier.model.nGramMax).toStrictEqual(4) + }) + + test('should set the model when passed an object literal', () => { + const classifier = new Classifier({ + nGramMax: 5 + }) + + expect(classifier.model.nGramMax).toStrictEqual(5) + }) + }) + + describe('model', () => { + test('should return a model instance', () => { + let classifier = new Classifier() + + expect(classifier.model).toBeInstanceOf(Model) + }) + + test('should set the current model when passed a model instance', () => { + let classifier = new Classifier() + + classifier.model = new Model({ + nGramMax: 3 + }) + + expect(classifier.model.nGramMax).toStrictEqual(3) + }) + + test('should set the current model to a new model instance when passed an object literal', () => { + let classifier = new Classifier() + + classifier.model = {} + + expect(classifier.model).toBeInstanceOf(Model) + }) + }) + + describe('splitWords', () => { + test('should throw an error if input is not a string', () => { + const classifier = new Classifier() + + expect(() => classifier.splitWords(1)).toThrow(Error) + }) + + test('should split a string into an array of words', () => { + const classifier = new Classifier() + + expect(classifier.splitWords('Hello world!')).toStrictEqual([ + 'hello', + 'world' + ]) + }) + }) + + describe('tokenize', () => { + test('should throw an error if input is neither a string or array', () => { + const classifier = new Classifier() + + expect(() => classifier.tokenize({})).toThrow(Error) + }) + + test('should throw an error if nGramMax is less than nGramMin in model config', () => { + const classifier = new Classifier() + + classifier.model.nGramMin = 2 + + expect(() => classifier.tokenize('Hello world!')).toThrow(Error) + }) + + test('should return an object literal of tokens and their occurrences from a string', () => { + const classifier = new Classifier() + + expect(classifier.tokenize('Hello world!')).toStrictEqual({ + hello: 1, + world: 1 + }) + }) + + test('should return an object literal of tokens and their occurrences from a string', () => { + const classifier = new Classifier() + + expect(classifier.tokenize('Hello 
world!')).toStrictEqual({ + hello: 1, + world: 1 + }) + }) + + test('should return an object literal of tokens and their occurrences from a array', () => { + const classifier = new Classifier() + + expect(classifier.tokenize(['hello', 'world'])).toStrictEqual({ + hello: 1, + world: 1 + }) + }) + + test('should return an object literal of bigrams when nGramMin/nGramMax is 2', () => { + const classifier = new Classifier({ + nGramMin: 2, + nGramMax: 2 + }) + + expect(classifier.tokenize('Hello world!')).toStrictEqual({ + 'hello world': 1 + }) + }) + + test('should return an object literal of unigrams and bigrams when nGramMin/nGramMax is 1/2', () => { + const classifier = new Classifier({ + nGramMin: 1, + nGramMax: 2 + }) + + expect(classifier.tokenize('Hello world!')).toStrictEqual({ + hello: 1, + 'hello world': 1, + world: 1 + }) + }) + + test('should increment the occurrence of the duplicate tokens', () => { + const classifier = new Classifier() + + expect(classifier.tokenize('Hello hello!')).toStrictEqual({ + hello: 2 + }) + }) + }) + + describe('vectorize', () => { + test('should throw an error if input is not an object literal', () => { + const classifier = new Classifier() + + expect(() => classifier.vectorize([])).toThrow(Error) + }) + + test('should throw an error if vocabulary config option is set to false', () => { + const classifier = new Classifier({ + vocabulary: false + }) + + expect(() => classifier.vectorize({ hello: 1 })).toThrow(Error) + }) + + test('should convert key to its corresponding vocabulary term index', () => { + const classifier = new Classifier() + const tokens = classifier.tokenize('Hello') + + const { vector } = classifier.vectorize(tokens) + + expect(vector).toStrictEqual({ 0: 1 }) + }) + + test('should use existing term index when token is already in vocabulary', () => { + const classifier = new Classifier({ + vocabulary: ['hello', 'world'] + }) + + const tokens = classifier.tokenize('world') + + const { vector } = classifier.vectorize(tokens) + + expect(vector).toStrictEqual({ 1: 1 }) + }) + + test('should return an updated copy of the vocabulary', () => { + const classifier = new Classifier() + + const tokens = classifier.tokenize('Hello world') + + const { vocabulary } = classifier.vectorize(tokens) + + const terms = vocabulary.terms + + expect(Array.from(terms)).toStrictEqual(['hello', 'world']) + }) + }) + + describe('train', () => { + test('should throw an error if input is not a string or array', () => { + const classifier = new Classifier() + + expect(() => classifier.train({}, 'test')).toThrow(Error) + }) + + test('should throw an error if label is not a string', () => { + const classifier = new Classifier() + + expect(() => classifier.train('test', [])).toThrow(Error) + }) + + test('should add tokens to the vocabulary (if not configured to false)', () => { + const classifier = new Classifier() + + classifier.train('hello world', 'test') + + const vocabulary = classifier.model.vocabulary + + expect(vocabulary.size).toStrictEqual(2) + }) + + test('should add tokens (and their occurrences) to the model from a string', () => { + const classifier = new Classifier() + + classifier.train('hello world', 'test') + + const model = classifier.model + + expect(model.data).toStrictEqual({ + test: { 0: 1, 1: 1 } + }) + }) + + test('should add tokens (and their occurrences) to the model from an array of strings', () => { + const classifier = new Classifier() + + classifier.train(['hello world', 'foo', 'bar'], 'test') + + const model = classifier.model + + 
expect(model.data).toStrictEqual({ + test: { 0: 1, 1: 1, 2: 1, 3: 1 } + }) + }) + + test('should increment the occurrence of an existing vocabulary term', () => { + const classifier = new Classifier() + + classifier.train(['hello world', 'foo', 'hello'], 'test') + + const model = classifier.model + + expect(model.data).toStrictEqual({ + test: { 0: 2, 1: 1, 2: 1 } + }) + }) + + test('should return classifier instance', () => { + const classifier = new Classifier() + + expect(classifier.train('hello world', 'test')).toStrictEqual( + classifier + ) + }) + }) + + describe('cosineSimilarity', () => { + test('should throw an error if v1 is not an object literal', () => { + const classifier = new Classifier() + + expect(() => classifier.cosineSimilarity(false, {})).toThrow(Error) + }) + + test('should throw an error if v2 is not an object literal', () => { + const classifier = new Classifier() + + expect(() => classifier.cosineSimilarity({}, false)).toThrow(Error) + }) + + test('should return 1 on identical object literals', () => { + const classifier = new Classifier() + + expect( + classifier.cosineSimilarity( + { + 0: 1 + }, + { + 0: 1 + } + ) + ).toStrictEqual(1) + }) + + test('should return 0 on object literals with no similarity', () => { + const classifier = new Classifier() + + expect( + classifier.cosineSimilarity( + { + 0: 1 + }, + { + 1: 1 + } + ) + ).toStrictEqual(0) + }) + + test('should return > 0 on similar object literals', () => { + const classifier = new Classifier() + + expect( + classifier.cosineSimilarity( + { + 0: 1, + 1: 1 + }, + { + 0: 1, + 2: 1 + } + ) + ).toBeGreaterThan(0) + }) + + test('should return 0 when sum of v1 is 0', () => { + const classifier = new Classifier() + + expect( + classifier.cosineSimilarity( + { + 0: 0 + }, + { + 0: 1 + } + ) + ).toStrictEqual(0) + }) + + test('should return 0 when sum of v2 is 0', () => { + const classifier = new Classifier() + + expect( + classifier.cosineSimilarity( + { + 0: 1 + }, + { + 0: 0 + } + ) + ).toStrictEqual(0) + }) + }) + + describe('predict', () => { + test('should throw an error if input is not a string', () => { + const classifier = new Classifier() + + expect(() => classifier.predict([])).toThrow(Error) + }) + + test('should throw an error if maxMatches is not a number', () => { + const classifier = new Classifier() + + expect(() => classifier.predict('', 'test')).toThrow(Error) + }) + + test('should throw an error if minimumConfidence is not a number', () => { + const classifier = new Classifier() + + expect(() => classifier.predict('', undefined, 'test')).toThrow( + Error + ) + }) + + test('should throw an error if minimumConfidence is lower than 0', () => { + const classifier = new Classifier() + + expect(() => classifier.predict('', undefined, -1)).toThrow(Error) + }) + + test('should throw an error if minimumConfidence is higher than 1', () => { + const classifier = new Classifier() + + expect(() => classifier.predict('', undefined, 2)).toThrow(Error) + }) + + test('should return an array', () => { + const classifier = new Classifier() + + expect(classifier.predict('test')).toBeInstanceOf(Array) + }) + + test('should return one prediction when trained with a sample', () => { + const classifier = new Classifier() + + classifier.train('hello world', 'test') + + expect(classifier.predict('hello world').length).toStrictEqual(1) + }) + + test('should not include predictions with a confidence below the configured minimumConfidence', () => { + const classifier = new Classifier() + + classifier.train('hello world', 
'test') + + const minimumConfidence = 0.8 + + const predictions = classifier.predict( + 'hello', + undefined, + minimumConfidence + ) + + expect( + predictions.filter((prediction) => { + return prediction.confidence < minimumConfidence + }).length + ).toStrictEqual(0) + }) + + test('should not update the model vocabulary', () => { + const classifier = new Classifier() + + classifier.train('hello world', 'test') + classifier.predict('hello foo world') + + expect(classifier.model.vocabulary.has('foo')).toStrictEqual(false) + }) + }) +}) diff --git a/test/Model.test.js b/test/Model.test.js new file mode 100644 index 0000000..33fff16 --- /dev/null +++ b/test/Model.test.js @@ -0,0 +1,206 @@ +import Model from '../src/Model' +import Vocabulary from '../src/Vocabulary' + +describe('Model', () => { + describe('constructor', () => { + test('should throw an error if config is not an object literal', () => { + expect(() => new Model([])).toThrow(Error) + }) + + test('should throw an error if config option nGramMin is not a number', () => { + expect( + () => + new Model({ + nGramMin: '' + }) + ).toThrow(Error) + }) + + test('should throw an error if config option nGramMax is not a number', () => { + expect( + () => + new Model({ + nGramMax: '' + }) + ).toThrow(Error) + }) + + test('should throw an error if config option nGramMin is less than 1', () => { + expect( + () => + new Model({ + nGramMin: 0 + }) + ).toThrow(Error) + }) + + test('should throw an error if config option nGramMax is less than 1', () => { + expect( + () => + new Model({ + nGramMax: 0 + }) + ).toThrow(Error) + }) + + test('should throw an error if config option nGramMax is less than nGramMin', () => { + expect( + () => + new Model({ + nGramMin: 2, + nGramMax: 1 + }) + ).toThrow(Error) + }) + + test('should throw an error if data is not an object literal', () => { + expect( + () => + new Model({ + data: [] + }) + ).toThrow(Error) + }) + }) + + describe('nGramMin', () => { + test('should return a number', () => { + const model = new Model() + + expect(typeof model.nGramMin).toStrictEqual('number') + }) + + test('should return the current nGramMin value', () => { + const model = new Model({ + nGramMin: 3, + nGramMax: 4 + }) + + expect(model.nGramMin).toStrictEqual(3) + }) + + test('should set the nGramMin value', () => { + const model = new Model() + + model.nGramMin = 2 + + expect(model.nGramMin).toStrictEqual(2) + }) + + test('should throw an error if size is not an integer', () => { + const model = new Model() + + expect(() => { + model.nGramMin = 1.1 + }).toThrow(Error) + }) + }) + + describe('nGramMax', () => { + test('should return a number', () => { + const model = new Model() + + expect(typeof model.nGramMax).toStrictEqual('number') + }) + + test('should return the current nGramMax value', () => { + const model = new Model({ + nGramMax: 2 + }) + + expect(model.nGramMax).toStrictEqual(2) + }) + + test('should set the nGramMax value', () => { + const model = new Model() + + model.nGramMax = 3 + + expect(model.nGramMax).toStrictEqual(3) + }) + + test('should throw an error if size is not an integer', () => { + const model = new Model() + + expect(() => { + model.nGramMax = 1.1 + }).toThrow(Error) + }) + }) + + describe('vocabulary', () => { + test('should return a vocabulary instance', () => { + const model = new Model() + + expect(model.vocabulary).toBeInstanceOf(Vocabulary) + }) + + test('should return false when vocabulary is configured to false', () => { + const model = new Model({ + vocabulary: false + }) + + 
expect(model.vocabulary).toStrictEqual(false) + }) + + test('should set the vocabulary value when passing an array', () => { + const model = new Model() + + model.vocabulary = ['hello', 'world'] + + expect(Array.from(model.vocabulary.terms)).toStrictEqual([ + 'hello', + 'world' + ]) + }) + + test('should set the vocabulary value when passing false', () => { + const model = new Model() + + model.vocabulary = false + + expect(model.vocabulary).toStrictEqual(false) + }) + }) + + describe('data', () => { + test('should return an object literal', () => { + const model = new Model() + + expect(model.data).toStrictEqual({}) + }) + + test('should set the model data', () => { + const model = new Model() + + model.data = { + test: { 0: 1 } + } + + expect(model.data).toStrictEqual({ + test: { 0: 1 } + }) + }) + + test('should throw an error if data is not an object literal', () => { + const model = new Model() + + expect(() => { + model.data = [] + }).toThrow(Error) + }) + }) + + describe('serialize', () => { + test('should return an object literal created from the current model', () => { + const model = new Model() + + expect(model.serialize()).toStrictEqual({ + nGramMin: 1, + nGramMax: 1, + vocabulary: [], + data: {} + }) + }) + }) +}) diff --git a/test/Prediction.test.js b/test/Prediction.test.js new file mode 100644 index 0000000..d2b3f84 --- /dev/null +++ b/test/Prediction.test.js @@ -0,0 +1,73 @@ +import Prediction from '../src/Prediction' + +describe('Prediction', () => { + describe('constructor', () => { + test('should throw an error if prediction is not an object literal', () => { + expect(() => new Prediction([])).toThrow(Error) + }) + }) + + describe('label', () => { + test('should throw an error if label is not a string', () => { + const prediction = new Prediction() + + expect(() => { + prediction.label = [] + }).toThrow(Error) + }) + + test('should return a string', () => { + const prediction = new Prediction() + + expect(typeof prediction.label).toStrictEqual('string') + }) + + test('should return the defined prediction label', () => { + const prediction = new Prediction({ + label: 'test' + }) + + expect(prediction.label).toStrictEqual('test') + }) + + test('should set the prediction label', () => { + const prediction = new Prediction() + + prediction.label = 'test' + + expect(prediction.label).toStrictEqual('test') + }) + }) + + describe('confidence', () => { + test('should throw an error if confidence is not a number', () => { + const prediction = new Prediction() + + expect(() => { + prediction.confidence = 'test' + }).toThrow(Error) + }) + + test('should return a number', () => { + const prediction = new Prediction() + + expect(typeof prediction.confidence).toStrictEqual('number') + }) + + test('should return the defined prediction confidence', () => { + const prediction = new Prediction({ + confidence: 0.5 + }) + + expect(prediction.confidence).toBeCloseTo(0.5) + }) + + test('should set the prediction confidence', () => { + const prediction = new Prediction() + + prediction.confidence = 1 + + expect(prediction.confidence).toStrictEqual(1) + }) + }) +}) diff --git a/test/Vocabulary.test.js b/test/Vocabulary.test.js new file mode 100644 index 0000000..95ed3f7 --- /dev/null +++ b/test/Vocabulary.test.js @@ -0,0 +1,177 @@ +import Vocabulary from '../src/Vocabulary' + +describe('Vocabulary', () => { + describe('constructor', () => { + test('should throw an error if terms is not an array or set', () => { + expect(() => new Vocabulary({})).toThrow(Error) + }) + }) + + describe('size', () 
=> { + test('should return a number', () => { + const vocabulary = new Vocabulary() + + expect(typeof vocabulary.size).toStrictEqual('number') + }) + + test('should return the vocabulary size', () => { + const vocabulary = new Vocabulary(['hello']) + + expect(vocabulary.size).toStrictEqual(1) + }) + }) + + describe('terms', () => { + test('should return a set instance', () => { + const vocabulary = new Vocabulary() + + expect(vocabulary.terms).toBeInstanceOf(Set) + }) + + test('should return the vocabulary terms', () => { + const vocabulary = new Vocabulary(['hello']) + + expect(Array.from(vocabulary.terms)).toStrictEqual(['hello']) + }) + + test('should set the vocabulary terms from an array', () => { + const vocabulary = new Vocabulary() + + vocabulary.terms = ['hello', 'world'] + + expect(Array.from(vocabulary.terms)).toStrictEqual([ + 'hello', + 'world' + ]) + }) + + test('should set the vocabulary terms from a set', () => { + const vocabulary = new Vocabulary() + + vocabulary.terms = new Set(['hello', 'world']) + + expect(Array.from(vocabulary.terms)).toStrictEqual([ + 'hello', + 'world' + ]) + }) + + test('should throw an error if terms is not an array or set', () => { + const vocabulary = new Vocabulary() + + expect(() => { + vocabulary.terms = {} + }).toThrow(Error) + }) + }) + + describe('add', () => { + test('should throw an error if terms is not a string, array or set', () => { + const vocabulary = new Vocabulary() + + expect(() => vocabulary.add({})).toThrow(Error) + }) + + test('should add a term to the vocabulary from a string', () => { + const vocabulary = new Vocabulary() + + vocabulary.add('test') + + expect(Array.from(vocabulary.terms)).toStrictEqual(['test']) + }) + + test('should add terms to the vocabulary from an array', () => { + const vocabulary = new Vocabulary() + + vocabulary.add(['hello', 'world']) + + expect(Array.from(vocabulary.terms)).toStrictEqual([ + 'hello', + 'world' + ]) + }) + + test('should add terms to the vocabulary from a set', () => { + const vocabulary = new Vocabulary() + + vocabulary.add(new Set(['hello', 'world'])) + + expect(Array.from(vocabulary.terms)).toStrictEqual([ + 'hello', + 'world' + ]) + }) + + test('should return vocabulary instance', () => { + const vocabulary = new Vocabulary() + + expect(vocabulary.add('test')).toBeInstanceOf(Vocabulary) + }) + }) + + describe('remove', () => { + test('should throw an error if terms is not a string, array or set', () => { + const vocabulary = new Vocabulary() + + expect(() => vocabulary.remove({})).toThrow(Error) + }) + + test('should remove a term to the vocabulary when called with a string', () => { + const vocabulary = new Vocabulary(['test']) + + vocabulary.remove('test') + + expect(Array.from(vocabulary.terms)).toStrictEqual([]) + }) + + test('should remove terms from the vocabulary when called with an array', () => { + const vocabulary = new Vocabulary(['hello', 'world']) + + vocabulary.remove(['world']) + + expect(Array.from(vocabulary.terms)).toStrictEqual(['hello']) + }) + + test('should remove terms from the vocabulary when called with a set', () => { + const vocabulary = new Vocabulary(['hello', 'world']) + + vocabulary.remove(new Set(['world'])) + + expect(Array.from(vocabulary.terms)).toStrictEqual(['hello']) + }) + + test('should return a vocabulary instance', () => { + const vocabulary = new Vocabulary(['test']) + + expect(vocabulary.remove('test')).toBeInstanceOf(Vocabulary) + }) + }) + + describe('has', () => { + test('should return a boolean', () => { + const vocabulary = new 
Vocabulary() + + expect(typeof vocabulary.has('test')).toStrictEqual('boolean') + }) + + test('should return whether a term exists in the vocabulary', () => { + const vocabulary = new Vocabulary(['test']) + + expect(vocabulary.has('test')).toStrictEqual(true) + }) + }) + + describe('indexOf', () => { + test('should return the index of an existing vocabulary term', () => { + const vocabulary = new Vocabulary(['test']) + + expect(vocabulary.indexOf('test')).toStrictEqual(0) + }) + + test('should return -1 for non-existing vocabulary terms', () => { + const vocabulary = new Vocabulary() + + expect(vocabulary.indexOf('test')).toStrictEqual(-1) + }) + }) +}) diff --git a/test/classifier.js b/test/classifier.js deleted file mode 100644 index a8b3791..0000000 --- a/test/classifier.js +++ /dev/null @@ -1,363 +0,0 @@ -import { assert, expect } from 'chai' -import Classifier from '../src/classifier' -import Model from '../src/model' - -describe('Classifier', () => { - describe('constructor', () => { - it('should set the model when passed a model instance', () => { - const classifier = new Classifier(new Model({ - nGramMax: 4 - })) - - expect(classifier.model.nGramMax).to.equal(4) - }) - - it('should set the model when passed an object literal', () => { - const classifier = new Classifier({ - nGramMax: 5 - }) - - expect(classifier.model.nGramMax).to.equal(5) - }) - }) - - describe('model', () => { - it('should return a model instance', () => { - let classifier = new Classifier() - - assert.instanceOf(classifier.model, Model) - }) - - it('should set the current model when passed a model instance', () => { - let classifier = new Classifier() - - classifier.model = new Model({ - nGramMax: 3 - }) - - expect(classifier.model.nGramMax).to.equal(3) - }) - - it('should set the current model to a new model instance when passed an object literal', () => { - let classifier = new Classifier() - - classifier.model = {} - - assert.instanceOf(classifier.model, Model) - }) - }) - - describe('splitWords', () => { - it('should throw an error if input is not a string', () => { - const classifier = new Classifier() - - expect(() => classifier.splitWords(1)).to.throw(Error) - }) - - it('should split a string into an array of words', () => { - const classifier = new Classifier() - - expect(classifier.splitWords('Hello world!')).to.eql( - ['hello', 'world'] - ) - }) - }) - - describe('tokenize', () => { - it('should throw an error if input is neither a string or array', () => { - const classifier = new Classifier() - - expect(() => classifier.tokenize({})).to.throw(Error) - }) - - it('should throw an error if nGramMax is less than nGramMin in model config', () => { - const classifier = new Classifier() - - classifier.model.nGramMin = 2 - - expect(() => classifier.tokenize('Hello world!')).to.throw(Error) - }) - - it('should return an object literal of tokens and their occurrences from a string', () => { - const classifier = new Classifier() - - expect(classifier.tokenize('Hello world!')).to.eql({ - hello: 1, - world: 1 - }) - }) - - it('should return an object literal of tokens and their occurrences from a string', () => { - const classifier = new Classifier() - - expect(classifier.tokenize('Hello world!')).to.eql({ - hello: 1, - world: 1 - }) - }) - - it('should return an object literal of tokens and their occurrences from a array', () => { - const classifier = new Classifier() - - expect(classifier.tokenize(['hello', 'world'])).to.eql({ - hello: 1, - world: 1 - }) - }) - - it('should return an object literal of bigrams 
when nGramMin/nGramMax is 2', () => { - const classifier = new Classifier({ - nGramMin: 2, - nGramMax: 2 - }) - - expect(classifier.tokenize('Hello world!')).to.eql({ - 'hello world': 1 - }) - }) - - it('should return an object literal of unigrams and bigrams when nGramMin/nGramMax is 1/2', () => { - const classifier = new Classifier({ - nGramMin: 1, - nGramMax: 2 - }) - - expect(classifier.tokenize('Hello world!')).to.eql({ - 'hello': 1, - 'hello world': 1, - 'world': 1 - }) - }) - - it('should increment the occurrence of the duplicate tokens', () => { - const classifier = new Classifier() - - expect(classifier.tokenize('Hello hello!')).to.eql({ - 'hello': 2 - }) - }) - }) - - describe('vectorize', () => { - it('should throw an error if input is not an object literal', () => { - const classifier = new Classifier() - - expect(() => classifier.vectorize([])).to.throw(Error) - }) - - it('should throw an error if vocabulary config option is set to false', () => { - const classifier = new Classifier({ - vocabulary: false - }) - - expect(() => classifier.vectorize('hello')).to.throw(Error) - }) - - it('should convert key to its corresponding vocabulary term index', () => { - const classifier = new Classifier() - const tokens = classifier.tokenize('Hello') - - expect(classifier.vectorize(tokens)).to.eql({ 0: 1 }) - }) - - it('should use existing term index when token is already in vocabulary', () => { - const classifier = new Classifier({ - vocabulary: ['hello', 'world'] - }) - - const tokens = classifier.tokenize('world') - - expect(classifier.vectorize(tokens)).to.eql({ 1: 1 }) - }) - - it('should add new tokens to the vocabulary', () => { - const classifier = new Classifier() - - const tokens = classifier.tokenize('Hello world') - - classifier.vectorize(tokens) - - const terms = classifier.model.vocabulary.terms - - expect(Array.from(terms)).to.eql(['hello', 'world']) - }) - }) - - describe('train', () => { - it('should throw an error if input is not a string or array', () => { - const classifier = new Classifier() - - expect(() => classifier.train({}, 'test')).to.throw(Error) - }) - - it('should throw an error if label is not a string', () => { - const classifier = new Classifier() - - expect(() => classifier.train('test', [])).to.throw(Error) - }) - - it('should add tokens to the vocabulary (if not configured to false)', () => { - const classifier = new Classifier() - - classifier.train('hello world', 'test') - - const vocabulary = classifier.model.vocabulary - - expect(vocabulary.size).to.equal(2) - }) - - it('should add tokens (and their occurrences) to the model from a string', () => { - const classifier = new Classifier() - - classifier.train('hello world', 'test') - - const model = classifier.model - - expect(model.data).to.eql({ - test: { 0: 1, 1: 1 } - }) - }) - - it('should add tokens (and their occurrences) to the model from an array of strings', () => { - const classifier = new Classifier() - - classifier.train([ - 'hello world', - 'foo', 'bar' - ], 'test') - - const model = classifier.model - - expect(model.data).to.eql({ - test: { 0: 1, 1: 1, 2: 1, 3: 1 } - }) - }) - - it('should increment the occurrence of an existing vocabulary term', () => { - const classifier = new Classifier() - - classifier.train([ - 'hello world', - 'foo', 'hello' - ], 'test') - - const model = classifier.model - - expect(model.data).to.eql({ - test: { 0: 2, 1: 1, 2: 1 } - }) - }) - - it('should return classifier instance', () => { - const classifier = new Classifier() - - expect(classifier.train('hello 
world', 'test')).to.equal(classifier) - }) - }) - - describe('cosineSimilarity', () => { - it('should throw an error if v1 is not an object literal', () => { - const classifier = new Classifier() - - expect(() => classifier.cosineSimilarity(false, {})).to.throw(Error) - }) - - it('should throw an error if v2 is not an object literal', () => { - const classifier = new Classifier() - - expect(() => classifier.cosineSimilarity({}, false)).to.throw(Error) - }) - - it('should return 1 on identical object literals', () => { - const classifier = new Classifier() - - expect(classifier.cosineSimilarity({ - 0: 1 - }, { - 0: 1 - })).to.equal(1) - }) - - it('should return 0 on object literals with no similarity', () => { - const classifier = new Classifier() - - expect(classifier.cosineSimilarity({ - 0: 1 - }, { - 1: 1 - })).to.equal(0) - }) - - it('should return > 0 on similar object literals', () => { - const classifier = new Classifier() - - assert.isAbove(classifier.cosineSimilarity({ - 0: 1, - 1: 1 - }, { - 0: 1, - 2: 1 - }), 0) - }) - - it('should return 0 when sum of v1 is 0', () => { - const classifier = new Classifier() - - expect(classifier.cosineSimilarity({ - 0: 0 - }, { - 0: 1 - })).to.equal(0) - }) - - it('should return 0 when sum of v2 is 0', () => { - const classifier = new Classifier() - - expect(classifier.cosineSimilarity({ - 0: 1 - }, { - 0: 0 - })).to.equal(0) - }) - }) - - describe('predict', () => { - it('should throw an error if input is not a string', () => { - const classifier = new Classifier() - - expect(() => classifier.predict([])).to.throw(Error) - }) - - it('should throw an error if minimumConfidence is not a number', () => { - const classifier = new Classifier() - - expect(() => classifier.predict('', null, '')).to.throw(Error) - }) - - it('should throw an error if minimumConfidence is lower than 0', () => { - const classifier = new Classifier() - - expect(() => classifier.predict('', null, -1)).to.throw(Error) - }) - - it('should throw an error if minimumConfidence is higher than 1', () => { - const classifier = new Classifier() - - expect(() => classifier.predict('', null, 2)).to.throw(Error) - }) - - it('should return an array', () => { - const classifier = new Classifier() - - assert.typeOf(classifier.predict('test'), 'array') - }) - - it('should return one prediction when trained with a sample', () => { - const classifier = new Classifier() - - classifier.train('hello world', 'test') - - assert.lengthOf(classifier.predict('hello world'), 1) - }) - }) -}) diff --git a/test/model.js b/test/model.js deleted file mode 100644 index 8de3f7a..0000000 --- a/test/model.js +++ /dev/null @@ -1,186 +0,0 @@ -import { assert, expect } from 'chai' -import Model from '../src/model' -import Vocabulary from '../src/vocabulary' - -describe('Model', () => { - describe('constructor', () => { - it('should throw an error if config is not an object literal', () => { - expect(() => new Model([])).to.throw(Error) - }) - - it('should throw an error if config option nGramMin is not a number', () => { - expect(() => new Model({ - nGramMin: '' - })).to.throw(Error) - }) - - it('should throw an error if config option nGramMax is not a number', () => { - expect(() => new Model({ - nGramMax: '' - })).to.throw(Error) - }) - - it('should throw an error if config option nGramMin is less than 1', () => { - expect(() => new Model({ - nGramMin: 0 - })).to.throw(Error) - }) - - it('should throw an error if config option nGramMax is less than 1', () => { - expect(() => new Model({ - nGramMax: 0 - 
})).to.throw(Error) - }) - - it('should throw an error if config option nGramMax is less than nGramMin', () => { - expect(() => new Model({ - nGramMin: 2, - nGramMax: 1 - })).to.throw(Error) - }) - - it('should throw an error if data is not an object literal', () => { - expect(() => new Model({ - data: [] - })).to.throw(Error) - }) - }) - - describe('nGramMin', () => { - it('should return a number', () => { - const model = new Model() - - expect(model.nGramMin).to.be.a('number') - }) - - it('should return the current nGramMin value', () => { - const model = new Model({ - nGramMin: 3, - nGramMax: 4 - }) - - expect(model.nGramMin).to.equal(3) - }) - - it('should set the nGramMin value', () => { - const model = new Model() - - model.nGramMin = 2 - - expect(model.nGramMin).to.equal(2) - }) - - it('should throw an error if size is not an integer', () => { - const model = new Model() - - expect(() => { - model.nGramMin = 1.1 - }).to.throw(Error) - }) - }) - - describe('nGramMax', () => { - it('should return a number', () => { - const model = new Model() - - expect(model.nGramMax).to.be.a('number') - }) - - it('should return the current nGramMax value', () => { - const model = new Model({ - nGramMax: 2 - }) - - expect(model.nGramMax).to.equal(2) - }) - - it('should set the nGramMax value', () => { - const model = new Model() - - model.nGramMax = 3 - - expect(model.nGramMax).to.equal(3) - }) - - it('should throw an error if size is not an integer', () => { - const model = new Model() - - expect(() => { - model.nGramMax = 1.1 - }).to.throw(Error) - }) - }) - - describe('vocabulary', () => { - it('should return a vocabulary instance', () => { - const model = new Model() - - assert.instanceOf(model.vocabulary, Vocabulary) - }) - - it('should return false when vocabulary is configured to false', () => { - const model = new Model({ - vocabulary: false - }) - - expect(model.vocabulary).to.equal(false) - }) - - it('should set the vocabulary value when passing an array', () => { - const model = new Model() - - model.vocabulary = ['hello', 'world'] - - expect(Array.from(model.vocabulary.terms)).to.eql(['hello', 'world']) - }) - - it('should set the vocabulary value when passing false', () => { - const model = new Model() - - model.vocabulary = false - - assert.isFalse(model.vocabulary) - }) - }) - - describe('data', () => { - it('should return an object literal', () => { - const model = new Model() - - expect(model.data).to.eql({}) - }) - - it('should set the model data', () => { - const model = new Model() - - model.data = { - test: { 0: 1 } - } - - expect(model.data).to.eql({ - test: { 0: 1 } - }) - }) - - it('should throw an error if data is not an object literal', () => { - const model = new Model() - - expect(() => { - model.data = [] - }).to.throw(Error) - }) - }) - - describe('serialize', () => { - it('should return an object literal created from the current model', () => { - const model = new Model() - - expect(model.serialize()).to.eql({ - nGramMin: 1, - nGramMax: 1, - vocabulary: [], - data: {} - }) - }) - }) -}) diff --git a/test/prediction.js b/test/prediction.js deleted file mode 100644 index 61be86d..0000000 --- a/test/prediction.js +++ /dev/null @@ -1,74 +0,0 @@ -import { assert, expect } from 'chai' -import Prediction from '../src/prediction' - -describe('Prediction', () => { - describe('constructor', () => { - it('should throw an error if prediction is not an object literal', () => { - expect(() => new Prediction([])).to.throw(Error) - }) - }) - - describe('label', () => { - it('should 
throw an error if label is not a string', () => { - const prediction = new Prediction() - - expect(() => { - prediction.label = [] - }).to.throw(Error) - }) - - it('should return a string', () => { - const prediction = new Prediction() - - expect(prediction.label).to.be.a('string') - }) - - it('should return the defined prediction label', () => { - const prediction = new Prediction({ - label: 'test' - }) - - expect(prediction.label).to.equal('test') - }) - - it('should set the prediction label', () => { - const prediction = new Prediction() - - prediction.label = 'test' - - expect(prediction.label).to.equal('test') - }) - }) - - describe('confidence', () => { - it('should throw an error if confidence is not a number', () => { - const prediction = new Prediction() - - expect(() => { - prediction.confidence = 'test' - }).to.throw(Error) - }) - - it('should return a number', () => { - const prediction = new Prediction() - - expect(prediction.confidence).to.be.a('number') - }) - - it('should return the defined prediction confidence', () => { - const prediction = new Prediction({ - confidence: 0.5 - }) - - expect(prediction.confidence).to.equal(0.5) - }) - - it('should set the prediction confidence', () => { - const prediction = new Prediction() - - prediction.confidence = 1 - - expect(prediction.confidence).to.equal(1) - }) - }) -}) diff --git a/test/vocabulary.js b/test/vocabulary.js deleted file mode 100644 index 46db994..0000000 --- a/test/vocabulary.js +++ /dev/null @@ -1,166 +0,0 @@ -import { assert, expect } from 'chai' -import Vocabulary from '../src/vocabulary' - -describe('Vocabulary', () => { - describe('constructor', () => { - it('should throw an error if terms is not an array or set', () => { - expect(() => new Vocabulary({})).to.throw(Error) - }) - }) - - describe('size', () => { - it('should return a number', () => { - const vocabulary = new Vocabulary() - - expect(vocabulary.size).to.be.a('number') - }) - - it('should return the vocabulary size', () => { - const vocabulary = new Vocabulary([ 'hello' ]) - - expect(vocabulary.size).to.equal(1) - }) - }) - - describe('terms', () => { - it('should return a set instance', () => { - const vocabulary = new Vocabulary() - - assert.instanceOf(vocabulary.terms, Set) - }) - - it('should return the vocabulary terms', () => { - const vocabulary = new Vocabulary(['hello']) - - expect(Array.from(vocabulary.terms)).to.eql(['hello']) - }) - - it('should set the vocabulary terms from an array', () => { - const vocabulary = new Vocabulary() - - vocabulary.terms = ['hello', 'world'] - - expect(Array.from(vocabulary.terms)).to.eql(['hello', 'world']) - }) - - it('should set the vocabulary terms from a set', () => { - const vocabulary = new Vocabulary() - - vocabulary.terms = new Set(['hello', 'world']) - - expect(Array.from(vocabulary.terms)).to.eql(['hello', 'world']) - }) - - it('should throw an error if terms is not an array or set', () => { - const vocabulary = new Vocabulary() - - expect(() => { - vocabulary.terms = {} - }).to.throw(Error) - }) - }) - - describe('add', () => { - it('should throw an error if terms is not a string, array or set', () => { - const vocabulary = new Vocabulary() - - expect(() => vocabulary.add({})).to.throw(Error) - }) - - it('should add a term to the vocabulary from a string', () => { - const vocabulary = new Vocabulary() - - vocabulary.add('test') - - expect(Array.from(vocabulary.terms)).to.eql(['test']) - }) - - it('should add terms to the vocabulary from an array', () => { - const vocabulary = new Vocabulary() - 
- vocabulary.add(['hello', 'world']) - - expect(Array.from(vocabulary.terms)).to.eql(['hello', 'world']) - }) - - it('should add terms to the vocabulary from a set', () => { - const vocabulary = new Vocabulary() - - vocabulary.add(new Set(['hello', 'world'])) - - expect(Array.from(vocabulary.terms)).to.eql(['hello', 'world']) - }) - - it('should return vocabulary instance', () => { - const vocabulary = new Vocabulary() - - assert.instanceOf(vocabulary.add('test'), Vocabulary) - }) - }) - - describe('remove', () => { - it('should throw an error if terms is not a string, array or set', () => { - const vocabulary = new Vocabulary() - - expect(() => vocabulary.remove({})).to.throw(Error) - }) - - it('should remove a term to the vocabulary when called with a string', () => { - const vocabulary = new Vocabulary(['test']) - - vocabulary.remove('test') - - expect(Array.from(vocabulary.terms)).to.eql([]) - }) - - it('should remove terms from the vocabulary when called with an array', () => { - const vocabulary = new Vocabulary(['hello', 'world']) - - vocabulary.remove(['world']) - - expect(Array.from(vocabulary.terms)).to.eql(['hello']) - }) - - it('should remove terms from the vocabulary when called with a set', () => { - const vocabulary = new Vocabulary(['hello', 'world']) - - vocabulary.remove(new Set(['world'])) - - expect(Array.from(vocabulary.terms)).to.eql(['hello']) - }) - - it('should return a vocabulary instance', () => { - const vocabulary = new Vocabulary(['test']) - - assert.instanceOf(vocabulary.remove('test'), Vocabulary) - }) - }) - - describe('has', () => { - it('should return a boolean', () => { - const vocabulary = new Vocabulary() - - assert.isBoolean(vocabulary.has('test')) - }) - - it('should return whether a term exists in the vocabulary', () => { - const vocabulary = new Vocabulary(['test']) - - assert.isTrue(vocabulary.has('test')) - }) - }) - - describe('indexOf', () => { - it('should return the index of an existing vocabulary term', () => { - const vocabulary = new Vocabulary(['test']) - - expect(vocabulary.indexOf('test')).to.equal(0) - }) - - it('should return -1 for non-existing vocabulary terms', () => { - const vocabulary = new Vocabulary() - - expect(vocabulary.indexOf('test')).to.equal(-1) - }) - }) -}) From c46753fc7bec52ff7567c59b18ec312e7fa9205c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 22:17:52 +0100 Subject: [PATCH 11/19] Fix casing in class filenames --- src/{classifier.js => Classifier.js} | 0 src/{model.js => Model.js} | 0 src/{prediction.js => Prediction.js} | 0 src/{vocabulary.js => Vocabulary.js} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename src/{classifier.js => Classifier.js} (100%) rename src/{model.js => Model.js} (100%) rename src/{prediction.js => Prediction.js} (100%) rename src/{vocabulary.js => Vocabulary.js} (100%) diff --git a/src/classifier.js b/src/Classifier.js similarity index 100% rename from src/classifier.js rename to src/Classifier.js diff --git a/src/model.js b/src/Model.js similarity index 100% rename from src/model.js rename to src/Model.js diff --git a/src/prediction.js b/src/Prediction.js similarity index 100% rename from src/prediction.js rename to src/Prediction.js diff --git a/src/vocabulary.js b/src/Vocabulary.js similarity index 100% rename from src/vocabulary.js rename to src/Vocabulary.js From edc0bc55a046ec1c19491f302f982d31ac55c851 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 22:26:24 +0100 Subject: [PATCH 12/19] Add 
Prediction to exports to allow for type checks --- src/index.js | 1 + 1 file changed, 1 insertion(+) diff --git a/src/index.js b/src/index.js index 4332d7a..c995579 100644 --- a/src/index.js +++ b/src/index.js @@ -2,6 +2,7 @@ import Classifier from './Classifier' export { default as Model } from './Model' export { default as Vocabulary } from './Vocabulary' +export { default as Prediction } from './Prediction' export { Classifier as Classifier } export default Classifier From 9d1a34d51972bbbbd12f8429ad6feeafa0a53c94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 22:33:48 +0100 Subject: [PATCH 13/19] Update docs --- docs/classifier.md | 89 +++++++++++++++++++++++++--------------------- docs/model.md | 35 ++++++++++-------- docs/prediction.md | 8 +++-- docs/vocabulary.md | 55 +++++++++++++++------------- 4 files changed, 103 insertions(+), 84 deletions(-) diff --git a/docs/classifier.md b/docs/classifier.md index 90ae8e3..6c584ef 100644 --- a/docs/classifier.md +++ b/docs/classifier.md @@ -2,90 +2,97 @@ ## Classifier -* [Classifier](#Classifier) - * [new Classifier([model])](#new_Classifier_new) - * [.model](#Classifier+model) : Model - * [.train(input, label)](#Classifier+train) ⇒ this - * [.predict(input, [maxMatches], [minimumConfidence])](#Classifier+predict) ⇒ Array - * [.splitWords(input)](#Classifier+splitWords) ⇒ Array - * [.tokenize(input)](#Classifier+tokenize) ⇒ Object - * [.vectorize(tokens)](#Classifier+vectorize) ⇒ Object - * [.cosineSimilarity(v1, v2)](#Classifier+cosineSimilarity) ⇒ float - - +- [Classifier](#Classifier) + - [new Classifier([model])](#new_Classifier) + - [.model](#Classifier+model) : Model + - [.train(input, label)](#Classifier+train) ⇒ this + - [.predict(input, [maxMatches], [minimumConfidence])](#Classifier+predict) ⇒ Array + - [.splitWords(input)](#Classifier+splitWords) ⇒ Array + - [.tokenize(input)](#Classifier+tokenize) ⇒ Object + - [.vectorize(tokens)](#Classifier+vectorize) ⇒ Object + - [.cosineSimilarity(v1, v2)](#Classifier+cosineSimilarity) ⇒ float + + ### new Classifier([model]) -| Param | Type | Default | Description | -| --- | --- | --- | --- | -| [model] | `Model` \| `Object` | | | -| [model.nGramMin] | `int` | `1` | Minimum n-gram size | -| [model.nGramMax] | `int` | `1` | Maximum n-gram size | -| [model.vocabulary] | `Array` \| `Set` \| `false` | `[]` | Terms mapped to indexes in the model data, set to `false` to store terms directly in the data entries | -| [model.data] | `Object` | `{}` | Key-value store of labels and training data vectors | +| Param | Type | Default | Description | +| ------------------ | --------------------------- | ------- | ----------------------------------------------------------------------------------------------------- | +| [model] | `Model` \| `Object` | | | +| [model.nGramMin] | `int` | `1` | Minimum n-gram size | +| [model.nGramMax] | `int` | `1` | Maximum n-gram size | +| [model.vocabulary] | `Array` \| `Set` \| `false` | `[]` | Terms mapped to indexes in the model data, set to `false` to store terms directly in the data entries | +| [model.data] | `Object` | `{}` | Key-value store of labels and training data vectors | ### classifier.model : `Model` + Model instance ### classifier.train(input, label) ⇒ `this` + Train the current model using an input string (or array of strings) and a corresponding label -| Param | Type | Description | -| --- | --- | --- | +| Param | Type | Description | +| ----- | ------------------- | ------------------------------ | | input | `string` 
\| `Array` | String, or an array of strings | -| label | `string` | Corresponding label | +| label | `string` | Corresponding label | ### classifier.predict(input, [maxMatches], [minimumConfidence]) ⇒ `Array` + Return an array of one or more Prediction instances -| Param | Type | Default | Description | -| --- | --- | --- | --- | -| input | `string` | | Input string to make a prediction from | -| [maxMatches] | `int` | `1` | Maximum number of predictions to return | -| [minimumConfidence] | `float` | `0.2` | Minimum confidence required to include a prediction | +| Param | Type | Default | Description | +| ------------------- | -------- | ------- | --------------------------------------------------- | +| input | `string` | | Input string to make a prediction from | +| [maxMatches] | `int` | `1` | Maximum number of predictions to return | +| [minimumConfidence] | `float` | `0.2` | Minimum confidence required to include a prediction | ### classifier.splitWords(input) ⇒ `Array` + Split a string into an array of lowercase words, with all non-letter characters removed -| Param | Type | -| --- | --- | -| input | `string` | +| Param | Type | +| ----- | -------- | +| input | `string` | ### classifier.tokenize(input) ⇒ `Object` + Create an object literal of unique tokens (n-grams) as keys, and their respective occurrences as values based on an input string, or array of words -| Param | Type | -| --- | --- | -| input | `string` \| `Array` | +| Param | Type | +| ----- | ------------------- | +| input | `string` \| `Array` | ### classifier.vectorize(tokens) ⇒ `Object` + Convert a tokenized object into a new object with all keys (terms) -translated to their index in the vocabulary (adding all terms to -the vocabulary that do not already exist) +translated to their index in the returned vocabulary (which is also +returned along with the object, with any new terms added to the end) -| Param | Type | -| --- | --- | -| tokens | `Object` | +| Param | Type | +| ------ | -------- | +| tokens | `Object` | ### classifier.cosineSimilarity(v1, v2) ⇒ `float` + Return the cosine similarity between two vectors -| Param | Type | -| --- | --- | -| v1 | `Object` | -| v2 | `Object` | +| Param | Type | +| ----- | -------- | +| v1 | `Object` | +| v2 | `Object` | diff --git a/docs/model.md b/docs/model.md index 2c515f7..4c3e2fb 100644 --- a/docs/model.md +++ b/docs/model.md @@ -2,49 +2,54 @@ ## Model -* [Model](#Model) - * [new Model([config])](#new_Model_new) - * [.nGramMin](#Model+nGramMin) : `int` - * [.nGramMax](#Model+nGramMax) : `int` - * [.vocabulary](#Model+vocabulary) : `Vocabulary` \| `false` - * [.data](#Model+data) : `Object` - * [.serialize()](#Model+serialize) ⇒ `Object` +- [Model](#Model) + - [new Model([config])](#new_Model) + - [.nGramMin](#Model+nGramMin) : `int` + - [.nGramMax](#Model+nGramMax) : `int` + - [.vocabulary](#Model+vocabulary) : `Vocabulary` \| `false` + - [.data](#Model+data) : `Object` + - [.serialize()](#Model+serialize) ⇒ `Object` - + ### new Model([config]) -| Param | Type | Default | Description | -| --- | --- | --- | --- | -| [config] | `Object` | | | -| [config.nGramMin] | `int` | `1` | Minimum n-gram size | -| [config.nGramMax] | `int` | `1` | Maximum n-gram size | -| [config.vocabulary] | `Array` \| `Set` \| `false` | `[]` | Terms mapped to indexes in the model data entries, set to false to store terms directly in the data entries | -| [config.data] | `Object` | `{}` | Key-value store containing all training data | +| Param | Type | Default | Description | +| ------------------- | 
--------------------------- | ------- | ----------------------------------------------------------------------------------------------------------- | +| [config] | `Object` | | | +| [config.nGramMin] | `int` | `1` | Minimum n-gram size | +| [config.nGramMax] | `int` | `1` | Maximum n-gram size | +| [config.vocabulary] | `Array` \| `Set` \| `false` | `[]` | Terms mapped to indexes in the model data entries, set to false to store terms directly in the data entries | +| [config.data] | `Object` | `{}` | Key-value store containing all training data | ### model.nGramMin : `int` + Minimum n-gram size ### model.nGramMax : `int` + Maximum n-gram size ### model.vocabulary : `Vocabulary` \| `false` + Vocabulary instance ### model.data : `Object` + Model data ### model.serialize() ⇒ `Object` + Return the model in its current state an an object literal, including the configured n-gram min/max values, the vocabulary as an array (if any, otherwise false), and an object literal with all the training data diff --git a/docs/prediction.md b/docs/prediction.md index 10b80f8..7776fe8 100644 --- a/docs/prediction.md +++ b/docs/prediction.md @@ -2,16 +2,18 @@ ## Prediction -* [Prediction](#Prediction) - * [.label](#Prediction+label) : `string` - * [.confidence](#Prediction+confidence) : `number` +- [Prediction](#Prediction) + - [.label](#Prediction+label) : `string` + - [.confidence](#Prediction+confidence) : `number` ### prediction.label : `string` + Label of the prediction ### prediction.confidence : `number` + Confidence of the prediction diff --git a/docs/vocabulary.md b/docs/vocabulary.md index c607ca9..dfa317f 100644 --- a/docs/vocabulary.md +++ b/docs/vocabulary.md @@ -2,66 +2,71 @@ ## Vocabulary -* [Vocabulary](#Vocabulary) - * [new Vocabulary(terms)](#new_Vocabulary_new) - * [.size](#Vocabulary+size) : `number` - * [.terms](#Vocabulary+terms) : `Array` \| `Set` - * [.add(terms)](#Vocabulary+add) ⇒ `this` - * [.remove(terms)](#Vocabulary+remove) ⇒ `this` - * [.has(term)](#Vocabulary+has) ⇒ `bool` - * [.indexOf(term)](#Vocabulary+indexOf) ⇒ `number` +- [Vocabulary](#Vocabulary) + - [new Vocabulary(terms)](#new_Vocabulary) + - [.size](#Vocabulary+size) : `number` + - [.terms](#Vocabulary+terms) : `Array` \| `Set` + - [.add(terms)](#Vocabulary+add) ⇒ `this` + - [.remove(terms)](#Vocabulary+remove) ⇒ `this` + - [.has(term)](#Vocabulary+has) ⇒ `bool` + - [.indexOf(term)](#Vocabulary+indexOf) ⇒ `number` - + ### new Vocabulary(terms) -| Param | Type | -| --- | --- | -| terms | `Array` \| `Set` | +| Param | Type | +| ----- | ---------------- | +| terms | `Array` \| `Set` | ### vocabulary.size : `number` + Vocabulary size ### vocabulary.terms : `Array` \| `Set` + Vocabulary terms ### vocabulary.add(terms) ⇒ `this` + Add one or more terms to the vocabulary -| Param | Type | -| --- | --- | -| terms | `string` \| `Array` \| `Set` | +| Param | Type | +| ----- | ---------------------------- | +| terms | `string` \| `Array` \| `Set` | ### vocabulary.remove(terms) ⇒ `this` + Remove one or more terms from the vocabulary -| Param | Type | -| --- | --- | -| terms | `string` \| `Array` \| `Set` | +| Param | Type | +| ----- | ---------------------------- | +| terms | `string` \| `Array` \| `Set` | ### vocabulary.has(term) ⇒ `bool` + Return whether the vocabulary contains a certain term -| Param | Type | -| --- | --- | -| term | `string` | +| Param | Type | +| ----- | -------- | +| term | `string` | ### vocabulary.indexOf(term) ⇒ `number` -Return the index of a term in the vocabulary (returns -1 if not found) -| Param | Type 
| -| --- | --- | -| term | `string` | +Return the index of a term in the vocabulary (returns -1 if not found) +| Param | Type | +| ----- | -------- | +| term | `string` | From cf42a496cad7330da04dec7433d3173232e95308 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 22:34:49 +0100 Subject: [PATCH 14/19] Fix casing in docs filenames --- docs/{classifier.md => Classifier.md} | 0 docs/{model.md => Model.md} | 0 docs/{prediction.md => Prediction.md} | 0 docs/{vocabulary.md => Vocabulary.md} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename docs/{classifier.md => Classifier.md} (100%) rename docs/{model.md => Model.md} (100%) rename docs/{prediction.md => Prediction.md} (100%) rename docs/{vocabulary.md => Vocabulary.md} (100%) diff --git a/docs/classifier.md b/docs/Classifier.md similarity index 100% rename from docs/classifier.md rename to docs/Classifier.md diff --git a/docs/model.md b/docs/Model.md similarity index 100% rename from docs/model.md rename to docs/Model.md diff --git a/docs/prediction.md b/docs/Prediction.md similarity index 100% rename from docs/prediction.md rename to docs/Prediction.md diff --git a/docs/vocabulary.md b/docs/Vocabulary.md similarity index 100% rename from docs/vocabulary.md rename to docs/Vocabulary.md From 387d718fe3f6f7e5d922cb8f64adfd74ae8fc383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 22:36:06 +0100 Subject: [PATCH 15/19] Update README --- README.md | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index fec886a..bd02438 100644 --- a/README.md +++ b/README.md @@ -45,16 +45,16 @@ const classifier = new Classifier() ### Training a model ```javascript -let positive = [ - 'This is great, so cool!', - 'Wow, I love it!', - 'It really is amazing', +const positive = [ + 'This is great, so cool!', + 'Wow, I love it!', + 'It really is amazing' ] -let negative = [ - 'This is really bad', - 'I hate it with a passion', - 'Just terrible!', +const negative = [ + 'This is really bad', + 'I hate it with a passion', + 'Just terrible!' 
] classifier.train(positive, 'positive') @@ -64,10 +64,10 @@ classifier.train(negative, 'negative') ### Getting a prediction ```javascript -let predictions = classifier.predict('It sure is pretty great!') +const predictions = classifier.predict('It sure is pretty great!') if (predictions.length) { - predictions.forEach(prediction => { + predictions.forEach((prediction) => { console.log(`${prediction.label} (${prediction.confidence})`) }) } else { @@ -89,12 +89,12 @@ The following configuration options can be passed both directly to a new [Model] #### Options -| Property | Type | Default | Description | -| --- | --- | --- | --- | -| **nGramMin** | `int` | `1` | Minimum n-gram size | -| **nGramMax** | `int` | `1` | Maximum n-gram size | -| **vocabulary** | `Array` \| `Set` \| `false` | `[]` | Terms mapped to indexes in the model data, set to `false` to store terms directly in the data entries | -| **data** | `Object` | `{}` | Key-value store of labels and training data vectors | +| Property | Type | Default | Description | +| -------------- | --------------------------- | ------- | ----------------------------------------------------------------------------------------------------- | +| **nGramMin** | `int` | `1` | Minimum n-gram size | +| **nGramMax** | `int` | `1` | Maximum n-gram size | +| **vocabulary** | `Array` \| `Set` \| `false` | `[]` | Terms mapped to indexes in the model data, set to `false` to store terms directly in the data entries | +| **data** | `Object` | `{}` | Key-value store of labels and training data vectors | ### Using n-grams @@ -112,7 +112,7 @@ const classifier = new Classifier({ nGramMax: 2 }) -let tokens = classifier.tokenize('I really dont like it') +const tokens = classifier.tokenize('I really dont like it') console.log(tokens) ``` @@ -135,7 +135,7 @@ After training a model with large sets of data, you'll want to store all this da To do this, simply use the `serialize` method on your [Model](docs/model.md), and either save the data structure to a file, send it to a server, or store it in any other way you want. 
```javascript -let model = classifier.model +const model = classifier.model console.log(model.serialize()) ``` @@ -173,10 +173,10 @@ Returning: ## Documentation -* [Classifier](docs/classifier.md) -* [Model](docs/model.md) -* [Vocabulary](docs/vocabulary.md) -* [Prediction](docs/prediction.md) +- [Classifier](docs/Classifier.md) +- [Model](docs/Model.md) +- [Vocabulary](docs/Vocabulary.md) +- [Prediction](docs/Prediction.md) ## Contributing From 9f9d34986a0527bc23ba059ba082316f34bfa641 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 23:15:12 +0100 Subject: [PATCH 16/19] Updated package version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index dff5988..429a9d9 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "ml-classify-text", - "version": "2.0.0", + "version": "2.0.1", "description": "Text classification using n-grams and cosine similarity", "module": "./lib", "main": "./lib", From 5d6cd6cf50d3d862475f8c6f42e16a5ace5c219e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 23:15:57 +0100 Subject: [PATCH 17/19] Fix casing in docs index --- docs/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/README.md b/docs/README.md index 5ec5aab..15dc4d9 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,7 +2,7 @@ Full documentation of all the available classes, properties and methods. -* [Classifier](classifier.md) -* [Model](model.md) -* [Vocabulary](vocabulary.md) -* [Prediction](prediction.md) +- [Classifier](Classifier.md) +- [Model](Model.md) +- [Vocabulary](Vocabulary.md) +- [Prediction](Prediction.md) From dfd60beb7e05f0f39a3f917aef225a13ff5ca5b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Ekeberg?= Date: Sun, 5 Feb 2023 23:16:30 +0100 Subject: [PATCH 18/19] Update CONTRIBUTING.md --- CONTRIBUTING.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d6db632..6264c53 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,19 +2,19 @@ This document contains basic guidelines to make contributing to this project as easy and transparent as possible, whether it's: -- Reporting a bug -- Discussing the current state of the code -- Submitting a fix -- Proposing new features -- Becoming a maintainer +- Reporting a bug +- Discussing the current state of the code +- Submitting a fix +- Proposing new features +- Becoming a maintainer ## Pull requests are actively welcomed 1. Fork the repo and create your branch from `master`. 2. If you've added code that should be tested, add tests. 3. If you've changed APIs, update the documentation. -5. Make sure your code lints. -6. Issue your pull request. +4. Make sure your code lints. +5. Issue your pull request. ## Any contributions you make will be under the MIT Software License @@ -28,13 +28,13 @@ All bugs are tracked using GitHub issues to track public bugs. Report a bug by [ **Great bug reports** tend to have: -- A quick summary and/or background -- Steps to reproduce - - Be specific! - - Give sample code if you can. - - What you expected would happen - - What actually happens -- Notes (possibly including why you think this might be happening, or stuff you tried that didn't work) +- A quick summary and/or background +- Steps to reproduce + - Be specific! + - Give sample code if you can. 
+ - What you expected would happen
+ - What actually happens
+- Notes (possibly including why you think this might be happening, or stuff you tried that didn't work)
 
 ## License
 
 By contributing, you agree that your contributions will be licensed under its MIT License.

From ce7f82c4d047b7dd786c5541a614e55a9cc5b010 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andre=CC=81=20Ekeberg?=
Date: Sun, 5 Feb 2023 23:17:05 +0100
Subject: [PATCH 19/19] Update CHANGELOG.md

---
 CHANGELOG.md | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e6f3930..52aaa75 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,15 +2,24 @@
 
 All notable changes to this project will be documented in this file.
 
+## [2.0.1] - 2023-02-05
+
+### Changed
+
+- Fixed all instances of improper object literal type checks
+- Fixed bug where terms were added to the model vocabulary when making predictions
+- Migrated tests from Mocha/Chai to Jest
+
 ## [2.0.0] - 2020-08-28
 
 ### Breaking changes
 
-* Removed `minimumConfidence` from `Model`
+- Removed `minimumConfidence` from `Model`
 
 ## [1.0.0] - 2020-08-26
 
 Initial release
 
+[2.0.1]: https://github.com/andreekeberg/ml-classify-text-js/releases/tag/2.0.1
 [2.0.0]: https://github.com/andreekeberg/ml-classify-text-js/releases/tag/2.0.0
 [1.0.0]: https://github.com/andreekeberg/ml-classify-text-js/releases/tag/1.0.0