Skip to content
Browse files

Bug 865484 - refactor dictionary data structure and search algorithm …

…for keyboard auto-correct

address Christoph's review comments

cut another 40% off the dictionary size!

Not sure whether this is a good idea: allow arbitrary substitutions in addition to nearby character substitutions.

store the length of the longest word in the dictionary file, and use it to immediately reject input that is too long to be a word

be smarter about variant forms (case and diacritics)

Add abbreviations and frequency 1 words to the dictionary, increasing size about 20%

simplify the algorithm and focus on corrections instead of predictions

address review comments
  • Loading branch information...
1 parent 58665c2 commit e575f442bc7c73d942c3fd15cffd227fceae5577 @davidflanagan davidflanagan committed Apr 12, 2013
View
BIN apps/keyboard/js/imes/latin/dictionaries/de.dict
Binary file not shown.
View
BIN apps/keyboard/js/imes/latin/dictionaries/en_us.dict
Binary file not shown.
View
BIN apps/keyboard/js/imes/latin/dictionaries/es.dict
Binary file not shown.
View
BIN apps/keyboard/js/imes/latin/dictionaries/fr.dict
Binary file not shown.
View
BIN apps/keyboard/js/imes/latin/dictionaries/pl.dict
Binary file not shown.
View
BIN apps/keyboard/js/imes/latin/dictionaries/pt_br.dict
Binary file not shown.
View
582 apps/keyboard/js/imes/latin/dictionaries/xml2dict.py
@@ -1,582 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from optparse import OptionParser
-from xml.parsers import expat
-import struct
-
-_NodeCounter = 0
-_NodeRemoveCounter = 0
-_NodeVisitCounter = 0
-_EmitCounter = 0
-
-_EndOfWord = '$'
-
-_DiacriticIndex = {
- 'a': 'ÁáĂăǍǎÂâÄäȦȧẠạȀȁÀàẢảȂȃĀāĄąÅåḀḁȺⱥÃãǼǽǢǣÆæ',
- 'b': 'ḂḃḄḅƁɓḆḇɃƀƂƃ',
- 'c': 'ĆćČčÇçĈĉĊċƇƈȻȼ',
- 'd': 'ĎďḐḑḒḓḊḋḌḍƊɗḎḏĐđƋƌð',
- 'e': 'ÉéĔĕĚěȨȩÊêḘḙËëĖėẸẹȄȅÈèẺẻȆȇĒēĘę',
- 'f': 'ḞḟƑƒ',
- 'g': 'ǴǵĞğǦǧĢģĜĝĠġƓɠḠḡǤǥ',
- 'h': 'ḪḫȞȟḨḩĤĥⱧⱨḦḧḢḣḤḥĦħ',
- 'i': 'ÍíĬĭǏǐÎîÏïỊịȈȉÌìỈỉȊȋĪīĮįƗɨĨĩḬḭı',
- 'j': 'ĴĵɈɉ',
- 'k': 'ḰḱǨǩĶķⱩⱪꝂꝃḲḳƘƙḴḵꝀꝁ',
- 'l': 'ĹĺȽƚĽľĻļḼḽḶḷⱠⱡꝈꝉḺḻĿŀⱢɫŁł',
- 'm': 'ḾḿṀṁṂṃⱮɱ',
- 'n': 'ŃńŇňŅņṊṋṄṅṆṇǸǹƝɲṈṉȠƞÑñ',
- 'o': 'ÓóŎŏǑǒÔôÖöȮȯỌọŐőȌȍÒòỎỏƠơȎȏꝊꝋꝌꝍŌōǪǫØøÕõŒœ',
- 'p': 'ṔṕṖṗꝒꝓƤƥⱣᵽꝐꝑ',
- 'q': 'Ꝗꝗ',
- 'r': 'ŔŕŘřŖŗṘṙṚṛȐȑȒȓṞṟɌɍⱤɽ',
- 's': 'ŚśŠšŞşŜŝȘșṠṡṢṣß$',
- 't': 'ŤťŢţṰṱȚțȾⱦṪṫṬṭƬƭṮṯƮʈŦŧ',
- 'u': 'ÚúŬŭǓǔÛûṶṷÜüṲṳỤụŰűȔȕÙùỦủƯưȖȗŪūŲųŮůŨũṴṵ',
- 'v': 'ṾṿƲʋṼṽ',
- 'w': 'ẂẃŴŵẄẅẆẇẈẉẀẁⱲⱳ',
- 'x': 'ẌẍẊẋ',
- 'y': 'ÝýŶŷŸÿẎẏỴỵỲỳƳƴỶỷỾỿȲȳɎɏỸỹ',
- 'z': 'ŹźŽžẐẑⱫⱬŻżẒẓȤȥẔẕƵƶ'
-}
-_Diacritics = {} # the mapping from accented to non-accented letters
-
-# Build the _Diacritics mapping
-for letter in _DiacriticIndex:
- for diacritic in _DiacriticIndex[letter]:
- _Diacritics[diacritic] = letter
-
-
-# Data Structure for TST Tree
-class TSTNode:
- # Constructor for creating a new TSTNode
- def __init__(self, ch):
- global _NodeCounter
- _NodeCounter += 1
- self.ch = ch
- self.left = self.center = self.right = None
- self.frequency = 0 # averaged maximum frequency
- # store the count for balancing the tst
- self.count = 0
- # store the number of tree nodes compressed into one DAG node
- self.ncompressed = 1
- # store hash for creating the DAG
- self.hash = 0
-
-class TSTTree:
- # Constructor for creating a TST Tree
- def __init__(self):
- self.table = {}
-
- # Insert a word into the TSTTree
- def insert(self, node, word, freq):
- ch = word[0]
-
- if not node:
- node = TSTNode(ch)
- if ch < node.ch:
- node.left = self.insert(node.left, word, freq)
- elif ch > node.ch:
- node.right = self.insert(node.right, word, freq)
- else:
- node.frequency = max(node.frequency, freq)
- if len(word) > 1:
- node.center = self.insert(node.center, word[1:], freq)
- return node
-
- # Balance the TST
- # set the number of children nodes
- def setCount(self, node):
- if not node:
- return 0
- node.count = self.setCount(node.left) + self.setCount(node.right) + 1
- self.setCount(node.center)
- return node.count
-
- def rotateRight(self, node):
- tmp = node.left
- # move the subtree between tmp and node
- node.left = tmp.right
- # swap tmp and node
- tmp.right = node
- # restore count field
- node.count = (node.left.count if node.left else 0) + (node.right.count if node.right else 0) + 1
- tmp.count = (tmp.left.count if tmp.left else 0) + tmp.right.count + 1
- return tmp
-
- def rotateLeft(self, node):
- tmp = node.right
- # move the subtree between tmp and node
- node.right = tmp.left
- # swap tmp and node
- tmp.left = node
- # restore count field
- node.count = (node.left.count if node.left else 0) + (node.right.count if node.right else 0) + 1
- tmp.count = tmp.left.count + (tmp.right.count if tmp.right else 0) + 1
- return tmp
-
- def divide(self, node, divCount):
- leftCount = (node.left.count if node.left else 0)
- # if the dividing node is in the left subtree, go down to it
- if divCount < leftCount:
- node.left = self.divide(node.left, divCount)
- # on the way back from the dividing node to the root, do right rotations
- node = self.rotateRight(node)
- elif divCount > leftCount:
- node.right = self.divide(node.right, divCount - leftCount - 1)
- node = self.rotateLeft(node)
- return node
-
- # balance level of TST
- def balanceLevel(self, node):
- if not node:
- return node
-
- # make center node the root
- node = self.divide(node, node.count // 2)
- # balance subtrees recursively
- node.left = self.balanceLevel(node.left)
- node.right = self.balanceLevel(node.right)
-
- node.center = self.balanceTree(node.center)
- return node
-
- def normalizeChar(self, ch):
- ch = ch.lower()
- if ch in _Diacritics:
- ch = _Diacritics[ch]
- return ch
-
- def collectLevel(self, level, node):
- if not node:
- return
- level.setdefault(self.normalizeChar(node.ch), []).append(node)
- self.collectLevel(level, node.left)
- self.collectLevel(level, node.right)
-
- def sortLevelByFreq(self, node):
- # Collect nodes on the same level (lowercase/uppercase/accented characters are grouped together)
- level = {}
- self.collectLevel(level, node)
- level = list(level.values())
-
- # Sort by frequency joining nodes with lowercase/uppercase/accented versions of the same character
- level.sort(key = lambda items: max(items, key = lambda node: node.frequency).frequency, reverse = True)
- nodes = []
- for items in level:
- nodes += items
-
- # Add nextFreq/prevFreq pointers to each node
- prevFreq = None
- for i in range(len(nodes)):
- nodes[i].nextFreq = nodes[i + 1] if i < len(nodes) - 1 else None
- nodes[i].prevFreq = prevFreq
- prevFreq = nodes[i]
- return nodes[0]
-
- # find node in the subtree of root and promote it to root
- def promoteNodeToRoot(self, root, node):
- if node.ch < root.ch:
- root.left = self.promoteNodeToRoot(root.left, node)
- return self.rotateRight(root)
- elif node.ch > root.ch:
- root.right = self.promoteNodeToRoot(root.right, node)
- return self.rotateLeft(root)
- else:
- return root
-
- # balance the whole TST
- def balanceTree(self, node):
- if not node:
- return
-
- # promote to root the letter with the highest maximum frequency
- # of a suffix starting with this letter
- node = self.promoteNodeToRoot(node, self.sortLevelByFreq(node))
-
- # balance other letters on this level of the tree
- node.left = self.balanceLevel(node.left)
- node.right = self.balanceLevel(node.right)
- node.center = self.balanceTree(node.center)
- return node
-
- def balance(self, root):
- self.setCount(root)
- root = self.balanceTree(root)
- return root
-
-
- # Compress TST into DAWG
-
-
- # Compare two subtrees. If they are equal, return the mapping from the nodes
- # in nodeA to the corresponding nodes in nodeB. If they are not equal, return False
- def equal(self, nodeA, nodeB, mapping):
- if nodeA == None or nodeB == None:
- return mapping if nodeA == None and nodeB == None else False
- # two nodes are equal if their characters and their
- # children are equal
- mapping[nodeA] = nodeB
- return mapping if nodeA.ch == nodeB.ch and \
- self.equal(nodeA.left, nodeB.left, mapping) and \
- self.equal(nodeA.center, nodeB.center, mapping) and \
- self.equal(nodeA.right, nodeB.right, mapping) else False
-
- # Return True if all nextFreq nodes are in the nodeA subtree
- # at the same positions as in the nodeB subtree
- def equalNextFreq(self, mapping):
- for node in mapping:
- if node.nextFreq == None and mapping[node].nextFreq == None:
- continue
- if node.nextFreq not in mapping:
- return False
- if mapping[node.nextFreq] != mapping[node].nextFreq:
- return False
- return True
-
- # find the head of the nextFreq/prevFreq linked list
- def findListHead(self, node, mapping):
- while node.prevFreq and node.prevFreq in mapping:
- node = node.prevFreq
- return node
-
- def calculateHash(self, node):
- if not node:
- return 0
- assert (len(node.ch) == 1)
- node.hash = (ord(node.ch) - ord('a')) + 31 * self.calculateHash(node.center)
- node.hash ^= self.calculateHash(node.left)
- node.hash ^= self.calculateHash(node.right)
- node.hash ^= (node.hash >> 16)
- return node.hash
-
- # find the node in the hash table. if it does not exist,
- # add a new one, return true and the original node,
- # if not, return false and the existing node
- def checkAndRemoveDuplicate(self, node):
- global _NodeRemoveCounter
-
- if node.hash in self.table:
- for candidate in self.table[node.hash]:
- mapping = self.equal(node, candidate, {})
- if mapping and self.equalNextFreq(mapping):
- # this node already exists in the table.
- # remove the duplicate
- _NodeRemoveCounter += len(mapping)
- head = self.findListHead(node, mapping)
- if head.prevFreq:
- head.prevFreq.nextFreq = mapping[head]
- self.addFreq(candidate, node)
- return False, candidate
- self.table.setdefault(node.hash, []).append(node)
- return True, node
-
- # recursively add frequency
- def addFreq(self, node, candidate):
- if not node:
- return
- #print(node.frequency, 'add', candidate.frequency, 'in', node.ch)
- node.frequency += candidate.frequency
- node.ncompressed += 1
- self.addFreq(node.left, candidate.left)
- self.addFreq(node.right, candidate.right)
- self.addFreq(node.center, candidate.center)
-
- # remove duplicates suffixes starting from the longest one
- def removeDuplicates(self, node):
- global _NodeVisitCounter
- _NodeVisitCounter += 1
- if _NodeVisitCounter % 10000 == 0:
- print (" >>> (visiting: " +
- str(_NodeVisitCounter) + "/" + str(_NodeCounter) +
- ", removed: " + str(_NodeRemoveCounter) + ")")
-
- if node.left:
- # if the node already exists in the table
- # (checkAndRemoveDuplicate returns false),
- # its children were checked for duplicates already
- # avoid duplicate checking
- checkDeeper, node.left = self.checkAndRemoveDuplicate(node.left)
- if checkDeeper:
- self.removeDuplicates(node.left)
- if node.right:
- checkDeeper, node.right = self.checkAndRemoveDuplicate(node.right)
- if checkDeeper:
- self.removeDuplicates(node.right)
- if node.center:
- checkDeeper, node.center = self.checkAndRemoveDuplicate(node.center)
- if checkDeeper:
- self.removeDuplicates(node.center)
- return node
-
- def averageFrequencies(self):
- for hash in self.table:
- for candidate in self.table[hash]:
- candidate.frequency /= candidate.ncompressed
- del self.table
-
- # For debugging
- def printNode(self, node, level, path):
- print(' ' * level, path, node.ch, '(', \
- node.nextFreq.ch if node.nextFreq else '', ')', id(node), '(', \
- id(node.nextFreq) if node.nextFreq else 'None', ')', \
- node.frequency, '^')
-
- def printDAG(self, root):
- stack = []
- visited = []
- stack.append((root, 0, ''))
-
- while stack:
- node, level, path = stack.pop()
- if node in visited:
- self.printNode(node, level, path)
- continue
- visited.append(node)
-
- self.printNode(node, level, path)
-
- if node.right:
- stack.append((node.right, level + 1, 'R'))
- if node.left:
- stack.append((node.left, level + 1, 'L'))
- if node.center:
- stack.append((node.center, level + 1, '='))
-
- # traverse the tree using DFS to find all possible candidates
- # starting with the given prefix (for debugging)
- def predict(self, root, prefix, maxsuggestions):
- def addNextFreq(node, prefix):
- nonlocal candidates
-
- # Insert new node into candidates (sorted by frequency)
- i = len(candidates) - 1
- while i >= 0 and node.frequency > candidates[i][0]:
- i -= 1
-
- # Don't insert at the end if already have the required number of candidates
- if i == len(candidates) - 1 and len(candidates) >= maxsuggestions:
- return
-
- candidates.insert(i + 1, (node.frequency, node, prefix))
-
- def findPrefix(node, prefix):
- if not node: # not found
- return None
- if len(prefix) == 0:
- return node
- if prefix[0] < node.ch:
- return findPrefix(node.left, prefix)
- elif prefix[0] > node.ch:
- return findPrefix(node.right, prefix)
- else:
- return findPrefix(node.center, prefix[1:])
-
- node = findPrefix(root, prefix)
- if not node:
- return []
-
- # find the predictions
- candidates = [(node.frequency, node, prefix)]
- suggestions = []
-
- index = 0
- while len(candidates) > 0 and len(suggestions) < maxsuggestions:
- # Find the best candidate
- node = candidates[0][1]
- prefix = candidates[0][2]
- candidates.pop(0)
- while node.ch != _EndOfWord:
- if node.nextFreq: # Add the next best suggestion
- addNextFreq(node.nextFreq, prefix)
- prefix += node.ch
- node = node.center
- if node.nextFreq: # Add the next best suggestion
- addNextFreq(node.nextFreq, prefix)
- suggestions.append(prefix)
- #print(suggestions, end=' ')
- #for s in candidates:
- # print(s[0], s[2] + ',', end='')
- #print()
- index += 1
-
- print ("suggestions: " + str(len(suggestions)))
-
- return suggestions
-
-
-def writeInt16(output, int16):
- output.write(struct.pack("H", int16))
-
-def emitChild(output, verboseOutput, node, child, letter):
- offset = child.offset if child else 0
- writeInt16(output, offset & 0xFFFF)
- if verboseOutput:
- verboseOutput.write(", " + letter + ": " + str(offset))
- return offset >> 16
-
-def emitNodes(output, verboseOutput, nodes):
- i = 0
- for node in nodes:
- writeInt16(output, ord(node.ch) if node.ch != _EndOfWord else 0)
- if verboseOutput:
- ch = node.ch if ord(node.ch) < 0x80 else 'U+' + hex(ord(node.ch))
- verboseOutput.write("["+ str(node.offset) +"] { ch: " + ch)
-
- #print("["+ str(node.offset) +"] { ch: " + ch + ' next:' +
- # (node.nextFreq.ch if node.nextFreq else ''))
- highbits = emitChild(output, verboseOutput, node, node.left, 'L')
- highbits = (highbits << 4) | emitChild(output, verboseOutput, node, node.center, 'C')
- highbits = (highbits << 4) | emitChild(output, verboseOutput, node, node.right, 'R')
- highbits = (highbits << 4) | emitChild(output, verboseOutput, node, node.nextFreq, 'N')
- writeInt16(output, highbits)
- if verboseOutput:
- verboseOutput.write(", h: " + str(highbits))
- writeInt16(output, round(node.frequency))
- if verboseOutput:
- verboseOutput.write(", f: " + str(round(node.frequency)))
- verboseOutput.write("}\n")
-
- i += 1
- if i % 10000 == 0:
- print(" >>> (emitting " + str(i) + "/" + str(len(nodes)) + ")")
-
-
-# emit the tree BFS
-def sortTST(root):
-
- global _EmitCounter
- queue = []
- visited = {}
- output = []
- queue.append(root)
-
- while queue:
- node = queue.pop(0)
- if node in visited:
- continue
- visited[node] = True
- output.append(node)
- node.offset = len(output)
-
- _EmitCounter += 1
- if _EmitCounter % 10000 == 0:
- print(" >>> (sorting " + str(_EmitCounter) + "/" +
- str(_NodeCounter - _NodeRemoveCounter) + ")")
-
- if node.left:
- queue.append(node.left)
- if node.center:
- queue.append(node.center)
- if node.right:
- queue.append(node.right)
-
- return output
-
-# Parse command line arguments.
-#
-# Syntax: python xml2dict.py [-v] -o output-file input-file
-#
-use = "Usage: %prog [options] dictionary.xml"
-parser = OptionParser(usage = use)
-parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Set mode to verbose.")
-parser.add_option("-o", "--output", dest="output", metavar="FILE", help="write output to FILE")
-options, args = parser.parse_args()
-
-# We expect the dictionary name to be present on the command line.
-if len(args) < 1:
- print("Missing dictionary name.")
- exit(-1)
-if options.output == None:
- print("Missing output file.")
- exit(-1)
-
-# print some status statements to the console
-print ("[0/8] Creating dictionary ... (this might take a long time)" )
-print ("[1/8] Reading XML wordlist and creating TST ..." )
-
-_WordCounter = 0
-
-def start_element(name, attrs):
- global lastName, lastFreq, lastFlags, lastWord
- lastName = name
- lastFlags = ""
- if "flags" in attrs:
- lastFlags = attrs["flags"]
- lastFreq = -1
- if "f" in attrs:
- lastFreq = int(attrs["f"])
- if lastName == 'w':
- lastWord = ''
-
-def char_data(text):
- global lastWord
- if lastName == 'w':
- lastWord += text
-
-def end_element(name):
- global tstRoot, _WordCounter
- if name != 'w' or lastName != 'w' or \
- lastFlags == "abbreviation" or \
- lastFreq <= 1:
- return
- tstRoot = tree.insert(tstRoot, lastWord + _EndOfWord, lastFreq)
- _WordCounter += 1
- if _WordCounter % 10000 == 0:
- print(" >>> (" + str(_WordCounter) + " words read)")
-
-tstRoot = None
-tree = TSTTree()
-
-# Parse the XML input file and build the trie.
-p = expat.ParserCreate()
-p.StartElementHandler = start_element
-p.CharacterDataHandler = char_data
-p.EndElementHandler = end_element
-p.ParseFile(open(args[0], 'rb'))
-
-print ("[2/8] Balancing Ternary Search Tree ...")
-tstRoot = tree.balance(tstRoot)
-
-#tree.printDAG(tstRoot)
-
-print ("[3/8] Calculating hash for nodes ...")
-tree.calculateHash(tstRoot)
-print ("[4/8] Compressing TST to DAG ... (removing duplicate nodes)")
-tstRoot = tree.removeDuplicates(tstRoot)
-
-print ("[5/8] Average the frequencies")
-tree.averageFrequencies()
-
-print ("[6/8] Sorting TST ... (" +
- str(_NodeCounter) + " - " + str(_NodeRemoveCounter) + " = " +
- str(_NodeCounter - _NodeRemoveCounter) + " nodes).")
-
-nodes = sortTST(tstRoot)
-
-#tree.printDAG(tstRoot)
-
-print ("[7/8] Emitting TST ...")
-
-verboseOutput = None
-if options.verbose:
- verboseOutput = open(options.output + ".tst", "w")
-
-output = open(options.output, "wb")
-emitNodes(output, verboseOutput, nodes)
-output.close()
-
-if verboseOutput:
- verboseOutput.close()
-
-print ("[8/8] Successfully created Dictionary")
-
-exit()
-
-# Tests the matching function
-# while True:
-# prefix = input()
-# if prefix == '':
-# break
-# suggestions = tree.predict(tstRoot, prefix, 10)
-# print(suggestions)
View
125 apps/keyboard/js/imes/latin/latin.js
@@ -19,6 +19,26 @@
* properties of the input element that has the focus. If inputmode is
* "verbatim" then the input method does not modify the user's input in any
* way. See getInputMode() for a description of input modes.
+ *
+ * TODO:
+ *
+ * when deciding whether to autocorrect, if the first 2 choices are
+ * a prefix of one another, then consider the ratio of 1st to 3rd instead
+ * of 1st to second possibly? If there are different forms of the same word
+ * and that word is the most likely, then substitute it?
+ *
+ * add a per-language settings-based list of customizable corrections?
+ *
+ * Display an X icon in the suggestions line to give the user a way
+ * to dismiss an autocorrection? (Easier than space, backspace, space).
+ *
+ * Display a + icon in the suggestions line to give the user a way to
+ * add the current input to a personal word list so it doesn't get
+ * auto-corrected?
+ *
+ * Use color somehow to indicate that a word is properly spelled?
+ *
+ * Make the phone vibrate when it makes an automatic correction?
*/
(function() {
// Register ourselves in the keyboard's set of input methods
@@ -53,6 +73,7 @@
var selection; // The end of the selection, if there is one, or 0
var lastSpaceTimestamp; // If the last key was a space, this is the timestamp
var layoutParams; // Parameters passed to setLayoutParams
+ var nearbyKeyMap; // Map keys to nearby keys
var idleTimer; // Used by deactivate
var suggestionsTimer; // Used by updateSuggestions;
var autoCorrection; // Correction to make if next input is space
@@ -81,6 +102,7 @@
const SEMICOLON = 59;
const WS = /^\s+$/; // all whitespace characters
+ const PUNC = /^[.,?!;:]+$/; // punctuation
const DOUBLE_SPACE_TIME = 700; // ms between spaces to convert to ". "
@@ -184,18 +206,20 @@
if (!worker) {
// If we haven't created the worker before, do it now
worker = new Worker('js/imes/latin/worker.js');
- if (layoutParams)
- worker.postMessage({ cmd: 'setLayout', args: [layoutParams]});
+ if (layoutParams && nearbyKeyMap)
+ worker.postMessage({ cmd: 'setNearbyKeys', args: [nearbyKeyMap]});
worker.onmessage = function(e) {
switch (e.data.cmd) {
case 'log':
- console.log.apply(console, e.data.message);
+ console.log(e.data.message);
break;
- case 'unknownLanguage':
- console.error('No dictionary for language', e.data.language);
+ case 'error':
+ console.error(e.data.message);
break;
case 'predictions':
+ // The worker is suggesting words. If the input is a word, it
+ // will be first.
handleSuggestions(e.data.input, e.data.suggestions);
break;
}
@@ -537,6 +561,8 @@
// Make sure the user also has their actual input as a choice
// XXX: should this be highlighted in some special way?
// XXX: or should we just have a x icon to dismiss the autocorrection?
+ if (suggestions.length === 3)
+ suggestions.pop();
suggestions.push(input);
// Mark the auto-correction so the renderer can highlight it
suggestions[0] = '*' + suggestions[0];
@@ -587,14 +613,84 @@
function setLayoutParams(params) {
layoutParams = params;
+ // XXX We call nearbyKeys() every time the keyboard pops up.
+ // Maybe it would be better to compute it once in keyboard.js and
+ // cache it.
+ nearbyKeyMap = nearbyKeys(params);
if (worker)
- worker.postMessage({ cmd: 'setLayout', args: [params]});
+ worker.postMessage({ cmd: 'setNearbyKeys', args: [nearbyKeyMap]});
+ }
+
+ function nearbyKeys(layout) {
+ var nearbyKeys = {};
+ var keys = layout.keyArray;
+ var keysize = Math.min(layout.keyWidth, layout.keyHeight) * 1.2;
+ var threshold = keysize * keysize;
+
+ // For each key, calculate the keys nearby.
+ for (var n = 0; n < keys.length; ++n) {
+ var key1 = keys[n];
+ if (SpecialKey(key1))
+ continue;
+ var nearby = {};
+ for (var m = 0; m < keys.length; ++m) {
+ if (m === n)
+ continue; // don't compare a key to itself
+ var key2 = keys[m];
+ if (SpecialKey(key2))
+ continue;
+ var d = distance(key1, key2);
+ if (d !== 0)
+ nearby[key2.code] = d;
+ }
+ nearbyKeys[key1.code] = nearby;
+ }
+
+ return nearbyKeys;
+
+ // Compute the inverse square distance between the center point of
+ // two keys, using the radius of the key (where radius is defined
+ // as the distance from the center of key1 to a corner of key1)
+ // as the unit of measure. If the distance is greater than 2.5
+ // times the radius return 0 instead.
+ function distance(key1, key2) {
+ var cx1 = key1.x + key1.width / 2;
+ var cy1 = key1.y + key1.height / 2;
+ var cx2 = key2.x + key2.width / 2;
+ var cy2 = key2.y + key2.height / 2;
+ var radius = Math.sqrt(key1.width * key1.width / 4 +
+ key1.height * key1.height / 4);
+
+ var dx = (cx1 - cx2) / radius;
+ var dy = (cy1 - cy2) / radius;
+ var distanceSquared = dx * dx + dy * dy;
+ if (distanceSquared > 2.5 * 2.5)
+ return 0;
+ else
+ return 1 / distanceSquared;
+ }
+
+ // Determine whether the key is a special character or a regular letter.
+ // Special characters include backspace (8), return (13), and space (32).
+ function SpecialKey(key) {
+ switch (key.code) {
+ case 0:
+ case KeyEvent.DOM_VK_BACK_SPACE:
+ case KeyEvent.DOM_VK_CAPS_LOCK:
+ case KeyEvent.DOM_VK_RETURN:
+ case KeyEvent.DOM_VK_ALT:
+ case KeyEvent.DOM_VK_SPACE:
+ return true;
+ default: // anything else is not special
+ return false;
+ }
+ }
}
function updateSuggestions(repeat) {
// If the user hasn't enabled suggestions, or if they're not appropriate
// for this input type, or are turned off by the input mode, do nothing
- if (!suggesting && ! correcting)
+ if (!suggesting && !correcting)
return;
// If we deferred suggestions because of a key repeat, clear that timer
@@ -603,6 +699,7 @@
suggestionsTimer = null;
}
+ // If we're still repeating, reset the repeat timer.
if (repeat) {
suggestionsTimer = setTimeout(updateSuggestions, autorepeatDelay);
return;
@@ -689,14 +786,18 @@
if (cursor < inputText.length && !WS.test(inputText[cursor]))
return false;
- // We're at the end of a word if the cursor is not at the start and
- // the character before the cursor is not whitespace
- return cursor > 0 && !WS.test(inputText[cursor - 1]);
+ // If the cursor is at position 0 then we're not at the end of a word
+ if (cursor <= 0)
+ return false;
+
+ // We're at the end of a word if the character before the cursor is
+ // not whitespace or punctuation
+ var c = inputText[cursor - 1];
+ return !WS.test(c) && !PUNC.test(c);
}
- // Get the word before the cursor
+ // Get the word before the cursor. Assumes that atWordEnd() is true
function wordBeforeCursor() {
- // Otherwise, find the word we're at the end of and ask for completions
for (var firstletter = cursor - 1; firstletter >= 0; firstletter--) {
if (WS.test(inputText[firstletter])) {
break;
View
1,204 apps/keyboard/js/imes/latin/predictions.js
@@ -1,260 +1,214 @@
/* -*- Mode: js; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- /
/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */
-
-// JavaScript predictive text engine.
-//
-// A note on the dictionary format: The prediction engine uses a custom binary
-// dictionary format that is generated by xml2dict.py from a XML-based word
-// lists. The word lists included with this engine are minimally modified
-// versions of the word lists that ship with Android Jelly Bean (4.1). The
-// lists are licensed under the Apache license, as is this engine.
//
-// Consult xml2dict.py to understand the format of the dictionary file. The
-// purpose of the dictionary file is to compactly represent the ternary
-// search tree.
-// We use the binary representation of the tst instead of building a tst
-// out of JS objects because JS objects tend to occupy much
-// more memory than the binary format xml2dict.py generates.
+// This is a JavaScript predictive text engine: given a dictionary, a data
+// structure that specifies which keys are near which other keys, and
+// a string of user input, it guesses what the user meant to type or (if
+// it cannot find a word loosely matching the input) what the user is planning
+// to type.
//
// This module defines a single global variable Predictions which is an
// object with the following methods:
//
// setDictionary: specifies the dictionary to use
//
-// setLayout: specifies the keyboard layout, which is used to
-// determine the set of nearby keys for each key
+// setNearbyKeys: specifies which keys are near which other keys
//
-// predict: given an input string, return the most likely
-// completions or corrections for it.
+// predict: given an input string, asynchronously find corrections or
+// predictions and pass them to a specified callback.
//
+// The word prediction / auto-correction algorithm works by finding loose
+// matches for the user's input. The hard part of this process is getting
+// the "loose" part right. For each character in the user's input, we'll
+// match variant forms of the character to handle case differences and
+// diacritics. In languages that use apostrophes and hyphens, we'll match
+// those at any point, even if the user omits them. Because we've been
+// passed a data structure that tells us which keys are near each other (and
+// how near they are) we use that information to correct typing errors.
+// Other loose matching techniques we try include transpositions, insertions
+// and deletions.
//
-// Description of the algorithm:
+// During this loose matching process, we associate a weight with each
+// candidate we're considering. This weight is based on the word frequency
+// information from the dictionary file, but it is reduced when we make
+// corrections to the user's input, to reflect our decreased confidence in
+// the suggestion. Substituting variant forms and inserting apostrophes does
+// not reduce the weight by much at all. Substituting a nearby key reduces
+// the weight moderately (depending on how near the keys are), and
+// insertions and deletions and transpositions reduce the weight
+// significantly.
//
-// We use a precompiled dictionary that is loaded into a typed Array (_dict).
-// The underlying data structure is a ternary search tree (TST) which also uses
-// a direct acyclic graph to compress suffixes (we call this Ternary DAGs).
-// see http://www.strchr.com/ternary_dags for further details on TDAGs.
+// The core part of the matching algorithm is in the process() function
+// nested inside the predict() method. To fully understand it, however, you
+// also need to understand the weighted Ternary Search Tree data structure
+// used to represent the dictionary.
//
-// Every Node in the TDAG uses this format:
+// Each node of the tree represents a single character, and is part of
+// an ordinary binary tree. Nodes representing characters that are less
+// than the current node are found by following the left pointer, and nodes
+// that repesent characters that are greater than the character in the
+// current node are found by following the right pointer.
//
-// Node {
-// int16 ch; // character
-// int16 lPtr; // left child
-// int16 cPtr; // center child
-// int16 rPtr; // right child
-// int16 nPtr; // next child, holds a pointer to the node
-// // with the next highest frequency after the
-// // the frequency in the current node.
-// int16 high; // holds an overflow byte for lPtr, cPtr, rPtr, nPtr
-// // which keeps nodes as small as possible.
-// int16 frequency; // frequency from the XML file, or
-// // average of compressed/combined nodes.
-// };
+// Our data structure is called a ternary tree because in addition to these
+// left and right pointers, each node also has a center pointer that points
+// to the subtree of nodes that represent the next character of the
+// word. You can think of all the nodes found by recursively traversing
+// left and right from the current node as being on the same level, and the
+// node found by traversing to the center pointer as being on the next
+// level.
//
-// The algorithm operates in two stages:
+// Ternary trees are good for representing dictionaries and checking
+// whether a string appears in the dictionary. In order to make
+// predictions, however, our dictionary also contains word frequency
+// information, and we use it to predict more frequently used words instead
+// of less frequently used words. So instead of using a generic ternary
+// search tree, we use a weighted ternary search tree. In addition to the
+// left, center, and right pointers, each node also has a next pointer that
+// points to the node that represents the next most likely letter at the
+// current position. In effect, all of the nodes that are on the same level
+// form a linked list sorted from most likely to least likely.
//
-// First, we permute the user input (prefix) by
-// * inserting a character;
-// following direct successors of the current node.
-// * deleting a character;
-// we skip one character in the prefix and try to
-// find direct successor nodes with the next character
-// following the skipped one.
-// * replacing characters with surrounding key-characters;
-// if the character in the successor is a neighbouring
-// key of the current key, we also follow this path.
-// * transposing characters.
-// we swap neighboring characters in the prefix and try to
-// find successor nodes in the TST.
+// To make this linked list work, the tree uses non-traditional
+// balancing. Instead of balancing the binary tree so that there are
+// approximately equal numbers of nodes on the left and the right, the tree
+// is balanced so that when you follow the center pointer from one level to
+// the next level, the node represents the most likely letter on that
+// level. Each center pointer points to the head of a linked list.
//
-// This user input permutation is done while traversing the TDAG trying
-// to find possible candidates. Note, that we multiply the frequency
-// only for exact matches in TDAG. In other words, if there is a word
-// in the TDAG that starts with that prefix, the user most probably
-// has not mistapped it. Therefore we need to boost this candidate.
-// All other permutations are treated equally. We do not rank
-// candidates differently based on the detected error.
+// In addition to holding a character, each node of the tree also stores a
+// frequency value. For each node, this number is the frequency of the most
+// frequent word beneath that node. If you recursively follow the center
+// pointer of a node, you'll find that most frequent word.
//
-// For example, the user taps 's' on the keyboard.
-// Therfore, we add the prefix for 's', with a pointer to the next
-// best candidate ('h' following the 's') to the array of
-// candidates (which will later predict 'she', and then 'such',
-// 'some', ...). This insertion into the sorted candidates array is
-// based on the frequency of the this node.
-// In this case, for example, we would also add 'a' to this array of
-// candidates because 'a' is a surrounding key of 's' and there is a
-// word that starts with that prefix in the TDAG (e.g. 'and').
-// This array is sorted, so that the highest ranked node is found
-// at index 0.
+// It turns out that our matching algorithm does not actually use the left
+// and right pointers of the tree at all, so they are omitted from the
+// dictionary to dramatically reduce its file size. At any given "level" of
+// the tree, we find the possible letters by following the next pointer. And
+// at any given node, we move to the next level by following the center
+// pointer.
//
-// Second, the function 'predictSuffixes' iterates that array of
-// candidates and follows the center pointers (cPtr).
-// The TST is a balanced binary search tree with one exception.
-// The node with the highest frequency is assigned to the center
-// pointer (cPtr). This means, that following the cPtr we always
-// find the word with the highest frequency starting with that
-// prefix.
-// So the center pointer of 's' points to 'h' ('she' ranked 170)
-// The nPtr in the node of that 'h' (prefix 's') points to
-// 'u' ('such' ranked also 170), and so an. While following
-// the cPtr we keep adding candidates to the candidates array.
-// Once we reach the end of a word (node.ch == 0) we take out
-// the next best candidate from the sorted candidates array.
-// Since we add candidates while following the cPtr we now
-// might find a new, better ranked candidate at index 0 in
-// the sorted array. We can see this nPtr as a kind of linked list.
-// Using this linked list we can prune whole subtress which favors
-// lookup speed.
+// You can read more about the data structure in the script that generates the
+// dictionary files:
//
-// Again, a character euqals to 0 (node.ch == 0) indicates we have
-// reached the end of a word in the tree. The frequency associated
-// with this node is the frequency of that word. Note that, we
-// compress suffixes where the character in the node machtes, but
-// not necessarily the frequency, therefore we average the frequency
-// of all compressed suffix nodes which are combined into such a
-// suffix node. Even though this seems to be not accurate and might
-// cause mispredictions, we highlight the fact that commonly shorter
-// input words (prefixes) are not compressed, which means that the
-// correct frequency is still stored in that node. Once the input
-// words get longer and longer, we have allready narrowed the search
-// space for that prefix, so that the averaging of frequencies
-// in compressed nodes hopefully does not cause mispredictions.
+// gaia/dictionaries/xml2dict.py.
//
-// Once the algorithm reaches the maximum number of requested
-// suggestions (_maxSuggestions), we return that array of possible
-// alternatives which are displayed for the user.
+// Also see the following online resources which include helpful diagrams:
//
-// A simplyfied example to demonstrate the use of the nPtr:
+// http://en.wikipedia.org/wiki/Ternary_search_tree
+// http://www.strchr.com/ternary_dags
+// http://www.strchr.com/dawg_predictive
//
+// Note that this implementation does not convert the TST into a DAG to
+// share common suffix nodes as described in the strchr.com blog
+// posts. (That is an optimization that may be possible later: the shared
+// suffix nodes can't hold correct word frequencies, so the search
+// algorithm would have to be modified to carry the correct frequency
+// through while processing a shared suffix.)
//
-// -------------
-// | ch: 't' |
-// | cPtr: 'h' |
-// | nPtr: 's' | <-!!!
-// | freq: 222 |
-// -------------
-// / | \
-// / | \
-// / | \
-// ------------- ------------- -------------
-// | ch: 'k' | | ch: 'h' | | ch: 'u' |
-// | cPtr: *** | | cPtr: *** | | cPtr: *** |
-// | nPtr: *** | | nPtr: *** | | nPtr: *** |
-// | freq: 160 | | freq: 222 | | freq: *** |
-// ------------- ------------- -------------
-// / | \ / | \ / | \
-// * * \ * * * * * *
-// \
-// -------------
-// | ch: 's' |
-// | cPtr: 'h' |
-// | nPtr: *** |
-// | freq: 170 |
-// -------------
-// / | \
-// * | *
-// |
-// -------------
-// | ch: 'h' |
-// | cPtr: 'e' |
-// | nPtr: 'u' | <-!!!
-// | freq: 170 |
-// -------------
-// / | \
-// |*|
-// \
-// -------------
-// | ch: 'u' |
-// | cPtr: 'c' |
-// | nPtr: *** |
-// | freq: 170 |
-// -------------
+// TODO:
+//
+// Add profanity to the dictionary and then modify the logic here so we
+// never suggest profanity but never auto-correct it, either.
+//
+// Also have to figure out if something is going wrong with Polish.
+// When I type an unaccented character, I'm not confident that I'm
+// getting predictions that include accented versions of that character.
+//
+// Write tests!
//
-// The root node is 't' which cPtr points to 'h' which cPtr points
-// to 'e' ('the'). The lPtr of 't' points to 'k' (binary tree), but
-// the nPtr of 't' points to 's'. The cPtr of 's' points to 'h'
-// which cPtr points to 'e' ('she'). The nPtr of the node with ch 'h'
-// points to 'u', because the next highest word in the dictionary
-// is 'such'. This way, we can prune whole subtrees and take
-// shortcuts in the tree to the candidate with the next best frequency
-// after the current frequency.
-
'use strict';
var Predictions = function() {
- var _dict;
- var _nearbyKeys; // nearby keys for any given key
- var _maxSuggestions = 3; // max number of suggestions to be returned
- var _nodeSize = 7; // the size of a node is 7 * 2 bytes
- var _suggestions = []; // the found suggestions
- var _candidates = [];
- var _suggestions_index = {}; // indexed by suggestion (to remove duplicates)
- var _diacritics = {}; // mapping from accented to non-accented letters
-
- // Send a log message to the main thread since we can't output to the console
- // directly.
- function log(msg) {
- self.postMessage({ cmd: 'log', args: [msg] });
- }
+ const maxSuggestions = 3; // max number of suggestions to be returned
+ const maxCorrections = 1; // max number of corrections to the user's typing
+ const cacheSize = 255; // how many suggestions to remember
- // Calculate the squared distance of a point (x, y) to the nearest edge of
- // a rectangle (left, top, width, height). This is used to calculate the
- // nearby keys for every key. We search the dictionary by looking for words
- // where each character corresponds to the key the user touched, or a key
- // near that key.
- function SquaredDistanceToEdge(left, top, width, height, x, y) {
- var right = left + width;
- var bottom = top + height;
- var edgeX = x < left ? left : (x > right ? right : x);
- var edgeY = y < top ? top : (y > bottom ? bottom : y);
- var dx = x - edgeX;
- var dy = y - edgeY;
- return dx * dx + dy * dy;
- }
+ // While searching we maintain a priority queue of candidates we want
+ // to search further. These constants specify how many candidates we
+ // retain in that queue.
+ const maxCandidates = maxSuggestions * 8;
- // Determine whether the key is a special character or a regular letter.
- // Special characters include backspace (8), return (13), and space (32).
- function SpecialKey(key) {
- var code = key.code;
- // codes: 'a' = 97, 'z' = 122
- return code < 97 || code > 122;
- }
+ // Weights of various permutations we do when matching input
+ const variantFormMultiplier = .99; // slightly prefer exact match
+ const punctuationInsertionMultiplier = .95; // apostrophes are almost free
+ const nearbyKeyReplacementMultiplier = 1; // adjusted by actual distance
+ const transpositionMultiplier = .3;
+ const insertionMultiplier = .3;
+ const substitutionMultiplier = .2; // for keys that are not nearby
+ const deletionMultiplier = .1;
- function setDictionary(dict) {
- _dict = Uint16Array(dict);
- }
+ // If we can't find enough exact word matches for the user's input
+ // we have to expand some of the candidates we found into complete
+ // words. But we want words that are close in length to the user's
+ // input. The most frequent word beginning with r in the en.us wordlist is
+ // received, but we don't want that as a suggestion if the user just
+ // type r. We want things like red and run. So for each extra character
+ // we have to add, we multiply the weight by this amount.
+ const wordExtensionMultiplier = 0.4;
- function setLayout(params) {
- // For each key, calculate the keys nearby.
- var keyWidth = params.keyWidth;
- var keyHeight = params.keyHeight;
- var threshold = Math.min(keyWidth, keyHeight) * 1.2;
- var keyArray = params.keyArray;
- _nearbyKeys = {};
- threshold *= threshold;
- for (var n = 0; n < keyArray.length; ++n) {
- var key1 = keyArray[n];
- if (SpecialKey(key1))
- continue;
- var list = {};
- for (var m = 0; m < keyArray.length; ++m) {
- var key2 = keyArray[m];
- if (SpecialKey(key2))
- continue;
- if (SquaredDistanceToEdge(/* key dimensions */
- key1.x, key1.y,
- key1.width, key1.height,
- /* center of candidate key */
- key2.x + key2.width / 2,
- key2.y + key2.height / 2) <
- threshold) {
- list[String.fromCharCode(key2.code).toLowerCase()] = true;
- }
- }
- _nearbyKeys[String.fromCharCode(key1.code).toLowerCase()] = list;
+ // How many candidates do we consider before pausing with a setTimeout()?
+ // Smaller values make the prediction code more interruptible and
+ // possibly result in a more responsive UX. Larger values may reduce the
+ // total time required to get predictions
+ const candidatesPerBatch = 10;
+
+ var tree; // A typed array of bytes holding the dictionary tree
+ var maxWordLength; // We can reject any input longer than this
+ var characterTable = []; // Maps charcodes to frequency in dictionary
+ var variants = []; // Maps charcodes to variant forms
+ var rootform = []; // Maps charcodes to the root form
+ var nearbyKeys; // Maps charcodes to a set of codes of nearby keys
+ var cache; // Cache inputs to completions.
+
+ // This function is called to pass our dictionary to us as an ArrayBuffer.
+ function setDictionary(buffer) {
+ cache = new LRUCache(cacheSize); // Start with a new cache
+ var file = Uint8Array(buffer);
+
+ function uint32(offset) {
+ return (file[offset] << 24) +
+ (file[offset + 1] << 16) +
+ (file[offset + 2] << 8) +
+ file[offset + 3];
+ }
+
+ function uint16(offset) {
+ return (file[offset] << 8) +
+ file[offset + 1];
}
- // Fill the diacritics array
- var diacritics = {
+
+ if (uint32(0) !== 0x46784F53 || // "FxOS"
+ uint32(4) !== 0x44494354) // "DICT"
+ throw new Error('Invalid dictionary file');
+
+ if (uint32(8) !== 1)
+ throw new Error('Unknown dictionary version');
+
+ // Read the maximum word length.
+ // We add 1 because word predictions can delete characters, so the
+ // user could type one extra character and we might still predict it.
+ maxWordLength = file[12] + 1;
+
+ // Read the table of characters and their frequencies
+ var numEntries = uint16(13);
+ for (var i = 0; i < numEntries; i++) {
+ var offset = 15 + i * 6;
+ var ch = uint16(offset);
+ var count = uint32(offset + 2);
+ characterTable[ch] = count;
+ }
+
+ // The dictionary data begins right after the character table
+ tree = new Uint8Array(buffer, 15 + numEntries * 6);
+
+ // The rest of this function processes the character table to create a
+ // list of variant forms that we'll accept for each character in the
+ // dictionary. Variants cover case differences and unaccented forms of
+ // accented letters. Characters with no variants are considered word
+  // internal punctuation like apostrophes and hyphens.
+
+ // Map from lowercase ASCII to all known accented forms of the letter
+ var rootToAccentedForm = {
'a': 'ÁáĂăǍǎÂâÄäȦȧẠạȀȁÀàẢảȂȃĀāĄąÅåḀḁȺⱥÃãǼǽǢǣÆæ',
'b': 'ḂḃḄḅƁɓḆḇɃƀƂƃ',
'c': 'ĆćČčÇçĈĉĊċƇƈȻȼ',
@@ -282,174 +236,722 @@ var Predictions = function() {
'y': 'ÝýŶŷŸÿẎẏỴỵỲỳƳƴỶỷỾỿȲȳɎɏỸỹ',
'z': 'ŹźŽžẐẑⱫⱬŻżẒẓȤȥẔẕƵƶ'
};
- for (var letter in diacritics) {
- var s = diacritics[letter];
+
+ // The reverse mapping from accented forms to the normalized ASCII form
+ var accentedFormToRoot = {};
+ for (var letter in rootToAccentedForm) {
+ var s = rootToAccentedForm[letter];
for (var i = 0, len = s.length; i < len; i++)
- _diacritics[s[i]] = letter;
+ accentedFormToRoot[s[i]] = letter;
}
- }
- function readNode(offset, node) {
- if (offset == 0) {
- node.active = false;
- return;
- }
- offset = (offset - 1) * _nodeSize;
- var high = _dict[offset + 5];
-
- node.active = true;
- node.ch = _dict[offset];
- node.lPtr = _dict[offset + 1] | (high & 0xf000) << 4;
- node.cPtr = _dict[offset + 2] | (high & 0xf00) << 8;
- node.rPtr = _dict[offset + 3] | (high & 0xf0) << 12;
- node.nPtr = _dict[offset + 4] | (high & 0xf) << 16;
- node.freq = _dict[offset + 6];
- }
+    // Now go through all the characters that appear in the dictionary
+ // and figure out their variant forms.
+ for (var charcode in characterTable) {
- // Find the end of the prefix (exact match)
- function findExactly(offset, result, prefix) {
- var i = 0, prefixLen = prefix.length;
- var node = Object.create(null);
- readNode(offset, node);
- while (node.active) {
- if (prefixLen == i) {
- // The prefix was found; add the node to candidates
- addNextCandidate(offset, result + prefix, 1);
- return;
+ // Start off by with an empty set of variants for each character.
+ variants[charcode] = '';
+
+ // Handle upper and lowercase forms
+ var ch = String.fromCharCode(charcode);
+ var upper = ch.toUpperCase();
+ var lower = ch.toLowerCase();
+ if (upper !== ch) {
+ variants[charcode] += upper;
+        // If it is not upper case, it is probably the root form.
+ // If it is an accented character, we'll override this below
+ rootform[charcode] = charcode;
}
- var ch = prefix.charCodeAt(i);
- if (ch < node.ch)
- offset = node.lPtr;
- else if (ch > node.ch)
- offset = node.rPtr;
- else {
- i++;
- offset = node.cPtr;
+
+ if (lower !== ch) {
+ variants[charcode] += lower;
+ rootform[charcode] = lower.charCodeAt(0);
+ }
+
+ // Handle accented forms
+ if (accentedFormToRoot[ch]) {
+ var root = accentedFormToRoot[ch];
+ rootform[charcode] = root.charCodeAt(0);
+
+ // The root form and its uppercase version are variants we'll
+ // accept in user input instead of this accented character.
+ variants[charcode] += root + root.toUpperCase();
}
- readNode(offset, node);
+
+ // log("Variants for " + ch + " " + variants[charcode]);
+ // log("Root form of " + ch + " " +
+ // String.fromCharCode(rootform[charcode]))
}
}
- // Remove the distinction between lowercase/uppercase letters and diacritics
- function normalize(ch) {
- ch = ch.toLowerCase();
- if (ch in _diacritics)
- ch = _diacritics[ch];
- return ch;
+ // latin.js passes us a data structure that holds the inverse square
+ // distance between keys that are near each other on the keyboard.
+ // This method just stores it for use later. We use these values as
+ // weights for nearby character replacement. With typical FirefoxOS
+ // keyboard layouts, adjacent keys on the same row have a value of
+ // about .5, keys directly above or below each other have a value of
+ // about .25 and keys diagonally adjacent to each other have a value of
+ // about .16.
+ function setNearbyKeys(data) {
+ cache = new LRUCache(cacheSize); // Discard any cached results
+ nearbyKeys = data;
+ // log("Nearby Keys: " + JSON.stringify(data));
}
- function findFuzzy(offset, result, prefix) {
- var node = Object.create(null);
- readNode(offset, node);
- if (prefix.length == 0) {
- if (node.active) // Found the exact match
- // If the prefix matches, increase frequency of this candidate.
- // If the word is short, increase it greatly to filter out fuzzy
- // matches (e.g., "or" and "of" when typing "on"). If the word is long,
- // assume that user can mistype it (and allow fuzzy matches).
- addNextCandidate(offset, result, 1 + 3 / result.length);
- return;
+ //
+ // This function asynchronously computes word completions for the specified
+ // input string. When called, it immediately returns an object used for
+ // communicating with the caller and defers its computations with
+ // setTimeout.
+ //
+ // The returned object has an abort() method that, when called, will
+ // cause the predictions to be cancelled the next time it calls setTimeout()
+ // to pause.
+ //
+ // This method communicates with the caller by invoking the specified
+ // callback and onerror functions. If an exception occurs while predicting
+ // the onerror function is called with an error message.
+ //
+ // If no error occurs, then the callback function is called with an array
+ // argument. Each element of this array is also an array holding a word
+ // and a number. The word is a proposed completion or correction to the
+ // input word and the number is the weight that the prediction algorithm
+// assigns to that suggestion. Higher numbers mean better
+// suggestions. Suggestions may take tens of milliseconds to compute which
+// is why this method is designed to be asynchronous. Periodically during
+ // the search process, the code returns to the event loop with
+ // setTimeout(0) which gives other code time to run and potentially call
+ // the abort method
+ //
+ // Before calling this function you must call setDictionary() and
+ // setNearbyKeys() to provide the data it needs to make predictions.
+ //
+ function predict(input, callback, onerror) {
+ if (!tree || !nearbyKeys)
+ throw Error('not initialized');
+
+ // The search algorithm compares the user's input to the dictionary tree
+ // data structure and generates a set of candidates for each character.
+ // This variable will store the set of candidates we're evaluating as we
+ // do a breadth first search of the dictionary tree and allows us to
+ // pull out the best candidates first for further evaluation.
+ var candidates = new BoundedPriorityQueue(maxCandidates);
+
+ // This is where we store the best complete words we've found so far.
+ var words = new BoundedPriorityQueue(maxSuggestions);
+
+ // This is the object we return. It allows the caller to abort a
+ // prediction in progress.
+ var status = {
+ state: 'predicting',
+ abort: function() {
+ if (this.state !== 'done' && this.state !== 'aborted')
+ this.state = 'aborting';
+ }
+ };
+
+ // Start searching for words soon...
+ setTimeout(getWords);
+
+ // But first, return the status object to the caller.
+ return status;
+
+ // We use this to check whether the user aborted the search and to
+ // set the state property appropriately.
+ function aborted() {
+ if (status.state === 'aborting') {
+ status.state = 'aborted';
+ return true;
+ }
}
- if (!node.active)
- return;
- if (prefix.length == 1) // try to delete the last character
- addNextCandidate(offset, result, 1);
-
- // Try the fuzzy matches that are better (earlier in nPtr list)
- // that the exact match.
- while (node.active) {
- var nodeChar = String.fromCharCode(node.ch);
- if (normalize(nodeChar) == normalize(prefix[0])) {
- do {
- findFuzzy(node.cPtr, result + nodeChar, prefix.substr(1));
- readNode(node.nPtr, node);
- if (!node.active)
+ function getWords() {
+ try {
+ // Check the cache. If we've seen this input recently we can return
+ // suggestions right away.
+ var cached_suggestions = cache.get(input);
+ if (cached_suggestions) {
+ status.state = 'done';
+ status.suggestions = cached_suggestions;
+ callback(status.suggestions);
+ return;
+ }
+
+ // Check length and check for invalid characters. If the input is
+ // bad, we can reject it right away.
+ if (input.length > maxWordLength || !validChars(input)) {
+ status.state = 'done';
+ status.suggestions = [];
+ callback(status.suggestions);
+ return;
+ }
+
+ // Start off with a single root candidate. The first argument is the
+ // address of the root node of the tree
+ addCandidate(0, input, '', 1, 1, 0);
+
+ // And then process it. This will generate more candidates to
+ // process. processCandidates() runs until all the words we want
+        // have been found or until all possibilities have been tried. It
+ // returns to the event loop with setTimeout() so the search can be
+ // aborted, but arranges to resume. It calls the callback when done.
+ processCandidates();
+ }
+ catch (e) {
+ status.state = 'error';
+ status.error = e;
+ onerror(e.toString() + '\n' + e.stack);
+ }
+ }
+
+ // Check whether all the characters of s appear in the dictionary or are
+ // at least near characters that do. If we are passed a string that does
+ // not pass this test then there is no way we will be able to offer
+ // suggestions and it is not even worth searching.
+ function validChars(s) {
+ outer: for (var i = 0, n = s.length; i < n; i++) {
+ var c = s.charCodeAt(i);
+ if (characterTable.hasOwnProperty(c)) // character is valid
+ continue;
+ // If the character does not occur in this language, but there is
+ // a nearby key that does occur, then maybe it is okay
+ if (!nearbyKeys.hasOwnProperty(c))
+ return false; // no nearby keys, so no suggestions possible
+ var nearby = nearbyKeys[c];
+ for (c in nearby) {
+ if (characterTable.hasOwnProperty(c))
+ continue outer;
+ }
+ // no nearby keys are in the dictionary, so no suggestions possible
+ return false;
+ }
+
+ // All the characters of s are valid
+ return true;
+ }
+
+ // Add a candidate to the list of promising candidates if frequency *
+ // multiplier is high enough. A candidate is a pointer (byte offset) to
+ // a node in the tree, the portion of the user's input that has not yet
+ // been considered, the output string that has been generated so far, a
+ // number based on the highest frequency word that begins with the
+  // output we've generated, a multiplier that adjusts that frequency based
+ // on how much we've modified the user's input, and a number that
+ // indicates how many times we've already corrected the user's input for
+ // this candidate.
+ function addCandidate(pointer, remaining, output,
+ multiplier, frequency, corrections)
+ {
+ var weight = frequency * multiplier;
+
+ // If this candidate could never become a word, don't add it
+ if (weight <= words.threshold)
+ return;
+
+ candidates.add({
+ pointer: pointer,
+ input: remaining,
+ output: output,
+ multiplier: multiplier,
+ weight: weight,
+ corrections: corrections
+ }, weight);
+ }
+
+ // Add a word to the priority queue of words
+ function addWord(word, weight) {
+ // Make sure we don't already have the word in the queue
+ for (var i = 0, n = words.items.length; i < n; i++) {
+ if (words.items[i][0] === word) {
+ // If the version we already have has higher weight, skip this one
+ if (words.priorities[i] >= weight)
return;
- nodeChar = String.fromCharCode(node.ch);
- } while (normalize(nodeChar) == normalize(prefix[0]));
- // If there are enough candidates, finish the search without
- // viewing the fuzzy matches that are worse than the exact match.
- if (_candidates.length >= _maxSuggestions)
+ else // otherwise, remove the existing lower-weight copy
+ words.removeItemAt(i);
+ break;
+ }
+ }
+
+ words.add([word, weight], weight);
+ }
+
+ // Take the highest-ranked candidate from the list of candidates and
+ // process it. (This will often add more candidates to the list). After
+ // we've processed a batch of candidates this way, use setTimeout() to
+ // schedule the processing of the next batch after returning to the
+ // event loop. If there are no more candidates or if the highest ranked
+ // one is not highly ranked enough, then we're done finding words.
+ function processCandidates() {
+ try {
+ if (aborted())
return;
- continue;
+
+ for (var count = 0; count < candidatesPerBatch; count++) {
+ var candidate = candidates.remove();
+
+ // If there are no more candidates, or if the weight isn't
+ // high enough, we're done. Call the callback with the current
+ // set of words.
+ if (!candidate || candidate.weight <= words.threshold) {
+ status.state = 'done';
+ status.suggestions = words.items; // the array in the word queue
+ cache.add(input, status.suggestions);
+ callback(status.suggestions);
+ return;
+ }
+
+ process(candidate);
+
+ //
+ // If the predicted words don't seem right, uncomment these lines
+          // to see how the call to process() modifies the set of candidates
+ // at each step. The output is verbose, but with careful study it
+ // reveals what is going on in the algorithm.
+ //
+ // var s = "";
+ // for(var i = 0; i < candidates.items.length; i++) {
+ // s += candidates.priorities[i].toPrecision(2) + " "
+ // + candidates.items[i].output + " " +
+ // + candidates.items[i].multiplier.toPrecision(2) + ", ";
+ // }
+ // log(input + " Candidate " + candidate.output +
+ // " for " + candidate.input[0] + ": " + s);
+ }
+
+ // After processing one batch of candidates, use setTimeout to
+ // schedule another invocation of this function. This returns to
+ // the event loop so we can process messages from the main thread
+ // and allows us to abort the search if more input arrives.
+ setTimeout(processCandidates);
+ }
+ catch (e) {
+ status.state = 'error';
+ status.error = e;
+ onerror(e.toString() + '\n' + e.stack);
}
- if (node.cPtr) {
- var res = result + nodeChar;
+ }
+
+ //
+ // This function is the heart of the dictionary search algorithm. The
+ // key to understanding it is that we do not traverse the dictionary
+ // tree as we would when looking for an exact match. Instead, at each
+ // level, we visit the nodes in frequency order, following the next
+ // pointer. If we find nodes that loosely match the first character of
+ // input, we use those nodes to generate new candidates for further
+ // evaluation later. Note that by maintaining a list of candidates like
+ // this, we're doing a breadth-first search rather than a depth-first
+ // search. (But since we weight the candidates, it is not a pure
+ // breadth-first search).
+ //
+ // The input candidate specifies a node in the dictionary tree, the next
+ // character of the user's input, and the output generated so far. This
+ // function uses the dictionary to loop through all possible characters
+ // that could appear after the current output, and considers those
+ // characters in most frequent to least frequent order. It compares each
+ // character to the next character of the user's input and generates new
+ // candidates based on that comparison.
+ //
+ // The candidate generation considers things such as accented characters
+ // from the dictionary, nearby keys from the keyboard layout and the
+ // possibility of user input errors such as transpositions and
+ // omissions.
+ //
+ function process(candidate) {
+ var remaining = candidate.input;
+ var output = candidate.output;
+ var multiplier = candidate.multiplier;
+ var corrections = candidate.corrections;
+ var node = {};
+
+ // The next character of the user's input
+ var char, code;
+ if (remaining.length > 0) {
+ char = remaining[0];
+ code = remaining.charCodeAt(0);
+ }
+
+ for (var next = candidate.pointer; next !== -1; next = node.next) {
+ readNode(next, node);
+
+ // How common is the most common word under this node?
+ var frequency = node.freq;
+ var weight = frequency * multiplier;
+
+ // If this node does not have a high enough weight to make it into
+ // the list of candidates, we don't need to continue. None of the
+ // nodes that follow in the next pointer linked list will have a
+ // higher weight than this one.
+ if (weight <= candidates.threshold)
+ break;
+
+ // If we generate new candidates from this node, this is what
+ // their output string will be
+ var newoutput = output + String.fromCharCode(node.ch);
- findExactly(node.cPtr, res, prefix); // insert a character
+ // The various ways we can generate new candidates from this node
+ // follow. Note that each one can have a different associated
+ // multiplier. And note that some are considered "corrections". To
+ // prevent explosive growth in the number of candidates we limit the
+ // number of corrections allowed on any candidate.
- // replace a character
- if (prefix[0] in _nearbyKeys && nodeChar in _nearbyKeys[prefix[0]]) {
- findExactly(node.cPtr, res, prefix.substr(1));
+ // If there isn't any more input from the user, then we'll try to
+ // extend the output we've already got to find a complete word. But
+ // we apply a penalty for each character we add so that shorter
+ // completions are favored over longer completions
+ if (remaining.length === 0) {
+ // If a word ends here, add it to the queue of words
+ if (node.ch === 0) {
+ addWord(output, weight);
+ continue;
+ }
+
+ // Otherwise, extend the candidate with the current node. We
+ // reduce the multiplier but do not count this as a correction so
+ // that we can extend candidates as far as needed to find words.
+ addCandidate(node.center,
+ remaining, // the empty string
+ newoutput,
+ multiplier * wordExtensionMultiplier,
+ frequency, corrections);
+
+ // If there isn't any more input then we don't want to consider
+ // any of the other possible corrections below.
+ continue;
+ }
+
+ // Handle the case where this node marks the end of a word.
+ if (node.ch === 0) {
+ // If there is just one more character of user input remaining,
+ // maybe the user accidentally typed an extra character at the
+ // end, so try just dropping the last character. Note that this
+ // case is unique in that instead of following the center pointer
+ // it revisits the same node just without the one remaining
+ // character of input.
+ if (remaining.length === 1) {
+ addCandidate(next, // stay at this same node
+ '', // no more remaining characters
+ output, // not newoutput
+ multiplier * deletionMultiplier,
+ frequency, corrections + 1);
+ }
+ continue;
+ }
+
+
+ // If we get to here, we know that we're still processing the user's
+ // input and that there is a character associated with this node.
+
+ // These next few cases are all in an if/else chain. Each of them
+ // match (or substitute) the character in the node with the next
+ // character of the user's input: they all add the same candidate,
+ // so it never makes sense for more than one of them to run. But
+ // note that it is possible for the match to happen more than one
+ // way, so we have to be sure to do the tests in highest to lowest
+ // multiplier order.
+
+ if (node.ch === code) {
+ // We found an exact match
+ addCandidate(node.center,
+ remaining.substring(1),
+ newoutput,
+ multiplier,
+ frequency, corrections);
+ }
+ else if (variants[node.ch].indexOf(char) !== -1) {
+ // The user's input is a variant form of the character in this
+ // node, so we'll accept that input as a substitute for the node
+ // character. This covers case differences and unaccented forms of
+ // accented characters. (We don't accept accented forms typed by
+ // the user as variants of unaccented characters in the
+ // dictionary, however.)
+ addCandidate(node.center,
+ remaining.substring(1),
+ newoutput,
+ multiplier * variantFormMultiplier,
+ frequency, corrections);
}
+ else if (corrections < maxCorrections) {
+ // If we haven't made any corrections on this candidate yet, try
+ // substituting the character from this node for the current
+ // character in the user's input. If the two keys are near each
+ // other on the keyboard, then we do this with higher weight than
+ // if they are distant.
+ var root = rootform[node.ch];
+ var rootcode = rootform[code];
+ var nearby = nearbyKeys[root] ? nearbyKeys[root][rootcode] : 0;
+ if (nearby) {
+ var adjust =
+ Math.max(nearby * nearbyKeyReplacementMultiplier,
+ substitutionMultiplier);
+ // If the node holds a character that is near the one the user
+ // typed, try it, assuming that the user has fat fingers and
+ // just missed the key. Note that we use a weight based on the
+ // distance between the keys. (Keys on the same row are
+ // generally closer together than keys above or below each
+ // other)
+ addCandidate(node.center,
+ remaining.substring(1),
+ newoutput,
+ multiplier * adjust,
+ frequency, corrections + 1);
+ }
+ else if (output.length > 0) {
+ // If it wasn't a nearby key, try substituting it anyway, but
+ // with a much lower weight. This handles the case where the
+ // user just doesn't know how to spell the word. We assume that
+ // the user knows the correct first letter of the word.
+ addCandidate(node.center,
+ remaining.substring(1),
+ newoutput,
+ multiplier * substitutionMultiplier,
+ frequency, corrections + 1);
+ }
+ }
+
+ // Now we try some other tests that generate different candidates
+ // than the above. These involve insertion, deletion or
+ // transposition. Note that to avoid exponential blow-up of the
+ // search space, we generally only allow maxCorrections (usually 1)
+ // correction per candidate.
- if (prefix.length > 1 && nodeChar == prefix[1]) {
+ // First, just try inserting this character. Maybe the user forgot
+ // to type it or omitted punctuation on purpose. If this character
+ // has no variants, then it is a punctuation character and we allow
+ // it to be inserted with a high multiplier and no correction
+ // penalty. If it is not word punctuation, then the insertion is
+ // more costly. Also: assume that the user got the first character
+ // correct and don't insert at position 0.
+ if (!variants[node.ch]) { // If it is a punctuation character
+ addCandidate(node.center,
+ remaining, // insertion, so no substring here
+ newoutput,
+ multiplier * punctuationInsertionMultiplier,
+ frequency, corrections);
+ }
+ else if (corrections < maxCorrections && output.length > 0) {
+ addCandidate(node.center,
+ remaining,
+ newoutput,
+ multiplier * insertionMultiplier,
+ frequency, corrections + 1);
+ }
+
+ // If there is more input after this character, and if this node of
+ // the tree matches the next character of the input, try deleting
+ // the current character and try transposing the two. But assume
+ // that the user got their first character correct and don't mess
+ // with that.
+ if (corrections < maxCorrections &&
+ remaining.length > 1 && output.length > 0 &&
+ (node.ch === remaining.charCodeAt(1) ||
+ variants[node.ch].indexOf(remaining[1]) !== -1))
+ {
// transpose
- findExactly(node.cPtr, res, prefix[0] + prefix.substr(2));
+ addCandidate(node.center,
+ remaining[0] + remaining.substring(2),
+ newoutput,
+ multiplier * transpositionMultiplier,
+ frequency, corrections + 1);
+
// delete
- findExactly(node.cPtr, res, prefix.substr(2));
+ addCandidate(node.center,
+ remaining.substring(2),
+ newoutput,
+ multiplier * deletionMultiplier,
+ frequency, corrections + 1);
}
}
- readNode(node.nPtr, node);
}
}
- function addNextCandidate(offset, prefix, multiplier) {
- var node = Object.create(null);
- readNode(offset, node);
- var i = _candidates.length - 1;
- // Find the insertion point
- var freq = node.freq * multiplier;
- while (i >= 0 && freq > _candidates[i].freq)
- i--;
- // Don't insert a candidate that is worse than already found candidates
- // if we already have the required number of candidates
- if (i == _candidates.length - 1 && _candidates.length >= _maxSuggestions)
- return;
- _candidates.splice(i + 1, 0, { node: node, prefix: prefix,
- multiplier: multiplier, freq: node.freq * multiplier });
+ //
+ // This function unpacks binary data from the dictionary and returns
+ // the nodes of the dictionary tree in expanded form as JS objects.
+ // See gaia/dictionaries/xml2dict.py for the corresponding code that
+ // serializes the nodes of the tree into this binary format. Full
+ // documentation of the binary format is in that file.
+ //
+ function readNode(offset, node) {
+ if (offset === -1) {
+ throw Error('Assertion error: followed invalid pointer');
+ }
+
+ var firstbyte = tree[offset++];
+ var haschar = firstbyte & 0x80;
+ var bigchar = firstbyte & 0x40;
+ var hasnext = firstbyte & 0x20;
+ node.freq = firstbyte & 0x1F;
+
+ if (haschar) {
+ node.ch = tree[offset++];
+ if (bigchar)
+ node.ch = (node.ch << 8) + tree[offset++];
+ }
+ else {
+ node.ch = 0;
+ }
+
+ if (hasnext) {
+ node.next =
+ (tree[offset++] << 16) +
+ (tree[offset++] << 8) +
+ tree[offset++];
+ }
+ else {
+ node.next = -1;
+ }
+
+ if (haschar)
+ node.center = offset;
+ else
+ node.center = -1;
+/*
+ log("readNode:" +
+ " haschar:" + haschar +
+ " bigchar:" + bigchar +
+ " hasnext:" + hasnext +
+ " freq:" + node.freq +
+ " char:" + node.ch +
+ " next:" + node.next +
+ " center:" + node.center);
+*/
}
- function predictSuffixes() {
- while (_candidates.length > 0 && _suggestions.length < _maxSuggestions) {
- var cand = _candidates.shift();
- var node = cand.node;
- var prefix = cand.prefix;
- for (;;) {
- if (node.nPtr) // Add the next best candidate
- addNextCandidate(node.nPtr, prefix, cand.multiplier);
+ //
+ // A priority queue with a maximum size.
+ //
+ // add() inserts an item at a position according to its priority. It
+ // returns true if the item was inserted or false if the item's priority
+ // was too low for a spot in the fixed-size queue.
+ //
+ // remove() removes and returns the highest priority item in the queue or
+ // null if there are no items
+ //
+ // threshold is 0 if the queue is not yet full. Otherwise it is the
+ // priority of the lowest-priority item in the queue. Items with
+  // priorities lower than this will never be added to the queue.
+ //
+ // items is the sorted array of items, with the highest priority item
+ // first.
+ //
+ function BoundedPriorityQueue(maxSize) {
+ this.maxSize = maxSize;
+ this.threshold = 0;
+ this.items = [];
+ this.priorities = [];
+ }
- if (node.ch == 0) // If the word ends here
- break;
+ BoundedPriorityQueue.prototype.add = function add(item, priority) {
+ // If the array is full we have to reject this item or make room for it
+ if (this.items.length === this.maxSize) {
+ if (priority <= this.threshold) { // Reject the item.
+ return false;
+ }
+ else { // Make room for it.
+ this.items.pop();
+ this.priorities.pop();
+ }
+ }
- // Move to the next character in the best candidate
- prefix += String.fromCharCode(node.ch);
- readNode(node.cPtr, node);
+ // Search to find the insertion point for this new item
+ var index;
+ if (this.priorities.length > 60) {
+ // Binary search only for relatively long arrays.
+ // See http://jsperf.com/linear-or-binary-search for perf data.
+ var start = 0, end = this.priorities.length;
+ while (start !== end) {
+ var mid = Math.floor((start + end) / 2);
+ if (priority > this.priorities[mid]) {
+ end = mid;
+ }
+ else {
+ start = mid + 1;
+ }
}
- // Record the suggestion and move to the next best candidate
- if (!(prefix in _suggestions_index)) {
- _suggestions.push([prefix, cand.freq]);
- _suggestions_index[prefix] = true;
+ index = start;
+ }
+ else {
+ // Linear search for small arrays
+ for (var i = 0, n = this.priorities.length; i < n; i++) {
+ if (priority > this.priorities[i])
+ break;
}
+ index = i;
}
- }
- function predict(prefix) {
- if (!_dict || !_nearbyKeys)
- throw Error('not initialized');
+ // Insert the new item at that position
+ this.items.splice(index, 0, item);
+ this.priorities.splice(index, 0, priority);
- _suggestions = [];
- _candidates = [];
- _suggestions_index = Object.create(null);
- findFuzzy(1, '', prefix);
- predictSuffixes();
- return _suggestions;
+ // Update the threshold
+ this.threshold = this.priorities[this.maxSize - 1] || 0;
+ };
+
+ BoundedPriorityQueue.prototype.remove = function remove() {
+ if (this.items.length === 0)
+ return null;
+ this.priorities.shift();
+ this.threshold = this.priorities[this.maxSize - 1] || 0;
+ return this.items.shift();
+ };
+
+ BoundedPriorityQueue.prototype.removeItemAt = function removeItemAt(index) {
+ this.priorities.splice(index, 1);
+ this.items.splice(index, 1);
+ this.threshold = this.priorities[this.maxSize - 1] || 0;
+ };
+
+ //
+ // A very simple Least Recently Used cache. It depends on the fact that the
+ // JavaScript for/in loop enumerates properties in order from least recently
+ // to most recently added. (Note that this does not work for properties
+ // that are numbers, however.)
+ //
+ function LRUCache(maxsize) {
+ this.maxsize = maxsize;
+ this.size = 0;
+ this.map = Object.create(null); // map keys to values
}
+ // Cache the key/value pair
+ LRUCache.prototype.add = function add(key, value) {
+ // If the key is already in the cache, adjust the size since we'll
+ // be incrementing below
+ if (key in this.map) {
+ this.size--;
+ }
+
+ // Now insert the item
+ this.map[key] = value;
+ this.size++;
+
+ // If the size is too big delete the first property returned by
+ // for/in. This should be the least recently used because the get()
+ // method deletes and reinserts
+ if (this.size > this.maxsize) {
+ for (var p in this.map) {
+ delete this.map[p];
+ break;
+ }
+ this.size--;
+ }
+ };
+
+ // Look for a cached value matching the specified key
+ LRUCache.prototype.get = function(key) {
+ if (key in this.map) { // If the key is in the cache
+ var value = this.map[key]; // Get the value
+ delete this.map[key]; // Delete the key/value mapping
+ this.map[key] = value; // And re-insert to make it most recent
+ return value; // Return the value
+ }
+ return null;
+ };
+
+ // This is the Predictions object that is the public API of this module.
return {
setDictionary: setDictionary,
- setLayout: setLayout,
+ setNearbyKeys: setNearbyKeys,
predict: predict
};
}();
View
50 apps/keyboard/js/imes/latin/worker.js
@@ -14,15 +14,18 @@
//
// setLanguage: the input method uses this message to tell the worker
// to load a dictionary for the prediction engine. If no such dictionary
-// is found the worker response with an "unknownLanguage" message.
+// is found or if the dictionary is invalid, the worker responds with
+// an "error" message.
//
-// setLayout: the input method uses this message to pass a new keyboard
-// layout to the prediction engine
+// setNearbyKeys: the input method uses this message to pass information about
+// what keys are near each other in the current keyboard layout
//
// predict: the input method uses this message to ask the prediction
// engine to suggest completions for (or corrections to) the specified
// string. The worker responds with a "predictions" message whose argument
-// is an array of up to 3 predicted words.
+// is an array of up to 3 predicted words. If the specified string
+// is a word in the dictionary, the worker may send a "word" message
+// first to quickly tell the keyboard that the word is valid.
//
'use strict';
@@ -45,6 +48,9 @@ function log(msg) {
// than we have to.
var currentLanguage;
+// The prediction that is currently running, if any. So that it can be cancelled
+var pendingPrediction;
+
var Commands = {
setLanguage: function setLanguage(language) {
if (language !== currentLanguage) {
@@ -66,26 +72,44 @@ var Commands = {
//
if (!xhr.response || xhr.response.byteLength === 0) {
log('error loading dictionary');
- postMessage({ cmd: 'unknownLanguage', language: language });
+ postMessage({ cmd: 'error', message: 'Unknown language: ' + language });
}
else {
- Predictions.setDictionary(xhr.response);
+ try {
+ Predictions.setDictionary(xhr.response);
+ }
+ catch (e) {
+ postMessage({ cmd: 'error', message: e.message + ': ' + dicturl});
+ }
}
}
},
- setLayout: function setLayout(layout) {
- Predictions.setLayout(layout);
+ setNearbyKeys: function setNearbyKeys(nearbyKeys) {
+ try {
+ Predictions.setNearbyKeys(nearbyKeys);
+ }
+ catch (e) {
+ postMessage({cmd: 'error',
+ message: 'Predictions.setNearbyKeys(): ' + e.message});
+ }
},
predict: function predict(prefix) {
- try {
- var words = Predictions.predict(prefix);
+ if (pendingPrediction) // Make sure we're not still running a previous one
+ pendingPrediction.abort();
+
+ // var start = Date.now();
+ pendingPrediction = Predictions.predict(prefix, success, error);
+
+ function success(words) {
+ // log('suggestions for: ' + prefix + ' ' + JSON.stringify(words) + ' ' +
+ // (Date.now() - start));
postMessage({ cmd: 'predictions', input: prefix, suggestions: words });
}
- catch (e) {
- log('Exception in predictions.js: ' + JSON.stringify(e));
- postMessage({cmd: 'predictions', input: prefix, suggestions: [] });
+
+ function error(msg) {
+ log('Error in Predictions.predict(): ' + msg);
}
}
};
View
7 ...board/js/imes/latin/dictionaries/Makefile → dictionaries/Makefile
@@ -1,5 +1,6 @@
-DICTIONARY_PATH =../../../../../../dictionaries
-
+# After building a dictionary here, move the resulting .dict file to
+# apps/keyboard/js/imes/latin/dictionaries/ and check it in.
+#
# Available:
# cs.dict, de.dict, en_gb.dict, en_us.dict, es.dict, fr.dict,
# hr.dict, it.dict, nb.dict, pt_pt.dict, pt_br.dict
@@ -12,8 +13,6 @@ WHITELIST = \
fr.dict \
es.dict
-VPATH = $(DICTIONARY_PATH)
-
%.dict: %_wordlist.xml xml2dict.py
python3 xml2dict.py -o $@ $<
View
532 dictionaries/xml2dict.py
<
@@ -0,0 +1,532 @@
+# -*- coding: utf-8 -*-
+"""
+
+This script reads an XML-formatted word list and produces a dictionary
+file used by the FirefoxOS virtual keyboard for word suggestions and
+auto corrections.
+
+The word lists come from the Android source: https://android.googlesource.com/platform/packages/inputmethods/LatinIME/+/master/dictionaries/
+
+This script currently depends on the XML format of the Android
+wordlists. (Eventually we might want to pre-process the XML files
+to a plain text format and simplify this script so that it will work
+with any plain-text word and frequency list)
+
+The sample.xml file from the Android repo looks like this:
+
+----------------------------------------------------------------------
+
+ <!-- This is a sample wordlist that can be converted to a binary
+ dictionary for use by the Latin IME. The format of the word
+ list is a flat list of word entries. Each entry has a frequency
+ between 255 and 0. Highest frequency words get more weight in
+ the prediction algorithm. As a special case, a weight of 0 is
+ taken to mean profanity - words that should not be considered a
+ typo, but that should never be suggested explicitly. You can
+ capitalize words that must always be capitalized, such as
+ "January". You can have a capitalized and a non-capitalized
+ word as separate entries, such as "robin" and "Robin". -->
+
+ <wordlist>
+ <w f="255">this</w>
+ <w f="255">is</w>
+ <w f="128">sample</w>
+ <w f="1">wordlist</w>
+ </wordlist>
+----------------------------------------------------------------------
+
+This script processes the word list and converts it to a Ternary
+Search Tree (TST), as described in
+gaia/apps/keyboard/js/imes/latin/predictions.js and also in
+
+ http://en.wikipedia.org/wiki/Ternary_search_tree
+ http://www.strchr.com/ternary_dags
+ http://www.strchr.com/dawg_predictive
+
+Note that the script does not convert the tree into a DAG (by sharing
+common word suffixes) because it cannot maintain separate frequency
+data for each word if the words share nodes.
+
+This script balances the TST such that at any node the
+highest-frequency word is found by following the center pointer. The
+script also overlays a linked list on top of the tree. At any node,
+the next most frequent word with the same parent node is found by
+following the next pointer.
+
+After building the TST data structure this script serializes it into a
+compact binary file with variable length nodes. The file begins with
+the 8 ASCII characters "FxOSDICT" and four more bytes. Bytes 8, 9 and
+10 are currently unused and byte 11 is a dictionary format version
+number, currently 1.
+
+Byte 12 of the file specifies the length of the longest word in the
+dictionary.
+
+After these first 13 bytes, the file contains a character table that
+lists the characters used in the dictionary. This table is a two-byte
+big-endian integer that specifies the number of entries in the
+table. Each table entry is a big-endian two-byte character code
+followed by a big-endian 4-byte number that specifies the number of
+times the character appears in the dictionary.
+
+After the character table (starting at byte 15 + num_entries*6), the
+file consists of serialized nodes. Each node is between 1 byte and 6
+bytes long, encoded as follows.
+
+The first byte of each node is always an 8-bit bitfield: csnfffff
+
+The c bit specifies whether this node represents a character. If c is
+1 then a character code follows this first byte. If c is 0 then this
+is a terminal node that marks the end of the word and it consists
+of this single byte by itself.
+
+The s bit specifies the size of the character associated with this
+node. If s is 0 the character is one byte long. If s is 1 the
+character is a big-endian two byte value.
+
+The n bit specifies whether this node includes a next pointer. If n is 1 then
+the character code is followed by a big-endian 3 byte number.
+
+The fffff bits represent a number between 0 and 31 and provide a
+weight for the node. This is usually based on the word frequency data
+from the dictionary, though this may be tuned by adjusting frequency
+depending on word length, for example. At any node, these frequency
+bits represent the weight of the highest frequency word under the
+node. (And, as described in the predictions.js file, the tree is
+balanced so that that highest frequency word is found by following the
+chain of center pointers.)
+
+If the c bit is set, the next one or two bytes (depending on the
+s bit) of the node are the Unicode character code that is stored in
+the node. Two-byte codes are big-endian.
+
+If the n bit was set, the next 3 bytes are a big-endian 24-bit
+unsigned integer offset to the start of the node pointed to by the
+next pointer.
+
+If the c bit was set the node has a character, and this means that it
+also has a center pointer. We serialize the tree so that a node's
+center pointer always points to the next node sequentially. So we
+never need to write the center pointer to the file: it is always the
+next node.
+
+"""
+