-
Notifications
You must be signed in to change notification settings - Fork 0
/
approx_lookup_table.py
102 lines (83 loc) · 3.14 KB
/
approx_lookup_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
This class implements fast approximate string search in a large lexicon by
building an n-gram lookup table. Shared n-grams between a query word and
lexicon words can be used as a proxy for edit distance. For example, this might
be used by a spell checker to find close matches in a dictionary.
This system is based on the paper "Finding Approximate Matches in Large
Lexicons" (Zobel & Dart, 1995).
- Colton Gyulay
"""
import time
EDGE_TOKEN = u"="
class ApproxLookupTable():
"""This class is responsible for building an n-gram lookup table for
searching for approximate matches in a provided lexicon.
"""
def __init__(self, words):
"""Build n-gram lookup table, and word->idx/idx->word tables for
smaller memory footprint.
Args:
words: a list of words to search against.
Returns:
a queryable lookup object.
"""
self.word2idx = {}
self.idx2word = {}
self.ngram_lookup = {}
print "building n-gram lookup table..."
start = time.clock()
idx = 0
for w in words:
if w not in self.word2idx:
self.word2idx[w] = idx
self.idx2word[idx] = w
idx += 1
ngrams = self._extract_ngrams(w)
for ngram in ngrams:
if ngram not in self.ngram_lookup:
self.ngram_lookup[ngram] = []
self.ngram_lookup[ngram].append(self.word2idx[w])
print "completed table in %s s." % (time.clock() - start)
def _extract_ngrams(self, word, ngram=3):
"""Extract all substrings of size n-gram from a word, using edge
padding (e.g., "John" -> ["=jo", "joh", "ohn", "hn="]).
Args:
word: a string from which to extract n-grams.
ngram: the n-gram length to use for matching.
Returns:
a list of n-grams contained in word.
"""
w = EDGE_TOKEN + word.lower() + EDGE_TOKEN
if len(w) < ngram + 1:
return [w]
subs = []
for i in xrange(len(w) - ngram + 1):
subs.append(w[i: i + ngram])
return subs
def query(self, word):
"""Find next closest word in lexicon based on shared n-grams.
Args:
word: a target string for which to find approximate matches.
Returns:
the closest word in the lexicon.
"""
return self.query_k(word, 1)[0]
def query_k(self, word, k=5):
"""Find k closest words in lexicon based on shared n-grams.
Args:
word: a target string for which to find approximate matches.
Returns:
the k closest words in the lexicon.
"""
common = {}
ngrams = self._extract_ngrams(word)
for ngram in ngrams:
shared = self.ngram_lookup.get(ngram) or []
for idx in shared:
if idx not in common:
common[idx] = 0
common[idx] += 1
if len(common) == 0:
return [word]
top_k = sorted(common.iteritems(), key=lambda x: -x[1])[:k]
return [self.idx2word[i] for (i, v) in top_k]