-
Notifications
You must be signed in to change notification settings - Fork 4
/
base.py
282 lines (242 loc) · 11.1 KB
/
base.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""FastPair: Data-structure for the dynamic closest-pair problem.
This data-structure is based on the observation that the conga line data
structure, in practice, does better the more subsets you give to it: even
though the worst case time for k subsets is O(nk log (n/k)), that worst case
seems much harder to reach than the nearest neighbor algorithm.
In the limit of arbitrarily many subsets, each new addition or point moved
by a deletion will be in a singleton subset, and the algorithm will
differ from nearest neighbors in only a couple of ways: (1) when we
create the initial data structure, we use a conga line rather than
all nearest neighbors, to keep the indegree of each point low, and
(2) when we insert a point, we don't bother updating other points'
neighbors.
Notes
-----
Total space: 20n bytes. (Could be reduced to 4n at some cost in update time.)
Time per insertion or single distance update: O(n)
Time per deletion or point update: O(n) expected, O(n^2) worst case
Time per closest pair: O(n)
References
----------
[1] Eppstein, David: Fast hierarchical clustering and other applications of
dynamic closest pairs. Journal of Experimental Algorithmics 5 (2000) 1.
"""
# Copyright (c) 2016, Carson J. Q. Farmer <carsonfarmer@gmail.com>
# Copyright (c) 2002-2015, David Eppstein
# Licensed under the MIT Licence (http://opensource.org/licenses/MIT).
from __future__ import print_function, division, absolute_import
from itertools import combinations, cycle
from operator import itemgetter
from collections import defaultdict
import scipy.spatial.distance as dist
__all__ = ["FastPair", "dist"]
class attrdict(dict):
"""Simple dict with support for accessing elements as attributes."""
def __init__(self, *args, **kwargs):
super(attrdict, self).__init__(*args, **kwargs)
self.__dict__ = self
class FastPair(object):
"""FastPair 'sketch' class.
"""
def __init__(self, min_points=10, dist=dist.euclidean):
"""Initialize an empty FastPair data-structure.
Parameters
----------
min_points : int, default=10
The minimum number of points to add before initializing the
data-structure. Queries _before_ `min_points` have been added to
the data-structure will be brute-force.
dist : function, default=scipy.spatial.distance.euclidean
Can be any Python function that returns a distance (float) between
between two vectors (tuples) `u` and `v`. Any distance function
from `scipy.spatial.distance` will do the trick. By default, the
Euclidean distance function is used. This function should play
nicely with the `merge` function.
"""
self.min_points = min_points
self.dist = dist
self.initialized = False # Has the data-structure been initialized?
self.neighbors = defaultdict(attrdict) # Dict of neighbor points and dists
self.points = list() # Internal point set; entries may be non-unique
def __add__(self, p):
"""Add a point and find its nearest neighbor.
There is some extra logic here to allow us to gradually build up the
FastPair data-structure until we reach `min_points`. Once `min_points`
has been reached, we initialize the data-structure and start to take
advantage of the FastPair efficiencies.
"""
self.points.append(p)
if self.initialized:
self._find_neighbor(p)
elif len(self) >= self.min_points:
self.build()
return self
def __sub__(self, p):
"""Remove a point and update neighbors."""
self.points.remove(p)
if self.initialized:
# We must update neighbors of points for which `p` had been nearest.
for q in self.points:
if self.neighbors[q].neigh == p:
res = self._find_neighbor(q)
return self
def __len__(self):
"""Number of points in data structure."""
return len(self.points)
def __call__(self):
"""Find closest pair by scanning list of nearest neighbors."""
return self.closest_pair()
def __contains__(self, p):
return p in self.points
def __iter__(self):
return iter(self.points)
def __getitem__(self, item):
if not item in self:
raise KeyError("{} not found".format(item))
return self.neighbors[item]
def __setitem__(self, item, value):
if not item in self:
raise KeyError("{} not found".format(item))
self._update_point(item, value)
def build(self, points=None):
"""Build a FastPairs data-structure from a set of (new) points.
Here we use a conga line rather than calling explicitly (re)building
the neighbors map multiple times as it is more efficient. This method
needs to be called _before_ querying the data-structure or adding/
removing any new points. Once it has been called, the internal
`initialized` flag will be set to True. Otherwise, simple brute-force
versions of queries and calculations will be used.
Parameters
----------
points : list of tuples/vectors, default=None
An optional list of point tuples to be added to the point set,
prior to computing the conga line and main FastPair data structure.
"""
if points is not None:
self.points += list(points)
np = len(self)
# Go through and find all neighbors, placing then in a conga line
for i in range(np-1):
# Find neighbor to p[0] to start
nbr = i + 1
nbd = float("inf")
for j in range(i+1, np):
d = self.dist(self.points[i], self.points[j])
if d < nbd:
nbr = j
nbd = d
# Add that edge, move nbr to points[i+1]
self.neighbors[self.points[i]].dist = nbd
self.neighbors[self.points[i]].neigh = self.points[nbr]
self.points[nbr] = self.points[i + 1]
self.points[i + 1] = self.neighbors[self.points[i]].neigh
# No more neighbors, terminate conga line.
# Last person on the line has no neigbors :(
self.neighbors[self.points[np-1]].neigh = self.points[np-1]
self.neighbors[self.points[np-1]].dist = float("inf")
self.initialized = True
return self
def closest_pair(self):
"""Find closest pair by scanning list of nearest neighbors.
If `npoints` is less than `min_points`, a brute-force version
of the closest pair algrithm is used.
"""
if len(self) < 2:
raise ValueError("Must have `npoints >= 2` to form a pair.")
elif not self.initialized:
return self.closest_pair_brute_force()
a = self.points[0] # Start with first point
d = self.neighbors[a].dist
for p in self.points:
if self.neighbors[p].dist < d:
a = p # Update `a` and distance `d`
d = self.neighbors[p].dist
b = self.neighbors[a].neigh
return d, (a, b)
def closest_pair_brute_force(self):
"""Find closest pair using brute-force algorithm."""
return _closest_pair_brute_force(self.points)
def sdist(self, p):
"""Compute distances from input to all other points in data-structure.
This returns an iterator over all other points and their distance
from the input point `p`. The resulting iterator returns tuples with
the first item giving the distance, and the second item giving in
neighbor point. The `min` of this iterator is essentially a brute-
force 'nearest-neighbor' calculation. To do this, supply `itemgetter`
(or a lambda version) as the `key` argument to `min`.
Examples
--------
>>> fp = FastPair().build(points)
>>> min(fp.sdist(point), key=itemgetter(0))
"""
return ((self.dist(a, b), b) for a, b in
zip(cycle([p]), self.points) if b != a)
def _find_neighbor(self, p):
"""Find and update nearest neighbor of a given point."""
# If no neighbors available, set flag for `_update_point` to find
if len(self) < 2:
self.neighbors[p].neigh = p
self.neighbors[p].dist = float("inf")
else:
# Find first point unequal to `p` itself
first_nbr = 0
if p == self.points[first_nbr]:
first_nbr = 1
self.neighbors[p].neigh = self.points[first_nbr]
self.neighbors[p].dist = self.dist(p, self.neighbors[p].neigh)
# Now test whether each other point is closer
for q in self.points[first_nbr+1:]:
if p != q:
d = self.dist(p, q)
if d < self.neighbors[p].dist:
self.neighbors[p].dist = d
self.neighbors[p].neigh = q
return dict(self.neighbors[p]) # Return plain ol' dict
def _update_point(self, old, new):
"""Update point location, neighbors, and distances.
All distances to point have changed, we need to recompute all aspects
of the data structure that may be affected. Note that although we
completely recompute the neighbors of the original point (`old`), we
don't explicitly rebuild the neighbors map, since that would double the
number of distance computations made by this routine. Also, like
deletion, we don't change any _other_ point's neighbor to the updated
point.
"""
# Out with the old, in with the new...
self.points.remove(old)
self.points.append(new)
if not self.initialized:
return new
del self.neighbors[old]
self.neighbors[new].neigh = new # Flag for not yet found any
self.neighbors[new].dist = float("inf")
for q in self.points:
if q != new:
d = self.dist(new, q)
if d < self.neighbors[new].dist:
self.neighbors[new].dist = d
self.neighbors[new].neigh = q
if self.neighbors[q].neigh == old:
if d > self.neighbors[q].dist:
self._find_neighbor(q)
else:
self.neighbors[q].neigh = new
self.neighbors[q].dist = d
return dict(self.neighbors[new])
# def merge_closest(self):
# dist, (a, b) = self.closest_pair()
# c = self.merge(a, b)
# self -= b
# return self._update_point(a, c)
def _closest_pair_brute_force(pts, dst=dist.euclidean):
"""Compute closest pair of points using brute-force algorithm.
Notes
-----
Computes all possible combinations of points and compares their distances.
This is _not_ efficient, nor scalable, but it provides a useful reference
for more efficient algorithms. This version is actually pretty fast due
to its use of fast Python builtins.
"""
return min((dst(p1, p2), (p1, p2)) for p1, p2 in combinations(pts, r=2))