-
Notifications
You must be signed in to change notification settings - Fork 250
/
dbscan.py
executable file
·204 lines (135 loc) · 7.41 KB
/
dbscan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
"""!
@brief Cluster analysis algorithm: DBSCAN.
@details Implementation based on article:
- M.Ester, H.Kriegel, J.Sander, X.Xiaowei. A density-based algorithm for discovering clusters in large spatial databases with noise. 1996.
@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2017
@copyright GNU Public License
@cond GNU_PUBLIC_LICENSE
PyClustering is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
PyClustering is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
@endcond
"""
from pyclustering.container.kdtree import kdtree;
from pyclustering.cluster.encoder import type_encoding;
import pyclustering.core.dbscan_wrapper as wrapper;
from pyclustering.utils import euclidean_distance_sqrt;
class dbscan:
"""!
@brief Class represents clustering algorithm DBSCAN.
Example:
@code
# sample for cluster analysis (represented by list)
sample = read_sample(path_to_sample);
# create object that uses CCORE for processing
dbscan_instance = dbscan(sample, 0.5, 3, True);
# cluster analysis
dbscan_instance.process();
# obtain results of clustering
clusters = dbscan_instance.get_clusters();
noise = dbscan_instance.get_noise();
@endcode
"""
def __init__(self, data, eps, neighbors, ccore = False):
"""!
@brief Constructor of clustering algorithm DBSCAN.
@param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
@param[in] eps (double): Connectivity radius between points, points may be connected if distance between them less then the radius.
@param[in] neighbors (uint): minimum number of shared neighbors that is required for establish links between points.
@param[in] ccore (bool): if True than DLL CCORE (C++ solution) will be used for solving the problem.
"""
self.__pointer_data = data;
self.__kdtree = None;
self.__eps = eps;
self.__sqrt_eps = eps * eps;
self.__neighbors = neighbors;
self.__visited = [False] * len(self.__pointer_data);
self.__belong = [False] * len(self.__pointer_data);
self.__clusters = [];
self.__noise = [];
self.__ccore = ccore;
def process(self):
"""!
@brief Performs cluster analysis in line with rules of DBSCAN algorithm.
@see get_clusters()
@see get_noise()
"""
if (self.__ccore is True):
(self.__clusters, self.__noise) = wrapper.dbscan(self.__pointer_data, self.__eps, self.__neighbors, True);
else:
self.__kdtree = kdtree(self.__pointer_data, range(len(self.__pointer_data)));
for i in range(0, len(self.__pointer_data)):
if (self.__visited[i] == False):
cluster = self.__expand_cluster(i); # Fast mode
if (cluster != None):
self.__clusters.append(cluster);
else:
self.__noise.append(i);
self.__belong[i] = True;
def get_clusters(self):
"""!
@brief Returns allocated clusters.
@remark Allocated clusters can be returned only after data processing (use method process()). Otherwise empty list is returned.
@return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
@see process()
@see get_noise()
"""
return self.__clusters;
def get_noise(self):
"""!
@brief Returns allocated noise.
@remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
@return (list) List of indexes that are marked as a noise.
@see process()
@see get_clusters()
"""
return self.__noise;
def get_cluster_encoding(self):
"""!
@brief Returns clustering result representation type that indicate how clusters are encoded.
@return (type_encoding) Clustering result representation.
@see get_clusters()
"""
return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
def __expand_cluster(self, point):
"""!
@brief Expands cluster from specified point in the input data space.
@param[in] point (list): Index of a point from the data.
@return (list) Return tuple of list of indexes that belong to the same cluster and list of points that are marked as noise: (cluster, noise), or None if nothing has been expanded.
"""
cluster = None;
self.__visited[point] = True;
neighbors = self.__neighbor_indexes(point);
if (len(neighbors) >=self.__neighbors):
cluster = [];
cluster.append(point);
self.__belong[point] = True;
for i in neighbors:
if (self.__visited[i] == False):
self.__visited[i] = True;
next_neighbors = self.__neighbor_indexes(i);
if (len(next_neighbors) >= self.__neighbors):
# if some node has less then minimal number of neighbors than we shouldn't look at them
# because maybe it's a noise.
neighbors += [k for k in next_neighbors if ( (k in neighbors) == False)];
if (self.__belong[i] == False):
cluster.append(i);
self.__belong[i] = True;
return cluster;
def __neighbor_indexes(self, index_point):
"""!
@brief Return list of indexes of neighbors of specified point for the data.
@param[in] index_point (list): An index of a point for which potential neighbors should be returned in line with connectivity radius.
@return (list) Return list of indexes of neighbors in line the connectivity radius.
"""
kdnodes = self.__kdtree.find_nearest_dist_nodes(self.__pointer_data[index_point], self.__eps);
return [node_tuple[1].payload for node_tuple in kdnodes if node_tuple[1].payload != index_point];
# return [i for i in range(0, len(self.__pointer_data)) if euclidean_distance_sqrt(self.__pointer_data[index_point], self.__pointer_data[i]) <= self.__sqrt_eps and (i != index_point) ]; # Fast mode