-
Notifications
You must be signed in to change notification settings - Fork 1
/
smeans.js
262 lines (210 loc) · 6.56 KB
/
smeans.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
'use strict';
function Smeans() { }
Smeans.cluster = cluster;
Smeans.distance = distance;
Smeans.centroid = centroid;
Smeans.closest = closest;
/**
* @ngdoc function
* @name cluster
* @param {Array} elements - An array of N-element arrays where each element is a number.
* @param {object} options - A set of options to pass to the clustering function
* @description
Cluster takes an array of number-arrays like so:
<pre>
smeans.cluster([
[0, 0],
[1, 1],
[20, 30],
[40, 2]
]);
</pre>
... and groups them into "clusters" according to Euclidean-distance similarity.
*/
function cluster(elements, opts) {
/*jshint -W004 */
if (!Array.isArray(elements)) {
throw new Error("Elements argument is not an array");
}
if (elements.length === 0) { return []; }
opts = opts ? opts : {
k: null,
clusters: null
};
var i,
ci,
ei,
e,
diff,
tot_diff = 0,
diffs = [];
// Get the variance and std dev for the distances between elements
for (var i = 1, el = elements.length; i < el; i++) { // Skip the 0th element since nothing precedes it
diff = Smeans.distance(elements[ i ], elements[ i - 1 ]);
tot_diff += diff;
diffs.push(diff);
}
var mean_diff = tot_diff / diffs.length;
// Get the variance & std dev
var sum_diff_variance = 0;
for (var i = 0, dl = diffs.length; i < dl; i++) {
diff = diffs[i];
sum_diff_variance += Math.pow(diff - mean_diff, 2);
}
var diff_variance = sum_diff_variance / diffs.length;
var diff_stdev = Math.sqrt(diff_variance);
/* S-means */
// Flag for whether the clusters are changing or not, stop when it's 0
var changing = true;
// Initial number of clusters
var k = opts.k ? opts.k : 1;
// Similarity threshold
var threshold = diff_stdev;
// Randomly choose [k] centroids from among the data points
var cluster_map = {};
if (opts.clusters) {
// Clear out the cluster elements
cluster_map = opts.clusters;
for (var ci in cluster_map) {
var cluster = cluster_map[ci];
cluster.elements = {};
}
}
// No clusters pre-defined, pick a random element and make it the first cluster
if (Object.keys(cluster_map).length === 0) {
for (var i = 1; i <= k; i++) {
var centroid = elements[Math.floor(Math.random() * elements.length)];
cluster_map[i] = {
i: i,
centroid: centroid,
elements: {}
};
}
}
while (changing) {
// Flag for if a new cluster was made this iteration
var new_cluster = false;
// For each data point
for (var ei = 0, el = elements.length; ei < el; ei++) {
e = elements[ei];
var closest_dist = null; // Closest distance to this data point
var closest_cluster = null; // Closest centroid to this data point
// For each cluster
for (var ci in cluster_map) {
cluster = cluster_map[ci];
// Get the distance from this point to the cluster centroid
var dist = Math.abs(Smeans.distance(e, cluster.centroid));
// If this distance is closer than the closest recorded distance, reset the closest
// distance and cluster
if (closest_dist === null || dist < closest_dist) {
closest_dist = dist;
closest_cluster = cluster;
}
}
// If the distance is below the threshold, or the distance is 0, add it to this cluster
if (closest_dist < threshold || closest_dist === 0) {
closest_cluster.elements[e] = e;
}
// Otherwise create a new cluster with this data point as the centroid,
// also add this data point to the new cluster
else {
// Get the index of the new cluster
var max_ci = 0;
// max_ci = i for i, val of cluster_map when i > max_ci
for (var cmi in cluster_map) {
var max_cluster = cluster_map[cmi];
if (max_ci < max_cluster.i) {
max_ci = max_cluster.i;
}
}
max_ci++;
cluster_map[ max_ci ] = {
i: max_ci,
centroid: e,
elements: {}
};
cluster_map[max_ci].elements[e] = e;
// *** This seems to make it never return...
// new_cluster = true;
}
}
var cluster_changed = false;
// For each cluster
for (var ci in cluster_map) {
cluster = cluster_map[ci];
// Delete clusters that have no elements
var cluster_elms = Object.keys(cluster.elements);
if (cluster_elms.length === 0) {
delete cluster_map[ ci ];
continue;
}
// Calculate the average Euclidean distance to each of its elements (since we're one dimensional here it's just the mean)
var cluster_vals = [];
for (var i = 0, cl = cluster_elms.length; i < cl; i++) {
var ce = cluster_elms[i];
cluster_vals.push(cluster.elements[ce]);
}
var new_centroid = Smeans.centroid(cluster_vals);
// If this newly calculated centroid is different than the current one, assign it
if (Smeans.distance(new_centroid, cluster.centroid) !== 0) {
cluster.centroid = new_centroid;
// Flip the cluster changed flag
cluster_changed = true;
}
}
// If no cluster was changed and no new cluster was created, we're done clustering
if (!cluster_changed && !new_cluster) {
changing = false;
}
}
return cluster_map;
}
/**
* @ngdoc function
* @name distance
* @param {object} point1 - An N-element array of numbers
* @param {object} point2 - An N-element array of numbers
* @description Calculate distance between two points in N-space
*/
function distance(d1, d2) {
// Build an array of distance squares
var tot = 0;
for (var i = 0, l = d1.length; i < l; i++) {
var dist = d1[i] - d2[i];
tot += dist * dist;
}
return Math.sqrt(tot);
}
function closest(point, points) {
var closest_dist = Infinity,
closest_point;
for (var i = 0, pl = points.length; i < pl; i++) {
var check_point = points[i];
var dist = distance(point, check_point);
if (dist < closest) {
closest_dist = dist;
closest_point = check_point;
}
}
return closest_point;
}
function centroid(elements) {
/*jshint -W004 */
var l = elements[0].length;
var el = elements.length;
var dims = [];
for (var i = 0; i < l; i++) {
dims[i] = dims[i] = 0;
}
for (var ei = 0; ei < el; ei++) {
for (var i = 0; i < l; i++) {
dims[i] += elements[ei][i];
}
}
var centroid = [];
for (var i = 0; i < l; i++) {
centroid[i] = dims[i] / el;
}
return centroid;
}
module.exports = Smeans;