forked from zero-one-group/geni
/
clustering.clj
71 lines (66 loc) · 2.59 KB
/
clustering.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
(ns zero-one.geni.ml.clustering
(:require
[zero-one.geni.interop :as interop])
(:import
(org.apache.spark.ml.clustering BisectingKMeans
GaussianMixture
KMeans
LDA
PowerIterationClustering)))
(defn bisecting-k-means
  "Builds a `BisectingKMeans` object from an option map.

  `params` is a map of keyword options; it is merged over the built-in
  defaults below, so any key supplied by the caller wins. The merged map
  and the `BisectingKMeans` class are handed to `interop/instantiate`,
  whose result is returned."
  [params]
  (->> params
       ;; caller-supplied entries override these defaults
       (merge {:distance-measure           "euclidean"
               :max-iter                   20
               :features-col               "features"
               :k                          4
               :min-divisible-cluster-size 1.0
               :seed                       566573821
               :prediction-col             "prediction"})
       (interop/instantiate BisectingKMeans)))
(defn gaussian-mixture
  "Builds a `GaussianMixture` object from an option map.

  `params` is a map of keyword options; it is merged over the built-in
  defaults below, so any key supplied by the caller wins. The merged map
  and the `GaussianMixture` class are handed to `interop/instantiate`,
  whose result is returned."
  [params]
  (let [fallbacks {:seed            538009335
                   :k               2
                   :max-iter        100
                   :probability-col "probability"
                   :tol             0.01
                   :features-col    "features"
                   :prediction-col  "prediction"}
        ;; right-most map wins in `merge`, so `params` overrides `fallbacks`
        settings  (merge fallbacks params)]
    (interop/instantiate GaussianMixture settings)))
;; `gmm` is a shorthand alias bound to the same function value as `gaussian-mixture`.
(def gmm gaussian-mixture)
(defn k-means
  "Builds a `KMeans` object from an option map.

  `params` is a map of keyword options; it is merged over the built-in
  defaults below, so any key supplied by the caller wins. The merged map
  and the `KMeans` class are handed to `interop/instantiate`, whose
  result is returned."
  [params]
  (->> params
       ;; caller-supplied entries override these defaults
       (merge {:max-iter         20
               :tol              1.0E-4
               :init-mode        "k-means||"
               :seed             -1689246527
               :k                2
               :init-steps       2
               :distance-measure "euclidean"
               :prediction-col   "prediction"
               :features-col     "features"})
       (interop/instantiate KMeans)))
(defn lda
  "Builds an `LDA` object from an option map.

  `params` is a map of keyword options; it is merged over the built-in
  defaults below, so any key supplied by the caller wins. The merged map
  and the `LDA` class are handed to `interop/instantiate`, whose result
  is returned."
  [params]
  (let [fallbacks {:subsampling-rate           0.05
                   :max-iter                   20
                   :keep-last-checkpoint       true
                   :topic-distribution-col     "topicDistribution"
                   :optimize-doc-concentration true
                   :seed                       1435876747
                   :k                          10
                   :learning-offset            1024.0
                   :checkpoint-interval        10
                   :optimizer                  "online"
                   :learning-decay             0.51
                   :features-col               "features"}
        ;; right-most map wins in `merge`, so `params` overrides `fallbacks`
        settings  (merge fallbacks params)]
    (interop/instantiate LDA settings)))
;; `latent-dirichlet-allocation` is a long-form alias bound to the same function value as `lda`.
(def latent-dirichlet-allocation lda)
(defn power-iteration-clustering
  "Builds a `PowerIterationClustering` object from an option map.

  `params` is a map of keyword options; it is merged over the built-in
  defaults below, so any key supplied by the caller wins. The merged map
  and the `PowerIterationClustering` class are handed to
  `interop/instantiate`, whose result is returned."
  [params]
  (->> params
       ;; caller-supplied entries override these defaults
       (merge {:k         2
               :dst-col   "dst"
               :src-col   "src"
               :init-mode "random"
               :max-iter  20})
       (interop/instantiate PowerIterationClustering)))