forked from zero-one-group/geni
/
ml.clj
298 lines (272 loc) · 9.55 KB
/
ml.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
(ns zero-one.geni.ml
(:refer-clojure :exclude [Double])
(:require
[camel-snake-kebab.core :refer [->kebab-case]]
[clojure.walk :refer [keywordize-keys]]
[potemkin :refer [import-vars]]
[zero-one.geni.core.column :as column]
[zero-one.geni.core.polymorphic :as polymorphic]
[zero-one.geni.interop :as interop]
[zero-one.geni.ml.classification]
[zero-one.geni.ml.clustering]
[zero-one.geni.ml.evaluation]
[zero-one.geni.ml.feature]
[zero-one.geni.ml.fpm]
[zero-one.geni.ml.recommendation]
[zero-one.geni.ml.regression]
[zero-one.geni.ml.tuning]
[zero-one.geni.ml.xgb])
(:import
(org.apache.spark.ml Pipeline
PipelineStage
functions)
(org.apache.spark.ml.stat ChiSquareTest
KolmogorovSmirnovTest)))
(import-vars
[zero-one.geni.ml.xgb
write-native-model!
xgboost-classifier
xgboost-regressor])
(import-vars
[zero-one.geni.ml.clustering
bisecting-k-means
gaussian-mixture
gmm
k-means
lda
latent-dirichlet-allocation
power-iteration-clustering])
(import-vars
[zero-one.geni.ml.evaluation
binary-classification-evaluator
clustering-evaluator
multiclass-classification-evaluator
multilabel-classification-evaluator
ranking-evaluator
regression-evaluator])
(import-vars
[zero-one.geni.ml.feature
binariser
binarizer
bucketed-random-projection-lsh
bucketiser
bucketizer
chi-sq-selector
count-vectoriser
count-vectorizer
dct
discrete-cosine-transform
elementwise-product
feature-hasher
hashing-tf
idf
imputer
index-to-string
interaction
max-abs-scaler
min-hash-lsh
min-max-scaler
n-gram
normaliser
normalizer
one-hot-encoder
pca
polynomial-expansion
quantile-discretiser
quantile-discretizer
regex-tokeniser
regex-tokenizer
robust-scaler
sql-transformer
standard-scaler
stop-words-remover
string-indexer
tokeniser
tokenizer
vector-assembler
vector-indexer
vector-size-hint
word2vec])
(import-vars
[zero-one.geni.ml.classification
decision-tree-classifier
fm-classifier
gbt-classifier
linear-svc
logistic-regression
mlp-classifier
multilayer-perceptron-classifier
naive-bayes
one-vs-rest
random-forest-classifier])
(import-vars
[zero-one.geni.ml.fpm
fp-growth
frequent-pattern-growth
prefix-span])
(import-vars
[zero-one.geni.ml.regression
aft-survival-regression
decision-tree-regressor
fm-regressor
gbt-regressor
generalised-linear-regression
generalized-linear-regression
glm
isotonic-regression
linear-regression
random-forest-regressor])
(import-vars
[zero-one.geni.ml.recommendation
als
alternating-least-squares
item-factors
recommend-for-all-items
recommend-for-all-users
recommend-for-item-subset
recommend-for-user-subset
recommend-items
recommend-users
user-factors])
(import-vars
[zero-one.geni.ml.tuning
param-grid
cross-validator
train-validation-split])
(defn vector-to-array
  "Applies Spark ML's `functions/vector_to_array` to `expr` after passing it
  through `column/->column`. `dtype` is forwarded unchanged; the one-argument
  arity delegates to the two-argument arity with `float64`."
  ([expr]
   (let [spark-col (column/->column expr)]
     (vector-to-array spark-col "float64")))
  ([expr dtype]
   (let [spark-col (column/->column expr)]
     (functions/vector_to_array spark-col dtype))))
(def vector->array
  "Alias of `vector-to-array`."
  vector-to-array)
;; Re-exposes `polymorphic/corr` from this namespace under the name `corr`.
(def corr polymorphic/corr)
(defn chi-square-test
  "Runs `ChiSquareTest/test` on `dataframe`. `features-col` and `label-col`
  are converted with `name` before being handed to Spark."
  [dataframe features-col label-col]
  (let [features (name features-col)
        label (name label-col)]
    (ChiSquareTest/test dataframe features label)))
(defn kolmogorov-smirnov-test
  "Runs `KolmogorovSmirnovTest/test` on `dataframe`. `sample-col` is converted
  with `name`, `dist-name` is forwarded as given, and `params` is converted
  with `interop/->scala-seq`."
  [dataframe sample-col dist-name params]
  (let [sample (name sample-col)
        scala-params (interop/->scala-seq params)]
    (KolmogorovSmirnovTest/test dataframe sample dist-name scala-params)))
(defn pipeline
  "Creates a new `Pipeline` and sets its stages to the given `stages`, packed
  into a `PipelineStage` array. Returns the result of `.setStages`."
  [& stages]
  (let [pipe (Pipeline.)
        stage-array (into-array PipelineStage stages)]
    (.setStages pipe stage-array)))
(defn fit
  "Calls `.fit` on `estimator` with `dataframe` and returns the result.
  The dataframe comes first so the call threads well with `->`."
  [dataframe estimator]
  (-> estimator (.fit dataframe)))
(defn transform
  "Calls `.transform` on `transformer` with `dataframe` and returns the result."
  [dataframe transformer]
  (-> transformer (.transform dataframe)))
(defn evaluate
  "Calls `.evaluate` on `evaluator` with `dataframe` and returns the result."
  [dataframe evaluator]
  (-> evaluator (.evaluate dataframe)))
(defn params
  "Returns the param map of `stage` as a Clojure map.

  Each pair from `.extractParamMap` becomes an entry whose key is the param
  name in kebab-case and whose value is converted with `interop/->clojure`.
  The resulting map is passed through `keywordize-keys`."
  [stage]
  (let [pair->entry (fn [pair]
                      (let [param-name (.name (.param pair))]
                        [(->kebab-case param-name)
                         (interop/->clojure (.value pair))]))
        pairs (interop/scala-seq->vec (.toSeq (.extractParamMap stage)))]
    (keywordize-keys (into {} (map pair->entry pairs)))))
(defn approx-nearest-neighbours
  "Calls `.approxNearestNeighbors` on `model` for `dataset`. `key-v` is
  converted with `interop/dense` first; `n-nearest` and the optional
  `dist-col` are forwarded as given."
  ([dataset model key-v n-nearest]
   (let [dense-key (interop/dense key-v)]
     (.approxNearestNeighbors model dataset dense-key n-nearest)))
  ([dataset model key-v n-nearest dist-col]
   (let [dense-key (interop/dense key-v)]
     (.approxNearestNeighbors model dataset dense-key n-nearest dist-col))))
(defn approx-similarity-join
  "Calls `.approxSimilarityJoin` on `model` with `dataset-a`, `dataset-b` and
  `threshold`, plus `dist-col` when the five-argument arity is used."
  ([dataset-a dataset-b model threshold]
   (-> model
       (.approxSimilarityJoin dataset-a dataset-b threshold)))
  ([dataset-a dataset-b model threshold dist-col]
   (-> model
       (.approxSimilarityJoin dataset-a dataset-b threshold dist-col))))
(defn association-rules
  "Returns the result of calling `.associationRules` on `model`."
  [model]
  (-> model .associationRules))
(defn binary-summary
  "Returns the result of calling `.binarySummary` on `model`."
  [model]
  (-> model .binarySummary))
(defn best-model
  "Returns the result of calling `.bestModel` on `model`."
  [model]
  (-> model .bestModel))
(defn boundaries
  "Returns `.boundaries` of `model`, converted with `interop/->clojure`."
  [model]
  (-> model .boundaries interop/->clojure))
(defn category-maps
  "Returns `.categoryMaps` of `model`, converted with
  `interop/scala-map->map`."
  [model]
  (interop/scala-map->map (.categoryMaps model)))
(defn category-sizes
  "Returns `.categorySizes` of `model` wrapped in `seq`."
  [model]
  (-> model .categorySizes seq))
(defn cluster-centers
  "Returns a lazy seq of the `.clusterCenters` of `model`, each converted with
  `interop/->clojure`."
  [model]
  (map interop/->clojure (seq (.clusterCenters model))))
(defn coefficient-matrix
  "Returns `.coefficientMatrix` of `model`, converted with
  `interop/matrix->seqs`."
  [model]
  (-> model .coefficientMatrix interop/matrix->seqs))
(defn coefficients
  "Returns `.coefficients` of `model`, converted with `interop/vector->seq`."
  [model]
  (-> model .coefficients interop/vector->seq))
(defn depth
  "Returns the result of calling `.depth` on `model`."
  [model]
  (-> model .depth))
(defn describe-topics
  "Calls `.describeTopics` on `model` and returns the result.

  With one argument the no-argument Spark overload is used, exactly as
  before. The optional `max-terms-per-topic` is coerced with `int` and passed
  to the `describeTopics(maxTermsPerTopic)` overload of Spark's `LDAModel`."
  ([model]
   (.describeTopics model))
  ([model max-terms-per-topic]
   (.describeTopics model (int max-terms-per-topic))))
(defn estimated-doc-concentration
  "Returns `.estimatedDocConcentration` of `model`, converted with
  `interop/->clojure`."
  [model]
  (-> model .estimatedDocConcentration interop/->clojure))
(defn feature-importances
  "Returns `.featureImportances` of `model`, converted with
  `interop/->clojure`."
  [model]
  (-> model .featureImportances interop/->clojure))
(defn find-frequent-sequential-patterns
  "Calls `.findFrequentSequentialPatterns` on `prefix-span` with `dataset` and
  returns the result."
  [dataset prefix-span]
  (-> prefix-span
      (.findFrequentSequentialPatterns dataset)))
(def find-patterns
  "Alias of `find-frequent-sequential-patterns`."
  find-frequent-sequential-patterns)
(defn frequent-item-sets
  "Returns the result of calling `.freqItemsets` on `model`."
  [model]
  (-> model .freqItemsets))
(def freq-itemsets
  "Alias of `frequent-item-sets`."
  frequent-item-sets)
(defn gaussians-df
  "Returns the result of calling `.gaussiansDF` on `model`."
  [model]
  (-> model .gaussiansDF))
(defn get-features-col
  "Returns the result of calling `.getFeaturesCol` on `model`."
  [model]
  (-> model .getFeaturesCol))
(def features-col
  "Alias of `get-features-col`."
  get-features-col)
(defn get-input-col
  "Returns the result of calling `.getInputCol` on `model`."
  [model]
  (-> model .getInputCol))
(def input-col
  "Alias of `get-input-col`."
  get-input-col)
(defn get-input-cols
  "Returns `.getInputCols` of `model` wrapped in `seq`."
  [model]
  (-> model .getInputCols seq))
(def input-cols
  "Alias of `get-input-cols`."
  get-input-cols)
(defn get-label-col
  "Returns the result of calling `.getLabelCol` on `model`."
  [model]
  (-> model .getLabelCol))
(def label-col
  "Alias of `get-label-col`."
  get-label-col)
(defn get-output-col
  "Returns the result of calling `.getOutputCol` on `model`."
  [model]
  (-> model .getOutputCol))
(def output-col
  "Alias of `get-output-col`."
  get-output-col)
(defn get-output-cols
  "Returns `.getOutputCols` of `model` wrapped in `seq`."
  [model]
  (-> model .getOutputCols seq))
(def output-cols
  "Alias of `get-output-cols`."
  get-output-cols)
(defn get-prediction-col
  "Returns the result of calling `.getPredictionCol` on `model`."
  [model]
  (-> model .getPredictionCol))
(def prediction-col
  "Alias of `get-prediction-col`."
  get-prediction-col)
(defn get-probability-col
  "Returns the result of calling `.getProbabilityCol` on `model`."
  [model]
  (-> model .getProbabilityCol))
(def probability-col
  "Alias of `get-probability-col`."
  get-probability-col)
(defn get-raw-prediction-col
  "Returns the result of calling `.getRawPredictionCol` on `model`."
  [model]
  (-> model .getRawPredictionCol))
(def raw-prediction-col
  "Alias of `get-raw-prediction-col`."
  get-raw-prediction-col)
(defn get-thresholds
  "Returns `.getThresholds` of `model` wrapped in `seq`."
  [model]
  (-> model .getThresholds seq))
(def thresholds
  "Alias of `get-thresholds`."
  get-thresholds)
(defn get-num-trees
  "Returns the result of calling `.getNumTrees` on `model`."
  [model]
  (-> model .getNumTrees))
(defn get-size
  "Returns the result of calling `.getSize` on `model`."
  [model]
  (-> model .getSize))
(defn idf-vector
  "Returns `.idf` of `model`, converted with `interop/vector->seq`."
  [model]
  (-> model .idf interop/vector->seq))
(defn intercept
  "Returns the result of calling `.intercept` on `model`."
  [model]
  (-> model .intercept))
(defn intercept-vector
  "Returns `.interceptVector` of `model`, converted with
  `interop/vector->seq`."
  [model]
  (-> model .interceptVector interop/vector->seq))
(defn is-distributed
  "Returns the result of calling `.isDistributed` on `model`."
  [model]
  (-> model .isDistributed))
(def distributed?
  "Alias of `is-distributed`."
  is-distributed)
(defn labels
  "Returns `.labels` of `model` wrapped in `seq`."
  [model]
  (-> model .labels seq))
(defn log-likelihood
  "Calls `.logLikelihood` on `model` with `dataset` and returns the result."
  [dataset model]
  (-> model (.logLikelihood dataset)))
(defn log-perplexity
  "Calls `.logPerplexity` on `model` with `dataset` and returns the result."
  [dataset model]
  (-> model (.logPerplexity dataset)))
(defn max-abs
  "Returns `.maxAbs` of `model`, converted with `interop/vector->seq`."
  [model]
  (-> model .maxAbs interop/vector->seq))
(defn mean
  "Returns `.mean` of `model`, converted with `interop/vector->seq`."
  [model]
  (-> model .mean interop/vector->seq))
(defn num-classes
  "Returns the result of calling `.numClasses` on `model`."
  [model]
  (-> model .numClasses))
(defn num-features
  "Returns the result of calling `.numFeatures` on `model`."
  [model]
  (-> model .numFeatures))
(defn num-nodes
  "Returns the result of calling `.numNodes` on `model`."
  [model]
  (-> model .numNodes))
(defn original-max
  "Returns `.originalMax` of `model`, converted with `interop/vector->seq`."
  [model]
  (-> model .originalMax interop/vector->seq))
(defn original-min
  "Returns `.originalMin` of `model`, converted with `interop/vector->seq`."
  [model]
  (-> model .originalMin interop/vector->seq))
(defn pc
  "Returns `.pc` of `model`, converted with `interop/matrix->seqs`."
  [model]
  (-> model .pc interop/matrix->seqs))
(def principal-components
  "Alias of `pc`."
  pc)
(defn pi
  "Returns `.pi` of `model`, converted with `interop/vector->seq`."
  [model]
  (-> model .pi interop/vector->seq))
(defn root-node
  "Returns the result of calling `.rootNode` on `model`."
  [model]
  (-> model .rootNode))
(defn scale
  "Returns the result of calling `.scale` on `model`."
  [model]
  (-> model .scale))
(defn summary
  "Returns the result of calling `.summary` on `model`."
  [model]
  (-> model .summary))
(defn supported-optimizers
  "Returns `.supportedOptimizers` of `model` wrapped in `seq`."
  [model]
  (-> model .supportedOptimizers seq))
(def supported-optimisers
  "Alias of `supported-optimizers` (British spelling)."
  supported-optimizers)
(defn stages
  "Returns `.stages` of `model` wrapped in `seq`."
  [model]
  (-> model .stages seq))
(defn std
  "Returns `.std` of `model`, converted with `interop/vector->seq`."
  [model]
  (-> model .std interop/vector->seq))
(defn surrogate-df
  "Returns the result of calling `.surrogateDF` on `model`."
  [model]
  (-> model .surrogateDF))
(defn theta
  "Returns `.theta` of `model`, converted with `interop/matrix->seqs`."
  [model]
  (-> model .theta interop/matrix->seqs))
(defn total-num-nodes
  "Returns the result of calling `.totalNumNodes` on `model`."
  [model]
  (-> model .totalNumNodes))
(defn tree-weights
  "Returns `.treeWeights` of `model` wrapped in `seq`."
  [model]
  (-> model .treeWeights seq))
(defn trees
  "Returns `.trees` of `model` wrapped in `seq`."
  [model]
  (-> model .trees seq))
(defn uid
  "Returns the result of calling `.uid` on `model`."
  [model]
  (-> model .uid))
(defn vocab-size
  "Returns the result of calling `.vocabSize` on `model`."
  [model]
  (-> model .vocabSize))
(defn vocabulary
  "Returns `.vocabulary` of `model` wrapped in `seq`."
  [model]
  (-> model .vocabulary seq))
(defn weights
  "Returns `.weights` of `model` wrapped in `seq`."
  [model]
  (-> model .weights seq))
(defn write-stage!
  "Saves `stage` to `path` through the writer returned by its `.write` method.

  `options` is a map:
  - `:mode`, given as the string `overwrite` or the keyword `:overwrite`,
    calls `.overwrite` on the writer; any other value leaves it untouched.
  - every other entry is applied with `.option`. Keys go through `name` and
    non-nil values through `str`, because Spark's `MLWriter.option` is
    declared with `String` key and value and the reflective call would
    otherwise reject numbers, booleans or keywords.

  Returns the result of `.save`."
  ([stage path] (write-stage! stage path {}))
  ([stage path options]
   (let [mode (:mode options)
         ;; Accept both "overwrite" and :overwrite; previously a keyword
         ;; mode was silently ignored.
         overwrite? (= "overwrite" (if (keyword? mode) (name mode) mode))
         base-writer (cond-> (.write stage)
                       overwrite? .overwrite)
         configured-writer (reduce
                            (fn [w [k v]]
                              ;; nil is passed through untouched so Spark's
                              ;; own argument validation still applies.
                              (.option w (name k) (some-> v str)))
                            base-writer
                            (dissoc options :mode))]
     (.save configured-writer path))))
(defn load-method?
  "Returns true when `method` is named `load` and declares exactly one
  parameter.

  `read-stage!` invokes the selected method with a single path argument, so
  an overload with any other arity can never be called successfully.
  `Class/getMethods` returns methods in no particular order, which means a
  name-only check could select such an overload first."
  [^java.lang.reflect.Method method]
  (and (= "load" (.getName method))
       (= 1 (.getParameterCount method))))
(defn load-method
  "Returns the first method from `(.getMethods cls)` that satisfies
  `load-method?`, or nil when there is none."
  [cls]
  (let [candidates (.getMethods cls)]
    (first (filter load-method? candidates))))
(defn read-stage!
  "Looks up the loader of `model-cls` with `load-method` and invokes it
  reflectively with `path` as its only argument. `model-cls` is passed as the
  receiver of the reflective call."
  [model-cls path]
  (let [loader (load-method model-cls)
        args (into-array [path])]
    (.invoke loader model-cls args)))
(comment
  ;; REPL scratch: inspect the params of a freshly constructed stage.
  ;; GBTRegressor lives in Spark's `regression` package, and a Java class is
  ;; instantiated with the trailing-dot constructor form.
  (import '(org.apache.spark.ml.regression GBTRegressor))
  (params (GBTRegressor.))
  true)