From 6d5de46fabfc6a04f25a73ad5bd4eca08d372631 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 13 Mar 2018 19:36:26 +0900 Subject: [PATCH 1/3] Fixed annotations and option comments --- .../smile/classification/RandomForestClassifierUDTF.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java b/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java index 6e8a6505a..d0db3a124 100644 --- a/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java +++ b/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java @@ -82,9 +82,9 @@ @Description( name = "train_randomforest_classifier", - value = "_FUNC_(array features, int label [, const array classWeights, const string options]) - " - + "Returns a relation consists of " - + " var_importance, int oob_errors, int oob_tests, double weight>") + value = "_FUNC_(array features, int label [, const string options, const array classWeights])" + + "- Returns a relation consists of " + + " var_importance, int oob_errors, int oob_tests>") public final class RandomForestClassifierUDTF extends UDTFWithOptions { private static final Log logger = LogFactory.getLog(RandomForestClassifierUDTF.class); @@ -150,7 +150,7 @@ protected Options getOptions() { opts.addOption("rule", "split_rule", true, "Split algorithm [default: GINI, ENTROPY]"); opts.addOption("stratified", "stratified_sampling", false, "Enable Stratified sampling for unbalanced data"); - opts.addOption("subsample", true, "Sampling rate in range (0.0,1.0]"); + opts.addOption("subsample", true, "Sampling rate in range (0.0,1.0]. 
[default: 1.0]"); return opts; } From d6d5856093ee4c437fda220aaf8913652354ae8a Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 13 Mar 2018 19:37:02 +0900 Subject: [PATCH 2/3] Updated RandomForest documents --- docs/gitbook/binaryclass/news20_rf.md | 6 +- docs/gitbook/binaryclass/titanic_rf.md | 37 +++++----- docs/gitbook/multiclass/iris_randomforest.md | 73 +++++++------------- 3 files changed, 46 insertions(+), 70 deletions(-) diff --git a/docs/gitbook/binaryclass/news20_rf.md b/docs/gitbook/binaryclass/news20_rf.md index 327939bc7..065c7365c 100644 --- a/docs/gitbook/binaryclass/news20_rf.md +++ b/docs/gitbook/binaryclass/news20_rf.md @@ -20,7 +20,7 @@ Hivemall Random Forest supports libsvm-like sparse inputs. > #### Note -> This feature, i.e., Sparse input support in Random Forest, is supported since Hivemall v0.5-rc.1 or later._ +> This feature, i.e., Sparse input support in Random Forest, is supported since Hivemall v0.5.0 or later._ > [`feature_hashing`](http://hivemall.incubator.apache.org/userguide/ft_engineering/hashing.html#featurehashing-function) function is useful to prepare feature vectors for Random Forest. @@ -60,8 +60,10 @@ FROM ( SELECT rowid, m.model_weight, + -- v0.5.0 and later tree_predict(m.model_id, m.model, t.features, "-classification") as predicted - -- tree_predict(m.model_id, m.model, t.features, ${classification}) as predicted + -- before v0.5.0 + -- tree_predict(m.model_id, m.model, t.features, ${classification}) as predicted FROM rf_model m LEFT OUTER JOIN -- CROSS JOIN diff --git a/docs/gitbook/binaryclass/titanic_rf.md b/docs/gitbook/binaryclass/titanic_rf.md index 2b5407427..3d51fa8ed 100644 --- a/docs/gitbook/binaryclass/titanic_rf.md +++ b/docs/gitbook/binaryclass/titanic_rf.md @@ -148,8 +148,9 @@ from `Q` and `C` represent quantitative variable and categorical variables, respectively. -*Caution:* Note that the output of `guess_attribute_types` is not perfect. Revise it by your self. 
-For example, `pclass` is a categorical variable. +> #### Caution +> Note that the output of `guess_attribute_types` is not perfect. Revise it by yourself. +> For example, `pclass` is a categorical variable. ```sql set hivevar:attrs=C,C,C,Q,Q,Q,C,Q,C,C; @@ -159,7 +160,6 @@ create table model_rf AS select train_randomforest_classifier(features, survived, "-trees 500 -attrs ${attrs}") - -- as (model_id, model_type, pred_model, var_importance, oob_errors, oob_tests) from train_rf ; @@ -192,24 +192,23 @@ FROM ( SELECT passengerid, -- rf_ensemble(predicted) as predicted - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight) FROM ( SELECT t.passengerid, - -- hivemall v0.4.1-alpha.3 or later + -- from v0.4.1-alpha.3 to v0.4.2-rc4 -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later p.model_weight, - tree_predict(p.model_id, p.model, t.features, "-classification") as predicted - -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted - -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5-rc.1 or later + tree_predict(p.model_id, p.model, t.features, "-classification") as predicted + -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later FROM ( SELECT - -- hivemall v0.4.1-alpha.3 or later + -- from v0.4.1-alpha.3 to v0.4.2-rc4 -- model_id, model_type, pred_model - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later model_id, model_weight, model FROM model_rf @@ -224,7 +223,7 @@ FROM ( ``` > #### Caution -> `tree_predict_v1` is for the backward compatibility for using prediction models built before 
`v0.5-rc.1` on `v0.5-rc.1` or later. +> `tree_predict_v1` is for the backward compatibility for using prediction models built before `v0.5.0` on `v0.5.0` or later. # Kaggle submission @@ -251,7 +250,7 @@ Accuracy would gives `0.76555` for a Kaggle submission. # Graphvis export > #### Note -> `tree_export` feature is supported from Hivemall v0.5-rc.1 or later. +> `tree_export` feature is supported from Hivemall v0.5.0 or later. > Better to limit tree depth on training by `-depth` option to plot a Decision Tree. Hivemall provide `tree_export` to export a decision tree into [Graphviz](http://www.graphviz.org/) or human-readable Javascript format. You can find the usage by issuing the following query: @@ -336,24 +335,24 @@ FROM ( SELECT passengerid, -- rf_ensemble(predicted) as predicted - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight) FROM ( SELECT t.passengerid, - -- hivemall v0.4.1-alpha.3 or later + -- from v0.4.1-alpha.3 to v0.4.2-rc4 -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later p.model_weight, tree_predict(p.model_id, p.model, t.features, "-classification") as predicted -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted - -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5-rc.1 or later + -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later FROM ( SELECT - -- hivemall v0.4.1-alpha.3 or later + -- from v0.4.1-alpha.3 to v0.4.2-rc4 -- model_id, model_type, pred_model - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later model_id, model_weight, model FROM model_rf_07 
diff --git a/docs/gitbook/multiclass/iris_randomforest.md b/docs/gitbook/multiclass/iris_randomforest.md index bfc197f09..a7758eff6 100644 --- a/docs/gitbook/multiclass/iris_randomforest.md +++ b/docs/gitbook/multiclass/iris_randomforest.md @@ -94,17 +94,19 @@ CREATE TABLE model STORED AS SEQUENCEFILE AS select - train_randomforest_classifier(features, label) - -- hivemall v0.4.1-alpha.2 and before + train_randomforest_classifier(features, label) + -- v0.5.0 and later + -- train_randomforest_classifier(features, label) as (model_id, model_weight, model, var_importance, oob_errors, oob_tests) + -- v0.4.1-alpha.2 and before -- train_randomforest_classifier(features, label) as (pred_model, var_importance, oob_errors, oob_tests) - -- hivemall v0.4.1 and later + -- from v0.4.1 to v0.4.2-rc4 -- train_randomforest_classifier(features, label) as (model_id, model_type, pred_model, var_importance, oob_errors, oob_tests) from training; ``` > #### Caution -> The default `TEXTFILE` should not be used for model table when using Javascript output through `-output javascript` option. +> Note that the model storage format differs between versions, as shown above. ```sql hive> desc extended model; @@ -163,7 +165,7 @@ usage: train_randomforest_classifier(array features, int features [default: ceil(sqrt(x[0].length))]. 
int(num_variables * x[0].length) is - considered if num_variable is (0,1 + considered if num_variable is (0,1] ``` > #### Caution @@ -215,19 +217,19 @@ as SELECT rowid, -- rf_ensemble(predicted) as predicted - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight) FROM ( SELECT rowid, - -- hivemall v0.4.1 and later + -- from v0.4.1 to v0.4.2-rc4 -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later p.model_weight, tree_predict(p.model_id, p.model, t.features, "-classification") as predicted -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted - -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5-rc.1 or later + -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later FROM model p LEFT OUTER JOIN -- CROSS JOIN @@ -238,8 +240,11 @@ group by ; ``` +> #### Note +> Left outer join without a join condition (i.e., `model p LEFT OUTER JOIN training t`) is a trick to fix the left table for cross join. + > #### Caution -> `tree_predict_v1` is for the backward compatibility for using prediction models built before `v0.5-rc.1` on `v0.5-rc.1` or later. +> `tree_predict_v1` is for the backward compatibility for using prediction models built before `v0.5.0` on `v0.5.0` or later. 
### Parallelize Prediction @@ -251,37 +256,7 @@ set hive.auto.convert.join=true; SET hive.mapjoin.optimized.hashtable=false; SET mapred.reduce.tasks=8; -create table predicted -as -SELECT - rowid, - -- rf_ensemble(predicted) as predicted - -- hivemall v0.5-rc.1 or later - rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted - -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight) -FROM ( - SELECT - t.rowid, - -- hivemall v0.4.1 and later - -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted - -- hivemall v0.5-rc.1 or later - p.model_weight, - tree_predict(p.model_id, p.model, t.features, "-classification") as predicted - -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted - -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted as predicted -- to use the old model in v0.5-rc.1 or later - FROM ( - SELECT - -- hivemall v0.4.1 and later - -- model_id, model_type, pred_model - -- hivemall v0.5-rc.1 or later - model_id, model_weight, model - FROM model - DISTRIBUTE BY rand(1) - ) p - LEFT OUTER JOIN training t -) t1 -group by - rowid; + ``` # Evaluation @@ -295,13 +270,13 @@ select count(1) from training; set hivevar:total_cnt=150; WITH t1 as ( -SELECT - t.rowid, - t.label as actual, - p.predicted.label as predicted -FROM - predicted p - LEFT OUTER JOIN training t ON (t.rowid = p.rowid) + SELECT + t.rowid, + t.label as actual, + p.predicted.label as predicted + FROM + predicted p + LEFT OUTER JOIN training t ON (t.rowid = p.rowid) ) SELECT count(1) / ${total_cnt} @@ -316,7 +291,7 @@ WHERE # Graphvis export > #### Note -> `tree_export` feature is supported from Hivemall v0.5-rc.1 or later. +> `tree_export` feature is supported from Hivemall v0.5.0 or later. > Better to limit tree depth on training by `-depth` option to plot a Decision Tree. 
Hivemall provide `tree_export` to export a decision tree into [Graphviz](http://www.graphviz.org/) or human-readable Javascript format. You can find the usage by issuing the following query: From bdfba98a6987cec35d6d3e464b9ae3300b398caf Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Tue, 13 Mar 2018 19:42:45 +0900 Subject: [PATCH 3/3] inserted a missing query --- docs/gitbook/multiclass/iris_randomforest.md | 33 +++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/docs/gitbook/multiclass/iris_randomforest.md b/docs/gitbook/multiclass/iris_randomforest.md index a7758eff6..73ea4a362 100644 --- a/docs/gitbook/multiclass/iris_randomforest.md +++ b/docs/gitbook/multiclass/iris_randomforest.md @@ -256,7 +256,38 @@ set hive.auto.convert.join=true; SET hive.mapjoin.optimized.hashtable=false; SET mapred.reduce.tasks=8; - +drop table predicted; +create table predicted +as +SELECT + rowid, + -- rf_ensemble(predicted) as predicted + -- v0.5.0 or later + rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted + -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight) +FROM ( + SELECT + t.rowid, + -- from v0.4.1 to v0.4.2-rc4 + -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted + -- v0.5.0 or later + p.model_weight, + tree_predict(p.model_id, p.model, t.features, "-classification") as predicted + -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted + -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later + FROM ( + SELECT + -- from v0.4.1 to v0.4.2-rc4 + -- model_id, model_type, pred_model + -- v0.5.0 or later + model_id, model_weight, model + FROM model + DISTRIBUTE BY rand(1) + ) p + LEFT OUTER JOIN training t +) t1 +group by + rowid; ``` # Evaluation