From 6d5de46fabfc6a04f25a73ad5bd4eca08d372631 Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Tue, 13 Mar 2018 19:36:26 +0900
Subject: [PATCH 1/3] Fixed annotations and option comments

---
 .../smile/classification/RandomForestClassifierUDTF.java  | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java b/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java
index 6e8a6505a..d0db3a124 100644
--- a/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java
+++ b/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java
@@ -82,9 +82,9 @@
 
 @Description(
         name = "train_randomforest_classifier",
-        value = "_FUNC_(array<double|string> features, int label [, const array<double> classWeights, const string options]) - "
-                + "Returns a relation consists of "
-                + "<int model_id, int model_type, string pred_model, array<double> var_importance, int oob_errors, int oob_tests, double weight>")
+        value = "_FUNC_(array<double|string> features, int label [, const string options, const array<double> classWeights])"
+                + "- Returns a relation consists of "
+                + "<string model_id, double model_weight, string model, array<double> var_importance, int oob_errors, int oob_tests>")
 public final class RandomForestClassifierUDTF extends UDTFWithOptions {
     private static final Log logger = LogFactory.getLog(RandomForestClassifierUDTF.class);
 
@@ -150,7 +150,7 @@ protected Options getOptions() {
         opts.addOption("rule", "split_rule", true, "Split algorithm [default: GINI, ENTROPY]");
         opts.addOption("stratified", "stratified_sampling", false,
             "Enable Stratified sampling for unbalanced data");
-        opts.addOption("subsample", true, "Sampling rate in range (0.0,1.0]");
+        opts.addOption("subsample", true, "Sampling rate in range (0.0,1.0]. [default: 1.0]");
         return opts;
     }
 

From d6d5856093ee4c437fda220aaf8913652354ae8a Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Tue, 13 Mar 2018 19:37:02 +0900
Subject: [PATCH 2/3] Updated RandomForest documents

---
 docs/gitbook/binaryclass/news20_rf.md        |  6 +-
 docs/gitbook/binaryclass/titanic_rf.md       | 37 +++++-----
 docs/gitbook/multiclass/iris_randomforest.md | 73 +++++++-------------
 3 files changed, 46 insertions(+), 70 deletions(-)

diff --git a/docs/gitbook/binaryclass/news20_rf.md b/docs/gitbook/binaryclass/news20_rf.md
index 327939bc7..065c7365c 100644
--- a/docs/gitbook/binaryclass/news20_rf.md
+++ b/docs/gitbook/binaryclass/news20_rf.md
@@ -20,7 +20,7 @@
 Hivemall Random Forest supports libsvm-like sparse inputs. 
 
 > #### Note
-> This feature, i.e., Sparse input support in Random Forest, is supported since Hivemall v0.5-rc.1 or later._
+> This feature, i.e., sparse input support in Random Forest, is supported in Hivemall v0.5.0 or later.
 > [`feature_hashing`](http://hivemall.incubator.apache.org/userguide/ft_engineering/hashing.html#featurehashing-function) function is useful to prepare feature vectors for Random Forest.
 
 <!-- toc -->
@@ -60,8 +60,10 @@ FROM (
   SELECT
     rowid, 
     m.model_weight,
+    -- v0.5.0 and later
     tree_predict(m.model_id, m.model, t.features, "-classification") as predicted
-    -- tree_predict(m.model_id, m.model, t.features, ${classification}) as predicted
+    -- before v0.5.0
+    -- tree_predict(m.model_id, m.model, t.features, ${classification}) as predicted
   FROM
     rf_model m
     LEFT OUTER JOIN -- CROSS JOIN
diff --git a/docs/gitbook/binaryclass/titanic_rf.md b/docs/gitbook/binaryclass/titanic_rf.md
index 2b5407427..3d51fa8ed 100644
--- a/docs/gitbook/binaryclass/titanic_rf.md
+++ b/docs/gitbook/binaryclass/titanic_rf.md
@@ -148,8 +148,9 @@ from
 
 `Q` and `C` represent quantitative variable and categorical variables, respectively.
 
-*Caution:* Note that the output of `guess_attribute_types` is not perfect. Revise it by your self.
-For example, `pclass` is a categorical variable.
+> #### Caution
+> Note that the output of `guess_attribute_types` is not perfect. Revise it yourself.
+> For example, `pclass` is a categorical variable.
 
 ```sql
 set hivevar:attrs=C,C,C,Q,Q,Q,C,Q,C,C;
@@ -159,7 +160,6 @@ create table model_rf
 AS
 select
   train_randomforest_classifier(features, survived, "-trees 500 -attrs ${attrs}") 
-    -- as (model_id, model_type, pred_model, var_importance, oob_errors, oob_tests)
 from
   train_rf
 ;
@@ -192,24 +192,23 @@ FROM (
   SELECT
     passengerid,
     -- rf_ensemble(predicted) as predicted
-    -- hivemall v0.5-rc.1 or later
+    -- v0.5.0 or later
     rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
     -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
   FROM (
     SELECT
       t.passengerid, 
-      -- hivemall v0.4.1-alpha.3 or later
+      -- from v0.4.1-alpha.3 to v0.4.2-rc4
       -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
-      -- hivemall v0.5-rc.1 or later
+      -- v0.5.0 or later
       p.model_weight,
-	  tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
-	  -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted
-      -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5-rc.1 or later
+      tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
+      -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later
     FROM (
       SELECT 
-        -- hivemall v0.4.1-alpha.3 or later
+        -- from v0.4.1-alpha.3 to v0.4.2-rc4
         -- model_id, model_type, pred_model
-        -- hivemall v0.5-rc.1 or later
+        -- v0.5.0 or later
         model_id, model_weight, model
       FROM 
         model_rf 
@@ -224,7 +223,7 @@ FROM (
 ```
 
 > #### Caution
-> `tree_predict_v1` is for the backward compatibility for using prediction models built before `v0.5-rc.1` on `v0.5-rc.1` or later.
+> `tree_predict_v1` is for the backward compatibility for using prediction models built before `v0.5.0` on `v0.5.0` or later.
 
 # Kaggle submission
 
@@ -251,7 +250,7 @@ Accuracy would gives `0.76555` for a Kaggle submission.
 # Graphvis export
 
 > #### Note
-> `tree_export` feature is supported from Hivemall v0.5-rc.1 or later.
+> `tree_export` feature is supported from Hivemall v0.5.0 or later.
 > Better to limit tree depth on training by `-depth` option to plot a Decision Tree.
 
 Hivemall provide `tree_export` to export a decision tree into [Graphviz](http://www.graphviz.org/) or human-readable Javascript format. You can find the usage by issuing the following query:
@@ -336,24 +335,24 @@ FROM (
   SELECT
     passengerid,
     -- rf_ensemble(predicted) as predicted
-    -- hivemall v0.5-rc.1 or later
+    -- v0.5.0 or later
     rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
     -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
   FROM (
     SELECT
       t.passengerid, 
-      -- hivemall v0.4.1-alpha.3 or later
+      -- from v0.4.1-alpha.3 to v0.4.2-rc4
       -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
-      -- hivemall v0.5-rc.1 or later
+      -- v0.5.0 or later
       p.model_weight,
       tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
       -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted
-      -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5-rc.1 or later
+      -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later
     FROM (
       SELECT 
-        -- hivemall v0.4.1-alpha.3 or later
+        -- from v0.4.1-alpha.3 to v0.4.2-rc4
         -- model_id, model_type, pred_model
-        -- hivemall v0.5-rc.1 or later
+        -- v0.5.0 or later
         model_id, model_weight, model
       FROM 
         model_rf_07
diff --git a/docs/gitbook/multiclass/iris_randomforest.md b/docs/gitbook/multiclass/iris_randomforest.md
index bfc197f09..a7758eff6 100644
--- a/docs/gitbook/multiclass/iris_randomforest.md
+++ b/docs/gitbook/multiclass/iris_randomforest.md
@@ -94,17 +94,19 @@ CREATE TABLE model
   STORED AS SEQUENCEFILE 
 AS
 select 
-  train_randomforest_classifier(features, label) 
-  -- hivemall v0.4.1-alpha.2 and before
+  train_randomforest_classifier(features, label)
+  -- v0.5.0 and later
+  -- train_randomforest_classifier(features, label) as (model_id, model_weight, model, var_importance, oob_errors, oob_tests)
+  -- v0.4.1-alpha.2 and before
   -- train_randomforest_classifier(features, label) as (pred_model, var_importance, oob_errors, oob_tests)
-  -- hivemall v0.4.1 and later
+  -- from v0.4.1 to v0.4.2-rc4
   -- train_randomforest_classifier(features, label) as (model_id, model_type, pred_model, var_importance, oob_errors, oob_tests)
 from
   training;
 ```
 
 > #### Caution
-> The default `TEXTFILE` should not be used for model table when using Javascript output through `-output javascript` option.
+> Note that the model storage format differs between versions, as shown above.
 
 ```sql
 hive> desc extended model;
@@ -163,7 +165,7 @@ usage: train_randomforest_classifier(array<double|string> features, int
                                      features [default:
                                      ceil(sqrt(x[0].length))].
                                      int(num_variables * x[0].length) is
-                                     considered if num_variable is (0,1
+                                     considered if num_variable is (0,1]
 ```
 
 > #### Caution
@@ -215,19 +217,19 @@ as
 SELECT
   rowid,
   -- rf_ensemble(predicted) as predicted
-  -- hivemall v0.5-rc.1 or later
+  -- v0.5.0 or later
   rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
   -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
 FROM (
   SELECT
     rowid, 
-    -- hivemall v0.4.1 and later
+    -- from v0.4.1 to v0.4.2-rc4
     -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
-    -- hivemall v0.5-rc.1 or later
+    -- v0.5.0 or later
     p.model_weight,
     tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
     -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted
-    -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5-rc.1 or later
+    -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later
   FROM
     model p
     LEFT OUTER JOIN -- CROSS JOIN
@@ -238,8 +240,11 @@ group by
 ;
 ```
 
+> #### Note
+> A left outer join without a join condition (i.e., `model p LEFT OUTER JOIN training t`) is a trick to fix the left table of the cross join.
+
 > #### Caution
-> `tree_predict_v1` is for the backward compatibility for using prediction models built before `v0.5-rc.1` on `v0.5-rc.1` or later.
+> `tree_predict_v1` is for the backward compatibility for using prediction models built before `v0.5.0` on `v0.5.0` or later.
 
 ### Parallelize Prediction
 
@@ -251,37 +256,7 @@ set hive.auto.convert.join=true;
 SET hive.mapjoin.optimized.hashtable=false;
 SET mapred.reduce.tasks=8;
 
-create table predicted
-as
-SELECT
-  rowid,
-  -- rf_ensemble(predicted) as predicted
-  -- hivemall v0.5-rc.1 or later
-  rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
-  -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
-FROM (
-  SELECT
-    t.rowid, 
-    -- hivemall v0.4.1 and later
-    -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
-    -- hivemall v0.5-rc.1 or later
-    p.model_weight,
-    tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
-    -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted
-    -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted as predicted -- to use the old model in v0.5-rc.1 or later
-  FROM (
-    SELECT 
-      -- hivemall v0.4.1 and later
-      -- model_id, model_type, pred_model
-      -- hivemall v0.5-rc.1 or later
-      model_id, model_weight, model
-    FROM model
-    DISTRIBUTE BY rand(1)
-  ) p 
-  LEFT OUTER JOIN training t
-) t1
-group by
-  rowid;
+
 ```
 
 # Evaluation
@@ -295,13 +270,13 @@ select count(1) from training;
 set hivevar:total_cnt=150;
 
 WITH t1 as (
-SELECT
-  t.rowid,
-  t.label as actual,
-  p.predicted.label as predicted
-FROM
-  predicted p
-  LEFT OUTER JOIN training t ON (t.rowid = p.rowid)
+  SELECT
+    t.rowid,
+    t.label as actual,
+    p.predicted.label as predicted
+  FROM
+    predicted p
+    LEFT OUTER JOIN training t ON (t.rowid = p.rowid)
 )
 SELECT
   count(1) / ${total_cnt}
@@ -316,7 +291,7 @@ WHERE
 # Graphvis export
 
 > #### Note
-> `tree_export` feature is supported from Hivemall v0.5-rc.1 or later.
+> `tree_export` feature is supported from Hivemall v0.5.0 or later.
 > Better to limit tree depth on training by `-depth` option to plot a Decision Tree.
 
 Hivemall provide `tree_export` to export a decision tree into [Graphviz](http://www.graphviz.org/) or human-readable Javascript format. You can find the usage by issuing the following query:

From bdfba98a6987cec35d6d3e464b9ae3300b398caf Mon Sep 17 00:00:00 2001
From: Makoto Yui <myui@apache.org>
Date: Tue, 13 Mar 2018 19:42:45 +0900
Subject: [PATCH 3/3] inserted a missing query

---
 docs/gitbook/multiclass/iris_randomforest.md | 33 +++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/docs/gitbook/multiclass/iris_randomforest.md b/docs/gitbook/multiclass/iris_randomforest.md
index a7758eff6..73ea4a362 100644
--- a/docs/gitbook/multiclass/iris_randomforest.md
+++ b/docs/gitbook/multiclass/iris_randomforest.md
@@ -256,7 +256,38 @@ set hive.auto.convert.join=true;
 SET hive.mapjoin.optimized.hashtable=false;
 SET mapred.reduce.tasks=8;
 
-
+drop table predicted;
+create table predicted
+as
+SELECT
+  rowid,
+  -- rf_ensemble(predicted) as predicted
+  -- v0.5.0 or later
+  rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
+  -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight)
+FROM (
+  SELECT
+    t.rowid, 
+    -- from v0.4.1 to v0.4.2-rc4
+    -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
+    -- v0.5.0 or later
+    p.model_weight,
+    tree_predict(p.model_id, p.model, t.features, "-classification") as predicted
+    -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted
+    -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later
+  FROM (
+    SELECT 
+      -- from v0.4.1 to v0.4.2-rc4
+      -- model_id, model_type, pred_model
+      -- v0.5.0 or later
+      model_id, model_weight, model
+    FROM model
+    DISTRIBUTE BY rand(1)
+  ) p 
+  LEFT OUTER JOIN training t
+) t1
+group by
+  rowid;
 ```
 
 # Evaluation