DT/RF: Fix user doc examples

apache · Aug 1, 2018 · 186390f · 186390f
1 parent 1aac377
commit 186390f
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 11 deletions.
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -284,14 +284,17 @@ tree_train(
       <th>impurity_var_importance</th>
       <td>DOUBLE PRECISION[]. Impurity importance of each variable.
       The order of the variables is the same as
-      that of 'independent_varnames' column in the summary table (see below).
+      that of the 'independent_varnames' column in the summary table (see below).
 
       The impurity importance of any feature is the decrease in impurity by a
       node containing the feature as a primary split, summed over the whole
       tree. If surrogates are used, then the importance value includes the
       impurity decrease scaled by the adjusted surrogate agreement.
-      Reported importance values are normalized to sum to 100 across
-      all variables.
+      Importance values are reported as raw values, on the scale of the
+      impurity measure selected by the 'split_criterion' parameter.
+      To see importance values normalized to sum to 100 across
+      all variables, use the importance display helper function
+      described later on this page.
       Please refer to [1] for more information on variable importance.
       </td>
       </tr>
@@ -727,7 +730,7 @@ independent_var_types       | text, boolean, double precision
 n_folds                     | 0
 null_proxy                  |
 </pre>
-View the impurity importance table using the helper function:
+View the normalized impurity importance table using the helper function:
 <pre class="example">
 \\x off
 DROP TABLE IF EXISTS imp_output;
@@ -1111,10 +1114,11 @@ which shows ordering of levels of categorical variables 'vs' and 'cyl':
 SELECT pruning_cp, cat_levels_in_text, cat_n_levels, impurity_var_importance, tree_depth FROM train_output;
 </pre>
 <pre class="result">
+-[ RECORD 1 ]-----------+------------------------------------------------------------------------
 pruning_cp              | 0
 cat_levels_in_text      | {0,1,4,6,8}
 cat_n_levels            | {2,3}
-impurity_var_importance | {0,51.8593201959496,10.976977929129,5.31897402755374,31.8447278473677}
+impurity_var_importance | {0,22.6309172500675,4.79024943310651,2.32115000000003,13.8967382920111}
 tree_depth              | 4
 </pre>
 View the summary table:
@@ -1147,7 +1151,7 @@ independent_var_types       | integer, integer, double precision, double precisi
 n_folds                     | 0
 null_proxy                  |
 </pre>
-View the impurity importance table using the helper function:
+View the normalized impurity importance table using the helper function:
 <pre class="example">
 \\x off
 DROP TABLE IF EXISTS imp_output;

diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
@@ -164,7 +164,9 @@ forest_train(training_table_name,
     Due to the nature of permutation, the importance value can end up being
     negative if the number of levels for a categorical variable is small and is
     unbalanced. In such a scenario, the importance values are shifted to ensure
-    that the lowest importance value is 0.
+    that the lowest importance value is 0. To see importance values normalized
+    to sum to 100 across all variables, use the importance display helper function
+    described later on this page.
 
   </DD>
 
@@ -758,7 +760,7 @@ the variables in 'independent_varnames'
 in <model_table>_summary.
 A higher value means higher importance for the
 variable.  We can use the helper function to
-get a better view of variable importance:
+get a normalized view of variable importance:
 <pre class="example">
 \\x off
 DROP TABLE IF EXISTS imp_output;
@@ -1160,7 +1162,7 @@ oob_error               | 16.5197718747446
 oob_var_importance      | {5.22711111111111,10.0872041666667,9.6875362244898,3.97782,2.99447839506173}
 impurity_var_importance | {5.1269704861111,7.04765974920884,20.9817274159476,4.02800949238769,10.5539079705215}
 </pre>
-Use the helper function to display variable importance:
+Use the helper function to display normalized variable importance:
 <pre class="example">
 \\x off
 DROP TABLE IF EXISTS mt_imp_output;
@@ -1347,14 +1349,14 @@ View the summary table:
 SELECT * FROM train_output_group;
 </pre>
 <pre class='result'>
--[ RECORD 1 ]-----------+-----------------------------------------------------
+-[ RECORD 1 ]-----------+-----------------------------------------
 gid                     | 1
 success                 | t
 cat_n_levels            | {2,2,2}
 cat_levels_in_text      | {US,__NULL__,rainy,__NULL__,NY,__NULL__}
 oob_error               | 1.00000000000000000000
 oob_var_importance      | {0,0,0}
-impurity_var_importance | {32.1752184623349,25.2686155402256,22.5560374792348}
+impurity_var_importance | {0.125,0.0944444444444,0.1836666666667}
 </pre>
 
 -# Predict for data not previously seen by assuming NULL