From 089a4e2162a7b92dd288e5518ca8710f0aeac696 Mon Sep 17 00:00:00 2001
From: Rahul Iyer <riyer@apache.org>
Date: Thu, 26 Jul 2018 12:17:58 -0700
Subject: [PATCH 01/13] DT/RF: Don't eliminate single-level categorical
 variable

JIRA: MADLIB-1258

When DT/RF is run with grouping, a subset of the groups could eliminate
a categorical variable leading to multiple issues downstream, including
invalid importance values and incorrect prediction.

This commit keeps all categorical variables (even if it contains just
one level). This would lead to some inefficiency during tree train,
since the accumulator state would use additional space for this
categorical variable but never use it in a tree. This inefficiency is
still preferred for clean code and error-free prediction/importance
reporting.

Closes #301

Co-authored-by: Nandish Jayaram <njayaram@apache.org>
---
 .../recursive_partitioning/DT_impl.hpp        | 90 ++++++++++---------
 .../recursive_partitioning/decision_tree.cpp  | 20 ++---
 .../decision_tree.py_in                       | 38 ++------
 .../random_forest.py_in                       |  8 +-
 .../test/decision_tree.sql_in                 | 53 +++++------
 .../test/random_forest.sql_in                 | 46 +++++-----
 6 files changed, 121 insertions(+), 134 deletions(-)
diff --git a/src/modules/recursive_partitioning/DT_impl.hpp b/src/modules/recursive_partitioning/DT_impl.hpp
index 69bdc88ea..0eb8751a7 100644
--- a/src/modules/recursive_partitioning/DT_impl.hpp
+++ b/src/modules/recursive_partitioning/DT_impl.hpp
@@ -665,21 +665,29 @@ DecisionTree<Container>::pickSurrogates(
 
             // 1. Compute the max count and corresponding split threshold for
             // each categorical and continuous feature
+
             ColumnVector cat_max_thres = ColumnVector::Zero(n_cats);
             ColumnVector cat_max_count = ColumnVector::Zero(n_cats);
             IntegerVector cat_max_is_reverse = IntegerVector::Zero(n_cats);
             Index prev_cum_levels = 0;
             for (Index each_cat=0; each_cat < n_cats; each_cat++){
                 Index n_levels = state.cat_levels_cumsum(each_cat) - prev_cum_levels;
-                Index max_label;
-                (cat_stats_counts.row(stats_i).segment(
-                    prev_cum_levels * 2, n_levels * 2)).maxCoeff(&max_label);
-                cat_max_thres(each_cat) = static_cast<double>(max_label / 2);
-                cat_max_count(each_cat) =
-                        cat_stats_counts(stats_i, prev_cum_levels*2 + max_label);
-                // every odd col is for reverse, hence i % 2 == 1 for reverse index i
-                cat_max_is_reverse(each_cat) = (max_label % 2 == 1) ? 1 : 0;
-                prev_cum_levels = state.cat_levels_cumsum(each_cat);
+                if (n_levels > 0){
+                    Index max_label;
+                    (cat_stats_counts.row(stats_i).segment(
+                        prev_cum_levels * 2, n_levels * 2)).maxCoeff(&max_label);
+
+                    // For each split, there are two stats => max_label / 2
+                    // gives the split index. There isn't a need to do a floor
+                    // operation, since the threshold will yield the same results
+                    // for n and n+0.5.
+                    cat_max_thres(each_cat) = static_cast<double>(max_label / 2);
+                    cat_max_count(each_cat) =
+                            cat_stats_counts(stats_i, prev_cum_levels*2 + max_label);
+                    // every odd col is for reverse, hence i % 2 == 1 for reverse index i
+                    cat_max_is_reverse(each_cat) = max_label % 2;
+                    prev_cum_levels = state.cat_levels_cumsum(each_cat);
+                }
             }
 
             ColumnVector con_max_thres = ColumnVector::Zero(n_cons);
@@ -800,7 +808,7 @@ DecisionTree<Container>::expand_by_sampling(const Accumulator &state,
             std::random_shuffle(cat_con_feature_indices,
                     cat_con_feature_indices + total_cat_con_features, rvt);
             // if a leaf node exists, compute the gain in impurity for each split
-            // pick split  with maximum gain and update node with split value
+            // pick split with maximum gain and update node with split value
             int max_feat = -1;
             Index max_bin = -1;
             bool max_is_cat = false;
@@ -831,7 +839,6 @@ DecisionTree<Container>::expand_by_sampling(const Accumulator &state,
                         }
                     }
 
-
                 } else { //f >= state.n_cat.features
                     //continuous feature
                     f -= state.n_cat_features;
@@ -854,40 +861,41 @@ DecisionTree<Container>::expand_by_sampling(const Accumulator &state,
                 }
             }
 
-            // Create and update child nodes if splitting current
-            uint64_t true_count = statCount(max_stats.segment(0, sps));
-            uint64_t false_count = statCount(max_stats.segment(sps, sps));
-            uint64_t total_count = statCount(predictions.row(current));
-
-            if (max_impurity_gain > 0 &&
-                    shouldSplit(total_count, true_count, false_count,
+            bool is_leaf_split = FALSE;
+            if (max_impurity_gain > 0){
+                // Create and update child nodes if splitting current
+                uint64_t true_count = statCount(max_stats.segment(0, sps));
+                uint64_t false_count = statCount(max_stats.segment(sps, sps));
+                uint64_t total_count = statCount(predictions.row(current));
+                if (shouldSplit(total_count, true_count, false_count,
                                 min_split, min_bucket, max_depth)) {
 
-                double max_threshold;
-                if (max_is_cat)
-                    max_threshold = static_cast<double>(max_bin);
-                else
-                    max_threshold = con_splits(max_feat, max_bin);
-
-                if (children_not_allocated) {
-                    // allocate the memory for child nodes if not allocated already
-                    incrementInPlace();
-                    children_not_allocated = false;
-                }
-
-                children_wont_split &=
-                    updatePrimarySplit(
-                        current, static_cast<int>(max_feat),
-                        max_threshold, max_is_cat,
-                        min_split,
-                        max_stats.segment(0, sps),   // true_stats
-                        max_stats.segment(sps, sps)  // false_stats
-                    );
+                    is_leaf_split = TRUE;
+                    double max_threshold;
+                    if (max_is_cat)
+                        max_threshold = static_cast<double>(max_bin);
+                    else
+                        max_threshold = con_splits(max_feat, max_bin);
+
+                    if (children_not_allocated) {
+                        // allocate the memory for child nodes if not allocated already
+                        incrementInPlace();
+                        children_not_allocated = false;
+                    }
 
-            } else {
+                    children_wont_split &=
+                        updatePrimarySplit(
+                            current, static_cast<int>(max_feat),
+                            max_threshold, max_is_cat,
+                            min_split,
+                            max_stats.segment(0, sps),   // true_stats
+                            max_stats.segment(sps, sps)  // false_stats
+                        );
+                } // if shouldSplit
+            } //if max_impurity_gain > 0
+            if (not is_leaf_split)
                 feature_indices(current) = FINISHED_LEAF;
-            }
-        } // if leaf exists
+        } // if leaf is in_process
     } // for each leaf
 
     // return true if tree expansion is finished
diff --git a/src/modules/recursive_partitioning/decision_tree.cpp b/src/modules/recursive_partitioning/decision_tree.cpp
index 351fcedb9..49eb349ca 100644
--- a/src/modules/recursive_partitioning/decision_tree.cpp
+++ b/src/modules/recursive_partitioning/decision_tree.cpp
@@ -123,19 +123,21 @@ compute_leaf_stats_transition::run(AnyType & args){
         return args[0];
     }
 
-    // cat_levels size = n_cat_features
+    // cat_levels.size = n_cat_features
     NativeIntegerVector cat_levels;
     if (args[6].isNull()){
         cat_levels.rebind(this->allocateArray<int>(0));
     }
     else {
-        MutableNativeIntegerVector xx_cat = args[6].getAs<MutableNativeIntegerVector>();
-        for (Index i = 0; i < xx_cat.size(); i++)
-            xx_cat[i] -= 1;   // ignore the last level since a split
-                              // like 'var <= last level' would move all rows to
-                              // a one side. Such a split will always be ignored
-                              // when selecting the best split.
-        cat_levels.rebind(xx_cat.memoryHandle(), xx_cat.size());
+        MutableNativeIntegerVector n_levels_per_cat =
+            args[6].getAs<MutableNativeIntegerVector>();
+        for (Index i = 0; i < n_levels_per_cat.size(); i++)
+            n_levels_per_cat[i] -= 1;
+                // ignore the last level since a split
+                // like 'var <= last level' would move all rows to
+                // a one side. Such a split will always be ignored
+                // when selecting the best split.
+        cat_levels.rebind(n_levels_per_cat.memoryHandle(), n_levels_per_cat.size());
     }
 
     // con_splits size = num_con_features x num_bins
@@ -203,7 +205,6 @@ compute_leaf_stats_transition::run(AnyType & args){
             current_sum += cat_levels(i);
             state.cat_levels_cumsum(i) = current_sum;
         }
-
     }
 
     state << MutableLevelState::tuple_type(dt, cat_features, con_features,
@@ -257,7 +258,6 @@ dt_apply::run(AnyType & args){
         return_code = TERMINATED;  // indicates termination due to error
     }
 
-
     AnyType output_tuple;
     output_tuple << dt.storage()
                  << return_code
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 89acd8aa1..cbcb0f6ba 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -288,8 +288,6 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
         tree = _tree_train_using_bins(**locals())
         tree['grp_key'] = ''
         tree['cp'] = grp_key_to_cp[tree['grp_key']]
-        tree['cat_features'] = cat_features
-        tree['con_features'] = con_features
         tree_states = [tree]
     else:
         grouping_array_str = get_grouping_array_str(training_table_name, grouping_cols)
@@ -325,8 +323,6 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
                         tree['cp'] = grp_key_to_cp.values()[0]
                     else:
                         tree['cp'] = grp_key_to_cp[grp_key]
-                    tree['cat_features'] = bins['grp_to_cat_features'][grp_key]
-                    tree['con_features'] = bins['con_features']
 
     # 5) prune the tree using provided 'cp' value and produce a list of
     #   cp values if cross-validation is required (cp_list = [] if not)
@@ -345,7 +341,7 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
 
         importance_vectors = _compute_var_importance(
             schema_madlib, tree,
-            len(tree['cat_features']), len(tree['con_features']))
+            len(cat_features), len(con_features))
         tree.update(**importance_vectors)
 
     return tree_states, bins, dep_list, n_rows, cat_features_info_table
@@ -745,7 +741,6 @@ def _get_bins(schema_madlib, training_table_name,
                     {union_null_proxy}
                 ) s
             ) s1
-            WHERE array_upper(levels, 1) > 1
             """.format(training_table_name=training_table_name,
                        filter_str=filter_str,
                        union_null_proxy=union_null_proxy)
@@ -765,14 +760,6 @@ def _get_bins(schema_madlib, training_table_name,
                 order_fun=expr if col_name in ordered_cat_features else order_fun)
             for col_name, expr in all_col_expressions.items())
         all_levels = plpy.execute(sql_all_cats)
-
-        if len(all_levels) != len(cat_features):
-            use_cat_features = [row['colname'] for row in all_levels]
-            cat_features = [feature for feature in cat_features
-                            if feature in use_cat_features]
-            plpy.warning("Decision tree warning: Categorical columns with only "
-                         "one value are dropped from the tree model.")
-
         col_to_row = dict((row['colname'], i) for i, row in enumerate(all_levels))
 
         return dict(
@@ -992,7 +979,6 @@ def _get_bins_grps(
                     ) s
                     GROUP BY grp_key
                 ) s1
-                where array_upper(levels, 1) > 1
                 """.format(**locals())
 
         all_col_expressions = {}
@@ -1017,13 +1003,6 @@ def _get_bins_grps(
         all_levels = list(plpy.execute(sql_all_cats))
         all_levels.sort(key=itemgetter('grp_key'))
 
-        use_cat_features = set([row['colname'] for row in all_levels])
-        if len(use_cat_features) != len(cat_features):
-            plpy.warning("Decision tree warning: Categorical columns with only "
-                         "one value are dropped from the tree model.")
-            cat_features = [feature for feature in cat_features
-                            if feature in use_cat_features]
-
         # grp_col_to_levels is a list of tuples (pairs) with
         #   first value = group value,
         #   second value = a dict mapping a categorical column to its levels in data
@@ -1079,7 +1058,7 @@ def _get_bins_grps(
                 grp_key_cat=grp_key_cat,
                 grouping_array_str=grouping_array_str,
                 grp_to_col_to_levels=grp_to_col_to_levels,
-                grp_to_cat_features=grp_to_cat_features)
+                )
 # ------------------------------------------------------------
 
 
@@ -1115,7 +1094,9 @@ def _create_cat_features_info_table(cat_features_info_table, bins):
                 cat_levels = [quote_literal(each_level)
                               for sublist in cat_levels
                               for each_level in sublist]
-                cat_levels_str = py_list_to_sql_string(cat_levels, 'text', long_format=True)
+                cat_levels_str = py_list_to_sql_string(cat_levels,
+                                                       'text',
+                                                       long_format=True)
             else:
                 # this is the case if no categorical features present
                 cat_names_str = cat_n_levels_str = cat_levels_str = "NULL"
@@ -1167,10 +1148,9 @@ def get_feature_str(schema_madlib, boolean_cats,
         for col in cat_features:
             if col in boolean_cats:
                 cat_features_cast.append(
-                    "(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)::text")
-            else:
-                cat_features_cast.append(
-                    "(coalesce({0}::text, '{1}'))::text".format(col, null_val))
+                    "(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)")
+            cat_features_cast.append(
+                "(coalesce({0}::text, '{1}'))::text".format(col, null_val))
 
         cat_features_str = ("{0}._map_catlevel_to_int(array[" +
                             ", ".join(cat_features_cast) + "], {1}, {2}, {3})"
@@ -2181,7 +2161,7 @@ def _xvalidate(schema_madlib, tree_states, training_table_name, output_table_nam
                 tree['pruned_depth'] = 0
         importance_vectors = _compute_var_importance(
             schema_madlib, tree,
-            len(tree['cat_features']), len(tree['con_features']))
+            len(cat_features), len(con_features))
         tree.update(**importance_vectors)
 
     plpy.execute("DROP TABLE {group_to_param_list_table}".format(**locals()))
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index 4d7487272..b183bb7c5 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -558,8 +558,6 @@ def forest_train(
                             num_random_features, max_n_surr, null_proxy)
 
                         tree['grp_key'] = ''
-                        tree['cat_features'] = cat_features
-                        tree['con_features'] = con_features
                         if importance:
                             tree.update(_compute_var_importance(
                                 schema_madlib, tree,
@@ -584,14 +582,12 @@ def forest_train(
                         # stop calculating that group further.
                         for tree in tree_states:
                             grp_key = tree['grp_key']
-                            tree['cat_features'] = bins['grp_to_cat_features'][grp_key]
-                            tree['con_features'] = bins['con_features']
                             tree_terminated[grp_key] = tree['finished']
                             if importance:
                                 importance_vectors = _compute_var_importance(
                                     schema_madlib, tree,
-                                    len(tree['cat_features']),
-                                    len(tree['con_features']))
+                                    len(cat_features),
+                                    len(con_features))
                                 tree.update(**importance_vectors)
 
                     _insert_into_result_table(
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
index dee3e329a..f83f5dd68 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
@@ -243,26 +243,27 @@ CREATE TABLE dt_golf (
     "Cont_features" double precision[],
     cat_features text[],
     windy boolean,
+    windy2 boolean,
     class text
 ) ;
 
-INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,"Cont_features",cat_features, windy,class) VALUES
-(1, 'sunny', 85, 85,ARRAY[85, 85], ARRAY['a', 'b'], false, 'Don''t Play'),
-(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['a', 'b'], true, 'Don''t Play'),
-(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['a', 'b'], false, 'Play'),
-(4, 'rain', 70, NULL, ARRAY[70, 96], ARRAY['a', 'b'], false, 'Play'),
-(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['a', 'b'], false, 'Play'),
-(6, 'rain', NULL, 70, ARRAY[65, 70], ARRAY['a', 'b'], true, 'Don''t Play'),
-(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['c', 'b'], NULL , 'Play'),
-(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['a', 'b'], false, 'Don''t Play'),
-(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['a', 'b'], false, 'Play'),
-(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['a', 'b'], false, 'Play'),
-(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['a', 'd'], true, 'Play'),
-(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['c', 'b'], NULL, 'Play'),
-(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
-(15, NULL, 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
-(16, 'overcast', NULL, 75, ARRAY[81, 75], ARRAY['a', 'd'], false, 'Play'),
-(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['c', 'b'], true, 'Don''t Play');
+INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,"Cont_features",cat_features, windy, windy2,class) VALUES
+(1, 'sunny', 85, 85,ARRAY[85, 85], ARRAY['a', 'b'], false, false, 'Don''t Play'),
+(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['a', 'b'], true, false, 'Don''t Play'),
+(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['a', 'b'], false, false, 'Play'),
+(4, 'rain', 70, NULL, ARRAY[70, 96], ARRAY['a', 'b'], false, false, 'Play'),
+(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['a', 'b'], false, false, 'Play'),
+(6, 'rain', NULL, 70, ARRAY[65, 70], ARRAY['a', 'b'], true, false, 'Don''t Play'),
+(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['c', 'b'], NULL, NULL, 'Play'),
+(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['a', 'b'], false, false, 'Don''t Play'),
+(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['a', 'b'], false, false, 'Play'),
+(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['a', 'b'], false, false, 'Play'),
+(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['a', 'd'], true, false, 'Play'),
+(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['c', 'b'], NULL, NULL, 'Play'),
+(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, false, 'Play'),
+(15, NULL, 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, false, 'Play'),
+(16, 'overcast', NULL, 75, ARRAY[81, 75], ARRAY['a', 'd'], false, false, 'Play'),
+(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['c', 'b'], true, false, 'Don''t Play');
 
 update dt_golf set id_2 = id % 2;
 -------------------------------------------------------------------------
@@ -273,7 +274,7 @@ SELECT tree_train('dt_golf'::text,         -- source table
                          'train_output'::text,    -- output model table
                          'id'::text,              -- id column
                          'temperature::double precision'::text,           -- response
-                         'humidity, windy, "Cont_features"'::text,   -- features
+                         'humidity, windy2, "Cont_features"'::text,   -- features
                          NULL::text,        -- exclude columns
                          'gini'::text,      -- split criterion
                          NULL::text,        -- no grouping
@@ -282,13 +283,16 @@ SELECT tree_train('dt_golf'::text,         -- source table
                          6::integer,        -- min split
                          2::integer,        -- min bucket
                          3::integer,        -- number of bins per continuous variable
-                         'cp=0.01, n_folds=2'          -- cost-complexity pruning parameter
+                         'cp=0.01, n_folds=2',          -- cost-complexity pruning parameter
+                         'null_as_category=True'
                          );
 
 SELECT _print_decision_tree(tree) from train_output;
 SELECT tree_display('train_output', False);
 SELECT impurity_var_importance FROM train_output;
+SELECT cat_levels_in_text, cat_n_levels, impurity_var_importance FROM train_output;
 SELECT * FROM train_output_cv;
+SELECT * FROM train_output_summary;
 -------------------------------------------------------------------------
 
 -- grouping
@@ -303,17 +307,16 @@ SELECT tree_train('dt_golf'::text,         -- source table
                          'class'::text,     -- grouping
                          NULL::text,        -- no weights
                          10::integer,       -- max depth
-                         6::integer,        -- min split
-                         2::integer,        -- min bucket
+                         2::integer,        -- min split
+                         1::integer,        -- min bucket
                          3::integer,        -- number of bins per continuous variable
-                         'cp=0.01'          -- cost-complexity pruning parameter
+                         'cp=0',          -- cost-complexity pruning parameter
+                         'max_surrogates=2'
                          );
 
 SELECT _print_decision_tree(tree) from train_output;
 SELECT tree_display('train_output', FALSE);
 SELECT * FROM train_output;
-SELECT tree_display('train_output', False);
-
 
 -- testing tree_predict with a category not present in training table
 CREATE TABLE dt_golf2 as
@@ -321,7 +324,7 @@ SELECT * FROM dt_golf
 UNION
 SELECT 15 as id, 1 as id_2, 'humid' as "OUTLOOK", 71 as temperature, 80 as humidity,
        ARRAY[90, 90] as "Cont_features", ARRAY['b', 'c'] as cat_features,
-       true as windy, 'Don''t Play' as class;
+       true as windy, false as windy2, 'Don''t Play' as class;
 \x off
 SELECT * FROM dt_golf2;
 SELECT tree_predict('train_output', 'dt_golf2', 'predict_output');
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
index ecb24c77f..364e459c4 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
@@ -40,7 +40,7 @@ SELECT forest_train(
                   NULL,        -- exclude columns
                   NULL,        -- no grouping
                   5,                -- num of trees
-                  NULL,                 -- num of random features
+                  2,                 -- num of random features
                   TRUE,    -- importance
                   1,       -- num_permutations
                   10,       -- max depth
@@ -64,13 +64,13 @@ SELECT forest_train(
                   'train_output',    -- output model table
                   'id',              -- id column
                   'temperature::double precision',           -- response
-                  'humidity, cat_features, windy, "Cont_features"',   -- features
+                  'cat_features, windy, "Cont_features"',   -- features
                   NULL,        -- exclude columns
                   'class',          -- grouping
                   5,                -- num of trees
-                  NULL,                 -- num of random features
+                  5,                 -- num of random features
                   TRUE,     -- importance
-                  20,         -- num_permutations
+                  20,       -- num_permutations
                   10,       -- max depth
                   1,        -- min split
                   1,        -- min bucket
@@ -234,7 +234,7 @@ SELECT forest_train(
                   NULL,        -- exclude columns
                   NULL,        -- no grouping
                   5,                -- num of trees
-                  1,                 -- num of random features
+                  NULL,                 -- num of random features
                   TRUE,    -- importance
                   1,       -- num_permutations
                   10,       -- max depth
@@ -287,21 +287,21 @@ INSERT INTO rf_gr_test (id,gr,f1,f2,f3,cl) VALUES
 
 DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
 SELECT forest_train(
-                  'rf_gr_test',         -- source table
-                  'train_output',    -- output model table
-                  'id',              -- id column
-                  'cl',           -- response
-                  'f1, f2',   -- features
-                  NULL,        -- exclude columns
-                  'gr',        -- grouping
-                  2,                -- num of trees
-                  1,                 -- num of random features
-                  TRUE,    -- importance
-                  1,       -- num_permutations
-                  10,       -- max depth
-                  1,        -- min split
-                  1,        -- min bucket
-                  2,        -- number of bins per continuous variable
-                  'max_surrogates=0',
-                  FALSE
-                  );
+    'rf_gr_test',   -- source table
+    'train_output', -- output model table
+    'id',    -- id column
+    'cl',    -- response
+    'f1, f2', -- features
+    NULL,    -- exclude columns
+    'gr',    -- grouping
+    2,       -- num of trees
+    1,       -- num of random features
+    TRUE,    -- importance
+    1,       -- num_permutations
+    10,      -- max depth
+    1,       -- min split
+    1,       -- min bucket
+    2,       -- number of bins per continuous variable
+    'max_surrogates=0',
+    FALSE
+    );

From a1b9f547beb4c45fc037b7d81b5a3330e0df5740 Mon Sep 17 00:00:00 2001
From: Rahul Iyer <riyer@apache.org>
Date: Fri, 27 Jul 2018 13:26:28 -0700
Subject: [PATCH 02/13] Fix bool to text cast mapping

---
 .../decision_tree.py_in                       | 22 +++++++++++--------
 .../random_forest.py_in                       | 12 ++++------
 .../modules/utilities/validate_args.py_in     |  8 ++-----
 3 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index cbcb0f6ba..dd4197d46 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -30,6 +30,7 @@ from utilities.utilities import unique_string
 
 from utilities.validate_args import _get_table_schema_names
 from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import explicit_bool_to_text
 from utilities.validate_args import get_cols
 from utilities.validate_args import get_cols_and_types
 from utilities.validate_args import get_expr_type
@@ -1132,7 +1133,7 @@ def _create_cat_features_info_table(cat_features_info_table, bins):
 # ------------------------------------------------------------------------------
 
 
-def get_feature_str(schema_madlib, boolean_cats,
+def get_feature_str(schema_madlib, source_table,
                     cat_features, con_features,
                     levels_str, n_levels_str,
                     null_proxy=None):
@@ -1144,13 +1145,16 @@ def get_feature_str(schema_madlib, boolean_cats,
         # (1 to N). The unique value will be mapped to -1 indicating an
         # unknown/missing value in the underlying layers.
         null_val = unique_string() if null_proxy is None else null_proxy
+
+        # Cast boolean column to text: requires a special cast expression for
+        # platforms where __HAS_BOOL_TO_TEXT_CAST__ is not enabled
+        patched_cat_features = explicit_bool_to_text(source_table,
+                                                     cat_features,
+                                                     schema_madlib)
         cat_features_cast = []
-        for col in cat_features:
-            if col in boolean_cats:
-                cat_features_cast.append(
-                    "(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)")
+        for col in patched_cat_features:
             cat_features_cast.append(
-                "(coalesce({0}::text, '{1}'))::text".format(col, null_val))
+                "(coalesce(({0})::text, '{1}'))::text".format(col, null_val))
 
         cat_features_str = ("{0}._map_catlevel_to_int(array[" +
                             ", ".join(cat_features_cast) + "], {1}, {2}, {3})"
@@ -1187,7 +1191,7 @@ def _one_step(schema_madlib, training_table_name, cat_features,
     # XXX cat_feature_str contains $5 and $2, and a SQL function
     bytea8 = schema_madlib + '.bytea8'
     cat_features_str, con_features_str = get_feature_str(schema_madlib,
-                                                         boolean_cats,
+                                                         training_table_name,
                                                          cat_features,
                                                          con_features,
                                                          "$3", "$2",
@@ -1283,7 +1287,7 @@ def _one_step_for_grps(
     cat_levels_in_text = unique_string()
 
     cat_features_str, con_features_str = get_feature_str(
-        schema_madlib, boolean_cats, cat_features, con_features,
+        schema_madlib, training_table_name, cat_features, con_features,
         cat_levels_in_text, cat_n_levels, null_proxy)
 
     train_apply_func = """
@@ -1720,7 +1724,7 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
                         if value == 'boolean'])
 
     cat_features_str, con_features_str = get_feature_str(
-        schema_madlib, boolean_cats, cat_features, con_features,
+        schema_madlib, source, cat_features, con_features,
         "m.cat_levels_in_text", "m.cat_n_levels", null_proxy)
 
     if use_existing_tables and table_exists(output):
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index b183bb7c5..ccf1a26f2 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -598,7 +598,7 @@ def forest_train(
                         schema_madlib, output_table_name, cat_features_info_table,
                         con_splits_table, oob_prediction_table, oob_view,
                         sample_id, id_col_name, cat_features, con_features,
-                        boolean_cats, grouping_cols, grp_key_to_grp_cols, dep,
+                        training_table_name, grouping_cols, grp_key_to_grp_cols, dep,
                         num_permutations, is_classification, importance,
                         num_bins, filter_null, null_proxy)
 
@@ -691,12 +691,8 @@ def forest_predict(schema_madlib, model, source, output,
     _assert(is_classification or pred_type == 'response',
             "Random forest error: pred_type cannot be 'prob' for regression model.")
 
-    # find which columns are of type boolean
-    boolean_cats = set([key for key, value in get_cols_and_types(source)
-                        if value == 'boolean'])
-
     cat_features_str, con_features_str = get_feature_str(
-        schema_madlib, boolean_cats, cat_features, con_features,
+        schema_madlib, source, cat_features, con_features,
         "cat_levels_in_text", "cat_n_levels", null_proxy)
 
     pred_name = ('"prob_{0}"' if pred_type == "prob" else
@@ -918,12 +914,12 @@ def get_tree(schema_madlib, model_table, gid, sample_id,
 def _calculate_oob_prediction(
         schema_madlib, model_table, cat_features_info_table, con_splits_table,
         oob_prediction_table, oob_view, sample_id, id_col_name, cat_features,
-        con_features, boolean_cats, grouping_cols, grp_key_to_grp_cols, dep,
+        con_features, source_table, grouping_cols, grp_key_to_grp_cols, dep,
         num_permutations, is_classification, importance, num_bins, filter_null, null_proxy=None):
     """Calculate predication for out-of-bag sample"""
 
     cat_features_str, con_features_str = get_feature_str(
-        schema_madlib, boolean_cats, cat_features, con_features,
+        schema_madlib, source_table, cat_features, con_features,
         "cat_levels_in_text", "cat_n_levels", null_proxy)
 
     join_str = "," if grouping_cols is None else "JOIN"
diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in b/src/ports/postgres/modules/utilities/validate_args.py_in
index 4296491cc..71f42b4bb 100644
--- a/src/ports/postgres/modules/utilities/validate_args.py_in
+++ b/src/ports/postgres/modules/utilities/validate_args.py_in
@@ -518,14 +518,10 @@ def explicit_bool_to_text(tbl, cols, schema_madlib):
     Patch madlib.bool_to_text for columns that are of type boolean.
     """
     m4_ifdef(<!__HAS_BOOL_TO_TEXT_CAST__!>, <!return cols!>, <!!>)
-    col_to_type = dict(get_cols_and_types(tbl))
     patched = []
     for col in cols:
-        if col not in col_to_type:
-            plpy.error("Column ({col}) does not exist "
-                       "in table ({tbl})".format(col=col, tbl=tbl))
-        if col_to_type[col] == 'boolean':
-            patched.append(schema_madlib + ".bool_to_text(" + col + ")")
+        if get_expr_type(col) == 'boolean':
+            patched.append("{0}.bool_to_text({1})".format(schema_madlib, col))
         else:
             patched.append(col)
     return patched

From 3c84b340c6138c9b2fcb790522628c5eed153190 Mon Sep 17 00:00:00 2001
From: Rahul Iyer <riyer@apache.org>
Date: Fri, 27 Jul 2018 15:15:23 -0700
Subject: [PATCH 03/13] Refactor get_expr_type and explicit_bool_to_text

---
 .../modules/utilities/validate_args.py_in     | 35 ++++++++++++-------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in b/src/ports/postgres/modules/utilities/validate_args.py_in
index 71f42b4bb..55444d116 100644
--- a/src/ports/postgres/modules/utilities/validate_args.py_in
+++ b/src/ports/postgres/modules/utilities/validate_args.py_in
@@ -1,7 +1,9 @@
+import collections
 import plpy
 import re
 import string
 
+
 # Postgresql naming restrictions
 """
 Both keywords and identifier names in PostgreSQL have a maximum length limit of
@@ -361,8 +363,8 @@ def get_cols_and_types(tbl):
 # -------------------------------------------------------------------------
 
 
-def get_expr_type(expr, tbl):
-    """ Return the type of an expression run on a given table
+def get_expr_type(expressions, tbl):
+    """ Return the type of a multiple expressions run on a given table
 
     Note: this
     Args:
@@ -371,17 +373,25 @@ def get_expr_type(expr, tbl):
     Returns:
         str
     """
-    expr_type = plpy.execute("""
-        SELECT pg_typeof({0}) AS type
+    if isinstance(expressions, collections.Iterable):
+        expressions = [expressions]
+    pg_type_expressions = ["pg_typeof({0})::text".format(e) for e in expressions]
+    expr_types = plpy.execute("""
+        SELECT {0} as all_types
         FROM {1}
         LIMIT 1
-        """.format(expr, tbl))
-    if not expr_type:
-        plpy.error("Unable to get type of expression ({0}). "
+        """.format("ARRAY[{0}]::TEXT[]".format(','.join(pg_type_expressions)),
+                   tbl))
+    if not expr_types:
+        plpy.error("Unable toget type of expression ({0}). "
                    "Table {1} may not contain any valid tuples".
-                   format(expr, tbl))
-    return expr_type[0]['type'].lower()
-# -------------------------------------------------------------------------
+                   format(expressions, tbl))
+    expr_types = expr_types[0]["all_types"]
+    if len(expr_types) == 1:
+        return expr_types[0]
+    else:
+        return expr_types
+# # -------------------------------------------------------------------------
 
 
 def columns_exist_in_table(tbl, cols, schema_madlib="madlib"):
@@ -519,8 +529,9 @@ def explicit_bool_to_text(tbl, cols, schema_madlib):
     """
     m4_ifdef(<!__HAS_BOOL_TO_TEXT_CAST__!>, <!return cols!>, <!!>)
     patched = []
-    for col in cols:
-        if get_expr_type(col) == 'boolean':
+    col_types = get_expr_type(cols, tbl)
+    for col, col_type in zip(cols, col_types):
+        if col_type == 'boolean':
             patched.append("{0}.bool_to_text({1})".format(schema_madlib, col))
         else:
             patched.append(col)

From 5cb49b82d6b7bb8cca318f3e69c7bb223776684f Mon Sep 17 00:00:00 2001
From: Rahul Iyer <riyer@apache.org>
Date: Fri, 27 Jul 2018 15:34:14 -0700
Subject: [PATCH 04/13] Fix minor issues in get_expr_type

---
 src/ports/postgres/modules/utilities/validate_args.py_in | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in b/src/ports/postgres/modules/utilities/validate_args.py_in
index 55444d116..4f6503218 100644
--- a/src/ports/postgres/modules/utilities/validate_args.py_in
+++ b/src/ports/postgres/modules/utilities/validate_args.py_in
@@ -373,14 +373,14 @@ def get_expr_type(expressions, tbl):
     Returns:
         str
     """
-    if isinstance(expressions, collections.Iterable):
+    if not isinstance(expressions, collections.Iterable):
         expressions = [expressions]
-    pg_type_expressions = ["pg_typeof({0})::text".format(e) for e in expressions]
+    pg_type_expressions = ["pg_typeof({0})".format(e) for e in expressions]
     expr_types = plpy.execute("""
         SELECT {0} as all_types
         FROM {1}
         LIMIT 1
-        """.format("ARRAY[{0}]::TEXT[]".format(','.join(pg_type_expressions)),
+        """.format("ARRAY[{0}]".format(','.join(pg_type_expressions)),
                    tbl))
     if not expr_types:
         plpy.error("Unable toget type of expression ({0}). "

From 35757153390293d4ac778aab6f9e181aea9b2c31 Mon Sep 17 00:00:00 2001
From: Rahul Iyer <riyer@apache.org>
Date: Fri, 27 Jul 2018 16:23:25 -0700
Subject: [PATCH 05/13] Ensure scalar output only if input is scalar

---
 .../modules/utilities/validate_args.py_in        | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in b/src/ports/postgres/modules/utilities/validate_args.py_in
index 4f6503218..cffaea3f6 100644
--- a/src/ports/postgres/modules/utilities/validate_args.py_in
+++ b/src/ports/postgres/modules/utilities/validate_args.py_in
@@ -1,8 +1,8 @@
-import collections
+from collections import Iterable
 import plpy
 import re
 import string
-
+from types import StringTypes
 
 # Postgresql naming restrictions
 """
@@ -373,8 +373,15 @@ def get_expr_type(expressions, tbl):
     Returns:
         str
     """
-    if not isinstance(expressions, collections.Iterable):
+    # FIXME: Below transformation exist to ensure backwards compatibility
+    # Remove this when all callers have been modified to pass an Iterable 'expressions'
+    if (isinstance(expressions, types.StringTypes) or
+            not isinstance(expressions, collections.Iterable)):
         expressions = [expressions]
+        input_was_scalar = True
+    else:
+        input_was_scalar = False
+
     pg_type_expressions = ["pg_typeof({0})".format(e) for e in expressions]
     expr_types = plpy.execute("""
         SELECT {0} as all_types
@@ -387,7 +394,8 @@ def get_expr_type(expressions, tbl):
                    "Table {1} may not contain any valid tuples".
                    format(expressions, tbl))
     expr_types = expr_types[0]["all_types"]
-    if len(expr_types) == 1:
+    if input_was_scalar:
+        # output should be same form as input
         return expr_types[0]
     else:
         return expr_types

From 750c708900631549a948db9222f711606d79a88e Mon Sep 17 00:00:00 2001
From: Rahul Iyer <riyer@apache.org>
Date: Fri, 27 Jul 2018 16:37:49 -0700
Subject: [PATCH 06/13] Fix import error

---
 src/ports/postgres/modules/utilities/validate_args.py_in | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in b/src/ports/postgres/modules/utilities/validate_args.py_in
index cffaea3f6..f7f79e955 100644
--- a/src/ports/postgres/modules/utilities/validate_args.py_in
+++ b/src/ports/postgres/modules/utilities/validate_args.py_in
@@ -375,8 +375,8 @@ def get_expr_type(expressions, tbl):
     """
     # FIXME: Below transformation exist to ensure backwards compatibility
     # Remove this when all callers have been modified to pass an Iterable 'expressions'
-    if (isinstance(expressions, types.StringTypes) or
-            not isinstance(expressions, collections.Iterable)):
+    if (isinstance(expressions, StringTypes) or
+            not isinstance(expressions, Iterable)):
         expressions = [expressions]
         input_was_scalar = True
     else:

From 93bf63fbd8781b1b50be5604c636b74d8cc0ea0b Mon Sep 17 00:00:00 2001
From: Rahul Iyer <riyer@apache.org>
Date: Mon, 30 Jul 2018 16:53:02 -0700
Subject: [PATCH 07/13] DT: Simplify + clean boolean expressions

---
 .../recursive_partitioning/DT_impl.hpp        |  9 +--
 .../recursive_partitioning/decision_tree.cpp  |  3 +-
 .../decision_tree.py_in                       | 59 +++++++------------
 .../test/decision_tree.sql_in                 | 42 +++++++------
 4 files changed, 53 insertions(+), 60 deletions(-)

diff --git a/src/modules/recursive_partitioning/DT_impl.hpp b/src/modules/recursive_partitioning/DT_impl.hpp
index 0eb8751a7..75e4ce4cf 100644
--- a/src/modules/recursive_partitioning/DT_impl.hpp
+++ b/src/modules/recursive_partitioning/DT_impl.hpp
@@ -518,6 +518,7 @@ DecisionTree<Container>::expand(const Accumulator &state,
                     double gain = impurityGain(
                         state.cat_stats.row(stats_i).
                             segment(fv_index, sps * 2), sps);
+
                     if (gain > max_impurity_gain){
                         max_impurity_gain = gain;
                         max_feat = f;
@@ -677,10 +678,10 @@ DecisionTree<Container>::pickSurrogates(
                     (cat_stats_counts.row(stats_i).segment(
                         prev_cum_levels * 2, n_levels * 2)).maxCoeff(&max_label);
 
-                    // For each split, there are two stats => max_label / 2
-                    // gives the split index. There isn't a need to do a floor
-                    // operation, since the threshold will yield the same results
-                    // for n and n+0.5.
+                    // For each split, there are two stats =>
+                    //  max_label / 2 gives the split index. A floor
+                    // operation is unnecessary since the threshold will yield
+                    // the same results for n and n+0.5.
                     cat_max_thres(each_cat) = static_cast<double>(max_label / 2);
                     cat_max_count(each_cat) =
                             cat_stats_counts(stats_i, prev_cum_levels*2 + max_label);
diff --git a/src/modules/recursive_partitioning/decision_tree.cpp b/src/modules/recursive_partitioning/decision_tree.cpp
index 49eb349ca..d2499461c 100644
--- a/src/modules/recursive_partitioning/decision_tree.cpp
+++ b/src/modules/recursive_partitioning/decision_tree.cpp
@@ -131,12 +131,13 @@ compute_leaf_stats_transition::run(AnyType & args){
     else {
         MutableNativeIntegerVector n_levels_per_cat =
             args[6].getAs<MutableNativeIntegerVector>();
-        for (Index i = 0; i < n_levels_per_cat.size(); i++)
+        for (Index i = 0; i < n_levels_per_cat.size(); i++){
             n_levels_per_cat[i] -= 1;
                 // ignore the last level since a split
                 // like 'var <= last level' would move all rows to
                 // a one side. Such a split will always be ignored
                 // when selecting the best split.
+        }
         cat_levels.rebind(n_levels_per_cat.memoryHandle(), n_levels_per_cat.size());
     }
 
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index dd4197d46..6638ad024 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -247,15 +247,15 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
                                               filter_dep)
         dep_list.sort()
         if dep_is_bool:
-            dep_col_str = ("CASE WHEN {0} THEN 'True' ELSE 'False' END".
+            dep_var_str = ("CASE WHEN {0} THEN 0 ELSE 1 END".
                            format(dependent_variable))
+
         else:
-            dep_col_str = dependent_variable
-        dep_var_str = ("(CASE " +
-                       "\n\t\t".join(["WHEN ({dep_col})::text = $${c}$$ THEN {i}".
-                                      format(dep_col=dep_col_str, c=c, i=i)
-                                      for i, c in enumerate(dep_list)]) +
-                       "\nEND)")
+            dep_var_str = ("(CASE " +
+                           "\n\t\t".join(["WHEN ({0})::text = $${1}$$ THEN {2}".
+                                          format(dep_col_str, str(c), i)
+                                          for i, c in enumerate(dep_list)]) +
+                           "\nEND)")
     else:
         if split_criterion.lower().strip() != "mse":
             plpy.warning("Decision tree: Using MSE as split criterion as it "
@@ -354,18 +354,18 @@ def get_grouping_array_str(table_name, grouping_cols, qualifier=None):
     Args:
         @param grouping_cols: list, List of columns used as grouping columns
     """
-    if qualifier:
-        qualifier_str = qualifier + "."
-    else:
-        qualifier_str = ''
+    def _col_to_text(col, col_type):
+        qualifier_str = qualifier + "." if qualifier else ''
+        if col_type.lower() == 'boolean':
+            return "(case when {0} then 'true' else 'false' end)::text".format(col)
+        else:
+            return '({0}{1})::text'.format(qualifier_str, col)
 
     grouping_cols_list = [col.strip() for col in grouping_cols.split(',')]
     grouping_cols_and_types = [(c, get_expr_type(c, table_name))
                                for c in grouping_cols_list]
-    grouping_array_str = 'array_to_string(array[' + \
-        ','.join("(case when " + col + " then 'True' else 'False' end)::text"
-                 if col_type.lower() == 'boolean' else '(' + qualifier_str + col + ')::text'
-                 for col, col_type in grouping_cols_and_types) + "]::text[], ',')"
+    grouping_array_str = "array_to_string(array[{0}]::text[], ',')".format(
+        ','.join(_col_to_text(col, col_type) for col, col_type in grouping_cols_and_types))
     return grouping_array_str
 # ------------------------------------------------------------------------------
 
@@ -697,7 +697,7 @@ def _get_bins(schema_madlib, training_table_name,
     # variable levels to integers, and keep this mapping in the memory.
     if len(cat_features) > 0:
         if is_classification:
-            # For classifications
+            # For classification the dependent variable is encoded as an integer
             order_fun = ("{madlib}._dst_compute_entropy({dep}, {n})".
                          format(madlib=schema_madlib,
                                 dep=dependent_variable,
@@ -746,13 +746,8 @@ def _get_bins(schema_madlib, training_table_name,
                        filter_str=filter_str,
                        union_null_proxy=union_null_proxy)
 
-        all_col_expressions = {}
-        for col in cat_features:
-            if col in boolean_cats:
-                all_col_expressions[col] = ("(CASE WHEN " + col +
-                                            " THEN 'True' ELSE 'False' END)")
-            else:
-                all_col_expressions[col] = col
+        all_col_expressions = dict(zip(cat_features, explicit_bool_to_text(
+            training_table_name, cat_features, schema_madlib)))
 
         sql_all_cats = ' UNION '.join(
             sql_cat_levels.format(
@@ -982,18 +977,8 @@ def _get_bins_grps(
                 ) s1
                 """.format(**locals())
 
-        all_col_expressions = {}
-        for col in cat_features:
-            if col in boolean_cats:
-                all_col_expressions[col] = ("(CASE WHEN " + col +
-                                            " THEN 'True' ELSE 'False' END)")
-            else:
-                if null_proxy is not None:
-                    all_col_expressions[col] = ("COALESCE({0}::TEXT, '{1}')".
-                                                format(col, null_proxy))
-                else:
-                    all_col_expressions[col] = col
-
+        all_col_expressions = dict(zip(cat_features, explicit_bool_to_text(
+            training_table_name, cat_features, schema_madlib)))
         sql_all_cats = ' UNION ALL '.join(
             sql_cat_levels.format(
                 col=expr,
@@ -1761,8 +1746,8 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
     else:
         if dep_type.lower() == "boolean":
             # some platforms don't have text to boolean cast. We manually check the string.
-            dep_cast_str = ("(case {pred_name} when 'True' then "
-                            "True else False end)::BOOLEAN as {pred_name}")
+            dep_cast_str = ("(case {pred_name} when 'true' then true "
+                            "else false end)::BOOLEAN as {pred_name}")
         else:
             dep_cast_str = "{pred_name}::{dep_type}"
         if pred_type == "response":
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
index f83f5dd68..74bc51835 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
@@ -269,6 +269,7 @@ update dt_golf set id_2 = id % 2;
 -------------------------------------------------------------------------
 
 -- no grouping, with cross_validation
+-- also adding a categorical with just a single level (windy2)
 DROP TABLE IF EXISTS train_output, train_output_summary, train_output_cv;
 SELECT tree_train('dt_golf'::text,         -- source table
                          'train_output'::text,    -- output model table
@@ -290,7 +291,6 @@ SELECT tree_train('dt_golf'::text,         -- source table
 SELECT _print_decision_tree(tree) from train_output;
 SELECT tree_display('train_output', False);
 SELECT impurity_var_importance FROM train_output;
-SELECT cat_levels_in_text, cat_n_levels, impurity_var_importance FROM train_output;
 SELECT * FROM train_output_cv;
 SELECT * FROM train_output_summary;
 -------------------------------------------------------------------------
@@ -298,25 +298,32 @@ SELECT * FROM train_output_summary;
 -- grouping
 DROP TABLE IF EXISTS train_output, train_output_summary, predict_output;
 SELECT tree_train('dt_golf'::text,         -- source table
-                         'train_output'::text,    -- output model table
-                         'id'::text,              -- id column
-                         'temperature::double precision'::text,           -- response
-                         '"OUTLOOK", humidity, windy, cat_features'::text,   -- features
-                         NULL::text,        -- exclude columns
-                         'gini'::text,      -- split criterion
-                         'class'::text,     -- grouping
-                         NULL::text,        -- no weights
-                         10::integer,       -- max depth
-                         2::integer,        -- min split
-                         1::integer,        -- min bucket
-                         3::integer,        -- number of bins per continuous variable
-                         'cp=0',          -- cost-complexity pruning parameter
-                         'max_surrogates=2'
-                         );
+                  'train_output'::text,    -- output model table
+                  'id'::text,              -- id column
+                  'temperature::double precision'::text,       -- response
+                  '"OUTLOOK", humidity, windy, cat_features'::text, -- features
+                  NULL::text,        -- exclude columns
+                  'gini'::text,      -- split criterion
+                  'class'::text,     -- grouping
+                  NULL::text,        -- no weights
+                  10::integer,       -- max depth
+                  2::integer,        -- min split
+                  1::integer,        -- min bucket
+                  3::integer,        -- number of bins per contvariable
+                  'cp=0',          -- cost-complexity pruning parameter
+                  'max_surrogates=2'
+                  );
 
 SELECT _print_decision_tree(tree) from train_output;
 SELECT tree_display('train_output', FALSE);
-SELECT * FROM train_output;
+
+-- cat_features[2] has a single level. The cat_n_levels is in order of the
+-- input categorical features.
+-- In this case "OUTLOOL" = 1, windy = 2, cat_features[1] = 3, cat_features[2] = 4
+SELECT assert(cat_n_levels[4] = 1,
+              'Categorical features with single level not being retained.')
+FROM train_output
+WHERE class = E'Don\'t Play';
 
 -- testing tree_predict with a category not present in training table
 CREATE TABLE dt_golf2 as
@@ -335,7 +342,6 @@ JOIN
     dt_golf2
 USING (id);
 \x on
-select * from train_output;
 select * from train_output_summary;
 -------------------------------------------------------------------------
 

From 980d78da47d72bdf4abadbada57522ed6ae481c0 Mon Sep 17 00:00:00 2001
From: Rahul Iyer <riyer@apache.org>
Date: Tue, 31 Jul 2018 10:36:59 -0700
Subject: [PATCH 08/13] Fix minor bug + refactor split_criterion

---
 .../decision_tree.py_in                       | 38 ++++++++++---------
 .../random_forest.py_in                       | 10 ++---
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 6638ad024..7ebfbfd65 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -50,13 +50,6 @@ def _tree_validate_args(
         min_split, min_bucket, n_bins, cp, n_folds, **kwargs):
     """ Validate the arguments
     """
-    if not split_criterion:
-        split_criterion = 'gini'
-    _assert(split_criterion.lower().strip() in ['mse', 'gini', 'cross-entropy',
-                                                'entropy', 'misclass',
-                                                'misclassification'],
-            "Decision tree error: Invalid split_criterion.")
-
     _assert(training_table_name and
             training_table_name.strip().lower() not in ('null', ''),
             "Decision tree error: Invalid data table.")
@@ -111,6 +104,24 @@ def _tree_validate_args(
 # ------------------------------------------------------------
 
 
+def _validate_split_criterion(split_criterion, is_classification):
+    _assert(split_criterion.lower().strip() in ['mse', 'gini', 'cross-entropy',
+                                                'entropy', 'misclass',
+                                                'misclassification'],
+            "Decision tree error: Invalid split_criterion.")
+    if is_classification:
+        if split_criterion.lower().strip() == "mse":
+            plpy.error("Decision tree error: MSE is not a valid "
+                       "split criterion for classification.")
+    else:
+        if split_criterion.lower().strip() != "mse":
+            plpy.warning("Decision tree: Using MSE as split criterion as it "
+                         "is the only one supported for regression trees.")
+        split_criterion = "mse"
+    return split_criterion
+# ------------------------------------------------------------------------------
+
+
 def _get_features_to_use(schema_madlib, training_table_name,
                          list_of_features, list_of_features_to_exclude,
                          id_col_name, weights, dependent_variable,
@@ -236,11 +247,7 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
                             key is ''
     """
     filter_dep = _get_filter_str(dependent_variable, grouping_cols)
-    # 3)
     if is_classification:
-        if split_criterion.lower().strip() == "mse":
-            plpy.error("Decision tree error: MSE is not a valid "
-                       "split criterion for classification.")
         # For classifications, we also need to map dependent_variable to integers
         n_rows, dep_list = _get_n_and_deplist(training_table_name,
                                               dependent_variable,
@@ -253,13 +260,10 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
         else:
             dep_var_str = ("(CASE " +
                            "\n\t\t".join(["WHEN ({0})::text = $${1}$$ THEN {2}".
-                                          format(dep_col_str, str(c), i)
+                                          format(dependent_variable, str(c), i)
                                           for i, c in enumerate(dep_list)]) +
                            "\nEND)")
     else:
-        if split_criterion.lower().strip() != "mse":
-            plpy.warning("Decision tree: Using MSE as split criterion as it "
-                         "is the only one supported for regression trees.")
         n_rows = long(plpy.execute("SELECT count(*)::bigint "
                                    "FROM {src} "
                                    "WHERE {filter}".
@@ -269,7 +273,6 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
         dep_list = []
 
     dep_n_levels = len(dep_list) if dep_list else 1
-
     cat_features_info_table = unique_string()
     if not grouping_cols:   # non-grouping case
         # 3)  Find the splitting bins, one dict containing two arrays:
@@ -292,7 +295,6 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
         tree_states = [tree]
     else:
         grouping_array_str = get_grouping_array_str(training_table_name, grouping_cols)
-
         with OptimizerControl(False):
             # we disable optimizer (ORCA) for platforms that use it
             #  since ORCA doesn't provide an easy way to disable hashagg
@@ -344,7 +346,6 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
             schema_madlib, tree,
             len(cat_features), len(con_features))
         tree.update(**importance_vectors)
-
     return tree_states, bins, dep_list, n_rows, cat_features_info_table
 # -------------------------------------------------------------------------
 
@@ -513,6 +514,7 @@ def tree_train(schema_madlib, training_table_name, output_table_name,
 
         is_classification, dep_is_bool = _is_dep_categorical(
             training_table_name, dependent_variable)
+        split_criterion = _validate_split_criterion(split_criterion, is_classification)
 
         # 4) Build the tree with provided cp value
         compute_cp_list = (n_folds > 1)
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index ccf1a26f2..93524bf54 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -23,7 +23,6 @@ from utilities.utilities import split_quoted_delimited_str
 from utilities.utilities import unique_string
 
 from utilities.validate_args import cols_in_tbl_valid
-from utilities.validate_args import get_cols_and_types
 from utilities.validate_args import get_expr_type
 from utilities.validate_args import input_tbl_valid
 from utilities.validate_args import is_var_valid
@@ -34,7 +33,6 @@ from decision_tree import _tree_train_grps_using_bins
 from decision_tree import _get_bins
 from decision_tree import _get_bins_grps
 from decision_tree import _get_features_to_use
-from decision_tree import _get_dep_type
 from decision_tree import _is_dep_categorical
 from decision_tree import _get_n_and_deplist
 from decision_tree import _classify_features
@@ -1228,10 +1226,10 @@ def _create_summary_table(**kwargs):
         kwargs['dep_list_str'] = "NULL"
 
     kwargs['indep_type'] = ', '.join(kwargs['all_cols_types'][col]
-                                     for col in kwargs['cat_features'] +
-                                                kwargs['con_features'])
-    kwargs['dep_type'] = _get_dep_type(kwargs['training_table_name'],
-                                       kwargs['dependent_variable'])
+                                     for col in (kwargs['cat_features'] +
+                                                 kwargs['con_features']))
+    kwargs['dep_type'] = get_expr_type(kwargs['dependent_variable'],
+                                       kwargs['training_table_name'])
     kwargs['cat_features_str'] = ','.join(kwargs['cat_features'])
     kwargs['con_features_str'] = ','.join(kwargs['con_features'])
     if kwargs['grouping_cols']:

From bd99196613321e6b04bd4b94fa305a6fb77273d8 Mon Sep 17 00:00:00 2001
From: Rahul Iyer <riyer@apache.org>
Date: Tue, 31 Jul 2018 14:13:03 -0700
Subject: [PATCH 09/13] Redo changes lost in previous commit

---
 .../decision_tree.py_in                       | 69 +++++++++++--------
 .../random_forest.py_in                       |  1 -
 2 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 7ebfbfd65..88c72dda0 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -252,9 +252,11 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
         n_rows, dep_list = _get_n_and_deplist(training_table_name,
                                               dependent_variable,
                                               filter_dep)
-        dep_list.sort()
         if dep_is_bool:
-            dep_var_str = ("CASE WHEN {0} THEN 0 ELSE 1 END".
+            # false = 0, true = 1
+            # This order is maintained in dep_list since
+            # _get_n_and_deplist returns a sorted list
+            dep_var_str = ("CASE WHEN {0} THEN 1 ELSE 0 END".
                            format(dependent_variable))
 
         else:
@@ -573,11 +575,14 @@ def _get_n_and_deplist(training_table_name, dependent_variable, filter_null):
     @brief Query the database for the total number of rows and
     levels of dependent variable if the dependent variable is
     categorical.
+
+    Note: The deplist is sorted in the array_agg which is necessary to ensure
+    false = 0 and true = 1 for boolean dependent variable.
     """
     sql = """
         SELECT
             sum(n_rows) as n_rows,
-            array_agg(dep) as dep
+            array_agg(dep ORDER BY dep ASC) as dep
         FROM (
             SELECT
                 count(*) as n_rows,
@@ -1549,14 +1554,19 @@ def _create_summary_table(
     output_table_summary = add_postfix(output_table_name, "_summary")
 
     # dependent variables
+    dep_type = _get_dep_type(training_table_name, dependent_variable)
     if dep_list:
-        dep_list_str = ("$dep_list$" +
-                        ','.join('"{0}"'.format(str(dep)) for dep in dep_list) +
-                        "$dep_list$")
+        if is_psql_boolean_type(dep_type):
+            # Special handling for boolean since Python booleans start with
+            # capitals (i.e False instead of false)
+            # Note:  dep_list is sorted, hence 'false' will be first
+            dep_list_str = "'false, true'"
+        else:
+            dep_list_str = '$__dep_list__$"{0}"$__dep_list__$'.format(
+                ','.join(map(str, dep_list)))
     else:
         dep_list_str = "NULL"
     indep_type = ', '.join(all_cols_types[c] for c in cat_features + con_features)
-    dep_type = _get_dep_type(training_table_name, dependent_variable)
     independent_varnames = ','.join(cat_features + con_features)
     cat_features_str = ','.join(cat_features)
     con_features_str = ','.join(con_features)
@@ -1597,8 +1607,8 @@ def _create_summary_table(
                 {n_rows_skipped}::integer      AS total_rows_skipped,
                 {dep_list_str}::text           AS dependent_var_levels,
                 '{dep_type}'::text             AS dependent_var_type,
-                {cp_str}                       AS input_cp,
                 '{indep_type}'::text           AS independent_var_types,
+                {cp_str}                       AS input_cp,
                 {n_folds}::integer             AS n_folds,
                 {null_proxy_str}::text         AS null_proxy
             """.format(**locals())
@@ -1675,7 +1685,6 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
                             'prob' gives the probability of the classes in a
                              classification tree.
                           For regression tree, only type='response' is defined.
-
     Returns:
         None
 
@@ -1699,9 +1708,9 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
             "that were used during training".format(source))
     id_col_name = summary_elements["id_col_name"]
     dep_varname = summary_elements["dependent_varname"]
-    dep_levels = summary_elements["dependent_var_levels"]
-    is_classification = summary_elements["is_classification"]
+    dep_levels = split_quoted_delimited_str(summary_elements["dependent_var_levels"])
     dep_type = summary_elements['dependent_var_type']
+    is_classification = summary_elements["is_classification"]
     # optional variables, default value is None
     grouping_cols_str = summary_elements.get("grouping_cols")
     null_proxy = summary_elements.get('null_proxy')
@@ -1715,8 +1724,8 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
         "m.cat_levels_in_text", "m.cat_n_levels", null_proxy)
 
     if use_existing_tables and table_exists(output):
-        plpy.execute("truncate " + output)
-        header = "INSERT INTO " + output + " "
+        plpy.execute("TRUNCATE " + output)
+        header = "INSERT INTO " + output
         use_fold = 'WHERE k = ' + str(k)
     else:
         header = "CREATE TABLE " + output + " AS "
@@ -1746,23 +1755,26 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
             {use_fold}
             """
     else:
-        if dep_type.lower() == "boolean":
+        if is_psql_boolean_type(dep_type):
             # some platforms don't have text to boolean cast. We manually check the string.
             dep_cast_str = ("(case {pred_name} when 'true' then true "
-                            "else false end)::BOOLEAN as {pred_name}")
+                            " when 'false' then false "
+                            "end)::BOOLEAN as {pred_name}")
         else:
             dep_cast_str = "{pred_name}::{dep_type}"
+        dep_levels_array_str = py_list_to_sql_string(dep_levels, 'TEXT', long_format=True)
+        plpy.info(dep_levels_array_str)
         if pred_type == "response":
             sql = header + """
                     SELECT
-                        {id_col_name}
-                       , %s
+                        {id_col_name},
+                        %s
                     FROM (
                          SELECT
-                            {id_col_name}
-                           ,
-                           ($sql${{ {dep_levels} }}$sql$::varchar[])[
-                               {schema_madlib}._predict_dt_response (
+                            {id_col_name},
+                           -- _predict_dt_response returns 0-based indexing.
+                           -- Hence the "+ 1" (DB by default uses 1-based indexing)
+                           (%s)[{schema_madlib}._predict_dt_response (
                                    tree,
                                    {cat_features_str}::INTEGER[],
                                    {con_features_str}::DOUBLE PRECISION[]) + 1]::TEXT
@@ -1770,27 +1782,26 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
                         FROM {source} as s {join_str} {model} as m {using_str}
                         {use_fold}
                     ) q
-                """ % (dep_cast_str)
+                """ % (dep_cast_str, dep_levels_array_str)
         else:
-            intermediate_col = unique_string()
+            temp_col = unique_string()
             score_format = ', \n'.join([
-                '{interim}[{j}] as "estimated_prob_{c}"'.
-                format(j=i+1, c=c.strip(' "'), interim=intermediate_col)
-                for i, c in enumerate(split_quoted_delimited_str(dep_levels))])
+                '{0}[{1}] as "estimated_prob_{2}"'.format(temp_col, i, c.strip(' "'))
+                for i, c in enumerate(dep_levels, start=1)])
             sql = header + """
                 SELECT
                     {id_col_name},
-                    {score_format}
+                    %s
                     FROM (
                         SELECT {id_col_name},
                                {schema_madlib}._predict_dt_prob(tree,
                                     {cat_features_str}::INTEGER[],
                                     {con_features_str}::DOUBLE PRECISION[])
-                                        AS {intermediate_col}
+                                        AS {temp_col}
                         FROM {source} as s {join_str} {model} as m {using_str}
                         {use_fold}
                     ) q
-                """
+                """ % (score_format)
     sql = sql.format(**locals())
     with MinWarning('warning'):
         with OptimizerControl(False):
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index 93524bf54..5f3e4f27f 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -353,7 +353,6 @@ def forest_train(
                     _assert(n_rows > 0,
                             "Random forest error: There should be at least one "
                             "data point for each class where all features are non NULL")
-                    dep_list.sort()
                     dep_col_str = ("CASE WHEN " + dependent_variable +
                                    " THEN 'True' ELSE 'False' END") if is_bool else dependent_variable
                     dep = ("(CASE " +

From e06921d18f4c96d06a73833917cdc726bef44dcd Mon Sep 17 00:00:00 2001
From: Rahul Iyer <riyer@apache.org>
Date: Tue, 31 Jul 2018 14:47:21 -0700
Subject: [PATCH 10/13] Repeat DT changes in RF

---
 .../decision_tree.py_in                       |  4 +-
 .../random_forest.py_in                       | 72 +++++++++----------
 2 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 88c72dda0..e9fcf0fe8 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -256,7 +256,7 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
             # false = 0, true = 1
             # This order is maintained in dep_list since
             # _get_n_and_deplist returns a sorted list
-            dep_var_str = ("CASE WHEN {0} THEN 1 ELSE 0 END".
+            dep_var_str = ("(CASE WHEN {0} THEN 1 ELSE 0 END)".
                            format(dependent_variable))
 
         else:
@@ -359,7 +359,7 @@ def get_grouping_array_str(table_name, grouping_cols, qualifier=None):
     """
     def _col_to_text(col, col_type):
         qualifier_str = qualifier + "." if qualifier else ''
-        if col_type.lower() == 'boolean':
+        if is_psql_boolean_type(col_type):
             return "(case when {0} then 'true' else 'false' end)::text".format(col)
         else:
             return '({0}{1})::text'.format(qualifier_str, col)
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index 5f3e4f27f..583f1455b 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -18,6 +18,7 @@ from utilities.control import HashaggControl
 from utilities.utilities import _assert
 from utilities.utilities import add_postfix
 from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import is_psql_boolean_type
 from utilities.utilities import py_list_to_sql_string
 from utilities.utilities import split_quoted_delimited_str
 from utilities.utilities import unique_string
@@ -39,6 +40,7 @@ from decision_tree import _classify_features
 from decision_tree import _get_filter_str
 from decision_tree import _get_display_header
 from decision_tree import get_feature_str
+from decision_tree import get_grouping_array_str
 from decision_tree import _compute_var_importance
 from decision_tree import _create_cat_features_info_table
 # ------------------------------------------------------------
@@ -322,7 +324,7 @@ def forest_train(
                 _assert(bool(features),
                         "Random forest error: No feature is selected for the model.")
 
-                is_classification, is_bool = _is_dep_categorical(
+                is_classification, dep_is_bool = _is_dep_categorical(
                     training_table_name, dependent_variable)
                 split_criterion = 'gini' if is_classification else 'mse'
 
@@ -350,23 +352,26 @@ def forest_train(
                     n_rows, dep_list = _get_n_and_deplist(training_table_name,
                                                           dependent_variable,
                                                           filter_null)
+                    dep_n_levels = len(dep_list)
                     _assert(n_rows > 0,
                             "Random forest error: There should be at least one "
                             "data point for each class where all features are non NULL")
-                    dep_col_str = ("CASE WHEN " + dependent_variable +
-                                   " THEN 'True' ELSE 'False' END") if is_bool else dependent_variable
-                    dep = ("(CASE " +
-                           "\n               ".
-                           join(["WHEN ({dep_col})::text = $${c}$$ THEN {i}".
-                                format(dep_col=dep_col_str, c=c, i=i)
-                                 for i, c in enumerate(dep_list)]) +
-                           "\nEND)")
-                    dep_n_levels = len(dep_list)
+                    if dep_is_bool:
+                        # false = 0, true = 1
+                        # This order is maintained in dep_list since
+                        # _get_n_and_deplist returns a sorted list
+                        dep = ("(CASE WHEN {0} THEN 1 ELSE 0 END)".
+                               format(dependent_variable))
+                    else:
+                        dep = ("(CASE " +
+                               "\n\t\t".join(["WHEN ({0})::text = $${1}$$ THEN {2}".
+                                               format(dependent_variable, c, i)
+                                              for i, c in enumerate(dep_list)]) +
+                               "\nEND)")
                 else:
                     n_rows = plpy.execute(
-                        "SELECT count(*) FROM {source_table} where {filter_null}".
-                        format(source_table=training_table_name,
-                               filter_null=filter_null))[0]['count']
+                        "SELECT count(*) FROM {0} WHERE {1}".
+                        format(training_table_name, filter_null))[0]['count']
                     dep = dependent_variable
                     dep_n_levels = 1
                     dep_list = None
@@ -394,15 +399,8 @@ def forest_train(
                     cat_features = bins['cat_features']
                     bins['grp_key_cat'] = ['']
                 else:
-                    grouping_cols_list = [col.strip() for col in grouping_cols.split(',')]
-                    grouping_cols_and_types = [(col, get_expr_type(col, training_table_name))
-                                               for col in grouping_cols_list]
-                    grouping_array_str = (
-                        "array_to_string(array[" +
-                        ','.join("(case when " + col + " then 'True' else 'False' end)::text"
-                                 if col_type.lower() == 'boolean' else '(' + col + ')::text'
-                                 for col, col_type in grouping_cols_and_types) +
-                        "], ',')")
+                    grouping_array_str = get_grouping_array_str(
+                        training_table_name, grouping_cols)
                     grouping_cols_str = ('' if grouping_cols is None
                                          else grouping_cols + ",")
                     sql_grp_key_to_grp_cols = """
@@ -427,7 +425,8 @@ def forest_train(
                                           con_features, num_bins, dep,
                                           boolean_cats, grouping_cols,
                                           grouping_array_str, n_rows,
-                                          is_classification, dep_n_levels, filter_null, null_proxy)
+                                          is_classification, dep_n_levels,
+                                          filter_null, null_proxy)
                     cat_features = bins['cat_features']
 
                 # a table for getting information of cat features for each group
@@ -679,7 +678,7 @@ def forest_predict(schema_madlib, model, source, output,
     id_col_name = summary_elements["id_col_name"]
     grouping_cols = summary_elements.get("grouping_cols")  # optional, default = None
     dep_varname = summary_elements["dependent_varname"]
-    dep_levels = summary_elements["dependent_var_levels"]
+    dep_levels = split_quoted_delimited_str(summary_elements["dependent_var_levels"])
     is_classification = summary_elements["is_classification"]
     dep_type = summary_elements['dependent_var_type']
     null_proxy = summary_elements.get('null_proxy')  # optional, default = None
@@ -701,17 +700,19 @@ def forest_predict(schema_madlib, model, source, output,
     if not is_classification:
         majority_pred_expression = "AVG(aggregated_prediction)"
     else:
-        majority_pred_expression = """($sql${{ {dep_levels} }}$sql$::varchar[])[
-                                    {schema_madlib}.mode(aggregated_prediction + 1)]::TEXT
-                                    """.format(**locals())
+        majority_pred_expression = (
+            "({0})[{1}.mode(aggregated_prediction + 1)]::TEXT".
+            format(py_list_to_sql_string(dep_levels, 'TEXT', long_format=True),
+                   schema_madlib))
 
-    if dep_type.lower() == "boolean":
+    if is_psql_boolean_type(dep_type):
         # some platforms don't have text to boolean cast. We manually check the string.
-        majority_pred_cast_str = ("(case {majority_pred_expression} when 'True' then "
-                                  "True else False end)::BOOLEAN as {pred_name}")
+        majority_pred_cast_str = ("(case {majority_pred_expression} "
+                                  " when 'true' then true "
+                                  " when 'false' then false "
+                                  " end)::BOOLEAN AS {pred_name}")
     else:
-        majority_pred_cast_str = "{majority_pred_expression}::{dep_type} as {pred_name}"
-
+        majority_pred_cast_str = "({majority_pred_expression})::{dep_type} AS {pred_name}"
     majority_pred_cast_str = majority_pred_cast_str.format(**locals())
     num_trees_grown = plpy.execute(
         "SELECT count(DISTINCT sample_id) FROM {0}".format(model))[0]['count']
@@ -722,8 +723,7 @@ def forest_predict(schema_madlib, model, source, output,
             SELECT
                 {id_col_name},
                 {majority_pred_cast_str}
-            FROM
-            (
+            FROM (
                 SELECT
                     {id_col_name},
                     {schema_madlib}._predict_dt_response(
@@ -742,12 +742,12 @@ def forest_predict(schema_madlib, model, source, output,
             GROUP BY {id_col_name}
         """.format(**locals())
     else:
-        len_dep_levels = len(split_quoted_delimited_str(dep_levels))
+        len_dep_levels = len(dep_levels)
         normalized_majority_pred = unique_string()
         score_format = ', \n'.join([
             '{temp}[{j}] as "estimated_prob_{c}"'.
             format(j=i+1, c=c.strip(' "'), temp=normalized_majority_pred)
-            for i, c in enumerate(split_quoted_delimited_str(dep_levels))])
+            for i, c in enumerate(dep_levels)])
 
         sql_prediction = """
             CREATE TABLE {output} AS

From 966b509ce3de75de587d997fe2a53c4eceeaad95 Mon Sep 17 00:00:00 2001
From: Rahul Iyer <riyer@apache.org>
Date: Tue, 31 Jul 2018 15:00:51 -0700
Subject: [PATCH 11/13] Fix incorrect double quoting

---
 .../modules/recursive_partitioning/decision_tree.py_in      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index e9fcf0fe8..ab19f7ee7 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -37,7 +37,7 @@ from utilities.validate_args import get_expr_type
 from utilities.validate_args import is_var_valid
 from utilities.validate_args import table_is_empty
 from utilities.validate_args import table_exists
-from utilities.validate_args import unquote_ident
+from utilities.validate_args import quote_ident, unquote_ident
 
 from validation.cross_validation import cross_validation_grouping_w_params
 # ------------------------------------------------------------
@@ -1562,8 +1562,8 @@ def _create_summary_table(
             # Note:  dep_list is sorted, hence 'false' will be first
             dep_list_str = "'false, true'"
         else:
-            dep_list_str = '$__dep_list__$"{0}"$__dep_list__$'.format(
-                ','.join(map(str, dep_list)))
+            dep_list_str = '$__dep_list__${0}$__dep_list__$'.format(
+                ','.join(map(quote_ident, dep_list)))
     else:
         dep_list_str = "NULL"
     indep_type = ', '.join(all_cols_types[c] for c in cat_features + con_features)

From b6cdaab38f44dae4c7a1f2f3077f9392838062d7 Mon Sep 17 00:00:00 2001
From: Rahul Iyer <riyer@apache.org>
Date: Tue, 31 Jul 2018 16:21:20 -0700
Subject: [PATCH 12/13] quote_literal instead of quote_ident?

---
 .../decision_tree.py_in                       |  9 ++++----
 .../random_forest.py_in                       | 23 +++++++++++--------
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index ab19f7ee7..1b0e03b74 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -37,7 +37,7 @@ from utilities.validate_args import get_expr_type
 from utilities.validate_args import is_var_valid
 from utilities.validate_args import table_is_empty
 from utilities.validate_args import table_exists
-from utilities.validate_args import quote_ident, unquote_ident
+from utilities.validate_args import unquote_ident
 
 from validation.cross_validation import cross_validation_grouping_w_params
 # ------------------------------------------------------------
@@ -582,7 +582,7 @@ def _get_n_and_deplist(training_table_name, dependent_variable, filter_null):
     sql = """
         SELECT
             sum(n_rows) as n_rows,
-            array_agg(dep ORDER BY dep ASC) as dep
+            array_agg(dep ORDER BY dep) as dep
         FROM (
             SELECT
                 count(*) as n_rows,
@@ -1563,7 +1563,7 @@ def _create_summary_table(
             dep_list_str = "'false, true'"
         else:
             dep_list_str = '$__dep_list__${0}$__dep_list__$'.format(
-                ','.join(map(quote_ident, dep_list)))
+                ','.join(map(str, dep_list)))
     else:
         dep_list_str = "NULL"
     indep_type = ', '.join(all_cols_types[c] for c in cat_features + con_features)
@@ -1762,7 +1762,8 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
                             "end)::BOOLEAN as {pred_name}")
         else:
             dep_cast_str = "{pred_name}::{dep_type}"
-        dep_levels_array_str = py_list_to_sql_string(dep_levels, 'TEXT', long_format=True)
+        dep_levels_array_str = py_list_to_sql_string(map(quote_literal, dep_levels),
+                                                     'TEXT',
         plpy.info(dep_levels_array_str)
         if pred_type == "response":
             sql = header + """
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index 583f1455b..d3b3f09ec 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -700,10 +700,12 @@ def forest_predict(schema_madlib, model, source, output,
     if not is_classification:
         majority_pred_expression = "AVG(aggregated_prediction)"
     else:
+        dep_levels_array_str = py_list_to_sql_string(map(quote_literal, dep_levels),
+                                                     'TEXT',
+                                                     long_format=True)
         majority_pred_expression = (
             "({0})[{1}.mode(aggregated_prediction + 1)]::TEXT".
-            format(py_list_to_sql_string(dep_levels, 'TEXT', long_format=True),
-                   schema_madlib))
+            format(dep_levels_array_str, schema_madlib))
 
     if is_psql_boolean_type(dep_type):
         # some platforms don't have text to boolean cast. We manually check the string.
@@ -1216,19 +1218,22 @@ def _calculate_oob_error(schema_madlib, oob_prediction_table, oob_error_table,
 
 def _create_summary_table(**kwargs):
     kwargs['features'] = ','.join(kwargs['cat_features'] + kwargs['con_features'])
+    kwargs['dep_type'] = get_expr_type(kwargs['dependent_variable'],
+                                       kwargs['training_table_name'])
     if kwargs['dep_list']:
-        kwargs['dep_list_str'] = (
-            "$dep_list$" +
-            ','.join('"{0}"'.format(str(dep)) for dep in kwargs['dep_list']) +
-            "$dep_list$")
+        if is_psql_boolean_type(kwargs['dep_type']):
+            # Special handling for boolean since Python booleans start with
+            # capitals (i.e False instead of false)
+            # Note:  dep_list is sorted, hence 'false' will be first
+            kwargs['dep_list_str'] = "'false, true'"
+        else:
+            kwargs['dep_list_str'] = '$__dep_list__${0}$__dep_list__$'.format(
+                ','.join(map(str, kwargs['dep_list'])))
     else:
         kwargs['dep_list_str'] = "NULL"
-
     kwargs['indep_type'] = ', '.join(kwargs['all_cols_types'][col]
                                      for col in (kwargs['cat_features'] +
                                                  kwargs['con_features']))
-    kwargs['dep_type'] = get_expr_type(kwargs['dependent_variable'],
-                                       kwargs['training_table_name'])
     kwargs['cat_features_str'] = ','.join(kwargs['cat_features'])
     kwargs['con_features_str'] = ','.join(kwargs['con_features'])
     if kwargs['grouping_cols']:

From afed61f977931b4106ef55c8d6728c143bc9cc02 Mon Sep 17 00:00:00 2001
From: Rahul Iyer <riyer@apache.org>
Date: Tue, 31 Jul 2018 21:29:42 -0700
Subject: [PATCH 13/13] Deleted wrong line

---
 .../postgres/modules/recursive_partitioning/decision_tree.py_in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 1b0e03b74..cedfa4800 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -1764,7 +1764,7 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
             dep_cast_str = "{pred_name}::{dep_type}"
         dep_levels_array_str = py_list_to_sql_string(map(quote_literal, dep_levels),
                                                      'TEXT',
-        plpy.info(dep_levels_array_str)
+                                                     long_format=True)
         if pred_type == "response":
             sql = header + """
                     SELECT