From 089a4e2162a7b92dd288e5518ca8710f0aeac696 Mon Sep 17 00:00:00 2001 From: Rahul Iyer Date: Thu, 26 Jul 2018 12:17:58 -0700 Subject: [PATCH 01/13] DT/RF: Don't eliminate single-level categorical variable JIRA: MADLIB-1258 When DT/RF is run with grouping, a subset of the groups could eliminate a categorical variable leading to multiple issues downstream, including invalid importance values and incorrect prediction. This commit keeps all categorical variables (even if it contains just one level). This would lead to some inefficiency during tree train, since the accumulator state would use additional space for this categorical variable but never use it in a tree. This inefficiency is still preferred for clean code and error-free prediction/importance reporting. Closes #301 Co-authored-by: Nandish Jayaram --- .../recursive_partitioning/DT_impl.hpp | 90 ++++++++++--------- .../recursive_partitioning/decision_tree.cpp | 20 ++--- .../decision_tree.py_in | 38 ++------ .../random_forest.py_in | 8 +- .../test/decision_tree.sql_in | 53 +++++------ .../test/random_forest.sql_in | 46 +++++----- 6 files changed, 121 insertions(+), 134 deletions(-) diff --git a/src/modules/recursive_partitioning/DT_impl.hpp b/src/modules/recursive_partitioning/DT_impl.hpp index 69bdc88ea..0eb8751a7 100644 --- a/src/modules/recursive_partitioning/DT_impl.hpp +++ b/src/modules/recursive_partitioning/DT_impl.hpp @@ -665,21 +665,29 @@ DecisionTree::pickSurrogates( // 1. Compute the max count and corresponding split threshold for // each categorical and continuous feature + ColumnVector cat_max_thres = ColumnVector::Zero(n_cats); ColumnVector cat_max_count = ColumnVector::Zero(n_cats); IntegerVector cat_max_is_reverse = IntegerVector::Zero(n_cats); Index prev_cum_levels = 0; for (Index each_cat=0; each_cat < n_cats; each_cat++){ Index n_levels = state.cat_levels_cumsum(each_cat) - prev_cum_levels; - Index max_label; - (cat_stats_counts.row(stats_i).segment( - prev_cum_levels * 2, n_levels * 2)).maxCoeff(&max_label); - cat_max_thres(each_cat) = static_cast(max_label / 2); - cat_max_count(each_cat) = - cat_stats_counts(stats_i, prev_cum_levels*2 + max_label); - // every odd col is for reverse, hence i % 2 == 1 for reverse index i - cat_max_is_reverse(each_cat) = (max_label % 2 == 1) ? 1 : 0; - prev_cum_levels = state.cat_levels_cumsum(each_cat); + if (n_levels > 0){ + Index max_label; + (cat_stats_counts.row(stats_i).segment( + prev_cum_levels * 2, n_levels * 2)).maxCoeff(&max_label); + + // For each split, there are two stats => max_label / 2 + // gives the split index. There isn't a need to do a floor + // operation, since the threshold will yield the same results + // for n and n+0.5. + cat_max_thres(each_cat) = static_cast(max_label / 2); + cat_max_count(each_cat) = + cat_stats_counts(stats_i, prev_cum_levels*2 + max_label); + // every odd col is for reverse, hence i % 2 == 1 for reverse index i + cat_max_is_reverse(each_cat) = max_label % 2; + prev_cum_levels = state.cat_levels_cumsum(each_cat); + } } ColumnVector con_max_thres = ColumnVector::Zero(n_cons); @@ -800,7 +808,7 @@ DecisionTree::expand_by_sampling(const Accumulator &state, std::random_shuffle(cat_con_feature_indices, cat_con_feature_indices + total_cat_con_features, rvt); // if a leaf node exists, compute the gain in impurity for each split - // pick split with maximum gain and update node with split value + // pick split with maximum gain and update node with split value int max_feat = -1; Index max_bin = -1; bool max_is_cat = false; @@ -831,7 +839,6 @@ DecisionTree::expand_by_sampling(const Accumulator &state, } } - } else { //f >= state.n_cat.features //continuous feature f -= state.n_cat_features; @@ -854,40 +861,41 @@ DecisionTree::expand_by_sampling(const Accumulator &state, } } - // Create and update child nodes if splitting current - uint64_t true_count = statCount(max_stats.segment(0, sps)); - uint64_t false_count = statCount(max_stats.segment(sps, sps)); - uint64_t total_count = statCount(predictions.row(current)); - - if (max_impurity_gain > 0 && - shouldSplit(total_count, true_count, false_count, + bool is_leaf_split = FALSE; + if (max_impurity_gain > 0){ + // Create and update child nodes if splitting current + uint64_t true_count = statCount(max_stats.segment(0, sps)); + uint64_t false_count = statCount(max_stats.segment(sps, sps)); + uint64_t total_count = statCount(predictions.row(current)); + if (shouldSplit(total_count, true_count, false_count, min_split, min_bucket, max_depth)) { - double max_threshold; - if (max_is_cat) - max_threshold = static_cast(max_bin); - else - max_threshold = con_splits(max_feat, max_bin); - - if (children_not_allocated) { - // allocate the memory for child nodes if not allocated already - incrementInPlace(); - children_not_allocated = false; - } - - children_wont_split &= - updatePrimarySplit( - current, static_cast(max_feat), - max_threshold, max_is_cat, - min_split, - max_stats.segment(0, sps), // true_stats - max_stats.segment(sps, sps) // false_stats - ); + is_leaf_split = TRUE; + double max_threshold; + if (max_is_cat) + max_threshold = static_cast(max_bin); + else + max_threshold = con_splits(max_feat, max_bin); + + if (children_not_allocated) { + // allocate the memory for child nodes if not allocated already + incrementInPlace(); + children_not_allocated = false; + } - } else { + children_wont_split &= + updatePrimarySplit( + current, static_cast(max_feat), + max_threshold, max_is_cat, + min_split, + max_stats.segment(0, sps), // true_stats + max_stats.segment(sps, sps) // false_stats + ); + } // if shouldSplit + } //if max_impurity_gain > 0 + if (not is_leaf_split) feature_indices(current) = FINISHED_LEAF; - } - } // if leaf exists + } // if leaf is in_process } // for each leaf // return true if tree expansion is finished diff --git a/src/modules/recursive_partitioning/decision_tree.cpp b/src/modules/recursive_partitioning/decision_tree.cpp index 351fcedb9..49eb349ca 100644 --- a/src/modules/recursive_partitioning/decision_tree.cpp +++ b/src/modules/recursive_partitioning/decision_tree.cpp @@ -123,19 +123,21 @@ compute_leaf_stats_transition::run(AnyType & args){ return args[0]; } - // cat_levels size = n_cat_features + // cat_levels.size = n_cat_features NativeIntegerVector cat_levels; if (args[6].isNull()){ cat_levels.rebind(this->allocateArray(0)); } else { - MutableNativeIntegerVector xx_cat = args[6].getAs(); - for (Index i = 0; i < xx_cat.size(); i++) - xx_cat[i] -= 1; // ignore the last level since a split - // like 'var <= last level' would move all rows to - // a one side. Such a split will always be ignored - // when selecting the best split. - cat_levels.rebind(xx_cat.memoryHandle(), xx_cat.size()); + MutableNativeIntegerVector n_levels_per_cat = + args[6].getAs(); + for (Index i = 0; i < n_levels_per_cat.size(); i++) + n_levels_per_cat[i] -= 1; + // ignore the last level since a split + // like 'var <= last level' would move all rows to + // a one side. Such a split will always be ignored + // when selecting the best split. + cat_levels.rebind(n_levels_per_cat.memoryHandle(), n_levels_per_cat.size()); } // con_splits size = num_con_features x num_bins @@ -203,7 +205,6 @@ compute_leaf_stats_transition::run(AnyType & args){ current_sum += cat_levels(i); state.cat_levels_cumsum(i) = current_sum; } - } state << MutableLevelState::tuple_type(dt, cat_features, con_features, @@ -257,7 +258,6 @@ dt_apply::run(AnyType & args){ return_code = TERMINATED; // indicates termination due to error } - AnyType output_tuple; output_tuple << dt.storage() << return_code diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in index 89acd8aa1..cbcb0f6ba 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in @@ -288,8 +288,6 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, tree = _tree_train_using_bins(**locals()) tree['grp_key'] = '' tree['cp'] = grp_key_to_cp[tree['grp_key']] - tree['cat_features'] = cat_features - tree['con_features'] = con_features tree_states = [tree] else: grouping_array_str = get_grouping_array_str(training_table_name, grouping_cols) @@ -325,8 +323,6 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, tree['cp'] = grp_key_to_cp.values()[0] else: tree['cp'] = grp_key_to_cp[grp_key] - tree['cat_features'] = bins['grp_to_cat_features'][grp_key] - tree['con_features'] = bins['con_features'] # 5) prune the tree using provided 'cp' value and produce a list of # cp values if cross-validation is required (cp_list = [] if not) @@ -345,7 +341,7 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, importance_vectors = _compute_var_importance( schema_madlib, tree, - len(tree['cat_features']), len(tree['con_features'])) + len(cat_features), len(con_features)) tree.update(**importance_vectors) return tree_states, bins, dep_list, n_rows, cat_features_info_table @@ -745,7 +741,6 @@ def _get_bins(schema_madlib, training_table_name, {union_null_proxy} ) s ) s1 - WHERE array_upper(levels, 1) > 1 """.format(training_table_name=training_table_name, filter_str=filter_str, union_null_proxy=union_null_proxy) @@ -765,14 +760,6 @@ def _get_bins(schema_madlib, training_table_name, order_fun=expr if col_name in ordered_cat_features else order_fun) for col_name, expr in all_col_expressions.items()) all_levels = plpy.execute(sql_all_cats) - - if len(all_levels) != len(cat_features): - use_cat_features = [row['colname'] for row in all_levels] - cat_features = [feature for feature in cat_features - if feature in use_cat_features] - plpy.warning("Decision tree warning: Categorical columns with only " - "one value are dropped from the tree model.") - col_to_row = dict((row['colname'], i) for i, row in enumerate(all_levels)) return dict( @@ -992,7 +979,6 @@ def _get_bins_grps( ) s GROUP BY grp_key ) s1 - where array_upper(levels, 1) > 1 """.format(**locals()) all_col_expressions = {} @@ -1017,13 +1003,6 @@ def _get_bins_grps( all_levels = list(plpy.execute(sql_all_cats)) all_levels.sort(key=itemgetter('grp_key')) - use_cat_features = set([row['colname'] for row in all_levels]) - if len(use_cat_features) != len(cat_features): - plpy.warning("Decision tree warning: Categorical columns with only " - "one value are dropped from the tree model.") - cat_features = [feature for feature in cat_features - if feature in use_cat_features] - # grp_col_to_levels is a list of tuples (pairs) with # first value = group value, # second value = a dict mapping a categorical column to its levels in data @@ -1079,7 +1058,7 @@ def _get_bins_grps( grp_key_cat=grp_key_cat, grouping_array_str=grouping_array_str, grp_to_col_to_levels=grp_to_col_to_levels, - grp_to_cat_features=grp_to_cat_features) + ) # ------------------------------------------------------------ @@ -1115,7 +1094,9 @@ def _create_cat_features_info_table(cat_features_info_table, bins): cat_levels = [quote_literal(each_level) for sublist in cat_levels for each_level in sublist] - cat_levels_str = py_list_to_sql_string(cat_levels, 'text', long_format=True) + cat_levels_str = py_list_to_sql_string(cat_levels, + 'text', + long_format=True) else: # this is the case if no categorical features present cat_names_str = cat_n_levels_str = cat_levels_str = "NULL" @@ -1167,10 +1148,9 @@ def get_feature_str(schema_madlib, boolean_cats, for col in cat_features: if col in boolean_cats: cat_features_cast.append( - "(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)::text") - else: - cat_features_cast.append( - "(coalesce({0}::text, '{1}'))::text".format(col, null_val)) + "(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)") + cat_features_cast.append( + "(coalesce({0}::text, '{1}'))::text".format(col, null_val)) cat_features_str = ("{0}._map_catlevel_to_int(array[" + ", ".join(cat_features_cast) + "], {1}, {2}, {3})" @@ -2181,7 +2161,7 @@ def _xvalidate(schema_madlib, tree_states, training_table_name, output_table_nam tree['pruned_depth'] = 0 importance_vectors = _compute_var_importance( schema_madlib, tree, - len(tree['cat_features']), len(tree['con_features'])) + len(cat_features), len(con_features)) tree.update(**importance_vectors) plpy.execute("DROP TABLE {group_to_param_list_table}".format(**locals())) diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in index 4d7487272..b183bb7c5 100644 --- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in @@ -558,8 +558,6 @@ def forest_train( num_random_features, max_n_surr, null_proxy) tree['grp_key'] = '' - tree['cat_features'] = cat_features - tree['con_features'] = con_features if importance: tree.update(_compute_var_importance( schema_madlib, tree, @@ -584,14 +582,12 @@ def forest_train( # stop calculating that group further. for tree in tree_states: grp_key = tree['grp_key'] - tree['cat_features'] = bins['grp_to_cat_features'][grp_key] - tree['con_features'] = bins['con_features'] tree_terminated[grp_key] = tree['finished'] if importance: importance_vectors = _compute_var_importance( schema_madlib, tree, - len(tree['cat_features']), - len(tree['con_features'])) + len(cat_features), + len(con_features)) tree.update(**importance_vectors) _insert_into_result_table( diff --git a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in index dee3e329a..f83f5dd68 100644 --- a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in @@ -243,26 +243,27 @@ CREATE TABLE dt_golf ( "Cont_features" double precision[], cat_features text[], windy boolean, + windy2 boolean, class text ) ; -INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,"Cont_features",cat_features, windy,class) VALUES -(1, 'sunny', 85, 85,ARRAY[85, 85], ARRAY['a', 'b'], false, 'Don''t Play'), -(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['a', 'b'], true, 'Don''t Play'), -(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['a', 'b'], false, 'Play'), -(4, 'rain', 70, NULL, ARRAY[70, 96], ARRAY['a', 'b'], false, 'Play'), -(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['a', 'b'], false, 'Play'), -(6, 'rain', NULL, 70, ARRAY[65, 70], ARRAY['a', 'b'], true, 'Don''t Play'), -(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['c', 'b'], NULL , 'Play'), -(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['a', 'b'], false, 'Don''t Play'), -(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['a', 'b'], false, 'Play'), -(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['a', 'b'], false, 'Play'), -(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['a', 'd'], true, 'Play'), -(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['c', 'b'], NULL, 'Play'), -(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'), -(15, NULL, 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'), -(16, 'overcast', NULL, 75, ARRAY[81, 75], ARRAY['a', 'd'], false, 'Play'), -(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['c', 'b'], true, 'Don''t Play'); +INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,"Cont_features",cat_features, windy, windy2,class) VALUES +(1, 'sunny', 85, 85,ARRAY[85, 85], ARRAY['a', 'b'], false, false, 'Don''t Play'), +(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['a', 'b'], true, false, 'Don''t Play'), +(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['a', 'b'], false, false, 'Play'), +(4, 'rain', 70, NULL, ARRAY[70, 96], ARRAY['a', 'b'], false, false, 'Play'), +(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['a', 'b'], false, false, 'Play'), +(6, 'rain', NULL, 70, ARRAY[65, 70], ARRAY['a', 'b'], true, false, 'Don''t Play'), +(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['c', 'b'], NULL, NULL, 'Play'), +(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['a', 'b'], false, false, 'Don''t Play'), +(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['a', 'b'], false, false, 'Play'), +(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['a', 'b'], false, false, 'Play'), +(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['a', 'd'], true, false, 'Play'), +(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['c', 'b'], NULL, NULL, 'Play'), +(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, false, 'Play'), +(15, NULL, 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, false, 'Play'), +(16, 'overcast', NULL, 75, ARRAY[81, 75], ARRAY['a', 'd'], false, false, 'Play'), +(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['c', 'b'], true, false, 'Don''t Play'); update dt_golf set id_2 = id % 2; ------------------------------------------------------------------------- @@ -273,7 +274,7 @@ SELECT tree_train('dt_golf'::text, -- source table 'train_output'::text, -- output model table 'id'::text, -- id column 'temperature::double precision'::text, -- response - 'humidity, windy, "Cont_features"'::text, -- features + 'humidity, windy2, "Cont_features"'::text, -- features NULL::text, -- exclude columns 'gini'::text, -- split criterion NULL::text, -- no grouping @@ -282,13 +283,16 @@ SELECT tree_train('dt_golf'::text, -- source table 6::integer, -- min split 2::integer, -- min bucket 3::integer, -- number of bins per continuous variable - 'cp=0.01, n_folds=2' -- cost-complexity pruning parameter + 'cp=0.01, n_folds=2', -- cost-complexity pruning parameter + 'null_as_category=True' ); SELECT _print_decision_tree(tree) from train_output; SELECT tree_display('train_output', False); SELECT impurity_var_importance FROM train_output; +SELECT cat_levels_in_text, cat_n_levels, impurity_var_importance FROM train_output; SELECT * FROM train_output_cv; +SELECT * FROM train_output_summary; ------------------------------------------------------------------------- -- grouping @@ -303,17 +307,16 @@ SELECT tree_train('dt_golf'::text, -- source table 'class'::text, -- grouping NULL::text, -- no weights 10::integer, -- max depth - 6::integer, -- min split - 2::integer, -- min bucket + 2::integer, -- min split + 1::integer, -- min bucket 3::integer, -- number of bins per continuous variable - 'cp=0.01' -- cost-complexity pruning parameter + 'cp=0', -- cost-complexity pruning parameter + 'max_surrogates=2' ); SELECT _print_decision_tree(tree) from train_output; SELECT tree_display('train_output', FALSE); SELECT * FROM train_output; -SELECT tree_display('train_output', False); - -- testing tree_predict with a category not present in training table CREATE TABLE dt_golf2 as @@ -321,7 +324,7 @@ SELECT * FROM dt_golf UNION SELECT 15 as id, 1 as id_2, 'humid' as "OUTLOOK", 71 as temperature, 80 as humidity, ARRAY[90, 90] as "Cont_features", ARRAY['b', 'c'] as cat_features, - true as windy, 'Don''t Play' as class; + true as windy, false as windy2, 'Don''t Play' as class; \x off SELECT * FROM dt_golf2; SELECT tree_predict('train_output', 'dt_golf2', 'predict_output'); diff --git a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in index ecb24c77f..364e459c4 100644 --- a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in @@ -40,7 +40,7 @@ SELECT forest_train( NULL, -- exclude columns NULL, -- no grouping 5, -- num of trees - NULL, -- num of random features + 2, -- num of random features TRUE, -- importance 1, -- num_permutations 10, -- max depth @@ -64,13 +64,13 @@ SELECT forest_train( 'train_output', -- output model table 'id', -- id column 'temperature::double precision', -- response - 'humidity, cat_features, windy, "Cont_features"', -- features + 'cat_features, windy, "Cont_features"', -- features NULL, -- exclude columns 'class', -- grouping 5, -- num of trees - NULL, -- num of random features + 5, -- num of random features TRUE, -- importance - 20, -- num_permutations + 20, -- num_permutations 10, -- max depth 1, -- min split 1, -- min bucket @@ -234,7 +234,7 @@ SELECT forest_train( NULL, -- exclude columns NULL, -- no grouping 5, -- num of trees - 1, -- num of random features + NULL, -- num of random features TRUE, -- importance 1, -- num_permutations 10, -- max depth @@ -287,21 +287,21 @@ INSERT INTO rf_gr_test (id,gr,f1,f2,f3,cl) VALUES DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group; SELECT forest_train( - 'rf_gr_test', -- source table - 'train_output', -- output model table - 'id', -- id column - 'cl', -- response - 'f1, f2', -- features - NULL, -- exclude columns - 'gr', -- grouping - 2, -- num of trees - 1, -- num of random features - TRUE, -- importance - 1, -- num_permutations - 10, -- max depth - 1, -- min split - 1, -- min bucket - 2, -- number of bins per continuous variable - 'max_surrogates=0', - FALSE - ); + 'rf_gr_test', -- source table + 'train_output', -- output model table + 'id', -- id column + 'cl', -- response + 'f1, f2', -- features + NULL, -- exclude columns + 'gr', -- grouping + 2, -- num of trees + 1, -- num of random features + TRUE, -- importance + 1, -- num_permutations + 10, -- max depth + 1, -- min split + 1, -- min bucket + 2, -- number of bins per continuous variable + 'max_surrogates=0', + FALSE + ); From a1b9f547beb4c45fc037b7d81b5a3330e0df5740 Mon Sep 17 00:00:00 2001 From: Rahul Iyer Date: Fri, 27 Jul 2018 13:26:28 -0700 Subject: [PATCH 02/13] Fix bool to text cast mapping --- .../decision_tree.py_in | 22 +++++++++++-------- .../random_forest.py_in | 12 ++++------ .../modules/utilities/validate_args.py_in | 8 ++----- 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in index cbcb0f6ba..dd4197d46 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in @@ -30,6 +30,7 @@ from utilities.utilities import unique_string from utilities.validate_args import _get_table_schema_names from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import explicit_bool_to_text from utilities.validate_args import get_cols from utilities.validate_args import get_cols_and_types from utilities.validate_args import get_expr_type @@ -1132,7 +1133,7 @@ def _create_cat_features_info_table(cat_features_info_table, bins): # ------------------------------------------------------------------------------ -def get_feature_str(schema_madlib, boolean_cats, +def get_feature_str(schema_madlib, source_table, cat_features, con_features, levels_str, n_levels_str, null_proxy=None): @@ -1144,13 +1145,16 @@ def get_feature_str(schema_madlib, boolean_cats, # (1 to N). The unique value will be mapped to -1 indicating an # unknown/missing value in the underlying layers. null_val = unique_string() if null_proxy is None else null_proxy + + # Cast boolean column to text: requires a special cast expression for + # platforms where __HAS_BOOL_TO_TEXT_CAST__ is not enabled + patched_cat_features = explicit_bool_to_text(source_table, + cat_features, + schema_madlib) cat_features_cast = [] - for col in cat_features: - if col in boolean_cats: - cat_features_cast.append( - "(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)") + for col in patched_cat_features: cat_features_cast.append( - "(coalesce({0}::text, '{1}'))::text".format(col, null_val)) + "(coalesce(({0})::text, '{1}'))::text".format(col, null_val)) cat_features_str = ("{0}._map_catlevel_to_int(array[" + ", ".join(cat_features_cast) + "], {1}, {2}, {3})" @@ -1187,7 +1191,7 @@ def _one_step(schema_madlib, training_table_name, cat_features, # XXX cat_feature_str contains $5 and $2, and a SQL function bytea8 = schema_madlib + '.bytea8' cat_features_str, con_features_str = get_feature_str(schema_madlib, - boolean_cats, + training_table_name, cat_features, con_features, "$3", "$2", @@ -1283,7 +1287,7 @@ def _one_step_for_grps( cat_levels_in_text = unique_string() cat_features_str, con_features_str = get_feature_str( - schema_madlib, boolean_cats, cat_features, con_features, + schema_madlib, training_table_name, cat_features, con_features, cat_levels_in_text, cat_n_levels, null_proxy) train_apply_func = """ @@ -1720,7 +1724,7 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response', if value == 'boolean']) cat_features_str, con_features_str = get_feature_str( - schema_madlib, boolean_cats, cat_features, con_features, + schema_madlib, source, cat_features, con_features, "m.cat_levels_in_text", "m.cat_n_levels", null_proxy) if use_existing_tables and table_exists(output): diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in index b183bb7c5..ccf1a26f2 100644 --- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in @@ -598,7 +598,7 @@ def forest_train( schema_madlib, output_table_name, cat_features_info_table, con_splits_table, oob_prediction_table, oob_view, sample_id, id_col_name, cat_features, con_features, - boolean_cats, grouping_cols, grp_key_to_grp_cols, dep, + training_table_name, grouping_cols, grp_key_to_grp_cols, dep, num_permutations, is_classification, importance, num_bins, filter_null, null_proxy) @@ -691,12 +691,8 @@ def forest_predict(schema_madlib, model, source, output, _assert(is_classification or pred_type == 'response', "Random forest error: pred_type cannot be 'prob' for regression model.") - # find which columns are of type boolean - boolean_cats = set([key for key, value in get_cols_and_types(source) - if value == 'boolean']) - cat_features_str, con_features_str = get_feature_str( - schema_madlib, boolean_cats, cat_features, con_features, + schema_madlib, source, cat_features, con_features, "cat_levels_in_text", "cat_n_levels", null_proxy) pred_name = ('"prob_{0}"' if pred_type == "prob" else @@ -918,12 +914,12 @@ def get_tree(schema_madlib, model_table, gid, sample_id, def _calculate_oob_prediction( schema_madlib, model_table, cat_features_info_table, con_splits_table, oob_prediction_table, oob_view, sample_id, id_col_name, cat_features, - con_features, boolean_cats, grouping_cols, grp_key_to_grp_cols, dep, + con_features, source_table, grouping_cols, grp_key_to_grp_cols, dep, num_permutations, is_classification, importance, num_bins, filter_null, null_proxy=None): """Calculate predication for out-of-bag sample""" cat_features_str, con_features_str = get_feature_str( - schema_madlib, boolean_cats, cat_features, con_features, + schema_madlib, source_table, cat_features, con_features, "cat_levels_in_text", "cat_n_levels", null_proxy) join_str = "," if grouping_cols is None else "JOIN" diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in b/src/ports/postgres/modules/utilities/validate_args.py_in index 4296491cc..71f42b4bb 100644 --- a/src/ports/postgres/modules/utilities/validate_args.py_in +++ b/src/ports/postgres/modules/utilities/validate_args.py_in @@ -518,14 +518,10 @@ def explicit_bool_to_text(tbl, cols, schema_madlib): Patch madlib.bool_to_text for columns that are of type boolean. """ m4_ifdef(, , ) - col_to_type = dict(get_cols_and_types(tbl)) patched = [] for col in cols: - if col not in col_to_type: - plpy.error("Column ({col}) does not exist " - "in table ({tbl})".format(col=col, tbl=tbl)) - if col_to_type[col] == 'boolean': - patched.append(schema_madlib + ".bool_to_text(" + col + ")") + if get_expr_type(col) == 'boolean': + patched.append("{0}.bool_to_text({1})".format(schema_madlib, col)) else: patched.append(col) return patched From 3c84b340c6138c9b2fcb790522628c5eed153190 Mon Sep 17 00:00:00 2001 From: Rahul Iyer Date: Fri, 27 Jul 2018 15:15:23 -0700 Subject: [PATCH 03/13] Refactor get_expr_type and explicit_bool_to_text --- .../modules/utilities/validate_args.py_in | 35 ++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in b/src/ports/postgres/modules/utilities/validate_args.py_in index 71f42b4bb..55444d116 100644 --- a/src/ports/postgres/modules/utilities/validate_args.py_in +++ b/src/ports/postgres/modules/utilities/validate_args.py_in @@ -1,7 +1,9 @@ +import collections import plpy import re import string + # Postgresql naming restrictions """ Both keywords and identifier names in PostgreSQL have a maximum length limit of @@ -361,8 +363,8 @@ def get_cols_and_types(tbl): # ------------------------------------------------------------------------- -def get_expr_type(expr, tbl): - """ Return the type of an expression run on a given table +def get_expr_type(expressions, tbl): + """ Return the type of a multiple expressions run on a given table Note: this Args: @@ -371,17 +373,25 @@ def get_expr_type(expr, tbl): Returns: str """ - expr_type = plpy.execute(""" - SELECT pg_typeof({0}) AS type + if isinstance(expressions, collections.Iterable): + expressions = [expressions] + pg_type_expressions = ["pg_typeof({0})::text".format(e) for e in expressions] + expr_types = plpy.execute(""" + SELECT {0} as all_types FROM {1} LIMIT 1 - """.format(expr, tbl)) - if not expr_type: - plpy.error("Unable to get type of expression ({0}). " + """.format("ARRAY[{0}]::TEXT[]".format(','.join(pg_type_expressions)), + tbl)) + if not expr_types: + plpy.error("Unable toget type of expression ({0}). " "Table {1} may not contain any valid tuples". - format(expr, tbl)) - return expr_type[0]['type'].lower() -# ------------------------------------------------------------------------- + format(expressions, tbl)) + expr_types = expr_types[0]["all_types"] + if len(expr_types) == 1: + return expr_types[0] + else: + return expr_types +# # ------------------------------------------------------------------------- def columns_exist_in_table(tbl, cols, schema_madlib="madlib"): @@ -519,8 +529,9 @@ def explicit_bool_to_text(tbl, cols, schema_madlib): """ m4_ifdef(, , ) patched = [] - for col in cols: - if get_expr_type(col) == 'boolean': + col_types = get_expr_type(cols, tbl) + for col, col_type in zip(cols, col_types): + if col_type == 'boolean': patched.append("{0}.bool_to_text({1})".format(schema_madlib, col)) else: patched.append(col) From 5cb49b82d6b7bb8cca318f3e69c7bb223776684f Mon Sep 17 00:00:00 2001 From: Rahul Iyer Date: Fri, 27 Jul 2018 15:34:14 -0700 Subject: [PATCH 04/13] Fix minor issues in get_expr_type --- src/ports/postgres/modules/utilities/validate_args.py_in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in b/src/ports/postgres/modules/utilities/validate_args.py_in index 55444d116..4f6503218 100644 --- a/src/ports/postgres/modules/utilities/validate_args.py_in +++ b/src/ports/postgres/modules/utilities/validate_args.py_in @@ -373,14 +373,14 @@ def get_expr_type(expressions, tbl): Returns: str """ - if isinstance(expressions, collections.Iterable): + if not isinstance(expressions, collections.Iterable): expressions = [expressions] - pg_type_expressions = ["pg_typeof({0})::text".format(e) for e in expressions] + pg_type_expressions = ["pg_typeof({0})".format(e) for e in expressions] expr_types = plpy.execute(""" SELECT {0} as all_types FROM {1} LIMIT 1 - """.format("ARRAY[{0}]::TEXT[]".format(','.join(pg_type_expressions)), + """.format("ARRAY[{0}]".format(','.join(pg_type_expressions)), tbl)) if not expr_types: plpy.error("Unable toget type of expression ({0}). " From 35757153390293d4ac778aab6f9e181aea9b2c31 Mon Sep 17 00:00:00 2001 From: Rahul Iyer Date: Fri, 27 Jul 2018 16:23:25 -0700 Subject: [PATCH 05/13] Ensure scalar output only if input is scalar --- .../modules/utilities/validate_args.py_in | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in b/src/ports/postgres/modules/utilities/validate_args.py_in index 4f6503218..cffaea3f6 100644 --- a/src/ports/postgres/modules/utilities/validate_args.py_in +++ b/src/ports/postgres/modules/utilities/validate_args.py_in @@ -1,8 +1,8 @@ -import collections +from collections import Iterable import plpy import re import string - +from types import StringTypes # Postgresql naming restrictions """ @@ -373,8 +373,15 @@ def get_expr_type(expressions, tbl): Returns: str """ - if not isinstance(expressions, collections.Iterable): + # FIXME: Below transformation exist to ensure backwards compatibility + # Remove this when all callers have been modified to pass an Iterable 'expressions' + if (isinstance(expressions, types.StringTypes) or + not isinstance(expressions, collections.Iterable)): expressions = [expressions] + input_was_scalar = True + else: + input_was_scalar = False + pg_type_expressions = ["pg_typeof({0})".format(e) for e in expressions] expr_types = plpy.execute(""" SELECT {0} as all_types @@ -387,7 +394,8 @@ def get_expr_type(expressions, tbl): "Table {1} may not contain any valid tuples". format(expressions, tbl)) expr_types = expr_types[0]["all_types"] - if len(expr_types) == 1: + if input_was_scalar: + # output should be same form as input return expr_types[0] else: return expr_types From 750c708900631549a948db9222f711606d79a88e Mon Sep 17 00:00:00 2001 From: Rahul Iyer Date: Fri, 27 Jul 2018 16:37:49 -0700 Subject: [PATCH 06/13] Fix import error --- src/ports/postgres/modules/utilities/validate_args.py_in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in b/src/ports/postgres/modules/utilities/validate_args.py_in index cffaea3f6..f7f79e955 100644 --- a/src/ports/postgres/modules/utilities/validate_args.py_in +++ b/src/ports/postgres/modules/utilities/validate_args.py_in @@ -375,8 +375,8 @@ def get_expr_type(expressions, tbl): """ # FIXME: Below transformation exist to ensure backwards compatibility # Remove this when all callers have been modified to pass an Iterable 'expressions' - if (isinstance(expressions, types.StringTypes) or - not isinstance(expressions, collections.Iterable)): + if (isinstance(expressions, StringTypes) or + not isinstance(expressions, Iterable)): expressions = [expressions] input_was_scalar = True else: From 93bf63fbd8781b1b50be5604c636b74d8cc0ea0b Mon Sep 17 00:00:00 2001 From: Rahul Iyer Date: Mon, 30 Jul 2018 16:53:02 -0700 Subject: [PATCH 07/13] DT: Simplify + clean boolean expressions --- .../recursive_partitioning/DT_impl.hpp | 9 +-- .../recursive_partitioning/decision_tree.cpp | 3 +- .../decision_tree.py_in | 59 +++++++------------ .../test/decision_tree.sql_in | 42 +++++++------ 4 files changed, 53 insertions(+), 60 deletions(-) diff --git a/src/modules/recursive_partitioning/DT_impl.hpp b/src/modules/recursive_partitioning/DT_impl.hpp index 0eb8751a7..75e4ce4cf 100644 --- a/src/modules/recursive_partitioning/DT_impl.hpp +++ b/src/modules/recursive_partitioning/DT_impl.hpp @@ -518,6 +518,7 @@ DecisionTree::expand(const Accumulator &state, double gain = impurityGain( state.cat_stats.row(stats_i). segment(fv_index, sps * 2), sps); + if (gain > max_impurity_gain){ max_impurity_gain = gain; max_feat = f; @@ -677,10 +678,10 @@ DecisionTree::pickSurrogates( (cat_stats_counts.row(stats_i).segment( prev_cum_levels * 2, n_levels * 2)).maxCoeff(&max_label); - // For each split, there are two stats => max_label / 2 - // gives the split index. There isn't a need to do a floor - // operation, since the threshold will yield the same results - // for n and n+0.5. + // For each split, there are two stats => + // max_label / 2 gives the split index. A floor + // operation is unnecessary since the threshold will yield + // the same results for n and n+0.5. cat_max_thres(each_cat) = static_cast(max_label / 2); cat_max_count(each_cat) = cat_stats_counts(stats_i, prev_cum_levels*2 + max_label); diff --git a/src/modules/recursive_partitioning/decision_tree.cpp b/src/modules/recursive_partitioning/decision_tree.cpp index 49eb349ca..d2499461c 100644 --- a/src/modules/recursive_partitioning/decision_tree.cpp +++ b/src/modules/recursive_partitioning/decision_tree.cpp @@ -131,12 +131,13 @@ compute_leaf_stats_transition::run(AnyType & args){ else { MutableNativeIntegerVector n_levels_per_cat = args[6].getAs(); - for (Index i = 0; i < n_levels_per_cat.size(); i++) + for (Index i = 0; i < n_levels_per_cat.size(); i++){ n_levels_per_cat[i] -= 1; // ignore the last level since a split // like 'var <= last level' would move all rows to // a one side. Such a split will always be ignored // when selecting the best split. + } cat_levels.rebind(n_levels_per_cat.memoryHandle(), n_levels_per_cat.size()); } diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in index dd4197d46..6638ad024 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in @@ -247,15 +247,15 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, filter_dep) dep_list.sort() if dep_is_bool: - dep_col_str = ("CASE WHEN {0} THEN 'True' ELSE 'False' END". + dep_var_str = ("CASE WHEN {0} THEN 0 ELSE 1 END". format(dependent_variable)) + else: - dep_col_str = dependent_variable - dep_var_str = ("(CASE " + - "\n\t\t".join(["WHEN ({dep_col})::text = $${c}$$ THEN {i}". - format(dep_col=dep_col_str, c=c, i=i) - for i, c in enumerate(dep_list)]) + - "\nEND)") + dep_var_str = ("(CASE " + + "\n\t\t".join(["WHEN ({0})::text = $${1}$$ THEN {2}". + format(dep_col_str, str(c), i) + for i, c in enumerate(dep_list)]) + + "\nEND)") else: if split_criterion.lower().strip() != "mse": plpy.warning("Decision tree: Using MSE as split criterion as it " @@ -354,18 +354,18 @@ def get_grouping_array_str(table_name, grouping_cols, qualifier=None): Args: @param grouping_cols: list, List of columns used as grouping columns """ - if qualifier: - qualifier_str = qualifier + "." - else: - qualifier_str = '' + def _col_to_text(col, col_type): + qualifier_str = qualifier + "." if qualifier else '' + if col_type.lower() == 'boolean': + return "(case when {0} then 'true' else 'false' end)::text".format(col) + else: + return '({0}{1})::text'.format(qualifier_str, col) grouping_cols_list = [col.strip() for col in grouping_cols.split(',')] grouping_cols_and_types = [(c, get_expr_type(c, table_name)) for c in grouping_cols_list] - grouping_array_str = 'array_to_string(array[' + \ - ','.join("(case when " + col + " then 'True' else 'False' end)::text" - if col_type.lower() == 'boolean' else '(' + qualifier_str + col + ')::text' - for col, col_type in grouping_cols_and_types) + "]::text[], ',')" + grouping_array_str = "array_to_string(array[{0}]::text[], ',')".format( + ','.join(_col_to_text(col, col_type) for col, col_type in grouping_cols_and_types)) return grouping_array_str # ------------------------------------------------------------------------------ @@ -697,7 +697,7 @@ def _get_bins(schema_madlib, training_table_name, # variable levels to integers, and keep this mapping in the memory. if len(cat_features) > 0: if is_classification: - # For classifications + # For classification the dependent variable is encoded as an integer order_fun = ("{madlib}._dst_compute_entropy({dep}, {n})". format(madlib=schema_madlib, dep=dependent_variable, @@ -746,13 +746,8 @@ def _get_bins(schema_madlib, training_table_name, filter_str=filter_str, union_null_proxy=union_null_proxy) - all_col_expressions = {} - for col in cat_features: - if col in boolean_cats: - all_col_expressions[col] = ("(CASE WHEN " + col + - " THEN 'True' ELSE 'False' END)") - else: - all_col_expressions[col] = col + all_col_expressions = dict(zip(cat_features, explicit_bool_to_text( + training_table_name, cat_features, schema_madlib))) sql_all_cats = ' UNION '.join( sql_cat_levels.format( @@ -982,18 +977,8 @@ def _get_bins_grps( ) s1 """.format(**locals()) - all_col_expressions = {} - for col in cat_features: - if col in boolean_cats: - all_col_expressions[col] = ("(CASE WHEN " + col + - " THEN 'True' ELSE 'False' END)") - else: - if null_proxy is not None: - all_col_expressions[col] = ("COALESCE({0}::TEXT, '{1}')". - format(col, null_proxy)) - else: - all_col_expressions[col] = col - + all_col_expressions = dict(zip(cat_features, explicit_bool_to_text( + training_table_name, cat_features, schema_madlib))) sql_all_cats = ' UNION ALL '.join( sql_cat_levels.format( col=expr, @@ -1761,8 +1746,8 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response', else: if dep_type.lower() == "boolean": # some platforms don't have text to boolean cast. We manually check the string. - dep_cast_str = ("(case {pred_name} when 'True' then " - "True else False end)::BOOLEAN as {pred_name}") + dep_cast_str = ("(case {pred_name} when 'true' then true " + "else false end)::BOOLEAN as {pred_name}") else: dep_cast_str = "{pred_name}::{dep_type}" if pred_type == "response": diff --git a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in index f83f5dd68..74bc51835 100644 --- a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in @@ -269,6 +269,7 @@ update dt_golf set id_2 = id % 2; ------------------------------------------------------------------------- -- no grouping, with cross_validation +-- also adding a categorical with just a single level (windy2) DROP TABLE IF EXISTS train_output, train_output_summary, train_output_cv; SELECT tree_train('dt_golf'::text, -- source table 'train_output'::text, -- output model table @@ -290,7 +291,6 @@ SELECT tree_train('dt_golf'::text, -- source table SELECT _print_decision_tree(tree) from train_output; SELECT tree_display('train_output', False); SELECT impurity_var_importance FROM train_output; -SELECT cat_levels_in_text, cat_n_levels, impurity_var_importance FROM train_output; SELECT * FROM train_output_cv; SELECT * FROM train_output_summary; ------------------------------------------------------------------------- @@ -298,25 +298,32 @@ SELECT * FROM train_output_summary; -- grouping DROP TABLE IF EXISTS train_output, train_output_summary, predict_output; SELECT tree_train('dt_golf'::text, -- source table - 'train_output'::text, -- output model table - 'id'::text, -- id column - 'temperature::double precision'::text, -- response - '"OUTLOOK", humidity, windy, cat_features'::text, -- features - NULL::text, -- exclude columns - 'gini'::text, -- split criterion - 'class'::text, -- grouping - NULL::text, -- no weights - 10::integer, -- max depth - 2::integer, -- min split - 1::integer, -- min bucket - 3::integer, -- number of bins per continuous variable - 'cp=0', -- cost-complexity pruning parameter - 'max_surrogates=2' - ); + 'train_output'::text, -- output model table + 'id'::text, -- id column + 'temperature::double precision'::text, -- response + '"OUTLOOK", humidity, windy, cat_features'::text, -- features + NULL::text, -- exclude columns + 'gini'::text, -- split criterion + 'class'::text, -- grouping + NULL::text, -- no weights + 10::integer, -- max depth + 2::integer, -- min split + 1::integer, -- min bucket + 3::integer, -- number of bins per contvariable + 'cp=0', -- cost-complexity pruning parameter + 'max_surrogates=2' + ); SELECT _print_decision_tree(tree) from train_output; SELECT tree_display('train_output', FALSE); -SELECT * FROM train_output; + +-- cat_features[2] has a single level. The cat_n_levels is in order of the +-- input categorical features. +-- In this case "OUTLOOL" = 1, windy = 2, cat_features[1] = 3, cat_features[2] = 4 +SELECT assert(cat_n_levels[4] = 1, + 'Categorical features with single level not being retained.') +FROM train_output +WHERE class = E'Don\'t Play'; -- testing tree_predict with a category not present in training table CREATE TABLE dt_golf2 as @@ -335,7 +342,6 @@ JOIN dt_golf2 USING (id); \x on -select * from train_output; select * from train_output_summary; ------------------------------------------------------------------------- From 980d78da47d72bdf4abadbada57522ed6ae481c0 Mon Sep 17 00:00:00 2001 From: Rahul Iyer Date: Tue, 31 Jul 2018 10:36:59 -0700 Subject: [PATCH 08/13] Fix minor bug + refactor split_criterion --- .../decision_tree.py_in | 38 ++++++++++--------- .../random_forest.py_in | 10 ++--- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in index 6638ad024..7ebfbfd65 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in @@ -50,13 +50,6 @@ def _tree_validate_args( min_split, min_bucket, n_bins, cp, n_folds, **kwargs): """ Validate the arguments """ - if not split_criterion: - split_criterion = 'gini' - _assert(split_criterion.lower().strip() in ['mse', 'gini', 'cross-entropy', - 'entropy', 'misclass', - 'misclassification'], - "Decision tree error: Invalid split_criterion.") - _assert(training_table_name and training_table_name.strip().lower() not in ('null', ''), "Decision tree error: Invalid data table.") @@ -111,6 +104,24 @@ def _tree_validate_args( # ------------------------------------------------------------ +def _validate_split_criterion(split_criterion, is_classification): + _assert(split_criterion.lower().strip() in ['mse', 'gini', 'cross-entropy', + 'entropy', 'misclass', + 'misclassification'], + "Decision tree error: Invalid split_criterion.") + if is_classification: + if split_criterion.lower().strip() == "mse": + plpy.error("Decision tree error: MSE is not a valid " + "split criterion for classification.") + else: + if split_criterion.lower().strip() != "mse": + plpy.warning("Decision tree: Using MSE as split criterion as it " + "is the only one supported for regression trees.") + split_criterion = "mse" + return split_criterion +# ------------------------------------------------------------------------------ + + def _get_features_to_use(schema_madlib, training_table_name, list_of_features, list_of_features_to_exclude, id_col_name, weights, dependent_variable, @@ -236,11 +247,7 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, key is '' """ filter_dep = _get_filter_str(dependent_variable, grouping_cols) - # 3) if is_classification: - if split_criterion.lower().strip() == "mse": - plpy.error("Decision tree error: MSE is not a valid " - "split criterion for classification.") # For classifications, we also need to map dependent_variable to integers n_rows, dep_list = _get_n_and_deplist(training_table_name, dependent_variable, @@ -253,13 +260,10 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, else: dep_var_str = ("(CASE " + "\n\t\t".join(["WHEN ({0})::text = $${1}$$ THEN {2}". - format(dep_col_str, str(c), i) + format(dependent_variable, str(c), i) for i, c in enumerate(dep_list)]) + "\nEND)") else: - if split_criterion.lower().strip() != "mse": - plpy.warning("Decision tree: Using MSE as split criterion as it " - "is the only one supported for regression trees.") n_rows = long(plpy.execute("SELECT count(*)::bigint " "FROM {src} " "WHERE {filter}". @@ -269,7 +273,6 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, dep_list = [] dep_n_levels = len(dep_list) if dep_list else 1 - cat_features_info_table = unique_string() if not grouping_cols: # non-grouping case # 3) Find the splitting bins, one dict containing two arrays: @@ -292,7 +295,6 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, tree_states = [tree] else: grouping_array_str = get_grouping_array_str(training_table_name, grouping_cols) - with OptimizerControl(False): # we disable optimizer (ORCA) for platforms that use it # since ORCA doesn't provide an easy way to disable hashagg @@ -344,7 +346,6 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, schema_madlib, tree, len(cat_features), len(con_features)) tree.update(**importance_vectors) - return tree_states, bins, dep_list, n_rows, cat_features_info_table # ------------------------------------------------------------------------- @@ -513,6 +514,7 @@ def tree_train(schema_madlib, training_table_name, output_table_name, is_classification, dep_is_bool = _is_dep_categorical( training_table_name, dependent_variable) + split_criterion = _validate_split_criterion(split_criterion, is_classification) # 4) Build the tree with provided cp value compute_cp_list = (n_folds > 1) diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in index ccf1a26f2..93524bf54 100644 --- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in @@ -23,7 +23,6 @@ from utilities.utilities import split_quoted_delimited_str from utilities.utilities import unique_string from utilities.validate_args import cols_in_tbl_valid -from utilities.validate_args import get_cols_and_types from utilities.validate_args import get_expr_type from utilities.validate_args import input_tbl_valid from utilities.validate_args import is_var_valid @@ -34,7 +33,6 @@ from decision_tree import _tree_train_grps_using_bins from decision_tree import _get_bins from decision_tree import _get_bins_grps from decision_tree import _get_features_to_use -from decision_tree import _get_dep_type from decision_tree import _is_dep_categorical from decision_tree import _get_n_and_deplist from decision_tree import _classify_features @@ -1228,10 +1226,10 @@ def _create_summary_table(**kwargs): kwargs['dep_list_str'] = "NULL" kwargs['indep_type'] = ', '.join(kwargs['all_cols_types'][col] - for col in kwargs['cat_features'] + - kwargs['con_features']) - kwargs['dep_type'] = _get_dep_type(kwargs['training_table_name'], - kwargs['dependent_variable']) + for col in (kwargs['cat_features'] + + kwargs['con_features'])) + kwargs['dep_type'] = get_expr_type(kwargs['dependent_variable'], + kwargs['training_table_name']) kwargs['cat_features_str'] = ','.join(kwargs['cat_features']) kwargs['con_features_str'] = ','.join(kwargs['con_features']) if kwargs['grouping_cols']: From bd99196613321e6b04bd4b94fa305a6fb77273d8 Mon Sep 17 00:00:00 2001 From: Rahul Iyer Date: Tue, 31 Jul 2018 14:13:03 -0700 Subject: [PATCH 09/13] Redo changes lost in previous commit --- .../decision_tree.py_in | 69 +++++++++++-------- .../random_forest.py_in | 1 - 2 files changed, 40 insertions(+), 30 deletions(-) diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in index 7ebfbfd65..88c72dda0 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in @@ -252,9 +252,11 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, n_rows, dep_list = _get_n_and_deplist(training_table_name, dependent_variable, filter_dep) - dep_list.sort() if dep_is_bool: - dep_var_str = ("CASE WHEN {0} THEN 0 ELSE 1 END". + # false = 0, true = 1 + # This order is maintained in dep_list since + # _get_n_and_deplist returns a sorted list + dep_var_str = ("CASE WHEN {0} THEN 1 ELSE 0 END". format(dependent_variable)) else: @@ -573,11 +575,14 @@ def _get_n_and_deplist(training_table_name, dependent_variable, filter_null): @brief Query the database for the total number of rows and levels of dependent variable if the dependent variable is categorical. + + Note: The deplist is sorted in the array_agg which is necessary to ensure + false = 0 and true = 1 for boolean dependent variable. """ sql = """ SELECT sum(n_rows) as n_rows, - array_agg(dep) as dep + array_agg(dep ORDER BY dep ASC) as dep FROM ( SELECT count(*) as n_rows, @@ -1549,14 +1554,19 @@ def _create_summary_table( output_table_summary = add_postfix(output_table_name, "_summary") # dependent variables + dep_type = _get_dep_type(training_table_name, dependent_variable) if dep_list: - dep_list_str = ("$dep_list$" + - ','.join('"{0}"'.format(str(dep)) for dep in dep_list) + - "$dep_list$") + if is_psql_boolean_type(dep_type): + # Special handling for boolean since Python booleans start with + # capitals (i.e False instead of false) + # Note: dep_list is sorted, hence 'false' will be first + dep_list_str = "'false, true'" + else: + dep_list_str = '$__dep_list__$"{0}"$__dep_list__$'.format( + ','.join(map(str, dep_list))) else: dep_list_str = "NULL" indep_type = ', '.join(all_cols_types[c] for c in cat_features + con_features) - dep_type = _get_dep_type(training_table_name, dependent_variable) independent_varnames = ','.join(cat_features + con_features) cat_features_str = ','.join(cat_features) con_features_str = ','.join(con_features) @@ -1597,8 +1607,8 @@ def _create_summary_table( {n_rows_skipped}::integer AS total_rows_skipped, {dep_list_str}::text AS dependent_var_levels, '{dep_type}'::text AS dependent_var_type, - {cp_str} AS input_cp, '{indep_type}'::text AS independent_var_types, + {cp_str} AS input_cp, {n_folds}::integer AS n_folds, {null_proxy_str}::text AS null_proxy """.format(**locals()) @@ -1675,7 +1685,6 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response', 'prob' gives the probability of the classes in a classification tree. For regression tree, only type='response' is defined. - Returns: None @@ -1699,9 +1708,9 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response', "that were used during training".format(source)) id_col_name = summary_elements["id_col_name"] dep_varname = summary_elements["dependent_varname"] - dep_levels = summary_elements["dependent_var_levels"] - is_classification = summary_elements["is_classification"] + dep_levels = split_quoted_delimited_str(summary_elements["dependent_var_levels"]) dep_type = summary_elements['dependent_var_type'] + is_classification = summary_elements["is_classification"] # optional variables, default value is None grouping_cols_str = summary_elements.get("grouping_cols") null_proxy = summary_elements.get('null_proxy') @@ -1715,8 +1724,8 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response', "m.cat_levels_in_text", "m.cat_n_levels", null_proxy) if use_existing_tables and table_exists(output): - plpy.execute("truncate " + output) - header = "INSERT INTO " + output + " " + plpy.execute("TRUNCATE " + output) + header = "INSERT INTO " + output use_fold = 'WHERE k = ' + str(k) else: header = "CREATE TABLE " + output + " AS " @@ -1746,23 +1755,26 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response', {use_fold} """ else: - if dep_type.lower() == "boolean": + if is_psql_boolean_type(dep_type): # some platforms don't have text to boolean cast. We manually check the string. dep_cast_str = ("(case {pred_name} when 'true' then true " - "else false end)::BOOLEAN as {pred_name}") + " when 'false' then false " + "end)::BOOLEAN as {pred_name}") else: dep_cast_str = "{pred_name}::{dep_type}" + dep_levels_array_str = py_list_to_sql_string(dep_levels, 'TEXT', long_format=True) + plpy.info(dep_levels_array_str) if pred_type == "response": sql = header + """ SELECT - {id_col_name} - , %s + {id_col_name}, + %s FROM ( SELECT - {id_col_name} - , - ($sql${{ {dep_levels} }}$sql$::varchar[])[ - {schema_madlib}._predict_dt_response ( + {id_col_name}, + -- _predict_dt_response returns 0-based indexing. + -- Hence the "+ 1" (DB by default uses 1-based indexing) + (%s)[{schema_madlib}._predict_dt_response ( tree, {cat_features_str}::INTEGER[], {con_features_str}::DOUBLE PRECISION[]) + 1]::TEXT @@ -1770,27 +1782,26 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response', FROM {source} as s {join_str} {model} as m {using_str} {use_fold} ) q - """ % (dep_cast_str) + """ % (dep_cast_str, dep_levels_array_str) else: - intermediate_col = unique_string() + temp_col = unique_string() score_format = ', \n'.join([ - '{interim}[{j}] as "estimated_prob_{c}"'. - format(j=i+1, c=c.strip(' "'), interim=intermediate_col) - for i, c in enumerate(split_quoted_delimited_str(dep_levels))]) + '{0}[{1}] as "estimated_prob_{2}"'.format(temp_col, i, c.strip(' "')) + for i, c in enumerate(dep_levels, start=1)]) sql = header + """ SELECT {id_col_name}, - {score_format} + %s FROM ( SELECT {id_col_name}, {schema_madlib}._predict_dt_prob(tree, {cat_features_str}::INTEGER[], {con_features_str}::DOUBLE PRECISION[]) - AS {intermediate_col} + AS {temp_col} FROM {source} as s {join_str} {model} as m {using_str} {use_fold} ) q - """ + """ % (score_format) sql = sql.format(**locals()) with MinWarning('warning'): with OptimizerControl(False): diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in index 93524bf54..5f3e4f27f 100644 --- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in @@ -353,7 +353,6 @@ def forest_train( _assert(n_rows > 0, "Random forest error: There should be at least one " "data point for each class where all features are non NULL") - dep_list.sort() dep_col_str = ("CASE WHEN " + dependent_variable + " THEN 'True' ELSE 'False' END") if is_bool else dependent_variable dep = ("(CASE " + From e06921d18f4c96d06a73833917cdc726bef44dcd Mon Sep 17 00:00:00 2001 From: Rahul Iyer Date: Tue, 31 Jul 2018 14:47:21 -0700 Subject: [PATCH 10/13] Repeat DT changes in RF --- .../decision_tree.py_in | 4 +- .../random_forest.py_in | 72 +++++++++---------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in index 88c72dda0..e9fcf0fe8 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in @@ -256,7 +256,7 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, # false = 0, true = 1 # This order is maintained in dep_list since # _get_n_and_deplist returns a sorted list - dep_var_str = ("CASE WHEN {0} THEN 1 ELSE 0 END". + dep_var_str = ("(CASE WHEN {0} THEN 1 ELSE 0 END)". format(dependent_variable)) else: @@ -359,7 +359,7 @@ def get_grouping_array_str(table_name, grouping_cols, qualifier=None): """ def _col_to_text(col, col_type): qualifier_str = qualifier + "." if qualifier else '' - if col_type.lower() == 'boolean': + if is_psql_boolean_type(col_type): return "(case when {0} then 'true' else 'false' end)::text".format(col) else: return '({0}{1})::text'.format(qualifier_str, col) diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in index 5f3e4f27f..583f1455b 100644 --- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in @@ -18,6 +18,7 @@ from utilities.control import HashaggControl from utilities.utilities import _assert from utilities.utilities import add_postfix from utilities.utilities import extract_keyvalue_params +from utilities.utilities import is_psql_boolean_type from utilities.utilities import py_list_to_sql_string from utilities.utilities import split_quoted_delimited_str from utilities.utilities import unique_string @@ -39,6 +40,7 @@ from decision_tree import _classify_features from decision_tree import _get_filter_str from decision_tree import _get_display_header from decision_tree import get_feature_str +from decision_tree import get_grouping_array_str from decision_tree import _compute_var_importance from decision_tree import _create_cat_features_info_table # ------------------------------------------------------------ @@ -322,7 +324,7 @@ def forest_train( _assert(bool(features), "Random forest error: No feature is selected for the model.") - is_classification, is_bool = _is_dep_categorical( + is_classification, dep_is_bool = _is_dep_categorical( training_table_name, dependent_variable) split_criterion = 'gini' if is_classification else 'mse' @@ -350,23 +352,26 @@ def forest_train( n_rows, dep_list = _get_n_and_deplist(training_table_name, dependent_variable, filter_null) + dep_n_levels = len(dep_list) _assert(n_rows > 0, "Random forest error: There should be at least one " "data point for each class where all features are non NULL") - dep_col_str = ("CASE WHEN " + dependent_variable + - " THEN 'True' ELSE 'False' END") if is_bool else dependent_variable - dep = ("(CASE " + - "\n ". - join(["WHEN ({dep_col})::text = $${c}$$ THEN {i}". - format(dep_col=dep_col_str, c=c, i=i) - for i, c in enumerate(dep_list)]) + - "\nEND)") - dep_n_levels = len(dep_list) + if dep_is_bool: + # false = 0, true = 1 + # This order is maintained in dep_list since + # _get_n_and_deplist returns a sorted list + dep = ("(CASE WHEN {0} THEN 1 ELSE 0 END)". + format(dependent_variable)) + else: + dep = ("(CASE " + + "\n\t\t".join(["WHEN ({0})::text = $${1}$$ THEN {2}". + format(dependent_variable, c, i) + for i, c in enumerate(dep_list)]) + + "\nEND)") else: n_rows = plpy.execute( - "SELECT count(*) FROM {source_table} where {filter_null}". - format(source_table=training_table_name, - filter_null=filter_null))[0]['count'] + "SELECT count(*) FROM {0} WHERE {1}". + format(training_table_name, filter_null))[0]['count'] dep = dependent_variable dep_n_levels = 1 dep_list = None @@ -394,15 +399,8 @@ def forest_train( cat_features = bins['cat_features'] bins['grp_key_cat'] = [''] else: - grouping_cols_list = [col.strip() for col in grouping_cols.split(',')] - grouping_cols_and_types = [(col, get_expr_type(col, training_table_name)) - for col in grouping_cols_list] - grouping_array_str = ( - "array_to_string(array[" + - ','.join("(case when " + col + " then 'True' else 'False' end)::text" - if col_type.lower() == 'boolean' else '(' + col + ')::text' - for col, col_type in grouping_cols_and_types) + - "], ',')") + grouping_array_str = get_grouping_array_str( + training_table_name, grouping_cols) grouping_cols_str = ('' if grouping_cols is None else grouping_cols + ",") sql_grp_key_to_grp_cols = """ @@ -427,7 +425,8 @@ def forest_train( con_features, num_bins, dep, boolean_cats, grouping_cols, grouping_array_str, n_rows, - is_classification, dep_n_levels, filter_null, null_proxy) + is_classification, dep_n_levels, + filter_null, null_proxy) cat_features = bins['cat_features'] # a table for getting information of cat features for each group @@ -679,7 +678,7 @@ def forest_predict(schema_madlib, model, source, output, id_col_name = summary_elements["id_col_name"] grouping_cols = summary_elements.get("grouping_cols") # optional, default = None dep_varname = summary_elements["dependent_varname"] - dep_levels = summary_elements["dependent_var_levels"] + dep_levels = split_quoted_delimited_str(summary_elements["dependent_var_levels"]) is_classification = summary_elements["is_classification"] dep_type = summary_elements['dependent_var_type'] null_proxy = summary_elements.get('null_proxy') # optional, default = None @@ -701,17 +700,19 @@ def forest_predict(schema_madlib, model, source, output, if not is_classification: majority_pred_expression = "AVG(aggregated_prediction)" else: - majority_pred_expression = """($sql${{ {dep_levels} }}$sql$::varchar[])[ - {schema_madlib}.mode(aggregated_prediction + 1)]::TEXT - """.format(**locals()) + majority_pred_expression = ( + "({0})[{1}.mode(aggregated_prediction + 1)]::TEXT". + format(py_list_to_sql_string(dep_levels, 'TEXT', long_format=True), + schema_madlib)) - if dep_type.lower() == "boolean": + if is_psql_boolean_type(dep_type): # some platforms don't have text to boolean cast. We manually check the string. - majority_pred_cast_str = ("(case {majority_pred_expression} when 'True' then " - "True else False end)::BOOLEAN as {pred_name}") + majority_pred_cast_str = ("(case {majority_pred_expression} " + " when 'true' then true " + " when 'false' then false " + " end)::BOOLEAN AS {pred_name}") else: - majority_pred_cast_str = "{majority_pred_expression}::{dep_type} as {pred_name}" - + majority_pred_cast_str = "({majority_pred_expression})::{dep_type} AS {pred_name}" majority_pred_cast_str = majority_pred_cast_str.format(**locals()) num_trees_grown = plpy.execute( "SELECT count(DISTINCT sample_id) FROM {0}".format(model))[0]['count'] @@ -722,8 +723,7 @@ def forest_predict(schema_madlib, model, source, output, SELECT {id_col_name}, {majority_pred_cast_str} - FROM - ( + FROM ( SELECT {id_col_name}, {schema_madlib}._predict_dt_response( @@ -742,12 +742,12 @@ def forest_predict(schema_madlib, model, source, output, GROUP BY {id_col_name} """.format(**locals()) else: - len_dep_levels = len(split_quoted_delimited_str(dep_levels)) + len_dep_levels = len(dep_levels) normalized_majority_pred = unique_string() score_format = ', \n'.join([ '{temp}[{j}] as "estimated_prob_{c}"'. format(j=i+1, c=c.strip(' "'), temp=normalized_majority_pred) - for i, c in enumerate(split_quoted_delimited_str(dep_levels))]) + for i, c in enumerate(dep_levels)]) sql_prediction = """ CREATE TABLE {output} AS From 966b509ce3de75de587d997fe2a53c4eceeaad95 Mon Sep 17 00:00:00 2001 From: Rahul Iyer Date: Tue, 31 Jul 2018 15:00:51 -0700 Subject: [PATCH 11/13] Fix incorrect double quoting --- .../modules/recursive_partitioning/decision_tree.py_in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in index e9fcf0fe8..ab19f7ee7 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in @@ -37,7 +37,7 @@ from utilities.validate_args import get_expr_type from utilities.validate_args import is_var_valid from utilities.validate_args import table_is_empty from utilities.validate_args import table_exists -from utilities.validate_args import unquote_ident +from utilities.validate_args import quote_ident, unquote_ident from validation.cross_validation import cross_validation_grouping_w_params # ------------------------------------------------------------ @@ -1562,8 +1562,8 @@ def _create_summary_table( # Note: dep_list is sorted, hence 'false' will be first dep_list_str = "'false, true'" else: - dep_list_str = '$__dep_list__$"{0}"$__dep_list__$'.format( - ','.join(map(str, dep_list))) + dep_list_str = '$__dep_list__${0}$__dep_list__$'.format( + ','.join(map(quote_ident, dep_list))) else: dep_list_str = "NULL" indep_type = ', '.join(all_cols_types[c] for c in cat_features + con_features) From b6cdaab38f44dae4c7a1f2f3077f9392838062d7 Mon Sep 17 00:00:00 2001 From: Rahul Iyer Date: Tue, 31 Jul 2018 16:21:20 -0700 Subject: [PATCH 12/13] quote_literal instead of quote_ident? --- .../decision_tree.py_in | 9 ++++---- .../random_forest.py_in | 23 +++++++++++-------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in index ab19f7ee7..1b0e03b74 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in @@ -37,7 +37,7 @@ from utilities.validate_args import get_expr_type from utilities.validate_args import is_var_valid from utilities.validate_args import table_is_empty from utilities.validate_args import table_exists -from utilities.validate_args import quote_ident, unquote_ident +from utilities.validate_args import unquote_ident from validation.cross_validation import cross_validation_grouping_w_params # ------------------------------------------------------------ @@ -582,7 +582,7 @@ def _get_n_and_deplist(training_table_name, dependent_variable, filter_null): sql = """ SELECT sum(n_rows) as n_rows, - array_agg(dep ORDER BY dep ASC) as dep + array_agg(dep ORDER BY dep) as dep FROM ( SELECT count(*) as n_rows, @@ -1563,7 +1563,7 @@ def _create_summary_table( dep_list_str = "'false, true'" else: dep_list_str = '$__dep_list__${0}$__dep_list__$'.format( - ','.join(map(quote_ident, dep_list))) + ','.join(map(str, dep_list))) else: dep_list_str = "NULL" indep_type = ', '.join(all_cols_types[c] for c in cat_features + con_features) @@ -1762,7 +1762,8 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response', "end)::BOOLEAN as {pred_name}") else: dep_cast_str = "{pred_name}::{dep_type}" - dep_levels_array_str = py_list_to_sql_string(dep_levels, 'TEXT', long_format=True) + dep_levels_array_str = py_list_to_sql_string(map(quote_literal, dep_levels), + 'TEXT', plpy.info(dep_levels_array_str) if pred_type == "response": sql = header + """ diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in index 583f1455b..d3b3f09ec 100644 --- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in @@ -700,10 +700,12 @@ def forest_predict(schema_madlib, model, source, output, if not is_classification: majority_pred_expression = "AVG(aggregated_prediction)" else: + dep_levels_array_str = py_list_to_sql_string(map(quote_literal, dep_levels), + 'TEXT', + long_format=True) majority_pred_expression = ( "({0})[{1}.mode(aggregated_prediction + 1)]::TEXT". - format(py_list_to_sql_string(dep_levels, 'TEXT', long_format=True), - schema_madlib)) + format(dep_levels_array_str, schema_madlib)) if is_psql_boolean_type(dep_type): # some platforms don't have text to boolean cast. We manually check the string. @@ -1216,19 +1218,22 @@ def _calculate_oob_error(schema_madlib, oob_prediction_table, oob_error_table, def _create_summary_table(**kwargs): kwargs['features'] = ','.join(kwargs['cat_features'] + kwargs['con_features']) + kwargs['dep_type'] = get_expr_type(kwargs['dependent_variable'], + kwargs['training_table_name']) if kwargs['dep_list']: - kwargs['dep_list_str'] = ( - "$dep_list$" + - ','.join('"{0}"'.format(str(dep)) for dep in kwargs['dep_list']) + - "$dep_list$") + if is_psql_boolean_type(kwargs['dep_type']): + # Special handling for boolean since Python booleans start with + # capitals (i.e False instead of false) + # Note: dep_list is sorted, hence 'false' will be first + kwargs['dep_list_str'] = "'false, true'" + else: + kwargs['dep_list_str'] = '$__dep_list__${0}$__dep_list__$'.format( + ','.join(map(str, kwargs['dep_list']))) else: kwargs['dep_list_str'] = "NULL" - kwargs['indep_type'] = ', '.join(kwargs['all_cols_types'][col] for col in (kwargs['cat_features'] + kwargs['con_features'])) - kwargs['dep_type'] = get_expr_type(kwargs['dependent_variable'], - kwargs['training_table_name']) kwargs['cat_features_str'] = ','.join(kwargs['cat_features']) kwargs['con_features_str'] = ','.join(kwargs['con_features']) if kwargs['grouping_cols']: From afed61f977931b4106ef55c8d6728c143bc9cc02 Mon Sep 17 00:00:00 2001 From: Rahul Iyer Date: Tue, 31 Jul 2018 21:29:42 -0700 Subject: [PATCH 13/13] Deleted wrong line --- .../postgres/modules/recursive_partitioning/decision_tree.py_in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in index 1b0e03b74..cedfa4800 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in @@ -1764,7 +1764,7 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response', dep_cast_str = "{pred_name}::{dep_type}" dep_levels_array_str = py_list_to_sql_string(map(quote_literal, dep_levels), 'TEXT', - plpy.info(dep_levels_array_str) + long_format=True) if pred_type == "response": sql = header + """ SELECT