From 7f7951af44297ea9a81e0e36bd62127bd7fed79c Mon Sep 17 00:00:00 2001
From: Orhan Kislal
Date: Wed, 6 Apr 2016 14:07:26 -0700
Subject: [PATCH] Random Forest: Fix filtered feature related bug

JIRA: MADLIB-928

Additional Author: Nandish Jayaram

Random forest filters out a feature if it has the same value for every row.
If grouping is enabled, this filter is applied per group. However, the
_get_bins_grps function did not consider the case where different groups have
different feature sets. This commit fixes the issue, adds an install-check
test that replicates the use case, and fixes a typo in one of the related
functions.
---
 methods/array_ops/src/pg_gp/array_ops.c |  2 +-
 .../decision_tree.py_in                 |  2 +-
 .../test/random_forest.sql_in           | 41 +++++++++++++++++++
 3 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/methods/array_ops/src/pg_gp/array_ops.c b/methods/array_ops/src/pg_gp/array_ops.c
index 598df8500..93609839f 100644
--- a/methods/array_ops/src/pg_gp/array_ops.c
+++ b/methods/array_ops/src/pg_gp/array_ops.c
@@ -375,7 +375,7 @@ array_of_float(PG_FUNCTION_ARGS){
         ereport(ERROR,
                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                  errmsg("invalid array length"),
-                 errdetail("array_of_bigint: Size should be in [1, 1e7], %d given", size)));
+                 errdetail("array_of_float: Size should be in [1, 1e7], %d given", size)));
     }
     Datum* array = palloc (sizeof(Datum)*size);
     for(int i = 0; i < size; ++i) {
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 763670c02..1183a6d83 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -1043,7 +1043,7 @@ def _get_bins_grps(
     if cat_features:
         cat_items_list = [rows[col]
                           for col in cat_features
-                          for grp_key, rows in grp_to_col_to_row.items()]
+                          for grp_key, rows in grp_to_col_to_row.items() if col in rows]
         cat_n = [len(i) for i in cat_items_list]
         cat_origin = [item for subl in cat_items_list for item in subl]
         grp_key_cat = [grp_key for grp_key in grp_to_col_to_row]
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
index cc15dd40d..22b73fda3 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
@@ -255,3 +255,44 @@ SELECT * from predict_output;
 
 SELECT get_tree('train_output', 1, sid)
 from generate_series(1, 5) sid;
+
+-------------------------------------------------------------------------
+-- Test case where a group has a filtered feature
+
+DROP TABLE IF EXISTS rf_gr_test CASCADE;
+CREATE TABLE rf_gr_test (
+    id integer NOT NULL,
+    gr integer,
+    f1 integer,
+    f2 integer,          -- filtered for gr = 2
+    f3 double precision,
+    cl integer
+);
+
+INSERT INTO rf_gr_test (id, gr, f1, f2, f3, cl) VALUES
+(1, 1, 1, 1, 5, 1),
+(2, 1, 2, 2, 4, 2),
+(3, 1, 3, 3, 3, 1),
+(4, 2, 4, 4, 2, 2),
+(5, 2, 5, 4, 1, 1);
+
+DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
+SELECT madlib.forest_train(
+    'rf_gr_test'::TEXT,      -- source table
+    'train_output'::TEXT,    -- output model table
+    'id'::TEXT,              -- id column
+    'cl'::TEXT,              -- response
+    'f1, f2, f3'::TEXT,      -- features
+    NULL::TEXT,              -- exclude columns
+    'gr'::TEXT,              -- grouping column
+    2,                       -- number of trees
+    1,                       -- number of random features
+    TRUE::BOOLEAN,           -- importance
+    1::INTEGER,              -- num_permutations
+    10::INTEGER,             -- max depth
+    1::INTEGER,              -- min split
+    1::INTEGER,              -- min bucket
+    2::INTEGER,              -- number of bins per continuous variable
+    'max_surrogates=0',
+    FALSE
+    );
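
Illustration (not part of the patch): a minimal, standalone Python sketch of
why the "if col in rows" guard added in _get_bins_grps matters. The toy
grp_to_col_to_row mapping and the values in it are invented for demonstration
only; they loosely mirror the rf_gr_test data above, where f2 is dropped for
group 2 because it is constant within that group.

    # Hypothetical toy data: per-group mapping of surviving categorical
    # features to their values; group '2' has lost 'f2' to the constant filter.
    grp_to_col_to_row = {
        '1': {'f1': [1, 2, 3], 'f2': [1, 2, 3]},
        '2': {'f1': [4, 5]},
    }
    cat_features = ['f1', 'f2']

    # Pre-fix comprehension: raises KeyError('f2') when it reaches group '2',
    # because it assumes every group kept every categorical feature.
    # cat_items_list = [rows[col]
    #                   for col in cat_features
    #                   for grp_key, rows in grp_to_col_to_row.items()]

    # Post-fix comprehension: skips (feature, group) pairs where the feature
    # was filtered out for that group.
    cat_items_list = [rows[col]
                      for col in cat_features
                      for grp_key, rows in grp_to_col_to_row.items()
                      if col in rows]

    print(cat_items_list)  # [[1, 2, 3], [4, 5], [1, 2, 3]]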