From 7f7951af44297ea9a81e0e36bd62127bd7fed79c Mon Sep 17 00:00:00 2001
From: Orhan Kislal
Date: Wed, 6 Apr 2016 14:07:26 -0700
Subject: [PATCH] Random Forest: Fix filtered feature related bug

JIRA: MADLIB-928

Additional Author: Nandish Jayaram

Random forest filters out a feature if it has the same value for every row.
If grouping is enabled, this filter is applied per group. However, the
_get_bins_grps function did not consider the case where different groups have
different feature sets. This commit fixes the issue, adds an install-check
test that replicates the use case, and fixes a typo in one of the related
functions.
---
 methods/array_ops/src/pg_gp/array_ops.c |  2 +-
 .../decision_tree.py_in                 |  2 +-
 .../test/random_forest.sql_in           | 41 +++++++++++++++++++
 3 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/methods/array_ops/src/pg_gp/array_ops.c b/methods/array_ops/src/pg_gp/array_ops.c
index 598df8500..93609839f 100644
--- a/methods/array_ops/src/pg_gp/array_ops.c
+++ b/methods/array_ops/src/pg_gp/array_ops.c
@@ -375,7 +375,7 @@ array_of_float(PG_FUNCTION_ARGS){
         ereport(ERROR,
                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                  errmsg("invalid array length"),
-                 errdetail("array_of_bigint: Size should be in [1, 1e7], %d given", size)));
+                 errdetail("array_of_float: Size should be in [1, 1e7], %d given", size)));
     }
     Datum* array = palloc (sizeof(Datum)*size);
     for(int i = 0; i < size; ++i) {
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 763670c02..1183a6d83 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -1043,7 +1043,7 @@ def _get_bins_grps(
     if cat_features:
         cat_items_list = [rows[col]
                           for col in cat_features
-                          for grp_key, rows in grp_to_col_to_row.items()]
+                          for grp_key, rows in grp_to_col_to_row.items() if col in rows]
         cat_n = [len(i) for i in cat_items_list]
         cat_origin = [item for subl in cat_items_list for item in subl]
         grp_key_cat = [grp_key for grp_key in grp_to_col_to_row]
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
index cc15dd40d..22b73fda3 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
@@ -255,3 +255,44 @@ SELECT * from predict_output;
 
 SELECT get_tree('train_output', 1, sid)
 from generate_series(1, 5) sid;
+
+-------------------------------------------------------------------------
+-- Test case where a group has a filtered feature
+
+DROP TABLE IF EXISTS rf_gr_test CASCADE;
+CREATE TABLE rf_gr_test (
+    id integer NOT NULL,
+    gr integer,
+    f1 integer,
+    f2 integer,          -- filtered for gr = 2
+    f3 double precision,
+    cl integer
+);
+
+INSERT INTO rf_gr_test (id, gr, f1, f2, f3, cl) VALUES
+(1, 1, 1, 1, 5, 1),
+(2, 1, 2, 2, 4, 2),
+(3, 1, 3, 3, 3, 1),
+(4, 2, 4, 4, 2, 2),
+(5, 2, 5, 4, 1, 1);
+
+DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
+SELECT madlib.forest_train(
+    'rf_gr_test'::TEXT,      -- source table
+    'train_output'::TEXT,    -- output model table
+    'id'::TEXT,              -- id column
+    'cl'::TEXT,              -- response
+    'f1, f2, f3'::TEXT,      -- features
+    NULL::TEXT,              -- exclude columns
+    'gr'::TEXT,              -- grouping column
+    2,                       -- number of trees
+    1,                       -- number of random features
+    TRUE::BOOLEAN,           -- importance
+    1::INTEGER,              -- num_permutations
+    10::INTEGER,             -- max depth
+    1::INTEGER,              -- min split
+    1::INTEGER,              -- min bucket
+    2::INTEGER,              -- number of bins per continuous variable
+    'max_surrogates=0',
+    FALSE
+    );
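
Illustration (not part of the patch): a minimal, standalone Python sketch of
why the "if col in rows" guard added in _get_bins_grps matters. The toy
grp_to_col_to_row mapping and the values in it are invented for demonstration
only; they loosely mirror the rf_gr_test data above, where f2 is dropped for
group 2 because it is constant within that group.

    # Hypothetical toy data: per-group mapping of surviving categorical
    # features to their values; group '2' has lost 'f2' to the constant filter.
    grp_to_col_to_row = {
        '1': {'f1': [1, 2, 3], 'f2': [1, 2, 3]},
        '2': {'f1': [4, 5]},
    }
    cat_features = ['f1', 'f2']

    # Pre-fix comprehension: raises KeyError('f2') when it reaches group '2',
    # because it assumes every group kept every categorical feature.
    # cat_items_list = [rows[col]
    #                   for col in cat_features
    #                   for grp_key, rows in grp_to_col_to_row.items()]

    # Post-fix comprehension: skips (feature, group) pairs where the feature
    # was filtered out for that group.
    cat_items_list = [rows[col]
                      for col in cat_features
                      for grp_key, rows in grp_to_col_to_row.items()
                      if col in rows]

    print(cat_items_list)  # [[1, 2, 3], [4, 5], [1, 2, 3]]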