From 4d9863d3daa4b7ada39226114d2a4499961839df Mon Sep 17 00:00:00 2001 From: Rahul Iyer Date: Sun, 25 Mar 2018 21:45:47 -0700 Subject: [PATCH] RF: Use NULL::integer[] when no continuous features JIRA: MADLIB-1219 When variable importance is enabled, the distributions of the categorical and continuous features are computed in order to compute the importance score. For continuous features, this function requires initializing a vector of length = number of continuous features. When there are no continuous features, this initialization fails. This commit fixes the issue by passing a NULL::integer[] vector when there are no continuous features. Closes #249 --- .../recursive_partitioning/random_forest.py_in | 15 +++++++++------ .../test/random_forest.sql_in | 3 ++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in index 3961df43d..ce6ef7fbc 100644 --- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in @@ -946,6 +946,11 @@ def _calculate_oob_prediction( oob_var_dist_view = unique_string() if importance: + if con_features: + initialized_float_array = ("{0}.array_of_float({1})::integer[]". 
+ format(schema_madlib, len(con_features))) + else: + initialized_float_array = "NULL::integer[]" sql_create_oob_var_dist_view = """ CREATE VIEW {oob_var_dist_view} AS SELECT @@ -966,12 +971,10 @@ def _calculate_oob_prediction( {con_features_str}::double precision[], con_splits ), -- bin_indices, -1 for NaN - 1 -- -1 shift to 0 for nulls + 1 -- -1 shifted to 0 for null values ), - {schema_madlib}.array_fill( - {schema_madlib}.array_of_float({n_con})::integer[], - ({num_bins}+1)::integer - ) + {schema_madlib}.array_fill({initialized_float_array}, + ({num_bins} + 1)::integer) -- level of any continuous feature == num_bins ) AS con_index_distributions FROM @@ -986,7 +989,7 @@ def _calculate_oob_prediction( {con_splits_table} USING (gid) GROUP BY gid - """.format(n_con=len(con_features), **locals()) + """.format(**locals()) else: sql_create_oob_var_dist_view = """ CREATE VIEW {oob_var_dist_view} AS diff --git a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in index cabca6a0d..20c8bed31 100644 --- a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in @@ -265,6 +265,7 @@ from generate_series(1, 5) sid; ------------------------------------------------------------------------- -- Test case for the case where a group has a filtered feature +-- Also testing var_importance=TRUE with no continuous features (MADLIB-1219) DROP TABLE IF EXISTS rf_gr_test CASCADE; CREATE TABLE rf_gr_test ( @@ -289,7 +290,7 @@ SELECT forest_train( 'train_output', -- output model table 'id', -- id column 'cl', -- response - 'f1, f2, f3', -- features + 'f1, f2', -- features NULL, -- exclude columns 'gr', -- grouping 2, -- num of trees