From 4d9863d3daa4b7ada39226114d2a4499961839df Mon Sep 17 00:00:00 2001
From: Rahul Iyer <riyer@apache.org>
Date: Sun, 25 Mar 2018 21:45:47 -0700
Subject: [PATCH] RF: Use NULL::integer[] when no continuous features

JIRA: MADLIB-1219

When variable importance is enabled, the distributions of the
categorical and continuous features are computed in order to obtain
the importance score. For continuous features, this function requires
initializing a vector of length = number of continuous features. When
there are no continuous features, this initialization fails.

This commit fixes the issue by passing a NULL::integer[] vector when
there are no continuous features.

Closes #249
---
 .../recursive_partitioning/random_forest.py_in    | 15 +++++++++------
 .../test/random_forest.sql_in                     |  3 ++-
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index 3961df43d..ce6ef7fbc 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -946,6 +946,11 @@ def _calculate_oob_prediction(
 
     oob_var_dist_view = unique_string()
     if importance:
+        if con_features:
+            initialized_float_array = ("{0}.array_of_float({1})::integer[]".
+                                       format(schema_madlib, len(con_features)))
+        else:
+            initialized_float_array = "NULL::integer[]"
         sql_create_oob_var_dist_view = """
             CREATE VIEW {oob_var_dist_view} AS
             SELECT
@@ -966,12 +971,10 @@ def _calculate_oob_prediction(
                             {con_features_str}::double precision[],
                             con_splits
                         ), -- bin_indices, -1 for NaN
-                        1 -- -1 shift to 0 for nulls
+                        1 -- -1 shifted to 0 for null values
                     ),
-                    {schema_madlib}.array_fill(
-                        {schema_madlib}.array_of_float({n_con})::integer[],
-                        ({num_bins}+1)::integer
-                    )
+                    {schema_madlib}.array_fill({initialized_float_array},
+                                               ({num_bins} + 1)::integer)
                     -- level of any continuous feature == num_bins
                 ) AS con_index_distributions
             FROM
@@ -986,7 +989,7 @@ def _calculate_oob_prediction(
                 {con_splits_table}
             USING (gid)
             GROUP BY gid
-        """.format(n_con=len(con_features), **locals())
+        """.format(**locals())
     else:
         sql_create_oob_var_dist_view = """
             CREATE VIEW {oob_var_dist_view} AS
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
index cabca6a0d..20c8bed31 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
@@ -265,6 +265,7 @@ from generate_series(1, 5) sid;
 
 -------------------------------------------------------------------------
 -- Test case for the case where a group has a filtered feature
+-- Also testing var_importance=TRUE with no continuous features (MADLIB-1219)
 
 DROP TABLE IF EXISTS rf_gr_test CASCADE;
 CREATE TABLE rf_gr_test (
@@ -289,7 +290,7 @@ SELECT forest_train(
                   'train_output',    -- output model table
                   'id',              -- id column
                   'cl',           -- response
-                  'f1, f2, f3',   -- features
+                  'f1, f2',   -- features
                   NULL,        -- exclude columns
                   'gr',        -- grouping
                   2,                -- num of trees