From 5de8a1c9c877401552cde5b46e666af724867484 Mon Sep 17 00:00:00 2001
From: Swatisoni
Date: Thu, 30 Nov 2017 14:52:09 -0800
Subject: [PATCH 1/2] Feature: Allow NULL in rows for computing correlations and covariance

JIRA: MADLIB-1166

Additional Authors: Orhan Kislal

- Replace NULL values with the mean of the corresponding column.
---
 .../postgres/modules/stats/correlation.py_in  | 31 ++++-----
 .../postgres/modules/stats/correlation.sql_in |  9 +--
 .../modules/stats/test/correlation.sql_in     | 65 +++++++++++++++++++
 3 files changed, 82 insertions(+), 23 deletions(-)

diff --git a/src/ports/postgres/modules/stats/correlation.py_in b/src/ports/postgres/modules/stats/correlation.py_in
index 0b0820810..4dc5c29f6 100644
--- a/src/ports/postgres/modules/stats/correlation.py_in
+++ b/src/ports/postgres/modules/stats/correlation.py_in
@@ -180,31 +180,29 @@ def _populate_output_table(schema_madlib, source_table, output_table,
             function_name = "Correlation"
             agg_str = "{0}.correlation_agg(x, mean)".format(schema_madlib)
 
+        cols = ','.join(["coalesce(" + col + ", avg_"+col+") "
+                         for col in col_names])
+        avgs = ','.join(["avg(" + col + ") AS avg_" + col + ""
+                         for col in col_names])
+        avg_array = ','.join(["avg_" + col + " " for col in col_names])
         # actual computation
         plpy.execute("""
             CREATE TEMP TABLE {temp_table} AS
             SELECT
-                tot_cnt,
-                count(*) AS non_null_cnt,
+                count(*) AS tot_cnt,
                 mean,
                 {agg_str} as cor_mat
             FROM
             (
-                SELECT {col_names_as_float_array} AS x
-                FROM {source_table}
-            ) src1,
-            (
-                SELECT
-                    count(*) AS tot_cnt,
-                    {schema_madlib}.avg(x) AS mean
-                FROM
+                SELECT ARRAY[ {cols} ] AS x,
+                       ARRAY [ {avg_array} ] AS mean
+                FROM {source_table},
                 (
-                    SELECT {col_names_as_float_array} AS x
+                    SELECT {avgs}
                     FROM {source_table}
-                ) src2
-            ) subq
-            WHERE NOT {schema_madlib}.array_contains_null(x)
-            GROUP BY tot_cnt, mean
+                )sub1
+            ) sub2
+            GROUP BY mean
         """.format(**locals()))
 
         # create summary table
@@ -217,8 +215,7 @@ def _populate_output_table(schema_madlib, source_table, output_table,
                 '{output_table}'::varchar AS output_table,
                 {col_names_as_text_array} AS column_names,
                 mean AS mean_vector,
-                non_null_cnt AS total_rows_processed,
-                tot_cnt - non_null_cnt AS total_rows_skipped
+                tot_cnt AS total_rows_processed
             FROM {temp_table}
             """.format(**locals())
diff --git a/src/ports/postgres/modules/stats/correlation.sql_in b/src/ports/postgres/modules/stats/correlation.sql_in
index 25c285951..92411c4e4 100644
--- a/src/ports/postgres/modules/stats/correlation.sql_in
+++ b/src/ports/postgres/modules/stats/correlation.sql_in
@@ -104,10 +104,6 @@ is also created at the same time, which has the following columns:
 
     total_rows_processed
     BIGINT. Total numbers of rows processed.
-
-    total_rows_skipped
-    BIGINT. Total numbers of rows skipped due to missing values.
-
@@ -207,8 +203,9 @@ Result:
 
 @par Notes
-Current implementation ignores a row that contains NULL entirely. This means
-any correlation in such a row (with NULLs) does not contribute to the final answer.
+
+WARNING: Rows with NULL values will not be ignored. Null values will be
+replaced by the mean of their respective columns.
 
 @anchor related
 @par Related Topics

diff --git a/src/ports/postgres/modules/stats/test/correlation.sql_in b/src/ports/postgres/modules/stats/test/correlation.sql_in
index 163f641c0..d6cd0147c 100644
--- a/src/ports/postgres/modules/stats/test/correlation.sql_in
+++ b/src/ports/postgres/modules/stats/test/correlation.sql_in
@@ -22,3 +22,68 @@ DROP TABLE IF EXISTS corr_output, corr_output_summary;
 SELECT * FROM correlation('rand_numeric', 'corr_output', Null);
 DROP TABLE IF EXISTS corr_output, corr_output_summary;
 SELECT * FROM correlation('rand_numeric', 'corr_output', 'a, c, e');
+
+DROP TABLE IF EXISTS example_data;
+CREATE TABLE example_data(
+    id SERIAL, outlook TEXT,
+    temperature FLOAT8, humidity FLOAT8,
+    windy TEXT, class TEXT, new_col FLOAT8);
+INSERT INTO example_data VALUES
+(1, 'sunny', 85, 85, 'false', 'Dont Play', 1),
+(2, 'sunny', 80, 90, 'true', 'Dont Play', 12),
+(3, 'overcast', 83, 78, 'false', 'Play', 13),
+(4, 'rain', 70, 96, 'false', 'Play', 16),
+(5, 'rain', 68, 80, 'false', 'Play', 17),
+(6, 'rain', 65, 70, 'true', 'Dont Play', 12),
+(7, 'overcast', 64, 65, 'true', 'Play', 15),
+(8, 'sunny', 72, 95, 'false', 'Dont Play', 19),
+(9, 'sunny', 69, 70, 'false', 'Play', 20),
+(10, 'rain', 75, 80, 'false', 'Play', 32),
+(11, 'sunny', 75, 70, 'true', 'Play', 31),
+(12, 'overcast', 72, 90, 'true', 'Play', 11),
+(13, 'overcast', 81, 75, 'false', 'Play', 31),
+(14, 'rain', 71, 80, 'true', 'Dont Play', 21),
+(15, NULL, 100, 100, 'true', NULL, 11),
+(16, NULL, 110, 100, 'true', NULL, 13);
+
+--- The selected columns (temperature, humidity, new_col) contain no NULL values
+--- yet, so example_data_output is computed without any imputation.
+DROP TABLE IF EXISTS example_data_output, example_data_output_summary;
+SELECT correlation( 'example_data',
+                    'example_data_output',
+                    'temperature, humidity, new_col',
+                    True);
+SELECT * FROM example_data_output;
+
+INSERT INTO example_data VALUES (17, NULL, 110, 100, 'true', NULL, NULL);
+
+--- Row 17 has a NULL new_col. With mean imputation the row is not skipped, so the
+--- temperature/humidity correlation must match with and without new_col selected.
+DROP TABLE IF EXISTS example_data_output_with_null_2column, example_data_output_with_null_2column_summary;
+SELECT correlation( 'example_data',
+                    'example_data_output_with_null_2column',
+                    'temperature, humidity',
+                    True);
+
+SELECT * FROM example_data_output_with_null_2column;
+
+DROP TABLE IF EXISTS example_data_output_with_null_3column, example_data_output_with_null_3column_summary;
+SELECT correlation( 'example_data',
+                    'example_data_output_with_null_3column',
+                    'temperature, humidity, new_col',
+                    True);
+SELECT * FROM example_data_output_with_null_3column;
+
+SELECT assert(ABS(a.temperature - b.temperature) < 0.000001,
+              'Rows with NULL values are ignored.')
+FROM
+    example_data_output_with_null_2column a,
+    example_data_output_with_null_3column b
+WHERE a.column_position = 2 and b.column_position = 2;
+
+SELECT assert(ABS(a.temperature - b.temperature) > 0.000001,
+              'New row with NULL values is ignored.')
+FROM
+    example_data_output a,
+    example_data_output_with_null_2column b
+WHERE a.column_position = 2 and b.column_position = 2;

From c5459931659bf66a18f7b813d77c39d81362209d Mon Sep 17 00:00:00 2001
From: Orhan Kislal
Date: Fri, 1 Dec 2017 10:35:13 -0800
Subject: [PATCH 2/2] Address review comments and add details on mean imputation.
---
 .../postgres/modules/stats/correlation.py_in  | 17 +++++++++--------
 .../postgres/modules/stats/correlation.sql_in | 13 +++++++++++--
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/src/ports/postgres/modules/stats/correlation.py_in b/src/ports/postgres/modules/stats/correlation.py_in
index 4dc5c29f6..c03e025b1 100644
--- a/src/ports/postgres/modules/stats/correlation.py_in
+++ b/src/ports/postgres/modules/stats/correlation.py_in
@@ -165,8 +165,8 @@ def _populate_output_table(schema_madlib, source_table, output_table,
     with MinWarning("info" if verbose else "error"):
         start = time()
         col_len = len(col_names)
-        col_names_as_float_array = py_list_to_sql_string(col_names, "float8")
         col_names_as_text_array = py_list_to_sql_string(col_names, "varchar")
+        col_names_as_float_array = py_list_to_sql_string(col_names, "float8")
         temp_table = unique_string()
         if get_cov:
             function_name = "Covariance"
@@ -180,13 +180,12 @@ def _populate_output_table(schema_madlib, source_table, output_table,
             function_name = "Correlation"
             agg_str = "{0}.correlation_agg(x, mean)".format(schema_madlib)
 
-        cols = ','.join(["coalesce(" + col + ", avg_"+col+") "
-                         for col in col_names])
-        avgs = ','.join(["avg(" + col + ") AS avg_" + col + ""
-                         for col in col_names])
-        avg_array = ','.join(["avg_" + col + " " for col in col_names])
+        cols = ','.join(["coalesce({0}, avg_{0})".format(col) for col in col_names])
+        avgs = ','.join(["avg({0}) AS avg_{0}".format(col) for col in col_names])
+        avg_array = ','.join(["avg_{0}".format(col) for col in col_names])
         # actual computation
-        plpy.execute("""
+        sql1 = """
+
             CREATE TEMP TABLE {temp_table} AS
             SELECT
                 count(*) AS tot_cnt,
@@ -203,7 +202,9 @@ def _populate_output_table(schema_madlib, source_table, output_table,
                )sub1
            ) sub2
            GROUP BY mean
-        """.format(**locals()))
+        """.format(**locals())
+
+        plpy.execute(sql1)
 
         # create summary table
         summary_table = add_postfix(output_table, "_summary")
diff --git a/src/ports/postgres/modules/stats/correlation.sql_in b/src/ports/postgres/modules/stats/correlation.sql_in
index 92411c4e4..c3b999302 100644
--- a/src/ports/postgres/modules/stats/correlation.sql_in
+++ b/src/ports/postgres/modules/stats/correlation.sql_in
@@ -20,6 +20,7 @@ m4_include(`SQLCommon.m4')
@@ -204,8 +205,16 @@ Result:
 
 @par Notes
-WARNING: Rows with NULL values will not be ignored. Null values will be
-replaced by the mean of their respective columns.
+NULL values are replaced by the mean of their respective columns (mean imputation, also known as mean substitution). Mean imputation replaces a missing value in a variable with the mean of the available cases for that variable. It keeps the sample size intact and is easy to use, but it reduces the variability in the data, so standard deviations and variance estimates tend to be underestimated. Refer to [1] and [2] for details.
+
+If mean imputation is not suitable for the target use case, create a view that handles NULL values as needed and call the correlation/covariance function on that view instead.
+
+@anchor literature
+@literature
+
+[1] https://en.wikipedia.org/wiki/Imputation_(statistics)
+
+[2] https://www.iriseekhout.com/missing-data/missing-data-methods/imputation-methods/
 
 @anchor related
 @par Related Topics
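
For illustration only (not part of either patch): a minimal SQL sketch of the two NULL-handling options discussed in the notes above. The table my_table, its float8 columns a and b, and the output table name are hypothetical; MADlib is assumed to be installed in the madlib schema.

-- (a) Mean imputation, the pattern the patched _populate_output_table generates:
--     each NULL is paired with its column mean via coalesce() before the
--     correlation/covariance aggregate sees the row.
SELECT ARRAY[coalesce(a, avg_a), coalesce(b, avg_b)] AS x
FROM my_table,
     (SELECT avg(a) AS avg_a, avg(b) AS avg_b FROM my_table) sub1;

-- (b) If mean imputation is not suitable, wrap the source table in a view that
--     handles NULLs (here by dropping incomplete rows) and run the function on
--     the view instead.
CREATE VIEW my_table_no_nulls AS
    SELECT a, b
    FROM my_table
    WHERE a IS NOT NULL AND b IS NOT NULL;

SELECT madlib.correlation('my_table_no_nulls', 'my_table_corr_output', 'a, b');

Option (a) mirrors the coalesce(col, avg_col) cross join built by the patched query; option (b) is the workaround the updated notes recommend when the bias introduced by mean imputation is unacceptable.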