From 5de8a1c9c877401552cde5b46e666af724867484 Mon Sep 17 00:00:00 2001
From: Swatisoni
Date: Thu, 30 Nov 2017 14:52:09 -0800
Subject: [PATCH 1/2] Feature: Allow NULL in rows for computing correlations and covariance

JIRA: MADLIB-1166

Additional Authors: Orhan Kislal

- Replace NULL values with the mean of the corresponding column.
---
 .../postgres/modules/stats/correlation.py_in  | 31 ++++-----
 .../postgres/modules/stats/correlation.sql_in |  9 +--
 .../modules/stats/test/correlation.sql_in     | 65 +++++++++++++++++++
 3 files changed, 82 insertions(+), 23 deletions(-)

diff --git a/src/ports/postgres/modules/stats/correlation.py_in b/src/ports/postgres/modules/stats/correlation.py_in
index 0b0820810..4dc5c29f6 100644
--- a/src/ports/postgres/modules/stats/correlation.py_in
+++ b/src/ports/postgres/modules/stats/correlation.py_in
@@ -180,31 +180,29 @@ def _populate_output_table(schema_madlib, source_table, output_table,
             function_name = "Correlation"
             agg_str = "{0}.correlation_agg(x, mean)".format(schema_madlib)
 
+        cols = ','.join(["coalesce(" + col + ", avg_"+col+") "
+                         for col in col_names])
+        avgs = ','.join(["avg(" + col + ") AS avg_" + col + ""
+                         for col in col_names])
+        avg_array = ','.join(["avg_" + col + " " for col in col_names])
         # actual computation
         plpy.execute("""
             CREATE TEMP TABLE {temp_table} AS
             SELECT
-                tot_cnt,
-                count(*) AS non_null_cnt,
+                count(*) AS tot_cnt,
                 mean,
                 {agg_str} as cor_mat
             FROM
             (
-                SELECT {col_names_as_float_array} AS x
-                FROM {source_table}
-            ) src1,
-            (
-                SELECT
-                    count(*) AS tot_cnt,
-                    {schema_madlib}.avg(x) AS mean
-                FROM
+                SELECT ARRAY[ {cols} ] AS x,
+                       ARRAY [ {avg_array} ] AS mean
+                FROM {source_table},
                 (
-                    SELECT {col_names_as_float_array} AS x
+                    SELECT {avgs}
                     FROM {source_table}
-                ) src2
-            ) subq
-            WHERE NOT {schema_madlib}.array_contains_null(x)
-            GROUP BY tot_cnt, mean
+                )sub1
+            ) sub2
+            GROUP BY mean
         """.format(**locals()))
 
         # create summary table
@@ -217,8 +215,7 @@ def _populate_output_table(schema_madlib, source_table, output_table,
                 '{output_table}'::varchar AS output_table,
                 {col_names_as_text_array} AS column_names,
                 mean AS mean_vector,
-                non_null_cnt AS total_rows_processed,
-                tot_cnt - non_null_cnt AS total_rows_skipped
+                tot_cnt AS total_rows_processed
             FROM {temp_table}
             """.format(**locals())
diff --git a/src/ports/postgres/modules/stats/correlation.sql_in b/src/ports/postgres/modules/stats/correlation.sql_in
index 25c285951..92411c4e4 100644
--- a/src/ports/postgres/modules/stats/correlation.sql_in
+++ b/src/ports/postgres/modules/stats/correlation.sql_in
@@ -104,10 +104,6 @@ is also created at the same time, which has the following columns:
 
     total_rows_processed
     BIGINT. Total numbers of rows processed.
-
-    total_rows_skipped
-    BIGINT. Total numbers of rows skipped due to missing values.
-
@@ -207,8 +203,9 @@ Result:
 
 @par Notes
-Current implementation ignores a row that contains NULL entirely. This means
-any correlation in such a row (with NULLs) does not contribute to the final answer.
+
+WARNING: Rows with NULL values will not be ignored. Null values will be
+replaced by the mean of their respective columns.
 
 @anchor related
 @par Related Topics

diff --git a/src/ports/postgres/modules/stats/test/correlation.sql_in b/src/ports/postgres/modules/stats/test/correlation.sql_in
index 163f641c0..d6cd0147c 100644
--- a/src/ports/postgres/modules/stats/test/correlation.sql_in
+++ b/src/ports/postgres/modules/stats/test/correlation.sql_in
@@ -22,3 +22,68 @@ DROP TABLE IF EXISTS corr_output, corr_output_summary;
 SELECT * FROM correlation('rand_numeric', 'corr_output', Null);
 DROP TABLE IF EXISTS corr_output, corr_output_summary;
 SELECT * FROM correlation('rand_numeric', 'corr_output', 'a, c, e');
+
+DROP TABLE IF EXISTS example_data;
+CREATE TABLE example_data(
+    id SERIAL, outlook TEXT,
+    temperature FLOAT8, humidity FLOAT8,
+    windy TEXT, class TEXT, new_col FLOAT8);
+INSERT INTO example_data VALUES
+(1, 'sunny', 85, 85, 'false', 'Dont Play', 1),
+(2, 'sunny', 80, 90, 'true', 'Dont Play', 12),
+(3, 'overcast', 83, 78, 'false', 'Play', 13),
+(4, 'rain', 70, 96, 'false', 'Play', 16),
+(5, 'rain', 68, 80, 'false', 'Play', 17),
+(6, 'rain', 65, 70, 'true', 'Dont Play', 12),
+(7, 'overcast', 64, 65, 'true', 'Play', 15),
+(8, 'sunny', 72, 95, 'false', 'Dont Play', 19),
+(9, 'sunny', 69, 70, 'false', 'Play', 20),
+(10, 'rain', 75, 80, 'false', 'Play', 32),
+(11, 'sunny', 75, 70, 'true', 'Play', 31),
+(12, 'overcast', 72, 90, 'true', 'Play', 11),
+(13, 'overcast', 81, 75, 'false', 'Play', 31),
+(14, 'rain', 71, 80, 'true', 'Dont Play', 21),
+(15, NULL, 100, 100, 'true', NULL, 11),
+(16, NULL, 110, 100, 'true', NULL, 13);
+
+--- The selected columns (temperature, humidity, new_col) contain no NULL values
+--- yet, so example_data_output is computed without any imputation.
+DROP TABLE IF EXISTS example_data_output, example_data_output_summary;
+SELECT correlation( 'example_data',
+                    'example_data_output',
+                    'temperature, humidity, new_col',
+                    True);
+SELECT * FROM example_data_output;
+
+INSERT INTO example_data VALUES (17, NULL, 110, 100, 'true', NULL, NULL);
+
+--- Row 17 has a NULL new_col. With mean imputation the row is not skipped, so the
+--- temperature/humidity correlation must match with and without new_col selected.
+DROP TABLE IF EXISTS example_data_output_with_null_2column, example_data_output_with_null_2column_summary;
+SELECT correlation( 'example_data',
+                    'example_data_output_with_null_2column',
+                    'temperature, humidity',
+                    True);
+
+SELECT * FROM example_data_output_with_null_2column;
+
+DROP TABLE IF EXISTS example_data_output_with_null_3column, example_data_output_with_null_3column_summary;
+SELECT correlation( 'example_data',
+                    'example_data_output_with_null_3column',
+                    'temperature, humidity, new_col',
+                    True);
+SELECT * FROM example_data_output_with_null_3column;
+
+SELECT assert(ABS(a.temperature - b.temperature) < 0.000001,
+              'Rows with NULL values are ignored.')
+FROM
+    example_data_output_with_null_2column a,
+    example_data_output_with_null_3column b
+WHERE a.column_position = 2 and b.column_position = 2;
+
+SELECT assert(ABS(a.temperature - b.temperature) > 0.000001,
+              'New row with NULL values is ignored.')
+FROM
+    example_data_output a,
+    example_data_output_with_null_2column b
+WHERE a.column_position = 2 and b.column_position = 2;

From c5459931659bf66a18f7b813d77c39d81362209d Mon Sep 17 00:00:00 2001
From: Orhan Kislal
Date: Fri, 1 Dec 2017 10:35:13 -0800
Subject: [PATCH 2/2] Address review comments and add details on mean imputation.
---
 .../postgres/modules/stats/correlation.py_in  | 17 +++++++++--------
 .../postgres/modules/stats/correlation.sql_in | 13 +++++++++++--
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/src/ports/postgres/modules/stats/correlation.py_in b/src/ports/postgres/modules/stats/correlation.py_in
index 4dc5c29f6..c03e025b1 100644
--- a/src/ports/postgres/modules/stats/correlation.py_in
+++ b/src/ports/postgres/modules/stats/correlation.py_in
@@ -165,8 +165,8 @@ def _populate_output_table(schema_madlib, source_table, output_table,
     with MinWarning("info" if verbose else "error"):
         start = time()
         col_len = len(col_names)
-        col_names_as_float_array = py_list_to_sql_string(col_names, "float8")
         col_names_as_text_array = py_list_to_sql_string(col_names, "varchar")
+        col_names_as_float_array = py_list_to_sql_string(col_names, "float8")
         temp_table = unique_string()
         if get_cov:
             function_name = "Covariance"
@@ -180,13 +180,12 @@ def _populate_output_table(schema_madlib, source_table, output_table,
             function_name = "Correlation"
             agg_str = "{0}.correlation_agg(x, mean)".format(schema_madlib)
 
-        cols = ','.join(["coalesce(" + col + ", avg_"+col+") "
-                         for col in col_names])
-        avgs = ','.join(["avg(" + col + ") AS avg_" + col + ""
-                         for col in col_names])
-        avg_array = ','.join(["avg_" + col + " " for col in col_names])
+        cols = ','.join(["coalesce({0}, avg_{0})".format(col) for col in col_names])
+        avgs = ','.join(["avg({0}) AS avg_{0}".format(col) for col in col_names])
+        avg_array = ','.join(["avg_{0}".format(col) for col in col_names])
         # actual computation
-        plpy.execute("""
+        sql1 = """
+
             CREATE TEMP TABLE {temp_table} AS
             SELECT
                 count(*) AS tot_cnt,
@@ -203,7 +202,9 @@ def _populate_output_table(schema_madlib, source_table, output_table,
                )sub1
            ) sub2
            GROUP BY mean
-        """.format(**locals()))
+        """.format(**locals())
+
+        plpy.execute(sql1)
 
         # create summary table
         summary_table = add_postfix(output_table, "_summary")
diff --git a/src/ports/postgres/modules/stats/correlation.sql_in b/src/ports/postgres/modules/stats/correlation.sql_in
index 92411c4e4..c3b999302 100644
--- a/src/ports/postgres/modules/stats/correlation.sql_in
+++ b/src/ports/postgres/modules/stats/correlation.sql_in
@@ -20,6 +20,7 @@ m4_include(`SQLCommon.m4')
@@ -204,8 +205,16 @@ Result:
 
 @par Notes
-WARNING: Rows with NULL values will not be ignored. Null values will be
-replaced by the mean of their respective columns.
+NULL values are replaced by the mean of their respective columns (mean imputation, also known as mean substitution). Mean imputation replaces a missing value in a variable with the mean of the available cases for that variable. It keeps the sample size intact and is easy to use, but it reduces the variability in the data, so standard deviations and variance estimates tend to be underestimated. Refer to [1] and [2] for details.
+
+If mean imputation is not suitable for the target use case, create a view that handles NULL values as needed and call the correlation/covariance function on that view instead.
+
+@anchor literature
+@literature
+
+[1] https://en.wikipedia.org/wiki/Imputation_(statistics)
+
+[2] https://www.iriseekhout.com/missing-data/missing-data-methods/imputation-methods/
 
 @anchor related
 @par Related Topics
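
For illustration only (not part of either patch): a minimal SQL sketch of the two NULL-handling options discussed in the notes above. The table my_table, its float8 columns a and b, and the output table name are hypothetical; MADlib is assumed to be installed in the madlib schema.

-- (a) Mean imputation, the pattern the patched _populate_output_table generates:
--     each NULL is paired with its column mean via coalesce() before the
--     correlation/covariance aggregate sees the row.
SELECT ARRAY[coalesce(a, avg_a), coalesce(b, avg_b)] AS x
FROM my_table,
     (SELECT avg(a) AS avg_a, avg(b) AS avg_b FROM my_table) sub1;

-- (b) If mean imputation is not suitable, wrap the source table in a view that
--     handles NULLs (here by dropping incomplete rows) and run the function on
--     the view instead.
CREATE VIEW my_table_no_nulls AS
    SELECT a, b
    FROM my_table
    WHERE a IS NOT NULL AND b IS NOT NULL;

SELECT madlib.correlation('my_table_no_nulls', 'my_table_corr_output', 'a, b');

Option (a) mirrors the coalesce(col, avg_col) cross join built by the patched query; option (b) is the workaround the updated notes recommend when the bias introduced by mean imputation is unacceptable.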