From 6632f72840211f87c99b32c916dc610977329d9f Mon Sep 17 00:00:00 2001 From: hpandeycodeit Date: Tue, 15 Aug 2017 15:30:27 -0700 Subject: [PATCH 1/7] knn code refactoring --- src/ports/postgres/modules/knn/knn.py_in | 82 +++++++++++++++- src/ports/postgres/modules/knn/knn.sql_in | 109 +++------------------- 2 files changed, 95 insertions(+), 96 deletions(-) diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in index c0d9cd7be..7fe7007c8 100644 --- a/src/ports/postgres/modules/knn/knn.py_in +++ b/src/ports/postgres/modules/knn/knn.py_in @@ -127,5 +127,83 @@ def knn_validate_src(schema_madlib, point_source, point_column_name, label_colum "Data type '{0}' is not a valid type for column '{1}' in table '{2}'.".format(colType, id_column_name, test_source)) return k -# ---------------------------------------------------------------------- -m4_changequote(, ) + + + + +def knn(schema_madlib, point_source, point_column_name, label_column_name, + test_source, test_column_name, id_column_name, output_table, operation, k): + + + + oldClientMinMessages = plpy.execute("SELECT setting FROM pg_settings WHERE name = 'client_min_messages'")[0]['setting']; + + plpy.execute("SET client_min_messages TO warning"); + + + k_val = knn_validate_src(schema_madlib, point_source, point_column_name, + label_column_name, test_source, + test_column_name, id_column_name, + output_table, operation, k) + + + plpy.execute("SELECT {schema_madlib}.create_schema_pg_temp()".format(schema_madlib = schema_madlib)); + + x_temp_table = 'knn' + plpy.execute( " select md5('knn_'||now()::text||random()::text)||'_temp' AS MD5")[0]['md5']; + y_temp_table = 'knn' + plpy.execute( " select md5('knn_'||now()::text||random()::text)||'_temp' AS MD5")[0]['md5']; + label_column_name_unique = 'label' + plpy.execute( " select md5('knn_'||now()::text||random()::text)||'_name' AS MD5")[0]['md5']; + test_id = 'id' + plpy.execute( " select 
md5('knn_'||now()::text||random()::text)||'_name' AS MD5")[0]['md5']; + + convert_boolean_to_int = ''; + if operation == 'c': + convert_boolean_to_int = '::INTEGER'; + + + plpy.execute("DROP TABLE IF EXISTS pg_temp.madlib_knn_interm"); + plpy.execute( + """ + CREATE TEMP TABLE pg_temp.madlib_knn_interm AS + SELECT * + FROM + ( + SELECT row_number() over (partition by {test_id} order by dist) AS r , {x_temp_table}.* + FROM + ( + SELECT test.{id_column_name} AS {test_id} , {schema_madlib}.squared_dist_norm2(train.{point_column_name} ,test.{test_column_name}) AS dist, train.{label_column_name} {convert_boolean_to_int} AS {label_column_name_unique} + FROM {point_source} AS train, {test_source} AS test + ) {x_temp_table} + ){y_temp_table} + WHERE {y_temp_table}.r <= {k_val}""".format(**locals())); + + if operation == 'c': + plpy.execute( + """ + CREATE TABLE {output_table} AS + SELECT {test_id} AS id, {test_column_name} , {schema_madlib}.mode({label_column_name_unique}) AS prediction + FROM pg_temp.madlib_knn_interm join {test_source} ON {test_id} = {id_column_name} + GROUP BY {test_id} , {test_column_name}""".format(**locals())) + + + else: + plpy.execute( + """ + CREATE TABLE {output_table} AS + SELECT {test_id} AS id, {test_column_name} , avg( {label_column_name_unique} ) AS prediction + FROM + pg_temp.madlib_knn_interm join {test_source} on {test_id} ={id_column_name} + GROUP BY {test_id} , {test_column_name} + ORDER BY {test_id}""".format(**locals())) + + + plpy.execute("SET client_min_messages TO "+ oldClientMinMessages) + + if operation == 'c': + returnstring = 'The classification results have been written to output table '+ output_table; + else: + returnstring = 'The regression results have been written to output table '+ output_table; + + plpy.execute("DROP TABLE pg_temp.madlib_knn_interm"); + return returnstring; + + + diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in index d3c19292c..373e92418 100644 --- 
a/src/ports/postgres/modules/knn/knn.sql_in +++ b/src/ports/postgres/modules/knn/knn.sql_in @@ -358,6 +358,7 @@ $$ LANGUAGE plpgsql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); + CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( point_source VARCHAR, point_column_name VARCHAR, @@ -369,98 +370,18 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( operation VARCHAR, k INTEGER ) RETURNS VARCHAR AS $$ -DECLARE - l FLOAT; - id INTEGER; - vector DOUBLE PRECISION[]; - cur_pid integer; - oldClientMinMessages VARCHAR; - returnstring VARCHAR; - x_temp_table VARCHAR; - y_temp_table VARCHAR; - k_val INTEGER; - label_column_name_unique VARCHAR; - test_id VARCHAR; - convert_boolean_to_int VARCHAR; -BEGIN - oldClientMinMessages := - (SELECT setting FROM pg_settings WHERE name = 'client_min_messages'); - EXECUTE 'SET client_min_messages TO warning'; - SELECT * FROM MADLIB_SCHEMA.__knn_validate_src(point_source, point_column_name, label_column_name, test_source, test_column_name, id_column_name, output_table, operation, k) INTO k_val; - PERFORM MADLIB_SCHEMA.create_schema_pg_temp(); - x_temp_table := 'knn_'||md5('knn_'||now()::text||random()::text)||'_temp'; - y_temp_table := 'knn_'||md5('knn_'||now()::text||random()::text)||'_temp'; - label_column_name_unique := 'label'||md5('knn_'||now()::text||random()::text)||'_name'; - test_id := 'id'||md5('knn_'||now()::text||random()::text)||'_name'; - - convert_boolean_to_int := ''; - IF (operation = 'c') THEN - convert_boolean_to_int := '::INTEGER'; - END IF; - - EXECUTE - $sql$ - DROP TABLE IF EXISTS pg_temp.madlib_knn_interm; - CREATE TABLE pg_temp.madlib_knn_interm AS - SELECT * - FROM - ( - SELECT row_number() over (partition by $sql$ || test_id || $sql$ order by dist) AS r, $sql$ || x_temp_table || $sql$.* - FROM - ( - SELECT test.$sql$ || id_column_name || $sql$ AS $sql$ || test_id || $sql$, MADLIB_SCHEMA.squared_dist_norm2(train.$sql$ || point_column_name || $sql$,test.$sql$ || test_column_name || $sql$) AS 
dist, train.$sql$ || label_column_name || $sql$ $sql$ || convert_boolean_to_int || $sql$ AS $sql$ || label_column_name_unique || $sql$ - FROM $sql$ || textin(regclassout(point_source)) || $sql$ AS train, $sql$ || textin(regclassout(test_source)) || $sql$ AS test - )$sql$ || x_temp_table || $sql$ - )$sql$ || y_temp_table || $sql$ - WHERE $sql$ || y_temp_table || $sql$.r <= $sql$ || k_val; - - IF (operation = 'c') THEN - EXECUTE - $sql$ - CREATE TABLE $sql$ || output_table || $sql$ AS - SELECT $sql$ || test_id || $sql$ AS id, $sql$ || test_column_name || $sql$, MADLIB_SCHEMA.mode($sql$ || label_column_name_unique || $sql$) AS prediction - FROM pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$ ON $sql$ || test_id || $sql$=$sql$ || id_column_name || $sql$ - GROUP BY $sql$ || test_id || $sql$, $sql$ || test_column_name; - ELSE - EXECUTE - $sql$ - CREATE TABLE $sql$ || output_table || $sql$ AS - SELECT $sql$ || test_id || $sql$ AS id, $sql$ || test_column_name || $sql$, avg($sql$ || label_column_name_unique || $sql$) AS prediction - FROM - pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$ on $sql$ || test_id || $sql$=$sql$ || id_column_name || $sql$ - GROUP BY $sql$ || test_id || $sql$, $sql$ || test_column_name || $sql$ - ORDER BY $sql$ || test_id || $sql$ $sql$; - END IF; - - EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages; - IF (operation = 'c') THEN - returnstring := 'The classification results have been written to output table '||output_table; - ELSE - returnstring := 'The regression results have been written to output table '||output_table; - END IF; - DROP TABLE pg_temp.madlib_knn_interm; - RETURN returnstring; -END; -$$ LANGUAGE plpgsql VOLATILE -m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); - - -CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( - point_source VARCHAR, - point_column_name VARCHAR, - label_column_name VARCHAR, - test_source VARCHAR, - test_column_name 
VARCHAR, - id_column_name VARCHAR, - output_table VARCHAR, - operation VARCHAR -) RETURNS VARCHAR AS $$ -DECLARE - returnstring VARCHAR; -BEGIN - returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1); - RETURN returnstring; -END; -$$ LANGUAGE plpgsql VOLATILE + PythonFunctionBodyOnly(`knn', `knn') + return knn.knn( + schema_madlib, + point_source, + point_column_name, + label_column_name, + test_source, + test_column_name, + id_column_name, + output_table, + operation, + k + ) +$$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); - From 881a098a5d256730dc2a55bdbb2f4dba9e2a3cbc Mon Sep 17 00:00:00 2001 From: hpandeycodeit Date: Wed, 16 Aug 2017 11:34:41 -0700 Subject: [PATCH 2/7] knn code refactor: replaced query with unique_string --- src/ports/postgres/modules/knn/knn.py_in | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in index 7fe7007c8..7f18f6b49 100644 --- a/src/ports/postgres/modules/knn/knn.py_in +++ b/src/ports/postgres/modules/knn/knn.py_in @@ -37,6 +37,7 @@ from utilities.validate_args import columns_exist_in_table from utilities.validate_args import is_col_array from utilities.validate_args import array_col_has_no_null from utilities.validate_args import get_cols_and_types +from utilities.utilities import unique_string STATE_IN_MEM = m4_ifdef(, , ) HAS_FUNCTION_PROPERTIES = m4_ifdef(, , ) @@ -149,20 +150,21 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name, plpy.execute("SELECT {schema_madlib}.create_schema_pg_temp()".format(schema_madlib = schema_madlib)); - x_temp_table = 'knn' + plpy.execute( " select md5('knn_'||now()::text||random()::text)||'_temp' AS MD5")[0]['md5']; - y_temp_table = 'knn' + plpy.execute( " select md5('knn_'||now()::text||random()::text)||'_temp' AS MD5")[0]['md5']; - label_column_name_unique = 'label' + plpy.execute( " select 
md5('knn_'||now()::text||random()::text)||'_name' AS MD5")[0]['md5']; - test_id = 'id' + plpy.execute( " select md5('knn_'||now()::text||random()::text)||'_name' AS MD5")[0]['md5']; + x_temp_table = unique_string(desp='x_temp_table') + y_temp_table = unique_string(desp='y_temp_table') + label_column_name_unique = unique_string(desp='label_column_name_unique') + test_id = unique_string(desp='test_id') convert_boolean_to_int = ''; if operation == 'c': convert_boolean_to_int = '::INTEGER'; + madlib_knn_interm = unique_string(desp='madlib_knn_interm') - plpy.execute("DROP TABLE IF EXISTS pg_temp.madlib_knn_interm"); + plpy.execute("""DROP TABLE IF EXISTS pg_temp.{madlib_knn_interm}""".format(**locals())); plpy.execute( """ - CREATE TEMP TABLE pg_temp.madlib_knn_interm AS + CREATE TEMP TABLE pg_temp.{madlib_knn_interm} AS SELECT * FROM ( @@ -180,7 +182,7 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name, """ CREATE TABLE {output_table} AS SELECT {test_id} AS id, {test_column_name} , {schema_madlib}.mode({label_column_name_unique}) AS prediction - FROM pg_temp.madlib_knn_interm join {test_source} ON {test_id} = {id_column_name} + FROM pg_temp.{madlib_knn_interm} join {test_source} ON {test_id} = {id_column_name} GROUP BY {test_id} , {test_column_name}""".format(**locals())) @@ -190,7 +192,7 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name, CREATE TABLE {output_table} AS SELECT {test_id} AS id, {test_column_name} , avg( {label_column_name_unique} ) AS prediction FROM - pg_temp.madlib_knn_interm join {test_source} on {test_id} ={id_column_name} + pg_temp.{madlib_knn_interm} join {test_source} on {test_id} ={id_column_name} GROUP BY {test_id} , {test_column_name} ORDER BY {test_id}""".format(**locals())) @@ -202,7 +204,8 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name, else: returnstring = 'The regression results have been written to output table '+ output_table; - plpy.execute("DROP TABLE 
pg_temp.madlib_knn_interm"); + plpy.execute("""DROP TABLE pg_temp.{madlib_knn_interm}""".format(**locals())); + return returnstring; From 933cd3f83bd48e88c0f84df856e00df60c0f2c07 Mon Sep 17 00:00:00 2001 From: hpandeycodeit Date: Wed, 16 Aug 2017 14:42:35 -0700 Subject: [PATCH 3/7] formatting --- src/ports/postgres/modules/knn/knn.py_in | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in index 7f18f6b49..7b474bff9 100644 --- a/src/ports/postgres/modules/knn/knn.py_in +++ b/src/ports/postgres/modules/knn/knn.py_in @@ -40,9 +40,10 @@ from utilities.validate_args import get_cols_and_types from utilities.utilities import unique_string STATE_IN_MEM = m4_ifdef(, , ) -HAS_FUNCTION_PROPERTIES = m4_ifdef(, , ) -UDF_ON_SEGMENT_NOT_ALLOWED = m4_ifdef(, , ) -# ---------------------------------------------------------------------- +HAS_FUNCTION_PROPERTIES = m4_ifdef(, , +) UDF_ON_SEGMENT_NOT_ALLOWED = +m4_ifdef(, , ) # +---------------------------------------------------------------------- def knn_validate_src(schema_madlib, point_source, point_column_name, label_column_name, @@ -135,6 +136,23 @@ def knn_validate_src(schema_madlib, point_source, point_column_name, label_colum def knn(schema_madlib, point_source, point_column_name, label_column_name, test_source, test_column_name, id_column_name, output_table, operation, k): + """ + KNN function to find the K Nearest neighbours + Args: + @param schema_madlib Name of the Madlib Schema + @param point_source Training data table + @param point_column_name Name of the column with training data points. + @param label_column_name Name of the column with labels/values of training data points. + @param test_source Name of the table containing the test data points. + @param test_column_name Name of the column with testing data points. 
+ @param id_column_name Name of the column having ids of data points in test data table. + @param output_table Name of the table to store final results. + @param k default: 1. Number of nearest neighbors to consider + + + Returns: + VARCHAR Name of the output table. + """ oldClientMinMessages = plpy.execute("SELECT setting FROM pg_settings WHERE name = 'client_min_messages'")[0]['setting']; From 869ec2afeeb6b2bed3cf949d0fc69342d4f71665 Mon Sep 17 00:00:00 2001 From: Himanshu Pandey Date: Fri, 25 Aug 2017 14:55:28 -0700 Subject: [PATCH 4/7] kNN: Refactor code for ease of use JIRA: MADLIB-927 Additional author: Orhan Kislal Closes #168 --- src/ports/postgres/modules/knn/knn.py_in | 300 +++++++++------------- src/ports/postgres/modules/knn/knn.sql_in | 22 +- 2 files changed, 146 insertions(+), 176 deletions(-) diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in index 7b474bff9..bc2ef5c1d 100644 --- a/src/ports/postgres/modules/knn/knn.py_in +++ b/src/ports/postgres/modules/knn/knn.py_in @@ -17,8 +17,6 @@ # specific language governing permissions and limitations # under the License. 
-m4_changequote(`') - """ @file knn.py_in @@ -26,205 +24,157 @@ m4_changequote(`') @namespace knn -@brief knn: Driver functions """ import plpy -from utilities.validate_args import table_exists -from utilities.validate_args import table_is_empty -from utilities.validate_args import columns_exist_in_table -from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import input_tbl_valid, output_tbl_valid +from utilities.validate_args import cols_in_tbl_valid from utilities.validate_args import is_col_array from utilities.validate_args import array_col_has_no_null -from utilities.validate_args import get_cols_and_types +from utilities.validate_args import get_expr_type from utilities.utilities import unique_string +from utilities.control import MinWarning -STATE_IN_MEM = m4_ifdef(, , ) -HAS_FUNCTION_PROPERTIES = m4_ifdef(, , -) UDF_ON_SEGMENT_NOT_ALLOWED = -m4_ifdef(, , ) # ----------------------------------------------------------------------- +def knn_validate_src(schema_madlib, point_source, point_column_name, + label_column_name, test_source, test_column_name, + id_column_name, output_table, operation, k, **kwargs): -def knn_validate_src(schema_madlib, point_source, point_column_name, label_column_name, - test_source, test_column_name, id_column_name, output_table, operation, k, **kwargs): if not operation or operation not in ['c', 'r']: - plpy.error("kNN Error: operation='{0}' is an invalid value, has to be 'r' for regression OR 'c' for classification.".format(operation)) - if not point_source: - plpy.error("kNN Error: Invalid training table name.") - if not table_exists(point_source): - plpy.error("kNN Error: Training table '{0}' does not exist.".format(point_source)) - if table_is_empty(point_source): - plpy.error("kNN Error: Training table '{0}' is empty.".format(point_source)) - - if not test_source: - plpy.error("kNN Error: Invalid test table name.") - if not table_exists(test_source): - plpy.error("kNN Error: Test table '{0}' 
does not exist.".format(test_source)) - if table_is_empty(test_source): - plpy.error("kNN Error: Test table '{0}' is empty.".format(test_source)) - - for c in (label_column_name, point_column_name): - if not c: - plpy.error("kNN Error: Invalid column name in training table.") - if not columns_exist_in_table(point_source, [c]): - plpy.error("kNN Error: " + \ - "Column '{0}' does not exist in {1}.".format(c, point_source)) - - for c in (test_column_name, id_column_name): - if not c: - plpy.error("kNN Error: Invalid column name in test table.") - if not columns_exist_in_table(test_source, [c]): - plpy.error("kNN Error: " + \ - "Column '{0}' does not exist in {1}.".format(c, test_source)) + plpy.error("kNN Error: operation='{0}' is an invalid value, has to be" + " 'r' for regression OR 'c' for classification.". + format(operation)) + input_tbl_valid(point_source, 'kNN') + input_tbl_valid(test_source, 'kNN') + output_tbl_valid(output_table, 'kNN') + cols_in_tbl_valid(point_source, (label_column_name, point_column_name), 'kNN') + cols_in_tbl_valid(test_source, (test_column_name, id_column_name), 'kNN') if not is_col_array(point_source, point_column_name): - plpy.error("kNN Error: " + \ - "Feature column '{0}' in train table is not an array.".format(point_column_name)) + plpy.error("kNN Error: Feature column '{0}' in train table is not" + " an array.").format(point_column_name) if not is_col_array(test_source, test_column_name): - plpy.error("kNN Error: " + \ - "Feature column '{0}' in test table is not an array.".format(test_column_name)) + plpy.error("kNN Error: Feature column '{0}' in test table is not" + " an array.").format(test_column_name) if not array_col_has_no_null(point_source, point_column_name): - plpy.error("kNN Error: " + \ - "Feature column '{0}' in train table has some NULL values.".format(point_column_name)) + plpy.error("kNN Error: Feature column '{0}' in train table has some" + " NULL values.".format(point_column_name)) if not 
array_col_has_no_null(test_source, test_column_name): - plpy.error("kNN Error: " + \ - "Feature column '{0}' in test table has some NULL values.".format(test_column_name)) - - if not output_table: - plpy.error("kNN Error: Invalid output table name") - if table_exists(output_table): - plpy.error("kNN Error: Table '{0}' already exists, cannot use it as output table.".format(output_table)) + plpy.error("kNN Error: Feature column '{0}' in test table has some" + " NULL values.".format(test_column_name)) if k is None: k = 1 - if k<=0: - plpy.error("kNN Error: k='{0}' is an invalid value, must be greater than 0.".format(k)) - bound = plpy.execute("""SELECT {k} <= count(*) - AS bound FROM {tbl}""".format(k=k, - point_column_name=point_column_name, tbl=point_source))[0]['bound'] + if k <= 0: + plpy.error("kNN Error: k={0} is an invalid value, must be greater" + "than 0.".format(k)) + bound = plpy.execute("SELECT {k} <= count(*) AS bound FROM {tbl}". + format(k=k, tbl=point_source))[0]['bound'] if not bound: - plpy.error("kNN Error: " + \ - "k='{0}' is greater than number of rows in training table.".format(k)) - - colTypesList = get_cols_and_types(point_source) - colType = '' - for type in colTypesList: - if type[0] == label_column_name: - colType = type[1] - break - if colType not in ['INTEGER','integer','double precision','DOUBLE PRECISION','float','FLOAT','boolean','BOOLEAN'] : - plpy.error("kNN Error: " + \ - "Data type '{0}' is not a valid type for column '{1}' in table '{2}'.".format(colType, label_column_name, point_source)) - - colTypesTestList = get_cols_and_types(test_source) - colType = '' - for type in colTypesTestList: - if type[0] == id_column_name: - colType = type[1] - break - if colType not in ['INTEGER','integer'] : - plpy.error("kNN Error: " + \ - "Data type '{0}' is not a valid type for column '{1}' in table '{2}'.".format(colType, id_column_name, test_source)) + plpy.error("kNN Error: k={0} is greater than number of rows in" + " training 
table.".format(k)) + + col_type = get_expr_type(label_column_name, point_source).lower() + if col_type not in ['integer', 'double precision', 'float', 'boolean']: + plpy.error("kNN error: Data type '{0}' is not a valid type for" + " column '{1}' in table '{2}'.". + format(col_type, label_column_name, point_source)) + + col_type_test = get_expr_type(id_column_name, test_source).lower() + if col_type_test not in ['integer']: + plpy.error("kNN Error: Data type '{0}' is not a valid type for" + " column '{1}' in table '{2}'.". + format(col_type_test, id_column_name, test_source)) return k - - - +# ------------------------------------------------------------------------------ def knn(schema_madlib, point_source, point_column_name, label_column_name, - test_source, test_column_name, id_column_name, output_table, operation, k): - + test_source, test_column_name, id_column_name, output_table, + operation, k): """ KNN function to find the K Nearest neighbours Args: - @param schema_madlib Name of the Madlib Schema - @param point_source Training data table - @param point_column_name Name of the column with training data points. - @param label_column_name Name of the column with labels/values of training data points. - @param test_source Name of the table containing the test data points. - @param test_column_name Name of the column with testing data points. - @param id_column_name Name of the column having ids of data points in test data table. - @param output_table Name of the table to store final results. - @param k default: 1. Number of nearest neighbors to consider - - - Returns: - VARCHAR Name of the output table. 
- """ - - - oldClientMinMessages = plpy.execute("SELECT setting FROM pg_settings WHERE name = 'client_min_messages'")[0]['setting']; - - plpy.execute("SET client_min_messages TO warning"); - - - k_val = knn_validate_src(schema_madlib, point_source, point_column_name, - label_column_name, test_source, - test_column_name, id_column_name, - output_table, operation, k) - - - plpy.execute("SELECT {schema_madlib}.create_schema_pg_temp()".format(schema_madlib = schema_madlib)); - - x_temp_table = unique_string(desp='x_temp_table') - y_temp_table = unique_string(desp='y_temp_table') - label_column_name_unique = unique_string(desp='label_column_name_unique') - test_id = unique_string(desp='test_id') - - convert_boolean_to_int = ''; - if operation == 'c': - convert_boolean_to_int = '::INTEGER'; - - madlib_knn_interm = unique_string(desp='madlib_knn_interm') - - plpy.execute("""DROP TABLE IF EXISTS pg_temp.{madlib_knn_interm}""".format(**locals())); - plpy.execute( + @param schema_madlib Name of the Madlib Schema + @param point_source Training data table + @param point_column_name Name of the column with training data + points. + @param label_column_name Name of the column with labels/values + of training data points. + @param test_source Name of the table containing the test + data points. + @param test_column_name Name of the column with testing data + points. + @param id_column_name Name of the column having ids of data + points in test data table. + @param output_table Name of the table to store final + results. + @param operation Flag for the operation: + 'c' for classification and + 'r' for regression + @param k default: 1. Number of nearest + neighbors to consider + Returns: + VARCHAR Name of the output table. 
""" - CREATE TEMP TABLE pg_temp.{madlib_knn_interm} AS - SELECT * - FROM - ( - SELECT row_number() over (partition by {test_id} order by dist) AS r , {x_temp_table}.* - FROM - ( - SELECT test.{id_column_name} AS {test_id} , {schema_madlib}.squared_dist_norm2(train.{point_column_name} ,test.{test_column_name}) AS dist, train.{label_column_name} {convert_boolean_to_int} AS {label_column_name_unique} - FROM {point_source} AS train, {test_source} AS test - ) {x_temp_table} - ){y_temp_table} - WHERE {y_temp_table}.r <= {k_val}""".format(**locals())); - - if operation == 'c': - plpy.execute( - """ - CREATE TABLE {output_table} AS - SELECT {test_id} AS id, {test_column_name} , {schema_madlib}.mode({label_column_name_unique}) AS prediction - FROM pg_temp.{madlib_knn_interm} join {test_source} ON {test_id} = {id_column_name} - GROUP BY {test_id} , {test_column_name}""".format(**locals())) - - - else: + with MinWarning('warning'): + k_val = knn_validate_src(schema_madlib, point_source, + point_column_name, label_column_name, + test_source, test_column_name, id_column_name, + output_table, operation, k) + + x_temp_table = unique_string(desp='x_temp_table') + y_temp_table = unique_string(desp='y_temp_table') + label_col_temp = unique_string(desp='label_col_temp') + test_id = unique_string(desp='test_id') + + is_classification = operation == 'c' + interim_table = unique_string(desp='interim_table') plpy.execute( - """ - CREATE TABLE {output_table} AS - SELECT {test_id} AS id, {test_column_name} , avg( {label_column_name_unique} ) AS prediction - FROM - pg_temp.{madlib_knn_interm} join {test_source} on {test_id} ={id_column_name} - GROUP BY {test_id} , {test_column_name} - ORDER BY {test_id}""".format(**locals())) - - - plpy.execute("SET client_min_messages TO "+ oldClientMinMessages) - - if operation == 'c': - returnstring = 'The classification results have been written to output table '+ output_table; - else: - returnstring = 'The regression results have been written to output 
table '+ output_table; - - plpy.execute("""DROP TABLE pg_temp.{madlib_knn_interm}""".format(**locals())); - - return returnstring; - - - + """ + CREATE TEMP TABLE {interim_table} AS + SELECT * FROM + ( + SELECT row_number() over + (partition by {test_id} order by dist) AS r, + {x_temp_table}.* + FROM + ( + SELECT test.{id_column_name} AS {test_id} , + {schema_madlib}.squared_dist_norm2( + train.{point_column_name}, + test.{test_column_name}) + AS dist, + train.{label_column_name}{cast_to_int} + AS {label_col_temp} + FROM {point_source} AS train, {test_source} AS test + ) {x_temp_table} + ) {y_temp_table} + WHERE {y_temp_table}.r <= {k_val} + """.format(cast_to_int='::INTEGER' if is_classification else '', + **locals())) + + if is_classification: + plpy.execute( + """ + CREATE TABLE {output_table} AS + SELECT {test_id} AS id, {test_column_name}, + {schema_madlib}.mode({label_col_temp}) AS prediction + FROM {interim_table} JOIN {test_source} + ON {test_id} = {id_column_name} + GROUP BY {test_id}, {test_column_name} + """.format(**locals())) + else: + plpy.execute( + """ + CREATE TABLE {output_table} AS + SELECT {test_id} AS id, {test_column_name}, + AVG({label_col_temp}) AS prediction + FROM + {interim_table} JOIN {test_source} + ON {test_id} = {id_column_name} + GROUP BY {test_id}, {test_column_name} + """.format(**locals())) + plpy.execute("DROP TABLE IF EXISTS {0}".format(interim_table)) +# ------------------------------------------------------------------------------ \ No newline at end of file diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in index 373e92418..865b709c5 100644 --- a/src/ports/postgres/modules/knn/knn.sql_in +++ b/src/ports/postgres/modules/knn/knn.sql_in @@ -308,7 +308,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( ) RETURNS VOID AS $$ BEGIN IF arg1 = 'help' OR arg1 = 'usage' OR arg1 = '?' 
THEN - RAISE NOTICE + RAISE NOTICE ' ----------------------------------------------------------------------- USAGE @@ -385,3 +385,23 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( ) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( + point_source VARCHAR, + point_column_name VARCHAR, + label_column_name VARCHAR, + test_source VARCHAR, + test_column_name VARCHAR, + id_column_name VARCHAR, + output_table VARCHAR, + operation VARCHAR +) RETURNS VARCHAR AS $$ +DECLARE + returnstring VARCHAR; +BEGIN + returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1); + RETURN returnstring; +END; +$$ LANGUAGE plpgsql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); \ No newline at end of file From 83bbe29d7dc21081a7569c9e5360dc11d949b03b Mon Sep 17 00:00:00 2001 From: Jingyi Date: Mon, 21 Aug 2017 17:49:53 -0700 Subject: [PATCH 5/7] Fix example py_in file Originaly, there is a comment line at the end of the file that will cause problem during running make. This commit just remove the last comment line. Closes #175 --- examples/hello_world/iterative/simple_logistic.py_in | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/hello_world/iterative/simple_logistic.py_in b/examples/hello_world/iterative/simple_logistic.py_in index 719bc186f..d19740c46 100644 --- a/examples/hello_world/iterative/simple_logistic.py_in +++ b/examples/hello_world/iterative/simple_logistic.py_in @@ -237,4 +237,3 @@ SELECT * from patients_logregr; help_string = "No such option. 
Use {schema_madlib}.logregr_simple_train('help')" return help_string.format(schema_madlib=schema_madlib) -# ------------------------------------------------------------------------ From 4c1d3bf712d27bae8ae133a0a4087a07e6c61dda Mon Sep 17 00:00:00 2001 From: hpandeycodeit Date: Thu, 7 Sep 2017 13:05:51 -0700 Subject: [PATCH 6/7] KNN changes for MADLIB-1129 --- src/ports/postgres/modules/knn/knn.py_in | 154 ++++++++++++------ src/ports/postgres/modules/knn/knn.sql_in | 130 +++++++++------ .../postgres/modules/knn/test/knn.sql_in | 50 ++++-- 3 files changed, 226 insertions(+), 108 deletions(-) diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in index 27cb7353d..0cd17d4ed 100644 --- a/src/ports/postgres/modules/knn/knn.py_in +++ b/src/ports/postgres/modules/knn/knn.py_in @@ -36,19 +36,20 @@ from utilities.utilities import unique_string from utilities.control import MinWarning -def knn_validate_src(schema_madlib, point_source, point_column_name, +def knn_validate_src(schema_madlib, point_source, point_column_name,point_id, label_column_name, test_source, test_column_name, - id_column_name, output_table, operation, k, **kwargs): - - if not operation or operation not in ['c', 'r']: - plpy.error("kNN Error: operation='{0}' is an invalid value, has to be" - " 'r' for regression OR 'c' for classification.". - format(operation)) + test_id, output_table, k, output_neighbors , **kwargs): + # if not operation or operation not in ['c', 'r']: + # plpy.error("kNN Error: operation='{0}' is an invalid value, has to be" + # " 'r' for regression OR 'c' for classification.". 
+ # format(operation)) input_tbl_valid(point_source, 'kNN') input_tbl_valid(test_source, 'kNN') output_tbl_valid(output_table, 'kNN') - cols_in_tbl_valid(point_source, (label_column_name, point_column_name), 'kNN') - cols_in_tbl_valid(test_source, (test_column_name, id_column_name), 'kNN') + if label_column_name is not None and label_column_name != '': + cols_in_tbl_valid(point_source, (label_column_name, point_column_name), 'kNN') + cols_in_tbl_valid(point_source, (point_column_name, point_id), 'kNN') + cols_in_tbl_valid(test_source, (test_column_name, test_id), 'kNN') if not is_col_array(point_source, point_column_name): plpy.error("kNN Error: Feature column '{0}' in train table is not" @@ -75,30 +76,32 @@ def knn_validate_src(schema_madlib, point_source, point_column_name, plpy.error("kNN Error: k={0} is greater than number of rows in" " training table.".format(k)) - col_type = get_expr_type(label_column_name, point_source).lower() - if col_type not in ['integer', 'double precision', 'float', 'boolean']: - plpy.error("kNN error: Data type '{0}' is not a valid type for" + if label_column_name is not None and label_column_name != '': + col_type = get_expr_type(label_column_name, point_source).lower() + if col_type not in ['integer', 'double precision', 'float', 'boolean']: + plpy.error("kNN error: Data type '{0}' is not a valid type for" " column '{1}' in table '{2}'.". format(col_type, label_column_name, point_source)) - col_type_test = get_expr_type(id_column_name, test_source).lower() + col_type_test = get_expr_type(test_id, test_source).lower() if col_type_test not in ['integer']: plpy.error("kNN Error: Data type '{0}' is not a valid type for" " column '{1}' in table '{2}'.". 
- format(col_type_test, id_column_name, test_source)) + format(col_type_test, test_id, test_source)) return k # ------------------------------------------------------------------------------ -def knn(schema_madlib, point_source, point_column_name, label_column_name, - test_source, test_column_name, id_column_name, output_table, - operation, k): +def knn(schema_madlib, point_source, point_column_name,point_id, label_column_name, + test_source, test_column_name, test_id, output_table, k,output_neighbors): """ KNN function to find the K Nearest neighbours Args: @param schema_madlib Name of the Madlib Schema @param point_source Training data table @param point_column_name Name of the column with training data + @param point_id Name of the column having ids of data + point in train data table points. @param label_column_name Name of the column with labels/values of training data points. @@ -106,7 +109,7 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name, data points. @param test_column_name Name of the column with testing data points. - @param id_column_name Name of the column having ids of data + @param test_id Name of the column having ids of data points in test data table. @param output_table Name of the table to store final results. @@ -115,33 +118,85 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name, 'r' for regression @param k default: 1. Number of nearest neighbors to consider + @output_neighbours Outputs the list of k-nearest neighbors + that were used in the voting/averaging. Returns: VARCHAR Name of the output table. 
""" with MinWarning('warning'): k_val = knn_validate_src(schema_madlib, point_source, - point_column_name, label_column_name, - test_source, test_column_name, id_column_name, - output_table, operation, k) + point_column_name, point_id ,label_column_name, + test_source, test_column_name, test_id, + output_table, k , output_neighbors) x_temp_table = unique_string(desp='x_temp_table') y_temp_table = unique_string(desp='y_temp_table') label_col_temp = unique_string(desp='label_col_temp') - test_id = unique_string(desp='test_id') + test_id_temp = unique_string(desp='test_id_temp') + + if output_neighbors is None or '': + output_neighbors=False - is_classification = operation == 'c' interim_table = unique_string(desp='interim_table') + + if label_column_name is None or label_column_name == '': + plpy.execute( + """ + CREATE TEMP TABLE {interim_table} AS + SELECT * FROM + ( + SELECT row_number() over + (partition by {test_id_temp} order by dist) AS r, + {x_temp_table}.* + FROM + ( + SELECT test.{test_id} AS {test_id_temp} , + train.id as train_id , + {schema_madlib}.squared_dist_norm2( + train.{point_column_name}, + test.{test_column_name}) + AS dist + FROM {point_source} AS train, {test_source} AS test + ) {x_temp_table} + ) {y_temp_table} + WHERE {y_temp_table}.r <= {k_val} + """.format(**locals())) + plpy.execute( + """ + CREATE TABLE {output_table} AS + SELECT {test_id_temp} AS id, {test_column_name} , + CASE WHEN {output_neighbors} + THEN array_agg(knn_temp.train_id) + ELSE NULL END AS k_nearest_neighbours + FROM pg_temp.{interim_table} AS knn_temp + join + {test_source} AS knn_test ON + knn_temp.{test_id_temp} = knn_test.{test_id} + GROUP BY {test_id_temp} , {test_column_name} + """.format(**locals())) + return + + + is_classification = False + label_column_type = get_expr_type(label_column_name, point_source).lower() + if label_column_type in ['boolean','integer', 'text']: + is_classification = True + convert_boolean_to_int = '::INTEGER' + else: + is_classification 
= False + plpy.execute( """ CREATE TEMP TABLE {interim_table} AS SELECT * FROM ( SELECT row_number() over - (partition by {test_id} order by dist) AS r, + (partition by {test_id_temp} order by dist) AS r, {x_temp_table}.* FROM ( - SELECT test.{id_column_name} AS {test_id} , + SELECT test.{test_id} AS {test_id_temp} , + train.id as train_id , {schema_madlib}.squared_dist_norm2( train.{point_column_name}, test.{test_column_name}) @@ -152,33 +207,34 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name, ) {x_temp_table} ) {y_temp_table} WHERE {y_temp_table}.r <= {k_val} - """.format(cast_to_int='::INTEGER' if is_classification else '', + """.format(cast_to_int='::INTEGER' if is_classification else '', **locals())) + knn_create_table = 'CREATE TABLE '+output_table+' AS ' \ + 'SELECT '+test_id_temp+' AS id,'+test_column_name+',' + knn_pred_class = schema_madlib+'.mode(' +label_col_temp+') AS prediction' + knn_pred_reg = 'avg(' +label_col_temp+') AS prediction' + knn_neighbours = ', array_agg(knn_temp.train_id) AS k_nearest_neighbours ' + knn_group_by = 'FROM pg_temp.'+interim_table+' AS knn_temp join ' \ + +test_source+' AS knn_test ON knn_temp.'+test_id_temp+'= knn_test.' 
\ + +test_id +' GROUP BY '+test_id_temp+', '+test_column_name + + if is_classification: - plpy.execute( - """ - CREATE TABLE {output_table} AS - SELECT {test_id} AS id, {test_column_name}, - {schema_madlib}.mode({label_col_temp}) AS prediction - FROM {interim_table} JOIN {test_source} - ON {test_id} = {id_column_name} - GROUP BY {test_id}, {test_column_name} - """.format(**locals())) + if output_neighbors: + plpy.execute("""{knn_create_table}{knn_pred_class} + {knn_neighbours}{knn_group_by}""".format(**locals())) + else: + plpy.execute(""" {knn_create_table}{knn_pred_class} + {knn_group_by}""".format(**locals())) else: - plpy.execute( - """ - CREATE TABLE {output_table} AS - SELECT {test_id} AS id, {test_column_name}, - AVG({label_col_temp}) AS prediction - FROM - {interim_table} JOIN {test_source} - ON {test_id} = {id_column_name} - GROUP BY {test_id}, {test_column_name} - """.format(**locals())) + if output_neighbors: + plpy.execute(""" {knn_create_table}{knn_pred_reg} + {knn_neighbours}{knn_group_by}""".format(**locals())) + else: + plpy.execute("""{knn_create_table}{knn_pred_reg} + {knn_group_by}""".format(**locals())) + + plpy.execute("DROP TABLE IF EXISTS {0}".format(interim_table)) -<<<<<<< HEAD -# ------------------------------------------------------------------------------ -======= # ------------------------------------------------------------------------------ ->>>>>>> 6f99dbe4659a5582df61e2fdcc68f7ee60d5859d diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in index 1d547bddf..dfd23748e 100644 --- a/src/ports/postgres/modules/knn/knn.sql_in +++ b/src/ports/postgres/modules/knn/knn.sql_in @@ -71,13 +71,14 @@ neighbors of the given test point.
 knn( point_source,
      point_column_name,
+     point_id,
      label_column_name,
      test_source,
      test_column_name,
-     id_column_name,
+     test_id,
      output_table,
-     operation,
-     k
+     k,
+     output_neighbors
    )
 
@@ -93,8 +94,17 @@ in a column of type DOUBLE PRECISION[].
point_column_name
TEXT. Name of the column with training data points.
+
point_id
+
TEXT. Name of the column in 'point_source' containing source data ids. +The ids are of type INTEGER with no duplicates. They do not need to be contiguous. +This parameter must be used if the list of nearest neighbors is to be output, i.e., +if the parameter 'output_neighbors' below is TRUE or if 'label_column_name' is NULL. +
label_column_name
-
TEXT. Name of the column with labels/values of training data points.
+
TEXT. Name of the column with labels/values of training data points. +If the column is of type Boolean, integer or text, kNN classification is run; if it is of type +double precision, kNN regression is run. +If set to NULL, only the nearest neighbors are returned, without doing classification or regression.
test_source
TEXT. Name of the table containing the test data points. @@ -106,7 +116,7 @@ in a column of type DOUBLE PRECISION[].
test_column_name
TEXT. Name of the column with testing data points.
-
id_column_name
+
test_id
TEXT. Name of the column having ids of data points in test data table.
output_table
@@ -117,7 +127,12 @@ in a column of type DOUBLE PRECISION[].
k (optional)
INTEGER. default: 1. Number of nearest neighbors to consider. -For classification, should be an odd number to break ties.
+For classification, k should be an odd number to break ties; +otherwise the result may depend on the ordering of the input data. + +
output_neighbors (optional)
+
BOOLEAN. default: FALSE. If TRUE, outputs the list of k-nearest +neighbors that were used in the voting/averaging.
@@ -145,15 +160,35 @@ The output of the KNN module is a table with the following columns: @anchor examples @examp --# Prepare some training data: +-# Prepare some training data for classification:
 DROP TABLE IF EXISTS knn_train_data;
 CREATE TABLE knn_train_data (
                     id integer, 
                     data integer[], 
-                    label float
+                    label integer
                     );
 INSERT INTO knn_train_data VALUES
+(1, '{1,1}', 1),
+(2, '{2,2}', 1),
+(3, '{3,3}', 1),
+(4, '{4,4}', 1),
+(5, '{4,5}', 1),
+(6, '{20,50}', 0),
+(7, '{10,31}', 0),
+(8, '{81,13}', 0),
+(9, '{1,111}', 0);
+
+ +-# Prepare some training data for regression: +
+DROP TABLE IF EXISTS knn_train_data_reg;
+CREATE TABLE knn_train_data_reg (
+                    id integer, 
+                    data integer[], 
+                    label float
+                    );
+INSERT INTO knn_train_data_reg VALUES
 (1, '{1,1}', 1.0),
 (2, '{2,2}', 1.0),
 (3, '{3,3}', 1.0),
@@ -187,26 +222,27 @@ DROP TABLE IF EXISTS madlib_knn_result_classification;
 SELECT * FROM madlib.knn( 
                 'knn_train_data',      -- Table of training data
                 'data',                -- Col name of training data
+                'id',                  -- Col Name of id in train data 
                 'label',               -- Training labels
                 'knn_test_data',       -- Table of test data
                 'data',                -- Col name of test data
                 'id',                  -- Col name of id in test data 
                 'madlib_knn_result_classification',  -- Output table
-                'c',                   -- Classification
-                 3                     -- Number of nearest neighbours
+                 3,                    -- Number of nearest neighbours
+                 True                  -- True if you want to show Nearest-Neighbors, False otherwise
                 );
 SELECT * from madlib_knn_result_classification ORDER BY id;
 
Result:
- id |  data   | prediction 
-----+---------+------------
-  1 | {2,1}   |          1
-  2 | {2,6}   |          1
-  3 | {15,40} |          0
-  4 | {12,1}  |          1
-  5 | {2,90}  |          0
-  6 | {50,45} |          0
+ id |  data   | prediction | k_nearest_neighbours 
+----+---------+------------+----------------------
+  1 | {2,1}   |          1 | {1,2,3}
+  2 | {2,6}   |          1 | {5,4,3}
+  3 | {15,40} |          0 | {7,6,5}
+  4 | {12,1}  |          1 | {4,5,3}
+  5 | {2,90}  |          0 | {9,6,7}
+  6 | {50,45} |          0 | {6,7,8}
 (6 rows)
 
@@ -214,28 +250,29 @@ Result:
 DROP TABLE IF EXISTS madlib_knn_result_regression;
 SELECT * FROM madlib.knn( 
-                'knn_train_data',      -- Table of training data
+                'knn_train_data_reg',  -- Table of training data
                 'data',                -- Col name of training data
+                'id',                  -- Col Name of id in train data 
                 'label',               -- Training labels
                 'knn_test_data',       -- Table of test data
                 'data',                -- Col name of test data
                 'id',                  -- Col name of id in test data 
                 'madlib_knn_result_regression',  -- Output table
-                'r',                   -- Regressions
-                 3                     -- Number of nearest neighbours
+                 3,                    -- Number of nearest neighbours
+                True                   -- True if you want to show Nearest-Neighbors, False otherwise
                 );
 SELECT * from madlib_knn_result_regression ORDER BY id;
 
Result:
- id |  data   |    prediction     
-----+---------+-------------------
-  1 | {2,1}   |                 1
-  2 | {2,6}   |                 1
-  3 | {15,40} | 0.333333333333333
-  4 | {12,1}  |                 1
-  5 | {2,90}  |                 0
-  6 | {50,45} |                 0
+ id |  data   |    prediction     | k_nearest_neighbours 
+----+---------+-------------------+----------------------
+  1 | {2,1}   |                 1 | {1,2,3}
+  2 | {2,6}   |                 1 | {5,4,3}
+  3 | {15,40} | 0.333333333333333 | {7,6,5}
+  4 | {12,1}  |                 1 | {4,5,3}
+  5 | {2,90}  |                 0 | {9,6,7}
+  6 | {50,45} |                 0 | {6,7,8}
 (6 rows)
 
@@ -281,7 +318,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src( label_column_name VARCHAR, test_source VARCHAR, test_column_name VARCHAR, - id_column_name VARCHAR, + test_id VARCHAR, output_table VARCHAR, operation VARCHAR, k INTEGER @@ -294,7 +331,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src( label_column_name, test_source, test_column_name, - id_column_name, + test_id, output_table, operation, k @@ -316,13 +353,14 @@ BEGIN SELECT {schema_madlib}.knn( point_source, -- Training data table having training features as vector column and labels point_column_name, -- Name of column having feature vectors in training data table + point_id, -- Name of column having feature vector Ids in train data table label_column_name, -- Name of column having actual label/vlaue for corresponding feature vector in training data table test_source, -- Test data table having features as vector column. Id of features is mandatory test_column_name, -- Name of column having feature vectors in test data table - id_column_name, -- Name of column having feature vector Ids in test data table + test_id, -- Name of column having feature vector Ids in test data table output_table, -- Name of output table - operation, -- c for classification task, r for regression task - k -- value of k. Default will go as 1 + k, -- value of k. Default will go as 1 + output_neighbors -- Outputs the list of k-nearest neighbors that were used in the voting/averaging. ); ----------------------------------------------------------------------- @@ -333,6 +371,7 @@ The output of the KNN module is a table with the following columns: id The ids of test data points. test_column_name The test data points. prediction The output of KNN- label in case of classification, average value in case of regression. +k_nearest_neighbours The list of k-nearest neighbors that were used in the voting/averaging. 
'; END IF; END; @@ -362,26 +401,28 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( point_source VARCHAR, point_column_name VARCHAR, + point_id VARCHAR, label_column_name VARCHAR, test_source VARCHAR, test_column_name VARCHAR, - id_column_name VARCHAR, + test_id VARCHAR, output_table VARCHAR, - operation VARCHAR, - k INTEGER + k INTEGER, + output_neighbors Boolean ) RETURNS VARCHAR AS $$ PythonFunctionBodyOnly(`knn', `knn') return knn.knn( schema_madlib, point_source, point_column_name, + point_id, label_column_name, test_source, test_column_name, - id_column_name, + test_id, output_table, - operation, - k + k, + output_neighbors ) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -390,22 +431,19 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( point_source VARCHAR, point_column_name VARCHAR, + point_id VARCHAR, label_column_name VARCHAR, test_source VARCHAR, test_column_name VARCHAR, - id_column_name VARCHAR, + test_id VARCHAR, output_table VARCHAR, - operation VARCHAR + output_neighbors Boolean ) RETURNS VARCHAR AS $$ DECLARE returnstring VARCHAR; BEGIN - returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1); + returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1,$9); RETURN returnstring; END; $$ LANGUAGE plpgsql VOLATILE -<<<<<<< HEAD -m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); -======= m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); ->>>>>>> 6f99dbe4659a5582df61e2fdcc68f7ee60d5859d diff --git a/src/ports/postgres/modules/knn/test/knn.sql_in b/src/ports/postgres/modules/knn/test/knn.sql_in index c7d6798c6..405068c26 100644 --- a/src/ports/postgres/modules/knn/test/knn.sql_in +++ b/src/ports/postgres/modules/knn/test/knn.sql_in @@ -26,12 +26,29 @@ m4_include(`SQLCommon.m4') * FIXME: Verify results * 
-------------------------------------------------------------------------- */ -drop table if exists "KNN_TRAIN_DATA"; -create table "KNN_TRAIN_DATA" ( - id integer, - "DATA" integer[], - label float); -copy "KNN_TRAIN_DATA" (id, "DATA", label) from stdin delimiter '|'; +drop table if exists knn_train_data; +create table knn_train_data ( +id integer, +data integer[], +label integer); +copy knn_train_data (id, data, label) from stdin delimiter '|'; +1|{1,1}|1 +2|{2,2}|1 +3|{3,3}|1 +4|{4,4}|1 +5|{4,5}|1 +6|{20,50}|0 +7|{10,31}|0 +8|{81,13}|0 +9|{1,111}|0 +\. +DROP TABLE IF EXISTS knn_train_data_reg; +CREATE TABLE knn_train_data_reg ( + id integer, + data integer[], + label float + ); +COPY knn_train_data_reg (id, data, label) from stdin delimiter '|'; 1|{1,1}|1.0 2|{2,2}|1.0 3|{3,3}|1.0 @@ -42,11 +59,10 @@ copy "KNN_TRAIN_DATA" (id, "DATA", label) from stdin delimiter '|'; 8|{81,13}|0.0 9|{1,111}|0.0 \. -drop table if exists knn_test_data; create table knn_test_data ( - id integer, - "DATA" integer[]); -copy knn_test_data (id, "DATA") from stdin delimiter '|'; +id integer, +data integer[]); +copy knn_test_data (id, data) from stdin delimiter '|'; 1|{2,1} 2|{2,6} 3|{15,40} @@ -55,15 +71,23 @@ copy knn_test_data (id, "DATA") from stdin delimiter '|'; 6|{50,45} \. 
drop table if exists madlib_knn_result_classification; -select knn('"KNN_TRAIN_DATA"','"DATA"','label','knn_test_data','"DATA"','id','madlib_knn_result_classification','c',3); +select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',3,False); select assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=3') from madlib_knn_result_classification; +drop table if exists madlib_knn_result_classification; +select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',3,True); +select assert(array_agg(k_nearest_neighbours order by id)='{ {1,2,3},{5,4,3},{7,6,5},{4,5,3},{9,6,7},{6,7,8} }', 'Wrong output in classification with k=3') from madlib_knn_result_classification; + drop table if exists madlib_knn_result_regression; -select knn('"KNN_TRAIN_DATA"','"DATA"','label','knn_test_data','"DATA"','id','madlib_knn_result_regression','r',4); +select knn('knn_train_data_reg','data','id','label','knn_test_data','data','id','madlib_knn_result_regression',4,False); select assert(array_agg(prediction order by id)='{1,1,0.5,1,0.25,0.25}', 'Wrong output in regression') from madlib_knn_result_regression; +drop table if exists madlib_knn_result_regression; +select knn('knn_train_data_reg','data','id','label','knn_test_data','data','id','madlib_knn_result_regression',3,True); +select assert(array_agg(k_nearest_neighbours order by id)='{ {1,2,3},{5,4,3},{7,6,5},{4,5,3},{9,6,7},{6,7,8} }', 'Wrong output in regression') from madlib_knn_result_regression; + drop table if exists madlib_knn_result_classification; -select knn('"KNN_TRAIN_DATA"','"DATA"','label','knn_test_data','"DATA"','id','madlib_knn_result_classification','c'); +select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',False); select assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with 
k=1') from madlib_knn_result_classification; select knn(); From 8eb2b9119de695813a81b0b2d856d15bca345428 Mon Sep 17 00:00:00 2001 From: hpandeycodeit Date: Fri, 8 Sep 2017 14:50:49 -0700 Subject: [PATCH 7/7] Revert "KNN changes for MADLIB-1129" This reverts commit 4c1d3bf712d27bae8ae133a0a4087a07e6c61dda. --- src/ports/postgres/modules/knn/knn.py_in | 154 ++++++------------ src/ports/postgres/modules/knn/knn.sql_in | 130 ++++++--------- .../postgres/modules/knn/test/knn.sql_in | 50 ++---- 3 files changed, 108 insertions(+), 226 deletions(-) diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in index 0cd17d4ed..27cb7353d 100644 --- a/src/ports/postgres/modules/knn/knn.py_in +++ b/src/ports/postgres/modules/knn/knn.py_in @@ -36,20 +36,19 @@ from utilities.utilities import unique_string from utilities.control import MinWarning -def knn_validate_src(schema_madlib, point_source, point_column_name,point_id, +def knn_validate_src(schema_madlib, point_source, point_column_name, label_column_name, test_source, test_column_name, - test_id, output_table, k, output_neighbors , **kwargs): - # if not operation or operation not in ['c', 'r']: - # plpy.error("kNN Error: operation='{0}' is an invalid value, has to be" - # " 'r' for regression OR 'c' for classification.". - # format(operation)) + id_column_name, output_table, operation, k, **kwargs): + + if not operation or operation not in ['c', 'r']: + plpy.error("kNN Error: operation='{0}' is an invalid value, has to be" + " 'r' for regression OR 'c' for classification.". 
+ format(operation)) input_tbl_valid(point_source, 'kNN') input_tbl_valid(test_source, 'kNN') output_tbl_valid(output_table, 'kNN') - if label_column_name is not None and label_column_name != '': - cols_in_tbl_valid(point_source, (label_column_name, point_column_name), 'kNN') - cols_in_tbl_valid(point_source, (point_column_name, point_id), 'kNN') - cols_in_tbl_valid(test_source, (test_column_name, test_id), 'kNN') + cols_in_tbl_valid(point_source, (label_column_name, point_column_name), 'kNN') + cols_in_tbl_valid(test_source, (test_column_name, id_column_name), 'kNN') if not is_col_array(point_source, point_column_name): plpy.error("kNN Error: Feature column '{0}' in train table is not" @@ -76,32 +75,30 @@ def knn_validate_src(schema_madlib, point_source, point_column_name,point_id, plpy.error("kNN Error: k={0} is greater than number of rows in" " training table.".format(k)) - if label_column_name is not None and label_column_name != '': - col_type = get_expr_type(label_column_name, point_source).lower() - if col_type not in ['integer', 'double precision', 'float', 'boolean']: - plpy.error("kNN error: Data type '{0}' is not a valid type for" + col_type = get_expr_type(label_column_name, point_source).lower() + if col_type not in ['integer', 'double precision', 'float', 'boolean']: + plpy.error("kNN error: Data type '{0}' is not a valid type for" " column '{1}' in table '{2}'.". format(col_type, label_column_name, point_source)) - col_type_test = get_expr_type(test_id, test_source).lower() + col_type_test = get_expr_type(id_column_name, test_source).lower() if col_type_test not in ['integer']: plpy.error("kNN Error: Data type '{0}' is not a valid type for" " column '{1}' in table '{2}'.". 
- format(col_type_test, test_id, test_source)) + format(col_type_test, id_column_name, test_source)) return k # ------------------------------------------------------------------------------ -def knn(schema_madlib, point_source, point_column_name,point_id, label_column_name, - test_source, test_column_name, test_id, output_table, k,output_neighbors): +def knn(schema_madlib, point_source, point_column_name, label_column_name, + test_source, test_column_name, id_column_name, output_table, + operation, k): """ KNN function to find the K Nearest neighbours Args: @param schema_madlib Name of the Madlib Schema @param point_source Training data table @param point_column_name Name of the column with training data - @param point_id Name of the column having ids of data - point in train data table points. @param label_column_name Name of the column with labels/values of training data points. @@ -109,7 +106,7 @@ def knn(schema_madlib, point_source, point_column_name,point_id, label_column_na data points. @param test_column_name Name of the column with testing data points. - @param test_id Name of the column having ids of data + @param id_column_name Name of the column having ids of data points in test data table. @param output_table Name of the table to store final results. @@ -118,85 +115,33 @@ def knn(schema_madlib, point_source, point_column_name,point_id, label_column_na 'r' for regression @param k default: 1. Number of nearest neighbors to consider - @output_neighbours Outputs the list of k-nearest neighbors - that were used in the voting/averaging. Returns: VARCHAR Name of the output table. 
""" with MinWarning('warning'): k_val = knn_validate_src(schema_madlib, point_source, - point_column_name, point_id ,label_column_name, - test_source, test_column_name, test_id, - output_table, k , output_neighbors) + point_column_name, label_column_name, + test_source, test_column_name, id_column_name, + output_table, operation, k) x_temp_table = unique_string(desp='x_temp_table') y_temp_table = unique_string(desp='y_temp_table') label_col_temp = unique_string(desp='label_col_temp') - test_id_temp = unique_string(desp='test_id_temp') - - if output_neighbors is None or '': - output_neighbors=False + test_id = unique_string(desp='test_id') + is_classification = operation == 'c' interim_table = unique_string(desp='interim_table') - - if label_column_name is None or label_column_name == '': - plpy.execute( - """ - CREATE TEMP TABLE {interim_table} AS - SELECT * FROM - ( - SELECT row_number() over - (partition by {test_id_temp} order by dist) AS r, - {x_temp_table}.* - FROM - ( - SELECT test.{test_id} AS {test_id_temp} , - train.id as train_id , - {schema_madlib}.squared_dist_norm2( - train.{point_column_name}, - test.{test_column_name}) - AS dist - FROM {point_source} AS train, {test_source} AS test - ) {x_temp_table} - ) {y_temp_table} - WHERE {y_temp_table}.r <= {k_val} - """.format(**locals())) - plpy.execute( - """ - CREATE TABLE {output_table} AS - SELECT {test_id_temp} AS id, {test_column_name} , - CASE WHEN {output_neighbors} - THEN array_agg(knn_temp.train_id) - ELSE NULL END AS k_nearest_neighbours - FROM pg_temp.{interim_table} AS knn_temp - join - {test_source} AS knn_test ON - knn_temp.{test_id_temp} = knn_test.{test_id} - GROUP BY {test_id_temp} , {test_column_name} - """.format(**locals())) - return - - - is_classification = False - label_column_type = get_expr_type(label_column_name, point_source).lower() - if label_column_type in ['boolean','integer', 'text']: - is_classification = True - convert_boolean_to_int = '::INTEGER' - else: - is_classification 
= False - plpy.execute( """ CREATE TEMP TABLE {interim_table} AS SELECT * FROM ( SELECT row_number() over - (partition by {test_id_temp} order by dist) AS r, + (partition by {test_id} order by dist) AS r, {x_temp_table}.* FROM ( - SELECT test.{test_id} AS {test_id_temp} , - train.id as train_id , + SELECT test.{id_column_name} AS {test_id} , {schema_madlib}.squared_dist_norm2( train.{point_column_name}, test.{test_column_name}) @@ -207,34 +152,33 @@ def knn(schema_madlib, point_source, point_column_name,point_id, label_column_na ) {x_temp_table} ) {y_temp_table} WHERE {y_temp_table}.r <= {k_val} - """.format(cast_to_int='::INTEGER' if is_classification else '', + """.format(cast_to_int='::INTEGER' if is_classification else '', **locals())) - knn_create_table = 'CREATE TABLE '+output_table+' AS ' \ - 'SELECT '+test_id_temp+' AS id,'+test_column_name+',' - knn_pred_class = schema_madlib+'.mode(' +label_col_temp+') AS prediction' - knn_pred_reg = 'avg(' +label_col_temp+') AS prediction' - knn_neighbours = ', array_agg(knn_temp.train_id) AS k_nearest_neighbours ' - knn_group_by = 'FROM pg_temp.'+interim_table+' AS knn_temp join ' \ - +test_source+' AS knn_test ON knn_temp.'+test_id_temp+'= knn_test.' 
\ - +test_id +' GROUP BY '+test_id_temp+', '+test_column_name - - if is_classification: - if output_neighbors: - plpy.execute("""{knn_create_table}{knn_pred_class} - {knn_neighbours}{knn_group_by}""".format(**locals())) - else: - plpy.execute(""" {knn_create_table}{knn_pred_class} - {knn_group_by}""".format(**locals())) + plpy.execute( + """ + CREATE TABLE {output_table} AS + SELECT {test_id} AS id, {test_column_name}, + {schema_madlib}.mode({label_col_temp}) AS prediction + FROM {interim_table} JOIN {test_source} + ON {test_id} = {id_column_name} + GROUP BY {test_id}, {test_column_name} + """.format(**locals())) else: - if output_neighbors: - plpy.execute(""" {knn_create_table}{knn_pred_reg} - {knn_neighbours}{knn_group_by}""".format(**locals())) - else: - plpy.execute("""{knn_create_table}{knn_pred_reg} - {knn_group_by}""".format(**locals())) - - + plpy.execute( + """ + CREATE TABLE {output_table} AS + SELECT {test_id} AS id, {test_column_name}, + AVG({label_col_temp}) AS prediction + FROM + {interim_table} JOIN {test_source} + ON {test_id} = {id_column_name} + GROUP BY {test_id}, {test_column_name} + """.format(**locals())) plpy.execute("DROP TABLE IF EXISTS {0}".format(interim_table)) +<<<<<<< HEAD +# ------------------------------------------------------------------------------ +======= # ------------------------------------------------------------------------------ +>>>>>>> 6f99dbe4659a5582df61e2fdcc68f7ee60d5859d diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in index dfd23748e..1d547bddf 100644 --- a/src/ports/postgres/modules/knn/knn.sql_in +++ b/src/ports/postgres/modules/knn/knn.sql_in @@ -71,14 +71,13 @@ neighbors of the given test point.
 knn( point_source,
      point_column_name,
-     point_id,
      label_column_name,
      test_source,
      test_column_name,
-     test_id,
+     id_column_name,
      output_table,
-     k,
-     output_neighbors
+     operation,
+     k
    )
 
@@ -94,17 +93,8 @@ in a column of type DOUBLE PRECISION[].
point_column_name
TEXT. Name of the column with training data points.
-
point_id
-
TEXT. Name of the column in 'point_source’ containing source data ids. -The ids are of type INTEGER with no duplicates. They do not need to be contiguous. -This parameter must be used if the list of nearest neighbors are to be output, i.e., -if the parameter 'output_neighbors' below is TRUE or if 'label_column_name' is NULL. -
label_column_name
-
TEXT. Name of the column with labels/values of training data points. -If Boolean, integer or text types will run knn classification, else if -double precision values will run knn regression. -If you set this to NULL will return neighbors only without doing classification or regression.
+
TEXT. Name of the column with labels/values of training data points.
test_source
TEXT. Name of the table containing the test data points. @@ -116,7 +106,7 @@ in a column of type DOUBLE PRECISION[].
test_column_name
TEXT. Name of the column with testing data points.
-
test_id
+
id_column_name
TEXT. Name of the column having ids of data points in test data table.
output_table
@@ -127,12 +117,7 @@ in a column of type DOUBLE PRECISION[].
k (optional)
INTEGER. default: 1. Number of nearest neighbors to consider. -For classification, should be an odd number to break ties. -otherwise result may depend on ordering of the input data.
- -
output_neighbors (optional)
-
BOOLEAN default: FALSE. Outputs the list of k-nearest -neighbors that were used in the voting/averaging.
+For classification, should be an odd number to break ties. @@ -160,35 +145,15 @@ The output of the KNN module is a table with the following columns: @anchor examples @examp --# Prepare some training data for classification: +-# Prepare some training data:
 DROP TABLE IF EXISTS knn_train_data;
 CREATE TABLE knn_train_data (
-                    id integer, 
-                    data integer[], 
-                    label integer
-                    );
-INSERT INTO knn_train_data VALUES
-(1, '{1,1}', 1),
-(2, '{2,2}', 1),
-(3, '{3,3}', 1),
-(4, '{4,4}', 1),
-(5, '{4,5}', 1),
-(6, '{20,50}', 0),
-(7, '{10,31}', 0),
-(8, '{81,13}', 0),
-(9, '{1,111}', 0);
-
- --# Prepare some training data for regression: -
-DROP TABLE IF EXISTS knn_train_data_reg;
-CREATE TABLE knn_train_data_reg (
                     id integer, 
                     data integer[], 
                     label float
                     );
-INSERT INTO knn_train_data_reg VALUES
+INSERT INTO knn_train_data VALUES
 (1, '{1,1}', 1.0),
 (2, '{2,2}', 1.0),
 (3, '{3,3}', 1.0),
@@ -222,27 +187,26 @@ DROP TABLE IF EXISTS madlib_knn_result_classification;
 SELECT * FROM madlib.knn( 
                 'knn_train_data',      -- Table of training data
                 'data',                -- Col name of training data
-                'id',                  -- Col Name of id in train data 
                 'label',               -- Training labels
                 'knn_test_data',       -- Table of test data
                 'data',                -- Col name of test data
                 'id',                  -- Col name of id in test data 
                 'madlib_knn_result_classification',  -- Output table
+                'c',                   -- Classification
                  3                     -- Number of nearest neighbours
-                 True                  -- True if you want to show Nearest-Neighbors, False otherwise
                 );
 SELECT * from madlib_knn_result_classification ORDER BY id;
 
Result:
- id |  data   | prediction | k_nearest_neighbours 
-----+---------+------------+----------------------
-  1 | {2,1}   |          1 | {1,2,3}
-  2 | {2,6}   |          1 | {5,4,3}
-  3 | {15,40} |          0 | {7,6,5}
-  4 | {12,1}  |          1 | {4,5,3}
-  5 | {2,90}  |          0 | {9,6,7}
-  6 | {50,45} |          0 | {6,7,8}
+ id |  data   | prediction 
+----+---------+------------
+  1 | {2,1}   |          1
+  2 | {2,6}   |          1
+  3 | {15,40} |          0
+  4 | {12,1}  |          1
+  5 | {2,90}  |          0
+  6 | {50,45} |          0
 (6 rows)
 
@@ -250,29 +214,28 @@ Result:
 DROP TABLE IF EXISTS madlib_knn_result_regression;
 SELECT * FROM madlib.knn( 
-                'knn_train_data_reg',  -- Table of training data
+                'knn_train_data',      -- Table of training data
                 'data',                -- Col name of training data
-                'id',                  -- Col Name of id in train data 
                 'label',               -- Training labels
                 'knn_test_data',       -- Table of test data
                 'data',                -- Col name of test data
                 'id',                  -- Col name of id in test data 
                 'madlib_knn_result_regression',  -- Output table
-                 3,                    -- Number of nearest neighbours
-                True                   -- True if you want to show Nearest-Neighbors, False otherwise
+                'r',                   -- Regression
+                 3                     -- Number of nearest neighbours
                 );
 SELECT * from madlib_knn_result_regression ORDER BY id;
 
Result:
- id |  data   |    prediction     | k_nearest_neighbours 
-----+---------+-------------------+----------------------
-  1 | {2,1}   |                 1 | {1,2,3}
-  2 | {2,6}   |                 1 | {5,4,3}
-  3 | {15,40} | 0.333333333333333 | {7,6,5}
-  4 | {12,1}  |                 1 | {4,5,3}
-  5 | {2,90}  |                 0 | {9,6,7}
-  6 | {50,45} |                 0 | {6,7,8}
+ id |  data   |    prediction     
+----+---------+-------------------
+  1 | {2,1}   |                 1
+  2 | {2,6}   |                 1
+  3 | {15,40} | 0.333333333333333
+  4 | {12,1}  |                 1
+  5 | {2,90}  |                 0
+  6 | {50,45} |                 0
 (6 rows)
 
@@ -318,7 +281,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src( label_column_name VARCHAR, test_source VARCHAR, test_column_name VARCHAR, - test_id VARCHAR, + id_column_name VARCHAR, output_table VARCHAR, operation VARCHAR, k INTEGER @@ -331,7 +294,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src( label_column_name, test_source, test_column_name, - test_id, + id_column_name, output_table, operation, k @@ -353,14 +316,13 @@ BEGIN SELECT {schema_madlib}.knn( point_source, -- Training data table having training features as vector column and labels point_column_name, -- Name of column having feature vectors in training data table - point_id, -- Name of column having feature vector Ids in train data table label_column_name, -- Name of column having actual label/vlaue for corresponding feature vector in training data table test_source, -- Test data table having features as vector column. Id of features is mandatory test_column_name, -- Name of column having feature vectors in test data table - test_id, -- Name of column having feature vector Ids in test data table + id_column_name, -- Name of column having feature vector Ids in test data table output_table, -- Name of output table - k, -- value of k. Default will go as 1 - output_neighbors -- Outputs the list of k-nearest neighbors that were used in the voting/averaging. + operation, -- c for classification task, r for regression task + k -- value of k. Default will go as 1 ); ----------------------------------------------------------------------- @@ -371,7 +333,6 @@ The output of the KNN module is a table with the following columns: id The ids of test data points. test_column_name The test data points. prediction The output of KNN- label in case of classification, average value in case of regression. -k_nearest_neighbours The list of k-nearest neighbors that were used in the voting/averaging. 
'; END IF; END; @@ -401,28 +362,26 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( point_source VARCHAR, point_column_name VARCHAR, - point_id VARCHAR, label_column_name VARCHAR, test_source VARCHAR, test_column_name VARCHAR, - test_id VARCHAR, + id_column_name VARCHAR, output_table VARCHAR, - k INTEGER, - output_neighbors Boolean + operation VARCHAR, + k INTEGER ) RETURNS VARCHAR AS $$ PythonFunctionBodyOnly(`knn', `knn') return knn.knn( schema_madlib, point_source, point_column_name, - point_id, label_column_name, test_source, test_column_name, - test_id, + id_column_name, output_table, - k, - output_neighbors + operation, + k ) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -431,19 +390,22 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( point_source VARCHAR, point_column_name VARCHAR, - point_id VARCHAR, label_column_name VARCHAR, test_source VARCHAR, test_column_name VARCHAR, - test_id VARCHAR, + id_column_name VARCHAR, output_table VARCHAR, - output_neighbors Boolean + operation VARCHAR ) RETURNS VARCHAR AS $$ DECLARE returnstring VARCHAR; BEGIN - returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1,$9); + returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1); RETURN returnstring; END; $$ LANGUAGE plpgsql VOLATILE +<<<<<<< HEAD +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +======= m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +>>>>>>> 6f99dbe4659a5582df61e2fdcc68f7ee60d5859d diff --git a/src/ports/postgres/modules/knn/test/knn.sql_in b/src/ports/postgres/modules/knn/test/knn.sql_in index 405068c26..c7d6798c6 100644 --- a/src/ports/postgres/modules/knn/test/knn.sql_in +++ b/src/ports/postgres/modules/knn/test/knn.sql_in @@ -26,29 +26,12 @@ m4_include(`SQLCommon.m4') * FIXME: Verify results * 
-------------------------------------------------------------------------- */ -drop table if exists knn_train_data; -create table knn_train_data ( -id integer, -data integer[], -label integer); -copy knn_train_data (id, data, label) from stdin delimiter '|'; -1|{1,1}|1 -2|{2,2}|1 -3|{3,3}|1 -4|{4,4}|1 -5|{4,5}|1 -6|{20,50}|0 -7|{10,31}|0 -8|{81,13}|0 -9|{1,111}|0 -\. -DROP TABLE IF EXISTS knn_train_data_reg; -CREATE TABLE knn_train_data_reg ( - id integer, - data integer[], - label float - ); -COPY knn_train_data_reg (id, data, label) from stdin delimiter '|'; +drop table if exists "KNN_TRAIN_DATA"; +create table "KNN_TRAIN_DATA" ( + id integer, + "DATA" integer[], + label float); +copy "KNN_TRAIN_DATA" (id, "DATA", label) from stdin delimiter '|'; 1|{1,1}|1.0 2|{2,2}|1.0 3|{3,3}|1.0 @@ -59,10 +42,11 @@ COPY knn_train_data_reg (id, data, label) from stdin delimiter '|'; 8|{81,13}|0.0 9|{1,111}|0.0 \. +drop table if exists knn_test_data; create table knn_test_data ( -id integer, -data integer[]); -copy knn_test_data (id, data) from stdin delimiter '|'; + id integer, + "DATA" integer[]); +copy knn_test_data (id, "DATA") from stdin delimiter '|'; 1|{2,1} 2|{2,6} 3|{15,40} @@ -71,23 +55,15 @@ copy knn_test_data (id, data) from stdin delimiter '|'; 6|{50,45} \. 
drop table if exists madlib_knn_result_classification; -select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',3,False); +select knn('"KNN_TRAIN_DATA"','"DATA"','label','knn_test_data','"DATA"','id','madlib_knn_result_classification','c',3); select assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=3') from madlib_knn_result_classification; -drop table if exists madlib_knn_result_classification; -select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',3,True); -select assert(array_agg(k_nearest_neighbours order by id)='{ {1,2,3},{5,4,3},{7,6,5},{4,5,3},{9,6,7},{6,7,8} }', 'Wrong output in classification with k=3') from madlib_knn_result_classification; - drop table if exists madlib_knn_result_regression; -select knn('knn_train_data_reg','data','id','label','knn_test_data','data','id','madlib_knn_result_regression',4,False); +select knn('"KNN_TRAIN_DATA"','"DATA"','label','knn_test_data','"DATA"','id','madlib_knn_result_regression','r',4); select assert(array_agg(prediction order by id)='{1,1,0.5,1,0.25,0.25}', 'Wrong output in regression') from madlib_knn_result_regression; -drop table if exists madlib_knn_result_regression; -select knn('knn_train_data_reg','data','id','label','knn_test_data','data','id','madlib_knn_result_regression',3,True); -select assert(array_agg(k_nearest_neighbours order by id)='{ {1,2,3},{5,4,3},{7,6,5},{4,5,3},{9,6,7},{6,7,8} }', 'Wrong output in regression') from madlib_knn_result_regression; - drop table if exists madlib_knn_result_classification; -select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',False); +select knn('"KNN_TRAIN_DATA"','"DATA"','label','knn_test_data','"DATA"','id','madlib_knn_result_classification','c'); select assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with 
k=1') from madlib_knn_result_classification; select knn();