From a0b1e0a78ffc993f2e2efad8df9a2c49cfc0fcbb Mon Sep 17 00:00:00 2001 From: Orhan Kislal Date: Mon, 11 Dec 2017 15:27:09 -0800 Subject: [PATCH] KNN: Move online help to python layer Additional Author: Nikhil Kak - Remove the dependency on the client message level for knn online help. --- src/ports/postgres/modules/knn/knn.py_in | 76 ++++++++++++++++++++++ src/ports/postgres/modules/knn/knn.sql_in | 77 +++++------------------ 2 files changed, 93 insertions(+), 60 deletions(-) diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in index 7729d2f77..fd94dcd25 100644 --- a/src/ports/postgres/modules/knn/knn.py_in +++ b/src/ports/postgres/modules/knn/knn.py_in @@ -228,3 +228,79 @@ def knn(schema_madlib, point_source, point_column_name, point_id, plpy.execute("DROP TABLE IF EXISTS {0}".format(interim_table)) return # ------------------------------------------------------------------------------ +# ---------------------------------------------------------------------- + + +def knn_help(schema_madlib, message, **kwargs): + """ + Help function for knn + + Args: + @param schema_madlib + @param message: string, Help message string + @param kwargs + + Returns: + String. Help/usage information + """ + if not message: + help_string = """ +----------------------------------------------------------------------- + SUMMARY +----------------------------------------------------------------------- +k-Nearest Neighbors is a method for finding k closest points to a given data +point in terms of a given metric. Its input consist of data points as features +from testing examples. For a given k, it looks for k closest points in +training set for each of the data points in test set. Algorithm generates one +output per testing example. The output of KNN depends on the type of task: +For Classification, the output is majority vote of the classes of the k +nearest data points. 
The testing example gets assigned the most popular class
+among nearest neighbors. For Regression, the output is average of the values
+of k nearest neighbors of the given testing example.
+    """
+    elif message in ['usage', 'help', '?']:
+        help_string = """
+-----------------------------------------------------------------------
+                                USAGE
+-----------------------------------------------------------------------
+SELECT {schema_madlib}.knn(
+    point_source,       -- Training data table having training features
+                           as vector column and labels
+    point_column_name,  -- Name of column having feature vectors in
+                           training data table
+    point_id,           -- Name of column having feature vector Ids in
+                           train data table
+    label_column_name,  -- Name of column having actual label/value for
+                           corresponding feature vector in training
+                           data table
+    test_source,        -- Test data table having features as vector
+                           column. Id of features is mandatory
+    test_column_name,   -- Name of column having feature vectors in test
+                           data table
+    test_id,            -- Name of column having feature vector Ids in
+                           test data table
+    output_table,       -- Name of output table
+    k,                  -- value of k. Default will go as 1
+    output_neighbors    -- Outputs the list of k-nearest neighbors that
+                           were used in the voting/averaging.
+    fn_dist             -- The name of the function to use to calculate
+                           the distance from a data point to a centroid.
+    );
+
+-----------------------------------------------------------------------
+                                OUTPUT
+-----------------------------------------------------------------------
+The output of the KNN module is a table with the following columns:
+
+id                      The ids of test data points.
+test_column_name        The test data points.
+prediction              The output of KNN- label in case of classification,
+                        average value in case of regression.
+k_nearest_neighbours    The list of k-nearest neighbors that were used in
+                        the voting/averaging.
+"""
+    else:
+        help_string = "No such option. 
Use {schema_madlib}.knn()" + + return help_string.format(schema_madlib=schema_madlib) +# --------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in index d45f0f414..8408de95d 100644 --- a/src/ports/postgres/modules/knn/knn.sql_in +++ b/src/ports/postgres/modules/knn/knn.sql_in @@ -386,66 +386,6 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src( $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); - -CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( - arg1 VARCHAR -) RETURNS VOID AS $$ -BEGIN - IF arg1 = 'help' OR arg1 = 'usage' OR arg1 = '?' THEN - RAISE NOTICE -' ------------------------------------------------------------------------ - USAGE ------------------------------------------------------------------------ -SELECT {schema_madlib}.knn( - point_source, -- Training data table having training features as vector column and labels - point_column_name, -- Name of column having feature vectors in training data table - point_id, -- Name of column having feature vector Ids in train data table - label_column_name, -- Name of column having actual label/vlaue for corresponding feature vector in training data table - test_source, -- Test data table having features as vector column. Id of features is mandatory - test_column_name, -- Name of column having feature vectors in test data table - test_id, -- Name of column having feature vector Ids in test data table - output_table, -- Name of output table - k, -- value of k. Default will go as 1 - output_neighbors -- Outputs the list of k-nearest neighbors that were used in the voting/averaging. - fn_dist -- The name of the function to use to calculate the distance from a data point to a centroid. 
- ); - ------------------------------------------------------------------------ - OUTPUT ------------------------------------------------------------------------ -The output of the KNN module is a table with the following columns: - -id The ids of test data points. -test_column_name The test data points. -prediction The output of KNN- label in case of classification, average value in case of regression. -k_nearest_neighbours The list of k-nearest neighbors that were used in the voting/averaging. -'; - END IF; -END; -$$ LANGUAGE plpgsql VOLATILE -m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); - -CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( -) RETURNS VOID AS $$ -BEGIN - RAISE NOTICE ' -k-Nearest Neighbors is a method for finding k closest points to a given data -point in terms of a given metric. Its input consist of data points as features -from testing examples. For a given k, it looks for k closest points in -training set for each of the data points in test set. Algorithm generates one -output per testing example. The output of KNN depends on the type of task: -For Classification, the output is majority vote of the classes of the k -nearest data points. The testing example gets assigned the most popular class -among nearest neighbors. For Regression, the output is average of the values -of k nearest neighbors of the given testing example. 
- '; -END; -$$ LANGUAGE plpgsql VOLATILE -m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); - - - CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( point_source VARCHAR, point_column_name VARCHAR, @@ -540,3 +480,20 @@ BEGIN END; $$ LANGUAGE plpgsql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +-- Online help +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( + message VARCHAR +) RETURNS VARCHAR AS $$ + PythonFunction(knn, knn, knn_help) +$$ LANGUAGE plpythonu IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); + +-------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn() +RETURNS VARCHAR AS $$ + SELECT MADLIB_SCHEMA.knn(''); +$$ LANGUAGE sql IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); +--------------------------------------------------------------------------------