From a0b1e0a78ffc993f2e2efad8df9a2c49cfc0fcbb Mon Sep 17 00:00:00 2001 From: Orhan Kislal Date: Mon, 11 Dec 2017 15:27:09 -0800 Subject: [PATCH] KNN: Move online help to python layer Additional Author: Nikhil Kak - Remove the dependency on the client message level for knn online help. --- src/ports/postgres/modules/knn/knn.py_in | 76 ++++++++++++++++++++++ src/ports/postgres/modules/knn/knn.sql_in | 77 +++++------------------ 2 files changed, 93 insertions(+), 60 deletions(-) diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in index 7729d2f77..fd94dcd25 100644 --- a/src/ports/postgres/modules/knn/knn.py_in +++ b/src/ports/postgres/modules/knn/knn.py_in @@ -228,3 +228,79 @@ def knn(schema_madlib, point_source, point_column_name, point_id, plpy.execute("DROP TABLE IF EXISTS {0}".format(interim_table)) return # ------------------------------------------------------------------------------ +# ---------------------------------------------------------------------- + + +def knn_help(schema_madlib, message, **kwargs): + """ + Help function for knn + + Args: + @param schema_madlib + @param message: string, Help message string + @param kwargs + + Returns: + String. Help/usage information + """ + if not message: + help_string = """ +----------------------------------------------------------------------- + SUMMARY +----------------------------------------------------------------------- +k-Nearest Neighbors is a method for finding k closest points to a given data +point in terms of a given metric. Its input consist of data points as features +from testing examples. For a given k, it looks for k closest points in +training set for each of the data points in test set. Algorithm generates one +output per testing example. The output of KNN depends on the type of task: +For Classification, the output is majority vote of the classes of the k +nearest data points. 
The testing example gets assigned the most popular class
+among nearest neighbors. For Regression, the output is average of the values
+of k nearest neighbors of the given testing example.
+    """
+    elif message in ['usage', 'help', '?']:
+        help_string = """
+-----------------------------------------------------------------------
+                                USAGE
+-----------------------------------------------------------------------
+SELECT {schema_madlib}.knn(
+    point_source,       -- Training data table having training features
+                           as vector column and labels
+    point_column_name,  -- Name of column having feature vectors in
+                           training data table
+    point_id,           -- Name of column having feature vector Ids in
+                           train data table
+    label_column_name,  -- Name of column having actual label/value for
+                           corresponding feature vector in training
+                           data table
+    test_source,        -- Test data table having features as vector
+                           column. Id of features is mandatory
+    test_column_name,   -- Name of column having feature vectors in test
+                           data table
+    test_id,            -- Name of column having feature vector Ids in
+                           test data table
+    output_table,       -- Name of output table
+    k,                  -- value of k. Default will go as 1
+    output_neighbors    -- Outputs the list of k-nearest neighbors that
+                           were used in the voting/averaging.
+    fn_dist             -- The name of the function to use to calculate
+                           the distance from a data point to a centroid.
+    );
+
+-----------------------------------------------------------------------
+                                OUTPUT
+-----------------------------------------------------------------------
+The output of the KNN module is a table with the following columns:
+
+id                      The ids of test data points.
+test_column_name        The test data points.
+prediction              The output of KNN- label in case of classification,
+                        average value in case of regression.
+k_nearest_neighbours    The list of k-nearest neighbors that were used in
+                        the voting/averaging.
+"""
+    else:
+        help_string = "No such option. 
Use {schema_madlib}.knn()" + + return help_string.format(schema_madlib=schema_madlib) +# --------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in index d45f0f414..8408de95d 100644 --- a/src/ports/postgres/modules/knn/knn.sql_in +++ b/src/ports/postgres/modules/knn/knn.sql_in @@ -386,66 +386,6 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src( $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); - -CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( - arg1 VARCHAR -) RETURNS VOID AS $$ -BEGIN - IF arg1 = 'help' OR arg1 = 'usage' OR arg1 = '?' THEN - RAISE NOTICE -' ------------------------------------------------------------------------ - USAGE ------------------------------------------------------------------------ -SELECT {schema_madlib}.knn( - point_source, -- Training data table having training features as vector column and labels - point_column_name, -- Name of column having feature vectors in training data table - point_id, -- Name of column having feature vector Ids in train data table - label_column_name, -- Name of column having actual label/vlaue for corresponding feature vector in training data table - test_source, -- Test data table having features as vector column. Id of features is mandatory - test_column_name, -- Name of column having feature vectors in test data table - test_id, -- Name of column having feature vector Ids in test data table - output_table, -- Name of output table - k, -- value of k. Default will go as 1 - output_neighbors -- Outputs the list of k-nearest neighbors that were used in the voting/averaging. - fn_dist -- The name of the function to use to calculate the distance from a data point to a centroid. 
- ); - ------------------------------------------------------------------------ - OUTPUT ------------------------------------------------------------------------ -The output of the KNN module is a table with the following columns: - -id The ids of test data points. -test_column_name The test data points. -prediction The output of KNN- label in case of classification, average value in case of regression. -k_nearest_neighbours The list of k-nearest neighbors that were used in the voting/averaging. -'; - END IF; -END; -$$ LANGUAGE plpgsql VOLATILE -m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); - -CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( -) RETURNS VOID AS $$ -BEGIN - RAISE NOTICE ' -k-Nearest Neighbors is a method for finding k closest points to a given data -point in terms of a given metric. Its input consist of data points as features -from testing examples. For a given k, it looks for k closest points in -training set for each of the data points in test set. Algorithm generates one -output per testing example. The output of KNN depends on the type of task: -For Classification, the output is majority vote of the classes of the k -nearest data points. The testing example gets assigned the most popular class -among nearest neighbors. For Regression, the output is average of the values -of k nearest neighbors of the given testing example. 
- '; -END; -$$ LANGUAGE plpgsql VOLATILE -m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); - - - CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( point_source VARCHAR, point_column_name VARCHAR, @@ -540,3 +480,20 @@ BEGIN END; $$ LANGUAGE plpgsql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +-- Online help +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( + message VARCHAR +) RETURNS VARCHAR AS $$ + PythonFunction(knn, knn, knn_help) +$$ LANGUAGE plpythonu IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); + +-------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn() +RETURNS VARCHAR AS $$ + SELECT MADLIB_SCHEMA.knn(''); +$$ LANGUAGE sql IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); +--------------------------------------------------------------------------------