From 6632f72840211f87c99b32c916dc610977329d9f Mon Sep 17 00:00:00 2001
From: hpandeycodeit <hpandey@pivotal.io>
Date: Tue, 15 Aug 2017 15:30:27 -0700
Subject: [PATCH 1/7] knn code refactoring

---
 src/ports/postgres/modules/knn/knn.py_in  |  82 +++++++++++++++-
 src/ports/postgres/modules/knn/knn.sql_in | 109 +++-------------------
 2 files changed, 95 insertions(+), 96 deletions(-)

diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in
index c0d9cd7be..7fe7007c8 100644
--- a/src/ports/postgres/modules/knn/knn.py_in
+++ b/src/ports/postgres/modules/knn/knn.py_in
@@ -127,5 +127,83 @@ def knn_validate_src(schema_madlib, point_source, point_column_name, label_colum
                     "Data type '{0}' is not a valid type for column '{1}' in table '{2}'.".format(colType, id_column_name, test_source))
     return k
 
-# ----------------------------------------------------------------------
-m4_changequote(<!`!>, <!'!>)
+
+
+
+
+def knn(schema_madlib, point_source, point_column_name, label_column_name,
+    test_source, test_column_name, id_column_name, output_table, operation, k):
+
+
+  
+    oldClientMinMessages = plpy.execute("SELECT setting FROM pg_settings WHERE name = 'client_min_messages'")[0]['setting'];
+
+    plpy.execute("SET client_min_messages TO warning");
+
+ 
+    k_val = knn_validate_src(schema_madlib, point_source, point_column_name, 
+                label_column_name, test_source, 
+                test_column_name, id_column_name, 
+                output_table, operation, k) 
+
+
+    plpy.execute("SELECT {schema_madlib}.create_schema_pg_temp()".format(schema_madlib = schema_madlib));
+ 
+    x_temp_table = 'knn' + plpy.execute( " select md5('knn_'||now()::text||random()::text)||'_temp' AS MD5")[0]['md5'];
+    y_temp_table = 'knn' + plpy.execute( " select md5('knn_'||now()::text||random()::text)||'_temp' AS MD5")[0]['md5'];
+    label_column_name_unique = 'label' + plpy.execute( " select md5('knn_'||now()::text||random()::text)||'_name' AS MD5")[0]['md5'];
+    test_id = 'id' + plpy.execute( " select md5('knn_'||now()::text||random()::text)||'_name' AS MD5")[0]['md5'];
+
+    convert_boolean_to_int = '';
+    if operation == 'c':
+        convert_boolean_to_int = '::INTEGER';
+    
+
+    plpy.execute("DROP TABLE IF EXISTS pg_temp.madlib_knn_interm");
+    plpy.execute(
+    """
+    CREATE TEMP TABLE pg_temp.madlib_knn_interm AS
+    SELECT *
+    FROM
+        (
+        SELECT row_number() over (partition by {test_id}  order by dist) AS r , {x_temp_table}.*
+        FROM
+            (
+                SELECT test.{id_column_name} AS  {test_id} , {schema_madlib}.squared_dist_norm2(train.{point_column_name} ,test.{test_column_name}) AS dist, train.{label_column_name} {convert_boolean_to_int} AS {label_column_name_unique}
+                FROM  {point_source} AS train, {test_source}  AS test
+            ) {x_temp_table}
+        ){y_temp_table}
+    WHERE {y_temp_table}.r <= {k_val}""".format(**locals()));
+
+    if operation == 'c':
+        plpy.execute(
+        """
+        CREATE TABLE {output_table} AS
+        SELECT {test_id} AS id, {test_column_name} , {schema_madlib}.mode({label_column_name_unique}) AS prediction
+        FROM pg_temp.madlib_knn_interm join  {test_source}  ON  {test_id} = {id_column_name}  
+        GROUP BY {test_id}  ,  {test_column_name}""".format(**locals()))
+        
+        
+    else:
+        plpy.execute(
+        """ 
+        CREATE TABLE  {output_table} AS
+        SELECT  {test_id}   AS id, {test_column_name} , avg( {label_column_name_unique}  ) AS prediction
+        FROM
+            pg_temp.madlib_knn_interm join {test_source}  on {test_id}  ={id_column_name} 
+        GROUP BY {test_id} ,  {test_column_name} 
+        ORDER BY {test_id}""".format(**locals()))   
+   
+
+    plpy.execute("SET client_min_messages TO "+ oldClientMinMessages)
+
+    if operation == 'c':
+        returnstring = 'The classification results have been written to output table '+ output_table;
+    else:
+        returnstring = 'The regression results have been written to output table '+ output_table;
+
+    plpy.execute("DROP TABLE pg_temp.madlib_knn_interm");
+    return returnstring;
+
+
+
diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in
index d3c19292c..373e92418 100644
--- a/src/ports/postgres/modules/knn/knn.sql_in
+++ b/src/ports/postgres/modules/knn/knn.sql_in
@@ -358,6 +358,7 @@ $$ LANGUAGE plpgsql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
 
 
+
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
     point_source VARCHAR,
     point_column_name VARCHAR,
@@ -369,98 +370,18 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
     operation VARCHAR,
     k INTEGER
 ) RETURNS VARCHAR AS $$
-DECLARE
-    l FLOAT;
-    id INTEGER;
-    vector DOUBLE PRECISION[];
-    cur_pid integer;
-    oldClientMinMessages VARCHAR;
-    returnstring VARCHAR;
-    x_temp_table VARCHAR;
-    y_temp_table VARCHAR;
-    k_val INTEGER;
-    label_column_name_unique VARCHAR;
-    test_id VARCHAR;
-    convert_boolean_to_int VARCHAR;
-BEGIN
-    oldClientMinMessages :=
-        (SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
-    EXECUTE 'SET client_min_messages TO warning';
-    SELECT * FROM MADLIB_SCHEMA.__knn_validate_src(point_source, point_column_name, label_column_name, test_source, test_column_name, id_column_name, output_table, operation, k) INTO k_val;
-    PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
-    x_temp_table := 'knn_'||md5('knn_'||now()::text||random()::text)||'_temp';
-    y_temp_table := 'knn_'||md5('knn_'||now()::text||random()::text)||'_temp';
-    label_column_name_unique := 'label'||md5('knn_'||now()::text||random()::text)||'_name';
-    test_id := 'id'||md5('knn_'||now()::text||random()::text)||'_name';
-
-    convert_boolean_to_int := '';
-    IF (operation = 'c') THEN
-        convert_boolean_to_int := '::INTEGER';
-    END IF;
-
-    EXECUTE
-        $sql$
-	DROP TABLE IF EXISTS pg_temp.madlib_knn_interm;
-	CREATE TABLE pg_temp.madlib_knn_interm AS
-	SELECT *
-    FROM
-        (
-        SELECT row_number() over (partition by $sql$ || test_id || $sql$ order by dist) AS r, $sql$ || x_temp_table || $sql$.*
-        FROM
-            (
-                SELECT test.$sql$ || id_column_name || $sql$ AS $sql$ || test_id || $sql$, MADLIB_SCHEMA.squared_dist_norm2(train.$sql$ || point_column_name || $sql$,test.$sql$ || test_column_name || $sql$) AS dist, train.$sql$ || label_column_name || $sql$ $sql$ || convert_boolean_to_int || $sql$ AS $sql$ || label_column_name_unique || $sql$
-                FROM $sql$ || textin(regclassout(point_source)) || $sql$ AS train, $sql$ || textin(regclassout(test_source)) || $sql$ AS test
-            )$sql$ || x_temp_table || $sql$
-        )$sql$ || y_temp_table || $sql$
-    WHERE $sql$ || y_temp_table || $sql$.r <= $sql$ || k_val;
-
-    IF (operation = 'c') THEN
-    	EXECUTE
-        $sql$
-    	CREATE TABLE $sql$ || output_table || $sql$ AS
-        SELECT $sql$ || test_id || $sql$ AS id, $sql$ || test_column_name || $sql$, MADLIB_SCHEMA.mode($sql$ || label_column_name_unique || $sql$) AS prediction
-        FROM pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$ ON $sql$ || test_id || $sql$=$sql$ || id_column_name || $sql$
-        GROUP BY $sql$ || test_id || $sql$, $sql$ || test_column_name;
-    ELSE
-        EXECUTE
-        $sql$
-	    CREATE TABLE $sql$ || output_table || $sql$ AS
-        SELECT $sql$ || test_id || $sql$ AS id, $sql$ || test_column_name || $sql$, avg($sql$ || label_column_name_unique || $sql$) AS prediction
-        FROM
-            pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$ on $sql$ || test_id || $sql$=$sql$ || id_column_name || $sql$
-        GROUP BY $sql$ || test_id || $sql$, $sql$ || test_column_name || $sql$
-        ORDER BY $sql$ || test_id || $sql$ $sql$;
-    END IF;
-
-   EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages;
-   IF (operation = 'c') THEN
-   	returnstring := 'The classification results have been written to output table '||output_table;
-   ELSE
-        returnstring := 'The regression results have been written to output table '||output_table;
-   END IF;
-   DROP TABLE pg_temp.madlib_knn_interm;
-   RETURN returnstring;
-END;
-$$ LANGUAGE plpgsql VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
-    point_source VARCHAR,
-    point_column_name VARCHAR,
-    label_column_name VARCHAR,
-    test_source VARCHAR,
-    test_column_name VARCHAR,
-    id_column_name VARCHAR,
-    output_table VARCHAR,
-    operation VARCHAR
-) RETURNS VARCHAR AS $$
-DECLARE
-    returnstring VARCHAR;
-BEGIN
-    returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1);
-    RETURN returnstring;
-END;
-$$ LANGUAGE plpgsql VOLATILE
+    PythonFunctionBodyOnly(`knn', `knn')
+    return knn.knn(
+        schema_madlib,
+        point_source,
+        point_column_name,
+        label_column_name,
+        test_source,
+        test_column_name,
+        id_column_name,
+        output_table,
+        operation,
+        k
+    )
+$$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-

From 881a098a5d256730dc2a55bdbb2f4dba9e2a3cbc Mon Sep 17 00:00:00 2001
From: hpandeycodeit <hpandey@pivotal.io>
Date: Wed, 16 Aug 2017 11:34:41 -0700
Subject: [PATCH 2/7] knn code refactor: replaced query with unique_string

---
 src/ports/postgres/modules/knn/knn.py_in | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in
index 7fe7007c8..7f18f6b49 100644
--- a/src/ports/postgres/modules/knn/knn.py_in
+++ b/src/ports/postgres/modules/knn/knn.py_in
@@ -37,6 +37,7 @@ from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import is_col_array
 from utilities.validate_args import array_col_has_no_null
 from utilities.validate_args import get_cols_and_types
+from utilities.utilities import unique_string
 
 STATE_IN_MEM = m4_ifdef(<!__HAWQ__!>, <!True!>, <!False!>)
 HAS_FUNCTION_PROPERTIES = m4_ifdef(<!__HAS_FUNCTION_PROPERTIES__!>, <!True!>, <!False!>)
@@ -149,20 +150,21 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name,
 
     plpy.execute("SELECT {schema_madlib}.create_schema_pg_temp()".format(schema_madlib = schema_madlib));
  
-    x_temp_table = 'knn' + plpy.execute( " select md5('knn_'||now()::text||random()::text)||'_temp' AS MD5")[0]['md5'];
-    y_temp_table = 'knn' + plpy.execute( " select md5('knn_'||now()::text||random()::text)||'_temp' AS MD5")[0]['md5'];
-    label_column_name_unique = 'label' + plpy.execute( " select md5('knn_'||now()::text||random()::text)||'_name' AS MD5")[0]['md5'];
-    test_id = 'id' + plpy.execute( " select md5('knn_'||now()::text||random()::text)||'_name' AS MD5")[0]['md5'];
+    x_temp_table = unique_string(desp='x_temp_table') 
+    y_temp_table = unique_string(desp='y_temp_table') 
+    label_column_name_unique = unique_string(desp='label_column_name_unique')  
+    test_id = unique_string(desp='test_id')  
 
     convert_boolean_to_int = '';
     if operation == 'c':
         convert_boolean_to_int = '::INTEGER';
     
+    madlib_knn_interm = unique_string(desp='madlib_knn_interm')
 
-    plpy.execute("DROP TABLE IF EXISTS pg_temp.madlib_knn_interm");
+    plpy.execute("""DROP TABLE IF EXISTS pg_temp.{madlib_knn_interm}""".format(**locals()));
     plpy.execute(
     """
-    CREATE TEMP TABLE pg_temp.madlib_knn_interm AS
+    CREATE TEMP TABLE pg_temp.{madlib_knn_interm} AS
     SELECT *
     FROM
         (
@@ -180,7 +182,7 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name,
         """
         CREATE TABLE {output_table} AS
         SELECT {test_id} AS id, {test_column_name} , {schema_madlib}.mode({label_column_name_unique}) AS prediction
-        FROM pg_temp.madlib_knn_interm join  {test_source}  ON  {test_id} = {id_column_name}  
+        FROM pg_temp.{madlib_knn_interm} join  {test_source}  ON  {test_id} = {id_column_name}  
         GROUP BY {test_id}  ,  {test_column_name}""".format(**locals()))
         
         
@@ -190,7 +192,7 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name,
         CREATE TABLE  {output_table} AS
         SELECT  {test_id}   AS id, {test_column_name} , avg( {label_column_name_unique}  ) AS prediction
         FROM
-            pg_temp.madlib_knn_interm join {test_source}  on {test_id}  ={id_column_name} 
+            pg_temp.{madlib_knn_interm} join {test_source}  on {test_id}  ={id_column_name} 
         GROUP BY {test_id} ,  {test_column_name} 
         ORDER BY {test_id}""".format(**locals()))   
    
@@ -202,7 +204,8 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name,
     else:
         returnstring = 'The regression results have been written to output table '+ output_table;
 
-    plpy.execute("DROP TABLE pg_temp.madlib_knn_interm");
+    plpy.execute("""DROP TABLE pg_temp.{madlib_knn_interm}""".format(**locals()));    
+
     return returnstring;
 
 

From 933cd3f83bd48e88c0f84df856e00df60c0f2c07 Mon Sep 17 00:00:00 2001
From: hpandeycodeit <hpandey@pivotal.io>
Date: Wed, 16 Aug 2017 14:42:35 -0700
Subject: [PATCH 3/7] formatting

---
 src/ports/postgres/modules/knn/knn.py_in | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in
index 7f18f6b49..7b474bff9 100644
--- a/src/ports/postgres/modules/knn/knn.py_in
+++ b/src/ports/postgres/modules/knn/knn.py_in
@@ -40,9 +40,10 @@ from utilities.validate_args import get_cols_and_types
 from utilities.utilities import unique_string
 
 STATE_IN_MEM = m4_ifdef(<!__HAWQ__!>, <!True!>, <!False!>)
-HAS_FUNCTION_PROPERTIES = m4_ifdef(<!__HAS_FUNCTION_PROPERTIES__!>, <!True!>, <!False!>)
-UDF_ON_SEGMENT_NOT_ALLOWED = m4_ifdef(<!__UDF_ON_SEGMENT_NOT_ALLOWED__!>, <!True!>, <!False!>)
-# ----------------------------------------------------------------------
+HAS_FUNCTION_PROPERTIES = m4_ifdef(<!__HAS_FUNCTION_PROPERTIES__!>, <!True!>,
+<!False!>) UDF_ON_SEGMENT_NOT_ALLOWED =
+m4_ifdef(<!__UDF_ON_SEGMENT_NOT_ALLOWED__!>, <!True!>, <!False!>) #
+----------------------------------------------------------------------
 
 
 def knn_validate_src(schema_madlib, point_source, point_column_name, label_column_name,
@@ -135,6 +136,23 @@ def knn_validate_src(schema_madlib, point_source, point_column_name, label_colum
 def knn(schema_madlib, point_source, point_column_name, label_column_name,
     test_source, test_column_name, id_column_name, output_table, operation, k):
 
+    """
+        KNN function to find the K Nearest neighbours
+        Args:
+            @param schema_madlib       Name of the Madlib Schema
+            @param point_source        Training data table 
+            @param point_column_name   Name of the column with training data points.
+            @param label_column_name   Name of the column with labels/values of training data points.
+            @param test_source         Name of the table containing the test data points.
+            @param test_column_name    Name of the column with testing data points.
+            @param id_column_name      Name of the column having ids of data points in test data table.
+            @param output_table        Name of the table to store final results.
+            @param k                   default: 1. Number of nearest neighbors to consider
+
+
+        Returns: 
+            VARCHAR                     Name of the output table.             
+    """                                
 
   
     oldClientMinMessages = plpy.execute("SELECT setting FROM pg_settings WHERE name = 'client_min_messages'")[0]['setting'];

From 869ec2afeeb6b2bed3cf949d0fc69342d4f71665 Mon Sep 17 00:00:00 2001
From: Himanshu Pandey <hpandey@pivotal.io>
Date: Fri, 25 Aug 2017 14:55:28 -0700
Subject: [PATCH 4/7] kNN: Refactor code for ease of use JIRA: MADLIB-927

Additional author: Orhan Kislal <okislal@apache.org>

Closes #168
---
 src/ports/postgres/modules/knn/knn.py_in  | 300 +++++++++-------------
 src/ports/postgres/modules/knn/knn.sql_in |  22 +-
 2 files changed, 146 insertions(+), 176 deletions(-)

diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in
index 7b474bff9..bc2ef5c1d 100644
--- a/src/ports/postgres/modules/knn/knn.py_in
+++ b/src/ports/postgres/modules/knn/knn.py_in
@@ -17,8 +17,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-m4_changequote(`<!', `!>')
-
 """
 @file knn.py_in
 
@@ -26,205 +24,157 @@ m4_changequote(`<!', `!>')
 
 @namespace knn
 
-@brief knn: Driver functions
 """
 
 import plpy
-from utilities.validate_args import table_exists
-from utilities.validate_args import table_is_empty
-from utilities.validate_args import columns_exist_in_table
-from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import input_tbl_valid, output_tbl_valid
+from utilities.validate_args import cols_in_tbl_valid
 from utilities.validate_args import is_col_array
 from utilities.validate_args import array_col_has_no_null
-from utilities.validate_args import get_cols_and_types
+from utilities.validate_args import get_expr_type
 from utilities.utilities import unique_string
+from utilities.control import MinWarning
 
-STATE_IN_MEM = m4_ifdef(<!__HAWQ__!>, <!True!>, <!False!>)
-HAS_FUNCTION_PROPERTIES = m4_ifdef(<!__HAS_FUNCTION_PROPERTIES__!>, <!True!>,
-<!False!>) UDF_ON_SEGMENT_NOT_ALLOWED =
-m4_ifdef(<!__UDF_ON_SEGMENT_NOT_ALLOWED__!>, <!True!>, <!False!>) #
-----------------------------------------------------------------------
 
+def knn_validate_src(schema_madlib, point_source, point_column_name,
+                     label_column_name, test_source, test_column_name,
+                     id_column_name, output_table, operation, k, **kwargs):
 
-def knn_validate_src(schema_madlib, point_source, point_column_name, label_column_name,
-    test_source, test_column_name, id_column_name, output_table, operation, k, **kwargs):
     if not operation or operation not in ['c', 'r']:
-        plpy.error("kNN Error: operation='{0}' is an invalid value, has to be 'r' for regression OR 'c' for classification.".format(operation))
-    if not point_source:
-        plpy.error("kNN Error: Invalid training table name.")
-    if not table_exists(point_source):
-        plpy.error("kNN Error: Training table '{0}' does not exist.".format(point_source))
-    if table_is_empty(point_source):
-        plpy.error("kNN Error: Training table '{0}' is empty.".format(point_source))
-
-    if not test_source:
-        plpy.error("kNN Error: Invalid test table name.")
-    if not table_exists(test_source):
-        plpy.error("kNN Error: Test table '{0}' does not exist.".format(test_source))
-    if table_is_empty(test_source):
-        plpy.error("kNN Error: Test table '{0}' is empty.".format(test_source))
-
-    for c in (label_column_name, point_column_name):
-        if not c:
-            plpy.error("kNN Error: Invalid column name in training table.")
-        if not columns_exist_in_table(point_source, [c]):
-            plpy.error("kNN Error: " + \
-                    "Column '{0}' does not exist in {1}.".format(c, point_source))
-
-    for c in (test_column_name, id_column_name):
-        if not c:
-            plpy.error("kNN Error: Invalid column name in test table.")
-        if not columns_exist_in_table(test_source, [c]):
-            plpy.error("kNN Error: " + \
-                    "Column '{0}' does not exist in {1}.".format(c, test_source))
+        plpy.error("kNN Error: operation='{0}' is an invalid value, has to be"
+                   " 'r' for regression OR 'c' for classification.".
+                   format(operation))
+    input_tbl_valid(point_source, 'kNN')
+    input_tbl_valid(test_source, 'kNN')
+    output_tbl_valid(output_table, 'kNN')
+    cols_in_tbl_valid(point_source, (label_column_name, point_column_name), 'kNN')
+    cols_in_tbl_valid(test_source, (test_column_name, id_column_name), 'kNN')
 
     if not is_col_array(point_source, point_column_name):
-        plpy.error("kNN Error: " + \
-                    "Feature column '{0}' in train table is not an array.".format(point_column_name))
+        plpy.error("kNN Error: Feature column '{0}' in train table is not"
+                   " an array.".format(point_column_name))
     if not is_col_array(test_source, test_column_name):
-        plpy.error("kNN Error: " + \
-                    "Feature column '{0}' in test table is not an array.".format(test_column_name))
+        plpy.error("kNN Error: Feature column '{0}' in test table is not"
+                   " an array.".format(test_column_name))
 
     if not array_col_has_no_null(point_source, point_column_name):
-        plpy.error("kNN Error: " + \
-                    "Feature column '{0}' in train table has some NULL values.".format(point_column_name))
+        plpy.error("kNN Error: Feature column '{0}' in train table has some"
+                   " NULL values.".format(point_column_name))
     if not array_col_has_no_null(test_source, test_column_name):
-        plpy.error("kNN Error: " + \
-                    "Feature column '{0}' in test table has some NULL values.".format(test_column_name))
-
-    if not output_table:
-        plpy.error("kNN Error: Invalid output table name")
-    if table_exists(output_table):
-        plpy.error("kNN Error: Table '{0}' already exists, cannot use it as output table.".format(output_table))
+        plpy.error("kNN Error: Feature column '{0}' in test table has some"
+                   " NULL values.".format(test_column_name))
 
     if k is None:
         k = 1
-    if k<=0:
-        plpy.error("kNN Error: k='{0}' is an invalid value, must be greater than 0.".format(k))
-    bound = plpy.execute("""SELECT {k} <= count(*)
-            AS bound FROM {tbl}""".format(k=k,
-            point_column_name=point_column_name, tbl=point_source))[0]['bound']
+    if k <= 0:
+        plpy.error("kNN Error: k={0} is an invalid value, must be greater"
+                   " than 0.".format(k))
+    bound = plpy.execute("SELECT {k} <= count(*) AS bound FROM {tbl}".
+                         format(k=k, tbl=point_source))[0]['bound']
     if not bound:
-        plpy.error("kNN Error: " + \
-                "k='{0}' is greater than number of rows in training table.".format(k))
-
-    colTypesList = get_cols_and_types(point_source)
-    colType = ''
-    for type in colTypesList:
-        if type[0] == label_column_name:
-            colType = type[1]
-            break
-    if colType not in ['INTEGER','integer','double precision','DOUBLE PRECISION','float','FLOAT','boolean','BOOLEAN'] :
-        plpy.error("kNN Error: " + \
-                    "Data type '{0}' is not a valid type for column '{1}' in table '{2}'.".format(colType, label_column_name, point_source))
-
-    colTypesTestList = get_cols_and_types(test_source)
-    colType = ''
-    for type in colTypesTestList:
-        if type[0] == id_column_name:
-            colType = type[1]
-            break
-    if colType not in ['INTEGER','integer'] :
-        plpy.error("kNN Error: " + \
-                    "Data type '{0}' is not a valid type for column '{1}' in table '{2}'.".format(colType, id_column_name, test_source))
+        plpy.error("kNN Error: k={0} is greater than number of rows in"
+                   " training table.".format(k))
+
+    col_type = get_expr_type(label_column_name, point_source).lower()
+    if col_type not in ['integer', 'double precision', 'float', 'boolean']:
+        plpy.error("kNN error: Data type '{0}' is not a valid type for"
+                   " column '{1}' in table '{2}'.".
+                   format(col_type, label_column_name, point_source))
+
+    col_type_test = get_expr_type(id_column_name, test_source).lower()
+    if col_type_test not in ['integer']:
+        plpy.error("kNN Error: Data type '{0}' is not a valid type for"
+                   " column '{1}' in table '{2}'.".
+                   format(col_type_test, id_column_name, test_source))
     return k
-
-
-
+# ------------------------------------------------------------------------------
 
 
 def knn(schema_madlib, point_source, point_column_name, label_column_name,
-    test_source, test_column_name, id_column_name, output_table, operation, k):
-
+        test_source, test_column_name, id_column_name, output_table,
+        operation, k):
     """
         KNN function to find the K Nearest neighbours
         Args:
-            @param schema_madlib       Name of the Madlib Schema
-            @param point_source        Training data table 
-            @param point_column_name   Name of the column with training data points.
-            @param label_column_name   Name of the column with labels/values of training data points.
-            @param test_source         Name of the table containing the test data points.
-            @param test_column_name    Name of the column with testing data points.
-            @param id_column_name      Name of the column having ids of data points in test data table.
-            @param output_table        Name of the table to store final results.
-            @param k                   default: 1. Number of nearest neighbors to consider
-
-
-        Returns: 
-            VARCHAR                     Name of the output table.             
-    """                                
-
-  
-    oldClientMinMessages = plpy.execute("SELECT setting FROM pg_settings WHERE name = 'client_min_messages'")[0]['setting'];
-
-    plpy.execute("SET client_min_messages TO warning");
-
- 
-    k_val = knn_validate_src(schema_madlib, point_source, point_column_name, 
-                label_column_name, test_source, 
-                test_column_name, id_column_name, 
-                output_table, operation, k) 
-
-
-    plpy.execute("SELECT {schema_madlib}.create_schema_pg_temp()".format(schema_madlib = schema_madlib));
- 
-    x_temp_table = unique_string(desp='x_temp_table') 
-    y_temp_table = unique_string(desp='y_temp_table') 
-    label_column_name_unique = unique_string(desp='label_column_name_unique')  
-    test_id = unique_string(desp='test_id')  
-
-    convert_boolean_to_int = '';
-    if operation == 'c':
-        convert_boolean_to_int = '::INTEGER';
-    
-    madlib_knn_interm = unique_string(desp='madlib_knn_interm')
-
-    plpy.execute("""DROP TABLE IF EXISTS pg_temp.{madlib_knn_interm}""".format(**locals()));
-    plpy.execute(
+            @param schema_madlib        Name of the Madlib Schema
+            @param point_source         Training data table
+            @param point_column_name    Name of the column with training data
+                                        points.
+            @param label_column_name    Name of the column with labels/values
+                                        of training data points.
+            @param test_source          Name of the table containing the test
+                                        data points.
+            @param test_column_name     Name of the column with testing data
+                                        points.
+            @param id_column_name       Name of the column having ids of data
+                                        points in test data table.
+            @param output_table         Name of the table to store final
+                                        results.
+            @param operation            Flag for the operation:
+                                        'c' for classification and
+                                        'r' for regression
+            @param k                    default: 1. Number of nearest
+                                        neighbors to consider
+        Returns:
+            VARCHAR                     Name of the output table.
     """
-    CREATE TEMP TABLE pg_temp.{madlib_knn_interm} AS
-    SELECT *
-    FROM
-        (
-        SELECT row_number() over (partition by {test_id}  order by dist) AS r , {x_temp_table}.*
-        FROM
-            (
-                SELECT test.{id_column_name} AS  {test_id} , {schema_madlib}.squared_dist_norm2(train.{point_column_name} ,test.{test_column_name}) AS dist, train.{label_column_name} {convert_boolean_to_int} AS {label_column_name_unique}
-                FROM  {point_source} AS train, {test_source}  AS test
-            ) {x_temp_table}
-        ){y_temp_table}
-    WHERE {y_temp_table}.r <= {k_val}""".format(**locals()));
-
-    if operation == 'c':
-        plpy.execute(
-        """
-        CREATE TABLE {output_table} AS
-        SELECT {test_id} AS id, {test_column_name} , {schema_madlib}.mode({label_column_name_unique}) AS prediction
-        FROM pg_temp.{madlib_knn_interm} join  {test_source}  ON  {test_id} = {id_column_name}  
-        GROUP BY {test_id}  ,  {test_column_name}""".format(**locals()))
-        
-        
-    else:
+    with MinWarning('warning'):
+        k_val = knn_validate_src(schema_madlib, point_source,
+                                 point_column_name, label_column_name,
+                                 test_source, test_column_name, id_column_name,
+                                 output_table, operation, k)
+
+        x_temp_table = unique_string(desp='x_temp_table')
+        y_temp_table = unique_string(desp='y_temp_table')
+        label_col_temp = unique_string(desp='label_col_temp')
+        test_id = unique_string(desp='test_id')
+
+        is_classification = operation == 'c'
+        interim_table = unique_string(desp='interim_table')
         plpy.execute(
-        """ 
-        CREATE TABLE  {output_table} AS
-        SELECT  {test_id}   AS id, {test_column_name} , avg( {label_column_name_unique}  ) AS prediction
-        FROM
-            pg_temp.{madlib_knn_interm} join {test_source}  on {test_id}  ={id_column_name} 
-        GROUP BY {test_id} ,  {test_column_name} 
-        ORDER BY {test_id}""".format(**locals()))   
-   
-
-    plpy.execute("SET client_min_messages TO "+ oldClientMinMessages)
-
-    if operation == 'c':
-        returnstring = 'The classification results have been written to output table '+ output_table;
-    else:
-        returnstring = 'The regression results have been written to output table '+ output_table;
-
-    plpy.execute("""DROP TABLE pg_temp.{madlib_knn_interm}""".format(**locals()));    
-
-    return returnstring;
-
-
-
+            """
+            CREATE TEMP TABLE {interim_table} AS
+            SELECT * FROM
+                (
+                SELECT row_number() over
+                        (partition by {test_id} order by dist) AS r,
+                        {x_temp_table}.*
+                FROM
+                    (
+                    SELECT test.{id_column_name} AS {test_id} ,
+                        {schema_madlib}.squared_dist_norm2(
+                            train.{point_column_name},
+                            test.{test_column_name})
+                        AS dist,
+                        train.{label_column_name}{cast_to_int}
+                            AS {label_col_temp}
+                        FROM {point_source} AS train, {test_source} AS test
+                    ) {x_temp_table}
+                ) {y_temp_table}
+            WHERE {y_temp_table}.r <= {k_val}
+            """.format(cast_to_int='::INTEGER' if is_classification else '',
+                       **locals()))
+
+        if is_classification:
+            plpy.execute(
+                """
+                CREATE TABLE {output_table} AS
+                SELECT {test_id} AS id, {test_column_name},
+                       {schema_madlib}.mode({label_col_temp}) AS prediction
+                FROM {interim_table} JOIN {test_source}
+                     ON {test_id} = {id_column_name}
+                GROUP BY {test_id}, {test_column_name}
+                """.format(**locals()))
+        else:
+            plpy.execute(
+                """
+                CREATE TABLE {output_table} AS
+                SELECT {test_id} AS id, {test_column_name},
+                       AVG({label_col_temp}) AS prediction
+                FROM
+                    {interim_table} JOIN {test_source}
+                    ON {test_id} = {id_column_name}
+                GROUP BY {test_id}, {test_column_name}
+                """.format(**locals()))
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(interim_table))
+# ------------------------------------------------------------------------------
\ No newline at end of file
diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in
index 373e92418..865b709c5 100644
--- a/src/ports/postgres/modules/knn/knn.sql_in
+++ b/src/ports/postgres/modules/knn/knn.sql_in
@@ -308,7 +308,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
 ) RETURNS VOID AS $$
 BEGIN
     IF arg1 = 'help' OR arg1 = 'usage' OR arg1 = '?' THEN
-	RAISE NOTICE
+    RAISE NOTICE
 '
 -----------------------------------------------------------------------
                             USAGE
@@ -385,3 +385,23 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
     )
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
+    point_source VARCHAR,
+    point_column_name VARCHAR,
+    label_column_name VARCHAR,
+    test_source VARCHAR,
+    test_column_name VARCHAR,
+    id_column_name VARCHAR,
+    output_table VARCHAR,
+    operation VARCHAR
+) RETURNS VARCHAR AS $$
+DECLARE
+    returnstring VARCHAR;
+BEGIN
+    returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1);
+    RETURN returnstring;
+END;
+$$ LANGUAGE plpgsql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
\ No newline at end of file

From 83bbe29d7dc21081a7569c9e5360dc11d949b03b Mon Sep 17 00:00:00 2001
From: Jingyi <jmei@pivotal.io>
Date: Mon, 21 Aug 2017 17:49:53 -0700
Subject: [PATCH 5/7] Fix example py_in file

Originally, there was a comment line at the end of the file that
caused problems when running make. This commit just removes that last
comment line.

Closes #175
---
 examples/hello_world/iterative/simple_logistic.py_in | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/hello_world/iterative/simple_logistic.py_in b/examples/hello_world/iterative/simple_logistic.py_in
index 719bc186f..d19740c46 100644
--- a/examples/hello_world/iterative/simple_logistic.py_in
+++ b/examples/hello_world/iterative/simple_logistic.py_in
@@ -237,4 +237,3 @@ SELECT * from patients_logregr;
         help_string = "No such option. Use {schema_madlib}.logregr_simple_train('help')"
 
     return help_string.format(schema_madlib=schema_madlib)
-# ------------------------------------------------------------------------

From 4c1d3bf712d27bae8ae133a0a4087a07e6c61dda Mon Sep 17 00:00:00 2001
From: hpandeycodeit <hpandey@pivotal.io>
Date: Thu, 7 Sep 2017 13:05:51 -0700
Subject: [PATCH 6/7] KNN changes for MADLIB-1129

---
 src/ports/postgres/modules/knn/knn.py_in      | 154 ++++++++++++------
 src/ports/postgres/modules/knn/knn.sql_in     | 130 +++++++++------
 .../postgres/modules/knn/test/knn.sql_in      |  50 ++++--
 3 files changed, 226 insertions(+), 108 deletions(-)

diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in
index 27cb7353d..0cd17d4ed 100644
--- a/src/ports/postgres/modules/knn/knn.py_in
+++ b/src/ports/postgres/modules/knn/knn.py_in
@@ -36,19 +36,20 @@ from utilities.utilities import unique_string
 from utilities.control import MinWarning
 
 
-def knn_validate_src(schema_madlib, point_source, point_column_name,
+def knn_validate_src(schema_madlib, point_source, point_column_name,point_id,
                      label_column_name, test_source, test_column_name,
-                     id_column_name, output_table, operation, k, **kwargs):
-
-    if not operation or operation not in ['c', 'r']:
-        plpy.error("kNN Error: operation='{0}' is an invalid value, has to be"
-                   " 'r' for regression OR 'c' for classification.".
-                   format(operation))
+                     test_id, output_table, k, output_neighbors , **kwargs):
+   # if not operation or operation not in ['c', 'r']:
+   #    plpy.error("kNN Error: operation='{0}' is an invalid value, has to be"
+   #                " 'r' for regression OR 'c' for classification.".
+   #               format(operation))
     input_tbl_valid(point_source, 'kNN')
     input_tbl_valid(test_source, 'kNN')
     output_tbl_valid(output_table, 'kNN')
-    cols_in_tbl_valid(point_source, (label_column_name, point_column_name), 'kNN')
-    cols_in_tbl_valid(test_source, (test_column_name, id_column_name), 'kNN')
+    if label_column_name is not None and label_column_name != '':
+        cols_in_tbl_valid(point_source, (label_column_name, point_column_name), 'kNN')
+    cols_in_tbl_valid(point_source, (point_column_name, point_id), 'kNN')    
+    cols_in_tbl_valid(test_source, (test_column_name, test_id), 'kNN')
 
     if not is_col_array(point_source, point_column_name):
         plpy.error("kNN Error: Feature column '{0}' in train table is not"
@@ -75,30 +76,32 @@ def knn_validate_src(schema_madlib, point_source, point_column_name,
         plpy.error("kNN Error: k={0} is greater than number of rows in"
                    " training table.".format(k))
 
-    col_type = get_expr_type(label_column_name, point_source).lower()
-    if col_type not in ['integer', 'double precision', 'float', 'boolean']:
-        plpy.error("kNN error: Data type '{0}' is not a valid type for"
+    if label_column_name is not None and label_column_name != '':
+        col_type = get_expr_type(label_column_name, point_source).lower()
+        if col_type not in ['integer', 'double precision', 'float', 'boolean']:
+            plpy.error("kNN error: Data type '{0}' is not a valid type for"
                    " column '{1}' in table '{2}'.".
                    format(col_type, label_column_name, point_source))
 
-    col_type_test = get_expr_type(id_column_name, test_source).lower()
+    col_type_test = get_expr_type(test_id, test_source).lower()
     if col_type_test not in ['integer']:
         plpy.error("kNN Error: Data type '{0}' is not a valid type for"
                    " column '{1}' in table '{2}'.".
-                   format(col_type_test, id_column_name, test_source))
+                   format(col_type_test, test_id, test_source))
     return k
 # ------------------------------------------------------------------------------
 
 
-def knn(schema_madlib, point_source, point_column_name, label_column_name,
-        test_source, test_column_name, id_column_name, output_table,
-        operation, k):
+def knn(schema_madlib, point_source, point_column_name,point_id, label_column_name,
+        test_source, test_column_name, test_id, output_table, k,output_neighbors):
     """
         KNN function to find the K Nearest neighbours
         Args:
             @param schema_madlib        Name of the Madlib Schema
             @param point_source         Training data table
             @param point_column_name    Name of the column with training data
+            @param point_id             Name of the column having ids of data 
+                                        point in train data table
                                         points.
             @param label_column_name    Name of the column with labels/values
                                         of training data points.
@@ -106,7 +109,7 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name,
                                         data points.
             @param test_column_name     Name of the column with testing data
                                         points.
-            @param id_column_name       Name of the column having ids of data
+            @param test_id              Name of the column having ids of data
                                         points in test data table.
             @param output_table         Name of the table to store final
                                         results.
@@ -115,33 +118,85 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name,
                                         'r' for regression
             @param k                    default: 1. Number of nearest
                                         neighbors to consider
+            @output_neighbours          Outputs the list of k-nearest neighbors 
+                                        that were used in the voting/averaging.
         Returns:
             VARCHAR                     Name of the output table.
     """
     with MinWarning('warning'):
         k_val = knn_validate_src(schema_madlib, point_source,
-                                 point_column_name, label_column_name,
-                                 test_source, test_column_name, id_column_name,
-                                 output_table, operation, k)
+                                 point_column_name, point_id ,label_column_name,
+                                 test_source, test_column_name, test_id,
+                                 output_table, k , output_neighbors)
 
         x_temp_table = unique_string(desp='x_temp_table')
         y_temp_table = unique_string(desp='y_temp_table')
         label_col_temp = unique_string(desp='label_col_temp')
-        test_id = unique_string(desp='test_id')
+        test_id_temp = unique_string(desp='test_id_temp')
+
+        if output_neighbors is None or '':
+            output_neighbors=False
 
-        is_classification = operation == 'c'
         interim_table = unique_string(desp='interim_table')
+
+        if label_column_name is None or label_column_name == '':
+             plpy.execute(
+                """
+                CREATE TEMP TABLE {interim_table} AS
+                SELECT * FROM
+                    (
+                    SELECT row_number() over
+                            (partition by {test_id_temp} order by dist) AS r,
+                            {x_temp_table}.*
+                    FROM
+                        (
+                        SELECT test.{test_id} AS {test_id_temp} ,
+                            train.id as train_id ,
+                            {schema_madlib}.squared_dist_norm2(
+                                train.{point_column_name},
+                                test.{test_column_name})
+                            AS dist
+                            FROM {point_source} AS train, {test_source} AS test
+                        ) {x_temp_table}
+                    ) {y_temp_table}
+                WHERE {y_temp_table}.r <= {k_val}
+                """.format(**locals()))
+             plpy.execute(
+                """
+                CREATE TABLE {output_table} AS
+                    SELECT {test_id_temp} AS id, {test_column_name} ,
+                        CASE WHEN {output_neighbors} 
+                        THEN array_agg(knn_temp.train_id) 
+                        ELSE NULL END  AS k_nearest_neighbours
+                    FROM pg_temp.{interim_table} AS knn_temp 
+                    join
+                        {test_source} AS knn_test ON  
+                        knn_temp.{test_id_temp} = knn_test.{test_id}
+                    GROUP BY {test_id_temp}  ,  {test_column_name}
+                """.format(**locals()))
+             return
+
+
+        is_classification = False
+        label_column_type = get_expr_type(label_column_name, point_source).lower()
+        if label_column_type in ['boolean','integer', 'text']:
+            is_classification = True
+            convert_boolean_to_int = '::INTEGER'
+        else: 
+            is_classification = False      
+
         plpy.execute(
             """
             CREATE TEMP TABLE {interim_table} AS
             SELECT * FROM
                 (
                 SELECT row_number() over
-                        (partition by {test_id} order by dist) AS r,
+                        (partition by {test_id_temp} order by dist) AS r,
                         {x_temp_table}.*
                 FROM
                     (
-                    SELECT test.{id_column_name} AS {test_id} ,
+                    SELECT test.{test_id} AS {test_id_temp} ,
+                        train.id as train_id ,
                         {schema_madlib}.squared_dist_norm2(
                             train.{point_column_name},
                             test.{test_column_name})
@@ -152,33 +207,34 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name,
                     ) {x_temp_table}
                 ) {y_temp_table}
             WHERE {y_temp_table}.r <= {k_val}
-            """.format(cast_to_int='::INTEGER' if is_classification else '',
+            """.format(cast_to_int='::INTEGER' if is_classification else '', 
                        **locals()))
 
+        knn_create_table = 'CREATE TABLE '+output_table+' AS '  \
+                 'SELECT '+test_id_temp+' AS id,'+test_column_name+',' 
+        knn_pred_class =  schema_madlib+'.mode(' +label_col_temp+') AS prediction'
+        knn_pred_reg   =  'avg(' +label_col_temp+') AS prediction'
+        knn_neighbours = ', array_agg(knn_temp.train_id) AS k_nearest_neighbours '
+        knn_group_by = 'FROM pg_temp.'+interim_table+' AS knn_temp join ' \
+                    +test_source+' AS knn_test ON  knn_temp.'+test_id_temp+'= knn_test.' \
+                    +test_id +' GROUP BY '+test_id_temp+', '+test_column_name
+        
+
         if is_classification:
-            plpy.execute(
-                """
-                CREATE TABLE {output_table} AS
-                SELECT {test_id} AS id, {test_column_name},
-                       {schema_madlib}.mode({label_col_temp}) AS prediction
-                FROM {interim_table} JOIN {test_source}
-                     ON {test_id} = {id_column_name}
-                GROUP BY {test_id}, {test_column_name}
-                """.format(**locals()))
+            if output_neighbors:
+                plpy.execute("""{knn_create_table}{knn_pred_class}
+                    {knn_neighbours}{knn_group_by}""".format(**locals()))
+            else:
+                plpy.execute(""" {knn_create_table}{knn_pred_class}
+                    {knn_group_by}""".format(**locals()))
         else:
-            plpy.execute(
-                """
-                CREATE TABLE {output_table} AS
-                SELECT {test_id} AS id, {test_column_name},
-                       AVG({label_col_temp}) AS prediction
-                FROM
-                    {interim_table} JOIN {test_source}
-                    ON {test_id} = {id_column_name}
-                GROUP BY {test_id}, {test_column_name}
-                """.format(**locals()))
+            if output_neighbors:
+                plpy.execute(""" {knn_create_table}{knn_pred_reg}
+                    {knn_neighbours}{knn_group_by}""".format(**locals()))
+            else: 
+                plpy.execute("""{knn_create_table}{knn_pred_reg}
+                    {knn_group_by}""".format(**locals()))
+                
+
         plpy.execute("DROP TABLE IF EXISTS {0}".format(interim_table))
-<<<<<<< HEAD
-# ------------------------------------------------------------------------------
-=======
 # ------------------------------------------------------------------------------
->>>>>>> 6f99dbe4659a5582df61e2fdcc68f7ee60d5859d
diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in
index 1d547bddf..dfd23748e 100644
--- a/src/ports/postgres/modules/knn/knn.sql_in
+++ b/src/ports/postgres/modules/knn/knn.sql_in
@@ -71,13 +71,14 @@ neighbors of the given test point.
 <pre class="syntax">
 knn( point_source,
      point_column_name,
+     point_id,
      label_column_name,
      test_source,
      test_column_name,
-     id_column_name,
+     test_id,
      output_table,
-     operation,
-     k
+     k,
+     output_neighbors
    )
 </pre>
 
@@ -93,8 +94,17 @@ in a column of type <tt>DOUBLE PRECISION[]</tt>.
 <dt>point_column_name</dt>
 <dd>TEXT. Name of the column with training data points.</dd>
 
+<dt>point_id</dt>
+<dd>TEXT. Name of the column in 'point_source’ containing source data ids.
+The ids are of type INTEGER with no duplicates. They do not need to be contiguous. 
+This parameter must be used if the list of nearest neighbors are to be output, i.e., 
+if the parameter 'output_neighbors' below is TRUE or if 'label_column_name' is NULL.
+
 <dt>label_column_name</dt>
-<dd>TEXT. Name of the column with labels/values of training data points.</dd>
+<dd>TEXT. Name of the column with labels/values of training data points.
+If Boolean, integer or text types will run knn classification, else if 
+double precision values will run knn regression.  
+If you set this to NULL will return neighbors only without doing classification or regression.</dd>
 
 <dt>test_source</dt>
 <dd>TEXT. Name of the table containing the test data points.
@@ -106,7 +116,7 @@ in a column of type <tt>DOUBLE PRECISION[]</tt>.
 <dt>test_column_name</dt>
 <dd>TEXT. Name of the column with testing data points.</dd>
 
-<dt>id_column_name</dt>
+<dt>test_id</dt>
 <dd>TEXT. Name of the column having ids of data points in test data table.</dd>
 
 <dt>output_table</dt>
@@ -117,7 +127,12 @@ in a column of type <tt>DOUBLE PRECISION[]</tt>.
 
 <dt>k (optional)</dt>
 <dd>INTEGER. default: 1. Number of nearest neighbors to consider.
-For classification, should be an odd number to break ties.</dd>
+For classification, should be an odd number to break ties.
+otherwise result may depend on ordering of the input data.</dd>
+
+<dt>output_neighbors (optional) </dt>
+<dd>BOOLEAN default: FALSE. Outputs the list of k-nearest 
+neighbors that were used in the voting/averaging.</dd>
 
 </dl>
 
@@ -145,15 +160,35 @@ The output of the KNN module is a table with the following columns:
 @anchor examples
 @examp
 
--#  Prepare some training data:
+-#  Prepare some training data for classification:
 <pre class="example">
 DROP TABLE IF EXISTS knn_train_data;
 CREATE TABLE knn_train_data (
                     id integer, 
                     data integer[], 
-                    label float
+                    label integer
                     );
 INSERT INTO knn_train_data VALUES
+(1, '{1,1}', 1),
+(2, '{2,2}', 1),
+(3, '{3,3}', 1),
+(4, '{4,4}', 1),
+(5, '{4,5}', 1),
+(6, '{20,50}', 0),
+(7, '{10,31}', 0),
+(8, '{81,13}', 0),
+(9, '{1,111}', 0);
+</pre>
+
+-#  Prepare some training data for regression:
+<pre class="example">
+DROP TABLE IF EXISTS knn_train_data_reg;
+CREATE TABLE knn_train_data_reg (
+                    id integer, 
+                    data integer[], 
+                    label float
+                    );
+INSERT INTO knn_train_data_reg VALUES
 (1, '{1,1}', 1.0),
 (2, '{2,2}', 1.0),
 (3, '{3,3}', 1.0),
@@ -187,26 +222,27 @@ DROP TABLE IF EXISTS madlib_knn_result_classification;
 SELECT * FROM madlib.knn( 
                 'knn_train_data',      -- Table of training data
                 'data',                -- Col name of training data
+                'id',                  -- Col Name of id in train data 
                 'label',               -- Training labels
                 'knn_test_data',       -- Table of test data
                 'data',                -- Col name of test data
                 'id',                  -- Col name of id in test data 
                 'madlib_knn_result_classification',  -- Output table
-                'c',                   -- Classification
                  3                     -- Number of nearest neighbours
+                 True                  -- True if you want to show Nearest-Neighbors, False otherwise
                 );
 SELECT * from madlib_knn_result_classification ORDER BY id;
 </pre>
 Result:
 <pre class="result">
- id |  data   | prediction 
-----+---------+------------
-  1 | {2,1}   |          1
-  2 | {2,6}   |          1
-  3 | {15,40} |          0
-  4 | {12,1}  |          1
-  5 | {2,90}  |          0
-  6 | {50,45} |          0
+ id |  data   | prediction | k_nearest_neighbours 
+----+---------+------------+----------------------
+  1 | {2,1}   |          1 | {1,2,3}
+  2 | {2,6}   |          1 | {5,4,3}
+  3 | {15,40} |          0 | {7,6,5}
+  4 | {12,1}  |          1 | {4,5,3}
+  5 | {2,90}  |          0 | {9,6,7}
+  6 | {50,45} |          0 | {6,7,8}
 (6 rows)
 </pre>
 
@@ -214,28 +250,29 @@ Result:
 <pre class="example">
 DROP TABLE IF EXISTS madlib_knn_result_regression;
 SELECT * FROM madlib.knn( 
-                'knn_train_data',      -- Table of training data
+                'knn_train_data_reg',  -- Table of training data
                 'data',                -- Col name of training data
+                'id',                  -- Col Name of id in train data 
                 'label',               -- Training labels
                 'knn_test_data',       -- Table of test data
                 'data',                -- Col name of test data
                 'id',                  -- Col name of id in test data 
                 'madlib_knn_result_regression',  -- Output table
-                'r',                   -- Regressions
-                 3                     -- Number of nearest neighbours
+                 3,                    -- Number of nearest neighbours
+                True                   -- True if you want to show Nearest-Neighbors, False otherwise
                 );
 SELECT * from madlib_knn_result_regression ORDER BY id;
 </pre>
 Result:
 <pre class="result">
- id |  data   |    prediction     
-----+---------+-------------------
-  1 | {2,1}   |                 1
-  2 | {2,6}   |                 1
-  3 | {15,40} | 0.333333333333333
-  4 | {12,1}  |                 1
-  5 | {2,90}  |                 0
-  6 | {50,45} |                 0
+ id |  data   |    prediction     | k_nearest_neighbours 
+----+---------+-------------------+----------------------
+  1 | {2,1}   |                 1 | {1,2,3}
+  2 | {2,6}   |                 1 | {5,4,3}
+  3 | {15,40} | 0.333333333333333 | {7,6,5}
+  4 | {12,1}  |                 1 | {4,5,3}
+  5 | {2,90}  |                 0 | {9,6,7}
+  6 | {50,45} |                 0 | {6,7,8}
 (6 rows)
 </pre>
 
@@ -281,7 +318,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src(
     label_column_name VARCHAR,
     test_source VARCHAR,
     test_column_name VARCHAR,
-    id_column_name VARCHAR,
+    test_id VARCHAR,
     output_table VARCHAR,
     operation VARCHAR,
     k INTEGER
@@ -294,7 +331,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src(
         label_column_name,
         test_source,
         test_column_name,
-        id_column_name,
+        test_id,
         output_table,
         operation,
         k
@@ -316,13 +353,14 @@ BEGIN
 SELECT {schema_madlib}.knn(
     point_source,       -- Training data table having training features as vector column and labels
     point_column_name,  -- Name of column having feature vectors in training data table
+    point_id,           -- Name of column having feature vector Ids in train data table
     label_column_name,  -- Name of column having actual label/vlaue for corresponding feature vector in training data table
     test_source,        -- Test data table having features as vector column. Id of features is mandatory
     test_column_name,   -- Name of column having feature vectors in test data table
-    id_column_name,     -- Name of column having feature vector Ids in test data table
+    test_id,     -- Name of column having feature vector Ids in test data table
     output_table,       -- Name of output table
-    operation,          -- c for classification task, r for regression task
-    k                   -- value of k. Default will go as 1
+    k,                  -- value of k. Default will go as 1
+    output_neighbors    -- Outputs the list of k-nearest neighbors that were used in the voting/averaging.
     );
 
 -----------------------------------------------------------------------
@@ -333,6 +371,7 @@ The output of the KNN module is a table with the following columns:
 id                  The ids of test data points.
 test_column_name    The test data points.
 prediction          The output of KNN- label in case of classification, average value in case of regression.
+k_nearest_neighbours The list of k-nearest neighbors that were used in the voting/averaging.
 ';
     END IF;
 END;
@@ -362,26 +401,28 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
     point_source VARCHAR,
     point_column_name VARCHAR,
+    point_id VARCHAR,
     label_column_name VARCHAR,
     test_source VARCHAR,
     test_column_name VARCHAR,
-    id_column_name VARCHAR,
+    test_id VARCHAR,
     output_table VARCHAR,
-    operation VARCHAR,
-    k INTEGER
+    k INTEGER,
+    output_neighbors Boolean
 ) RETURNS VARCHAR AS $$
     PythonFunctionBodyOnly(`knn', `knn')
     return knn.knn(
         schema_madlib,
         point_source,
         point_column_name,
+        point_id,
         label_column_name,
         test_source,
         test_column_name,
-        id_column_name,
+        test_id,
         output_table,
-        operation,
-        k
+        k,
+        output_neighbors
     )
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
@@ -390,22 +431,19 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
     point_source VARCHAR,
     point_column_name VARCHAR,
+    point_id VARCHAR,
     label_column_name VARCHAR,
     test_source VARCHAR,
     test_column_name VARCHAR,
-    id_column_name VARCHAR,
+    test_id VARCHAR,
     output_table VARCHAR,
-    operation VARCHAR
+    output_neighbors Boolean
 ) RETURNS VARCHAR AS $$
 DECLARE
     returnstring VARCHAR;
 BEGIN
-    returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1);
+    returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1,$9);
     RETURN returnstring;
 END;
 $$ LANGUAGE plpgsql VOLATILE
-<<<<<<< HEAD
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-=======
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
->>>>>>> 6f99dbe4659a5582df61e2fdcc68f7ee60d5859d
diff --git a/src/ports/postgres/modules/knn/test/knn.sql_in b/src/ports/postgres/modules/knn/test/knn.sql_in
index c7d6798c6..405068c26 100644
--- a/src/ports/postgres/modules/knn/test/knn.sql_in
+++ b/src/ports/postgres/modules/knn/test/knn.sql_in
@@ -26,12 +26,29 @@ m4_include(`SQLCommon.m4')
  * FIXME: Verify results
  * -------------------------------------------------------------------------- */
 
-drop table if exists "KNN_TRAIN_DATA";
-create table "KNN_TRAIN_DATA" (
-                id  integer,
-                "DATA"    integer[],
-                label   float);
-copy "KNN_TRAIN_DATA" (id, "DATA", label) from stdin delimiter '|';
+drop table if exists knn_train_data;
+create table knn_train_data (
+id  integer,
+data    integer[],
+label   integer);
+copy knn_train_data (id, data, label) from stdin delimiter '|';
+1|{1,1}|1
+2|{2,2}|1
+3|{3,3}|1
+4|{4,4}|1
+5|{4,5}|1
+6|{20,50}|0
+7|{10,31}|0
+8|{81,13}|0
+9|{1,111}|0
+\.
+DROP TABLE IF EXISTS knn_train_data_reg;
+CREATE TABLE knn_train_data_reg (
+                    id integer, 
+                    data integer[], 
+                    label float
+                    );
+COPY knn_train_data_reg (id, data, label) from stdin delimiter '|';
 1|{1,1}|1.0
 2|{2,2}|1.0
 3|{3,3}|1.0
@@ -42,11 +59,10 @@ copy "KNN_TRAIN_DATA" (id, "DATA", label) from stdin delimiter '|';
 8|{81,13}|0.0
 9|{1,111}|0.0
 \.
-drop table if exists knn_test_data;
 create table knn_test_data (
-                id  integer,
-                "DATA" integer[]);
-copy knn_test_data (id, "DATA") from stdin delimiter '|';
+id  integer,
+data integer[]);
+copy knn_test_data (id, data) from stdin delimiter '|';
 1|{2,1}
 2|{2,6}
 3|{15,40}
@@ -55,15 +71,23 @@ copy knn_test_data (id, "DATA") from stdin delimiter '|';
 6|{50,45}
 \.
 drop table if exists madlib_knn_result_classification;
-select knn('"KNN_TRAIN_DATA"','"DATA"','label','knn_test_data','"DATA"','id','madlib_knn_result_classification','c',3);
+select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',3,False);
 select assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=3') from madlib_knn_result_classification;
 
+drop table if exists madlib_knn_result_classification;
+select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',3,True);
+select assert(array_agg(k_nearest_neighbours order by id)='{ {1,2,3},{5,4,3},{7,6,5},{4,5,3},{9,6,7},{6,7,8} }', 'Wrong output in classification with k=3') from madlib_knn_result_classification;
+
 drop table if exists madlib_knn_result_regression;
-select knn('"KNN_TRAIN_DATA"','"DATA"','label','knn_test_data','"DATA"','id','madlib_knn_result_regression','r',4);
+select knn('knn_train_data_reg','data','id','label','knn_test_data','data','id','madlib_knn_result_regression',4,False);
 select assert(array_agg(prediction order by id)='{1,1,0.5,1,0.25,0.25}', 'Wrong output in regression') from madlib_knn_result_regression;
 
+drop table if exists madlib_knn_result_regression;
+select knn('knn_train_data_reg','data','id','label','knn_test_data','data','id','madlib_knn_result_regression',3,True);
+select assert(array_agg(k_nearest_neighbours order by id)='{ {1,2,3},{5,4,3},{7,6,5},{4,5,3},{9,6,7},{6,7,8} }', 'Wrong output in regression') from madlib_knn_result_regression;
+
 drop table if exists madlib_knn_result_classification;
-select knn('"KNN_TRAIN_DATA"','"DATA"','label','knn_test_data','"DATA"','id','madlib_knn_result_classification','c');
+select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',False);
 select assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=1') from madlib_knn_result_classification;
 
 select knn();

From 8eb2b9119de695813a81b0b2d856d15bca345428 Mon Sep 17 00:00:00 2001
From: hpandeycodeit <hpandey@pivotal.io>
Date: Fri, 8 Sep 2017 14:50:49 -0700
Subject: [PATCH 7/7] Revert "KNN changes for MADLIB-1129"

This reverts commit 4c1d3bf712d27bae8ae133a0a4087a07e6c61dda.
---
 src/ports/postgres/modules/knn/knn.py_in      | 154 ++++++------------
 src/ports/postgres/modules/knn/knn.sql_in     | 130 ++++++---------
 .../postgres/modules/knn/test/knn.sql_in      |  50 ++----
 3 files changed, 108 insertions(+), 226 deletions(-)

diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in
index 0cd17d4ed..27cb7353d 100644
--- a/src/ports/postgres/modules/knn/knn.py_in
+++ b/src/ports/postgres/modules/knn/knn.py_in
@@ -36,20 +36,19 @@ from utilities.utilities import unique_string
 from utilities.control import MinWarning
 
 
-def knn_validate_src(schema_madlib, point_source, point_column_name,point_id,
+def knn_validate_src(schema_madlib, point_source, point_column_name,
                      label_column_name, test_source, test_column_name,
-                     test_id, output_table, k, output_neighbors , **kwargs):
-   # if not operation or operation not in ['c', 'r']:
-   #    plpy.error("kNN Error: operation='{0}' is an invalid value, has to be"
-   #                " 'r' for regression OR 'c' for classification.".
-   #               format(operation))
+                     id_column_name, output_table, operation, k, **kwargs):
+
+    if not operation or operation not in ['c', 'r']:
+        plpy.error("kNN Error: operation='{0}' is an invalid value, has to be"
+                   " 'r' for regression OR 'c' for classification.".
+                   format(operation))
     input_tbl_valid(point_source, 'kNN')
     input_tbl_valid(test_source, 'kNN')
     output_tbl_valid(output_table, 'kNN')
-    if label_column_name is not None and label_column_name != '':
-        cols_in_tbl_valid(point_source, (label_column_name, point_column_name), 'kNN')
-    cols_in_tbl_valid(point_source, (point_column_name, point_id), 'kNN')    
-    cols_in_tbl_valid(test_source, (test_column_name, test_id), 'kNN')
+    cols_in_tbl_valid(point_source, (label_column_name, point_column_name), 'kNN')
+    cols_in_tbl_valid(test_source, (test_column_name, id_column_name), 'kNN')
 
     if not is_col_array(point_source, point_column_name):
         plpy.error("kNN Error: Feature column '{0}' in train table is not"
@@ -76,32 +75,30 @@ def knn_validate_src(schema_madlib, point_source, point_column_name,point_id,
         plpy.error("kNN Error: k={0} is greater than number of rows in"
                    " training table.".format(k))
 
-    if label_column_name is not None and label_column_name != '':
-        col_type = get_expr_type(label_column_name, point_source).lower()
-        if col_type not in ['integer', 'double precision', 'float', 'boolean']:
-            plpy.error("kNN error: Data type '{0}' is not a valid type for"
+    col_type = get_expr_type(label_column_name, point_source).lower()
+    if col_type not in ['integer', 'double precision', 'float', 'boolean']:
+        plpy.error("kNN error: Data type '{0}' is not a valid type for"
                    " column '{1}' in table '{2}'.".
                    format(col_type, label_column_name, point_source))
 
-    col_type_test = get_expr_type(test_id, test_source).lower()
+    col_type_test = get_expr_type(id_column_name, test_source).lower()
     if col_type_test not in ['integer']:
         plpy.error("kNN Error: Data type '{0}' is not a valid type for"
                    " column '{1}' in table '{2}'.".
-                   format(col_type_test, test_id, test_source))
+                   format(col_type_test, id_column_name, test_source))
     return k
 # ------------------------------------------------------------------------------
 
 
-def knn(schema_madlib, point_source, point_column_name,point_id, label_column_name,
-        test_source, test_column_name, test_id, output_table, k,output_neighbors):
+def knn(schema_madlib, point_source, point_column_name, label_column_name,
+        test_source, test_column_name, id_column_name, output_table,
+        operation, k):
     """
         KNN function to find the K Nearest neighbours
         Args:
             @param schema_madlib        Name of the Madlib Schema
             @param point_source         Training data table
             @param point_column_name    Name of the column with training data
-            @param point_id             Name of the column having ids of data 
-                                        point in train data table
                                         points.
             @param label_column_name    Name of the column with labels/values
                                         of training data points.
@@ -109,7 +106,7 @@ def knn(schema_madlib, point_source, point_column_name,point_id, label_column_na
                                         data points.
             @param test_column_name     Name of the column with testing data
                                         points.
-            @param test_id              Name of the column having ids of data
+            @param id_column_name       Name of the column having ids of data
                                         points in test data table.
             @param output_table         Name of the table to store final
                                         results.
@@ -118,85 +115,33 @@ def knn(schema_madlib, point_source, point_column_name,point_id, label_column_na
                                         'r' for regression
             @param k                    default: 1. Number of nearest
                                         neighbors to consider
-            @output_neighbours          Outputs the list of k-nearest neighbors 
-                                        that were used in the voting/averaging.
         Returns:
             VARCHAR                     Name of the output table.
     """
     with MinWarning('warning'):
         k_val = knn_validate_src(schema_madlib, point_source,
-                                 point_column_name, point_id ,label_column_name,
-                                 test_source, test_column_name, test_id,
-                                 output_table, k , output_neighbors)
+                                 point_column_name, label_column_name,
+                                 test_source, test_column_name, id_column_name,
+                                 output_table, operation, k)
 
         x_temp_table = unique_string(desp='x_temp_table')
         y_temp_table = unique_string(desp='y_temp_table')
         label_col_temp = unique_string(desp='label_col_temp')
-        test_id_temp = unique_string(desp='test_id_temp')
-
-        if output_neighbors is None or '':
-            output_neighbors=False
+        test_id = unique_string(desp='test_id')
 
+        is_classification = operation == 'c'
         interim_table = unique_string(desp='interim_table')
-
-        if label_column_name is None or label_column_name == '':
-             plpy.execute(
-                """
-                CREATE TEMP TABLE {interim_table} AS
-                SELECT * FROM
-                    (
-                    SELECT row_number() over
-                            (partition by {test_id_temp} order by dist) AS r,
-                            {x_temp_table}.*
-                    FROM
-                        (
-                        SELECT test.{test_id} AS {test_id_temp} ,
-                            train.id as train_id ,
-                            {schema_madlib}.squared_dist_norm2(
-                                train.{point_column_name},
-                                test.{test_column_name})
-                            AS dist
-                            FROM {point_source} AS train, {test_source} AS test
-                        ) {x_temp_table}
-                    ) {y_temp_table}
-                WHERE {y_temp_table}.r <= {k_val}
-                """.format(**locals()))
-             plpy.execute(
-                """
-                CREATE TABLE {output_table} AS
-                    SELECT {test_id_temp} AS id, {test_column_name} ,
-                        CASE WHEN {output_neighbors} 
-                        THEN array_agg(knn_temp.train_id) 
-                        ELSE NULL END  AS k_nearest_neighbours
-                    FROM pg_temp.{interim_table} AS knn_temp 
-                    join
-                        {test_source} AS knn_test ON  
-                        knn_temp.{test_id_temp} = knn_test.{test_id}
-                    GROUP BY {test_id_temp}  ,  {test_column_name}
-                """.format(**locals()))
-             return
-
-
-        is_classification = False
-        label_column_type = get_expr_type(label_column_name, point_source).lower()
-        if label_column_type in ['boolean','integer', 'text']:
-            is_classification = True
-            convert_boolean_to_int = '::INTEGER'
-        else: 
-            is_classification = False      
-
         plpy.execute(
             """
             CREATE TEMP TABLE {interim_table} AS
             SELECT * FROM
                 (
                 SELECT row_number() over
-                        (partition by {test_id_temp} order by dist) AS r,
+                        (partition by {test_id} order by dist) AS r,
                         {x_temp_table}.*
                 FROM
                     (
-                    SELECT test.{test_id} AS {test_id_temp} ,
-                        train.id as train_id ,
+                    SELECT test.{id_column_name} AS {test_id} ,
                         {schema_madlib}.squared_dist_norm2(
                             train.{point_column_name},
                             test.{test_column_name})
@@ -207,34 +152,33 @@ def knn(schema_madlib, point_source, point_column_name,point_id, label_column_na
                     ) {x_temp_table}
                 ) {y_temp_table}
             WHERE {y_temp_table}.r <= {k_val}
-            """.format(cast_to_int='::INTEGER' if is_classification else '', 
+            """.format(cast_to_int='::INTEGER' if is_classification else '',
                        **locals()))
 
-        knn_create_table = 'CREATE TABLE '+output_table+' AS '  \
-                 'SELECT '+test_id_temp+' AS id,'+test_column_name+',' 
-        knn_pred_class =  schema_madlib+'.mode(' +label_col_temp+') AS prediction'
-        knn_pred_reg   =  'avg(' +label_col_temp+') AS prediction'
-        knn_neighbours = ', array_agg(knn_temp.train_id) AS k_nearest_neighbours '
-        knn_group_by = 'FROM pg_temp.'+interim_table+' AS knn_temp join ' \
-                    +test_source+' AS knn_test ON  knn_temp.'+test_id_temp+'= knn_test.' \
-                    +test_id +' GROUP BY '+test_id_temp+', '+test_column_name
-        
-
         if is_classification:
-            if output_neighbors:
-                plpy.execute("""{knn_create_table}{knn_pred_class}
-                    {knn_neighbours}{knn_group_by}""".format(**locals()))
-            else:
-                plpy.execute(""" {knn_create_table}{knn_pred_class}
-                    {knn_group_by}""".format(**locals()))
+            plpy.execute(
+                """
+                CREATE TABLE {output_table} AS
+                SELECT {test_id} AS id, {test_column_name},
+                       {schema_madlib}.mode({label_col_temp}) AS prediction
+                FROM {interim_table} JOIN {test_source}
+                     ON {test_id} = {id_column_name}
+                GROUP BY {test_id}, {test_column_name}
+                """.format(**locals()))
         else:
-            if output_neighbors:
-                plpy.execute(""" {knn_create_table}{knn_pred_reg}
-                    {knn_neighbours}{knn_group_by}""".format(**locals()))
-            else: 
-                plpy.execute("""{knn_create_table}{knn_pred_reg}
-                    {knn_group_by}""".format(**locals()))
-                
-
+            plpy.execute(
+                """
+                CREATE TABLE {output_table} AS
+                SELECT {test_id} AS id, {test_column_name},
+                       AVG({label_col_temp}) AS prediction
+                FROM
+                    {interim_table} JOIN {test_source}
+                    ON {test_id} = {id_column_name}
+                GROUP BY {test_id}, {test_column_name}
+                """.format(**locals()))
         plpy.execute("DROP TABLE IF EXISTS {0}".format(interim_table))
+<<<<<<< HEAD
+# ------------------------------------------------------------------------------
+=======
 # ------------------------------------------------------------------------------
+>>>>>>> 6f99dbe4659a5582df61e2fdcc68f7ee60d5859d
diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in
index dfd23748e..1d547bddf 100644
--- a/src/ports/postgres/modules/knn/knn.sql_in
+++ b/src/ports/postgres/modules/knn/knn.sql_in
@@ -71,14 +71,13 @@ neighbors of the given test point.
 <pre class="syntax">
 knn( point_source,
      point_column_name,
-     point_id,
      label_column_name,
      test_source,
      test_column_name,
-     test_id,
+     id_column_name,
      output_table,
-     k,
-     output_neighbors
+     operation,
+     k
    )
 </pre>
 
@@ -94,17 +93,8 @@ in a column of type <tt>DOUBLE PRECISION[]</tt>.
 <dt>point_column_name</dt>
 <dd>TEXT. Name of the column with training data points.</dd>
 
-<dt>point_id</dt>
-<dd>TEXT. Name of the column in 'point_source’ containing source data ids.
-The ids are of type INTEGER with no duplicates. They do not need to be contiguous. 
-This parameter must be used if the list of nearest neighbors are to be output, i.e., 
-if the parameter 'output_neighbors' below is TRUE or if 'label_column_name' is NULL.
-
 <dt>label_column_name</dt>
-<dd>TEXT. Name of the column with labels/values of training data points.
-If Boolean, integer or text types will run knn classification, else if 
-double precision values will run knn regression.  
-If you set this to NULL will return neighbors only without doing classification or regression.</dd>
+<dd>TEXT. Name of the column with labels/values of training data points.</dd>
 
 <dt>test_source</dt>
 <dd>TEXT. Name of the table containing the test data points.
@@ -116,7 +106,7 @@ in a column of type <tt>DOUBLE PRECISION[]</tt>.
 <dt>test_column_name</dt>
 <dd>TEXT. Name of the column with testing data points.</dd>
 
-<dt>test_id</dt>
+<dt>id_column_name</dt>
 <dd>TEXT. Name of the column having ids of data points in test data table.</dd>
 
 <dt>output_table</dt>
@@ -127,12 +117,7 @@ in a column of type <tt>DOUBLE PRECISION[]</tt>.
 
 <dt>k (optional)</dt>
 <dd>INTEGER. default: 1. Number of nearest neighbors to consider.
-For classification, should be an odd number to break ties.
-otherwise result may depend on ordering of the input data.</dd>
-
-<dt>output_neighbors (optional) </dt>
-<dd>BOOLEAN default: FALSE. Outputs the list of k-nearest 
-neighbors that were used in the voting/averaging.</dd>
+For classification, should be an odd number to break ties.</dd>
 
 </dl>
 
@@ -160,35 +145,15 @@ The output of the KNN module is a table with the following columns:
 @anchor examples
 @examp
 
--#  Prepare some training data for classification:
+-#  Prepare some training data:
 <pre class="example">
 DROP TABLE IF EXISTS knn_train_data;
 CREATE TABLE knn_train_data (
-                    id integer, 
-                    data integer[], 
-                    label integer
-                    );
-INSERT INTO knn_train_data VALUES
-(1, '{1,1}', 1),
-(2, '{2,2}', 1),
-(3, '{3,3}', 1),
-(4, '{4,4}', 1),
-(5, '{4,5}', 1),
-(6, '{20,50}', 0),
-(7, '{10,31}', 0),
-(8, '{81,13}', 0),
-(9, '{1,111}', 0);
-</pre>
-
--#  Prepare some training data for regression:
-<pre class="example">
-DROP TABLE IF EXISTS knn_train_data_reg;
-CREATE TABLE knn_train_data_reg (
                     id integer, 
                     data integer[], 
                     label float
                     );
-INSERT INTO knn_train_data_reg VALUES
+INSERT INTO knn_train_data VALUES
 (1, '{1,1}', 1.0),
 (2, '{2,2}', 1.0),
 (3, '{3,3}', 1.0),
@@ -222,27 +187,26 @@ DROP TABLE IF EXISTS madlib_knn_result_classification;
 SELECT * FROM madlib.knn( 
                 'knn_train_data',      -- Table of training data
                 'data',                -- Col name of training data
-                'id',                  -- Col Name of id in train data 
                 'label',               -- Training labels
                 'knn_test_data',       -- Table of test data
                 'data',                -- Col name of test data
                 'id',                  -- Col name of id in test data 
                 'madlib_knn_result_classification',  -- Output table
+                'c',                   -- Classification
                  3                     -- Number of nearest neighbours
-                 True                  -- True if you want to show Nearest-Neighbors, False otherwise
                 );
 SELECT * from madlib_knn_result_classification ORDER BY id;
 </pre>
 Result:
 <pre class="result">
- id |  data   | prediction | k_nearest_neighbours 
-----+---------+------------+----------------------
-  1 | {2,1}   |          1 | {1,2,3}
-  2 | {2,6}   |          1 | {5,4,3}
-  3 | {15,40} |          0 | {7,6,5}
-  4 | {12,1}  |          1 | {4,5,3}
-  5 | {2,90}  |          0 | {9,6,7}
-  6 | {50,45} |          0 | {6,7,8}
+ id |  data   | prediction 
+----+---------+------------
+  1 | {2,1}   |          1
+  2 | {2,6}   |          1
+  3 | {15,40} |          0
+  4 | {12,1}  |          1
+  5 | {2,90}  |          0
+  6 | {50,45} |          0
 (6 rows)
 </pre>
 
@@ -250,29 +214,28 @@ Result:
 <pre class="example">
 DROP TABLE IF EXISTS madlib_knn_result_regression;
 SELECT * FROM madlib.knn( 
-                'knn_train_data_reg',  -- Table of training data
+                'knn_train_data',      -- Table of training data
                 'data',                -- Col name of training data
-                'id',                  -- Col Name of id in train data 
                 'label',               -- Training labels
                 'knn_test_data',       -- Table of test data
                 'data',                -- Col name of test data
                 'id',                  -- Col name of id in test data 
                 'madlib_knn_result_regression',  -- Output table
-                 3,                    -- Number of nearest neighbours
-                True                   -- True if you want to show Nearest-Neighbors, False otherwise
+                'r',                   -- Regression
+                 3                     -- Number of nearest neighbours
                 );
 SELECT * from madlib_knn_result_regression ORDER BY id;
 </pre>
 Result:
 <pre class="result">
- id |  data   |    prediction     | k_nearest_neighbours 
-----+---------+-------------------+----------------------
-  1 | {2,1}   |                 1 | {1,2,3}
-  2 | {2,6}   |                 1 | {5,4,3}
-  3 | {15,40} | 0.333333333333333 | {7,6,5}
-  4 | {12,1}  |                 1 | {4,5,3}
-  5 | {2,90}  |                 0 | {9,6,7}
-  6 | {50,45} |                 0 | {6,7,8}
+ id |  data   |    prediction     
+----+---------+-------------------
+  1 | {2,1}   |                 1
+  2 | {2,6}   |                 1
+  3 | {15,40} | 0.333333333333333
+  4 | {12,1}  |                 1
+  5 | {2,90}  |                 0
+  6 | {50,45} |                 0
 (6 rows)
 </pre>
 
@@ -318,7 +281,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src(
     label_column_name VARCHAR,
     test_source VARCHAR,
     test_column_name VARCHAR,
-    test_id VARCHAR,
+    id_column_name VARCHAR,
     output_table VARCHAR,
     operation VARCHAR,
     k INTEGER
@@ -331,7 +294,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src(
         label_column_name,
         test_source,
         test_column_name,
-        test_id,
+        id_column_name,
         output_table,
         operation,
         k
@@ -353,14 +316,13 @@ BEGIN
 SELECT {schema_madlib}.knn(
     point_source,       -- Training data table having training features as vector column and labels
     point_column_name,  -- Name of column having feature vectors in training data table
-    point_id,           -- Name of column having feature vector Ids in train data table
     label_column_name,  -- Name of column having actual label/vlaue for corresponding feature vector in training data table
     test_source,        -- Test data table having features as vector column. Id of features is mandatory
     test_column_name,   -- Name of column having feature vectors in test data table
-    test_id,     -- Name of column having feature vector Ids in test data table
+    id_column_name,     -- Name of column having feature vector Ids in test data table
     output_table,       -- Name of output table
-    k,                  -- value of k. Default will go as 1
-    output_neighbors    -- Outputs the list of k-nearest neighbors that were used in the voting/averaging.
+    operation,          -- c for classification task, r for regression task
+    k                   -- value of k. Default will go as 1
     );
 
 -----------------------------------------------------------------------
@@ -371,7 +333,6 @@ The output of the KNN module is a table with the following columns:
 id                  The ids of test data points.
 test_column_name    The test data points.
 prediction          The output of KNN- label in case of classification, average value in case of regression.
-k_nearest_neighbours The list of k-nearest neighbors that were used in the voting/averaging.
 ';
     END IF;
 END;
@@ -401,28 +362,26 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
     point_source VARCHAR,
     point_column_name VARCHAR,
-    point_id VARCHAR,
     label_column_name VARCHAR,
     test_source VARCHAR,
     test_column_name VARCHAR,
-    test_id VARCHAR,
+    id_column_name VARCHAR,
     output_table VARCHAR,
-    k INTEGER,
-    output_neighbors Boolean
+    operation VARCHAR,
+    k INTEGER
 ) RETURNS VARCHAR AS $$
     PythonFunctionBodyOnly(`knn', `knn')
     return knn.knn(
         schema_madlib,
         point_source,
         point_column_name,
-        point_id,
         label_column_name,
         test_source,
         test_column_name,
-        test_id,
+        id_column_name,
         output_table,
-        k,
-        output_neighbors
+        operation,
+        k
     )
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
@@ -431,19 +390,22 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
     point_source VARCHAR,
     point_column_name VARCHAR,
-    point_id VARCHAR,
     label_column_name VARCHAR,
     test_source VARCHAR,
     test_column_name VARCHAR,
-    test_id VARCHAR,
+    id_column_name VARCHAR,
     output_table VARCHAR,
-    output_neighbors Boolean
+    operation VARCHAR
 ) RETURNS VARCHAR AS $$
 DECLARE
     returnstring VARCHAR;
 BEGIN
-    returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1,$9);
+    returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1);
     RETURN returnstring;
 END;
 $$ LANGUAGE plpgsql VOLATILE
+<<<<<<< HEAD
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+=======
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+>>>>>>> 6f99dbe4659a5582df61e2fdcc68f7ee60d5859d
diff --git a/src/ports/postgres/modules/knn/test/knn.sql_in b/src/ports/postgres/modules/knn/test/knn.sql_in
index 405068c26..c7d6798c6 100644
--- a/src/ports/postgres/modules/knn/test/knn.sql_in
+++ b/src/ports/postgres/modules/knn/test/knn.sql_in
@@ -26,29 +26,12 @@ m4_include(`SQLCommon.m4')
  * FIXME: Verify results
  * -------------------------------------------------------------------------- */
 
-drop table if exists knn_train_data;
-create table knn_train_data (
-id  integer,
-data    integer[],
-label   integer);
-copy knn_train_data (id, data, label) from stdin delimiter '|';
-1|{1,1}|1
-2|{2,2}|1
-3|{3,3}|1
-4|{4,4}|1
-5|{4,5}|1
-6|{20,50}|0
-7|{10,31}|0
-8|{81,13}|0
-9|{1,111}|0
-\.
-DROP TABLE IF EXISTS knn_train_data_reg;
-CREATE TABLE knn_train_data_reg (
-                    id integer, 
-                    data integer[], 
-                    label float
-                    );
-COPY knn_train_data_reg (id, data, label) from stdin delimiter '|';
+drop table if exists "KNN_TRAIN_DATA";
+create table "KNN_TRAIN_DATA" (
+                id  integer,
+                "DATA"    integer[],
+                label   float);
+copy "KNN_TRAIN_DATA" (id, "DATA", label) from stdin delimiter '|';
 1|{1,1}|1.0
 2|{2,2}|1.0
 3|{3,3}|1.0
@@ -59,10 +42,11 @@ COPY knn_train_data_reg (id, data, label) from stdin delimiter '|';
 8|{81,13}|0.0
 9|{1,111}|0.0
 \.
+drop table if exists knn_test_data;
 create table knn_test_data (
-id  integer,
-data integer[]);
-copy knn_test_data (id, data) from stdin delimiter '|';
+                id  integer,
+                "DATA" integer[]);
+copy knn_test_data (id, "DATA") from stdin delimiter '|';
 1|{2,1}
 2|{2,6}
 3|{15,40}
@@ -71,23 +55,15 @@ copy knn_test_data (id, data) from stdin delimiter '|';
 6|{50,45}
 \.
 drop table if exists madlib_knn_result_classification;
-select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',3,False);
+select knn('"KNN_TRAIN_DATA"','"DATA"','label','knn_test_data','"DATA"','id','madlib_knn_result_classification','c',3);
 select assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=3') from madlib_knn_result_classification;
 
-drop table if exists madlib_knn_result_classification;
-select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',3,True);
-select assert(array_agg(k_nearest_neighbours order by id)='{ {1,2,3},{5,4,3},{7,6,5},{4,5,3},{9,6,7},{6,7,8} }', 'Wrong output in classification with k=3') from madlib_knn_result_classification;
-
 drop table if exists madlib_knn_result_regression;
-select knn('knn_train_data_reg','data','id','label','knn_test_data','data','id','madlib_knn_result_regression',4,False);
+select knn('"KNN_TRAIN_DATA"','"DATA"','label','knn_test_data','"DATA"','id','madlib_knn_result_regression','r',4);
 select assert(array_agg(prediction order by id)='{1,1,0.5,1,0.25,0.25}', 'Wrong output in regression') from madlib_knn_result_regression;
 
-drop table if exists madlib_knn_result_regression;
-select knn('knn_train_data_reg','data','id','label','knn_test_data','data','id','madlib_knn_result_regression',3,True);
-select assert(array_agg(k_nearest_neighbours order by id)='{ {1,2,3},{5,4,3},{7,6,5},{4,5,3},{9,6,7},{6,7,8} }', 'Wrong output in regression') from madlib_knn_result_regression;
-
 drop table if exists madlib_knn_result_classification;
-select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',False);
+select knn('"KNN_TRAIN_DATA"','"DATA"','label','knn_test_data','"DATA"','id','madlib_knn_result_classification','c');
 select assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=1') from madlib_knn_result_classification;
 
 select knn();