From 4ff1bf17fc678b88f6f29eb344d0e4cd523a2a7a Mon Sep 17 00:00:00 2001 From: hpandeycodeit Date: Fri, 8 Sep 2017 15:10:37 -0700 Subject: [PATCH 1/4] Changes for KNN_1129 --- src/ports/postgres/modules/knn/knn.py_in | 4 ++++ src/ports/postgres/modules/knn/knn.sql_in | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in index 4d5d62769..27cb7353d 100644 --- a/src/ports/postgres/modules/knn/knn.py_in +++ b/src/ports/postgres/modules/knn/knn.py_in @@ -177,4 +177,8 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name, GROUP BY {test_id}, {test_column_name} """.format(**locals())) plpy.execute("DROP TABLE IF EXISTS {0}".format(interim_table)) +<<<<<<< HEAD # ------------------------------------------------------------------------------ +======= +# ------------------------------------------------------------------------------ +>>>>>>> 6f99dbe4659a5582df61e2fdcc68f7ee60d5859d diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in index ca5be8821..1d547bddf 100644 --- a/src/ports/postgres/modules/knn/knn.sql_in +++ b/src/ports/postgres/modules/knn/knn.sql_in @@ -308,7 +308,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( ) RETURNS VOID AS $$ BEGIN IF arg1 = 'help' OR arg1 = 'usage' OR arg1 = '?' 
THEN - RAISE NOTICE + RAISE NOTICE ' ----------------------------------------------------------------------- USAGE @@ -404,4 +404,8 @@ BEGIN RETURN returnstring; END; $$ LANGUAGE plpgsql VOLATILE +<<<<<<< HEAD m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +======= +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +>>>>>>> 6f99dbe4659a5582df61e2fdcc68f7ee60d5859d From 0c6940a96a6a6b6b33db3c98c0d6dc55c8e55265 Mon Sep 17 00:00:00 2001 From: hpandeycodeit Date: Fri, 8 Sep 2017 15:34:11 -0700 Subject: [PATCH 2/4] KNN Changes for Jira: 1129 --- src/ports/postgres/modules/knn/knn.py_in | 152 ++++++++++++------ src/ports/postgres/modules/knn/knn.sql_in | 130 +++++++++------ .../postgres/modules/knn/test/knn.sql_in | 50 ++++-- 3 files changed, 225 insertions(+), 107 deletions(-) diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in index 27cb7353d..92e1de2c2 100644 --- a/src/ports/postgres/modules/knn/knn.py_in +++ b/src/ports/postgres/modules/knn/knn.py_in @@ -36,19 +36,20 @@ from utilities.utilities import unique_string from utilities.control import MinWarning -def knn_validate_src(schema_madlib, point_source, point_column_name, +def knn_validate_src(schema_madlib, point_source, point_column_name,point_id, label_column_name, test_source, test_column_name, - id_column_name, output_table, operation, k, **kwargs): - - if not operation or operation not in ['c', 'r']: - plpy.error("kNN Error: operation='{0}' is an invalid value, has to be" - " 'r' for regression OR 'c' for classification.". - format(operation)) + test_id, output_table, k, output_neighbors , **kwargs): + # if not operation or operation not in ['c', 'r']: + # plpy.error("kNN Error: operation='{0}' is an invalid value, has to be" + # " 'r' for regression OR 'c' for classification.". 
+ # format(operation)) input_tbl_valid(point_source, 'kNN') input_tbl_valid(test_source, 'kNN') output_tbl_valid(output_table, 'kNN') - cols_in_tbl_valid(point_source, (label_column_name, point_column_name), 'kNN') - cols_in_tbl_valid(test_source, (test_column_name, id_column_name), 'kNN') + if label_column_name is not None and label_column_name != '': + cols_in_tbl_valid(point_source, (label_column_name, point_column_name), 'kNN') + cols_in_tbl_valid(point_source, (point_column_name, point_id), 'kNN') + cols_in_tbl_valid(test_source, (test_column_name, test_id), 'kNN') if not is_col_array(point_source, point_column_name): plpy.error("kNN Error: Feature column '{0}' in train table is not" @@ -75,30 +76,32 @@ def knn_validate_src(schema_madlib, point_source, point_column_name, plpy.error("kNN Error: k={0} is greater than number of rows in" " training table.".format(k)) - col_type = get_expr_type(label_column_name, point_source).lower() - if col_type not in ['integer', 'double precision', 'float', 'boolean']: - plpy.error("kNN error: Data type '{0}' is not a valid type for" + if label_column_name is not None and label_column_name != '': + col_type = get_expr_type(label_column_name, point_source).lower() + if col_type not in ['integer', 'double precision', 'float', 'boolean']: + plpy.error("kNN error: Data type '{0}' is not a valid type for" " column '{1}' in table '{2}'.". format(col_type, label_column_name, point_source)) - col_type_test = get_expr_type(id_column_name, test_source).lower() + col_type_test = get_expr_type(test_id, test_source).lower() if col_type_test not in ['integer']: plpy.error("kNN Error: Data type '{0}' is not a valid type for" " column '{1}' in table '{2}'.". 
- format(col_type_test, id_column_name, test_source)) + format(col_type_test, test_id, test_source)) return k # ------------------------------------------------------------------------------ -def knn(schema_madlib, point_source, point_column_name, label_column_name, - test_source, test_column_name, id_column_name, output_table, - operation, k): +def knn(schema_madlib, point_source, point_column_name,point_id, label_column_name, + test_source, test_column_name, test_id, output_table, k,output_neighbors): """ KNN function to find the K Nearest neighbours Args: @param schema_madlib Name of the Madlib Schema @param point_source Training data table @param point_column_name Name of the column with training data + @param point_id Name of the column having ids of data + point in train data table points. @param label_column_name Name of the column with labels/values of training data points. @@ -106,7 +109,7 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name, data points. @param test_column_name Name of the column with testing data points. - @param id_column_name Name of the column having ids of data + @param test_id Name of the column having ids of data points in test data table. @param output_table Name of the table to store final results. @@ -115,33 +118,85 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name, 'r' for regression @param k default: 1. Number of nearest neighbors to consider + @output_neighbours Outputs the list of k-nearest neighbors + that were used in the voting/averaging. Returns: VARCHAR Name of the output table. 
""" with MinWarning('warning'): k_val = knn_validate_src(schema_madlib, point_source, - point_column_name, label_column_name, - test_source, test_column_name, id_column_name, - output_table, operation, k) + point_column_name, point_id ,label_column_name, + test_source, test_column_name, test_id, + output_table, k , output_neighbors) x_temp_table = unique_string(desp='x_temp_table') y_temp_table = unique_string(desp='y_temp_table') label_col_temp = unique_string(desp='label_col_temp') - test_id = unique_string(desp='test_id') + test_id_temp = unique_string(desp='test_id_temp') + + if output_neighbors is None or '': + output_neighbors=False - is_classification = operation == 'c' interim_table = unique_string(desp='interim_table') + + if label_column_name is None or label_column_name == '': + plpy.execute( + """ + CREATE TEMP TABLE {interim_table} AS + SELECT * FROM + ( + SELECT row_number() over + (partition by {test_id_temp} order by dist) AS r, + {x_temp_table}.* + FROM + ( + SELECT test.{test_id} AS {test_id_temp} , + train.id as train_id , + {schema_madlib}.squared_dist_norm2( + train.{point_column_name}, + test.{test_column_name}) + AS dist + FROM {point_source} AS train, {test_source} AS test + ) {x_temp_table} + ) {y_temp_table} + WHERE {y_temp_table}.r <= {k_val} + """.format(**locals())) + plpy.execute( + """ + CREATE TABLE {output_table} AS + SELECT {test_id_temp} AS id, {test_column_name} , + CASE WHEN {output_neighbors} + THEN array_agg(knn_temp.train_id) + ELSE NULL END AS k_nearest_neighbours + FROM pg_temp.{interim_table} AS knn_temp + join + {test_source} AS knn_test ON + knn_temp.{test_id_temp} = knn_test.{test_id} + GROUP BY {test_id_temp} , {test_column_name} + """.format(**locals())) + return + + + is_classification = False + label_column_type = get_expr_type(label_column_name, point_source).lower() + if label_column_type in ['boolean','integer', 'text']: + is_classification = True + convert_boolean_to_int = '::INTEGER' + else: + is_classification 
= False + plpy.execute( """ CREATE TEMP TABLE {interim_table} AS SELECT * FROM ( SELECT row_number() over - (partition by {test_id} order by dist) AS r, + (partition by {test_id_temp} order by dist) AS r, {x_temp_table}.* FROM ( - SELECT test.{id_column_name} AS {test_id} , + SELECT test.{test_id} AS {test_id_temp} , + train.id as train_id , {schema_madlib}.squared_dist_norm2( train.{point_column_name}, test.{test_column_name}) @@ -155,30 +210,31 @@ def knn(schema_madlib, point_source, point_column_name, label_column_name, """.format(cast_to_int='::INTEGER' if is_classification else '', **locals())) + knn_create_table = 'CREATE TABLE '+output_table+' AS ' \ + 'SELECT '+test_id_temp+' AS id,'+test_column_name+',' + knn_pred_class = schema_madlib+'.mode(' +label_col_temp+') AS prediction' + knn_pred_reg = 'avg(' +label_col_temp+') AS prediction' + knn_neighbours = ', array_agg(knn_temp.train_id) AS k_nearest_neighbours ' + knn_group_by = 'FROM pg_temp.'+interim_table+' AS knn_temp join ' \ + +test_source+' AS knn_test ON knn_temp.'+test_id_temp+'= knn_test.' 
\ + +test_id +' GROUP BY '+test_id_temp+', '+test_column_name + + if is_classification: - plpy.execute( - """ - CREATE TABLE {output_table} AS - SELECT {test_id} AS id, {test_column_name}, - {schema_madlib}.mode({label_col_temp}) AS prediction - FROM {interim_table} JOIN {test_source} - ON {test_id} = {id_column_name} - GROUP BY {test_id}, {test_column_name} - """.format(**locals())) + if output_neighbors: + plpy.execute("""{knn_create_table}{knn_pred_class} + {knn_neighbours}{knn_group_by}""".format(**locals())) + else: + plpy.execute(""" {knn_create_table}{knn_pred_class} + {knn_group_by}""".format(**locals())) else: - plpy.execute( - """ - CREATE TABLE {output_table} AS - SELECT {test_id} AS id, {test_column_name}, - AVG({label_col_temp}) AS prediction - FROM - {interim_table} JOIN {test_source} - ON {test_id} = {id_column_name} - GROUP BY {test_id}, {test_column_name} - """.format(**locals())) + if output_neighbors: + plpy.execute(""" {knn_create_table}{knn_pred_reg} + {knn_neighbours}{knn_group_by}""".format(**locals())) + else: + plpy.execute("""{knn_create_table}{knn_pred_reg} + {knn_group_by}""".format(**locals())) + + plpy.execute("DROP TABLE IF EXISTS {0}".format(interim_table)) -<<<<<<< HEAD -# ------------------------------------------------------------------------------ -======= # ------------------------------------------------------------------------------ ->>>>>>> 6f99dbe4659a5582df61e2fdcc68f7ee60d5859d diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in index 1d547bddf..dfd23748e 100644 --- a/src/ports/postgres/modules/knn/knn.sql_in +++ b/src/ports/postgres/modules/knn/knn.sql_in @@ -71,13 +71,14 @@ neighbors of the given test point.
 knn( point_source,
      point_column_name,
+     point_id,
      label_column_name,
      test_source,
      test_column_name,
-     id_column_name,
+     test_id,
      output_table,
-     operation,
-     k
+     k,
+     output_neighbors
    )
 
@@ -93,8 +94,17 @@ in a column of type DOUBLE PRECISION[].
point_column_name
TEXT. Name of the column with training data points.
+
point_id
+
TEXT. Name of the column in 'point_source' containing source data ids. +The ids are of type INTEGER with no duplicates. They do not need to be contiguous. +This parameter must be used if the list of nearest neighbors is to be output, i.e., +if the parameter 'output_neighbors' below is TRUE or if 'label_column_name' is NULL. +
label_column_name
-
TEXT. Name of the column with labels/values of training data points.
+
TEXT. Name of the column with labels/values of training data points. +If the type is Boolean, integer or text, kNN classification will be run; if +double precision, kNN regression will be run. +If this is set to NULL, only the nearest neighbors will be returned, without doing classification or regression.
test_source
TEXT. Name of the table containing the test data points. @@ -106,7 +116,7 @@ in a column of type DOUBLE PRECISION[].
test_column_name
TEXT. Name of the column with testing data points.
-
id_column_name
+
test_id
TEXT. Name of the column having ids of data points in test data table.
output_table
@@ -117,7 +127,12 @@ in a column of type DOUBLE PRECISION[].
k (optional)
INTEGER. default: 1. Number of nearest neighbors to consider. -For classification, should be an odd number to break ties.
+For classification, should be an odd number to break ties; +otherwise the result may depend on the ordering of the input data. + +
output_neighbors (optional)
+
BOOLEAN default: FALSE. Outputs the list of k-nearest +neighbors that were used in the voting/averaging.
@@ -145,15 +160,35 @@ The output of the KNN module is a table with the following columns: @anchor examples @examp --# Prepare some training data: +-# Prepare some training data for classification:
 DROP TABLE IF EXISTS knn_train_data;
 CREATE TABLE knn_train_data (
                     id integer, 
                     data integer[], 
-                    label float
+                    label integer
                     );
 INSERT INTO knn_train_data VALUES
+(1, '{1,1}', 1),
+(2, '{2,2}', 1),
+(3, '{3,3}', 1),
+(4, '{4,4}', 1),
+(5, '{4,5}', 1),
+(6, '{20,50}', 0),
+(7, '{10,31}', 0),
+(8, '{81,13}', 0),
+(9, '{1,111}', 0);
+
+ +-# Prepare some training data for regression: +
+DROP TABLE IF EXISTS knn_train_data_reg;
+CREATE TABLE knn_train_data_reg (
+                    id integer, 
+                    data integer[], 
+                    label float
+                    );
+INSERT INTO knn_train_data_reg VALUES
 (1, '{1,1}', 1.0),
 (2, '{2,2}', 1.0),
 (3, '{3,3}', 1.0),
@@ -187,26 +222,27 @@ DROP TABLE IF EXISTS madlib_knn_result_classification;
 SELECT * FROM madlib.knn( 
                 'knn_train_data',      -- Table of training data
                 'data',                -- Col name of training data
+                'id',                  -- Col Name of id in train data 
                 'label',               -- Training labels
                 'knn_test_data',       -- Table of test data
                 'data',                -- Col name of test data
                 'id',                  -- Col name of id in test data 
                 'madlib_knn_result_classification',  -- Output table
-                'c',                   -- Classification
                  3                     -- Number of nearest neighbours
+                 True                  -- True if you want to show Nearest-Neighbors, False otherwise
                 );
 SELECT * from madlib_knn_result_classification ORDER BY id;
 
Result:
- id |  data   | prediction 
-----+---------+------------
-  1 | {2,1}   |          1
-  2 | {2,6}   |          1
-  3 | {15,40} |          0
-  4 | {12,1}  |          1
-  5 | {2,90}  |          0
-  6 | {50,45} |          0
+ id |  data   | prediction | k_nearest_neighbours 
+----+---------+------------+----------------------
+  1 | {2,1}   |          1 | {1,2,3}
+  2 | {2,6}   |          1 | {5,4,3}
+  3 | {15,40} |          0 | {7,6,5}
+  4 | {12,1}  |          1 | {4,5,3}
+  5 | {2,90}  |          0 | {9,6,7}
+  6 | {50,45} |          0 | {6,7,8}
 (6 rows)
 
@@ -214,28 +250,29 @@ Result:
 DROP TABLE IF EXISTS madlib_knn_result_regression;
 SELECT * FROM madlib.knn( 
-                'knn_train_data',      -- Table of training data
+                'knn_train_data_reg',  -- Table of training data
                 'data',                -- Col name of training data
+                'id',                  -- Col Name of id in train data 
                 'label',               -- Training labels
                 'knn_test_data',       -- Table of test data
                 'data',                -- Col name of test data
                 'id',                  -- Col name of id in test data 
                 'madlib_knn_result_regression',  -- Output table
-                'r',                   -- Regressions
-                 3                     -- Number of nearest neighbours
+                 3,                    -- Number of nearest neighbours
+                True                   -- True if you want to show Nearest-Neighbors, False otherwise
                 );
 SELECT * from madlib_knn_result_regression ORDER BY id;
 
Result:
- id |  data   |    prediction     
-----+---------+-------------------
-  1 | {2,1}   |                 1
-  2 | {2,6}   |                 1
-  3 | {15,40} | 0.333333333333333
-  4 | {12,1}  |                 1
-  5 | {2,90}  |                 0
-  6 | {50,45} |                 0
+ id |  data   |    prediction     | k_nearest_neighbours 
+----+---------+-------------------+----------------------
+  1 | {2,1}   |                 1 | {1,2,3}
+  2 | {2,6}   |                 1 | {5,4,3}
+  3 | {15,40} | 0.333333333333333 | {7,6,5}
+  4 | {12,1}  |                 1 | {4,5,3}
+  5 | {2,90}  |                 0 | {9,6,7}
+  6 | {50,45} |                 0 | {6,7,8}
 (6 rows)
 
@@ -281,7 +318,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src( label_column_name VARCHAR, test_source VARCHAR, test_column_name VARCHAR, - id_column_name VARCHAR, + test_id VARCHAR, output_table VARCHAR, operation VARCHAR, k INTEGER @@ -294,7 +331,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src( label_column_name, test_source, test_column_name, - id_column_name, + test_id, output_table, operation, k @@ -316,13 +353,14 @@ BEGIN SELECT {schema_madlib}.knn( point_source, -- Training data table having training features as vector column and labels point_column_name, -- Name of column having feature vectors in training data table + point_id, -- Name of column having feature vector Ids in train data table label_column_name, -- Name of column having actual label/vlaue for corresponding feature vector in training data table test_source, -- Test data table having features as vector column. Id of features is mandatory test_column_name, -- Name of column having feature vectors in test data table - id_column_name, -- Name of column having feature vector Ids in test data table + test_id, -- Name of column having feature vector Ids in test data table output_table, -- Name of output table - operation, -- c for classification task, r for regression task - k -- value of k. Default will go as 1 + k, -- value of k. Default will go as 1 + output_neighbors -- Outputs the list of k-nearest neighbors that were used in the voting/averaging. ); ----------------------------------------------------------------------- @@ -333,6 +371,7 @@ The output of the KNN module is a table with the following columns: id The ids of test data points. test_column_name The test data points. prediction The output of KNN- label in case of classification, average value in case of regression. +k_nearest_neighbours The list of k-nearest neighbors that were used in the voting/averaging. 
'; END IF; END; @@ -362,26 +401,28 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( point_source VARCHAR, point_column_name VARCHAR, + point_id VARCHAR, label_column_name VARCHAR, test_source VARCHAR, test_column_name VARCHAR, - id_column_name VARCHAR, + test_id VARCHAR, output_table VARCHAR, - operation VARCHAR, - k INTEGER + k INTEGER, + output_neighbors Boolean ) RETURNS VARCHAR AS $$ PythonFunctionBodyOnly(`knn', `knn') return knn.knn( schema_madlib, point_source, point_column_name, + point_id, label_column_name, test_source, test_column_name, - id_column_name, + test_id, output_table, - operation, - k + k, + output_neighbors ) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -390,22 +431,19 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( point_source VARCHAR, point_column_name VARCHAR, + point_id VARCHAR, label_column_name VARCHAR, test_source VARCHAR, test_column_name VARCHAR, - id_column_name VARCHAR, + test_id VARCHAR, output_table VARCHAR, - operation VARCHAR + output_neighbors Boolean ) RETURNS VARCHAR AS $$ DECLARE returnstring VARCHAR; BEGIN - returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1); + returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1,$9); RETURN returnstring; END; $$ LANGUAGE plpgsql VOLATILE -<<<<<<< HEAD -m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); -======= m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); ->>>>>>> 6f99dbe4659a5582df61e2fdcc68f7ee60d5859d diff --git a/src/ports/postgres/modules/knn/test/knn.sql_in b/src/ports/postgres/modules/knn/test/knn.sql_in index c7d6798c6..405068c26 100644 --- a/src/ports/postgres/modules/knn/test/knn.sql_in +++ b/src/ports/postgres/modules/knn/test/knn.sql_in @@ -26,12 +26,29 @@ m4_include(`SQLCommon.m4') * FIXME: Verify results * 
-------------------------------------------------------------------------- */ -drop table if exists "KNN_TRAIN_DATA"; -create table "KNN_TRAIN_DATA" ( - id integer, - "DATA" integer[], - label float); -copy "KNN_TRAIN_DATA" (id, "DATA", label) from stdin delimiter '|'; +drop table if exists knn_train_data; +create table knn_train_data ( +id integer, +data integer[], +label integer); +copy knn_train_data (id, data, label) from stdin delimiter '|'; +1|{1,1}|1 +2|{2,2}|1 +3|{3,3}|1 +4|{4,4}|1 +5|{4,5}|1 +6|{20,50}|0 +7|{10,31}|0 +8|{81,13}|0 +9|{1,111}|0 +\. +DROP TABLE IF EXISTS knn_train_data_reg; +CREATE TABLE knn_train_data_reg ( + id integer, + data integer[], + label float + ); +COPY knn_train_data_reg (id, data, label) from stdin delimiter '|'; 1|{1,1}|1.0 2|{2,2}|1.0 3|{3,3}|1.0 @@ -42,11 +59,10 @@ copy "KNN_TRAIN_DATA" (id, "DATA", label) from stdin delimiter '|'; 8|{81,13}|0.0 9|{1,111}|0.0 \. -drop table if exists knn_test_data; create table knn_test_data ( - id integer, - "DATA" integer[]); -copy knn_test_data (id, "DATA") from stdin delimiter '|'; +id integer, +data integer[]); +copy knn_test_data (id, data) from stdin delimiter '|'; 1|{2,1} 2|{2,6} 3|{15,40} @@ -55,15 +71,23 @@ copy knn_test_data (id, "DATA") from stdin delimiter '|'; 6|{50,45} \. 
drop table if exists madlib_knn_result_classification; -select knn('"KNN_TRAIN_DATA"','"DATA"','label','knn_test_data','"DATA"','id','madlib_knn_result_classification','c',3); +select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',3,False); select assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=3') from madlib_knn_result_classification; +drop table if exists madlib_knn_result_classification; +select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',3,True); +select assert(array_agg(k_nearest_neighbours order by id)='{ {1,2,3},{5,4,3},{7,6,5},{4,5,3},{9,6,7},{6,7,8} }', 'Wrong output in classification with k=3') from madlib_knn_result_classification; + drop table if exists madlib_knn_result_regression; -select knn('"KNN_TRAIN_DATA"','"DATA"','label','knn_test_data','"DATA"','id','madlib_knn_result_regression','r',4); +select knn('knn_train_data_reg','data','id','label','knn_test_data','data','id','madlib_knn_result_regression',4,False); select assert(array_agg(prediction order by id)='{1,1,0.5,1,0.25,0.25}', 'Wrong output in regression') from madlib_knn_result_regression; +drop table if exists madlib_knn_result_regression; +select knn('knn_train_data_reg','data','id','label','knn_test_data','data','id','madlib_knn_result_regression',3,True); +select assert(array_agg(k_nearest_neighbours order by id)='{ {1,2,3},{5,4,3},{7,6,5},{4,5,3},{9,6,7},{6,7,8} }', 'Wrong output in regression') from madlib_knn_result_regression; + drop table if exists madlib_knn_result_classification; -select knn('"KNN_TRAIN_DATA"','"DATA"','label','knn_test_data','"DATA"','id','madlib_knn_result_classification','c'); +select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',False); select assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with 
k=1') from madlib_knn_result_classification; select knn(); From cf723c1d0bfde717dc01df7acde0c361736f44f4 Mon Sep 17 00:00:00 2001 From: hpandeycodeit Date: Fri, 15 Sep 2017 14:46:17 -0700 Subject: [PATCH 3/4] changed assert statement for couple of test cases and removed commented code in knn.py_in --- src/ports/postgres/modules/knn/knn.py_in | 77 +++++++++---------- .../postgres/modules/knn/test/knn.sql_in | 4 +- 2 files changed, 40 insertions(+), 41 deletions(-) diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in index 92e1de2c2..477c18d82 100644 --- a/src/ports/postgres/modules/knn/knn.py_in +++ b/src/ports/postgres/modules/knn/knn.py_in @@ -36,19 +36,19 @@ from utilities.utilities import unique_string from utilities.control import MinWarning -def knn_validate_src(schema_madlib, point_source, point_column_name,point_id, +def knn_validate_src(schema_madlib, point_source, point_column_name, point_id, label_column_name, test_source, test_column_name, - test_id, output_table, k, output_neighbors , **kwargs): - # if not operation or operation not in ['c', 'r']: - # plpy.error("kNN Error: operation='{0}' is an invalid value, has to be" - # " 'r' for regression OR 'c' for classification.". 
- # format(operation)) + test_id, output_table, k, output_neighbors, **kwargs): input_tbl_valid(point_source, 'kNN') input_tbl_valid(test_source, 'kNN') output_tbl_valid(output_table, 'kNN') if label_column_name is not None and label_column_name != '': - cols_in_tbl_valid(point_source, (label_column_name, point_column_name), 'kNN') - cols_in_tbl_valid(point_source, (point_column_name, point_id), 'kNN') + cols_in_tbl_valid( + point_source, + (label_column_name, + point_column_name), + 'kNN') + cols_in_tbl_valid(point_source, (point_column_name, point_id), 'kNN') cols_in_tbl_valid(test_source, (test_column_name, test_id), 'kNN') if not is_col_array(point_source, point_column_name): @@ -80,8 +80,8 @@ def knn_validate_src(schema_madlib, point_source, point_column_name,point_id, col_type = get_expr_type(label_column_name, point_source).lower() if col_type not in ['integer', 'double precision', 'float', 'boolean']: plpy.error("kNN error: Data type '{0}' is not a valid type for" - " column '{1}' in table '{2}'.". - format(col_type, label_column_name, point_source)) + " column '{1}' in table '{2}'.". 
+ format(col_type, label_column_name, point_source)) col_type_test = get_expr_type(test_id, test_source).lower() if col_type_test not in ['integer']: @@ -92,15 +92,15 @@ def knn_validate_src(schema_madlib, point_source, point_column_name,point_id, # ------------------------------------------------------------------------------ -def knn(schema_madlib, point_source, point_column_name,point_id, label_column_name, - test_source, test_column_name, test_id, output_table, k,output_neighbors): +def knn(schema_madlib, point_source, point_column_name, point_id, label_column_name, + test_source, test_column_name, test_id, output_table, k, output_neighbors): """ KNN function to find the K Nearest neighbours Args: @param schema_madlib Name of the Madlib Schema @param point_source Training data table @param point_column_name Name of the column with training data - @param point_id Name of the column having ids of data + @param point_id Name of the column having ids of data point in train data table points. @param label_column_name Name of the column with labels/values @@ -118,16 +118,16 @@ def knn(schema_madlib, point_source, point_column_name,point_id, label_column_na 'r' for regression @param k default: 1. Number of nearest neighbors to consider - @output_neighbours Outputs the list of k-nearest neighbors + @output_neighbours Outputs the list of k-nearest neighbors that were used in the voting/averaging. Returns: VARCHAR Name of the output table. 
""" with MinWarning('warning'): k_val = knn_validate_src(schema_madlib, point_source, - point_column_name, point_id ,label_column_name, + point_column_name, point_id, label_column_name, test_source, test_column_name, test_id, - output_table, k , output_neighbors) + output_table, k, output_neighbors) x_temp_table = unique_string(desp='x_temp_table') y_temp_table = unique_string(desp='y_temp_table') @@ -135,12 +135,12 @@ def knn(schema_madlib, point_source, point_column_name,point_id, label_column_na test_id_temp = unique_string(desp='test_id_temp') if output_neighbors is None or '': - output_neighbors=False + output_neighbors = False interim_table = unique_string(desp='interim_table') if label_column_name is None or label_column_name == '': - plpy.execute( + plpy.execute( """ CREATE TEMP TABLE {interim_table} AS SELECT * FROM @@ -161,29 +161,29 @@ def knn(schema_madlib, point_source, point_column_name,point_id, label_column_na ) {y_temp_table} WHERE {y_temp_table}.r <= {k_val} """.format(**locals())) - plpy.execute( + plpy.execute( """ CREATE TABLE {output_table} AS SELECT {test_id_temp} AS id, {test_column_name} , - CASE WHEN {output_neighbors} - THEN array_agg(knn_temp.train_id) + CASE WHEN {output_neighbors} + THEN array_agg(knn_temp.train_id) ELSE NULL END AS k_nearest_neighbours - FROM pg_temp.{interim_table} AS knn_temp + FROM pg_temp.{interim_table} AS knn_temp join - {test_source} AS knn_test ON + {test_source} AS knn_test ON knn_temp.{test_id_temp} = knn_test.{test_id} GROUP BY {test_id_temp} , {test_column_name} """.format(**locals())) - return - + return is_classification = False - label_column_type = get_expr_type(label_column_name, point_source).lower() - if label_column_type in ['boolean','integer', 'text']: + label_column_type = get_expr_type( + label_column_name, point_source).lower() + if label_column_type in ['boolean', 'integer', 'text']: is_classification = True convert_boolean_to_int = '::INTEGER' - else: - is_classification = False + else: + 
is_classification = False plpy.execute( """ @@ -210,15 +210,15 @@ def knn(schema_madlib, point_source, point_column_name,point_id, label_column_na """.format(cast_to_int='::INTEGER' if is_classification else '', **locals())) - knn_create_table = 'CREATE TABLE '+output_table+' AS ' \ - 'SELECT '+test_id_temp+' AS id,'+test_column_name+',' - knn_pred_class = schema_madlib+'.mode(' +label_col_temp+') AS prediction' - knn_pred_reg = 'avg(' +label_col_temp+') AS prediction' + knn_create_table = 'CREATE TABLE ' + output_table + ' AS ' \ + 'SELECT ' + test_id_temp + ' AS id,' + test_column_name + ',' + knn_pred_class = schema_madlib + \ + '.mode(' + label_col_temp + ') AS prediction' + knn_pred_reg = 'avg(' + label_col_temp + ') AS prediction' knn_neighbours = ', array_agg(knn_temp.train_id) AS k_nearest_neighbours ' - knn_group_by = 'FROM pg_temp.'+interim_table+' AS knn_temp join ' \ - +test_source+' AS knn_test ON knn_temp.'+test_id_temp+'= knn_test.' \ - +test_id +' GROUP BY '+test_id_temp+', '+test_column_name - + knn_group_by = 'FROM pg_temp.' + interim_table + ' AS knn_temp join ' \ + + test_source + ' AS knn_test ON knn_temp.' + test_id_temp + '= knn_test.' 
\ + + test_id + ' GROUP BY ' + test_id_temp + ', ' + test_column_name if is_classification: if output_neighbors: @@ -231,10 +231,9 @@ def knn(schema_madlib, point_source, point_column_name,point_id, label_column_na if output_neighbors: plpy.execute(""" {knn_create_table}{knn_pred_reg} {knn_neighbours}{knn_group_by}""".format(**locals())) - else: + else: plpy.execute("""{knn_create_table}{knn_pred_reg} {knn_group_by}""".format(**locals())) - plpy.execute("DROP TABLE IF EXISTS {0}".format(interim_table)) # ------------------------------------------------------------------------------ diff --git a/src/ports/postgres/modules/knn/test/knn.sql_in b/src/ports/postgres/modules/knn/test/knn.sql_in index 405068c26..9f2bad83f 100644 --- a/src/ports/postgres/modules/knn/test/knn.sql_in +++ b/src/ports/postgres/modules/knn/test/knn.sql_in @@ -76,7 +76,7 @@ select assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output i drop table if exists madlib_knn_result_classification; select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',3,True); -select assert(array_agg(k_nearest_neighbours order by id)='{ {1,2,3},{5,4,3},{7,6,5},{4,5,3},{9,6,7},{6,7,8} }', 'Wrong output in classification with k=3') from madlib_knn_result_classification; +select assert((k_nearest_neighbours )='{1,2,3}', 'Wrong output in classification with k=3') from madlib_knn_result_classification where id = 1; drop table if exists madlib_knn_result_regression; select knn('knn_train_data_reg','data','id','label','knn_test_data','data','id','madlib_knn_result_regression',4,False); @@ -84,7 +84,7 @@ select assert(array_agg(prediction order by id)='{1,1,0.5,1,0.25,0.25}', 'Wrong drop table if exists madlib_knn_result_regression; select knn('knn_train_data_reg','data','id','label','knn_test_data','data','id','madlib_knn_result_regression',3,True); -select assert(array_agg(k_nearest_neighbours order by id)='{ 
{1,2,3},{5,4,3},{7,6,5},{4,5,3},{9,6,7},{6,7,8} }', 'Wrong output in regression') from madlib_knn_result_regression; +select assert((k_nearest_neighbours )='{1,2,3}', 'Wrong output in regression') from madlib_knn_result_regression where id =1; drop table if exists madlib_knn_result_classification; select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',False); From 61194860741c4d6260163b91e549885556b7677e Mon Sep 17 00:00:00 2001 From: hpandeycodeit Date: Thu, 12 Oct 2017 12:48:03 -0700 Subject: [PATCH 4/4] modified test case so that order of the k_nearest_neighbours remains same --- src/ports/postgres/modules/knn/test/knn.sql_in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ports/postgres/modules/knn/test/knn.sql_in b/src/ports/postgres/modules/knn/test/knn.sql_in index 9f2bad83f..fa3875173 100644 --- a/src/ports/postgres/modules/knn/test/knn.sql_in +++ b/src/ports/postgres/modules/knn/test/knn.sql_in @@ -76,7 +76,7 @@ select assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output i drop table if exists madlib_knn_result_classification; select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',3,True); -select assert((k_nearest_neighbours )='{1,2,3}', 'Wrong output in classification with k=3') from madlib_knn_result_classification where id = 1; +select assert(array_agg(x)= '{1,2,3}','Wrong output in classification with k=3') from (select unnest(k_nearest_neighbours) as x from madlib_knn_result_classification where id = 1 order by x asc) y; drop table if exists madlib_knn_result_regression; select knn('knn_train_data_reg','data','id','label','knn_test_data','data','id','madlib_knn_result_regression',4,False); @@ -84,7 +84,7 @@ select assert(array_agg(prediction order by id)='{1,1,0.5,1,0.25,0.25}', 'Wrong drop table if exists madlib_knn_result_regression; select 
knn('knn_train_data_reg','data','id','label','knn_test_data','data','id','madlib_knn_result_regression',3,True); -select assert((k_nearest_neighbours )='{1,2,3}', 'Wrong output in regression') from madlib_knn_result_regression where id =1; +select assert(array_agg(x)= '{1,2,3}' , 'Wrong output in regression with k=3') from (select unnest(k_nearest_neighbours) as x from madlib_knn_result_regression where id = 1 order by x asc) y; drop table if exists madlib_knn_result_classification; select knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',False);