[SYSTEMDS-3213] New builtin for cluster-based quantization

LDE SoSe'24 project Closes #2030.
apache · Jun 4, 2024 · 9ff9e16 · 9ff9e16
1 parent 889db1f
commit 9ff9e16
Show file tree

Hide file tree

Showing 4 changed files with 292 additions and 0 deletions.
diff --git a/scripts/builtin/quantizeByCluster.dml b/scripts/builtin/quantizeByCluster.dml
@@ -0,0 +1,75 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Builtin function that implements product quantization
+#
+# INPUT:
+# ---------------------------------------------------------------------------------------
+# X                             The input matrix to perform product quantization on
+# M                             Number of subspaces
+# k                             Number of vectors in the subcodebooks
+# runs                          Number of runs (with different initial centroids)
+# max_iter                      Maximum number of iterations per run
+# eps                           Tolerance (epsilon) for WCSS change ratio
+# avg_sample_size_per_centroid  Average number of records per centroid in data samples
+# separate                      Cluster subspaces separately. If value is set to true,
+#                               kmeans is run M times, once for each subspace. Otherwise
+#                               kmeans is run only once.
+# seed                          The seed used for initial sampling. If set to -1 random
+#                               seeds are selected.
+# ---------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ------------------------------------------------------------------------------------------
+# codebook  The matrix containing the centroids. If clustered separately, the ith
+#           subcodebook is the ith chunk of size k. The codebook matrix has the dimensions
+#           [k*M x ncol(X)/M].
+# codes     The mapping of vectors to centroids. Each vector of the input matrix X is mapped
+#           onto a vector of codes. The entries in the codes matrix are the indices of
+#           the vectors in the codebook. The codes matrix has the dimensions [nrow(X) x M].
+# ------------------------------------------------------------------------------------------
+
+m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, Integer runs = 10,
+    Integer max_iter = 1000, Double eps = 1e-6, Integer avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Integer seed = -1)
+  return(Matrix[Double] codebook, Matrix[Double] codes)
+{
+  subvector_size = ncol(X) / M
+  #Kmeans is run just once for all subspaces together. Subvectors are mapped to vectors of the codebook of size k*M.
+  #The ith entry of a code vector has a value in [1, k*M].
+  if(!separate) {
+    A = matrix(X, rows= nrow(X) * M, cols=subvector_size)
+    [codebook, B] = kmeans(A, k * M, runs, max_iter, eps, FALSE, avg_sample_size_per_centroid, seed)
+    codes = matrix(B, rows = nrow(B) / M, cols = ncol(B) * M)
+  }
+  #Kmeans is run for every subspace separately. Subvectors are mapped to a subset of k vectors of the codebook.
+  #The ith entry of a code vector has a value in ((i-1)*k, i*k].
+  else {
+    l = k
+    codebook = matrix(1, rows=l*M, cols=subvector_size)
+    codes = matrix(1, rows=nrow(X), cols=M)
+    parfor(i in 1:M, check=0) {
+       [tmp_cbook, tmp_c] = kmeans(X[,(i-1)*subvector_size+1:i*subvector_size], l, runs, max_iter, eps, FALSE, avg_sample_size_per_centroid, seed)
+       codebook[(i-1)*l+1:i*l,] = tmp_cbook
+       offset = matrix((i-1)*l, rows=nrow(codes), cols=1)
+       codes[,i] = tmp_c + offset
+    }
+  }
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -271,6 +271,7 @@ public enum Builtins {
 	PSNR("psnr", true),
 	QR("qr", false, ReturnType.MULTI_RETURN),
 	QUANTILE("quantile", false),
+	QUANTIZEBYCLUSTER("quantizeByCluster", true),
 	RANDOM_FOREST("randomForest", true),
 	RANDOM_FOREST_PREDICT("randomForestPredict", true),
 	RANGE("range", false),

diff --git a/...test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java b/...test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin.part2;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.runtime.meta.MatrixCharacteristics;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameter;
+
+@RunWith(Parameterized.class)
+public class BuiltinQuantizeByClusterTest extends AutomatedTestBase {
+
+	@Parameter
+	public String test_case;
+	@Parameter(1)
+	public int rows;
+	@Parameter(2)
+	public int cols;
+	@Parameter(3)
+	public int clusters;
+	@Parameter(4)
+	public int subspaces;
+	@Parameter(5)
+	public int k;
+	@Parameter(6)
+	public int vectors_per_cluster;
+	@Parameter(7)
+	public boolean quantize_separately;
+
+	private final static String TEST_NAME = "quantizeByCluster";
+	private final static String TEST_DIR = "functions/builtin/";
+	private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinQuantizeByClusterTest.class.getSimpleName() + "/";
+	private final static double eps = 1e-10;
+	private final static int runs = 3;
+	private final static int max_iter = 1000;
+//	private final static double cluster_offset = 0.1;
+
+	@Parameterized.Parameters(name = "{0}: rows={1}, cols={2}, c={3}, subspaces={4}, k={5}, v_per_c={6}, sep={7}")
+	public static Collection<Object[]> data() {
+		return Arrays.asList(new Object[][]{
+			{"sub_cluster", 1024, 64, 12, 8, 12, 40, true},  {"sub_cluster", 1024, 64, 12, 4, 12, 40, true},  {"sub_cluster", 1024, 64, 12, 2, 12, 40, true},
+			{"sub_cluster", 1024, 64, 12, 8, 12, 40, false}, {"sub_cluster", 1024, 64, 12, 4, 12, 40, false}, {"sub_cluster", 1024, 64, 12, 2, 12, 40, false},
+			{"cluster", 1024, 64, 12, 8, 12, 40, true},  {"cluster", 1024, 64, 12, 4, 12, 40, true},  {"cluster", 1024, 64, 12, 2, 12, 40, true},
+			{"cluster", 1024, 64, 20, 8, 12, 40, false}, {"cluster", 1024, 64, 12, 4, 12, 40, false}, {"cluster", 1024, 64, 12, 2, 12, 40, false},
+			{"uniform", 1024, 64, 12, 8, 12, 40, true},  {"uniform", 1024, 64, 12, 4, 12, 40, true},  {"uniform", 1024, 64, 12, 2, 12, 40, true},
+			{"uniform", 1024, 64, 12, 8, 12, 40, false}, {"uniform", 1024, 64, 12, 4, 12, 40, false}, {"uniform", 1024, 64, 12, 2, 12, 40, false},
+			{"normal",  1024, 64, 12, 8, 12, 40, true},  {"normal",  1024, 64, 12, 4, 12, 40, true},  {"normal",  1024, 64, 12, 2, 12, 40, true},
+			{"normal",  1024, 64, 12, 8, 12, 40, false}, {"normal",  1024, 64, 12, 4, 12, 40, false}, {"normal",  1024, 64, 12, 2, 12, 40, false},
+		});
+	}
+
+	@Override
+	public void setUp() {
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME));
+	}
+
+	@Test
+	public void basicTest() {
+		runQuantizeByClusterTest();
+	}
+
+	/*The tests use kmeans clustering as a baseline and check whether the distortion is within
+	a certain threshold.*/
+	private void runQuantizeByClusterTest() {
+		loadTestConfiguration(getTestConfiguration(TEST_NAME));
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[]{"-nvargs", "codes=" + output("codes"), "codebook=" + output("codebook"),
+				"pq_distortion=" + output("pq_distortion"), "k_distortion=" + output("k_distortion"),
+				"clusters=" + clusters, "test_case=" + test_case, "rows=" + rows,
+				"cols=" + cols, "subspaces=" + subspaces, "k=" + k, "runs=" + runs, "max_iter=" + max_iter,
+				"eps=" + eps, "vectors_per_cluster=" + vectors_per_cluster, "sep=" + quantize_separately};
+
+		runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
+
+		// check if output dimensions are correct
+		MatrixCharacteristics meta_codes = readDMLMetaDataFile("codes");
+		MatrixCharacteristics meta_codebook = readDMLMetaDataFile("codebook");
+		Assert.assertTrue("Matrix dimensions should be equal to expected dimensions",
+			meta_codes.getRows() == (long) clusters * vectors_per_cluster 
+			&& meta_codes.getCols() == subspaces);
+		Assert.assertEquals("Centroid dimensions should be equal to expected dimensions",
+			cols / subspaces, meta_codebook.getCols());
+
+		double pq_distortion = readDMLMatrixFromOutputDir("pq_distortion").get(new MatrixValue.CellIndex(1, 1));
+		double k_distortion = readDMLMatrixFromOutputDir("k_distortion").get(new MatrixValue.CellIndex(1, 1));
+
+		//check if distortion is within a threshold
+		if (!test_case.equals("cluster")) {
+			Assert.assertTrue(pq_distortion < 1.2 * k_distortion + 0.1);
+		} else {
+			Assert.assertTrue(pq_distortion < 2 * k_distortion + 0.1);
+		}
+	}
+}
diff --git a/src/test/scripts/functions/builtin/quantizeByCluster.dml b/src/test/scripts/functions/builtin/quantizeByCluster.dml
@@ -0,0 +1,96 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#duplicate rows of matix n times
+duplicate_rows = function (Matrix [Double] V, Integer N)
+  return(Matrix [Double] V)
+{
+  tmp = V
+  for(i in seq(1, N-1, 1)) {
+    tmp = rbind(tmp, V)
+  }
+  V = tmp
+}
+#construct vectors from codes
+construct_vectors = function (Matrix [Double] codes, Matrix [Double] codebook)
+  return(Matrix [Double] vectors)
+{
+  vectors = matrix(0, rows=nrow(codes), cols=ncol(codes)*ncol(codebook))
+  parfor (i in 1:nrow(codes), check=0) {
+    parfor (j in 1:ncol(codes), check=0) {
+      vectors[i, 1 + (j-1)* ncol(codebook): j * ncol(codebook)] = codebook[as.scalar(codes[i, j])]
+    }
+  }
+}
+
+max = 1
+min = -max
+offset = max / 10
+
+subvector_size = $cols / $subspaces
+rows = $clusters * $vectors_per_cluster
+
+# Generate points by concatenating sub_points around sub_clusters
+if($test_case == "sub_cluster") {
+  offset_matrix = rand(rows=rows, cols=$cols, min=-offset, max=offset, pdf="uniform", seed=2)
+  cluster_centers = rand(rows = $clusters, cols = subvector_size, min=min, max=max, seed=2)
+  vectors = matrix(cluster_centers, nrow(cluster_centers), ncol(cluster_centers))
+  for(i in 1:$subspaces-1)  {
+    cluster_centers = rand(rows = $clusters, cols = subvector_size, min=min, max=max, seed=2)
+    vectors = cbind(vectors, cluster_centers)
+  }
+  #ensure correct number of vectors
+  vectors = duplicate_rows(vectors, $vectors_per_cluster)
+  vectors = vectors + offset_matrix
+}
+# Generate points around clusters
+else if ($test_case == "cluster") {
+  cluster_centers = rand(rows = $clusters, cols = $cols, min=min, max=max, pdf="uniform", seed=2)
+  vectors = matrix(cluster_centers, nrow(cluster_centers), ncol(cluster_centers))
+  #ensure correct number of vectors
+  vectors = duplicate_rows(vectors, $vectors_per_cluster)
+  offset_matrix = rand(rows=rows, cols=$cols, min=-offset, max=offset, pdf="uniform", seed=2)
+  vectors = vectors + offset_matrix
+}
+# Generate random points
+else {
+  vectors = rand(rows = rows, cols = $cols, min=min, max=max, pdf=$test_case, seed=2)
+}
+
+[codebook, codes] = quantizeByCluster(vectors, $subspaces, $k, $runs, $max_iter, $eps, $vectors_per_cluster, $sep, 2)
+[k_codebook, k_codes] = kmeans(vectors, $k * $subspaces, $runs, $max_iter, $eps, FALSE, $vectors_per_cluster, 2)
+
+#construct vectors from codes
+pq_result = construct_vectors(codes, codebook)
+k_result = construct_vectors(k_codes, k_codebook)
+
+#calculate distortion
+pq_distortion = colSums(rowSums((vectors - pq_result)^2)) / rows
+k_distortion = colSums(rowSums((vectors - k_result)^2)) / rows
+
+print("Product quantization distortion: " + toString(pq_distortion))
+print("Kmeans distortion: " + toString(k_distortion))
+
+write(codes, $codes)
+write(codebook, $codebook)
+write(pq_distortion, $pq_distortion)
+write(k_distortion, $k_distortion)
+