Skip to content

Commit

Permalink
[SYSTEMDS-3213] New builtin for cluster-based quantization
Browse files Browse the repository at this point in the history
LDE SoSe'24 project
Closes #2030.
  • Loading branch information
canabdulla authored and mboehm7 committed Jun 4, 2024
1 parent 889db1f commit 9ff9e16
Show file tree
Hide file tree
Showing 4 changed files with 292 additions and 0 deletions.
75 changes: 75 additions & 0 deletions scripts/builtin/quantizeByCluster.dml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# Builtin function that implements product quantization
#
# INPUT:
# ---------------------------------------------------------------------------------------
# X The input matrix to perform product quantization on
# M Number of subspaces
# k Number of vectors in the subcodebooks
# runs Number of runs (with different initial centroids)
# max_iter Maximum number of iterations per run
# eps Tolerance (epsilon) for WCSS change ratio
# avg_sample_size_per_centroid Average number of records per centroid in data samples
# separate Cluster subspaces separately. If value is set to true,
# kmeans is run M times, once for each subspace. Otherwise
# kmeans is run only once.
# seed The seed used for initial sampling. If set to -1 random
# seeds are selected.
# ---------------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------------------
# codebook The matrix containing the centroids. If clustered separately, the ith
# subcodebook is the ith chunk of size k. The codebook matrix has the dimensions
# [k*M x ncol(X)/M].
# codes The mapping of vectors to centroids. Each vector of the input matrix X is mapped
# onto a vector of codes. The entries in the codes matrix are the indices of
# the vectors in the codebook. The codes matrix has the dimensions [nrow(X) x M].
# ------------------------------------------------------------------------------------------

m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, Integer runs = 10,
Integer max_iter = 1000, Double eps = 1e-6, Integer avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Integer seed = -1)
return(Matrix[Double] codebook, Matrix[Double] codes)
{
  # Every row of X is cut into M subvectors of equal length. If M does not divide
  # ncol(X), subvector_size would be fractional and the reshape / slicing below
  # would operate on invalid dimensions, so fail early with a clear message.
  if(ncol(X) %% M != 0) {
    stop("quantizeByCluster: the number of columns of X (" + ncol(X)
      + ") must be divisible by the number of subspaces M (" + M + ").")
  }
  subvector_size = ncol(X) / M

  if(!separate) {
    # Kmeans is run just once for all subspaces together: the subvectors of all
    # rows are stacked into one [nrow(X)*M x subvector_size] matrix and clustered
    # into k*M centroids. The ith entry of a code vector has a value in [1, k*M].
    A = matrix(X, rows=nrow(X) * M, cols=subvector_size)
    [codebook, B] = kmeans(A, k * M, runs, max_iter, eps, FALSE, avg_sample_size_per_centroid, seed)
    # fold the stacked assignments back into one row of M codes per input row
    codes = matrix(B, rows=nrow(B) / M, cols=ncol(B) * M)
  }
  else {
    # Kmeans is run for every subspace separately. Subvectors are mapped to a
    # subset of k vectors of the codebook: the ith subcodebook occupies codebook
    # rows ((i-1)*k, i*k], and the ith entry of a code vector lies in that range.
    codebook = matrix(1, rows=k*M, cols=subvector_size)
    codes = matrix(1, rows=nrow(X), cols=M)
    parfor(i in 1:M, check=0) {
      [tmp_cbook, tmp_c] = kmeans(X[,(i-1)*subvector_size+1:i*subvector_size], k, runs, max_iter, eps, FALSE, avg_sample_size_per_centroid, seed)
      codebook[(i-1)*k+1:i*k,] = tmp_cbook
      # shift the local centroid ids by the start offset of the ith subcodebook
      codes[,i] = tmp_c + (i-1)*k
    }
  }
}
1 change: 1 addition & 0 deletions src/main/java/org/apache/sysds/common/Builtins.java
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ public enum Builtins {
PSNR("psnr", true),
QR("qr", false, ReturnType.MULTI_RETURN),
QUANTILE("quantile", false),
QUANTIZEBYCLUSTER("quantizeByCluster", true),
RANDOM_FOREST("randomForest", true),
RANDOM_FOREST_PREDICT("randomForestPredict", true),
RANGE("range", false),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.sysds.test.functions.builtin.part2;

import java.util.Arrays;
import java.util.Collection;

import org.apache.sysds.runtime.matrix.data.MatrixValue;
import org.apache.sysds.runtime.meta.MatrixCharacteristics;
import org.apache.sysds.test.AutomatedTestBase;
import org.apache.sysds.test.TestConfiguration;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameter;

/**
 * Parameterized test for the {@code quantizeByCluster} builtin.
 *
 * <p>Each parameter set runs the DML test script once, checks the dimensions of the
 * written {@code codes} and {@code codebook} matrices, and compares the distortion
 * reported for the quantization against the distortion reported for a kmeans baseline.
 */
@RunWith(Parameterized.class)
public class BuiltinQuantizeByClusterTest extends AutomatedTestBase {

	/** Data generation mode, forwarded to the script as {@code test_case}. */
	@Parameter
	public String test_case;
	/**
	 * Forwarded to the script as {@code rows}. The dimension check below does not use it;
	 * it expects {@code clusters * vectors_per_cluster} rows instead.
	 */
	@Parameter(1)
	public int rows;
	/** Number of columns of the generated input, forwarded as {@code cols}. */
	@Parameter(2)
	public int cols;
	/** Forwarded to the script as {@code clusters}. */
	@Parameter(3)
	public int clusters;
	/** Number of subspaces, forwarded as {@code subspaces}; expected column count of the codes. */
	@Parameter(4)
	public int subspaces;
	/** Forwarded to the script as {@code k}. */
	@Parameter(5)
	public int k;
	/** Forwarded to the script as {@code vectors_per_cluster}. */
	@Parameter(6)
	public int vectors_per_cluster;
	/** Forwarded to the script as {@code sep}. */
	@Parameter(7)
	public boolean quantize_separately;

	private static final String TEST_NAME = "quantizeByCluster";
	private static final String TEST_DIR = "functions/builtin/";
	private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinQuantizeByClusterTest.class.getSimpleName() + "/";
	private static final double eps = 1e-10;
	private static final int runs = 3;
	private static final int max_iter = 1000;

	@Parameterized.Parameters(name = "{0}: rows={1}, cols={2}, c={3}, subspaces={4}, k={5}, v_per_c={6}, sep={7}")
	public static Collection<Object[]> data() {
		return Arrays.asList(new Object[][]{
			{"sub_cluster", 1024, 64, 12, 8, 12, 40, true}, {"sub_cluster", 1024, 64, 12, 4, 12, 40, true}, {"sub_cluster", 1024, 64, 12, 2, 12, 40, true},
			{"sub_cluster", 1024, 64, 12, 8, 12, 40, false}, {"sub_cluster", 1024, 64, 12, 4, 12, 40, false}, {"sub_cluster", 1024, 64, 12, 2, 12, 40, false},
			{"cluster", 1024, 64, 12, 8, 12, 40, true}, {"cluster", 1024, 64, 12, 4, 12, 40, true}, {"cluster", 1024, 64, 12, 2, 12, 40, true},
			{"cluster", 1024, 64, 20, 8, 12, 40, false}, {"cluster", 1024, 64, 12, 4, 12, 40, false}, {"cluster", 1024, 64, 12, 2, 12, 40, false},
			{"uniform", 1024, 64, 12, 8, 12, 40, true}, {"uniform", 1024, 64, 12, 4, 12, 40, true}, {"uniform", 1024, 64, 12, 2, 12, 40, true},
			{"uniform", 1024, 64, 12, 8, 12, 40, false}, {"uniform", 1024, 64, 12, 4, 12, 40, false}, {"uniform", 1024, 64, 12, 2, 12, 40, false},
			{"normal", 1024, 64, 12, 8, 12, 40, true}, {"normal", 1024, 64, 12, 4, 12, 40, true}, {"normal", 1024, 64, 12, 2, 12, 40, true},
			{"normal", 1024, 64, 12, 8, 12, 40, false}, {"normal", 1024, 64, 12, 4, 12, 40, false}, {"normal", 1024, 64, 12, 2, 12, 40, false},
		});
	}

	@Override
	public void setUp() {
		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME));
	}

	@Test
	public void basicTest() {
		runQuantizeByClusterTest();
	}

	/**
	 * Runs the DML test script for the current parameter set and validates its outputs.
	 * The test uses kmeans clustering as a baseline and checks whether the distortion of
	 * the quantization is within a threshold relative to the baseline distortion.
	 */
	private void runQuantizeByClusterTest() {
		loadTestConfiguration(getTestConfiguration(TEST_NAME));
		String HOME = SCRIPT_DIR + TEST_DIR;
		fullDMLScriptName = HOME + TEST_NAME + ".dml";
		programArgs = new String[]{"-nvargs", "codes=" + output("codes"), "codebook=" + output("codebook"),
			"pq_distortion=" + output("pq_distortion"), "k_distortion=" + output("k_distortion"),
			"clusters=" + clusters, "test_case=" + test_case, "rows=" + rows,
			"cols=" + cols, "subspaces=" + subspaces, "k=" + k, "runs=" + runs, "max_iter=" + max_iter,
			"eps=" + eps, "vectors_per_cluster=" + vectors_per_cluster, "sep=" + quantize_separately};

		runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);

		// check if output dimensions are correct; one assertion per dimension so that a
		// failure reports which dimension is off together with expected and actual value
		MatrixCharacteristics meta_codes = readDMLMetaDataFile("codes");
		MatrixCharacteristics meta_codebook = readDMLMetaDataFile("codebook");
		Assert.assertEquals("Number of code rows should be equal to expected number of vectors",
			(long) clusters * vectors_per_cluster, meta_codes.getRows());
		Assert.assertEquals("Number of code columns should be equal to the number of subspaces",
			subspaces, meta_codes.getCols());
		Assert.assertEquals("Centroid dimensions should be equal to expected dimensions",
			cols / subspaces, meta_codebook.getCols());

		double pq_distortion = readDMLMatrixFromOutputDir("pq_distortion").get(new MatrixValue.CellIndex(1, 1));
		double k_distortion = readDMLMatrixFromOutputDir("k_distortion").get(new MatrixValue.CellIndex(1, 1));

		// check if distortion is within a threshold; the "cluster" case is granted a
		// larger factor over the kmeans baseline than all other cases
		double factor = "cluster".equals(test_case) ? 2 : 1.2;
		Assert.assertTrue("Quantization distortion " + pq_distortion + " should be below "
			+ factor + " * " + k_distortion + " + 0.1",
			pq_distortion < factor * k_distortion + 0.1);
	}
}
96 changes: 96 additions & 0 deletions src/test/scripts/functions/builtin/quantizeByCluster.dml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

#duplicate the rows of a matrix N times
duplicate_rows = function (Matrix [Double] V, Integer N)
  return(Matrix [Double] V)
{
  # Start from one copy of the input and append the unchanged input block once
  # per loop iteration, so for N >= 2 the result holds N stacked copies of V.
  stacked = V
  for(step in seq(1, N-1, 1)) {
    stacked = rbind(stacked, V)
  }
  # V is both the input parameter and the declared return variable
  V = stacked
}
#construct vectors from codes
construct_vectors = function (Matrix [Double] codes, Matrix [Double] codebook)
  return(Matrix [Double] vectors)
{
  # Each code selects one codebook row; the selected rows of one code vector
  # are written side by side, so an output row is ncol(codes) subvectors wide.
  width = ncol(codebook)
  vectors = matrix(0, rows=nrow(codes), cols=ncol(codes) * width)
  parfor (r in 1:nrow(codes), check=0) {
    parfor (s in 1:ncol(codes), check=0) {
      # column range of the s-th subvector inside the reconstructed row
      last = s * width
      first = last - width + 1
      centroid = as.scalar(codes[r, s])
      vectors[r, first:last] = codebook[centroid,]
    }
  }
}

# Test driver: generates input vectors according to $test_case, quantizes them with
# quantizeByCluster and, as a baseline, with plain kmeans, reconstructs the vectors
# from both results and writes the codes, the codebook and both distortions.

# value range [min, max] used for generated centers / points; the noise added
# around centers is drawn from [-offset, offset]
max = 1
min = -max
offset = max / 10

subvector_size = $cols / $subspaces
# the number of generated vectors is derived from the cluster settings;
# no $rows argument is read anywhere in this script
rows = $clusters * $vectors_per_cluster

# Generate points by concatenating sub_points around sub_clusters
if($test_case == "sub_cluster") {
offset_matrix = rand(rows=rows, cols=$cols, min=-offset, max=offset, pdf="uniform", seed=2)
cluster_centers = rand(rows = $clusters, cols = subvector_size, min=min, max=max, seed=2)
# reshape to the same dimensions, i.e. the first block of columns equals cluster_centers
vectors = matrix(cluster_centers, nrow(cluster_centers), ncol(cluster_centers))
# NOTE(review): presumably parsed as 1:($subspaces-1), so that together with the
# first block above $subspaces blocks are concatenated - confirm
for(i in 1:$subspaces-1) {
# NOTE(review): same seed=2 as for the first block, so every appended block
# presumably repeats the same centers - confirm that this is intended
cluster_centers = rand(rows = $clusters, cols = subvector_size, min=min, max=max, seed=2)
vectors = cbind(vectors, cluster_centers)
}
#ensure correct number of vectors
vectors = duplicate_rows(vectors, $vectors_per_cluster)
vectors = vectors + offset_matrix
}
# Generate points around clusters
else if ($test_case == "cluster") {
cluster_centers = rand(rows = $clusters, cols = $cols, min=min, max=max, pdf="uniform", seed=2)
vectors = matrix(cluster_centers, nrow(cluster_centers), ncol(cluster_centers))
#ensure correct number of vectors
vectors = duplicate_rows(vectors, $vectors_per_cluster)
offset_matrix = rand(rows=rows, cols=$cols, min=-offset, max=offset, pdf="uniform", seed=2)
vectors = vectors + offset_matrix
}
# Generate random points
else {
# any other $test_case value is passed on directly as the pdf of rand
vectors = rand(rows = rows, cols = $cols, min=min, max=max, pdf=$test_case, seed=2)
}

# quantization under test and kmeans baseline on the full vectors with
# $k * $subspaces centroids; both calls use the fixed seed 2
[codebook, codes] = quantizeByCluster(vectors, $subspaces, $k, $runs, $max_iter, $eps, $vectors_per_cluster, $sep, 2)
[k_codebook, k_codes] = kmeans(vectors, $k * $subspaces, $runs, $max_iter, $eps, FALSE, $vectors_per_cluster, 2)

#construct vectors from codes
# NOTE(review): for the baseline this assumes k_codes has a single column and
# k_codebook has $cols columns, so k_result matches the shape of vectors - confirm
pq_result = construct_vectors(codes, codebook)
k_result = construct_vectors(k_codes, k_codebook)

#calculate distortion
# sum of squared differences over all cells, divided by the number of vectors
pq_distortion = colSums(rowSums((vectors - pq_result)^2)) / rows
k_distortion = colSums(rowSums((vectors - k_result)^2)) / rows

print("Product quantization distortion: " + toString(pq_distortion))
print("Kmeans distortion: " + toString(k_distortion))

write(codes, $codes)
write(codebook, $codebook)
write(pq_distortion, $pq_distortion)
write(k_distortion, $k_distortion)

0 comments on commit 9ff9e16

Please sign in to comment.