Skip to content

Commit

Permalink
[SYSTEMDS-3632] Sparse Reuse of RowVectors
Browse files Browse the repository at this point in the history
I had an issue when reusing sparseRowVectors, where the maximum number
of non zeros in a Sparse Row is set to the number of columns for the
parent SparseMCSR Block. But when reusing the rows in allocations of
append with other sparse blocks some edge cases increased the number
of non zero values above the number of columns in the original matrix.

to fix this i made a few changes inside the Sparse CSR blocks, and while
at it made an mini optimization of the posFIndexGTE(int r, int c), method
to reduce the number of if statements improving overall performance.

Closes #1924
  • Loading branch information
Baunsgaard committed Oct 17, 2023
1 parent 65e31c7 commit 00e68df
Show file tree
Hide file tree
Showing 7 changed files with 203 additions and 47 deletions.
Expand Up @@ -34,7 +34,7 @@ public class DenseBlockFP64 extends DenseBlockDRB

public DenseBlockFP64(int[] dims) {
super(dims);
reset(_rlen, _odims, 0);
resetNoFill(_rlen, _odims);
}

@Override
Expand Down
15 changes: 12 additions & 3 deletions src/main/java/org/apache/sysds/runtime/data/SparseBlockCSR.java
Expand Up @@ -835,9 +835,18 @@ private int internPosFIndexLTE(int r, int c) {
}

@Override
public int posFIndexGTE(int r, int c) {
int index = internPosFIndexGTE(r, c);
return (index>=0) ? index-pos(r) : index;
public final int posFIndexGTE(int r, int c) {
final int pos = pos(r);
final int len = size(r);
final int end = pos + len;

// search for existing col index
int index = Arrays.binarySearch(_indexes, pos, end, c);
if(index < 0)
// search gt col index (see binary search)
index = Math.abs(index + 1);

return (index < end) ? index - pos : -1;
}

private int internPosFIndexGTE(int r, int c) {
Expand Down
35 changes: 27 additions & 8 deletions src/main/java/org/apache/sysds/runtime/data/SparseBlockMCSR.java
Expand Up @@ -49,7 +49,7 @@ public SparseBlockMCSR(SparseBlock sblock)
_rows = new SparseRow[orows.length];
for( int i=0; i<_rows.length; i++ )
if( orows[i] != null )
_rows[i] = new SparseRowVector(orows[i]);
_rows[i] = orows[i].copy(true);
}
//general case SparseBlock
else {
Expand All @@ -58,10 +58,17 @@ public SparseBlockMCSR(SparseBlock sblock)
if( !sblock.isEmpty(i) ) {
int apos = sblock.pos(i);
int alen = sblock.size(i);
_rows[i] = new SparseRowVector(alen);
((SparseRowVector)_rows[i]).setSize(alen);
System.arraycopy(sblock.indexes(i), apos, _rows[i].indexes(), 0, alen);
System.arraycopy(sblock.values(i), apos, _rows[i].values(), 0, alen);
if(alen == 0){
// do nothing
}
else if(alen == 1)
_rows[i] = new SparseRowScalar(sblock.indexes(i)[apos], sblock.values(i)[apos]);
else{
_rows[i] = new SparseRowVector(alen);
((SparseRowVector)_rows[i]).setSize(alen);
System.arraycopy(sblock.indexes(i), apos, _rows[i].indexes(), 0, alen);
System.arraycopy(sblock.values(i), apos, _rows[i].values(), 0, alen);
}
}
}
}
Expand Down Expand Up @@ -183,7 +190,7 @@ public boolean isContiguous() {
}

@Override
public boolean isAllocated(int r) {
public final boolean isAllocated(int r) {
return _rows[r] != null;
}

Expand Down Expand Up @@ -283,8 +290,8 @@ public long size(int rl, int ru, int cl, int cu) {
}

@Override
public boolean isEmpty(int r) {
return (!isAllocated(r) || _rows[r].isEmpty());
public final boolean isEmpty(int r) {
return !isAllocated(r) || _rows[r].isEmpty();
}

@Override
Expand Down Expand Up @@ -426,6 +433,18 @@ public int posFIndexGT(int r, int c) {
_rows[r] = new SparseRowVector(_rows[r]);
return ((SparseRowVector)_rows[r]).searchIndexesFirstGT(c);
}

public void setNnzEstimatePerRow(int nnzPerCol, int nCol){
for(SparseRow s : _rows){
if(s instanceof SparseRowVector){
SparseRowVector sv = (SparseRowVector)s;
sv.setEstimatedNzs(nnzPerCol);
}
else if(s == null){
s = new SparseRowVector(nnzPerCol, nCol);
}
}
}

@Override
public String toString() {
Expand Down
8 changes: 8 additions & 0 deletions src/main/java/org/apache/sysds/runtime/data/SparseRow.java
Expand Up @@ -131,6 +131,14 @@ public abstract class SparseRow implements Serializable
* @param eps epsilon value
*/
public abstract void compact(double eps);

/**
* Make a copy of this row.
*
* @param deep if the copy should be deep
* @return A copy
*/
public abstract SparseRow copy(boolean deep);

@Override
public String toString() {
Expand Down
Expand Up @@ -41,8 +41,8 @@ public int size() {
}

@Override
public boolean isEmpty() {
return (index < 0);
public final boolean isEmpty() {
return index < 0;
}

@Override
Expand Down Expand Up @@ -115,4 +115,9 @@ public int getIndex(){
public double getValue(){
return value;
}

@Override
public SparseRow copy(boolean deep){
return new SparseRowScalar(index, value);
}
}
96 changes: 63 additions & 33 deletions src/main/java/org/apache/sysds/runtime/data/SparseRowVector.java
Expand Up @@ -24,18 +24,30 @@
import org.apache.sysds.runtime.util.SortUtils;
import org.apache.sysds.runtime.util.UtilFunctions;

public final class SparseRowVector extends SparseRow{
/**
* A sparse row vector that is able to grow dynamically as values are appended to it.
*/
public final class SparseRowVector extends SparseRow {
private static final long serialVersionUID = 2971077474424464992L;

//initial capacity of any created sparse row
//WARNING: be aware that this affects the core memory estimates (incl. implicit assumptions)!
/**
* <p>Initial capacity of any created sparse row</p>
* WARNING: be aware that this affects the core memory estimates (incl. implicit assumptions)!
*/
public static final int initialCapacity = 4;

/**
* An estimate of the number of non zero values in this row.
* The estimate is used to set a threshold on how much the array should grow at certain
* lengths to not double the size at all times.
*/
private int estimatedNzs = initialCapacity;
private int maxNzs = Integer.MAX_VALUE;
private int size = 0;
private double[] values = null;
private int[] indexes = null;
/** The current size of the row vector */
private int size;
/** The values contained in the vector, can be allocated larger than needed */
private double[] values;
/** The column indexes of the values contained, can be allocated larger than needed */
private int[] indexes;

public SparseRowVector() {
this(initialCapacity);
Expand All @@ -45,6 +57,7 @@ public SparseRowVector(int capacity) {
estimatedNzs = capacity;
values = new double[capacity];
indexes = new int[capacity];
size = 0;
}

public SparseRowVector(int nnz, double[] v, int vlen) {
Expand All @@ -59,6 +72,12 @@ public SparseRowVector(int nnz, double[] v, int vlen) {
size = nnz;
}

public SparseRowVector(double[] v, int[] i){
values = v;
indexes = i;
size = v.length;
}

/**
* Sparse row vector constructor that take a dense array, and allocate sparsely by ignoring zero values
* @param v The dense row
Expand All @@ -83,11 +102,10 @@ public SparseRowVector(double[] v){
public SparseRowVector(int estnnz, int maxnnz) {
if( estnnz > initialCapacity )
estimatedNzs = estnnz;
maxNzs = maxnnz;
int capacity = ((estnnz<initialCapacity && estnnz>0) ?
estnnz : initialCapacity);
int capacity = initialCapacity;
values = new double[capacity];
indexes = new int[capacity];
size = 0;
}

public SparseRowVector(SparseRow that) {
Expand All @@ -109,8 +127,8 @@ public void setSize(int newsize) {
}

@Override
public boolean isEmpty() {
return (size == 0);
public final boolean isEmpty() {
return size == 0;
}

@Override
Expand Down Expand Up @@ -157,10 +175,14 @@ public void copy(SparseRow that)
@Override
public void reset(int estnns, int maxnns) {
estimatedNzs = estnns;
maxNzs = maxnns;
// maxNzs = maxnns;
size = 0;
}

public void setEstimatedNzs(int estnnz){
estimatedNzs = estnnz;
}

private void recap(int newCap) {
if( newCap<=values.length )
return;
Expand All @@ -179,11 +201,13 @@ private void recap(int newCap) {
*/
private int newCapacity() {
final double currLen = values.length;
final boolean lessThanEstimate = currLen < estimatedNzs;
final double factor = lessThanEstimate ?
SparseBlock.RESIZE_FACTOR1 : SparseBlock.RESIZE_FACTOR2;
//scale length exponentially based on estimated number of non-zeros
final int nextLen = (int)Math.ceil(currLen * ((currLen < estimatedNzs) ?
SparseBlock.RESIZE_FACTOR1 : SparseBlock.RESIZE_FACTOR2));
final int nextLen = (int)Math.ceil(currLen * factor);
//cap at max number of non-zeros with robustness of initial zero
return Math.max(2, Math.min(maxNzs, nextLen));
return Math.max(2, nextLen);
}

@Override
Expand Down Expand Up @@ -391,26 +415,27 @@ public void setIndexRange(int cl, int cu, double[] v, int[] vix, int vpos, int v
}
}

private void resizeAndInsert(int index, int col, double v) {
//allocate new arrays
int newCap = newCapacity();
double[] oldvalues = values;
int[] oldindexes = indexes;
private final void resizeAndInsert(int index, int col, double v) {
final int newCap = newCapacity();
resizeVals(newCap, index, v);
resizeIndex(newCap, index, col);
size++;
}

private final void resizeVals(int newCap, int index, double v){
double[] old = values;
values = new double[newCap];
System.arraycopy(old, 0, values, 0, index);
values[index] = v;
System.arraycopy(old, index, values, index+1, size-index);
}

private final void resizeIndex(int newCap, int index, int col){
int[] old = indexes;
indexes = new int[newCap];

//copy lhs values to new array
System.arraycopy(oldvalues, 0, values, 0, index);
System.arraycopy(oldindexes, 0, indexes, 0, index);

//insert new value
System.arraycopy(old, 0, indexes, 0, index);
indexes[index] = col;
values[index] = v;

//copy rhs values to new array
System.arraycopy(oldvalues, index, values, index+1, size-index);
System.arraycopy(oldindexes, index, indexes, index+1, size-index);
size++;
System.arraycopy(old, index, indexes, index+1, size-index);
}

private void shiftRightAndInsert(int index, int col, double v) {
Expand Down Expand Up @@ -467,4 +492,9 @@ public void compact(double eps) {
}
size = nnz; //adjust row size
}

@Override
public SparseRow copy(boolean deep){
return new SparseRowVector(this);
}
}
@@ -0,0 +1,85 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.sysds.test.component.matrix;

import static org.junit.Assert.assertEquals;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
import org.apache.sysds.runtime.data.SparseBlockCSR;
import org.junit.Test;

public class SparseCSRTest {
protected static final Log LOG = LogFactory.getLog(CompressedMatrixBlock.class.getName());

@Test
public void testGTE() {
int[] rs = new int[] {0, 9};
int[] colInd = new int[] {10, 20, 30, 40, 50, 60, 80, 90, 100};
double[] val = new double[] {1, 1, 1, 1, 1, 1, 1, 1, 1};
SparseBlockCSR b = new SparseBlockCSR(rs, colInd, val, val.length);

assertEquals(0, b.posFIndexGTE(0, 0));
assertEquals(0, b.posFIndexGTE(0, 10));
assertEquals(1, b.posFIndexGTE(0, 11));
assertEquals(7, b.posFIndexGTE(0, 90));
assertEquals(8, b.posFIndexGTE(0, 91));
assertEquals(-1, b.posFIndexGTE(0, 101));
assertEquals(-1, b.posFIndexGTE(0, 10100));

}

@Test
public void testGTE2Rows() {
int[] rs = new int[] {0, 0, 9};
int[] colInd = new int[] {10, 20, 30, 40, 50, 60, 80, 90, 100};
double[] val = new double[] {1, 1, 1, 1, 1, 1, 1, 1, 1};
SparseBlockCSR b = new SparseBlockCSR(rs, colInd, val, val.length);
LOG.error(b);

assertEquals(0, b.posFIndexGTE(1, 0));
assertEquals(0, b.posFIndexGTE(1, 10));
assertEquals(1, b.posFIndexGTE(1, 11));
assertEquals(7, b.posFIndexGTE(1, 90));
assertEquals(8, b.posFIndexGTE(1, 91));
assertEquals(-1, b.posFIndexGTE(1, 101));
assertEquals(-1, b.posFIndexGTE(1, 10100));

}

@Test
public void testGTE2RowsNN() {
int[] rs = new int[] {0, 1, 10};
int[] colInd = new int[] {100, 10, 20, 30, 40, 50, 60, 80, 90, 100};
double[] val = new double[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
SparseBlockCSR b = new SparseBlockCSR(rs, colInd, val, val.length);
LOG.error(b);

assertEquals(0, b.posFIndexGTE(1, 0));
assertEquals(0, b.posFIndexGTE(1, 10));
assertEquals(1, b.posFIndexGTE(1, 11));
assertEquals(7, b.posFIndexGTE(1, 90));
assertEquals(8, b.posFIndexGTE(1, 91));
assertEquals(-1, b.posFIndexGTE(1, 101));
assertEquals(-1, b.posFIndexGTE(1, 10100));

}
}

0 comments on commit 00e68df

Please sign in to comment.