From 419cec9f2838be1eb34b4e2f7096c86e5c3bafa2 Mon Sep 17 00:00:00 2001
From: baunsgaard
Date: Sun, 21 Nov 2021 18:16:08 +0100
Subject: [PATCH] [SYSTEMDS-3226] PFOR column group

start of ColGroupPFOR
progress on PFOR now with many of the aggregates
fix docs
1960 tests to go.
1790 tests left
1284 left ... but added 644 failures
compression updates
7 errors, 8 failures left
7 errors left
Fixed all component tests
move morphing part out
SDC 10x faster left
getting close
40 errors left
---
 .../compress/CompressedMatrixBlock.java       |  52 +-
 .../CompressedMatrixBlockFactory.java         |   4 -
 .../runtime/compress/colgroup/AColGroup.java  |  71 +-
 .../colgroup/AColGroupCompressed.java         |  37 +-
 .../compress/colgroup/AColGroupValue.java     |  73 +--
 .../colgroup/AMorphingMMColGroup.java         | 101 +++
 .../runtime/compress/colgroup/APreAgg.java    |  50 +-
 .../compress/colgroup/ColGroupConst.java      |  92 ++-
 .../compress/colgroup/ColGroupDDC.java        |  46 +-
 .../compress/colgroup/ColGroupEmpty.java      |  44 +-
 .../compress/colgroup/ColGroupFactory.java    |  42 +-
 .../runtime/compress/colgroup/ColGroupIO.java |   2 +
 .../compress/colgroup/ColGroupOLE.java        | 182 +++---
 .../compress/colgroup/ColGroupPFOR.java       | 386 +++++++++++
 .../compress/colgroup/ColGroupRLE.java        | 209 ++++--
 .../compress/colgroup/ColGroupSDC.java        | 318 ++++-----
 .../compress/colgroup/ColGroupSDCSingle.java  | 287 +++++----
 .../colgroup/ColGroupSDCSingleZeros.java      | 388 +++++++----
 .../compress/colgroup/ColGroupSDCZeros.java   | 487 ++++++++------
 .../compress/colgroup/ColGroupUtils.java      |  65 ++
 .../colgroup/dictionary/ADictionary.java      | 210 +++++-
 .../colgroup/dictionary/Dictionary.java       | 308 +++++++--
 .../dictionary/DictionaryFactory.java         |   6 +
 .../dictionary/MatrixBlockDictionary.java     | 604 +++++++++++++++++-
 .../colgroup/dictionary/QDictionary.java      | 144 ++++-
 .../insertionsort/MaterializeSort.java        |   2 +-
 .../compress/colgroup/mapping/AMapToData.java |  73 ++-
 .../compress/colgroup/mapping/MapToBit.java   |  33 +-
 .../compress/colgroup/mapping/MapToByte.java  |  41 +-
 .../compress/colgroup/mapping/MapToChar.java  |  55 +-
 .../colgroup/mapping/MapToFactory.java        |   2 +-
 .../compress/colgroup/mapping/MapToInt.java   |  33 +-
 .../compress/colgroup/offset/AIterator.java   |  52 +-
 .../compress/colgroup/offset/AOffset.java     | 262 +++++++-
 .../compress/colgroup/offset/OffsetByte.java  | 342 +++++++++-
 .../compress/colgroup/offset/OffsetChar.java  | 112 +++-
 .../colgroup/offset/OffsetFactory.java        |  32 +-
 .../runtime/compress/lib/CLALibAppend.java    |   9 +-
 .../compress/lib/CLALibBinaryCellOp.java      |  40 +-
 .../runtime/compress/lib/CLALibCompAgg.java   |   2 +-
 .../compress/lib/CLALibDecompress.java        | 194 ++++--
 .../compress/lib/CLALibLeftMultBy.java        |  95 +--
 .../compress/lib/CLALibRightMultBy.java       |   4 +-
 .../runtime/compress/lib/CLALibSlice.java     | 105 +++
 .../runtime/compress/lib/CLALibUtils.java     |  58 +-
 .../java/org/apache/sysds/test/TestUtils.java |   2 +-
 .../compress/CompressedMatrixTest.java        |  11 +
 .../compress/CompressedTestBase.java          |  41 +-
 .../compress/ExtendedMatrixTests.java         |   7 +-
 .../component/compress/TestConstants.java     |   6 +-
 .../mapping/MappingPreAggregateTests.java     |  84 ++-
 .../compress/mapping/MappingTests.java        |  40 +-
 .../compress/offset/OffsetNegativeTests.java  |  92 ---
 .../compress/offset/OffsetSingleTests.java    |  24 +-
 .../offset/OffsetTestPreAggregate.java        | 460 +++++++++++++
 .../compress/offset/OffsetTests.java          | 231 ++++++-
 .../offset/OffsetTestsDefaultConstructor.java | 116 ++++
 57 files changed, 5194 insertions(+), 1674 deletions(-)
 create mode 100644
src/main/java/org/apache/sysds/runtime/compress/colgroup/AMorphingMMColGroup.java create mode 100644 src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPFOR.java create mode 100644 src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUtils.java create mode 100644 src/main/java/org/apache/sysds/runtime/compress/lib/CLALibSlice.java delete mode 100644 src/test/java/org/apache/sysds/test/component/compress/offset/OffsetNegativeTests.java create mode 100644 src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestPreAggregate.java create mode 100644 src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestsDefaultConstructor.java diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java index c8bdd0a45cb..548e5e931b4 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java @@ -55,6 +55,7 @@ import org.apache.sysds.runtime.compress.lib.CLALibReExpand; import org.apache.sysds.runtime.compress.lib.CLALibRightMultBy; import org.apache.sysds.runtime.compress.lib.CLALibScalar; +import org.apache.sysds.runtime.compress.lib.CLALibSlice; import org.apache.sysds.runtime.compress.lib.CLALibSquash; import org.apache.sysds.runtime.compress.lib.CLALibUnary; import org.apache.sysds.runtime.controlprogram.caching.CacheBlock; @@ -691,61 +692,14 @@ public void setOverlapping(boolean overlapping) { @Override public MatrixBlock slice(int rl, int ru, int cl, int cu, boolean deep, CacheBlock ret) { validateSliceArgument(rl, ru, cl, cu); - MatrixBlock tmp; - if(rl == ru && cl == cu) { - // get a single index, and return in a matrixBlock - tmp = new MatrixBlock(1, 1, 0); - tmp.appendValue(0, 0, getValue(rl, cl)); - return tmp; - } - else if(rl == 0 && ru == getNumRows() - 1) { - tmp = sliceColumns(cl, cu); - tmp.recomputeNonZeros(); - return tmp; - } - else if(cl == 0 && cu == getNumColumns() - 1) { - // Row Slice. Potential optimization if the slice contains enough rows. - // +1 since the implementation arguments for slice is inclusive values for ru - // and cu. It is not inclusive in decompression, and construction of MatrixBlock. - tmp = new MatrixBlock(ru + 1 - rl, getNumColumns(), false).allocateDenseBlock(); - for(AColGroup g : getColGroups()) - g.decompressToBlock(tmp, rl, ru + 1, -rl, 0); - tmp.recomputeNonZeros(); - tmp.examSparsity(); - return tmp; - } - else { - // In the case where an internal matrix is sliced out, then first slice out the - // columns to an compressed intermediate. - tmp = sliceColumns(cl, cu); - // Then call slice recursively, to do the row slice. - // Since we do not copy the index structure but simply maintain a pointer to the - // original this is fine. 
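An aside on the slicing strategy this hunk consolidates into CLALibSlice: slicing columns out of a compressed matrix can be shallow, because the sliced groups keep pointing at the original index structures, while slicing rows forces values to be materialized. A minimal self-contained sketch of that asymmetry, with hypothetical toy types rather than the SystemDS classes:

// ToySlice.java -- illustrative only; names and types are hypothetical,
// not the SystemDS API. Column slices share the backing data (cheap),
// row slices copy values out (a decompression).
public class ToySlice {
	static final class ToyGroup {
		final int[] cols; // column indexes covered by this group
		final double[][] data; // shared backing data, never copied on column slice
		ToyGroup(int[] cols, double[][] data) { this.cols = cols; this.data = data; }

		// Shallow column slice: narrow the column set, keep a pointer to the data.
		ToyGroup sliceColumns(int cl, int cuExclusive) {
			int[] outCols = java.util.Arrays.copyOfRange(cols, cl, cuExclusive);
			return new ToyGroup(outCols, data); // no copy of the backing data
		}

		// Row slice: values must be materialized (the expensive direction).
		double[][] sliceRows(int rl, int ruExclusive) {
			double[][] out = new double[ruExclusive - rl][cols.length];
			for(int r = rl; r < ruExclusive; r++)
				for(int c = 0; c < cols.length; c++)
					out[r - rl][c] = data[r][cols[c]];
			return out;
		}
	}

	public static void main(String[] args) {
		double[][] m = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
		ToyGroup g = new ToyGroup(new int[] {0, 1, 2}, m);
		// mixed slice: columns first (shallow), then rows (materializing)
		double[][] sub = g.sliceColumns(1, 3).sliceRows(0, 2);
		System.out.println(java.util.Arrays.deepToString(sub)); // [[2.0, 3.0], [5.0, 6.0]]
	}
}

A mixed slice therefore does the cheap column slice first and only then the materializing row slice, which is the order the removed code below (and its replacement in CLALibSlice) uses.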
- tmp = tmp.slice(rl, ru, 0, tmp.getNumColumns() - 1, ret); - return tmp; - } - } - - private CompressedMatrixBlock sliceColumns(int cl, int cu) { - CompressedMatrixBlock ret = new CompressedMatrixBlock(this.getNumRows(), cu + 1 - cl); - List newColGroups = new ArrayList<>(); - for(AColGroup grp : getColGroups()) { - AColGroup slice = grp.sliceColumns(cl, cu + 1); - if(slice != null) - newColGroups.add(slice); - } - ret.allocateColGroupList(newColGroups); - ret.recomputeNonZeros(); - ret.overlappingColGroups = this.isOverlapping(); - return ret; + return CLALibSlice.slice(this, rl, ru, cl, cu, deep); } @Override public void slice(ArrayList outlist, IndexRange range, int rowCut, int colCut, int blen, int boundaryRlen, int boundaryClen) { - printDecompressWarning( + MatrixBlock tmp = getUncompressed( "slice for distribution to spark. (Could be implemented such that it does not decompress)"); - MatrixBlock tmp = getUncompressed(); tmp.slice(outlist, range, rowCut, colCut, blen, boundaryRlen, boundaryClen); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java index 42ea6a711e9..97f6f0975d1 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java @@ -250,10 +250,6 @@ else if(mb.isEmpty()) { if(res == null) return abortCompression(); - if(compSettings.isInSparkInstruction) { - // clear soft reference to uncompressed block in case of spark. - res.clearSoftReferenceToDecompressed(); - } return new ImmutablePair<>(res, _stats); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java index 27a29cb945d..d46611aa96a 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java @@ -48,7 +48,7 @@ public abstract class AColGroup implements Serializable { /** Public super types of compression ColGroups supported */ public enum CompressionType { - UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC + UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, PFOR, } /** @@ -57,7 +57,7 @@ public enum CompressionType { * Protected such that outside the ColGroup package it should be unknown which specific subtype is used. */ protected enum ColGroupType { - UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCSingle, SDCSingleZeros, SDCZeros; + UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCSingle, SDCSingleZeros, SDCZeros, PFOR; } /** The ColGroup Indexes contained in the ColGroup */ @@ -132,14 +132,27 @@ public long estimateInMemorySize() { } /** - * Decompress the contents of the column group into the target matrix,. + * Decompress a range of rows into a sparse block * - * @param target A matrix block where the columns covered by this column group have not yet been filled in. - * @param rl Row to start decompression from - * @param ru Row to end decompression at (not inclusive) + * Note that this is using append, so the sparse column indexes need to be sorted afterwards. 
+ *
+ * @param sb Sparse Target block
+ * @param rl Row to start at
+ * @param ru Row to end at
+ */
+ public final void decompressToSparseBlock(SparseBlock sb, int rl, int ru) {
+ decompressToSparseBlock(sb, rl, ru, 0, 0);
+ }
+
+ /**
+ * Decompress a range of rows into a dense block
+ *
+ * @param db Dense target block
+ * @param rl Row to start at
+ * @param ru Row to end at
 */
- public final void decompressToBlock(MatrixBlock target, int rl, int ru) {
- decompressToBlock(target, rl, ru, 0, 0);
+ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) {
+ decompressToDenseBlock(db, rl, ru, 0, 0);
 }

 /**
@@ -326,33 +339,29 @@ public double get(int r, int c) {
 protected abstract ColGroupType getColGroupType();

 /**
- * Decompress the contents of the column group without counting non zeros
+ * Decompress into the DenseBlock. (no NNZ handling)
 *
- * The offsets helps us decompress into specific target areas of the output matrix.
- *
- * If OffR and OffC is 0, then decompression output starts at row offset equal to rl,
+ * @param db Target DenseBlock
+ * @param rl Row to start decompression from
+ * @param ru Row to end decompression at
+ * @param offR Row offset into the target to decompress
+ * @param offC Column offset into the target to decompress
+ */
+ public abstract void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC);
+
+ /**
+ * Decompress into the SparseBlock. (no NNZ handling)
 *
- * If for instance a MiniBatch of rows 10 to 15, then target would be 5 rows high and arguments would look like:
- *
- * cg.decompressToBlock(target, 10, 15, -10, 0)
+ * Note this method allows calls to append, since it is assumed that the sparse column indexes are sorted
+ * afterwards
 *
- * @param target a matrix block where the columns covered by this column group have not yet been filled in.
- * @param rl Row to start decompression at.
- * @param ru Row to end decompression at (not inclusive).
- * @param offR RowOffset into target to assign from.
- * @param offC ColumnOffset into the target matrix to assign from.
+ * @param sb Target SparseBlock
+ * @param rl Row to start decompression from
+ * @param ru Row to end decompression at
+ * @param offR Row offset into the target to decompress
+ * @param offC Column offset into the target to decompress
 */
- public final void decompressToBlock(MatrixBlock target, int rl, int ru, int offR, int offC){
- if(target.isInSparseFormat())
- decompressToSparseBlock(target.getSparseBlock(), rl, ru, offR, offC);
- else
- decompressToDenseBlock(target.getDenseBlock(), rl, ru, offR, offC);
- }
-
-
- protected abstract void decompressToDenseBlock(DenseBlock db, int rl, int ru,int offR, int offC);
-
- protected abstract void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC);
+ public abstract void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC);

 /**
 * Right matrix multiplication with this column group.
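A note on the append contract mentioned above: several column groups may write into the same sparse row, each covering its own subset of columns, so values arrive out of column order and the row is only valid after a final sort. A small self-contained sketch of that invariant, using plain arrays instead of SparseBlock (all names hypothetical):

import java.util.Arrays;

// SparseRowDemo.java -- illustrative only. Two "column groups" append their
// (columnIndex, value) pairs to one sparse row; because the groups interleave
// columns, the row only becomes valid after a sort by column index.
public class SparseRowDemo {
	public static void main(String[] args) {
		int[] idx = new int[4];
		double[] val = new double[4];
		int size = 0;

		// group A covers columns {0, 2}, group B covers columns {1, 3}
		int[][] groupCols = {{0, 2}, {1, 3}};
		double[][] groupVals = {{1.0, 3.0}, {2.0, 4.0}};
		for(int g = 0; g < 2; g++)
			for(int j = 0; j < 2; j++) { // append, unsorted across groups
				idx[size] = groupCols[g][j];
				val[size] = groupVals[g][j];
				size++;
			}

		// after all groups appended: indexes are [0, 2, 1, 3] -- not sorted;
		// sort the (idx, val) pairs to restore the sparse-row invariant
		for(int i = 1; i < size; i++) { // insertion sort of index/value pairs
			int ci = idx[i]; double cv = val[i]; int j = i - 1;
			for(; j >= 0 && idx[j] > ci; j--) { idx[j + 1] = idx[j]; val[j + 1] = val[j]; }
			idx[j + 1] = ci; val[j + 1] = cv;
		}
		System.out.println(Arrays.toString(idx) + " / " + Arrays.toString(val));
		// [0, 1, 2, 3] / [1.0, 2.0, 3.0, 4.0]
	}
}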
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupCompressed.java index 106a2df0677..90cd5c94e9a 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupCompressed.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupCompressed.java @@ -55,11 +55,15 @@ protected AColGroupCompressed(int[] colIndices) { protected abstract void computeColMxx(double[] c, Builtin builtin); - protected abstract void computeSum(double[] c, int nRows, boolean square); + protected abstract void computeSum(double[] c, int nRows); - protected abstract void computeRowSums(double[] c, boolean square, int rl, int ru); + protected abstract void computeRowSums(double[] c, int rl, int ru); - protected abstract void computeColSums(double[] c, int nRows, boolean square); + protected abstract void computeSumSq(double[] c, int nRows); + + protected abstract void computeRowSumsSq(double[] c, int rl, int ru); + + protected abstract void computeColSumsSq(double[] c, int nRows); protected abstract void computeRowMxx(double[] c, Builtin builtin, int rl, int ru); @@ -79,22 +83,27 @@ public double getMax() { return computeMxx(Double.NEGATIVE_INFINITY, Builtin.getBuiltinFnObject(BuiltinCode.MAX)); } - @Override - public void computeColSums(double[] c, int nRows) { - computeColSums(c, nRows, false); - } - @Override public final void unaryAggregateOperations(AggregateUnaryOperator op, double[] c, int nRows, int rl, int ru) { final ValueFunction fn = op.aggOp.increOp.fn; if(fn instanceof Plus || fn instanceof KahanPlus || fn instanceof KahanPlusSq) { boolean square = fn instanceof KahanPlusSq; - if(op.indexFn instanceof ReduceAll) - computeSum(c, nRows, square); - else if(op.indexFn instanceof ReduceCol) - computeRowSums(c, square, rl, ru); - else if(op.indexFn instanceof ReduceRow) - computeColSums(c, nRows, square); + if(square){ + if(op.indexFn instanceof ReduceAll) + computeSumSq(c, nRows); + else if(op.indexFn instanceof ReduceCol) + computeRowSumsSq(c, rl, ru); + else if(op.indexFn instanceof ReduceRow) + computeColSumsSq(c, nRows); + } + else{ + if(op.indexFn instanceof ReduceAll) + computeSum(c, nRows); + else if(op.indexFn instanceof ReduceCol) + computeRowSums(c, rl, ru); + else if(op.indexFn instanceof ReduceRow) + computeColSums(c, nRows); + } } else if(fn instanceof Multiply) { if(op.indexFn instanceof ReduceAll) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupValue.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupValue.java index 067fa6f20f9..34abf61b05d 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupValue.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupValue.java @@ -38,7 +38,6 @@ import org.apache.sysds.runtime.functionobjects.Builtin; import org.apache.sysds.runtime.matrix.data.LibMatrixMult; import org.apache.sysds.runtime.matrix.data.MatrixBlock; -import org.apache.sysds.runtime.matrix.operators.ScalarOperator; /** * Base class for column groups encoded with value dictionary. This include column groups such as DDC OLE and RLE. 
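For context on the computeSum/computeSumSq split in the hunk above: dropping the boolean `square` flag gives each aggregate its own specialized loop, and for dictionary-encoded groups both aggregates reduce to a pass over the distinct tuples weighted by their occurrence counts, never over the rows. A minimal sketch of that identity (plain arrays, hypothetical names, not the SystemDS dictionary API):

// DictAggDemo.java -- illustrative only. For a dictionary-encoded column group:
//   sum   = sum_k counts[k] * dict[k][c]
//   sumSq = sum_k counts[k] * dict[k][c]^2
public class DictAggDemo {
	public static void main(String[] args) {
		double[] dict = {1.0, 5.0, 2.0, 0.5}; // 2 tuples x 2 columns, row-major
		int[] counts = {3, 7}; // tuple 0 occurs in 3 rows, tuple 1 in 7 rows
		int nCol = 2;

		double sum = 0, sumSq = 0;
		for(int k = 0; k < counts.length; k++)
			for(int c = 0; c < nCol; c++) {
				double v = dict[k * nCol + c];
				sum += counts[k] * v;
				sumSq += counts[k] * v * v;
			}
		// equivalent to scanning all 10 rows of the decompressed group
		System.out.println(sum + " " + sumSq); // 35.5 107.75
	}
}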
@@ -171,7 +170,7 @@ protected abstract void decompressToSparseBlockDenseDictionary(SparseBlock ret, double[] values); @Override - public final int getNumValues() { + public int getNumValues() { return _dict.getNumberOfValues(_colIndexes.length); } @@ -286,15 +285,14 @@ private double[] rightMMPreAggSparse(int numVals, SparseBlock b, int[] aggregate } @Override - protected final double computeMxx(double c, Builtin builtin) { + protected double computeMxx(double c, Builtin builtin) { if(_zeros) c = builtin.execute(c, 0); return _dict.aggregate(c, builtin); - } @Override - protected final void computeColMxx(double[] c, Builtin builtin) { + protected void computeColMxx(double[] c, Builtin builtin) { if(_zeros) for(int x = 0; x < _colIndexes.length; x++) c[_colIndexes[x]] = builtin.execute(c[_colIndexes[x]], 0); @@ -302,40 +300,6 @@ protected final void computeColMxx(double[] c, Builtin builtin) { _dict.aggregateCols(c, builtin, _colIndexes); } - /** - * Method for use by subclasses. Applies a scalar operation to the value metadata stored in the dictionary. - * - * @param op scalar operation to perform - * @return transformed copy of value metadata for this column group - */ - protected final ADictionary applyScalarOp(ScalarOperator op) { - return _dict.clone().inplaceScalarOp(op); - } - - /** - * Method for use by subclasses. Applies a scalar operation to the value metadata stored in the dictionary. This - * specific method is used in cases where an new entry is to be added in the dictionary. - * - * Method should only be called if the newVal is not 0! Also the newVal should already have the operator applied. - * - * @param op The Operator to apply to the underlying data. - * @param newVal The new Value to append to the underlying data. - * @param numCols The number of columns in the ColGroup, to specify how many copies of the newVal should be appended. - * @return The new Dictionary containing the values. 
- */ - protected final ADictionary applyScalarOp(ScalarOperator op, double newVal, int numCols) { - return _dict.applyScalarOp(op, newVal, numCols); - } - - protected static double[] allocDVector(int len, boolean reset) { - return new double[len]; - } - - protected static int[] allocIVector(int len, boolean reset) { - LOG.error("deprecated allocIVector"); - return new int[len + 1]; - } - @Override public void readFields(DataInput in) throws IOException { super.readFields(in); @@ -362,16 +326,23 @@ public long getExactSizeOnDisk() { public abstract int[] getCounts(int[] out); @Override - protected final void computeSum(double[] c, int nRows, boolean square) { - if(square) - c[0] += _dict.sumsq(getCounts(), _colIndexes.length); - else - c[0] += _dict.sum(getCounts(), _colIndexes.length); + protected void computeSum(double[] c, int nRows) { + c[0] += _dict.sum(getCounts(), _colIndexes.length); + } + + @Override + public void computeColSums(double[] c, int nRows) { + _dict.colSum(c, getCounts(), _colIndexes); + } + + @Override + protected void computeSumSq(double[] c, int nRows) { + c[0] += _dict.sumSq(getCounts(), _colIndexes.length); } @Override - protected final void computeColSums(double[] c, int nRows, boolean square) { - _dict.colSum(c, getCounts(), _colIndexes, square); + protected void computeColSumsSq(double[] c, int nRows) { + _dict.colSumSq(c, getCounts(), _colIndexes); } @Override @@ -425,7 +396,7 @@ public AColGroupValue copy() { } @Override - protected final AColGroup sliceSingleColumn(int idx) { + protected AColGroup sliceSingleColumn(int idx) { final AColGroupValue ret = (AColGroupValue) copy(); ret._colIndexes = new int[] {0}; if(_colIndexes.length == 1) @@ -437,7 +408,7 @@ protected final AColGroup sliceSingleColumn(int idx) { } @Override - protected final AColGroup sliceMultiColumns(int idStart, int idEnd, int[] outputCols) { + protected AColGroup sliceMultiColumns(int idStart, int idEnd, int[] outputCols) { final AColGroupValue ret = (AColGroupValue) copy(); ret._dict = ret._dict.sliceOutColumnRange(idStart, idEnd, _colIndexes.length); ret._colIndexes = outputCols; @@ -445,20 +416,20 @@ protected final AColGroup sliceMultiColumns(int idStart, int idEnd, int[] output } @Override - protected final void tsmm(double[] result, int numColumns, int nRows) { + protected void tsmm(double[] result, int numColumns, int nRows) { final int[] counts = getCounts(); tsmm(result, numColumns, counts, _dict, _colIndexes); } @Override - public final boolean containsValue(double pattern) { + public boolean containsValue(double pattern) { if(pattern == 0 && _zeros) return true; return _dict.containsValue(pattern); } @Override - public final long getNumberNonZeros(int nRows) { + public long getNumberNonZeros(int nRows) { int[] counts = getCounts(); return _dict.getNumberNonZeros(counts, _colIndexes.length); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AMorphingMMColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AMorphingMMColGroup.java new file mode 100644 index 00000000000..26c055de9d7 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AMorphingMMColGroup.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+/**
+ * Abstract class for column group types that, for performance reasons, do not perform matrix multiplication and
+ * decompression directly, but instead morph into another column group type that performs those operations.
+ */
+public abstract class AMorphingMMColGroup extends AColGroupValue {
+
+	/**
+	 * Constructor for serialization
+	 *
+	 * @param numRows Number of rows contained
+	 */
+	protected AMorphingMMColGroup(int numRows) {
+		super(numRows);
+	}
+
+	/**
+	 * Constructor for morphing column groups that contain an ADictionary for values.
+	 *
+	 * @param colIndices   The Column indexes
+	 * @param numRows      The number of rows contained in this group
+	 * @param dict         The dictionary to contain the distinct tuples
+	 * @param cachedCounts The cached counts of the distinct tuples (can be null since it should be possible to
+	 *                     reconstruct the counts on demand)
+	 */
+	protected AMorphingMMColGroup(int[] colIndices, int numRows, ADictionary dict, int[] cachedCounts) {
+		super(colIndices, numRows, dict, cachedCounts);
+	}
+
+	@Override
+	protected final void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
+		SparseBlock sb) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	protected final void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
+		double[] values) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	protected final void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
+		SparseBlock sb) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	protected final void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
+		double[] values) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	public final void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	public final void leftMultByAColGroup(AColGroup lhs, MatrixBlock result) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	public final void tsmmAColGroup(AColGroup other, MatrixBlock result) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	protected final void tsmm(double[] result, int numColumns, int nRows) {
+		throw new DMLCompressionException("This method
should never be called"); + } + + public abstract AColGroup extractCommon(double[] constV); +} diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/APreAgg.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/APreAgg.java index 2a15a2110bb..9d1b1e3712a 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/APreAgg.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/APreAgg.java @@ -297,7 +297,7 @@ private boolean shouldPreAggregateLeft(APreAgg lhs) { private static MatrixBlock allocatePreAggregate(MatrixBlock m, int numVals, int rl, int ru) { final int lhsRows = ru - rl; - final double[] vals = allocDVector(lhsRows * numVals, true); + final double[] vals = new double[lhsRows * numVals]; final DenseBlock retB = new DenseBlockFP64(new int[] {lhsRows, numVals}, vals); return new MatrixBlock(lhsRows, numVals, retB); } @@ -318,16 +318,12 @@ private static void tsmmDictionaryWithScaling(final ADictionary dict, final int[ if(mb.isEmpty()) return; else if(mb.isInSparseFormat()) - throw new NotImplementedException(); - else { - final double[] values = mb.getDenseBlockValues(); - MMDictsDenseDenseWithScaling(values, values, rows, cols, counts, ret); - } - } - else { - final double[] values = dict.getValues(); - MMDictsDenseDenseWithScaling(values, values, rows, cols, counts, ret); + TSMMDictsSparseWithScaling(mb.getSparseBlock(), rows, cols, counts, ret); + else + TSMMDictsDenseWithScaling(mb.getDenseBlockValues(), rows, cols, counts, ret); } + else + TSMMDictsDenseWithScaling(dict.getValues(), rows, cols, counts, ret); } /** @@ -416,9 +412,9 @@ private static void MMDictsDenseDense(double[] left, double[] right, int[] rowsL } } - private static void MMDictsDenseDenseWithScaling(double[] left, double[] right, int[] rowsLeft, int[] colsRight, - int[] scaling, MatrixBlock result) { - final int commonDim = Math.min(left.length / rowsLeft.length, right.length / colsRight.length); + private static void TSMMDictsDenseWithScaling(double[] dv, int[] rowsLeft, int[] colsRight, int[] scaling, + MatrixBlock result) { + final int commonDim = Math.min(dv.length / rowsLeft.length, dv.length / colsRight.length); final int resCols = result.getNumColumns(); final double[] resV = result.getDenseBlockValues(); for(int k = 0; k < commonDim; k++) { @@ -427,10 +423,34 @@ private static void MMDictsDenseDenseWithScaling(double[] left, double[] right, final int scale = scaling[k]; for(int i = 0; i < rowsLeft.length; i++) { final int offOut = rowsLeft[i] * resCols; - final double vl = left[offL + i] * scale; + final double vl = dv[offL + i] * scale; if(vl != 0) for(int j = 0; j < colsRight.length; j++) - resV[offOut + colsRight[j]] += vl * right[offR + j]; + resV[offOut + colsRight[j]] += vl * dv[offR + j]; + } + } + } + + private static void TSMMDictsSparseWithScaling(SparseBlock sb, int[] rowsLeft, int[] colsRight, int[] scaling, + MatrixBlock result) { + + final int commonDim = sb.numRows(); + final int resCols = result.getNumColumns(); + final double[] resV = result.getDenseBlockValues(); + + for(int k = 0; k < commonDim; k++) { + if(sb.isEmpty(k)) + continue; + final int apos = sb.pos(k); + final int alen = sb.size(k) + apos; + final int[] aix = sb.indexes(k); + final double[] avals = sb.values(k); + final int scale = scaling[k]; + for(int i = apos; i < alen; i++) { + final double v = avals[i] * scale; + final int offOut = rowsLeft[aix[i]] * resCols; + for(int j = 0; j < alen; j++) + resV[offOut + colsRight[aix[j]]] += v * avals[j]; } } } diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
index 86335b983fa..afe43da66a8 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
@@ -54,21 +54,29 @@ protected ColGroupConst() {
 * @param colIndices The Colum indexes for the column group.
 * @param dict The dictionary containing one tuple for the entire compression.
 */
- protected ColGroupConst(int[] colIndices, ADictionary dict) {
+ private ColGroupConst(int[] colIndices, ADictionary dict) {
 super(colIndices);
 this._dict = dict;
 }

- @Override
- protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
- double vals = _dict.sumAllRowsToDouble(square, _colIndexes.length)[0];
- for(int rix = rl; rix < ru; rix++)
- c[rix] += vals;
+ /**
+ * Creation method for ColGroupConst; it ensures that, if the dictionary input is null, an empty
+ * column group is constructed instead.
+ *
+ * @param colIndices The column indexes in the column group
+ * @param dict The dictionary to use
+ * @return A ColGroup, either const or empty.
+ */
+ protected static AColGroup create(int[] colIndices, ADictionary dict) {
+ if(dict == null)
+ return new ColGroupEmpty(colIndices);
+ else
+ return new ColGroupConst(colIndices, dict);
 }

 @Override
 protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
- double value = _dict.aggregateTuples(builtin, _colIndexes.length)[0];
+ double value = _dict.aggregateRows(builtin, _colIndexes.length)[0];
 for(int i = rl; i < ru; i++)
 c[i] = builtin.execute(c[i], value);
 }
@@ -108,19 +116,17 @@ public double getIdx(int r, int colIdx) {

 @Override
 public AColGroup scalarOperation(ScalarOperator op) {
- return new ColGroupConst(_colIndexes, _dict.clone().inplaceScalarOp(op));
+ return create(_colIndexes, _dict.applyScalarOp(op));
 }

 @Override
 public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) {
- ADictionary ret = _dict.binOpLeft(op, v, _colIndexes);
- return new ColGroupConst(_colIndexes, ret);
+ return create(_colIndexes, _dict.binOpLeft(op, v, _colIndexes));
 }

 @Override
 public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) {
- ADictionary ret = _dict.binOpRight(op, v, _colIndexes);
- return new ColGroupConst(_colIndexes, ret);
+ return create(_colIndexes, _dict.binOpRight(op, v, _colIndexes));
 }

 /**
@@ -131,13 +137,12 @@ public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSa
 */
 public void addToCommon(double[] constV) {
 final double[] values = _dict.getValues();
- if(values != null && constV != null)
- for(int i = 0; i < _colIndexes.length; i++)
- constV[_colIndexes[i]] += values[i];
+ for(int i = 0; i < _colIndexes.length; i++)
+ constV[_colIndexes[i]] += values[i];
 }

 public double[] getValues() {
- return _dict != null ?
_dict.getValues() : null; + return _dict.getValues(); } @Override @@ -151,17 +156,38 @@ protected void computeColMxx(double[] c, Builtin builtin) { } @Override - protected void computeSum(double[] c, int nRows, boolean square) { - if(_dict != null) - if(square) - c[0] += _dict.sumsq(new int[] {nRows}, _colIndexes.length); - else - c[0] += _dict.sum(new int[] {nRows}, _colIndexes.length); + protected void computeSum(double[] c, int nRows) { + c[0] += _dict.sum(new int[] {nRows}, _colIndexes.length); + } + + @Override + public void computeColSums(double[] c, int nRows) { + _dict.colSum(c, new int[] {nRows}, _colIndexes); + } + + @Override + protected void computeSumSq(double[] c, int nRows) { + + c[0] += _dict.sumSq(new int[] {nRows}, _colIndexes.length); + } + + @Override + protected void computeColSumsSq(double[] c, int nRows) { + _dict.colSumSq(c, new int[] {nRows}, _colIndexes); } @Override - protected void computeColSums(double[] c, int nRows, boolean square) { - _dict.colSum(c, new int[] {nRows}, _colIndexes, square); + protected void computeRowSums(double[] c, int rl, int ru) { + double vals = _dict.sumAllRowsToDouble(_colIndexes.length)[0]; + for(int rix = rl; rix < ru; rix++) + c[rix] += vals; + } + + @Override + protected void computeRowSumsSq(double[] c, int rl, int ru) { + double vals = _dict.sumAllRowsToDoubleSq(_colIndexes.length)[0]; + for(int rix = rl; rix < ru; rix++) + c[rix] += vals; } @Override @@ -183,11 +209,13 @@ public AColGroup rightMultByMatrix(MatrixBlock right) { final int cr = right.getNumColumns(); if(_colIndexes.length == rr) { MatrixBlock left = forceValuesToMatrixBlock(); + if(left.isEmpty()) + return null; MatrixBlock ret = new MatrixBlock(1, cr, false); LibMatrixMult.matrixMult(left, right, ret); - ADictionary d = new MatrixBlockDictionary(ret); if(ret.isEmpty()) return null; + ADictionary d = new MatrixBlockDictionary(ret); return ColGroupFactory.genColGroupConst(cr, d); } else { @@ -202,7 +230,7 @@ public void tsmm(double[] result, int numColumns, int nRows) { @Override public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) { - throw new NotImplementedException(); + throw new DMLCompressionException("Should not be called"); } @Override @@ -223,19 +251,19 @@ protected AColGroup sliceSingleColumn(int idx) { return new ColGroupEmpty(colIndexes); else { ADictionary retD = new Dictionary(new double[] {_dict.getValue(idx)}); - return new ColGroupConst(colIndexes, retD); + return create(colIndexes, retD); } } @Override protected AColGroup sliceMultiColumns(int idStart, int idEnd, int[] outputCols) { ADictionary retD = _dict.sliceOutColumnRange(idStart, idEnd, _colIndexes.length); - return new ColGroupConst(outputCols, retD); + return create(outputCols, retD); } @Override public AColGroup copy() { - return new ColGroupConst(_colIndexes, _dict.clone()); + return create(_colIndexes, _dict.clone()); } @Override @@ -251,7 +279,7 @@ public long getNumberNonZeros(int nRows) { @Override public AColGroup replace(double pattern, double replace) { ADictionary replaced = _dict.replace(pattern, replace, _colIndexes.length); - return new ColGroupConst(_colIndexes, replaced); + return create(_colIndexes, replaced); } @Override @@ -269,9 +297,7 @@ public void write(DataOutput out) throws IOException { @Override public long getExactSizeOnDisk() { long ret = super.getExactSizeOnDisk(); - if(_dict != null) - ret += _dict.getExactSizeOnDisk(); - + ret += _dict.getExactSizeOnDisk(); return ret; } diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java index b6d42312b98..82faecde164 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java @@ -67,26 +67,13 @@ public CompressionType getCompType() { protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, SparseBlock sb) { throw new NotImplementedException(); - // for(int i = rl; i < ru; i++, offT++) { - // final int rowIndex = _data.getIndex(i); - // if(sb.isEmpty(rowIndex)) - // continue; - // final double[] c = db.values(offT); - // final int off = db.pos(offT); - // final int apos = sb.pos(rowIndex); - // final int alen = sb.size(rowIndex) + apos; - // final double[] avals = sb.values(rowIndex); - // final int[] aix = sb.indexes(rowIndex); - // for(int j = apos; j < alen; j++) - // c[off + _colIndexes[aix[j]]] += avals[j]; - // } } @Override protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, double[] values) { final int nCol = _colIndexes.length; - for(int i = rl,offT = rl + offR; i < ru; i++, offT++) { + for(int i = rl, offT = rl + offR; i < ru; i++, offT++) { final double[] c = db.values(offT); final int off = db.pos(offT) + offC; final int rowIndex = _data.getIndex(i) * nCol; @@ -118,8 +105,15 @@ public double getIdx(int r, int colIdx) { } @Override - protected void computeRowSums(double[] c, boolean square, int rl, int ru) { - double[] vals = _dict.sumAllRowsToDouble(square, _colIndexes.length); + protected void computeRowSums(double[] c, int rl, int ru) { + double[] vals = _dict.sumAllRowsToDouble(_colIndexes.length); + for(int rix = rl; rix < ru; rix++) + c[rix] += vals[_data.getIndex(rix)]; + } + + @Override + protected void computeRowSumsSq(double[] c, int rl, int ru) { + double[] vals = _dict.sumAllRowsToDoubleSq(_colIndexes.length); for(int rix = rl; rix < ru; rix++) c[rix] += vals[_data.getIndex(rix)]; } @@ -127,7 +121,7 @@ protected void computeRowSums(double[] c, boolean square, int rl, int ru) { @Override protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { final int nCol = getNumCols(); - double[] preAggregatedRows = _dict.aggregateTuples(builtin, nCol); + double[] preAggregatedRows = _dict.aggregateRows(builtin, nCol); for(int i = rl; i < ru; i++) c[i] = builtin.execute(c[i], preAggregatedRows[_data.getIndex(i)]); } @@ -151,7 +145,7 @@ public void preAggregate(final MatrixBlock m, final MatrixBlock preAgg, final in @Override public void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru, int cl, int cu) { - _data.preAggregateDense(m, preAgg, rl, ru, cl, cu); + _data.preAggregateDense(m, preAgg.getDenseBlockValues(), rl, ru, cl, cu); } private void preAggregateSparse(SparseBlock sb, MatrixBlock preAgg, int rl, int ru) { @@ -181,11 +175,14 @@ public void preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) { public void preAggregateThatSDCZerosStructure(ColGroupSDCZeros that, Dictionary ret) { final AIterator itThat = that._indexes.getIterator(); final int nCol = that._colIndexes.length; - - while(itThat.hasNext()) { + final int finalOff = that._indexes.getOffsetToLast(); + while(true) { final int to = _data.getIndex(itThat.value()); - final int fr = that._data.getIndex(itThat.getDataIndexAndIncrement()); + final int fr = that._data.getIndex(itThat.getDataIndex()); 
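// A note on the rewritten traversal in this hunk: the old pattern
// `while(itThat.hasNext())` with `getDataIndexAndIncrement()` is replaced by an
// explicit comparison against the precomputed last offset (getOffsetToLast()),
// advancing with next() only while more offsets remain. The same loop shape over
// a plain sorted int[] (a hypothetical stand-in, not the SystemDS iterator API):
//
//   final int last = offsets[offsets.length - 1];
//   for(int i = 0;; i++) {
//     process(offsets[i]);
//     if(offsets[i] == last)
//       break;
//   }
//
// This removes the per-element hasNext() bookkeeping and makes termination on
// the final offset explicit.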
that._dict.addToEntry(ret, fr, to, nCol); + if(itThat.value() == finalOff) + break; + itThat.next(); } } @@ -193,9 +190,12 @@ public void preAggregateThatSDCZerosStructure(ColGroupSDCZeros that, Dictionary public void preAggregateThatSDCSingleZerosStructure(ColGroupSDCSingleZeros that, Dictionary ret) { final AIterator itThat = that._indexes.getIterator(); final int nCol = that._colIndexes.length; - while(itThat.hasNext()) { + final int finalOff = that._indexes.getOffsetToLast(); + while(true) { final int to = _data.getIndex(itThat.value()); that._dict.addToEntry(ret, 0, to, nCol); + if(itThat.value() == finalOff) + break; itThat.next(); } } @@ -219,7 +219,7 @@ public long estimateInMemorySize() { @Override public AColGroup scalarOperation(ScalarOperator op) { - return new ColGroupDDC(_colIndexes, _numRows, applyScalarOp(op), _data, getCachedCounts()); + return new ColGroupDDC(_colIndexes, _numRows, _dict.applyScalarOp(op), _data, getCachedCounts()); } @Override diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java index ec20674c43c..a75f046eb84 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java @@ -19,6 +19,8 @@ package org.apache.sysds.runtime.compress.colgroup; +import java.util.Arrays; + import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary; import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.data.SparseBlock; @@ -69,7 +71,7 @@ public void decompressToDenseBlock(DenseBlock target, int rl, int ru, int offR, } @Override - public void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC){ + public void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC) { // do nothing. 
} @@ -80,10 +82,12 @@ public double getIdx(int r, int colIdx) { @Override public AColGroup scalarOperation(ScalarOperator op) { - double val0 = op.executeScalar(0); - if(val0 == 0) + final double v = op.executeScalar(0); + if(v == 0) return this; - return new ColGroupConst(_colIndexes, new Dictionary(new double[_colIndexes.length]).inplaceScalarOp(op)); + double[] retV = new double[_colIndexes.length]; + Arrays.fill(retV, v); + return ColGroupConst.create(_colIndexes, new Dictionary(retV)); } @Override @@ -99,7 +103,7 @@ public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSaf if(allZero) return this; - return new ColGroupConst(_colIndexes, new Dictionary(retVals)); + return ColGroupConst.create(_colIndexes, new Dictionary(retVals)); } @Override @@ -111,10 +115,10 @@ public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSa final int lenV = _colIndexes.length; boolean allZero = true; for(int i = 0; i < lenV; i++) - allZero = 0 == (retVals[i] = fn.execute(0, v[_colIndexes[i]])) && allZero ; + allZero = 0 == (retVals[i] = fn.execute(0, v[_colIndexes[i]])) && allZero; if(allZero) return this; - return new ColGroupConst(_colIndexes, new Dictionary(retVals)); + return ColGroupConst.create(_colIndexes, new Dictionary(retVals)); } @Override @@ -185,11 +189,6 @@ public final double getMax() { return 0; } - @Override - public void computeColSums(double[] c, int nRows) { - // do nothing - } - @Override protected double computeMxx(double c, Builtin builtin) { return builtin.execute(c, 0); @@ -202,17 +201,32 @@ protected void computeColMxx(double[] c, Builtin builtin) { } @Override - protected void computeSum(double[] c, int nRows, boolean square) { + protected void computeSum(double[] c, int nRows) { + // do nothing + } + + @Override + protected void computeRowSums(double[] c, int rl, int ru) { + // do nothing + } + + @Override + public void computeColSums(double[] c, int nRows) { + // do nothing + } + + @Override + protected void computeSumSq(double[] c, int nRows) { // do nothing } @Override - protected void computeRowSums(double[] c, boolean square, int rl, int ru) { + protected void computeRowSumsSq(double[] c, int rl, int ru) { // do nothing } @Override - protected void computeColSums(double[] c, int nRows, boolean square) { + protected void computeColSumsSq(double[] c, int nRows) { // do nothing } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index fc0edf67fae..72779342445 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -147,7 +147,7 @@ public static AColGroup genColGroupConst(int[] cols, double[] values) { if(cols.length != values.length) throw new DMLCompressionException("Invalid size of values compared to columns"); ADictionary dict = new Dictionary(values); - return new ColGroupConst(cols, dict); + return ColGroupConst.create(cols, dict); } /** @@ -162,7 +162,7 @@ public static AColGroup genColGroupConst(int numCols, ADictionary dict) { throw new DMLCompressionException( "Invalid construction of const column group with different number of columns in arguments"); final int[] colIndices = Util.genColsIndices(numCols); - return new ColGroupConst(colIndices, dict); + return ColGroupConst.create(colIndices, dict); } private static List genEmpty(MatrixBlock in, CompressionSettings compSettings) { @@ 
-194,7 +194,7 @@ private static List compressColGroupsParallel(MatrixBlock in, Compres if(!tg.isEmpty()) tasks.add(new CompressTask(in, tg, compSettings, Math.max(1, k / 2))); - List ret = new ArrayList<>(csi.getNumberColGroups()); + List ret = new ArrayList<>(); for(Future> t : pool.invokeAll(tasks)) ret.addAll(t.get()); pool.shutdown(); @@ -234,11 +234,17 @@ protected CompressTask(MatrixBlock in, List groups, @Override public Collection call() { - ArrayList res = new ArrayList<>(); - Tmp tmpMap = new Tmp(); - for(CompressedSizeInfoColGroup g : _groups) - res.addAll(compressColGroup(_in, _compSettings, tmpMap, g, _k)); - return res; + try{ + ArrayList res = new ArrayList<>(); + Tmp tmpMap = new Tmp(); + for(CompressedSizeInfoColGroup g : _groups) + res.addAll(compressColGroup(_in, _compSettings, tmpMap, g, _k)); + return res; + } + catch(Exception e){ + e.printStackTrace(); + throw e; + } } } @@ -347,7 +353,7 @@ private static AColGroup compress(int[] colIndexes, int rlen, ABitmap ubm, Compr final IntArrayList[] of = ubm.getOffsetList(); if(of.length == 1 && of[0].size() == rlen) // If this always constant - return new ColGroupConst(colIndexes, DictionaryFactory.create(ubm)); + return ColGroupConst.create(colIndexes, DictionaryFactory.create(ubm)); switch(compType) { case DDC: @@ -490,7 +496,7 @@ private static AColGroup compressSDC(int[] colIndexes, int rlen, ABitmap ubm, Co ADictionary dict = DictionaryFactory.create(ubm, tupleSparsity); if(ubm.getNumValues() == 1) { if(numZeros >= largestOffset) { - final AOffset off = OffsetFactory.create(ubm.getOffsetList()[0].extractValues(true)); + final AOffset off = OffsetFactory.createOffset(ubm.getOffsetList()[0].extractValues(true)); return new ColGroupSDCSingleZeros(colIndexes, rlen, dict, off, null); } else { @@ -510,7 +516,7 @@ private static AColGroup setupMultiValueZeroColGroup(int[] colIndexes, int rlen, CompressionSettings cs) { IntArrayList[] offsets = ubm.getOffsetList(); AInsertionSorter s = InsertionSorterFactory.create(rlen, offsets, cs.sdcSortType); - AOffset indexes = OffsetFactory.create(s.getIndexes()); + AOffset indexes = OffsetFactory.createOffset(s.getIndexes()); AMapToData data = s.getData(); int[] counts = new int[offsets.length + 1]; int sum = 0; @@ -519,18 +525,16 @@ private static AColGroup setupMultiValueZeroColGroup(int[] colIndexes, int rlen, sum += counts[i]; } counts[offsets.length] = rlen - sum; - AColGroupValue ret = new ColGroupSDCZeros(colIndexes, rlen, dict, indexes, data, counts); - return ret; + return ColGroupSDCZeros.create(colIndexes, rlen, dict, indexes, data, counts); } private static AColGroup setupMultiValueColGroup(int[] colIndexes, int numZeros, int rlen, ABitmap ubm, int largestIndex, ADictionary dict, CompressionSettings cs) { IntArrayList[] offsets = ubm.getOffsetList(); AInsertionSorter s = InsertionSorterFactory.createNegative(rlen, offsets, largestIndex, cs.sdcSortType); - AOffset indexes = OffsetFactory.create(s.getIndexes()); + AOffset indexes = OffsetFactory.createOffset(s.getIndexes()); AMapToData _data = s.getData(); - AColGroupValue ret = new ColGroupSDC(colIndexes, rlen, dict, indexes, _data, null); - return ret; + return ColGroupSDC.create(colIndexes, rlen, dict, indexes, _data, null); } private static AColGroup setupSingleValueSDCColGroup(int[] colIndexes, int rlen, ABitmap ubm, ADictionary dict) { @@ -548,7 +552,7 @@ private static AColGroup setupSingleValueSDCColGroup(int[] colIndexes, int rlen, while(v < rlen) indexes[p++] = v++; - AOffset off = OffsetFactory.create(indexes); + 
AOffset off = OffsetFactory.createOffset(indexes); return new ColGroupSDCSingle(colIndexes, rlen, dict, off, null); } @@ -635,14 +639,14 @@ private static AColGroup compressSDCFromSparseTransposedBlock(MatrixBlock mb, in } counts[entries.size()] = rlen - sum; - final AOffset offsets = OffsetFactory.create(sb.indexes(sbRow), apos, alen); + final AOffset offsets = OffsetFactory.createOffset(sb.indexes(sbRow), apos, alen); if(entries.size() <= 1) return new ColGroupSDCSingleZeros(cols, rlen, new Dictionary(dict), offsets, counts); else { final AMapToData mapToData = MapToFactory.create((alen - apos), entries.size()); for(int j = apos; j < alen; j++) mapToData.set(j - apos, map.get(vals[j])); - return new ColGroupSDCZeros(cols, rlen, new Dictionary(dict), offsets, mapToData, counts); + return ColGroupSDCZeros.create(cols, rlen, new Dictionary(dict), offsets, mapToData, counts); } } else { diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java index f8edbcb1975..184ca1a69c2 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java @@ -118,6 +118,8 @@ private static AColGroup constructColGroup(ColGroupType ctype, int nRows){ return new ColGroupSDCSingleZeros(nRows); case SDCZeros: return new ColGroupSDCZeros(nRows); + case PFOR: + return new ColGroupPFOR(nRows); default: throw new DMLRuntimeException("Unsupported ColGroup Type used: " + ctype); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java index a303d98910c..64dd626bc17 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java @@ -23,7 +23,6 @@ import org.apache.commons.lang.NotImplementedException; import org.apache.sysds.runtime.compress.CompressionSettings; -import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.data.SparseBlock; @@ -66,7 +65,8 @@ public ColGroupType getColGroupType() { } @Override - protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, double[] values) { + protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, + double[] values) { throw new NotImplementedException(); // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ; // final int numCols = getNumCols(); @@ -79,33 +79,34 @@ protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int // double[] c = target.getDenseBlockValues(); // // cache conscious append via horizontal scans // for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz) { - // for(int k = 0, off = 0; k < numVals; k++, off += numCols) { - // int boff = _ptr[k]; - // int blen = len(k); - // int bix = apos[k]; - - // if(bix >= blen) - // continue; - // int pos = boff + bix; - // int len = _data[pos]; - // int i = 1; - // int row = bi + _data[pos + 1]; - // while(i <= len && row < rl) - // row = bi + _data[pos + i++]; - - // for(; i <= len && row < ru; i++) { - // row = bi + _data[pos + i]; - // int rc = (row - offOut) * targetCols; - // for(int j = 0; j < numCols; j++) - // c[rc + 
_colIndexes[j]] += values[off + j]; - // } - // apos[k] += len + 1; - // } + // for(int k = 0, off = 0; k < numVals; k++, off += numCols) { + // int boff = _ptr[k]; + // int blen = len(k); + // int bix = apos[k]; + + // if(bix >= blen) + // continue; + // int pos = boff + bix; + // int len = _data[pos]; + // int i = 1; + // int row = bi + _data[pos + 1]; + // while(i <= len && row < rl) + // row = bi + _data[pos + i++]; + + // for(; i <= len && row < ru; i++) { + // row = bi + _data[pos + i]; + // int rc = (row - offOut) * targetCols; + // for(int j = 0; j < numCols; j++) + // c[rc + _colIndexes[j]] += values[off + j]; + // } + // apos[k] += len + 1; + // } // } } @Override - protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, SparseBlock values) { + protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, + SparseBlock values) { throw new NotImplementedException(); } @@ -148,7 +149,7 @@ public AColGroup scalarOperation(ScalarOperator op) { // fast path: sparse-safe operations // Note that bitmaps don't change and are shallow-copied if(op.sparseSafe || val0 == 0 || !_zeros) { - return new ColGroupOLE(_colIndexes, _numRows, _zeros, applyScalarOp(op), _data, _ptr, getCachedCounts()); + return new ColGroupOLE(_colIndexes, _numRows, _zeros, _dict.applyScalarOp(op), _data, _ptr, getCachedCounts()); } // slow path: sparse-unsafe operations (potentially create new bitmap) // note: for efficiency, we currently don't drop values that become 0 @@ -156,10 +157,10 @@ public AColGroup scalarOperation(ScalarOperator op) { int[] loff = computeOffsets(lind); if(loff.length == 0) { // empty offset list: go back to fast path - return new ColGroupOLE(_colIndexes, _numRows, false, applyScalarOp(op), _data, _ptr, getCachedCounts()); + return new ColGroupOLE(_colIndexes, _numRows, false, _dict.applyScalarOp(op), _data, _ptr, getCachedCounts()); } - ADictionary rvalues = applyScalarOp(op, val0, getNumCols()); + ADictionary rvalues = _dict.applyScalarOp(op, val0, getNumCols()); char[] lbitmap = genOffsetBitmap(loff, loff.length); char[] rbitmaps = Arrays.copyOf(_data, _data.length + lbitmap.length); System.arraycopy(lbitmap, 0, rbitmaps, _data.length, lbitmap.length); @@ -216,69 +217,74 @@ public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSa // } @Override - protected void computeRowSums(double[] c, boolean square, int rl, int ru) { + protected void computeRowSums(double[] c, int rl, int ru) { + throw new NotImplementedException(); + // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ; + // final int numVals = getNumValues(); - final int blksz = CompressionSettings.BITMAP_BLOCK_SZ; - final int numVals = getNumValues(); + // if(numVals > 1 && _numRows > blksz) { + // final int blksz2 = CompressionSettings.BITMAP_BLOCK_SZ; - if(numVals > 1 && _numRows > blksz) { - final int blksz2 = CompressionSettings.BITMAP_BLOCK_SZ; - - // step 1: prepare position and value arrays - int[] apos = skipScan(numVals, rl); - double[] aval = _dict.sumAllRowsToDouble(square, _colIndexes.length); - - // step 2: cache conscious row sums via horizontal scans - for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz2) { - int bimax = Math.min(bi + blksz2, ru); - - // horizontal segment scan, incl pos maintenance - for(int k = 0; k < numVals; k++) { - int boff = _ptr[k]; - int blen = len(k); - double val = aval[k]; - int bix = apos[k]; - - for(int ii = bi; ii < bimax && bix < blen; ii += blksz) { - // prepare 
length, start, and end pos - int len = _data[boff + bix]; - - // compute partial results - for(int i = 1; i <= len; i++) { - int rix = ii + _data[boff + bix + i]; - if(rix >= _numRows) - throw new DMLCompressionException("Invalid row " + rix); - c[rix] += val; - } - bix += len + 1; - } + // // step 1: prepare position and value arrays + // int[] apos = skipScan(numVals, rl); + // double[] aval = _dict.sumAllRowsToDouble(square, _colIndexes.length); + + // // step 2: cache conscious row sums via horizontal scans + // for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz2) { + // int bimax = Math.min(bi + blksz2, ru); + + // // horizontal segment scan, incl pos maintenance + // for(int k = 0; k < numVals; k++) { + // int boff = _ptr[k]; + // int blen = len(k); + // double val = aval[k]; + // int bix = apos[k]; + + // for(int ii = bi; ii < bimax && bix < blen; ii += blksz) { + // // prepare length, start, and end pos + // int len = _data[boff + bix]; + + // // compute partial results + // for(int i = 1; i <= len; i++) { + // int rix = ii + _data[boff + bix + i]; + // if(rix >= _numRows) + // throw new DMLCompressionException("Invalid row " + rix); + // c[rix] += val; + // } + // bix += len + 1; + // } - apos[k] = bix; - } - } - } - else { - // iterate over all values and their bitmaps - for(int k = 0; k < numVals; k++) { - // prepare value-to-add for entire value bitmap - int boff = _ptr[k]; - int blen = len(k); - double val = _dict.sumRow(k, square, _colIndexes.length); + // apos[k] = bix; + // } + // } + // } + // else { + // // iterate over all values and their bitmaps + // for(int k = 0; k < numVals; k++) { + // // prepare value-to-add for entire value bitmap + // int boff = _ptr[k]; + // int blen = len(k); + // double val = _dict.sumRow(k, square, _colIndexes.length); + + // // iterate over bitmap blocks and add values + // if(val != 0) { + // int slen; + // int bix = skipScanVal(k, rl); + // for(int off = ((rl + 1) / blksz) * blksz; bix < blen && off < ru; bix += slen + 1, off += blksz) { + // slen = _data[boff + bix]; + // for(int i = 1; i <= slen; i++) { + // int rix = off + _data[boff + bix + i]; + // c[rix] += val; + // } + // } + // } + // } + // } + } - // iterate over bitmap blocks and add values - if(val != 0) { - int slen; - int bix = skipScanVal(k, rl); - for(int off = ((rl + 1) / blksz) * blksz; bix < blen && off < ru; bix += slen + 1, off += blksz) { - slen = _data[boff + bix]; - for(int i = 1; i <= slen; i++) { - int rix = off + _data[boff + bix + i]; - c[rix] += val; - } - } - } - } - } + @Override + protected void computeRowSumsSq(double[] c, int rl, int ru) { + throw new NotImplementedException(); } @Override @@ -413,7 +419,7 @@ else if(_data[boff + bix + blckIx] > offset) private int[] skipScan(int numVals, int rl) { final int blksz = CompressionSettings.BITMAP_BLOCK_SZ; rl = (rl / blksz) * blksz; - int[] ret = allocIVector(numVals, rl == 0); + int[] ret = new int[numVals]; if(rl > 0) { // rl aligned with blksz for(int k = 0; k < numVals; k++) { diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPFOR.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPFOR.java new file mode 100644 index 00000000000..e858addbc27 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPFOR.java @@ -0,0 +1,386 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.compress.colgroup; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.Arrays; + +import org.apache.commons.lang.NotImplementedException; +import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; +import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData; +import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory; +import org.apache.sysds.runtime.compress.colgroup.offset.AIterator; +import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; +import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory; +import org.apache.sysds.runtime.functionobjects.Builtin; +import org.apache.sysds.runtime.functionobjects.Divide; +import org.apache.sysds.runtime.functionobjects.Minus; +import org.apache.sysds.runtime.functionobjects.Multiply; +import org.apache.sysds.runtime.functionobjects.Plus; +import org.apache.sysds.runtime.matrix.operators.BinaryOperator; +import org.apache.sysds.runtime.matrix.operators.ScalarOperator; + +/** + * ColGroup for Patched Frame Of Reference. + * + * This column group fits naturally into the existing collection of compression groups. + * + * It can be constructed when an SDCZeros group gets a non-zero default value. A natural extension is then to transform + * the group into a PFOR group, since the default value is treated as an offset, and the dictionary can be copied + * with no modifications. + * + */ +public class ColGroupPFOR extends AMorphingMMColGroup { + + private static final long serialVersionUID = 3883228464052204203L; + + /** Sparse row indexes for the data that is non-zero */ + protected AOffset _indexes; + + /** Pointers to the dictionary row for each offset in _indexes. 
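+ * For example (hypothetical values, purely illustrative): with _indexes = {2, 5} and _data = {1, 0},
+ * row 2 reads dictionary row 1 and row 5 reads dictionary row 0, each added on top of _reference;
+ * all remaining rows evaluate to _reference alone (see getIdx below).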
*/ + protected transient AMapToData _data; + + /** Reference values in this column group */ + protected double[] _reference; + + /** + * Constructor for serialization + * + * @param numRows Number of rows contained + */ + protected ColGroupPFOR(int numRows) { + super(numRows); + } + + private ColGroupPFOR(int[] colIndices, int numRows, ADictionary dict, AOffset indexes, AMapToData data, + int[] cachedCounts, double[] reference) { + super(colIndices, numRows, dict, cachedCounts); + _data = data; + _indexes = indexes; + _zeros = allZero(reference); + _reference = reference; + } + + protected static AColGroup create(int[] colIndices, int numRows, ADictionary dict, AOffset indexes, AMapToData data, + int[] cachedCounts, double[] reference) { + if(dict == null) { + // either ColGroupEmpty or const + boolean allZero = true; + for(double d : reference) + if(d != 0) { + allZero = false; + break; + } + + if(allZero) + return new ColGroupEmpty(colIndices); + else + return ColGroupFactory.genColGroupConst(colIndices, reference); + } + return new ColGroupPFOR(colIndices, numRows, dict, indexes, data, cachedCounts, reference); + } + + private final static boolean allZero(double[] in) { + for(double v : in) + if(v != 0) + return false; + return true; + } + + @Override + public CompressionType getCompType() { + return CompressionType.PFOR; + } + + @Override + public ColGroupType getColGroupType() { + return ColGroupType.PFOR; + } + + @Override + public int[] getCounts(int[] counts) { + return _data.getCounts(counts, _numRows); + } + + @Override + protected void computeRowSums(double[] c, int rl, int ru) { + // Add reference value sum. + final double refSum = refSum(); + for(int rix = rl; rix < ru; rix++) + c[rix] += refSum; + + final double[] vals = _dict.sumAllRowsToDouble(_colIndexes.length); + ColGroupSDCZeros.computeRowSums(c, rl, ru, vals, _data, _indexes, _numRows); + } + + private final double refSum() { + double ret = 0; + for(double d : _reference) + ret += d; + return ret; + } + + @Override + protected void computeRowSumsSq(double[] c, int rl, int ru) { + final double[] vals = _dict.sumAllRowsToDoubleSq(_reference); + ColGroupSDC.computeRowSumsSq(c, rl, ru, vals, _data, _indexes, _numRows); + } + + @Override + protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { + final double[] vals = _dict.aggregateRows(builtin, _reference); + ColGroupSDC.computeRowMxx(c, builtin, rl, ru, vals, _data, _indexes, _numRows, vals[vals.length - 1]); + } + + @Override + public double getIdx(int r, int colIdx) { + final AIterator it = _indexes.getIterator(r); + final int nCol = _colIndexes.length; + if(it.value() == r) { + final int rowOff = _data.getIndex(it.getDataIndex()) * nCol; + return _dict.getValue(rowOff + colIdx) + _reference[colIdx]; + } + else + return _reference[colIdx]; + } + + @Override + public AColGroup scalarOperation(ScalarOperator op) { + final double[] newRef = new double[_reference.length]; + for(int i = 0; i < _reference.length; i++) + newRef[i] = op.executeScalar(_reference[i]); + if(op.fn instanceof Plus || op.fn instanceof Minus) { + return create(_colIndexes, _numRows, _dict, _indexes, _data, getCachedCounts(), newRef); + } + else if(op.fn instanceof Multiply || op.fn instanceof Divide) { + final ADictionary newDict = _dict.applyScalarOp(op); + return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), newRef); + } + else { + final ADictionary newDict = _dict.applyScalarOp(op, _reference, newRef); + return create(_colIndexes, _numRows, newDict, 
_indexes, _data, getCachedCounts(), newRef); + } + } + + @Override + public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) { + final double[] newRef = new double[_reference.length]; + for(int i = 0; i < _reference.length; i++) + newRef[i] = op.fn.execute(v[_colIndexes[i]], _reference[i]); + + if(op.fn instanceof Plus || op.fn instanceof Minus) + return create(_colIndexes, _numRows, _dict, _indexes, _data, getCachedCounts(), newRef); + else if(op.fn instanceof Multiply || op.fn instanceof Divide) { + final ADictionary newDict = _dict.binOpLeft(op, v, _colIndexes); + return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), newRef); + } + else { + final ADictionary newDict = _dict.binOpLeft(op, v, _colIndexes, _reference, newRef); + return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), newRef); + } + } + + @Override + public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) { + final double[] newRef = new double[_reference.length]; + for(int i = 0; i < _reference.length; i++) + newRef[i] = op.fn.execute(_reference[i], v[_colIndexes[i]]); + if(op.fn instanceof Plus || op.fn instanceof Minus) + return create(_colIndexes, _numRows, _dict, _indexes, _data, getCachedCounts(), newRef); + else if(op.fn instanceof Multiply || op.fn instanceof Divide) { + final ADictionary newDict = _dict.binOpRight(op, v, _colIndexes); + return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), newRef); + } + else { + final ADictionary newDict = _dict.binOpRight(op, v, _colIndexes, _reference, newRef); + return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), newRef); + } + } + + @Override + public void write(DataOutput out) throws IOException { + super.write(out); + _indexes.write(out); + _data.write(out); + for(double d : _reference) + out.writeDouble(d); + } + + @Override + public void readFields(DataInput in) throws IOException { + super.readFields(in); + _indexes = OffsetFactory.readIn(in); + _data = MapToFactory.readIn(in); + _reference = new double[_colIndexes.length]; + for(int i = 0; i < _colIndexes.length; i++) + _reference[i] = in.readDouble(); + } + + @Override + public long getExactSizeOnDisk() { + long ret = super.getExactSizeOnDisk(); + ret += _data.getExactSizeOnDisk(); + ret += _indexes.getExactSizeOnDisk(); + ret += 8 * _colIndexes.length; // reference values. 
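+ // Worked example (hypothetical sizes, for illustration only): a group over 3 columns adds
+ // 3 * 8 = 24 bytes for the reference values on top of the serialized _data and _indexes,
+ // mirroring the write()/readFields() pair above, which moves one double per column.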
+ return ret; + } + + @Override + public AColGroup replace(double pattern, double replace) { + boolean patternInReference = false; + for(double d : _reference) + if(pattern == d) { + patternInReference = true; + break; + } + + if(patternInReference) { + throw new NotImplementedException("Not implemented: replace() where the pattern occurs in the reference values"); + // _dict.replace(pattern, replace, _reference, _newReplace); + } + else { + final ADictionary newDict = _dict.replace(pattern, replace, _reference); + return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), _reference); + } + + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(super.toString()); + sb.append(String.format("\n%15s ", "Indexes: ")); + sb.append(_indexes.toString()); + sb.append(String.format("\n%15s ", "Data: ")); + sb.append(_data); + sb.append(String.format("\n%15s ", "Reference:")); + sb.append(Arrays.toString(_reference)); + return sb.toString(); + } + + @Override + protected double computeMxx(double c, Builtin builtin) { + return _dict.aggregate(c, builtin, _reference); + } + + @Override + protected void computeColMxx(double[] c, Builtin builtin) { + _dict.aggregateCols(c, builtin, _colIndexes, _reference); + } + + @Override + protected void computeSum(double[] c, int nRows) { + super.computeSum(c, nRows); + final double refSum = refSum(); + c[0] += refSum * nRows; + } + + @Override + public void computeColSums(double[] c, int nRows) { + super.computeColSums(c, nRows); + for(int i = 0; i < _colIndexes.length; i++) + c[_colIndexes[i]] += _reference[i] * nRows; + } + + @Override + protected void computeSumSq(double[] c, int nRows) { + c[0] += _dict.sumSq(getCounts(), _reference); + } + + @Override + protected void computeColSumsSq(double[] c, int nRows) { + _dict.colSumSq(c, getCounts(), _colIndexes, _reference); + } + + @Override + protected void computeProduct(double[] c, int nRows) { + throw new NotImplementedException("Not implemented for PFOR"); + } + + @Override + protected void computeRowProduct(double[] c, int rl, int ru) { + throw new NotImplementedException("Not implemented for PFOR"); + } + + @Override + protected void computeColProduct(double[] c, int nRows) { + throw new NotImplementedException("Not implemented for PFOR"); + } + + @Override + protected AColGroup sliceSingleColumn(int idx) { + ColGroupPFOR ret = (ColGroupPFOR) super.sliceSingleColumn(idx); + // select the single reference value for the sliced column. 
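+ // e.g. (hypothetical): slicing column idx = 1 of a group with _reference = {5, 7, 9} yields
+ // a single-column group with _reference = {7}; the dictionary slice itself is delegated to
+ // super.sliceSingleColumn(idx) above.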
+ ret._reference = new double[1]; + ret._reference[0] = _reference[idx]; + return ret; + } + + @Override + protected AColGroup sliceMultiColumns(int idStart, int idEnd, int[] outputCols) { + ColGroupPFOR ret = (ColGroupPFOR) super.sliceMultiColumns(idStart, idEnd, outputCols); + final int len = idEnd - idStart; + ret._reference = new double[len]; + for(int i = 0, ii = idStart; i < len; i++, ii++) + ret._reference[i] = _reference[ii]; + + return ret; + } + + @Override + public boolean containsValue(double pattern) { + if(pattern == 0 && _zeros) + return true; + else if(Double.isNaN(pattern) || Double.isInfinite(pattern)) + return containsInfOrNan(pattern) || _dict.containsValue(pattern); + else + return _dict.containsValue(pattern, _reference); + } + + private boolean containsInfOrNan(double pattern) { + if(Double.isNaN(pattern)) { + for(double d : _reference) + if(Double.isNaN(d)) + return true; + return false; + } + else { + for(double d : _reference) + if(Double.isInfinite(d)) + return true; + return false; + } + } + + @Override + public long getNumberNonZeros(int nRows) { + int[] counts = getCounts(); + return (long) _dict.getNumberNonZeros(counts, _reference, nRows); + } + + @Override + public AColGroup extractCommon(double[] constV) { + for(int i = 0; i < _colIndexes.length; i++) + constV[_colIndexes[i]] += _reference[i]; + return ColGroupSDCZeros.create(_colIndexes, _numRows, _dict, _indexes, _data, getCounts()); + } +} diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java index 3d69b9662aa..3ee843468ec 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java @@ -24,7 +24,6 @@ import java.util.List; import org.apache.commons.lang.NotImplementedException; -import org.apache.sysds.runtime.compress.CompressionSettings; import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.data.SparseBlock; @@ -146,7 +145,7 @@ public AColGroup scalarOperation(ScalarOperator op) { // fast path: sparse-safe operations // Note that bitmaps don't change and are shallow-copied if(op.sparseSafe || val0 == 0 || !_zeros) { - return new ColGroupRLE(_colIndexes, _numRows, _zeros, applyScalarOp(op), _data, _ptr, getCachedCounts()); + return new ColGroupRLE(_colIndexes, _numRows, _zeros, _dict.applyScalarOp(op), _data, _ptr, getCachedCounts()); } // slow path: sparse-unsafe operations (potentially create new bitmap) @@ -154,10 +153,10 @@ public AColGroup scalarOperation(ScalarOperator op) { boolean[] lind = computeZeroIndicatorVector(); int[] loff = computeOffsets(lind); if(loff.length == 0) { // empty offset list: go back to fast path - return new ColGroupRLE(_colIndexes, _numRows, false, applyScalarOp(op), _data, _ptr, getCachedCounts()); + return new ColGroupRLE(_colIndexes, _numRows, false, _dict.applyScalarOp(op), _data, _ptr, getCachedCounts()); } - ADictionary rvalues = applyScalarOp(op, val0, getNumCols()); + ADictionary rvalues = _dict.applyScalarOp(op, val0, getNumCols()); char[] lbitmap = genRLEBitmap(loff, loff.length); char[] rbitmaps = Arrays.copyOf(_data, _data.length + lbitmap.length); @@ -217,73 +216,143 @@ public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSa // } @Override - protected void computeRowSums(double[] c, boolean square, int rl, int ru) { + 
protected void computeRowSums(double[] c, int rl, int ru) { + throw new NotImplementedException(); + // final int numVals = getNumValues(); - final int numVals = getNumValues(); + // if(numVals > 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) { + // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ; + + // // step 1: prepare position and value arrays + + // // current pos / values per RLE list + // int[] astart = new int[numVals]; + // int[] apos = skipScan(numVals, rl, astart); + // double[] aval = _dict.sumAllRowsToDouble(square, _colIndexes.length); + + // // step 2: cache conscious matrix-vector via horizontal scans + // for(int bi = rl; bi < ru; bi += blksz) { + // int bimax = Math.min(bi + blksz, ru); + + // // horizontal segment scan, incl pos maintenance + // for(int k = 0; k < numVals; k++) { + // int boff = _ptr[k]; + // int blen = len(k); + // double val = aval[k]; + // int bix = apos[k]; + // int start = astart[k]; + + // // compute partial results, not aligned + // while(bix < blen) { + // int lstart = _data[boff + bix]; + // int llen = _data[boff + bix + 1]; + // int from = Math.max(bi, start + lstart); + // int to = Math.min(start + lstart + llen, bimax); + // for(int rix = from; rix < to; rix++) + // c[rix] += val; + + // if(start + lstart + llen >= bimax) + // break; + // start += lstart + llen; + // bix += 2; + // } - if(numVals > 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) { - final int blksz = CompressionSettings.BITMAP_BLOCK_SZ; - - // step 1: prepare position and value arrays - - // current pos / values per RLE list - int[] astart = new int[numVals]; - int[] apos = skipScan(numVals, rl, astart); - double[] aval = _dict.sumAllRowsToDouble(square, _colIndexes.length); - - // step 2: cache conscious matrix-vector via horizontal scans - for(int bi = rl; bi < ru; bi += blksz) { - int bimax = Math.min(bi + blksz, ru); - - // horizontal segment scan, incl pos maintenance - for(int k = 0; k < numVals; k++) { - int boff = _ptr[k]; - int blen = len(k); - double val = aval[k]; - int bix = apos[k]; - int start = astart[k]; - - // compute partial results, not aligned - while(bix < blen) { - int lstart = _data[boff + bix]; - int llen = _data[boff + bix + 1]; - int from = Math.max(bi, start + lstart); - int to = Math.min(start + lstart + llen, bimax); - for(int rix = from; rix < to; rix++) - c[rix] += val; - - if(start + lstart + llen >= bimax) - break; - start += lstart + llen; - bix += 2; - } - - apos[k] = bix; - astart[k] = start; - } - } - } - else { - for(int k = 0; k < numVals; k++) { - int boff = _ptr[k]; - int blen = len(k); - double val = _dict.sumRow(k, square, _colIndexes.length); - - if(val != 0.0) { - Pair tmp = skipScanVal(k, rl); - int bix = tmp.getKey(); - int curRunStartOff = tmp.getValue(); - int curRunEnd = tmp.getValue(); - for(; bix < blen && curRunEnd < ru; bix += 2) { - curRunStartOff = curRunEnd + _data[boff + bix]; - curRunEnd = curRunStartOff + _data[boff + bix + 1]; - for(int rix = curRunStartOff; rix < curRunEnd && rix < ru; rix++) - c[rix] += val; - - } - } - } - } + // apos[k] = bix; + // astart[k] = start; + // } + // } + // } + // else { + // for(int k = 0; k < numVals; k++) { + // int boff = _ptr[k]; + // int blen = len(k); + // double val = _dict.sumRow(k, square, _colIndexes.length); + + // if(val != 0.0) { + // Pair tmp = skipScanVal(k, rl); + // int bix = tmp.getKey(); + // int curRunStartOff = tmp.getValue(); + // int curRunEnd = tmp.getValue(); + // for(; bix < blen && curRunEnd < ru; bix += 2) { + // curRunStartOff = 
curRunEnd + _data[boff + bix]; + // curRunEnd = curRunStartOff + _data[boff + bix + 1]; + // for(int rix = curRunStartOff; rix < curRunEnd && rix < ru; rix++) + // c[rix] += val; + + // } + // } + // } + // } + } + + @Override + protected void computeRowSumsSq(double[] c, int rl, int ru) { + throw new NotImplementedException(); + // final int numVals = getNumValues(); + + // if(numVals > 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) { + // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ; + + // // step 1: prepare position and value arrays + + // // current pos / values per RLE list + // int[] astart = new int[numVals]; + // int[] apos = skipScan(numVals, rl, astart); + // double[] aval = _dict.sumAllRowsToDouble(square, _colIndexes.length); + + // // step 2: cache conscious matrix-vector via horizontal scans + // for(int bi = rl; bi < ru; bi += blksz) { + // int bimax = Math.min(bi + blksz, ru); + + // // horizontal segment scan, incl pos maintenance + // for(int k = 0; k < numVals; k++) { + // int boff = _ptr[k]; + // int blen = len(k); + // double val = aval[k]; + // int bix = apos[k]; + // int start = astart[k]; + + // // compute partial results, not aligned + // while(bix < blen) { + // int lstart = _data[boff + bix]; + // int llen = _data[boff + bix + 1]; + // int from = Math.max(bi, start + lstart); + // int to = Math.min(start + lstart + llen, bimax); + // for(int rix = from; rix < to; rix++) + // c[rix] += val; + + // if(start + lstart + llen >= bimax) + // break; + // start += lstart + llen; + // bix += 2; + // } + + // apos[k] = bix; + // astart[k] = start; + // } + // } + // } + // else { + // for(int k = 0; k < numVals; k++) { + // int boff = _ptr[k]; + // int blen = len(k); + // double val = _dict.sumRow(k, square, _colIndexes.length); + + // if(val != 0.0) { + // Pair tmp = skipScanVal(k, rl); + // int bix = tmp.getKey(); + // int curRunStartOff = tmp.getValue(); + // int curRunEnd = tmp.getValue(); + // for(; bix < blen && curRunEnd < ru; bix += 2) { + // curRunStartOff = curRunEnd + _data[boff + bix]; + // curRunEnd = curRunStartOff + _data[boff + bix + 1]; + // for(int rix = curRunStartOff; rix < curRunEnd && rix < ru; rix++) + // c[rix] += val; + + // } + // } + // } + // } } @Override @@ -395,7 +464,7 @@ public double getIdx(int r, int colIdx) { * @return array of positions for all values */ private int[] skipScan(int numVals, int rl, int[] astart) { - int[] apos = allocIVector(numVals, rl == 0); + int[] apos = new int[numVals]; if(rl > 0) { // rl aligned with blksz for(int k = 0; k < numVals; k++) { diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java index fc011e082a1..9aef4313406 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java @@ -23,18 +23,13 @@ import java.io.DataOutput; import java.io.IOException; -import org.apache.commons.lang.NotImplementedException; -import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData; import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory; import org.apache.sysds.runtime.compress.colgroup.offset.AIterator; import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; import 
org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory; -import org.apache.sysds.runtime.data.DenseBlock; -import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.functionobjects.Builtin; -import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.operators.BinaryOperator; import org.apache.sysds.runtime.matrix.operators.ScalarOperator; @@ -46,7 +41,7 @@ * This column group is handy in cases where sparse unsafe operations is executed on very sparse columns. Then the zeros * would be materialized in the group without any overhead. */ -public class ColGroupSDC extends AColGroupValue { +public class ColGroupSDC extends AMorphingMMColGroup { private static final long serialVersionUID = 769993538831949086L; /** * Sparse row indexes for the data @@ -66,7 +61,7 @@ protected ColGroupSDC(int numRows) { super(numRows); } - protected ColGroupSDC(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data, + private ColGroupSDC(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data, int[] cachedCounts) { super(colIndices, numRows, dict, cachedCounts); _indexes = offsets; @@ -74,6 +69,14 @@ protected ColGroupSDC(int[] colIndices, int numRows, ADictionary dict, AOffset o _zeros = false; } + protected static AColGroup create(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data, + int[] cachedCounts) { + if(dict == null) + return new ColGroupEmpty(colIndices); + else + return new ColGroupSDC(colIndices, numRows, dict, offsets, data, cachedCounts); + } + @Override public CompressionType getCompType() { return CompressionType.SDC; @@ -85,183 +88,153 @@ public ColGroupType getColGroupType() { } @Override - protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, - double[] values) { + public double getIdx(int r, int colIdx) { + final AIterator it = _indexes.getIterator(r); final int nCol = _colIndexes.length; - final int offsetToDefault = values.length - nCol; - final AIterator it = _indexes.getIterator(rl); - - int offT = rl + offR; - int i = rl; - for(; i < ru && it.hasNext(); i++, offT++) { - final double[] c = db.values(offT); - final int off = db.pos(offT) + offC; - if(it.value() == i) { - int offset = _data.getIndex(it.getDataIndexAndIncrement()) * nCol; - for(int j = 0; j < nCol; j++) - c[off + _colIndexes[j]] += values[offset + j]; - } - else - for(int j = 0; j < nCol; j++) - c[off + _colIndexes[j]] += values[offsetToDefault + j]; - } - - for(; i < ru; i++, offT++) { - final double[] c = db.values(offT); - final int off = db.pos(offT) + offC; - for(int j = 0; j < nCol; j++) - c[off + _colIndexes[j]] += values[offsetToDefault + j]; - } - - _indexes.cacheIterator(it, ru); - } - - @Override - protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, - SparseBlock sb) { - throw new NotImplementedException(); - // final int offsetToDefault = sb.numRows() - 1; - // final int defApos = sb.pos(offsetToDefault); - // final int defAlen = sb.size(offsetToDefault) + defApos; - // final double[] defAvals = sb.values(offsetToDefault); - // final int[] defAix = sb.indexes(offsetToDefault); - // final DenseBlock db = target.getDenseBlock(); - - // int i = rl; - // AIterator it = _indexes.getIterator(rl); - // for(; i < ru && it.hasNext(); i++, offT++) { - // final double[] c = db.values(offT); - // final int off = db.pos(offT); - // if(it.value() == i) { - // int dictIndex = 
_data.getIndex(it.getDataIndexAndIncrement()); - // if(sb.isEmpty(dictIndex)) - // continue; - // final int apos = sb.pos(dictIndex); - // final int alen = sb.size(dictIndex) + apos; - // final double[] avals = sb.values(dictIndex); - // final int[] aix = sb.indexes(dictIndex); - // for(int j = apos; j < alen; j++) - // c[off + _colIndexes[aix[j]]] += avals[j]; - // } - // else - // for(int j = defApos; j < defAlen; j++) - // c[off + _colIndexes[defAix[j]]] += defAvals[j]; - // } - - // for(; i < ru; i++, offT++) { - // final double[] c = db.values(offT); - // final int off = db.pos(offT); - // for(int j = defApos; j < defAlen; j++) - // c[off + _colIndexes[defAix[j]]] += defAvals[j]; - // } - - // _indexes.cacheIterator(it, ru); + final int rowOff = it.value() == r ? _data.getIndex(it.getDataIndex()) * nCol : getNumValues() * nCol - nCol; + return _dict.getValue(rowOff + colIdx); } @Override - protected void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, - SparseBlock sb) { - throw new NotImplementedException(); - } + protected void computeRowSums(double[] c, int rl, int ru) { - @Override - protected void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, - double[] values) { - final int nCol = _colIndexes.length; - final int offsetToDefault = values.length - nCol; final AIterator it = _indexes.getIterator(rl); - - int offT = rl + offR; - int i = rl; - for(; i < ru && it.hasNext(); i++, offT++) { - // final double[] c = db.values(offT); - // final int off = db.pos(offT) + offC; - if(it.value() == i) { - int offset = _data.getIndex(it.getDataIndexAndIncrement()) * nCol; - for(int j = 0; j < nCol; j++) - ret.append(offT, _colIndexes[j] + offC, values[offset + j]); - // c[off + _colIndexes[j]] += values[offset + j]; + final int numVals = getNumValues(); + int r = rl; + final double[] vals = _dict.sumAllRowsToDouble(_colIndexes.length); + final double def = vals[numVals - 1]; + if(it != null && it.value() > ru) + _indexes.cacheIterator(it, ru); + else if(it != null && ru >= _indexes.getOffsetToLast()) { + final int maxId = _data.size() - 1; + while(true) { + if(it.value() == r) { + c[r] += vals[_data.getIndex(it.getDataIndex())]; + if(it.getDataIndex() < maxId) + it.next(); + else { + r++; + break; + } + } + else + c[r] += def; + r++; } - else - for(int j = 0; j < nCol; j++) - ret.append(offT, _colIndexes[j] + offC, values[offsetToDefault + j]); - // c[off + _colIndexes[j]] += values[offsetToDefault + j]; } - - for(; i < ru; i++, offT++) { - // final double[] c = db.values(offT); - // final int off = db.pos(offT) + offC; - for(int j = 0; j < nCol; j++) - ret.append(offT, _colIndexes[j] + offC, values[offsetToDefault + j]); - // c[off + _colIndexes[j]] += values[offsetToDefault + j]; + else if(it != null) { + while(it.isNotOver(ru)) { + if(it.value() == r) + c[r] += vals[_data.getIndex(it.getDataIndexAndIncrement())]; + else + c[r] += def; + r++; + } + _indexes.cacheIterator(it, ru); } - _indexes.cacheIterator(it, ru); + while(r < ru) { + c[r] += def; + r++; + } } @Override - public double getIdx(int r, int colIdx) { - final AIterator it = _indexes.getIterator(r); - final int nCol = _colIndexes.length; - final int rowOff = it.value() == r ? 
getIndex(it.getDataIndex()) * nCol : getNumValues() * nCol - nCol; - return _dict.getValue(rowOff + colIdx); + protected void computeRowSumsSq(double[] c, int rl, int ru) { + final double[] vals = _dict.sumAllRowsToDoubleSq(_colIndexes.length); + computeRowSumsSq(c, rl, ru, vals, _data, _indexes, _numRows); } - @Override - protected void computeRowSums(double[] c, boolean square, int rl, int ru) { - final int numVals = getNumValues(); - // // pre-aggregate nnz per value tuple - double[] vals = _dict.sumAllRowsToDouble(square, _colIndexes.length); - - int rix = rl; - AIterator it = _indexes.getIterator(rl); - for(; rix < ru && it.hasNext(); rix++) { - if(it.value() != rix) - c[rix] += vals[numVals - 1]; - else { - c[rix] += vals[_data.getIndex(it.getDataIndexAndIncrement())]; + protected static final void computeRowSumsSq(double[] c, int rl, int ru, double[] vals, AMapToData data, + AOffset indexes, int nRows) { + int r = rl; + final AIterator it = indexes.getIterator(rl); + final double def = vals[vals.length - 1]; + if(it != null && it.value() > ru) + indexes.cacheIterator(it, ru); + else if(it != null && ru >= indexes.getOffsetToLast()) { + final int maxId = data.size() - 1; + while(true) { + if(it.value() == r) { + c[r] += vals[data.getIndex(it.getDataIndex())]; + if(it.getDataIndex() < maxId) + it.next(); + else { + r++; + break; + } + } + else + c[r] += def; + r++; } } - for(; rix < ru; rix++) { - c[rix] += vals[numVals - 1]; + else if(it != null) { + while(r < ru) { + if(it.value() == r) + c[r] += vals[data.getIndex(it.getDataIndexAndIncrement())]; + else + c[r] += def; + r++; + } + indexes.cacheIterator(it, ru); } + while(r < ru) { + c[r] += def; + r++; + } } @Override protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { - final int numVals = getNumValues(); - final double[] vals = _dict.aggregateTuples(builtin, _colIndexes.length); - final AIterator it = _indexes.getIterator(rl); - int rix = rl; + final double[] vals = _dict.aggregateRows(builtin, _colIndexes.length); + computeRowMxx(c, builtin, rl, ru, vals, _data, _indexes, _numRows, vals[vals.length - 1]); + } - for(; rix < ru && it.hasNext(); rix++) { - if(it.value() != rix) - c[rix] = builtin.execute(c[rix], vals[numVals - 1]); - else - c[rix] = builtin.execute(c[rix], vals[_data.getIndex(it.getDataIndexAndIncrement())]); + protected static final void computeRowMxx(double[] c, Builtin builtin, int rl, int ru, double[] vals, + AMapToData data, AOffset indexes, int nRows, double def) { + int r = rl; + final AIterator it = indexes.getIterator(rl); + if(it != null && it.value() > ru) + indexes.cacheIterator(it, ru); + else if(it != null && ru >= indexes.getOffsetToLast()) { + final int maxId = data.size() - 1; + while(true) { + if(it.value() == r) { + c[r] = builtin.execute(c[r], vals[data.getIndex(it.getDataIndex())]); + if(it.getDataIndex() < maxId) + it.next(); + else { + r++; + break; + } + } + else + c[r] = builtin.execute(c[r], def); + r++; + } + } + else if(it != null) { + while(r < ru) { + if(it.value() == r) + c[r] = builtin.execute(c[r], vals[data.getIndex(it.getDataIndexAndIncrement())]); + else + c[r] = builtin.execute(c[r], def); + r++; + } + indexes.cacheIterator(it, ru); } - // cover remaining rows with default value - for(; rix < ru; rix++) - c[rix] = builtin.execute(c[rix], vals[numVals - 1]); + while(r < ru) { + c[r] = builtin.execute(c[r], def); + r++; + } } @Override public int[] getCounts(int[] counts) { - final int nonDefaultLength = _data.size(); - // final AIterator it = 
_indexes.getIterator(); - final int defaults = _numRows - nonDefaultLength; - for(int i = 0; i < nonDefaultLength; i++) - counts[_data.getIndex(i)]++; - - counts[counts.length - 1] += defaults; - - return counts; - } - - public int getIndex(int r) { - return _data.getIndex(r); + return _data.getCounts(counts, _numRows); } @Override @@ -274,19 +247,19 @@ public long estimateInMemorySize() { @Override public AColGroup scalarOperation(ScalarOperator op) { - return new ColGroupSDC(_colIndexes, _numRows, applyScalarOp(op), _indexes, _data, getCachedCounts()); + return create(_colIndexes, _numRows, _dict.applyScalarOp(op), _indexes, _data, getCachedCounts()); } @Override public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) { ADictionary ret = _dict.binOpLeft(op, v, _colIndexes); - return new ColGroupSDC(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); + return create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); } @Override public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) { ADictionary ret = _dict.binOpRight(op, v, _colIndexes); - return new ColGroupSDC(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); + return create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); } @Override @@ -311,40 +284,17 @@ public long getExactSizeOnDisk() { return ret; } - public ColGroupSDCZeros extractCommon(double[] constV) { + @Override + public AColGroup extractCommon(double[] constV) { double[] commonV = _dict.getTuple(getNumValues() - 1, _colIndexes.length); if(commonV == null) // The common tuple was all zero. Therefore this column group should never have been SDC. - return new ColGroupSDCZeros(_colIndexes, _numRows, _dict, _indexes, _data, getCounts()); + return ColGroupSDCZeros.create(_colIndexes, _numRows, _dict, _indexes, _data, getCounts()); for(int i = 0; i < _colIndexes.length; i++) constV[_colIndexes[i]] += commonV[i]; ADictionary subtractedDict = _dict.subtractTuple(commonV); - return new ColGroupSDCZeros(_colIndexes, _numRows, subtractedDict, _indexes, _data, getCounts()); - } - - @Override - public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) { - // This method should not be called since if there is a matrix multiplication - // the default value is transformed to be zero, and this column group would be allocated as a - // SDC Zeros version - throw new DMLCompressionException("This method should never be called"); - } - - @Override - public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result) { - // This method should not be called since if there is a matrix multiplication - // the default value is transformed to be zero, and this column group would be allocated as a - // SDC Zeros version - throw new DMLCompressionException("This method should never be called"); - } - - @Override - public void tsmmAColGroup(AColGroup other, MatrixBlock result) { - // This method should not be called since if there is a matrix multiplication - // the default value is transformed to be zero, and this column group would be allocated as a - // SDC Zeros version - throw new DMLCompressionException("This method should never be called"); + return ColGroupSDCZeros.create(_colIndexes, _numRows, subtractedDict, _indexes, _data, getCounts()); } @Override diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java index cb123eca99c..c3f19c5ddad 
100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java @@ -23,16 +23,11 @@ import java.io.DataOutput; import java.io.IOException; -import org.apache.commons.lang.NotImplementedException; -import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; import org.apache.sysds.runtime.compress.colgroup.offset.AIterator; import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory; -import org.apache.sysds.runtime.data.DenseBlock; -import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.functionobjects.Builtin; -import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.operators.BinaryOperator; import org.apache.sysds.runtime.matrix.operators.ScalarOperator; @@ -44,7 +39,7 @@ * This column group is handy in cases where sparse unsafe operations is executed on very sparse columns. Then the zeros * would be materialized in the group without any overhead. */ -public class ColGroupSDCSingle extends AColGroupValue { +public class ColGroupSDCSingle extends AMorphingMMColGroup { private static final long serialVersionUID = 3883228464052204200L; /** * Sparse row indexes for the data @@ -76,126 +71,185 @@ public ColGroupType getColGroupType() { return ColGroupType.SDCSingle; } - @Override - protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, - double[] values) { - final int nCol = _colIndexes.length; - final int offsetToDefault = values.length - nCol; - final AIterator it = _indexes.getIterator(rl); - - int offT = rl + offR; - int i = rl; - for(; i < ru && it.hasNext(); i++, offT++) { - final double[] c = db.values(offT); - final int off = db.pos(offT) + offC; - if(it.value() == i) { - for(int j = 0; j < nCol; j++) - c[off + _colIndexes[j]] += values[j]; - it.next(); - } - else - for(int j = 0; j < nCol; j++) - c[off + _colIndexes[j]] += values[offsetToDefault + j]; - } - - for(; i < ru; i++, offT++) { - final double[] c = db.values(offT); - final int off = db.pos(offT) + offC; - for(int j = 0; j < nCol; j++) - c[off + _colIndexes[j]] += values[offsetToDefault + j]; - } + // @Override + // protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, + // double[] values) { + // final int nCol = _colIndexes.length; + // final int offsetToDefault = values.length - nCol; + // final AIterator it = _indexes.getIterator(rl); + + // int offT = rl + offR; + // int i = rl; + // for(; i < ru && it.hasNext(); i++, offT++) { + // final double[] c = db.values(offT); + // final int off = db.pos(offT) + offC; + // if(it.value() == i) { + // for(int j = 0; j < nCol; j++) + // c[off + _colIndexes[j]] += values[j]; + // it.next(); + // } + // else + // for(int j = 0; j < nCol; j++) + // c[off + _colIndexes[j]] += values[offsetToDefault + j]; + // } + + // for(; i < ru; i++, offT++) { + // final double[] c = db.values(offT); + // final int off = db.pos(offT) + offC; + // for(int j = 0; j < nCol; j++) + // c[off + _colIndexes[j]] += values[offsetToDefault + j]; + // } + + // _indexes.cacheIterator(it, ru); + // } + + // @Override + // protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, + // SparseBlock values) { + // throw new 
NotImplementedException(); + // } + + // @Override + // protected void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, + // SparseBlock sb) { + // throw new NotImplementedException(); + // } + + // @Override + // protected void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, + // double[] values) { + // final int nCol = _colIndexes.length; + // final int offsetToDefault = values.length - nCol; + // final AIterator it = _indexes.getIterator(rl); + + // int offT = rl + offR; + // int i = rl; + // for(; i < ru && it.hasNext(); i++, offT++) { + // if(it.value() == i) { + // for(int j = 0; j < nCol; j++) + // ret.append(offT, _colIndexes[j] + offC, values[j]); + // it.next(); + // } + // else + // for(int j = 0; j < nCol; j++) + // ret.append(offT, _colIndexes[j] + offC, values[offsetToDefault + j]); + // } + + // for(; i < ru; i++, offT++) + // for(int j = 0; j < nCol; j++) + // ret.append(offT, _colIndexes[j] + offC, values[offsetToDefault + j]); + + // _indexes.cacheIterator(it, ru); + // } - _indexes.cacheIterator(it, ru); + @Override + public double getIdx(int r, int colIdx) { + AIterator it = _indexes.getIterator(r); + if(it.value() == r) + return _dict.getValue(colIdx); + else + return _dict.getValue(_colIndexes.length + colIdx); } @Override - protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, - SparseBlock values) { - throw new NotImplementedException(); + protected void computeRowSums(double[] c, int rl, int ru) { + final double[] vals = _dict.sumAllRowsToDouble(_colIndexes.length); + computeRowSums(c, rl, ru, vals); } @Override - protected void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, - SparseBlock sb) { - throw new NotImplementedException(); + protected void computeRowSumsSq(double[] c, int rl, int ru) { + final double[] vals = _dict.sumAllRowsToDoubleSq(_colIndexes.length); + computeRowSums(c, rl, ru, vals); } - @Override - protected void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, - double[] values) { - final int nCol = _colIndexes.length; - final int offsetToDefault = values.length - nCol; + protected void computeRowSums(double[] c, int rl, int ru, double[] vals) { + int r = rl; final AIterator it = _indexes.getIterator(rl); - - int offT = rl + offR; - int i = rl; - for(; i < ru && it.hasNext(); i++, offT++) { - if(it.value() == i) { - for(int j = 0; j < nCol; j++) - ret.append(offT, _colIndexes[j] + offC, values[j]); - it.next(); + final double def = vals[1]; + final double norm = vals[0]; + if(it != null && it.value() > ru) + _indexes.cacheIterator(it, ru); + else if(it != null && ru >= _indexes.getOffsetToLast()) { + final int maxOff = _indexes.getOffsetToLast(); + while(true) { + if(it.value() == r) { + c[r] += norm; + if(it.value() < maxOff) + it.next(); + else { + r++; + break; + } + } + else + c[r] += def; + r++; } - else - for(int j = 0; j < nCol; j++) - ret.append(offT, _colIndexes[j] + offC, values[offsetToDefault + j]); + } + else if(it != null) { + while(r < ru) { + if(it.value() == r) + c[r] += norm; + else + c[r] += def; + r++; + } + _indexes.cacheIterator(it, ru); } - for(; i < ru; i++, offT++) - for(int j = 0; j < nCol; j++) - ret.append(offT, _colIndexes[j] + offC, values[offsetToDefault + j]); - - _indexes.cacheIterator(it, ru); + while(r < ru) { + c[r] += def; + r++; + } } @Override - public double getIdx(int r, int colIdx) 
{ - AIterator it = _indexes.getIterator(r); - if(it.value() == r) - return _dict.getValue(colIdx); - else - return _dict.getValue(_colIndexes.length + colIdx); + protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { + final double[] vals = _dict.aggregateRows(builtin, _colIndexes.length); + computeRowMxx(c, builtin, rl, ru, _indexes, _numRows, vals[1], vals[0]); } - @Override - protected void computeRowSums(double[] c, boolean square, int rl, int ru) { - - // // pre-aggregate nnz per value tuple - final double[] vals = _dict.sumAllRowsToDouble(square, _colIndexes.length); - final AIterator it = _indexes.getIterator(); - - int rix = rl; - it.skipTo(rl); - for(; rix < ru && it.hasNext(); rix++) { - if(it.value() != rix) - c[rix] += vals[1]; - else { - c[rix] += vals[0]; - it.next(); + protected static final void computeRowMxx(double[] c, Builtin builtin, int rl, int ru, AOffset indexes, int nRows, + double def, double norm) { + int r = rl; + final AIterator it = indexes.getIterator(rl); + if(it != null && it.value() > ru) + indexes.cacheIterator(it, ru); + else if(it != null && ru >= indexes.getOffsetToLast()) { + final int maxOff = indexes.getOffsetToLast(); + while(true) { + if(it.value() == r) { + c[r] = builtin.execute(c[r], norm); + if(it.value() < maxOff) + it.next(); + else { + r++; + break; + } + } + else + c[r] = builtin.execute(c[r], def); + r++; } } - for(; rix < ru; rix++) { - c[rix] += vals[1]; - } - } - - @Override - protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { - final double[] vals = _dict.aggregateTuples(builtin, _colIndexes.length); - final AIterator it = _indexes.getIterator(rl); - int rix = rl; - - for(; rix < ru && it.hasNext(); rix++) { - if(it.value() != rix) - c[rix] = builtin.execute(c[rix], vals[1]); - else { - c[rix] = builtin.execute(c[rix], vals[0]); - it.next(); + else if(it != null) { + while(r < ru) { + if(it.value() == r) { + c[r] = builtin.execute(c[r], norm); + it.next(); + } + else + c[r] = builtin.execute(c[r], def); + r++; } + indexes.cacheIterator(it, ru); } - // cover remaining rows with default value - for(; rix < ru; rix++) - c[rix] = builtin.execute(c[rix], vals[1]); + while(r < ru) { + c[r] = builtin.execute(c[r], def); + r++; + } } @Override @@ -214,7 +268,7 @@ public long estimateInMemorySize() { @Override public AColGroup scalarOperation(ScalarOperator op) { - return new ColGroupSDCSingle(_colIndexes, _numRows, applyScalarOp(op), _indexes, getCachedCounts()); + return new ColGroupSDCSingle(_colIndexes, _numRows, _dict.applyScalarOp(op), _indexes, getCachedCounts()); } @Override @@ -248,6 +302,7 @@ public long getExactSizeOnDisk() { return ret; } + @Override public ColGroupSDCSingleZeros extractCommon(double[] constV) { double[] commonV = _dict.getTuple(getNumValues() - 1, _colIndexes.length); @@ -261,30 +316,6 @@ public ColGroupSDCSingleZeros extractCommon(double[] constV) { return new ColGroupSDCSingleZeros(_colIndexes, _numRows, subtractedDict, _indexes, getCachedCounts()); } - @Override - public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) { - // This method should not be called since if there is a matrix multiplication - // the default value is transformed to be zero, and this column group would be allocated as a - // SDC Zeros version - throw new DMLCompressionException("This method should never be called"); - } - - @Override - public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result) { - // This method should not be called since if there is a 
matrix multiplication - // the default value is transformed to be zero, and this column group would be allocated as a - // SDC Zeros version - throw new DMLCompressionException("This method should never be called"); - } - - @Override - public void tsmmAColGroup(AColGroup other, MatrixBlock result) { - // This method should not be called since if there is a matrix multiplication - // the default value is transformed to be zero, and this column group would be allocated as a - // SDC Zeros version - throw new DMLCompressionException("This method should never be called"); - } - @Override public String toString() { StringBuilder sb = new StringBuilder(); diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java index d8edd0d3c7c..534856735b4 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java @@ -46,9 +46,7 @@ */ public class ColGroupSDCSingleZeros extends APreAgg { private static final long serialVersionUID = 8033235615964315078L; - /** - * Sparse row indexes for the data - */ + /** Sparse row indexes for the data */ protected transient AOffset _indexes; /** @@ -80,64 +78,129 @@ public ColGroupType getColGroupType() { @Override protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, double[] values) { - final int nCol = _colIndexes.length; + final AIterator it = _indexes.getIterator(rl); - while(it.hasNext() && it.value() < ru) { - final int row = offR + it.value(); - final double[] c = db.values(row); - final int off = db.pos(row) + offC; - for(int j = 0; j < nCol; j++) - c[off + _colIndexes[j]] += values[j]; - - it.next(); + if(it == null) + return; + else if(it.value() >= ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + final int maxOff = _indexes.getOffsetToLast(); + final int nCol = _colIndexes.length; + while(true) { + final int row = offR + it.value(); + final double[] c = db.values(row); + final int off = db.pos(row); + for(int j = 0; j < nCol; j++) + c[off + _colIndexes[j] + offC] += values[j]; + if(it.value() < maxOff) + it.next(); + else + break; + } } - _indexes.cacheIterator(it, ru); + else { + final int nCol = _colIndexes.length; + while(it.isNotOver(ru)) { + final int row = offR + it.value(); + final double[] c = db.values(row); + final int off = db.pos(row); + for(int j = 0; j < nCol; j++) + c[off + _colIndexes[j] + offC] += values[j]; + + it.next(); + } + _indexes.cacheIterator(it, ru); + } + } @Override protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, - SparseBlock values) { - throw new NotImplementedException(); - // final int offTCorr = offT - rl; - // final DenseBlock db = target.getDenseBlock(); - // final int apos = values.pos(0); - // final int alen = values.size(0) + apos; - // final int[] aix = values.indexes(0); - // final double[] avals = values.values(0); - - // AIterator it = _indexes.getIterator(rl); - // while(it.hasNext() && it.value() < ru) { - // final int idx = offTCorr + it.value(); - // final double[] c = db.values(idx); - // final int off = db.pos(idx); - - // for(int j = apos; j < alen; j++) - // c[off + _colIndexes[aix[j]]] += avals[j]; - - // it.next(); - // } - - // _indexes.cacheIterator(it, ru); + SparseBlock sb) { + final AIterator it = 
_indexes.getIterator(rl); + if(it == null) + return; + else if(it.value() >= ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + throw new NotImplementedException(); + } + else { + final int apos = sb.pos(0); + final int alen = sb.size(0) + apos; + final int[] aix = sb.indexes(0); + final double[] avals = sb.values(0); + while(it.isNotOver(ru)) { + final int row = offR + it.value(); + final double[] c = db.values(row); + final int off = db.pos(row); + for(int j = apos; j < alen; j++) + c[off + _colIndexes[aix[j]] + offC] += avals[j]; + it.next(); + } + _indexes.cacheIterator(it, ru); + } } @Override protected void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, SparseBlock sb) { - throw new NotImplementedException(); + final AIterator it = _indexes.getIterator(rl); + if(it == null) + return; + else if(it.value() >= ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + throw new NotImplementedException(); + } + else { + final int apos = sb.pos(0); + final int alen = sb.size(0) + apos; + final int[] aix = sb.indexes(0); + final double[] avals = sb.values(0); + while(it.isNotOver(ru)) { + final int row = offR + it.value(); + for(int j = apos; j < alen; j++) + ret.append(row, _colIndexes[aix[j]] + offC, avals[j]); + + it.next(); + } + _indexes.cacheIterator(it, ru); + } } @Override protected void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, double[] values) { - final int nCol = _colIndexes.length; final AIterator it = _indexes.getIterator(rl); - while(it.hasNext() && it.value() < ru) { - final int row = offR + it.value(); - for(int j = 0; j < nCol; j++) - ret.append(row, _colIndexes[j] + offC, values[j]); - it.next(); + if(it == null) + return; + else if(it.value() >= ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + final int nCol = _colIndexes.length; + final int lastOff = _indexes.getOffsetToLast(); + while(true) { + final int row = offR + it.value(); + for(int j = 0; j < nCol; j++) + ret.append(row, _colIndexes[j] + offC, values[j]); + if(it.value() == lastOff) + return; + it.next(); + } + } + else { + final int nCol = _colIndexes.length; + while(it.isNotOver(ru)) { + final int row = offR + it.value(); + for(int j = 0; j < nCol; j++) + ret.append(row, _colIndexes[j] + offC, values[j]); + + it.next(); + } + _indexes.cacheIterator(it, ru); } - _indexes.cacheIterator(it, ru); } @Override @@ -150,34 +213,45 @@ public double getIdx(int r, int colIdx) { } @Override - protected void computeRowSums(double[] c, boolean square, int rl, int ru) { - final double vals = _dict.sumAllRowsToDouble(square, _colIndexes.length)[0]; - final AIterator it = _indexes.getIterator(rl); - while(it.hasNext() && it.value() < ru) { - c[it.value()] += vals; - it.next(); - } - + protected void computeRowSums(double[] c, int rl, int ru) { + final double def = _dict.sumAllRowsToDouble(_colIndexes.length)[0]; + computeRowSum(c, rl, ru, def); } @Override - protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { - final double vals = _dict.aggregateTuples(builtin, _colIndexes.length)[0]; - final AIterator it = _indexes.getIterator(rl); - int rix = rl; + protected void computeRowSumsSq(double[] c, int rl, int ru) { + final double def = _dict.sumAllRowsToDoubleSq(_colIndexes.length)[0]; + computeRowSum(c, rl, ru, def); + } - for(; rix < ru && it.hasNext(); rix++) { - if(it.value() != rix) - c[rix] = 
builtin.execute(c[rix], 0); - else { - c[rix] = builtin.execute(c[rix], vals); + protected void computeRowSum(double[] c, int rl, int ru, double def) { + final AIterator it = _indexes.getIterator(rl); + if(it == null) + return; + else if(it.value() > ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + final int maxOff = _indexes.getOffsetToLast(); + while(true) { + c[it.value()] += def; + if(it.value() == maxOff) + break; + it.next(); + } + } + else { + while(it.isNotOver(ru)) { + c[it.value()] += def; it.next(); } + _indexes.cacheIterator(it, ru); } + } - // cover remaining rows - for(; rix < ru; rix++) - c[rix] = builtin.execute(c[rix], 0); + @Override + protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { + final double[] vals = _dict.aggregateRows(builtin, _colIndexes.length); + ColGroupSDCSingle.computeRowMxx(c, builtin, rl, ru, _indexes, _numRows, 0, vals[0]); } @Override @@ -197,66 +271,88 @@ public void preAggregate(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) { @Override public void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru, int cl, int cu) { - final double[] mV = m.getDenseBlockValues(); - final double[] preAV = preAgg.getDenseBlockValues(); - final int numVals = getNumValues(); - final int blockSize = 2000; - for(int block = cl; block < cu; block += blockSize) { - final int blockEnd = Math.min(block + blockSize, cu); - final AIterator itStart = _indexes.getIterator(block); - AIterator it; - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) { - final int offLeft = rowLeft * _numRows; - it = itStart.clone(); - while(it.value() < blockEnd && it.hasNext()) { - final int i = it.value(); - preAV[offOut] += mV[offLeft + i]; - it.next(); - } + + final AIterator it = _indexes.getIterator(cl); + if(it == null) + return; + else if(it.value() > cu) + _indexes.cacheIterator(it, cu); + else if(rl == ru - 1) { + final int maxOff = _indexes.getOffsetToLast(); + final double[] mV = m.getDenseBlockValues(); + final double[] preAV = preAgg.getDenseBlockValues(); + final int offLeft = rl * _numRows; + while(true) { + final int i = it.value(); + preAV[0] += mV[offLeft + i]; + if(i == maxOff) + break; + it.next(); } } + else + throw new NotImplementedException(); + } private void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) { - final double[] preAV = preAgg.getDenseBlockValues(); - final double[] mV = m.getDenseBlockValues(); - final int numVals = getNumValues(); - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) { - final AIterator it = _indexes.getIterator(); - final int offLeft = rowLeft * _numRows; - while(it.hasNext()) { - final int i = it.value(); - preAV[offOut] += mV[offLeft + i]; + final AIterator it = _indexes.getIterator(); + if(rl == ru - 1) { + double ret = 0; + final DenseBlock db = m.getDenseBlock(); + final double[] mV = db.values(rl); + final int off = db.pos(rl); + final int offsetToLast = _indexes.getOffsetToLast(); + while(true) { + ret += mV[off + it.value()]; + if(it.value() == offsetToLast) + break; it.next(); } + + preAgg.setValue(0, 0, ret); } + else + throw new NotImplementedException(); } private void preAggregateSparse(SparseBlock sb, MatrixBlock preAgg, int rl, int ru) { - final double[] preAV = preAgg.getDenseBlockValues(); - final int numVals = getNumValues(); - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) { - if(sb.isEmpty(rowLeft)) - continue; - final AIterator it = 
_indexes.getIterator(); - final int apos = sb.pos(rowLeft); - final int alen = sb.size(rowLeft) + apos; - final int[] aix = sb.indexes(rowLeft); - final double[] avals = sb.values(rowLeft); + final AIterator it = _indexes.getIterator(); + if(rl == ru - 1) { + final int apos = sb.pos(rl); + final int alen = sb.size(rl) + apos; + final int[] aix = sb.indexes(rl); + final double[] avals = sb.values(rl); + final int offsetToLast = _indexes.getOffsetToLast(); + + double ret = 0; int j = apos; - while(it.hasNext() && j < alen) { - final int index = aix[j]; - final int v = it.value(); - if(index < v) - j++; - else if(index == v) { - preAV[offOut] += avals[j++]; + + while(true) { + final int idx = aix[j]; + + if(idx == it.value()) { + ret += avals[j++]; + if(j >= alen || it.value() >= offsetToLast) + break; it.next(); } - else + else if(idx < it.value()) { + j++; + if(j >= alen) + break; + } + else { + if(it.value() >= offsetToLast) + break; it.next(); + } } + + preAgg.setValue(0, 0, ret); } + else + throw new NotImplementedException(); } @Override @@ -271,9 +367,9 @@ public AColGroup scalarOperation(ScalarOperator op) { double val0 = op.executeScalar(0); boolean isSparseSafeOp = op.sparseSafe || val0 == 0; if(isSparseSafeOp) - return new ColGroupSDCSingleZeros(_colIndexes, _numRows, applyScalarOp(op), _indexes, getCachedCounts()); + return new ColGroupSDCSingleZeros(_colIndexes, _numRows, _dict.applyScalarOp(op), _indexes, getCachedCounts()); else { - ADictionary aDictionary = applyScalarOp(op, val0, getNumCols());// swapEntries(); + ADictionary aDictionary = _dict.applyScalarOp(op, val0, getNumCols());// swapEntries(); // ADictionary aDictionary = applyScalarOp(op, val0, getNumCols()); return new ColGroupSDCSingle(_colIndexes, _numRows, aDictionary, _indexes, null); } @@ -336,10 +432,15 @@ public boolean sameIndexStructure(AColGroupCompressed that) { public void preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) { final AIterator itThis = _indexes.getIterator(); final int nCol = that._colIndexes.length; - while(itThis.hasNext()) { + final int finalOffThis = _indexes.getOffsetToLast(); + + while(true) { final int fr = that._data.getIndex(itThis.value()); that._dict.addToEntry(ret, fr, 0, nCol); - itThis.next(); + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); } } @@ -348,26 +449,69 @@ public void preAggregateThatSDCZerosStructure(ColGroupSDCZeros that, Dictionary final AIterator itThat = that._indexes.getIterator(); final AIterator itThis = _indexes.getIterator(); final int nCol = that._colIndexes.length; - - while(itThat.hasNext() && itThis.hasNext()) { - final int v = itThat.value(); - if(v == itThis.skipTo(v)) - that._dict.addToEntry(ret, that.getIndex(itThat.getDataIndex()), 0, nCol); - - itThat.next(); + final int finalOffThis = _indexes.getOffsetToLast(); + final int finalOffThat = that._indexes.getOffsetToLast(); + + while(true) { + if(itThat.value() == itThis.value()) { + that._dict.addToEntry(ret, that._data.getIndex(itThat.getDataIndex()), 0, nCol); + if(itThat.value() >= finalOffThat) + break; + else + itThat.next(); + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); + } + else if(itThat.value() < itThis.value()) { + if(itThat.value() >= finalOffThat) + break; + else + itThat.next(); + } + else { + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); + } } } @Override public void preAggregateThatSDCSingleZerosStructure(ColGroupSDCSingleZeros that, Dictionary ret) { + final int nCol = that._colIndexes.length; final 
AIterator itThat = that._indexes.getIterator(); final AIterator itThis = _indexes.getIterator(); - final int nCol = that._colIndexes.length; - while(itThat.hasNext()) { - final int v = itThat.value(); - if(v == itThis.skipTo(v)) + final int finalOffThis = _indexes.getOffsetToLast(); + final int finalOffThat = that._indexes.getOffsetToLast(); + + while(true) { + if(itThat.value() == itThis.value()) { that._dict.addToEntry(ret, 0, 0, nCol); - itThat.next(); + if(itThat.value() >= finalOffThat) + break; + else + itThat.next(); + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); + } + else if(itThat.value() < itThis.value()) { + if(itThat.value() >= finalOffThat) + break; + else + itThat.next(); + } + else { + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); + } + } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java index a7632dd70ad..8fa9887b2f5 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java @@ -23,13 +23,9 @@ import java.io.DataOutput; import java.io.IOException; -import org.apache.commons.lang.NotImplementedException; -import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary; import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData; -import org.apache.sysds.runtime.compress.colgroup.mapping.MapToByte; -import org.apache.sysds.runtime.compress.colgroup.mapping.MapToChar; import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory; import org.apache.sysds.runtime.compress.colgroup.offset.AIterator; import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; @@ -37,6 +33,7 @@ import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.functionobjects.Builtin; +import org.apache.sysds.runtime.functionobjects.Plus; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.operators.BinaryOperator; import org.apache.sysds.runtime.matrix.operators.ScalarOperator; @@ -72,19 +69,20 @@ protected ColGroupSDCZeros(int numRows) { super(numRows); } - protected ColGroupSDCZeros(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data) { - super(colIndices, numRows, dict, null); + private ColGroupSDCZeros(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data, + int[] cachedCounts) { + super(colIndices, numRows, dict, cachedCounts); _indexes = offsets; _data = data; _zeros = true; } - protected ColGroupSDCZeros(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data, + protected static AColGroup create(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data, int[] cachedCounts) { - super(colIndices, numRows, dict, cachedCounts); - _indexes = offsets; - _data = data; - _zeros = true; + if(dict == null) + return new ColGroupEmpty(colIndices); + else + return new ColGroupSDCZeros(colIndices, numRows, dict, offsets, data, cachedCounts); } @Override @@ -100,129 +98,256 @@ public ColGroupType getColGroupType() { @Override protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, double[] 
values) { - final int nCol = _colIndexes.length; - AIterator it = _indexes.getIterator(rl); - while(it.hasNext() && it.value() < ru) { - final int idx = offR + it.value(); - final double[] c = db.values(idx); - final int off = db.pos(idx) + offC; - final int offDict = getIndex(it.getDataIndexAndIncrement()) * nCol; - for(int j = 0; j < nCol; j++) - c[off + _colIndexes[j]] += values[offDict + j]; + final AIterator it = _indexes.getIterator(rl); + if(it == null) + return; + if(it.value() >= ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + final int lastOff = _indexes.getOffsetToLast(); + final int nCol = _colIndexes.length; + while(true) { + final int idx = offR + it.value(); + final double[] c = db.values(idx); + final int off = db.pos(idx) + offC; + final int offDict = _data.getIndex(it.getDataIndex()) * nCol; + for(int j = 0; j < nCol; j++) + c[off + _colIndexes[j]] += values[offDict + j]; + if(it.value() == lastOff) + return; + it.next(); + } + } + else { + + final int nCol = _colIndexes.length; + while(it.isNotOver(ru)) { + final int idx = offR + it.value(); + final double[] c = db.values(idx); + final int off = db.pos(idx) + offC; + final int offDict = _data.getIndex(it.getDataIndex()) * nCol; + for(int j = 0; j < nCol; j++) + c[off + _colIndexes[j]] += values[offDict + j]; + it.next(); + } + _indexes.cacheIterator(it, ru); } - _indexes.cacheIterator(it, ru); + } @Override protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, SparseBlock sb) { - AIterator it = _indexes.getIterator(rl); - while(it.hasNext() && it.value() < ru) { - final int idx = offR + it.value(); - final int dictIndex = getIndex(it.getDataIndexAndIncrement()); - if(sb.isEmpty(dictIndex)) - continue; - - final double[] c = db.values(idx); - final int off = db.pos(idx) + offC; - final int apos = sb.pos(dictIndex); - final int alen = sb.size(dictIndex) + apos; - final double[] avals = sb.values(dictIndex); - final int[] aix = sb.indexes(dictIndex); - for(int j = apos; j < alen; j++) - c[off + _colIndexes[aix[j]]] += avals[j]; + final AIterator it = _indexes.getIterator(rl); + if(it == null) + return; + else if(it.value() >= ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + final int lastOff = _indexes.getOffsetToLast(); + while(true) { + final int idx = offR + it.value(); + final double[] c = db.values(idx); + final int dx = it.getDataIndex(); + final int dictIndex = _data.getIndex(dx); + if(sb.isEmpty(dictIndex)) { + if(it.value() == lastOff) + return; + it.next(); + continue; + } + + final int off = db.pos(idx) + offC; + final int apos = sb.pos(dictIndex); + final int alen = sb.size(dictIndex) + apos; + final double[] avals = sb.values(dictIndex); + final int[] aix = sb.indexes(dictIndex); + for(int j = apos; j < alen; j++) + c[off + _colIndexes[aix[j]]] += avals[j]; + if(it.value() == lastOff) + return; + it.next(); + } + } + else { + while(it.isNotOver(ru)) { + final int idx = offR + it.value(); + final int dx = it.getDataIndex(); + final int dictIndex = _data.getIndex(dx); + if(sb.isEmpty(dictIndex)) { + it.next(); + continue; + } + + final double[] c = db.values(idx); + final int off = db.pos(idx) + offC; + final int apos = sb.pos(dictIndex); + final int alen = sb.size(dictIndex) + apos; + final double[] avals = sb.values(dictIndex); + final int[] aix = sb.indexes(dictIndex); + for(int j = apos; j < alen; j++) + c[off + _colIndexes[aix[j]]] += avals[j]; + + it.next(); + } + 
_indexes.cacheIterator(it, ru); } - _indexes.cacheIterator(it, ru); } @Override protected void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, SparseBlock sb) { - AIterator it = _indexes.getIterator(rl); - while(it.hasNext() && it.value() < ru) { - final int row = offR + it.value(); - final int dictIndex = getIndex(it.getDataIndexAndIncrement()); - if(sb.isEmpty(dictIndex)) - continue; - - final int apos = sb.pos(dictIndex); - final int alen = sb.size(dictIndex) + apos; - final double[] avals = sb.values(dictIndex); - final int[] aix = sb.indexes(dictIndex); - for(int j = apos; j < alen; j++) - ret.append(row, _colIndexes[aix[j]] + offC, avals[j] ); + final AIterator it = _indexes.getIterator(rl); + if(it == null) + return; + else if(it.value() >= ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + final int lastOff = _indexes.getOffsetToLast(); + while(true) { + final int row = offR + it.value(); + final int dx = it.getDataIndex(); + final int dictIndex = _data.getIndex(dx); + if(sb.isEmpty(dictIndex)) { + if(it.value() == lastOff) + return; + it.next(); + continue; + } + + final int apos = sb.pos(dictIndex); + final int alen = sb.size(dictIndex) + apos; + final double[] avals = sb.values(dictIndex); + final int[] aix = sb.indexes(dictIndex); + for(int j = apos; j < alen; j++) + ret.append(row, _colIndexes[aix[j]] + offC, avals[j]); + if(it.value() == lastOff) + return; + it.next(); + } + } + else { + while(it.isNotOver(ru)) { + final int row = offR + it.value(); + final int dx = it.getDataIndex(); + final int dictIndex = _data.getIndex(dx); + if(sb.isEmpty(dictIndex)) { + it.next(); + continue; + } + + final int apos = sb.pos(dictIndex); + final int alen = sb.size(dictIndex) + apos; + final double[] avals = sb.values(dictIndex); + final int[] aix = sb.indexes(dictIndex); + for(int j = apos; j < alen; j++) + ret.append(row, _colIndexes[aix[j]] + offC, avals[j]); + it.next(); + } + _indexes.cacheIterator(it, ru); } - _indexes.cacheIterator(it, ru); } @Override protected void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, double[] values) { - final int nCol = _colIndexes.length; + // LOG.error(ret); + final AIterator it = _indexes.getIterator(rl); + if(it == null) + return; + else if(it.value() >= ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + final int lastOff = _indexes.getOffsetToLast(); + final int nCol = _colIndexes.length; + while(true) { + final int row = offR + it.value(); + final int dx = it.getDataIndex(); + final int offDict = _data.getIndex(dx) * nCol; + for(int j = 0; j < nCol; j++) + ret.append(row, _colIndexes[j] + offC, values[offDict + j]); + if(it.value() == lastOff) + return; + it.next(); + } + } + else { + + final int nCol = _colIndexes.length; + while(it.isNotOver(ru)) { + final int row = offR + it.value(); + final int dx = it.getDataIndex(); + final int offDict = _data.getIndex(dx) * nCol; + for(int j = 0; j < nCol; j++) + ret.append(row, _colIndexes[j] + offC, values[offDict + j]); - AIterator it = _indexes.getIterator(rl); - while(it.hasNext() && it.value() < ru) { - final int row = offR + it.value(); - final int offDict = getIndex(it.getDataIndexAndIncrement()) * nCol; - for(int j = 0; j < nCol; j++) - ret.append(row, _colIndexes[j] + offC, values[offDict + j]); + it.next(); + } + _indexes.cacheIterator(it, ru); } - _indexes.cacheIterator(it, ru); + } @Override public double getIdx(int r, int colIdx) { 
final AIterator it = _indexes.getIterator(r); + if(it == null || it.value() != r) + return 0; final int nCol = _colIndexes.length; - if(it.value() == r) - return _dict.getValue(getIndex(it.getDataIndex()) * nCol + colIdx); - else - return 0.0; + return _dict.getValue(_data.getIndex(it.getDataIndex()) * nCol + colIdx); } @Override - protected void computeRowSums(double[] c, boolean square, int rl, int ru) { - final double[] vals = _dict.sumAllRowsToDouble(square, _colIndexes.length); - final AIterator it = _indexes.getIterator(rl); - while(it.hasNext() && it.value() < ru) - c[it.value()] += vals[getIndex(it.getDataIndexAndIncrement())]; + protected void computeRowSums(double[] c, int rl, int ru) { + final double[] vals = _dict.sumAllRowsToDouble(_colIndexes.length); + computeRowSums(c, rl, ru, vals); } @Override - protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { - final double[] vals = _dict.aggregateTuples(builtin, _colIndexes.length); - final AIterator it = _indexes.getIterator(rl); - int rix = rl; - - for(; rix < ru && it.hasNext(); rix++) { - if(it.value() != rix) - c[rix] = builtin.execute(c[rix], 0); - else - c[rix] = builtin.execute(c[rix], vals[_data.getIndex(it.getDataIndexAndIncrement())]); + protected void computeRowSumsSq(double[] c, int rl, int ru) { + final double[] vals = _dict.sumAllRowsToDoubleSq(_colIndexes.length); + computeRowSums(c, rl, ru, vals); + } + + protected void computeRowSums(double[] c, int rl, int ru, double[] vals) { + computeRowSums(c, rl, ru, vals, _data, _indexes, _numRows); + } + + protected static final void computeRowSums(double[] c, int rl, int ru, double[] vals, AMapToData data, + AOffset indexes, int nRows) { + final AIterator it = indexes.getIterator(rl); + if(it == null) + return; + else if(it.value() > ru) + indexes.cacheIterator(it, ru); + else if(ru >= indexes.getOffsetToLast()) { + final int maxId = data.size() - 1; + c[it.value()] += vals[data.getIndex(it.getDataIndex())]; + while(it.getDataIndex() < maxId) { + it.next(); + c[it.value()] += vals[data.getIndex(it.getDataIndex())]; + } + } + else { + while(it.isNotOver(ru)) { + c[it.value()] += vals[data.getIndex(it.getDataIndex())]; + it.next(); + } + indexes.cacheIterator(it, ru); } - - // cover remaining rows with default value - for(; rix < ru; rix++) - c[rix] = builtin.execute(c[rix], 0); } @Override - public int[] getCounts(int[] counts) { - final int nonDefaultLength = _data.size(); - // final AIterator it = _indexes.getIterator(); - final int zeros = _numRows - nonDefaultLength; - for(int i = 0; i < nonDefaultLength; i++) - counts[_data.getIndex(i)]++; - - counts[counts.length - 1] += zeros; - - return counts; + protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { + final double[] vals = _dict.aggregateRows(builtin, _colIndexes.length); + ColGroupSDC.computeRowMxx(c, builtin, rl, ru, vals, _data, _indexes, _numRows, 0); } - public int getIndex(int r) { - return _data.getIndex(r); + @Override + public int[] getCounts(int[] counts) { + return _data.getCounts(counts, _numRows); } @Override @@ -235,82 +360,11 @@ public void preAggregate(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) { @Override public void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru, int cl, int cu) { - - final int numVals = getNumValues(); - if(cl != 0 && cu != preAgg.getNumColumns()) - throw new NotImplementedException("Not implemented preAggregate of sub number of columns"); - if(_data instanceof MapToByte) - preAggregateDenseByte(m, preAgg, 
((MapToByte) _data).getBytes(), rl, ru, cl, cu, _numRows, numVals, _indexes); - else if(_data instanceof MapToChar) - preAggregateDenseChar(m, preAgg, ((MapToChar) _data).getChars(), rl, ru, cl, cu, _numRows, numVals, _indexes); - else - throw new DMLCompressionException("Unsupported map type:" + _data); - - } - - private static void preAggregateDenseByte(final MatrixBlock m, final MatrixBlock preAgg, final byte[] d, - final int rl, final int ru, final int cl, final int cu, final int nRow, final int nVal, AOffset indexes) { - final double[] preAV = preAgg.getDenseBlockValues(); - final double[] mV = m.getDenseBlockValues(); - // multi row iterator. - final AIterator itStart = indexes.getIterator(cl); - AIterator it = null; - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) { - final int offLeft = rowLeft * nRow; - it = itStart.clone(); - while(it.value() < cu && it.hasNext()) { - int i = it.value(); - int index = d[it.getDataIndexAndIncrement()] & 0xFF; - preAV[offOut + index] += mV[offLeft + i]; - } - } - if(it != null && cu < m.getNumColumns()) - indexes.cacheIterator(it, cu); - } - - private static void preAggregateDenseChar(final MatrixBlock m, final MatrixBlock preAgg, final char[] d, - final int rl, final int ru, final int cl, final int cu, final int nRow, final int nVal, AOffset indexes) { - final double[] preAV = preAgg.getDenseBlockValues(); - final double[] mV = m.getDenseBlockValues(); - // multi row iterator. - final AIterator itStart = indexes.getIterator(cl); - AIterator it = null; - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) { - final int offLeft = rowLeft * nRow; - it = itStart.clone(); - while(it.value() < cu && it.hasNext()) { - int i = it.value(); - int index = d[it.getDataIndexAndIncrement()]; - preAV[offOut + index] += mV[offLeft + i]; - } - } - if(it != null && cu < m.getNumColumns()) - indexes.cacheIterator(it, cu); + _data.preAggregateDense(m, preAgg.getDenseBlockValues(), rl, ru, cl, cu, _indexes); } private void preAggregateSparse(SparseBlock sb, MatrixBlock preAgg, int rl, int ru) { - final double[] preAV = preAgg.getDenseBlockValues(); - final int numVals = getNumValues(); - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) { - if(sb.isEmpty(rowLeft)) - continue; - final AIterator it = _indexes.getIterator(); - final int apos = sb.pos(rowLeft); - final int alen = sb.size(rowLeft) + apos; - final int[] aix = sb.indexes(rowLeft); - final double[] avals = sb.values(rowLeft); - int j = apos; - while(it.hasNext() && j < alen) { - final int index = aix[j]; - final int val = it.value(); - if(index < val) - j++; - else if(index == val) - preAV[offOut + _data.getIndex(it.getDataIndexAndIncrement())] += avals[j++]; - else - it.next(); - } - } + _data.preAggregateSparse(sb, preAgg.getDenseBlockValues(), rl, ru, _indexes); } @Override @@ -326,10 +380,10 @@ public AColGroup scalarOperation(ScalarOperator op) { double val0 = op.executeScalar(0); boolean isSparseSafeOp = op.sparseSafe || val0 == 0; if(isSparseSafeOp) - return new ColGroupSDCZeros(_colIndexes, _numRows, applyScalarOp(op), _indexes, _data, getCachedCounts()); + return create(_colIndexes, _numRows, _dict.applyScalarOp(op), _indexes, _data, getCachedCounts()); else { - ADictionary rValues = applyScalarOp(op, val0, getNumCols()); - return new ColGroupSDC(_colIndexes, _numRows, rValues, _indexes, _data, getCachedCounts()); + ADictionary rValues = _dict.applyScalarOp(op, val0, getNumCols()); + return ColGroupSDC.create(_colIndexes, 
_numRows, rValues, _indexes, _data, getCachedCounts()); } } @@ -337,11 +391,15 @@ public AColGroup scalarOperation(ScalarOperator op) { public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) { if(isRowSafe) { ADictionary ret = _dict.binOpLeft(op, v, _colIndexes); - return new ColGroupSDCZeros(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); + return create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); + } + else if(op.fn instanceof Plus) { + double[] def = ColGroupUtils.binaryDefRowLeft(op, v, _colIndexes); + return ColGroupPFOR.create(_colIndexes, _numRows, _dict, _indexes, _data, getCachedCounts(), def); } else { ADictionary ret = _dict.applyBinaryRowOpLeftAppendNewEntry(op, v, _colIndexes); - return new ColGroupSDC(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); + return ColGroupSDC.create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); } } @@ -349,11 +407,15 @@ public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSaf public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) { if(isRowSafe) { ADictionary ret = _dict.binOpRight(op, v, _colIndexes); - return new ColGroupSDCZeros(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); + return create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); + } + else if(op.fn instanceof Plus) { + double[] def = ColGroupUtils.binaryDefRowRight(op, v, _colIndexes); + return ColGroupPFOR.create(_colIndexes, _numRows, _dict, _indexes, _data, getCachedCounts(), def); } else { ADictionary ret = _dict.applyBinaryRowOpRightAppendNewEntry(op, v, _colIndexes); - return new ColGroupSDC(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); + return ColGroupSDC.create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); } } @@ -394,10 +456,15 @@ public void preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) { final AIterator itThis = _indexes.getIterator(); final int nCol = that._colIndexes.length; - while(itThis.hasNext()) { + final int finalOffThis = _indexes.getOffsetToLast(); + while(true) { final int fr = that._data.getIndex(itThis.value()); - final int to = getIndex(itThis.getDataIndexAndIncrement()); + final int to = _data.getIndex(itThis.getDataIndex()); that._dict.addToEntry(ret, fr, to, nCol); + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); } } @@ -405,17 +472,37 @@ public void preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) { public void preAggregateThatSDCZerosStructure(ColGroupSDCZeros that, Dictionary ret) { final AIterator itThat = that._indexes.getIterator(); final AIterator itThis = _indexes.getIterator(); + + final int finalOffThis = _indexes.getOffsetToLast(); + final int finalOffThat = that._indexes.getOffsetToLast(); + final int nCol = that._colIndexes.length; - while(itThat.hasNext() && itThis.hasNext()) { + while(true) { if(itThat.value() == itThis.value()) { - final int fr = that.getIndex(itThat.getDataIndexAndIncrement()); - final int to = getIndex(itThis.getDataIndexAndIncrement()); + final int fr = that._data.getIndex(itThat.getDataIndex()); + final int to = _data.getIndex(itThis.getDataIndex()); that._dict.addToEntry(ret, fr, to, nCol); + if(itThat.value() >= finalOffThat) + break; + else + itThat.next(); + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); + } + else if(itThat.value() < itThis.value()) { + if(itThat.value() >= finalOffThat) + break; + else + itThat.next(); + 
} + else { + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); } - else if(itThat.value() < itThis.value()) - itThat.next(); - else - itThis.next(); } } @@ -425,16 +512,34 @@ public void preAggregateThatSDCSingleZerosStructure(ColGroupSDCSingleZeros that, final AIterator itThis = _indexes.getIterator(); final int nCol = that._colIndexes.length; - while(itThat.hasNext() && itThis.hasNext()) { + final int finalOffThis = _indexes.getOffsetToLast(); + final int finalOffThat = that._indexes.getOffsetToLast(); + + while(true) { if(itThat.value() == itThis.value()) { - final int to = getIndex(itThis.getDataIndexAndIncrement()); + final int to = _data.getIndex(itThis.getDataIndex()); that._dict.addToEntry(ret, 0, to, nCol); - itThat.next(); + if(itThat.value() >= finalOffThat) + break; + else + itThat.next(); + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); + } + else if(itThat.value() < itThis.value()) { + if(itThat.value() >= finalOffThat) + break; + else + itThat.next(); + } + else { + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); } - else if(itThat.value() < itThis.value()) - itThat.next(); - else - itThis.next(); } } @@ -448,7 +553,7 @@ public AColGroup replace(double pattern, double replace) { private AColGroup replaceZero(double replace) { ADictionary replaced = _dict.replaceZeroAndExtend(replace, _colIndexes.length); - return new ColGroupSDC(_colIndexes, _numRows, replaced, _indexes, _data, getCachedCounts()); + return ColGroupSDC.create(_colIndexes, _numRows, replaced, _indexes, _data, getCachedCounts()); } @Override diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUtils.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUtils.java new file mode 100644 index 00000000000..f33d2dee293 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUtils.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.compress.colgroup; + +import org.apache.sysds.runtime.functionobjects.ValueFunction; +import org.apache.sysds.runtime.matrix.operators.BinaryOperator; + +public class ColGroupUtils { + + /** + * Calculate the result of performing the binary operation on an empty row to the left + * + * v op empty + * + * @param op The operator + * @param v The values to use on the left side of the operator + * @param colIndexes The column indexes to extract + * @return The result as a double array. 
+	 */
+	protected static final double[] binaryDefRowLeft(BinaryOperator op, double[] v, int[] colIndexes) {
+		final ValueFunction fn = op.fn;
+		final int len = colIndexes.length;
+		final double[] ret = new double[len];
+		for(int i = 0; i < len; i++)
+			ret[i] = fn.execute(v[colIndexes[i]], 0);
+		return ret;
+	}
+
+	/**
+	 * Calculate the result of performing the binary operation on an empty row to the right
+	 *
+	 * empty op v
+	 *
+	 * @param op         The operator
+	 * @param v          The values to use on the right side of the operator
+	 * @param colIndexes The column indexes to extract
+	 * @return The result as a double array.
+	 */
+	protected static final double[] binaryDefRowRight(BinaryOperator op, double[] v, int[] colIndexes) {
+		final ValueFunction fn = op.fn;
+		final int len = colIndexes.length;
+		final double[] ret = new double[len];
+		for(int i = 0; i < len; i++)
+			ret[i] = fn.execute(0, v[colIndexes[i]]);
+		return ret;
+	}
+
+}
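For intuition, the two helpers above compute the row that a group's implicit empty (all-zero) rows morph into under a row-wise binary operation; this default row is what the new PFOR group stores as its reference. A minimal standalone sketch of the same logic, not part of the patch, with invented values (java.util.function.DoubleBinaryOperator stands in for SystemDS' ValueFunction):

import java.util.Arrays;
import java.util.function.DoubleBinaryOperator;

public class BinaryDefRowSketch {
	// Mirrors binaryDefRowRight: empty op v, restricted to the group's columns.
	static double[] defRowRight(DoubleBinaryOperator fn, double[] v, int[] colIndexes) {
		final double[] ret = new double[colIndexes.length];
		for(int i = 0; i < colIndexes.length; i++)
			ret[i] = fn.applyAsDouble(0, v[colIndexes[i]]); // zero on the left side
		return ret;
	}

	public static void main(String[] args) {
		final double[] v = {10, 20, 30}; // the row vector of the binary op
		final int[] colIndexes = {0, 2}; // columns covered by the column group
		// For plus, an all-zero row becomes {10.0, 30.0}:
		System.out.println(Arrays.toString(defRowRight((a, b) -> a + b, v, colIndexes)));
	}
}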
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
index 79be408c17f..7ee7ed38d8a 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
@@ -70,6 +70,16 @@ public abstract class ADictionary implements Serializable {
 	 */
 	public abstract double aggregate(double init, Builtin fn);
 
+	/**
+	 * Aggregate all the contained values, with a reference offset.
+	 *
+	 * @param init      The initial value; in cases such as max this could be -Infinity.
+	 * @param fn        The function to apply to the values
+	 * @param reference The reference offset to each value in the dictionary
+	 * @return The aggregated value as a double.
+	 */
+	public abstract double aggregate(double init, Builtin fn, double[] reference);
+
 	/**
 	 * Aggregate all entries in the rows.
 	 *
@@ -77,7 +87,57 @@ public abstract class ADictionary implements Serializable {
 	 * @param nCol The number of columns contained in the dictionary.
 	 * @return The aggregate for each tuple in this dictionary.
 	 */
-	public abstract double[] aggregateTuples(Builtin fn, int nCol);
+	public abstract double[] aggregateRows(Builtin fn, int nCol);
+
+	/**
+	 * Aggregate all entries in the rows with an offset value reference added.
+	 *
+	 * @param fn        The aggregate function
+	 * @param reference The reference offset to each value in the dictionary
+	 * @return The aggregate for each tuple in this dictionary.
+	 */
+	public abstract double[] aggregateRows(Builtin fn, double[] reference);
+
+	/**
+	 * Aggregates the columns into the target double array provided.
+	 *
+	 * @param c          The target double array; this contains the full number of columns, therefore the colIndexes
+	 *                   for this specific dictionary are needed.
+	 * @param fn         The function to apply to individual columns
+	 * @param colIndexes The mapping to the target columns from the individual columns
+	 */
+	public abstract void aggregateCols(double[] c, Builtin fn, int[] colIndexes);
+
+	/**
+	 * Aggregates the columns into the target double array provided.
+	 *
+	 * @param c          The target double array; this contains the full number of columns, therefore the colIndexes
+	 *                   for this specific dictionary are needed.
+	 * @param fn         The function to apply to individual columns
+	 * @param reference  The reference offset values to add to each cell.
+	 * @param colIndexes The mapping to the target columns from the individual columns
+	 */
+	public abstract void aggregateCols(double[] c, Builtin fn, int[] colIndexes, double[] reference);
+
+	/**
+	 * Allocate a new dictionary, apply the scalar operation on each cell, and return the new dictionary.
+	 *
+	 * @param op The operator.
+	 * @return The new dictionary to return.
+	 */
+	public abstract ADictionary applyScalarOp(ScalarOperator op);
+
+	/**
+	 * Allocate a new dictionary and apply the scalar operation on each cell to then return a new dictionary.
+	 *
+	 * outValues[j] = op(this.values[j] + reference[i]) - newReference[i]
+	 *
+	 * @param op           The operator to apply to each cell.
+	 * @param reference    The reference value to add before the operator.
+	 * @param newReference The reference value to subtract after the operator.
+	 * @return A new dictionary.
+	 */
+	public abstract ADictionary applyScalarOp(ScalarOperator op, double[] reference, double[] newReference);
 
 	/**
 	 * Applies the scalar operation on the dictionary. Note that this operation modifies the underlying data, and
@@ -109,6 +169,23 @@ public abstract class ADictionary implements Serializable {
 	 */
 	public abstract ADictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes);
 
+	/**
+	 * Apply the binary operator such that each value is offset by the reference before application, then put the
+	 * result into the new dictionary, but offset it by the new reference.
+	 *
+	 * outValues[j] = op(v[colIndexes[i]], this.values[j] + reference[i]) - newReference[i]
+	 *
+	 * @param op           The operation to apply on the dictionary values.
+	 * @param v            The values to use on the left side of the operator.
+	 * @param colIndexes   The column indexes to use.
+	 * @param reference    The reference value to add before the operator.
+	 * @param newReference The reference value to subtract after the operator.
+	 * @return A new dictionary.
+	 */
+	public abstract ADictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes, double[] reference,
+		double[] newReference);
+
 	/**
 	 * Apply binary row operation on the right side.
 	 *
@@ -119,6 +196,22 @@ public abstract class ADictionary implements Serializable {
 	 */
 	public abstract ADictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes);
 
+	/**
+	 * Apply the binary operator such that each value is offset by the reference before application, then put the
+	 * result into the new dictionary, but offset it by the new reference.
+	 *
+	 * outValues[j] = op(this.values[j] + reference[i], v[colIndexes[i]]) - newReference[i]
+	 *
+	 * @param op           The operation to apply on the dictionary values.
+	 * @param v            The values to use on the right side of the operator.
+	 * @param colIndexes   The column indexes to use.
+	 * @param reference    The reference value to add before the operator.
+	 * @param newReference The reference value to subtract after the operator.
+	 * @return A new dictionary.
+	 */
+	public abstract ADictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes, double[] reference,
+		double[] newReference);
+
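To make the reference/newReference contract above concrete: every stored cell d represents the materialized value d + reference[j], and picking newReference[j] = op(reference[j]) after a scalar or row operation keeps the deltas of rows that only hold the reference tuple at zero, so the group can stay in its zero-default PFOR form. A worked single-column example (numbers invented):

public class ReferenceMorphSketch {
	public static void main(String[] args) {
		// One column; the scalar op is x -> x + 2.
		final double reference = 5;                 // old frame of reference
		final double newReference = reference + 2;  // op(reference) = 7
		final double d = 3;                         // stored delta, materialized value 8
		// outValues[j] = op(values[j] + reference[j]) - newReference[j]
		final double dNew = (d + reference + 2) - newReference;    // = 3, delta preserved
		final double zeroNew = (0 + reference + 2) - newReference; // = 0, default rows stay zero
		System.out.println(dNew + " " + zeroNew);
	}
}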
 	/**
 	 * Apply binary row operation on the left side and allocate a new dictionary.
 	 *
@@ -131,7 +224,6 @@
 	 */
 	public abstract ADictionary applyBinaryRowOpLeftAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes);
 
-
 	/**
 	 * Apply binary row operation on this dictionary on the right side.
 	 *
@@ -155,16 +247,6 @@ public abstract class ADictionary implements Serializable {
 	 */
 	public abstract ADictionary cloneAndExtend(int len);
 
-	/**
-	 * Aggregates the columns into the target double array provided.
-	 *
-	 * @param c          The target double array, this contains the full number of columns, therefore the colIndexes for
-	 *                   this specific dictionary is needed.
-	 * @param fn         The function to apply to individual columns
-	 * @param colIndexes The mapping to the target columns from the individual columns
-	 */
-	public abstract void aggregateCols(double[] c, Builtin fn, int[] colIndexes);
-
 	/**
 	 * Write the dictionary to a DataOutput.
 	 *
@@ -200,21 +282,57 @@ public abstract class ADictionary implements Serializable {
 	 *
 	 * Note if the number of columns is one the actual dictionaries values are simply returned.
 	 *
-	 * @param square    If each entry should be squared.
+	 * @param nrColumns The number of columns in the ColGroup to know how to get the values from the dictionary.
+	 * @return a double array containing the row sums from this dictionary.
+	 */
+	public abstract double[] sumAllRowsToDouble(int nrColumns);
+
+	/**
+	 * Method used as a pre-aggregate of each tuple in the dictionary, to single double values.
+	 *
+	 * @param nrColumns The number of columns in the ColGroup to know how to get the values from the dictionary.
+	 * @return a double array containing the squared row sums from this dictionary.
+	 */
-	public abstract double[] sumAllRowsToDouble(boolean square, int nrColumns);
+	public abstract double[] sumAllRowsToDoubleSq(int nrColumns);
+
+	/**
+	 * Method used as a pre-aggregate of each tuple in the dictionary, to single double values.
+	 *
+	 * @param reference The reference values to add to each cell.
+	 * @return a double array containing the squared row sums from this dictionary.
+	 */
+	public abstract double[] sumAllRowsToDoubleSq(double[] reference);
 
 	/**
 	 * Sum the values at a specific row.
 	 *
 	 * @param k         The row index to sum
-	 * @param square    If each entry should be squared.
 	 * @param nrColumns The number of columns
 	 * @return The sum of the row.
 	 */
-	public abstract double sumRow(int k, boolean square, int nrColumns);
+	public abstract double sumRow(int k, int nrColumns);
+
+	/**
+	 * Sum the squared values at a specific row.
+	 *
+	 * @param k         The row index to sum
+	 * @param nrColumns The number of columns
+	 * @return The squared sum of the row.
+	 */
+	public abstract double sumRowSq(int k, int nrColumns);
+
+	/**
+	 * Sum the squared values at a specific row, with a reference array added to each value.
+	 *
+	 * @param k         The row index to sum
+	 * @param nrColumns The number of columns
+	 * @param reference The reference vector to add to each cell processed.
+	 * @return The squared sum of the row.
+	 */
+	public abstract double sumRowSq(int k, int nrColumns, double[] reference);
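Note the convention used by the reference variants above (and mirrored by the implementations later in this patch): the returned pre-aggregate carries one extra trailing slot for the pure reference tuple, which is what rows outside the group's offset list contribute. A small illustrative sketch of the squared row sum with a reference, for an assumed two-column tuple (values invented):

public class SumRowSqSketch {
	public static void main(String[] args) {
		final double[] tuple = {1, -2};     // one dictionary row (two columns)
		final double[] reference = {5, 10}; // per-column reference offsets
		double res = 0;
		for(int i = 0; i < tuple.length; i++) {
			final double v = tuple[i] + reference[i]; // materialized cell value
			res += v * v;
		}
		System.out.println(res); // (1+5)^2 + (-2+10)^2 = 36 + 64 = 100.0
	}
}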
 	/**
 	 * get the column sum of this dictionary only.
@@ -232,9 +350,29 @@ public abstract class ADictionary implements Serializable {
 	 * @param counts     The counts of the individual tuples.
 	 * @param colIndexes The columns indexes of the parent column group, this indicate where to put the column sum into
 	 *                   the c output.
-	 * @param square     Specify if the values should be squared
 	 */
-	public abstract void colSum(double[] c, int[] counts, int[] colIndexes, boolean square);
+	public abstract void colSum(double[] c, int[] counts, int[] colIndexes);
+
+	/**
+	 * Get the column sum of the squared values contained in the dictionary
+	 *
+	 * @param c          The output array allocated to contain all column groups output.
+	 * @param counts     The counts of the individual tuples.
+	 * @param colIndexes The column indexes of the parent column group; these indicate where to put the column sums in
+	 *                   the c output.
+	 */
+	public abstract void colSumSq(double[] c, int[] counts, int[] colIndexes);
+
+	/**
+	 * Get the column sum of the squared values contained in the dictionary with an offset reference value added to
+	 * each cell.
+	 *
+	 * @param c          The output array allocated to contain all column groups output.
+	 * @param counts     The counts of the individual tuples.
+	 * @param colIndexes The column indexes of the parent column group; these indicate where to put the column sums in
+	 *                   the c output.
+	 * @param reference  The reference values to add to each cell.
+	 */
+	public abstract void colSumSq(double[] c, int[] counts, int[] colIndexes, double[] reference);
 
 	/**
 	 * Get the sum of the values contained in the dictionary
@@ -252,7 +390,16 @@ public abstract class ADictionary implements Serializable {
 	 * @param nCol   The number of columns contained
 	 * @return The square sum scaled by the counts provided.
 	 */
-	public abstract double sumsq(int[] counts, int nCol);
+	public abstract double sumSq(int[] counts, int nCol);
+
+	/**
+	 * Get the square sum of the values contained in the dictionary with a reference offset on each value.
+	 *
+	 * @param counts    The counts of the individual tuples
+	 * @param reference The reference values to add to each cell
+	 * @return The square sum scaled by the counts and reference.
+	 */
+	public abstract double sumSq(int[] counts, double[] reference);
 
 	/**
 	 * Get a string representation of the dictionary, that considers the layout of the data.
@@ -298,6 +445,15 @@ public abstract class ADictionary implements Serializable {
 	 */
 	public abstract boolean containsValue(double pattern);
 
+	/**
+	 * Detect if the dictionary contains a specific value with reference offset.
+	 *
+	 * @param pattern   The pattern/value to search for
+	 * @param reference The reference double array.
+	 * @return true if the value is contained, else false.
+	 */
+	public abstract boolean containsValue(double pattern, double[] reference);
+
 	/**
 	 * Calculate the number of non zeros in the dictionary. The number of non zeros should be scaled with the counts
 	 * given. This gives the exact number of non zero values in the parent column group.
@@ -308,6 +464,20 @@ public abstract class ADictionary implements Serializable {
 	 */
 	public abstract long getNumberNonZeros(int[] counts, int nCol);
 
+	/**
+	 * Calculate the number of non zeros in the dictionary.
+	 *
+	 * Each value in the dictionary has the corresponding reference value added before it is counted.
+	 *
+	 * The number of non zeros should be scaled with the given counts.
+	 *
+	 * @param counts    The counts of each dictionary entry.
+	 * @param reference The reference vector.
+	 * @param nRows     The number of rows in the input.
+	 * @return The number of non-zeros.
+ */ + public abstract long getNumberNonZeros(int[] counts, double[] reference, int nRows); + /** * Copies and adds the dictionary entry from this dictionary to the d dictionary * @@ -380,6 +550,8 @@ public abstract ADictionary preaggValuesFromDense(final int numVals, final int[] */ public abstract ADictionary replace(double pattern, double replace, int nCol); + public abstract ADictionary replace(double pattern, double replace, double[] reference); + public abstract ADictionary replaceZeroAndExtend(double replace, int nCol); public abstract double product(int[] counts, int nCol); diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java index 3707de70fd0..8f9a91b287e 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java @@ -80,7 +80,19 @@ public double aggregate(double init, Builtin fn) { } @Override - public double[] aggregateTuples(Builtin fn, final int nCol) { + public double aggregate(double init, Builtin fn, double[] reference) { + final int nCol = reference.length; + double ret = init; + for(int i = 0; i < _values.length; i++) + ret = fn.execute(ret, _values[i] + reference[i % nCol]); + + for(int i = 0; i < nCol; i++) + ret = fn.execute(ret, reference[i]); + return ret; + } + + @Override + public double[] aggregateRows(Builtin fn, int nCol) { if(nCol == 1) return _values; final int nRows = _values.length / nCol; @@ -94,9 +106,48 @@ public double[] aggregateTuples(Builtin fn, final int nCol) { return res; } + @Override + public double[] aggregateRows(Builtin fn, double[] reference) { + final int nCol = reference.length; + final int nRows = _values.length / nCol; + double[] res = new double[nRows + 1]; + int off = 0; + for(int i = 0; i < nRows; i++) { + res[i] = _values[off++] + reference[0]; + for(int j = 1; j < nCol; j++) + res[i] = fn.execute(res[i], _values[off++] + reference[j]); + } + res[nRows] = reference[0]; + for(int i = 0; i < nCol; i++) + res[nRows] = fn.execute(res[nRows], reference[i]); + return res; + } + + @Override + public Dictionary applyScalarOp(ScalarOperator op) { + final double[] retV = new double[_values.length]; + for(int i = 0; i < _values.length; i++) + retV[i] = op.executeScalar(_values[i]); + return new Dictionary(retV); + } + + @Override + public Dictionary applyScalarOp(ScalarOperator op, double[] reference, double[] newReference) { + final double[] retV = new double[_values.length]; + final int nCol = reference.length; + final int nRow = _values.length / nCol; + int off = 0; + for(int i = 0; i < nRow; i++) { + for(int j = 0; j < nCol; j++) { + retV[off] = op.executeScalar(_values[off] + reference[j]) - newReference[j]; + off++; + } + } + return new Dictionary(retV); + } + @Override public Dictionary inplaceScalarOp(ScalarOperator op) { - // in-place modification of the dictionary int len = size(); for(int i = 0; i < len; i++) _values[i] = op.executeScalar(_values[i]); @@ -125,6 +176,23 @@ public Dictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes) { return new Dictionary(retVals); } + @Override + public Dictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes, double[] reference, + double[] newReference) { + final ValueFunction fn = op.fn; + final double[] retV = new double[_values.length]; + final int nCol = reference.length; + final int nRow = _values.length / nCol; + 
int off = 0; + for(int i = 0; i < nRow; i++) { + for(int j = 0; j < nCol; j++) { + retV[off] = fn.execute(_values[off] + reference[j], v[colIndexes[j]]) - newReference[j]; + off++; + } + } + return new Dictionary(retV); + } + @Override public final Dictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes) { final ValueFunction fn = op.fn; @@ -136,9 +204,26 @@ public final Dictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexe return new Dictionary(retVals); } + @Override + public Dictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes, double[] reference, + double[] newReference) { + final ValueFunction fn = op.fn; + final double[] retV = new double[_values.length]; + final int nCol = reference.length; + final int nRow = _values.length / nCol; + int off = 0; + for(int i = 0; i < nRow; i++) { + for(int j = 0; j < nCol; j++) { + retV[off] = fn.execute(v[colIndexes[j]], _values[off] + reference[j]) - newReference[j]; + off++; + } + } + return new Dictionary(retV); + } + @Override public Dictionary applyBinaryRowOpRightAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) { - ValueFunction fn = op.fn; + final ValueFunction fn = op.fn; final int len = size(); final int lenV = colIndexes.length; final double[] values = new double[len + lenV]; @@ -152,7 +237,7 @@ public Dictionary applyBinaryRowOpRightAppendNewEntry(BinaryOperator op, double[ @Override public final Dictionary applyBinaryRowOpLeftAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) { - ValueFunction fn = op.fn; + final ValueFunction fn = op.fn; final int len = size(); final int lenV = colIndexes.length; final double[] values = new double[len + lenV]; @@ -207,34 +292,67 @@ public int getNumberOfValues(int nCol) { } @Override - public double[] sumAllRowsToDouble(boolean square, int nrColumns) { - if(nrColumns == 1 && !square) + public double[] sumAllRowsToDouble(int nrColumns) { + if(nrColumns == 1) return getValues(); // shallow copy of values // pre-aggregate value tuple final int numVals = getNumberOfValues(nrColumns); double[] ret = new double[numVals]; - for(int k = 0; k < numVals; k++) { - ret[k] = sumRow(k, square, nrColumns); - } + for(int k = 0; k < numVals; k++) + ret[k] = sumRow(k, nrColumns); + + return ret; + } + + @Override + public double[] sumAllRowsToDoubleSq(int nrColumns) { + // pre-aggregate value tuple + final int numVals = getNumberOfValues(nrColumns); + double[] ret = new double[numVals]; + for(int k = 0; k < numVals; k++) + ret[k] = sumRowSq(k, nrColumns); return ret; } @Override - public double sumRow(int k, boolean square, int nrColumns) { + public double[] sumAllRowsToDoubleSq(double[] reference) { + final int nCol = reference.length; + final int numVals = getNumberOfValues(nCol); + double[] ret = new double[numVals + 1]; + for(int k = 0; k < numVals; k++) + ret[k] = sumRowSq(k, nCol, reference); + for(int i = 0; i < nCol; i++) + ret[numVals] += reference[i] * reference[i]; + return ret; + } - int valOff = k * nrColumns; + @Override + public double sumRow(int k, int nrColumns) { + final int valOff = k * nrColumns; double res = 0.0; - if(!square) { - for(int i = 0; i < nrColumns; i++) { - res += _values[valOff + i]; - } - } - else { - // kSquare - for(int i = 0; i < nrColumns; i++) - res += _values[valOff + i] * _values[valOff + i]; + for(int i = 0; i < nrColumns; i++) + res += _values[valOff + i]; + return res; + } + + @Override + public double sumRowSq(int k, int nrColumns) { + final int valOff = k * nrColumns; + double res = 0.0; + for(int i = 0; 
i < nrColumns; i++) + res += _values[valOff + i] * _values[valOff + i]; + return res; + } + + @Override + public double sumRowSq(int k, int nrColumns, double[] reference) { + final int valOff = k * nrColumns; + double res = 0.0; + for(int i = 0; i < nrColumns; i++) { + final double v = _values[valOff + i] + reference[i]; + res += v * v; } return res; } @@ -252,44 +370,89 @@ public double[] colSum(int[] counts, int nCol) { } @Override - public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) { - for(int k = 0; k < _values.length / colIndexes.length; k++) { + public void colSum(double[] c, int[] counts, int[] colIndexes) { + final int nCol = colIndexes.length; + for(int k = 0; k < _values.length / nCol; k++) { final int cntk = counts[k]; - for(int j = 0; j < colIndexes.length; j++) { - double v = _values[k * colIndexes.length + j]; - if(square) - c[colIndexes[j]] += v * v * cntk; - else - c[colIndexes[j]] += v * cntk; + final int off = k * nCol; + for(int j = 0; j < nCol; j++) + c[colIndexes[j]] += _values[off + j] * cntk; + } + } + + @Override + public void colSumSq(double[] c, int[] counts, int[] colIndexes) { + final int nCol = colIndexes.length; + final int nRow = _values.length / nCol; + int off = 0; + for(int k = 0; k < nRow; k++) { + final int cntk = counts[k]; + for(int j = 0; j < nCol; j++) { + final double v = _values[off++]; + c[colIndexes[j]] += v * v * cntk; } } + } + @Override + public void colSumSq(double[] c, int[] counts, int[] colIndexes, double[] reference) { + final int nCol = colIndexes.length; + final int nRow = _values.length / nCol; + int off = 0; + for(int k = 0; k < nRow; k++) { + final int cntk = counts[k]; + for(int j = 0; j < nCol; j++) { + final double v = _values[off++] + reference[j]; + c[colIndexes[j]] += v * v * cntk; + } + } + for(int i = 0; i < nCol; i++) + c[colIndexes[i]] += reference[i] * reference[i] * counts[nRow]; } @Override - public double sum(int[] counts, int ncol) { + public double sum(int[] counts, int nCol) { double out = 0; int valOff = 0; - for(int k = 0; k < _values.length / ncol; k++) { + for(int k = 0; k < _values.length / nCol; k++) { int countK = counts[k]; - for(int j = 0; j < ncol; j++) { - out += getValue(valOff++) * countK; + for(int j = 0; j < nCol; j++) { + out += _values[valOff++] * countK; } } return out; } @Override - public double sumsq(int[] counts, int ncol) { + public double sumSq(int[] counts, int nCol) { double out = 0; int valOff = 0; - for(int k = 0; k < _values.length / ncol; k++) { - int countK = counts[k]; - for(int j = 0; j < ncol; j++) { - double val = getValue(valOff++); + for(int k = 0; k < _values.length / nCol; k++) { + final int countK = counts[k]; + for(int j = 0; j < nCol; j++) { + final double val = _values[valOff++]; + out += val * val * countK; + } + } + return out; + } + + @Override + public double sumSq(int[] counts, double[] reference) { + final int nCol = reference.length; + final int nRow = _values.length / nCol; + double out = 0; + int valOff = 0; + for(int k = 0; k < nRow; k++) { + final int countK = counts[k]; + for(int j = 0; j < nCol; j++) { + final double val = _values[valOff++] + reference[j]; out += val * val * countK; } } + for(int i = 0; i < nCol; i++) + out += reference[i] * reference[i] * counts[nRow]; + return out; } @@ -383,6 +546,15 @@ public boolean containsValue(double pattern) { return false; } + @Override + public boolean containsValue(double pattern, double[] reference) { + final int nCol = reference.length; + for(int i = 0; i < _values.length; i++) + 
if(_values[i] + reference[i % nCol] == pattern) + return true; + return false; + } + @Override public long getNumberNonZeros(int[] counts, int nCol) { long nnz = 0; @@ -399,6 +571,27 @@ public long getNumberNonZeros(int[] counts, int nCol) { return nnz; } + @Override + public long getNumberNonZeros(int[] counts, double[] reference, int nRows) { + long nnz = 0; + final int nCol = reference.length; + final int nRow = _values.length / nCol; + for(int i = 0; i < nRow; i++) { + long rowCount = 0; + final int off = i * nCol; + for(int j = off, jj = 0; j < off + nCol; j++, jj++) { + if(_values[j] + reference[jj] != 0) + rowCount++; + } + nnz += rowCount * counts[i]; + } + for(int i = 0; i < nCol; i++) + if(reference[i] != 0) + nnz += counts[nRow]; + + return nnz; + } + @Override public void addToEntry(Dictionary d, int fr, int to, int nCol) { final int sf = nCol * fr; // start from @@ -446,12 +639,22 @@ public MatrixBlockDictionary getMBDict(int nCol) { @Override public void aggregateCols(double[] c, Builtin fn, int[] colIndexes) { - int ncol = colIndexes.length; - int vlen = size() / ncol; - for(int k = 0; k < vlen; k++) - for(int j = 0, valOff = k * ncol; j < ncol; j++) - c[colIndexes[j]] = fn.execute(c[colIndexes[j]], getValue(valOff + j)); + final int nCol = colIndexes.length; + final int rlen = _values.length / nCol; + for(int k = 0; k < rlen; k++) + for(int j = 0, valOff = k * nCol; j < nCol; j++) + c[colIndexes[j]] = fn.execute(c[colIndexes[j]], _values[valOff + j]); + } + @Override + public void aggregateCols(double[] c, Builtin fn, int[] colIndexes, double[] reference) { + final int nCol = reference.length; + final int rlen = _values.length / nCol; + for(int k = 0; k < rlen; k++) + for(int j = 0, valOff = k * nCol; j < nCol; j++) + c[colIndexes[j]] = fn.execute(c[colIndexes[j]], _values[valOff + j] + reference[j]); + for(int i = 0; i < nCol; i++) + c[colIndexes[i]] = fn.execute(c[colIndexes[i]], reference[i]); } @Override @@ -488,10 +691,23 @@ public ADictionary replace(double pattern, double replace, int nCol) { double[] retV = new double[_values.length]; for(int i = 0; i < _values.length; i++) { final double v = _values[i]; - if(v == pattern) - retV[i] = replace; - else - retV[i] = v; + retV[i] = v == pattern ? replace : v; + } + return new Dictionary(retV); + } + + @Override + public ADictionary replace(double pattern, double replace, double[] reference) { + final double[] retV = new double[_values.length]; + final int nCol = reference.length; + final int nRow = _values.length / nCol; + int off = 0; + for(int i = 0; i < nRow; i++) { + for(int j = 0; j < nCol; j++) { + final double v = _values[off]; + retV[off++] = v + reference[j] == pattern ? 
replace - reference[j] : v; + + } } return new Dictionary(retV); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java index 1db433c5c29..982c3c903c6 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java @@ -171,6 +171,8 @@ public static ADictionary moveFrequentToLastDictionaryEntry(ADictionary dict, AB else if(mb.isInSparseFormat()) { MatrixBlockDictionary mbdn = moveToLastDictionaryEntrySparse(mb.getSparseBlock(), largestIndex, zeros, nCol, largestIndexSize); + if(mbdn == null) + return null; MatrixBlock mbn = mbdn.getMatrixBlock(); mbn.setNonZeros(mb.getNonZeros()); if(mbn.getNonZeros() == 0) @@ -196,6 +198,8 @@ private static MatrixBlockDictionary moveToLastDictionaryEntrySparse(SparseBlock for(int i = indexToMove + 1; i < sb.numRows(); i++) sb.set(i - 1, sb.get(i), false); sb.set(sb.numRows() - 1, swap, false); + if(ret.isEmpty()) + return null; return new MatrixBlockDictionary(ret); } @@ -214,6 +218,8 @@ private static MatrixBlockDictionary moveToLastDictionaryEntrySparse(SparseBlock for(int i = indexToMove + 1; i < sb.numRows(); i++) retB.set(i - 1, sb.get(i), false); } + if(ret.isEmpty()) + return null; return new MatrixBlockDictionary(ret); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java index b3fa6f7e09f..b9fc6868ea6 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java @@ -25,6 +25,7 @@ import java.util.Arrays; import org.apache.commons.lang.NotImplementedException; +import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.utils.Util; import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.data.DenseBlockFP64; @@ -45,10 +46,14 @@ public class MatrixBlockDictionary extends ADictionary { public MatrixBlockDictionary(double[] values, int nCol) { _data = Util.matrixBlockFromDenseArray(values, nCol); + if(_data.isEmpty()) + throw new DMLCompressionException("Invalid construction of empty dictionary"); } public MatrixBlockDictionary(MatrixBlock data) { _data = data; + if(_data.isEmpty()) + throw new DMLCompressionException("Invalid construction of empty dictionary"); } public MatrixBlock getMatrixBlock() { @@ -93,7 +98,45 @@ else if(fn.getBuiltinCode() == BuiltinCode.MIN) } @Override - public double[] aggregateTuples(Builtin fn, int nCol) { + public double aggregate(double init, Builtin fn, double[] reference) { + final int nCol = reference.length; + final int nRows = _data.getNumRows(); + double ret = init; + + for(int i = 0; i < nCol; i++) + ret = fn.execute(ret, reference[i]); + + if(!_data.isEmpty() && _data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < nRows; i++) { + if(sb.isEmpty(i)) + continue; + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + for(int k = apos; k < alen; k++) { + final double v = avals[k] + reference[aix[k]]; + ret = fn.execute(ret, v); + } + } + } + else 
if(!_data.isEmpty()) { + final double[] values = _data.getDenseBlockValues(); + int off = 0; + for(int k = 0; k < nRows; k++) { + for(int j = 0; j < _data.getNumColumns(); j++) { + final double v = values[off++] + reference[j]; + ret = fn.execute(ret, v); + } + } + } + + return ret; + } + + @Override + public double[] aggregateRows(Builtin fn, int nCol) { double[] ret = new double[_data.getNumRows()]; if(_data.isEmpty()) return ret; @@ -129,6 +172,53 @@ else if(nCol == 1) return ret; } + @Override + public double[] aggregateRows(Builtin fn, double[] reference) { + final int nCol = reference.length; + final int nRows = _data.getNumRows(); + final double[] ret = new double[nRows + 1]; + + ret[nRows] = reference[0]; + for(int i = 1; i < nCol; i++) + ret[nRows] = fn.execute(ret[nRows], reference[i]); + + if(!_data.isEmpty() && _data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < nRows; i++) { + if(sb.isEmpty(i)) + ret[i] = ret[nRows]; + else { + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + int k = apos; + int j = 1; + ret[i] = (aix[k] == 0) ? avals[k++] + reference[0] : reference[0]; + for(; j < _data.getNumColumns() && k < alen; j++) { + final double v = aix[k] == j ? avals[k++] + reference[j] : reference[j]; + ret[i] = fn.execute(ret[i], v); + } + for(; j < _data.getNumColumns(); j++) + ret[i] = fn.execute(ret[i], reference[j]); + } + } + } + else if(!_data.isEmpty()) { + final double[] values = _data.getDenseBlockValues(); + int off = 0; + for(int k = 0; k < nRows; k++) { + ret[k] = values[off++] + reference[0]; + for(int j = 1; j < _data.getNumColumns(); j++) { + final double v = values[off++] + reference[j]; + ret[k] = fn.execute(ret[k], v); + } + } + } + + return ret; + } + @Override public void aggregateCols(double[] c, Builtin fn, int[] colIndexes) { if(_data.isEmpty()) { @@ -172,9 +262,102 @@ else if(_data.isInSparseFormat()) { } @Override - public ADictionary inplaceScalarOp(ScalarOperator op) { + public void aggregateCols(double[] c, Builtin fn, int[] colIndexes, double[] reference) { + final int nCol = _data.getNumColumns(); + final int nRow = _data.getNumRows(); + + for(int j = 0; j < colIndexes.length; j++) { + final int idx = colIndexes[j]; + c[idx] = fn.execute(c[idx], reference[j]); + } + if(!_data.isEmpty() && _data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < nRow; i++) { + if(sb.isEmpty(i)) + continue; + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final double[] avals = sb.values(i); + final int[] aix = sb.indexes(i); + // This is a cool trick but it only works with min / max. 
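+				// The loop over colIndexes above already applied reference[j] once to
+				// every output column, so empty sparse rows need no extra work and zero
+				// cells can be skipped here: each skipped cell would only contribute the
+				// plain reference value again, which is a no-op for idempotent builtins
+				// like min/max but would be incorrect for additive aggregates.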
+ for(int k = apos; k < alen; k++) { + final int idx = colIndexes[aix[k]]; + c[idx] = fn.execute(c[idx], avals[k] + reference[aix[k]]); + } + } + } + else if(!_data.isEmpty()) { + final double[] values = _data.getDenseBlockValues(); + int off = 0; + for(int k = 0; k < nRow; k++) { + for(int j = 0; j < nCol; j++) { + final int idx = colIndexes[j]; + c[idx] = fn.execute(c[idx], values[off++] + reference[j]); + } + } + } + } + + @Override + public ADictionary applyScalarOp(ScalarOperator op) { MatrixBlock res = _data.scalarOperations(op, new MatrixBlock()); - return new MatrixBlockDictionary(res); + if(res.isEmpty()) + return null; + else + return new MatrixBlockDictionary(res); + } + + @Override + public ADictionary applyScalarOp(ScalarOperator op, double[] reference, double[] newReference) { + final int nCol = _data.getNumColumns(); + final int nRow = _data.getNumRows(); + final MatrixBlock ret = new MatrixBlock(nRow, nCol, false); + ret.allocateDenseBlock(); + final double[] retV = ret.getDenseBlockValues(); + int off = 0; + if(_data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < nRow; i++) { + if(sb.isEmpty(i)) + for(int j = 0; j < nCol; j++) + retV[off++] = op.executeScalar(reference[j]) - newReference[j]; + else { + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + int j = 0; + for(int k = apos; j < nCol && k < alen; j++) { + final double v = aix[k] == j ? avals[k++] + reference[j] : reference[j]; + retV[off++] = op.executeScalar(v) - newReference[j]; + } + for(; j < nCol; j++) + retV[off++] = op.executeScalar(reference[j]) - newReference[j]; + } + } + } + else { + final double[] values = _data.getDenseBlockValues(); + for(int i = 0; i < nRow; i++) { + for(int j = 0; j < nCol; j++) { + retV[off] = op.executeScalar(values[off] + reference[j]) - newReference[j]; + off++; + } + } + } + + ret.recomputeNonZeros(); + ret.examSparsity(); + if(ret.isEmpty()) + return null; + else + return new MatrixBlockDictionary(ret); + + } + + @Override + public ADictionary inplaceScalarOp(ScalarOperator op) { + throw new NotImplementedException(); } @Override @@ -182,15 +365,16 @@ public ADictionary applyScalarOp(ScalarOperator op, double newVal, int numCols) MatrixBlock res = _data.scalarOperations(op, new MatrixBlock()); final int lastRow = res.getNumRows(); MatrixBlock res2 = new MatrixBlock(lastRow + 1, res.getNumColumns(), true); - if(res.isEmpty()) { + if(res.isEmpty()) for(int i = 0; i < numCols; i++) res2.appendValue(lastRow, i, newVal); - return new MatrixBlockDictionary(res2); - } - else { + else res.append(new MatrixBlock(1, numCols, newVal), res2, false); + + if(res2.isEmpty()) + return null; + else return new MatrixBlockDictionary(res2); - } } @Override @@ -199,6 +383,12 @@ public ADictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes) { return new MatrixBlockDictionary(rowVector.binaryOperations(op, _data, null)); } + @Override + public Dictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes, double[] reference, + double[] newReference) { + throw new NotImplementedException(); + } + @Override public ADictionary applyBinaryRowOpLeftAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) { MatrixBlock rowVector = Util.extractValues(v, colIndexes); @@ -212,6 +402,12 @@ public ADictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes) { return new MatrixBlockDictionary(_data.binaryOperations(op, rowVector, null)); 
} + @Override + public Dictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes, double[] reference, + double[] newReference) { + throw new NotImplementedException(); + } + @Override public ADictionary applyBinaryRowOpRightAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) { MatrixBlock rowVector = Util.extractValues(v, colIndexes); @@ -242,7 +438,7 @@ public int getNumberOfValues(int ncol) { } @Override - public double[] sumAllRowsToDouble(boolean square, int nrColumns) { + public double[] sumAllRowsToDouble(int nrColumns) { double[] ret = new double[_data.getNumRows()]; if(_data.isEmpty()) @@ -255,7 +451,7 @@ else if(_data.isInSparseFormat()) { final int alen = sb.size(i) + apos; final double[] avals = sb.values(i); for(int j = apos; j < alen; j++) { - ret[i] += (square) ? avals[j] * avals[j] : avals[j]; + ret[i] += avals[j]; } } } @@ -266,7 +462,7 @@ else if(_data.isInSparseFormat()) { for(int k = 0; k < _data.getNumRows(); k++) { for(int j = 0; j < _data.getNumColumns(); j++) { final double v = values[off++]; - ret[k] += (square) ? v * v : v; + ret[k] += v; } } } @@ -274,7 +470,95 @@ else if(_data.isInSparseFormat()) { } @Override - public double sumRow(int k, boolean square, int nrColumns) { + public double[] sumAllRowsToDoubleSq(int nrColumns) { + final double[] ret = new double[_data.getNumRows()]; + + if(_data.isEmpty()) + return ret; + else if(_data.isInSparseFormat()) { + SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < _data.getNumRows(); i++) { + if(!sb.isEmpty(i)) { + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final double[] avals = sb.values(i); + for(int j = apos; j < alen; j++) { + ret[i] += avals[j] * avals[j]; + } + } + } + } + else { + double[] values = _data.getDenseBlockValues(); + int off = 0; + for(int k = 0; k < _data.getNumRows(); k++) { + for(int j = 0; j < _data.getNumColumns(); j++) { + final double v = values[off++]; + ret[k] += v * v; + } + } + } + return ret; + } + + @Override + public double[] sumAllRowsToDoubleSq(double[] reference) { + final int nCol = reference.length; + final int numVals = _data.getNumRows(); + final double[] ret = new double[numVals + 1]; + + final int finalIndex = numVals; + for(int i = 0; i < nCol; i++) + ret[finalIndex] += reference[i] * reference[i]; + + if(!_data.isEmpty() && _data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < numVals; i++) { + if(sb.isEmpty(i)) + ret[i] = ret[finalIndex]; + else { + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + int k = apos; + int j = 0; + for(; j < _data.getNumColumns() && k < alen; j++) { + final double v = aix[k] == j ? 
avals[k++] + reference[j] : reference[j]; + ret[i] += v * v; + } + for(; j < _data.getNumColumns(); j++) + ret[i] += reference[j] * reference[j]; + } + + } + } + else if(!_data.isEmpty()) { + double[] values = _data.getDenseBlockValues(); + int off = 0; + for(int k = 0; k < numVals; k++) { + for(int j = 0; j < _data.getNumColumns(); j++) { + final double v = values[off++] + reference[j]; + ret[k] += v * v; + } + } + } + + return ret; + } + + @Override + public double sumRow(int k, int nrColumns) { + throw new NotImplementedException(); + } + + @Override + public double sumRowSq(int k, int nrColumns) { + throw new NotImplementedException(); + } + + @Override + public double sumRowSq(int k, int nrColumns, double[] reference) { + throw new NotImplementedException(); + } @@ -314,7 +598,40 @@ public double[] colSum(int[] counts, int nCol) { } @Override - public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) { + public void colSum(double[] c, int[] counts, int[] colIndexes) { + if(_data.isEmpty()) + return; + if(_data.isInSparseFormat()) { + SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < _data.getNumRows(); i++) { + if(!sb.isEmpty(i)) { + final int count = counts[i]; + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + for(int j = apos; j < alen; j++) { + c[colIndexes[aix[j]]] += count * avals[j]; + } + } + } + } + else { + double[] values = _data.getDenseBlockValues(); + int off = 0; + for(int k = 0; k < _data.getNumRows(); k++) { + final int countK = counts[k]; + for(int j = 0; j < _data.getNumColumns(); j++) { + final double v = values[off++]; + c[colIndexes[j]] += v * countK; + } + } + } + } + + @Override + public void colSumSq(double[] c, int[] counts, int[] colIndexes) { + if(_data.isEmpty()) + return; + if(_data.isInSparseFormat()) { @@ -328,7 +645,7 @@ public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) { final int[] aix = sb.indexes(i); final double[] avals = sb.values(i); for(int j = apos; j < alen; j++) { - c[colIndexes[aix[j]]] += square ? count * avals[j] * avals[j] : count * avals[j]; + c[colIndexes[aix[j]]] += count * avals[j] * avals[j]; } } } @@ -340,7 +657,50 @@ public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) { final int countK = counts[k]; for(int j = 0; j < _data.getNumColumns(); j++) { final double v = values[off++]; - c[colIndexes[j]] += square ? v * v * countK : v * countK; + c[colIndexes[j]] += v * v * countK; + } + } + } + } + + @Override + public void colSumSq(double[] c, int[] counts, int[] colIndexes, double[] reference) { + final int nCol = reference.length; + final int nRow = _data.getNumRows(); + for(int i = 0; i < nCol; i++) + c[colIndexes[i]] += reference[i] * reference[i] * counts[nRow]; + + if(!_data.isEmpty() && _data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < nRow; i++) { + final int countK = counts[i]; + if(sb.isEmpty(i)) + for(int j = 0; j < nCol; j++) + c[colIndexes[j]] += reference[j] * reference[j] * countK; + else { + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + int k = apos; + int j = 0; + for(; j < _data.getNumColumns() && k < alen; j++) { + final double v = aix[k] == j ?
avals[k++] + reference[j] : reference[j]; + c[colIndexes[j]] += v * v * countK; + } + for(; j < _data.getNumColumns(); j++) + c[colIndexes[j]] += reference[j] * reference[j] * countK; + } + } + } + else if(!_data.isEmpty()) { + double[] values = _data.getDenseBlockValues(); + int off = 0; + for(int k = 0; k < nRow; k++) { + final int countK = counts[k]; + for(int j = 0; j < _data.getNumColumns(); j++) { + final double v = values[off++] + reference[j]; + c[colIndexes[j]] += v * v * countK; } } } @@ -380,7 +740,7 @@ public double sum(int[] counts, int ncol) { } @Override - public double sumsq(int[] counts, int ncol) { + public double sumSq(int[] counts, int ncol) { double tmpSum = 0; if(_data.isEmpty()) return tmpSum; @@ -412,6 +772,54 @@ public double sumsq(int[] counts, int ncol) { return tmpSum; } + @Override + public double sumSq(int[] counts, double[] reference) { + final int nCol = reference.length; + final int numVals = _data.getNumRows(); + double ret = 0; + for(int i = 0; i < nCol; i++) + ret += reference[i] * reference[i]; + final double ref = ret; + ret *= counts[numVals]; + + if(!_data.isEmpty() && _data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < numVals; i++) { + final int countK = counts[i]; + if(sb.isEmpty(i)) + ret += ref * countK; + else { + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + int k = apos; + int j = 0; + for(; j < _data.getNumColumns() && k < alen; j++) { + final double v = aix[k] == j ? avals[k++] + reference[j] : reference[j]; + ret += v * v * countK; + } + for(; j < _data.getNumColumns(); j++) + ret += reference[j] * reference[j] * countK; + } + + } + } + else if(!_data.isEmpty()) { + double[] values = _data.getDenseBlockValues(); + int off = 0; + for(int k = 0; k < numVals; k++) { + final int countK = counts[k]; + for(int j = 0; j < _data.getNumColumns(); j++) { + final double v = values[off++] + reference[j]; + ret += v * v * countK; + } + } + } + + return ret; + } + @Override public String getString(int colIndexes) { return _data.toString(); @@ -438,6 +846,53 @@ public boolean containsValue(double pattern) { return _data.containsValue(pattern); } + @Override + public boolean containsValue(double pattern, double[] reference) { + + if(_data.isEmpty()) { + for(double d : reference) + if(pattern == d) + return true; + return false; + } + else if(_data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < _data.getNumRows(); i++) { + if(sb.isEmpty(i)) + continue; + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + int k = apos; + int j = 0; + for(; j < _data.getNumColumns() && k < alen; j++) { + if(aix[k] == j) { + if(reference[j] + avals[k++] == pattern) + return true; + } + else { + if(reference[j] == pattern) + return true; + } + } + for(; j < _data.getNumColumns(); j++) + if(reference[j] == pattern) + return true; + + } + } + else { + final double[] values = _data.getDenseBlockValues(); + final int nCol = reference.length; + for(int i = 0; i < values.length; i++) + if(values[i] + reference[i % nCol] == pattern) + return true; + + } + return false; + } + @Override public long getNumberNonZeros(int[] counts, int nCol) { if(_data.isEmpty()) @@ -449,7 +904,6 @@ public long getNumberNonZeros(int[] counts, int nCol) { for(int i = 0; i < _data.getNumRows(); i++) if(!sb.isEmpty(i)) nnz += 
sb.size(i) * counts[i]; - } else { double[] values = _data.getDenseBlockValues(); @@ -467,6 +921,64 @@ public long getNumberNonZeros(int[] counts, int nCol) { return nnz; } + @Override + public long getNumberNonZeros(int[] counts, double[] reference, int nRows) { + long nnz = 0; + for(double d : reference) + if(d != 0) + nnz++; + if(_data.isEmpty()) { + // Empty dictionary: every row holds exactly the reference tuple. + return nnz * nRows; + } + else if(_data.isInSparseFormat()) { + SparseBlock sb = _data.getSparseBlock(); + long emptyRowNNZ = nnz; + nnz *= counts[counts.length - 1]; // scale the reference nnz by the count of rows on the default tuple. + for(int i = 0; i < _data.getNumRows(); i++) { + if(sb.isEmpty(i)) + nnz += emptyRowNNZ * counts[i]; + else { + int countThis = 0; + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + int k = apos; + int j = 0; + for(; j < _data.getNumColumns() && k < alen; j++) { + if(aix[k] == j) { + if(reference[j] + avals[k++] != 0) + countThis++; + } + else { + if(reference[j] != 0) + countThis++; + } + } + for(; j < _data.getNumColumns(); j++) + if(reference[j] != 0) + countThis++; + + nnz += countThis * counts[i]; + } + } + } + else { + nnz *= counts[counts.length - 1]; // scale the reference nnz by the count of rows on the default tuple. + final double[] values = _data.getDenseBlockValues(); + int off = 0; + for(int i = 0; i < _data.getNumRows(); i++) { + int countThisTuple = 0; + for(int j = 0; j < _data.getNumColumns(); j++) + if(values[off++] + reference[j] != 0) + countThisTuple++; + nnz += countThisTuple * counts[i]; + } + } + return nnz; + } + @Override public void addToEntry(Dictionary d, int fr, int to, int nCol) { double[] v = d.getValues(); @@ -529,6 +1041,8 @@ public ADictionary subtractTuple(double[] tuple) { MatrixBlock rowVector = new MatrixBlock(1, tuple.length, b); MatrixBlock res = new MatrixBlock(_data.getNumColumns(), _data.getNumRows(), _data.isInSparseFormat()); _data.binaryOperations(new BinaryOperator(Minus.getMinusFnObject()), rowVector, res); + if(res.isEmpty()) + return null; return new MatrixBlockDictionary(res); } @@ -645,7 +1159,7 @@ else if(_data.isInSparseFormat()) { DenseBlock dictV = new DenseBlockFP64(new int[] {numVals, aggregateColumns.length}, ret); MatrixBlock dictM = new MatrixBlock(numVals, aggregateColumns.length, dictV); - dictM.getNonZeros(); + dictM.recomputeNonZeros(); dictM.examSparsity(); return new MatrixBlockDictionary(dictM); @@ -653,16 +1167,66 @@ else if(_data.isInSparseFormat()) { @Override public ADictionary replace(double pattern, double replace, int nCol) { - MatrixBlock ret = _data.replaceOperations(new MatrixBlock(), pattern, replace); + final MatrixBlock ret = _data.replaceOperations(new MatrixBlock(), pattern, replace); + if(ret.isEmpty()) + return null; return new MatrixBlockDictionary(ret); } + @Override + public ADictionary replace(double pattern, double replace, double[] reference) { + final int nRow = _data.getNumRows(); + final int nCol = _data.getNumColumns(); + final MatrixBlock ret = new MatrixBlock(nRow, nCol, false); + ret.allocateDenseBlock(); + final double[] retV = ret.getDenseBlockValues(); + int off = 0; + if(_data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < nRow; i++) { + if(sb.isEmpty(i)) + for(int j = 0; j < nCol; j++) + retV[off++] = pattern == reference[j] ?
replace - reference[j] : 0; + else{ + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + int j = 0; + for(int k = apos; j < nCol && k < alen; j++){ + final double v = aix[k] == j ? avals[k++] + reference[j] : reference[j]; + retV[off++] = pattern == v ? replace - reference[j] : v - reference[j]; + } + for(; j < nCol; j++) + retV[off++] = pattern == reference[j] ? replace - reference[j] : 0; + } + } + } + else { + final double[] values = _data.getDenseBlockValues(); + for(int i = 0; i < nRow; i++) { + for(int j = 0; j < nCol; j++) { + final double v = values[off]; + retV[off++] = pattern == v + reference[j] ? replace - reference[j] : v; + } + } + } + + ret.recomputeNonZeros(); + ret.examSparsity(); + if(ret.isEmpty()) + return null; + else + return new MatrixBlockDictionary(ret); + + } + @Override public ADictionary replaceZeroAndExtend(double replace, int nCol) { final int nRows = _data.getNumRows(); final int nCols = _data.getNumColumns(); final long nonZerosOut = (nRows + 1) * nCols; - final MatrixBlock ret = new MatrixBlock(_data.getNumRows() + 1, _data.getNumColumns(), false); + final MatrixBlock ret = new MatrixBlock(nRows + 1, nCols, false); ret.allocateBlock(); ret.setNonZeros(nonZerosOut); final double[] retValues = ret.getDenseBlockValues(); diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java index bfab5275c79..879892a3745 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java @@ -101,7 +101,12 @@ public double aggregate(double init, Builtin fn) { } @Override - public double[] aggregateTuples(Builtin fn, final int nCol) { + public double aggregate(double init, Builtin fn, double[] reference) { + throw new NotImplementedException(); + } + + @Override + public double[] aggregateRows(Builtin fn, final int nCol) { if(nCol == 1) return getValues(); final int nRows = _values.length / nCol; @@ -115,6 +120,11 @@ public double[] aggregateTuples(Builtin fn, final int nCol) { return res; } + @Override + public double[] aggregateRows(Builtin fn, double[] reference) { + throw new NotImplementedException(); + } + @Override public QDictionary inplaceScalarOp(ScalarOperator op) { if(_values == null) @@ -154,6 +164,11 @@ else if(op.fn instanceof Plus) { return this; } + @Override + public QDictionary applyScalarOp(ScalarOperator op) { + throw new NotImplementedException(); + } + @Override public QDictionary applyScalarOp(ScalarOperator op, double newVal, int numCols) { double[] temp = getValues(); @@ -219,39 +234,60 @@ public int getNumberOfValues(int nCol) { } @Override - public double[] sumAllRowsToDouble(boolean square, int nrColumns) { - if(nrColumns == 1 && !square) + public double[] sumAllRowsToDouble(int nrColumns) { + if(nrColumns == 1) return getValues(); // shallow copy of values final int numVals = getNumberOfValues(nrColumns); double[] ret = new double[numVals]; - for(int k = 0; k < numVals; k++) { - ret[k] = sumRow(k, square, nrColumns); - } + for(int k = 0; k < numVals; k++) + ret[k] = sumRow(k, nrColumns); return ret; } @Override - public double sumRow(int k, boolean square, int nrColumns) { + public double[] sumAllRowsToDoubleSq(int nrColumns) { + final int numVals = getNumberOfValues(nrColumns); + double[] ret = new 
double[numVals]; + for(int k = 0; k < numVals; k++) + ret[k] = sumRowSq(k, nrColumns); + return ret; + } + + @Override + public double[] sumAllRowsToDoubleSq(double[] reference) { + throw new NotImplementedException(); + } + + @Override + public double sumRow(int k, int nrColumns) { if(_values == null) return 0; int valOff = k * nrColumns; - if(!square) { - int res = 0; - for(int i = 0; i < nrColumns; i++) { - res += _values[valOff + i]; - } - return res * _scale; - } - else { - // kSquare - double res = 0.0; - for(int i = 0; i < nrColumns; i++) - res += (int) (_values[valOff + i] * _values[valOff + i]) * _scale * _scale; - return res; + int res = 0; + for(int i = 0; i < nrColumns; i++) { + res += _values[valOff + i]; } + return res * _scale; + + } + + @Override + public double sumRowSq(int k, int nrColumns) { + if(_values == null) + return 0; + int valOff = k * nrColumns; + double res = 0.0; + for(int i = 0; i < nrColumns; i++) + res += (int) (_values[valOff + i] * _values[valOff + i]) * _scale * _scale; + return res; + } + + @Override + public double sumRowSq(int k, int nrColumns, double[] reference) { + throw new NotImplementedException(); } @Override @@ -260,17 +296,32 @@ public double[] colSum(int[] counts, int nCol) { } @Override - public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) { + public void colSum(double[] c, int[] counts, int[] colIndexes) { throw new NotImplementedException("Not Implemented"); } + @Override + public void colSumSq(double[] c, int[] counts, int[] colIndexes) { + throw new NotImplementedException("Not Implemented"); + } + + @Override + public void colSumSq(double[] c, int[] counts, int[] colIndexes, double[] reference) { + throw new NotImplementedException(); + } + @Override public double sum(int[] counts, int ncol) { throw new NotImplementedException("Not Implemented"); } @Override - public double sumsq(int[] counts, int ncol) { + public double sumSq(int[] counts, int ncol) { + throw new NotImplementedException("Not Implemented"); + } + + @Override + public double sumSq(int[] counts, double[] reference) { throw new NotImplementedException("Not Implemented"); } @@ -341,6 +392,11 @@ public boolean containsValue(double pattern) { throw new NotImplementedException("Not contains value on Q Dictionary"); } + @Override + public boolean containsValue(double pattern, double[] reference) { + throw new NotImplementedException(); + } + @Override public long getNumberNonZeros(int[] counts, int nCol) { long nnz = 0; @@ -357,6 +413,11 @@ public long getNumberNonZeros(int[] counts, int nCol) { return nnz; } + @Override + public long getNumberNonZeros(int[] counts, double[] reference, int nRows) { + throw new NotImplementedException("not implemented yet"); + } + @Override public void addToEntry(Dictionary d, int fr, int to, int nCol) { throw new NotImplementedException("Not implemented yet"); @@ -387,6 +448,11 @@ public void aggregateCols(double[] c, Builtin fn, int[] colIndexes) { throw new NotImplementedException(); } + @Override + public void aggregateCols(double[] c, Builtin fn, int[] colIndexes, double[] reference) { + throw new NotImplementedException(); + } + @Override public ADictionary scaleTuples(int[] scaling, int nCol) { throw new NotImplementedException(); @@ -403,6 +469,11 @@ public ADictionary replace(double pattern, double replace, int nCol) { throw new NotImplementedException(); } + @Override + public ADictionary replace(double pattern, double replace, double[] reference) { + throw new NotImplementedException(); + } + @Override 
public ADictionary replaceZeroAndExtend(double replace, int nCol) { throw new NotImplementedException(); @@ -420,25 +491,38 @@ public void colProduct(double[] res, int[] counts, int[] colIndexes) { @Override public ADictionary applyBinaryRowOpLeftAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) { - // TODO Auto-generated method stub - return null; + throw new NotImplementedException(); } @Override public ADictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes) { - // TODO Auto-generated method stub - return null; + throw new NotImplementedException(); } @Override public ADictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes) { - // TODO Auto-generated method stub - return null; + throw new NotImplementedException(); } @Override public ADictionary applyBinaryRowOpRightAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) { - // TODO Auto-generated method stub - return null; + throw new NotImplementedException(); + } + + @Override + public ADictionary applyScalarOp(ScalarOperator op, double[] reference, double[] newReference) { + throw new NotImplementedException(); + } + + @Override + public ADictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes, double[] reference, + double[] newReference) { + throw new NotImplementedException(); + } + + @Override + public ADictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes, double[] reference, + double[] newReference) { + throw new NotImplementedException(); } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/MaterializeSort.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/MaterializeSort.java index 2d9c5b84308..341268b763b 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/MaterializeSort.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/MaterializeSort.java @@ -24,7 +24,7 @@ import org.apache.sysds.runtime.compress.utils.IntArrayList; public class MaterializeSort extends AInsertionSorter { - public static int CACHE_BLOCK = 1000; + public static int CACHE_BLOCK = 50000; /** a dense mapToData, that have a value for each row in the input. */ private final AMapToData md; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java index d3310fee72b..953ea49d858 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java @@ -25,12 +25,13 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; +import org.apache.sysds.runtime.data.DenseBlock; +import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; public abstract class AMapToData implements Serializable { - private static final long serialVersionUID = 100512759972844714L; - protected static final Log LOG = LogFactory.getLog(AMapToData.class.getName()); /** Number of unique values inside this map. */ @@ -44,7 +45,10 @@ public abstract class AMapToData implements Serializable { * @param nUnique number of unique values. 
*/ protected AMapToData(int nUnique) { - this.nUnique = nUnique; + if(nUnique + 1 < 0) + this.nUnique = Integer.MAX_VALUE; + else + this.nUnique = nUnique + 1; } /** @@ -145,14 +149,63 @@ protected final void setUnique(int nUnique) { /** * Pre aggregate a dense matrix m into pre, subject to only including a row segment and column segment. * - * @param m The dense matrix values to preaggregate - * @param pre The preAggregate to populate with the summed values of m - * @param rl The row start in m - * @param ru The row end in m - * @param cl The column start in m - * @param cu The column end in m + * @param m The dense matrix values to preaggregate + * @param preAV The preAggregate double array to populate with the summed values of m + * @param rl The row start in m + * @param ru The row end in m + * @param cl The column start in m + * @param cu The column end in m + */ + public final void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu) { + final DenseBlock db = m.getDenseBlock(); + if(rl == ru - 1) { + final double[] mV = db.values(rl); + final int off = db.pos(rl); + preAggregateDenseToRow(mV, off, preAV, cl, cu); + } + else { + preAggregateDenseRows(m, preAV, rl, ru, cl, cu); + } + } + + protected abstract void preAggregateDenseToRow(double[] mV, int off, double[] preAV, int cl, int cu); + + protected void preAggregateDenseRows(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu) { + LOG.warn("Inefficient multi-row pre-aggregation implementation in use"); + final int nRow = m.getNumColumns(); + final int nVal = getUnique() - 1; + final double[] mV = m.getDenseBlockValues(); + final int blockSize = 4000; + for(int block = cl; block < cu; block += blockSize) { + final int blockEnd = Math.min(block + blockSize, nRow); + for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) { + final int offLeft = rowLeft * nRow; + for(int rc = block; rc < blockEnd; rc++) { + final int idx = getIndex(rc); + preAV[offOut + idx] += mV[offLeft + rc]; + } + } + } + } + + public abstract void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, + AOffset indexes); + + public abstract void preAggregateSparse(SparseBlock sb, double[] preAV, int rl, int ru, AOffset indexes); + + /** + * Get the count of each unique value contained in this map. + * + * @param counts The count array to populate and return. + * @param nRows The number of rows in the calling column group. */ - public abstract void preAggregateDense(MatrixBlock m, MatrixBlock pre, int rl, int ru, int cl, int cu); + public int[] getCounts(int[] counts, int nRows) { + final int nonDefaultLength = size(); + for(int i = 0; i < nonDefaultLength; i++) + counts[getIndex(i)]++; + counts[counts.length - 1] += nRows - nonDefaultLength; + return counts; + } /** * Copy the values in this map into another mapping object.
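For context on why the single-row fast path above matters: pre-aggregated left multiplication first folds the n cells of an input row into one accumulator per dictionary tuple, and only then multiplies by the much smaller dictionary. A minimal sketch of that two-step idea, assuming a row-major double[] dictionary and an int[] stand-in for the concrete mapping (both names are illustrative, not the patch's API):

// Illustrative two-step left multiply of one dense input row with a
// compressed column group: pre-aggregate, then scale dictionary tuples.
static void leftMultByRow(double[] row, int[] map, double[] dict, int nCol, double[] out) {
	final int nVal = dict.length / nCol;
	final double[] preAV = new double[nVal]; // one bucket per dictionary tuple
	for(int rc = 0; rc < map.length; rc++)
		preAV[map[rc]] += row[rc]; // step 1: O(n) pre-aggregation
	for(int k = 0, off = 0; k < nVal; k++)
		for(int j = 0; j < nCol; j++, off++)
			out[j] += preAV[k] * dict[off]; // step 2: O(nVal * nCol) scaling
}

This is why preAggregateDenseToRow only needs the row values, a bucket array, and a column range: the expensive pass touches each input cell exactly once, regardless of how many columns the group spans.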
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java index 678ee65619e..af81dc338a9 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java @@ -24,7 +24,10 @@ import java.io.IOException; import java.util.BitSet; +import org.apache.commons.lang.NotImplementedException; import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory.MAP_TYPE; +import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; +import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.utils.MemoryEstimates; @@ -118,24 +121,26 @@ public static MapToBit readFields(DataInput in) throws IOException { } @Override - public void preAggregateDense(MatrixBlock m, MatrixBlock pre, int rl, int ru, int cl, int cu) { - final int nRow = m.getNumColumns(); - final int nVal = pre.getNumColumns(); - final double[] preAV = pre.getDenseBlockValues(); - final double[] mV = m.getDenseBlockValues(); - final int blockSize = 4000; - for(int block = cl; block < cu; block += blockSize) { - final int blockEnd = Math.min(block + blockSize, nRow); - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) { - final int offLeft = rowLeft * nRow; - for(int rc = block; rc < blockEnd; rc++) - preAV[_data.get(rc) ? offOut + 1 : offOut] += mV[offLeft + rc]; - } - } + protected void preAggregateDenseToRow(double[] mV, int off, double[] preAV, int cl, int cu) { + off += cl; + for(int rc = cl; rc < cu; rc++, off++) + preAV[_data.get(rc) ? 1 : 0] += mV[off]; + } + + @Override + public void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, + AOffset indexes) { + throw new NotImplementedException(); + } + + @Override + public void preAggregateSparse(SparseBlock sb, double[] preAV, int rl, int ru, AOffset indexes){ + throw new NotImplementedException(); } @Override public int getUpperBoundValue() { return 1; } + } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java index 5bd1e645b47..537c45836f6 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java @@ -25,6 +25,8 @@ import java.util.Arrays; import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory.MAP_TYPE; +import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; +import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.utils.MemoryEstimates; @@ -98,9 +100,9 @@ public static MapToByte readFields(DataInput in) throws IOException { return new MapToByte(unique, data); } - public byte[] getBytes() { - return _data; - } + // public byte[] getBytes() { + // return _data; + // } @Override public void replace(int v, int r) { @@ -125,24 +127,27 @@ public void copy(AMapToData d) { } @Override - public void preAggregateDense(MatrixBlock m, MatrixBlock pre, int rl, int ru, int cl, int cu) { - final int nRow = m.getNumColumns(); - final int nVal = pre.getNumColumns(); - final double[] preAV = pre.getDenseBlockValues(); - final double[] mV = m.getDenseBlockValues(); - final int blockSize = 4000; - for(int block = 
cl; block < cu; block += blockSize) { - final int blockEnd = Math.min(block + blockSize, nRow); - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) { - final int offLeft = rowLeft * nRow; - for(int rc = block; rc < blockEnd; rc++) { - final int idx = _data[rc] & 0xFF; - preAV[offOut + idx] += mV[offLeft + rc]; - } - } + protected void preAggregateDenseToRow(double[] mV, int off, double[] preAV, int cl, int cu) { + if(getUnique() < 127) { + for(int rc = cl; rc < cu; rc++) + preAV[_data[rc]] += mV[off + rc]; + } + else { + for(int rc = cl; rc < cu; rc++) + preAV[_data[rc] & 0xFF] += mV[off + rc]; } } + @Override + public final void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, AOffset indexes) { + indexes.preAggregateDenseMap(m, preAV, rl, ru, cl, cu, getUnique(), _data); + } + + @Override + public void preAggregateSparse(SparseBlock sb, double[] preAV, int rl, int ru, AOffset indexes){ + indexes.preAggregateSparseMap(sb, preAV, rl, ru, getUnique(), _data); + } + @Override public int getUpperBoundValue() { return 255; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java index d1fc0125a2a..249bc6ba50c 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java @@ -25,6 +25,8 @@ import java.util.Arrays; import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory.MAP_TYPE; +import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; +import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.utils.MemoryEstimates; @@ -107,29 +109,50 @@ public static MapToChar readFields(DataInput in) throws IOException { return new MapToChar(unique, data); } - public char[] getChars() { + protected char[] getChars() { return _data; } + private void preAggregateDenseToRowBy8(double[] mV, double[] preAV, int cl, int cu, int off) { + int h = (cu - cl) % 8; + off += cl; + for(int rc = cl; rc < cl + h; rc++, off++) + preAV[_data[rc]] += mV[off]; + for(int rc = cl + h; rc < cu; rc += 8, off += 8) { + int id1 = _data[rc], id2 = _data[rc + 1], id3 = _data[rc + 2], id4 = _data[rc + 3], id5 = _data[rc + 4], + id6 = _data[rc + 5], id7 = _data[rc + 6], id8 = _data[rc + 7]; + preAV[id1] += mV[off]; + preAV[id2] += mV[off + 1]; + preAV[id3] += mV[off + 2]; + preAV[id4] += mV[off + 3]; + preAV[id5] += mV[off + 4]; + preAV[id6] += mV[off + 5]; + preAV[id7] += mV[off + 6]; + preAV[id8] += mV[off + 7]; + } + } + @Override - public void preAggregateDense(MatrixBlock m, MatrixBlock pre, int rl, int ru, int cl, int cu) { - final int nRow = m.getNumColumns(); - final int nVal = pre.getNumColumns(); - final double[] preAV = pre.getDenseBlockValues(); - final double[] mV = m.getDenseBlockValues(); - final int blockSize = 4000; - for(int block = cl; block < cu; block += blockSize) { - final int blockEnd = Math.min(block + blockSize, nRow); - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) { - final int offLeft = rowLeft * nRow; - for(int rc = block; rc < blockEnd; rc++) { - final int idx = _data[rc]; - preAV[offOut + idx] += mV[offLeft + rc]; - } - } + protected void preAggregateDenseToRow(double[] mV, int off, double[] preAV, int cl, int cu) { + if(cu - cl > 1000) + preAggregateDenseToRowBy8(mV, preAV, cl, cu, off); + else { 
+ off += cl; + for(int rc = cl; rc < cu; rc++, off++) + preAV[_data[rc]] += mV[off]; } } + @Override + public void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, AOffset indexes) { + indexes.preAggregateDenseMap(m, preAV, rl, ru, cl, cu, getUnique(), _data); + } + + @Override + public void preAggregateSparse(SparseBlock sb, double[] preAV, int rl, int ru, AOffset indexes) { + indexes.preAggregateSparseMap(sb, preAV, rl, ru, getUnique(), _data); + } + @Override public int getUpperBoundValue() { return Character.MAX_VALUE; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java index 8a706880e96..de8d95f6a3d 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java @@ -79,7 +79,7 @@ public static AMapToData resize(AMapToData d, int numTuples) { AMapToData ret; if(d instanceof MapToBit) return d; - else if(numTuples <= 1) + else if(numTuples <= 2) ret = new MapToBit(numTuples, size); else if(d instanceof MapToByte) return d; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java index b991ccb7e0f..6a518573a54 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java @@ -24,7 +24,10 @@ import java.io.IOException; import java.util.Arrays; +import org.apache.commons.lang.NotImplementedException; import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory.MAP_TYPE; +import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; +import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.utils.MemoryEstimates; @@ -106,22 +109,20 @@ public static MapToInt readFields(DataInput in) throws IOException { } @Override - public void preAggregateDense(MatrixBlock m, MatrixBlock pre, int rl, int ru, int cl, int cu) { - final int nRow = m.getNumColumns(); - final int nVal = pre.getNumColumns(); - final double[] preAV = pre.getDenseBlockValues(); - final double[] mV = m.getDenseBlockValues(); - final int blockSize = 4000; - for(int block = cl; block < cu; block += blockSize) { - final int blockEnd = Math.min(block + blockSize, nRow); - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) { - final int offLeft = rowLeft * nRow; - for(int rc = block; rc < blockEnd; rc++) { - final int idx = _data[rc]; - preAV[offOut + idx] += mV[offLeft + rc]; - } - } - } + protected void preAggregateDenseToRow(double[] mV, int off, double[] preAV, int cl, int cu) { + off += cl; + for(int rc = cl; rc < cu; rc++, off++) + preAV[_data[rc]] += mV[off]; + } + + @Override + public void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, AOffset indexes) { + throw new NotImplementedException(); + } + + @Override + public void preAggregateSparse(SparseBlock sb, double[] preAV, int rl, int ru, AOffset indexes) { + throw new NotImplementedException(); } @Override diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AIterator.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AIterator.java index 17a502629d5..1c7e81e2057 100644 --- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AIterator.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AIterator.java @@ -49,13 +49,6 @@ protected AIterator(int index, int dataIndex, int offset) { */ public abstract void next(); - /** - * Get a boolean specifying if the iterator is done - * - * @return A boolean that is true if there are more values contained in the Iterator. - */ - public abstract boolean hasNext(); - /** * Get the current index value, note this correspond to a row index in the original matrix. * @@ -66,25 +59,38 @@ public int value() { } /** - * Get the current index value and increment the pointers + * Find out whether the current offset is still below the given upper bound. * - * @return The current value pointed at. + * @param ub The upper bound the offset must stay below + * @return True if the current offset is below ub. */ - public int valueAndIncrement() { - int x = offset; - next(); - return x; + public boolean isNotOver(int ub) { + return offset < ub; } /** * Get the current data index associated with the index returned from value. * - * @return The data Index. + * This index points to a position in the mapToData object, which in turn can be used to look up the dictionary + * entry in ADictionary. + * + * @return The Data Index. */ public int getDataIndex() { return dataIndex; } + /** + * Get the current offsets index, which points into the underlying offsets list. + * + * This is available for debugging purposes only and is not to be used by calling classes. + * + * @return The Offsets Index. + */ + public int getOffsetsIndex() { + return index; + } + /** * Get the current data index and increment the pointers using the next operator. * @@ -99,17 +105,23 @@ public int getDataIndexAndIncrement() { /** * Skip values until index is achieved. * - * @param index The index to skip to. + * @param idx The index to skip to. * @return the index that follows or are equal to the skip to index. */ - public int skipTo(int index) { - while(hasNext() && offset < index) - next(); - return offset; - } + public abstract int skipTo(int idx); /** * Copy the iterator with the current values. */ public abstract AIterator clone(); + + /** + * Unsafe version of equals; note that it should only compare iterators stemming from the same Offset object. + * + * @param o The Iterator to compare + * @return The comparison result + */ + public boolean equals(AIterator o) { + return o.index == this.index; + } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java index 27816009a25..2f51e7f7442 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java @@ -21,12 +21,13 @@ import java.io.DataOutput; import java.io.IOException; import java.io.Serializable; -import java.lang.ref.SoftReference; -import java.util.HashMap; -import java.util.Map; +import org.apache.commons.lang.NotImplementedException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.sysds.runtime.data.DenseBlock; +import org.apache.sysds.runtime.data.SparseBlock; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; /** * Offset list encoder interface.
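A note on the representation these offset classes share, since the OffsetByte changes below depend on it: offsets are stored as unsigned byte deltas between consecutive row indexes, and a stored zero byte is an escape meaning "advance 255 rows without emitting an index", so a gap of g rows costs 1 + (g - 1) / 255 bytes. A minimal decoder sketch under that assumption (illustrative, not the patch's iterator):

// Expand byte-delta offsets back into absolute row indexes.
// A zero byte never terminates the stream; it only widens the current gap.
static int[] decode(int offsetToFirst, byte[] deltas) {
	int n = 1;
	for(byte b : deltas)
		if(b != 0) // zero bytes are gap escapes, not data points
			n++;
	final int[] rows = new int[n];
	rows[0] = offsetToFirst;
	int cur = offsetToFirst;
	int p = 1;
	for(byte b : deltas) {
		if(b == 0)
			cur += 255; // escape: skip 255 rows, no data point here
		else {
			cur += b & 0xFF; // unsigned delta
			rows[p++] = cur;
		}
	}
	return rows;
}

The specialized kernels further down (the noZero and noOverHalf variants) exist precisely to skip the escape check or the & 0xFF masking when the encoder can prove those cases never occur.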
@@ -39,9 +40,14 @@ */ public abstract class AOffset implements Serializable { - private static final long serialVersionUID = -4143271285905723425L; protected static final Log LOG = LogFactory.getLog(AOffset.class.getName()); - protected SoftReference> skipIterators; + + private ThreadLocal cacheRow = new ThreadLocal() { + @Override + protected OffsetCache initialValue() { + return null; + } + }; /** * Get an iterator of the offsets. @@ -57,16 +63,23 @@ public abstract class AOffset implements Serializable { * @return AIterator that iterate through index and dictionary offset values. */ public AIterator getIterator(int row) { - if(skipIterators != null) { - Map sk = skipIterators.get(); - AIterator it = sk.getOrDefault(row, null); - if(it != null) - return it.clone(); - } - AIterator it = getIterator(); + if(row <= getOffsetToFirst()) + return getIterator(); + else if(row >= getOffsetToLast()) + return null; + + // try the cache first. + OffsetCache c = cacheRow.get(); + if(c != null && c.row == row) + return c.it.clone(); + + // Use the cached iterator if it is closer to the queried row. + AIterator it = c != null && c.row < row ? c.it.clone() : getIterator(); it.skipTo(row); + // cache this new iterator. cacheIterator(it.clone(), row); return it; + } /** @@ -76,14 +89,18 @@ public AIterator getIterator(int row) { * @param row The row index to cache the iterator as. */ public void cacheIterator(AIterator it, int row) { - if(skipIterators != null) { - Map sk = skipIterators.get(); - sk.put(row, it); + if(it == null) + return; + OffsetCache c = cacheRow.get(); + if(c == null) { + c = new OffsetCache(); + c.it = it; + c.row = row; + cacheRow.set(c); } else { - Map nsk = new HashMap<>(); - nsk.put(row, it.clone()); - skipIterators = new SoftReference<>(nsk); + c.it = it; + c.row = row; } } @@ -98,6 +115,20 @@ public void cacheIterator(AIterator it, int row) { */ public abstract void write(DataOutput out) throws IOException; + /** + * Get the offset to the first index + * + * @return The first index offset + */ + public abstract int getOffsetToFirst(); + + /** + * Get the offset to the last value + * + * @return The last values offset + */ + public abstract int getOffsetToLast(); + /** * Get the in memory size of the Offset object * @@ -119,17 +150,200 @@ public void cacheIterator(AIterator it, int row) { */ public abstract int getSize(); + /** + * Get the length of the underlying offsets lists. + * + * @return The number of offsets. + */ + public abstract int getOffsetsLength(); + + public final void preAggregateDenseMap(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, int nVal, + char[] data) { + // multi row iterator. + final AIterator it = getIterator(cl); + if(it == null) + return; + else if(it.offset > cu) + cacheIterator(it, cu); // cache this iterator. + else if(rl == ru - 1) { + final DenseBlock db = m.getDenseBlock(); + final double[] mV = db.values(rl); + final int off = db.pos(rl); + preAggregateDenseMapRow(mV, off, preAV, cu, nVal, data, it); + } + else { + final DenseBlock db = m.getDenseBlock(); + preAggregateDenseMapRows(db, preAV, rl, ru, cl, cu, nVal, data); + } + } + + public final void preAggregateDenseMap(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, int nVal, + byte[] data) { + // multi row iterator. + final AIterator it = getIterator(cl); + if(it == null) + return; + else if(it.offset > cu) + cacheIterator(it, cu); // cache this iterator. 
+ else if(rl == ru - 1) { + final DenseBlock db = m.getDenseBlock(); + final double[] mV = db.values(rl); + final int off = db.pos(rl); + preAggregateDenseMapRow(mV, off, preAV, cu, nVal, data, it); + } + else { + final DenseBlock db = m.getDenseBlock(); + preAggregateDenseMapRows(db, preAV, rl, ru, cl, cu, nVal, data); + } + } + + protected abstract void preAggregateDenseMapRow(double[] mV, int off, double[] preAV, int cu, int nVal, byte[] data, + AIterator it); + + protected abstract void preAggregateDenseMapRow(double[] mV, int off, double[] preAV, int cu, int nVal, char[] data, + AIterator it); + + protected void preAggregateDenseMapRows(DenseBlock db, double[] preAV, int rl, int ru, int cl, int cu, int nVal, + char[] data) { + + LOG.warn("Inefficient implementation of Preaggregate DenseMap multi row."); + throw new NotImplementedException("MultiRow Preaggregation not supported yet"); + } + + protected void preAggregateDenseMapRows(DenseBlock db, double[] preAV, int rl, int ru, int cl, int cu, int nVal, + byte[] data) { + LOG.warn("Inefficient implementation of Preaggregate DenseMap multi row."); + throw new NotImplementedException("MultiRow Preaggregation not supported yet"); + } + + public final void preAggregateSparseMap(SparseBlock sb, double[] preAV, int rl, int ru, int nVal, char[] data) { + final AIterator it = getIterator(); + if(rl == ru - 1) + preAggregateSparseMapRow(sb, preAV, rl, nVal, data, it); + else + throw new NotImplementedException("MultiRow Preaggregation not supported yet"); + } + + public final void preAggregateSparseMap(SparseBlock sb, double[] preAV, int rl, int ru, int nVal, byte[] data) { + final AIterator it = getIterator(); + if(rl == ru - 1) + preAggregateSparseMapRow(sb, preAV, rl, nVal, data, it); + else + throw new NotImplementedException("MultiRow Preaggregation not supported yet"); + } + + public final void preAggregateSparseMap(SparseBlock sb, double[] preAV, int rl, int ru, int cl, int cu, int nVal, + char[] data) { + // multi row iterator. + final AIterator it = getIterator(cl); + if(it == null) + return; + else if(it.offset > cu) + cacheIterator(it, cu); // cache this iterator. + else if(rl == ru - 1) + preAggregateSparseMapRow(sb, preAV, rl, nVal, data, it); + else + throw new NotImplementedException("MultiRow Preaggregation not supported yet"); + + } + + public final void preAggregateSparseMap(SparseBlock sb, double[] preAV, int rl, int ru, int cl, int cu, int nVal, + byte[] data) { + // multi row iterator. + final AIterator it = getIterator(cl); + if(it == null) + return; + else if(it.offset > cu) + cacheIterator(it, cu); // cache this iterator. 
+ else if(rl == ru - 1) + preAggregateSparseMapRow(sb, preAV, rl, nVal, data, it); + else + throw new NotImplementedException("MultiRow Preaggregation not supported yet"); + + } + + protected void preAggregateSparseMapRow(SparseBlock sb, double[] preAV, int r, int nVal, byte[] data, AIterator it) { + final int apos = sb.pos(r); + final int alen = sb.size(r) + apos; + final int[] aix = sb.indexes(r); + final double[] avals = sb.values(r); + + final int maxId = data.length - 1; + + int j = apos; + while(true) { + final int idx = aix[j]; + if(idx == it.offset) { + preAV[data[it.dataIndex] & 0xFF] += avals[j++]; + if(j >= alen || it.dataIndex >= maxId) + break; + it.next(); + } + else if(idx < it.offset) { + j++; + if(j >= alen) + break; + } + else { + if(it.dataIndex >= maxId) + break; + it.next(); + } + } + } + + protected void preAggregateSparseMapRow(SparseBlock sb, double[] preAV, int r, int nVal, char[] data, AIterator it) { + final int apos = sb.pos(r); + final int alen = sb.size(r) + apos; + final int[] aix = sb.indexes(r); + final double[] avals = sb.values(r); + + final int maxId = data.length - 1; + + int j = apos; + while(true) { + final int idx = aix[j]; + if(idx == it.offset) { + preAV[data[it.dataIndex]] += avals[j++]; + if(j >= alen || it.dataIndex >= maxId) + break; + it.next(); + } + else if(idx < it.offset) { + j++; + if(j >= alen) + break; + } + else { + if(it.dataIndex >= maxId) + break; + it.next(); + } + } + } + @Override public String toString() { StringBuilder sb = new StringBuilder(); - AIterator i = getIterator(); sb.append(this.getClass().getSimpleName()); - sb.append(" ["); - sb.append(i.valueAndIncrement()); - - while(i.hasNext()) - sb.append(", " + i.valueAndIncrement()); + final AIterator it = getIterator(); + final int last = getOffsetToLast(); + sb.append("["); + while(it.offset < last) { + sb.append(it.offset); + sb.append(", "); + it.next(); + } + sb.append(it.offset); sb.append("]"); return sb.toString(); } + + protected static class OffsetCache { + protected AIterator it = null; + protected int row = -1; + + protected OffsetCache() { + } + } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetByte.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetByte.java index 29133cbd758..ebb29df1900 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetByte.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetByte.java @@ -21,18 +21,18 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; -import java.util.Arrays; -import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.utils.MemoryEstimates; public class OffsetByte extends AOffset { private static final long serialVersionUID = -4716104973912491790L; + private static final int maxV = 255; - private final static int maxV = 255; private final byte[] offsets; private final int offsetToFirst; + private final int offsetToLast; + private final boolean noOverHalf; public OffsetByte(int[] indexes) { this(indexes, 0, indexes.length); @@ -41,21 +41,22 @@ public OffsetByte(int[] indexes) { public OffsetByte(int[] indexes, int apos, int alen) { int endSize = 0; offsetToFirst = indexes[apos]; + offsetToLast = indexes[alen - 1]; int ov = offsetToFirst; + // find the size of the array for(int i = apos + 1; i < alen; i++) { final int nv = indexes[i]; - endSize += 1 + (nv - ov) / maxV; + endSize += 1 + (nv - ov - 1) / maxV; ov = nv; } offsets = new 
byte[endSize]; ov = offsetToFirst; int p = 0; + // populate the array for(int i = apos + 1; i < alen; i++) { final int nv = indexes[i]; final int offsetSize = nv - ov; - if(offsetSize == 0) - throw new DMLCompressionException("Invalid difference between cells :\n" + Arrays.toString(indexes)); final int div = offsetSize / maxV; final int mod = offsetSize % maxV; if(mod == 0) { @@ -69,11 +70,30 @@ public OffsetByte(int[] indexes, int apos, int alen) { ov = nv; } + boolean noOverHalf = true; + for(byte b : offsets) + if(b < 0) { + noOverHalf = false; + break; + } + this.noOverHalf = noOverHalf; } - private OffsetByte(byte[] offsets, int offsetToFirst) { + protected OffsetByte(byte[] offsets, int offsetToFirst, int offsetToLast) { this.offsets = offsets; this.offsetToFirst = offsetToFirst; + this.offsetToLast = offsetToLast; + this.noOverHalf = getNoOverHalf(); + } + + private boolean getNoOverHalf() { + boolean noOverHalf = true; + for(byte b : offsets) + if(b < 0) { + noOverHalf = false; + break; + } + return noOverHalf; } @Override @@ -92,7 +112,9 @@ public void write(DataOutput out) throws IOException { @Override public long getInMemorySize() { - return getInMemorySize(offsets.length); + long size = 16 + 4 + 4 + 8; // object header plus ints plus reference + size += MemoryEstimates.byteArrayCost(offsets.length); + return size; } @Override @@ -103,29 +125,288 @@ public long getExactSizeOnDisk() { @Override public int getSize() { int size = 1; - for(byte b : offsets) { + for(byte b : offsets) if(b != 0) size++; - } + return size; } - public static long getInMemorySize(int length) { - long size = 16 + 4 + 8; // object header plus int plus reference - size += MemoryEstimates.byteArrayCost(length); + @Override + public int getOffsetToFirst() { + return offsetToFirst; + } + + @Override + public int getOffsetToLast() { + return offsetToLast; + } + + @Override + public int getOffsetsLength() { + return offsets.length; + } + + public static long estimateInMemorySize(int nOffs, int nRows) { + long size = 16 + 4 + 4 + 8; // object header plus int plus reference + size += MemoryEstimates.byteArrayCost(Math.max(nOffs, nRows / maxV)); return size; } public static OffsetByte readFields(DataInput in) throws IOException { - int offsetToFirst = in.readInt(); - int offsetsLength = in.readInt(); - byte[] offsets = new byte[offsetsLength]; + final int offsetToFirst = in.readInt(); + final int offsetsLength = in.readInt(); + + final byte[] offsets = new byte[offsetsLength]; + int offsetToLast = offsetToFirst; for(int i = 0; i < offsetsLength; i++) { offsets[i] = in.readByte(); + offsetToLast += offsets[i] & 0xFF; + } + return new OffsetByte(offsets, offsetToFirst, offsetToLast); + } + + @Override + protected final void preAggregateDenseMapRow(double[] mV, int off, double[] preAV, int cu, int nVal, byte[] data, + AIterator it) { + IterateByteOffset itb = (IterateByteOffset) it; + final boolean noZero = offsets.length == data.length - 1; + if(cu < offsetToLast + 1) { + if(noOverHalf && noZero && nVal < 127) + preAggregateDenseByteMapRowBelowEndAndNoZeroNoOverHalfAlsoData(mV, off, preAV, cu, data, itb); + else if(noOverHalf && noZero) + preAggregateDenseByteMapRowBelowEndAndNoZeroNoOverHalf(mV, off, preAV, cu, data, itb); + else if(noZero) + preAggregateDenseByteMapRowBelowEndAndNoZero(mV, off, preAV, cu, data, itb); + else + preAggregateDenseByteMapRowBelowEnd(mV, off, preAV, cu, data, itb); + cacheIterator(itb, cu); + } + else if(noZero) + preAggregateDenseByteMapRowNoZero(mV, off, preAV, data, itb); + else + 
preAggregateDenseByteMapRow(mV, off, preAV, data, itb); + + } + + private final void preAggregateDenseByteMapRow(double[] mV, int off, double[] preAV, byte[] data, + IterateByteOffset it) { + final int maxId = data.length - 1; + + int offset = it.offset + off; + int index = it.index; + int dataIndex = it.dataIndex; + + preAV[data[dataIndex] & 0xFF] += mV[offset]; + while(dataIndex < maxId) { + byte v = offsets[index]; + while(v == 0) { + offset += maxV; + index++; + v = offsets[index]; + } + offset += v & 0xFF; + index++; + dataIndex++; + preAV[data[dataIndex] & 0xFF] += mV[offset]; + } + } + + private final void preAggregateDenseByteMapRowNoZero(double[] mV, int off, double[] preAV, byte[] data, + IterateByteOffset it) { + + int offset = it.offset + off; + int index = it.index; + + while(index < offsets.length) { + preAV[data[index] & 0xFF] += mV[offset]; + offset += offsets[index++] & 0xFF; + } + // process straggler index. + preAV[data[index] & 0xFF] += mV[offset]; + } + + private void preAggregateDenseByteMapRowBelowEnd(double[] mV, int off, double[] preAV, int cu, byte[] data, + IterateByteOffset it) { + + cu += off; + it.offset += off; + while(it.offset < cu) { + preAV[data[it.dataIndex] & 0xFF] += mV[it.offset]; + byte v = offsets[it.index]; + while(v == 0) { + it.offset += maxV; + it.index++; + v = offsets[it.index]; + } + it.offset += v & 0xFF; + it.index++; + it.dataIndex++; + } + it.offset -= off; + } + + private void preAggregateDenseByteMapRowBelowEndAndNoZero(double[] mV, int off, double[] preAV, int cu, byte[] data, + IterateByteOffset it) { + + int offset = it.offset + off; + int index = it.index; + + cu += off; + + while(offset < cu) { + preAV[data[index] & 0xFF] += mV[offset]; + offset += offsets[index++] & 0xFF; + } + + it.offset = offset - off; + it.dataIndex = index; + it.index = index; + } + + private final void preAggregateDenseByteMapRowBelowEndAndNoZeroNoOverHalf(double[] mV, int off, double[] preAV, + int cu, byte[] data, IterateByteOffset it) { + int offset = it.offset + off; + int index = it.index; + + cu += off; + + while(offset < cu) { + preAV[data[index] & 0xFF] += mV[offset]; + offset += offsets[index++]; + } + + it.offset = offset - off; + it.dataIndex = index; + it.index = index; + } + + private final void preAggregateDenseByteMapRowBelowEndAndNoZeroNoOverHalfAlsoData(double[] mV, int off, + double[] preAV, int cu, byte[] data, IterateByteOffset it) { + int offset = it.offset + off; + int index = it.index; + + cu += off; + + while(offset < cu) { + preAV[data[index]] += mV[offset]; + offset += offsets[index++]; + } + + it.offset = offset - off; + it.dataIndex = index; + it.index = index; + } + + @Override + protected final void preAggregateDenseMapRow(double[] mV, int off, double[] preAV, int cu, int nVal, char[] data, + AIterator it) { + IterateByteOffset itb = (IterateByteOffset) it; + final boolean noZero = offsets.length == data.length - 1; + if(cu < offsetToLast + 1) { + if(noOverHalf && noZero) + preAggregateDenseCharMapRowBelowEndAndNoZeroNoOverHalf(mV, off, preAV, cu, data, itb); + else if(noZero) + preAggregateDenseCharMapRowBelowEndAndNoZero(mV, off, preAV, cu, data, itb); + else + preAggregateDenseCharMapRowBelowEnd(mV, off, preAV, cu, data, itb); + cacheIterator(itb, cu); + } + else if(noZero) + preAggregateDenseCharMapRowNoZero(mV, off, preAV, data, itb); + else + preAggregateDenseCharMapRow(mV, off, preAV, data, itb); + } + + private void preAggregateDenseCharMapRow(double[] mV, int off, double[] preAV, char[] data, IterateByteOffset it) { + 
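+ // Encoding note (an assumption matching the constructor above): a zero byte in
+ // the offsets array is an escape marker that advances the running row offset by
+ // maxV (255) without consuming a data index; the inner while loops in these
+ // preaggregation kernels therefore skip zero bytes before adding the real delta.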
final int maxId = data.length - 1; + int offset = it.offset + off; + int index = it.index; + int dataIndex = it.dataIndex; + + preAV[data[dataIndex]] += mV[offset]; + while(dataIndex < maxId) { + byte v = offsets[index]; + while(v == 0) { + offset += maxV; + index++; + v = offsets[index]; + } + offset += v & 0xff; + index++; + dataIndex++; + preAV[data[dataIndex]] += mV[offset]; } - return new OffsetByte(offsets, offsetToFirst); } + private void preAggregateDenseCharMapRowNoZero(double[] mV, int off, double[] preAV, char[] data, + IterateByteOffset it) { + + int offset = it.offset + off; + int index = it.index; + while(index < offsets.length) { + preAV[data[index]] += mV[offset]; + offset += offsets[index++] & 0xFF; + } + preAV[data[index]] += mV[offset]; + } + + private void preAggregateDenseCharMapRowBelowEnd(double[] mV, int off, double[] preAV, int cu, char[] data, + IterateByteOffset it) { + + cu += off; + it.offset += off; + while(it.offset < cu) { + preAV[data[it.dataIndex]] += mV[it.offset]; + byte v = offsets[it.index]; + while(v == 0) { + it.offset += maxV; + it.index++; + v = offsets[it.index]; + } + it.offset += v & 0xFF; + it.index++; + it.dataIndex++; + } + it.offset -= off; + } + + private void preAggregateDenseCharMapRowBelowEndAndNoZero(double[] mV, int off, double[] preAV, int cu, char[] data, + IterateByteOffset it) { + int offset = it.offset + off; + int index = it.index; + + cu += off; + + while(offset < cu) { + preAV[data[index]] += mV[offset]; + offset += offsets[index++] & 0xFF; + } + + it.offset = offset - off; + it.dataIndex = index; + it.index = index; + } + + private final void preAggregateDenseCharMapRowBelowEndAndNoZeroNoOverHalf(double[] mV, int off, double[] preAV, + int cu, char[] data, IterateByteOffset it) { + int offset = it.offset + off; + int index = it.index; + + cu += off; + + while(offset < cu) { + preAV[data[index]] += mV[offset]; + offset += offsets[index++]; + } + + it.offset = offset - off; + it.dataIndex = index; + it.index = index; + } + + + private class IterateByteOffset extends AIterator { private IterateByteOffset() { @@ -138,26 +419,22 @@ private IterateByteOffset(int index, int dataIndex, int offset) { @Override public void next() { - if(index >= offsets.length) { - index++; - dataIndex++; - return; - } - - final byte v = offsets[index++]; - if(v == 0) { + byte v = offsets[index]; + while(v == 0) { offset += maxV; - next(); - } - else { - dataIndex++; - offset += v & 0xFF; + index++; + v = offsets[index]; } + offset += v & 0xFF; + index++; + dataIndex++; } @Override - public boolean hasNext() { - return index <= offsets.length; + public int skipTo(int idx) { + while(offset < idx && index < offsets.length) + next(); + return offset; } @Override @@ -165,4 +442,5 @@ public IterateByteOffset clone() { return new IterateByteOffset(index, dataIndex, offset); } } + } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetChar.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetChar.java index c1c2930c850..dda7ab9e1da 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetChar.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetChar.java @@ -21,19 +21,17 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; -import java.util.Arrays; -import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.utils.MemoryEstimates; public class OffsetChar extends AOffset { private static final 
long serialVersionUID = -1192266421395964882L; - - private final static int maxV = (int) Character.MAX_VALUE; + private static final int maxV = (int) Character.MAX_VALUE; private final char[] offsets; private final int offsetToFirst; + private final int offsetToLast; public OffsetChar(int[] indexes) { this(indexes, 0, indexes.length); @@ -42,21 +40,20 @@ public OffsetChar(int[] indexes) { public OffsetChar(int[] indexes, int apos, int alen) { int endSize = 0; offsetToFirst = indexes[apos]; + offsetToLast = indexes[alen - 1]; int ov = offsetToFirst; - for(int i = apos+1; i < alen; i++) { + for(int i = apos + 1; i < alen; i++) { final int nv = indexes[i]; - endSize += 1 + (nv - ov) / maxV; + endSize += 1 + (nv - ov - 1) / maxV; ov = nv; } offsets = new char[endSize]; ov = offsetToFirst; int p = 0; - for(int i = apos+1; i < alen; i++) { + for(int i = apos + 1; i < alen; i++) { final int nv = indexes[i]; final int offsetSize = (nv - ov); - if(offsetSize == 0) - throw new DMLCompressionException("Invalid difference between cells :\n" + Arrays.toString(indexes)); final int div = offsetSize / maxV; final int mod = offsetSize % maxV; if(mod == 0) { @@ -72,9 +69,10 @@ public OffsetChar(int[] indexes, int apos, int alen) { } } - private OffsetChar(char[] offsets, int offsetToFirst) { + private OffsetChar(char[] offsets, int offsetToFirst, int offsetToLast) { this.offsets = offsets; this.offsetToFirst = offsetToFirst; + this.offsetToLast = offsetToLast; } @Override @@ -93,7 +91,9 @@ public void write(DataOutput out) throws IOException { @Override public long getInMemorySize() { - return getInMemorySize(offsets.length); + long size = 16 + 4 + 8; // object header plus int plus reference + size += MemoryEstimates.charArrayCost(offsets.length); + return size; } @Override @@ -111,22 +111,69 @@ public int getSize() { return size; } + @Override + public int getOffsetToFirst() { + return offsetToFirst; + } + + @Override + public int getOffsetToLast() { + return offsetToLast; + } + + @Override + public int getOffsetsLength() { + return offsets.length; + } + public static OffsetChar readFields(DataInput in) throws IOException { - int offsetToFirst = in.readInt(); - int offsetsLength = in.readInt(); - char[] offsets = new char[offsetsLength]; + final int offsetToFirst = in.readInt(); + final int offsetsLength = in.readInt(); + final char[] offsets = new char[offsetsLength]; + int offsetToLast = offsetToFirst; for(int i = 0; i < offsetsLength; i++) { offsets[i] = in.readChar(); + offsetToLast += offsets[i]; } - return new OffsetChar(offsets, offsetToFirst); + return new OffsetChar(offsets, offsetToFirst, offsetToLast); } - public static long getInMemorySize(int length) { + public static long estimateInMemorySize(int nOffs, int nRows) { long size = 16 + 4 + 8; // object header plus int plus reference - size += MemoryEstimates.charArrayCost(length - 1); + size += MemoryEstimates.charArrayCost(Math.max(nOffs, nRows / maxV)); return size; } + @Override + protected final void preAggregateDenseMapRow(double[] mV, int off, double[] preAV, int cu, int nVal, byte[] data, + AIterator it) { + final int maxId = data.length - 1; + while(it.isNotOver(cu)) { + final int dx = it.getDataIndex(); + preAV[data[dx] & 0xFF] += mV[off + it.value()]; + if(dx < maxId) + it.next(); + else + break; + } + cacheIterator(it, cu); + } + + @Override + protected final void preAggregateDenseMapRow(double[] mV, int off, double[] preAV, int cu, int nVal, char[] data, + AIterator it) { + final int maxId = data.length - 1; + while(it.isNotOver(cu)) 
{ + final int dx = it.getDataIndex(); + preAV[data[dx]] += mV[off + it.value()]; + if(dx < maxId) + it.next(); + else + break; + } + cacheIterator(it, cu); + } + private class IterateCharOffset extends AIterator { private IterateCharOffset() { @@ -139,25 +186,27 @@ private IterateCharOffset(int index, int dataIndex, int offset) { @Override public void next() { - if(index >= offsets.length) { - index++; - dataIndex++; - return; - } - final char v = offsets[index++]; - if(v == 0) { + char v = offsets[index]; + while(v == 0) { offset += maxV; - next(); - } - else { - dataIndex++; - offset += v; + index++; + v = offsets[index]; } + offset += v; + index++; + dataIndex++; } @Override - public boolean hasNext() { - return index <= offsets.length; + public int value() { + return offset; + } + + @Override + public int skipTo(int idx) { + while(offset < idx && index < offsets.length) + next(); + return offset; } @Override @@ -165,5 +214,4 @@ public IterateCharOffset clone() { return new IterateCharOffset(index, dataIndex, offset); } } - } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetFactory.java index d54be828985..60f8231f531 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetFactory.java @@ -22,12 +22,11 @@ import java.io.DataInput; import java.io.IOException; -import org.apache.sysds.runtime.compress.DMLCompressionException; - public interface OffsetFactory { // static final Log LOG = LogFactory.getLog(OffsetFactory.class.getName()); + /** The specific underlying types of offsets. */ public enum OFF_TYPE { BYTE, CHAR } @@ -35,11 +34,14 @@ public enum OFF_TYPE { /** * Main factory pattern creator for Offsets. * + * Note this creator is unsafe in the sense that it assumes the input index list contains only sequential, + * non-duplicate, incrementing values. + * * @param indexes List of indexes, that is assumed to be sorted and have no duplicates * @return AOffset object containing offsets to the next value. */ - public static AOffset create(int[] indexes) { - return create(indexes, 0, indexes.length); + public static AOffset createOffset(int[] indexes) { + return createOffset(indexes, 0, indexes.length); } /** @@ -48,18 +50,22 @@ public static AOffset create(int[] indexes) { * This is useful if the input is created from a CSR matrix, since it allows us to not reallocate the indexes[] but * use the shared indexes from the entire CSR representation. * + * Note this creator is unsafe in the sense that it assumes the input indexes in the range from apos to alen + * contain only sequential, non-duplicate, incrementing values. + * * @param indexes The indexes from which to take the offsets. * @param apos The position to start looking from in the indexes. * @param alen The position to end looking at in the indexes. * @return A new Offset.
*/ - public static AOffset create(int[] indexes, int apos, int alen) { + public static AOffset createOffset(int[] indexes, int apos, int alen) { + final int minValue = indexes[apos]; final int maxValue = indexes[alen - 1]; - if(maxValue < 0) - throw new DMLCompressionException("Invalid sizes given"); + final int range = maxValue - minValue; final int endLength = alen - apos; - final float avgDist = (float) maxValue / endLength; - if(avgDist < 256) + final long byteSize = OffsetByte.estimateInMemorySize(endLength, range); + final long charSize = OffsetChar.estimateInMemorySize(endLength, range); + if(byteSize < charSize) return new OffsetByte(indexes, apos, alen); else return new OffsetChar(indexes, apos, alen); @@ -96,16 +102,14 @@ public static AOffset readIn(DataInput in) throws IOException { * @return The estimated size of an offset given the number of offsets and rows. */ public static long estimateInMemorySize(int size, int nRows) { - if(size < 0 || nRows < 0) - throw new DMLCompressionException("Invalid sizes given: " + size + " " + nRows); - else if(size == 0) + if(size == 0) return 8; // If this is the case, then the compression results in constant col groups else { final int avgDiff = nRows / size; if(avgDiff < 256) - return OffsetByte.getInMemorySize(size - 1); + return OffsetByte.estimateInMemorySize(size - 1, nRows); else - return OffsetChar.getInMemorySize(size - 1); + return OffsetChar.estimateInMemorySize(size - 1, nRows); } } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibAppend.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibAppend.java index 6ca2619a160..68eca8045af 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibAppend.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibAppend.java @@ -70,6 +70,7 @@ public static MatrixBlock append(CompressedMatrixBlock left, CompressedMatrixBlo ret = appendColGroups(ret, left.getColGroups(), right.getColGroups(), left.getNumColumns()); + ret.setOverlapping(left.isOverlapping() || right.isOverlapping()); double compressedSize = ret.getInMemorySize(); double uncompressedSize = MatrixBlock.estimateSizeInMemory(m, n, ret.getSparsity()); @@ -85,24 +86,20 @@ public static MatrixBlock append(CompressedMatrixBlock left, CompressedMatrixBlo } private static MatrixBlock appendRightEmpty(CompressedMatrixBlock left, MatrixBlock right, int m, int n) { - CompressedMatrixBlock ret = new CompressedMatrixBlock(m, n); - List newGroup = new ArrayList<>(1); newGroup.add(ColGroupEmpty.generate(right.getNumColumns())); ret = appendColGroups(ret, left.getColGroups(), newGroup, left.getNumColumns()); - + ret.setOverlapping(left.isOverlapping()); return ret; } private static MatrixBlock appendLeftEmpty(MatrixBlock left, CompressedMatrixBlock right, int m, int n) { - CompressedMatrixBlock ret = new CompressedMatrixBlock(m, n); - List newGroup = new ArrayList<>(1); newGroup.add(ColGroupEmpty.generate(left.getNumColumns())); ret = appendColGroups(ret, newGroup, right.getColGroups(), left.getNumColumns()); - + ret.setOverlapping(right.isOverlapping()); return ret; } diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibBinaryCellOp.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibBinaryCellOp.java index e4c33330cd9..a045fa2362c 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibBinaryCellOp.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibBinaryCellOp.java @@ -158,8 +158,8 @@ private static CompressedMatrixBlock 
setupCompressedReturnMatrixBlock(Compressed return ret; } - private static MatrixBlock rowBinCellOp(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBlock ret, - BinaryOperator op, boolean left) { + private static MatrixBlock rowBinCellOp(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op, + boolean left) { CompressedMatrixBlock cRet = setupCompressedReturnMatrixBlock(m1, ret); if(isValidForOverlappingBinaryCellOperations(m1, op)) overlappingBinaryCellOp(m1, m2, cRet, op, left); @@ -333,32 +333,42 @@ protected static CompressedMatrixBlock binaryMVPlusStack(CompressedMatrixBlock m private static MatrixBlock binaryMVCol(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left) { - MatrixBlock ret = new MatrixBlock(m1.getNumRows(), m1.getNumColumns(), false, -1).allocateBlock(); + final int nCols = m1.getNumColumns(); + final int nRows = m1.getNumRows(); + // Pre filter. + final List groups = m1.getColGroups(); + final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups); + if(shouldFilter) { + CompressedMatrixBlock mf1 = new CompressedMatrixBlock(m1); + double[] constV = new double[nCols]; + final List filteredGroups = CLALibUtils.filterGroups(groups, constV); + filteredGroups.add(ColGroupFactory.genColGroupConst(constV)); + mf1.allocateColGroupList(filteredGroups); + m1 = mf1; + } + MatrixBlock ret = new MatrixBlock(nRows, nCols, false, -1).allocateBlock(); - final int blkz = CompressionSettings.BITMAP_BLOCK_SZ / m1.getNumColumns() * 5; + final int blkz = CompressionSettings.BITMAP_BLOCK_SZ / nCols * 5; final int k = op.getNumThreads(); long nnz = 0; if(k <= 1) { - for(int i = 0; i * blkz < m1.getNumRows(); i++) { + for(int i = 0; i < nRows; i += blkz) { if(left) - nnz += new BinaryMVColLeftTask(m1, m2, ret, i * blkz, Math.min(m1.getNumRows(), (i + 1) * blkz), op) - .call(); + nnz += new BinaryMVColLeftTask(m1, m2, ret, i, Math.min(nRows, i + blkz), op).call(); else - nnz += new BinaryMVColTask(m1, m2, ret, i * blkz, Math.min(m1.getNumRows(), (i + 1) * blkz), op).call(); + nnz += new BinaryMVColTask(m1, m2, ret, i, Math.min(nRows, i + blkz), op).call(); } } else { ExecutorService pool = CommonThreadPool.get(op.getNumThreads()); ArrayList> tasks = new ArrayList<>(); try { - for(int i = 0; i * blkz < m1.getNumRows(); i++) { + for(int i = 0; i < nRows; i += blkz) { if(left) - tasks.add( - new BinaryMVColLeftTask(m1, m2, ret, i * blkz, Math.min(m1.getNumRows(), (i + 1) * blkz), op)); + tasks.add(new BinaryMVColLeftTask(m1, m2, ret, i, Math.min(nRows, i + blkz), op)); else - tasks.add(new BinaryMVColTask(m1, m2, ret, i * blkz, Math.min(m1.getNumRows(), (i + 1) * blkz), op)); - + tasks.add(new BinaryMVColTask(m1, m2, ret, i, Math.min(nRows, i + blkz), op)); } for(Future f : pool.invokeAll(tasks)) nnz += f.get(); @@ -396,7 +406,7 @@ protected BinaryMVColTask(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBlock public Integer call() { // unsafe decompress, since we count nonzeros afterwards. for(AColGroup g : _m1.getColGroups()) - g.decompressToBlock(_ret, _rl, _ru); + g.decompressToDenseBlock(_ret.getDenseBlock(), _rl, _ru); if(_m2.isInSparseFormat()) throw new NotImplementedException("Not Implemented sparse Format execution for MM."); @@ -440,7 +450,7 @@ protected BinaryMVColLeftTask(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBl public Integer call() { // unsafe decompress, since we count nonzeros afterwards. 
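+ // materialize the row block [_rl, _ru) of every column group into the shared
+ // dense output; the binary operation is then applied cell-wise afterwards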
for(AColGroup g : _m1.getColGroups()) - g.decompressToBlock(_ret, _rl, _ru); + g.decompressToDenseBlock(_ret.getDenseBlock(), _rl, _ru); if(_m2.isInSparseFormat()) throw new NotImplementedException("Not Implemented sparse Format execution for MM."); diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java index 4a39eac1e89..49fdfe281c9 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java @@ -563,7 +563,7 @@ private MatrixBlock getTmp() { private MatrixBlock decompressToTemp() { MatrixBlock tmp = getTmp(); for(AColGroup g : _m1.getColGroups()) - g.decompressToBlock(tmp, _rl, _ru, -_rl, 0); + g.decompressToDenseBlock(tmp.getDenseBlock(), _rl, _ru, -_rl, 0); tmp.setNonZeros(_rl + _ru); return tmp; } diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java index 558ca7b3cd0..a646f8f4564 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java @@ -20,7 +20,6 @@ package org.apache.sysds.runtime.compress.lib; import java.util.ArrayList; -import java.util.Comparator; import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; @@ -31,13 +30,13 @@ import org.apache.commons.logging.LogFactory; import org.apache.sysds.api.DMLScript; import org.apache.sysds.runtime.compress.CompressedMatrixBlock; -import org.apache.sysds.runtime.compress.CompressionSettings; import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.colgroup.AColGroup; import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory; import org.apache.sysds.runtime.compress.colgroup.ColGroupUncompressed; import org.apache.sysds.runtime.controlprogram.parfor.stat.Timing; import org.apache.sysds.runtime.data.DenseBlock; +import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.util.CommonThreadPool; import org.apache.sysds.utils.DMLCompressionStatistics; @@ -69,7 +68,7 @@ public static void decompressTo(CompressedMatrixBlock cmb, MatrixBlock ret, int else if(outSparse) decompressToSparseBlock(cmb, ret, rowOffset, colOffset); else - decompressToDenseBlock(cmb, ret, rowOffset, colOffset); + decompressToDenseBlock(cmb, ret.getDenseBlock(), rowOffset, colOffset); if(DMLScript.STATISTICS) { final double t = time.stop(); @@ -81,29 +80,37 @@ else if(outSparse) private static void decompressToSparseBlock(CompressedMatrixBlock cmb, MatrixBlock ret, int rowOffset, int colOffset) { - final List groups = new ArrayList<>(cmb.getColGroups()); - final int nRows = cmb.getNumRows(); - for(AColGroup g : groups) - g.decompressToBlock(ret, 0, nRows, rowOffset, colOffset); + final SparseBlock sb = ret.getSparseBlock(); + final List groups = cmb.getColGroups(); + final int nRows = cmb.getNumRows(); + final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups); + if(shouldFilter) { + final MatrixBlock tmp = cmb.getUncompressed("Decompression to put into Sparse Block"); + tmp.putInto(ret, rowOffset, colOffset, false); + } + else + for(AColGroup g : groups) + g.decompressToSparseBlock(sb, 0, nRows, rowOffset, colOffset); } - private static void 
decompressToDenseBlock(CompressedMatrixBlock cmb, MatrixBlock ret, int rowOffset, - int colOffset) { - final List groups = new ArrayList<>(cmb.getColGroups()); + private static void decompressToDenseBlock(CompressedMatrixBlock cmb, DenseBlock ret, int rowOffset, int colOffset) { + final List groups = cmb.getColGroups(); // final int nCols = cmb.getNumColumns(); final int nRows = cmb.getNumRows(); - final boolean containsSDC = CLALibUtils.containsSDCOrConst(groups); - double[] constV = containsSDC ? new double[cmb.getNumColumns()] : null; - final List filteredGroups = containsSDC ? CLALibUtils.filterGroups(groups, constV) : groups; - - for(AColGroup g : filteredGroups) - g.decompressToBlock(ret, 0, nRows, rowOffset, colOffset); - - if(constV != null) { + final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups); + if(shouldFilter) { + final double[] constV = new double[cmb.getNumColumns()]; + final List filteredGroups = CLALibUtils.filterGroups(groups, constV); + for(AColGroup g : filteredGroups) + g.decompressToDenseBlock(ret, 0, nRows, rowOffset, colOffset); AColGroup cRet = ColGroupFactory.genColGroupConst(constV); - cRet.decompressToBlock(ret, 0, nRows, rowOffset, colOffset); + cRet.decompressToDenseBlock(ret, 0, nRows, rowOffset, colOffset); + } + else { + for(AColGroup g : groups) + g.decompressToDenseBlock(ret, 0, nRows, rowOffset, colOffset); } } @@ -122,34 +129,49 @@ private static MatrixBlock decompressExecute(CompressedMatrixBlock cmb, int k) { ret.setNonZeros(ret.recomputeNonZeros()); return ret; // if uncompressedColGroup is only colGroup. } - else if(ret == null) { - ret = new MatrixBlock(nRows, nCols, false, -1); - ret.allocateDenseBlock(); - } - final int block = (int) Math.ceil((double) (CompressionSettings.BITMAP_BLOCK_SZ) / nCols); - final int blklen = block > 1000 ? block + 1000 - block % 1000 : Math.max(64, block); + final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups); + double[] constV = shouldFilter ? new double[nCols] : null; + final List filteredGroups = shouldFilter ? CLALibUtils.filterGroups(groups, constV) : groups; + + if(ret == null) { // There was no uncompressed group that fit the entire matrix. + final boolean sparse = !shouldFilter && !overlapping && + MatrixBlock.evalSparseFormatInMemory(nRows, nCols, nonZeros); + ret = new MatrixBlock(nRows, nCols, sparse); + if(sparse) + ret.allocateSparseRowsBlock(); + else + ret.allocateDenseBlock(); + } - final boolean containsSDC = CLALibUtils.containsSDCOrConst(groups); - double[] constV = containsSDC ? new double[ret.getNumColumns()] : null; - final List filteredGroups = containsSDC ? CLALibUtils.filterGroups(groups, constV) : groups; - if(LOG.isTraceEnabled()) - LOG.debug("Decompressing with block size: " + blklen); + // final int block = (int) Math.ceil((double) (CompressionSettings.BITMAP_BLOCK_SZ) / nCols); + // final int blklen = Math.max(block, 64); + final int blklen = 32; - sortGroups(filteredGroups, overlapping); + // final int blklen = block > 1000 ? 
block + 1000 - block % 1000 : Math.max(64, block); // check if we are using filtered groups, and if we are not force constV to null if(groups == filteredGroups) constV = null; final double eps = getEps(constV); - if(k == 1) - decompressSingleThread(ret, filteredGroups, nRows, blklen, constV, eps, nonZeros, overlapping); - else - decompressMultiThread(ret, filteredGroups, nRows, blklen, constV, eps, overlapping, k); - if(overlapping) - ret.recomputeNonZeros(); + if(k == 1) { + if(ret.isInSparseFormat()) { + decompressSparseSingleThread(ret, filteredGroups, nRows, blklen); + ret.setNonZeros(nonZeros); + } + else { + decompressDenseSingleThread(ret, filteredGroups, nRows, blklen, constV, eps, nonZeros, overlapping); + ret.setNonZeros(nonZeros == -1 || overlapping ? ret.recomputeNonZeros() : nonZeros); + } + } + else if(ret.isInSparseFormat()) { + decompressSparseMultiThread(ret, filteredGroups, nRows, blklen, k); + ret.setNonZeros(nonZeros); + } + else + decompressDenseMultiThread(ret, filteredGroups, nRows, blklen, constV, eps, overlapping, k); ret.examSparsity(); return ret; @@ -183,33 +205,46 @@ private static MatrixBlock getUncompressedColGroupAndRemoveFromListOfColGroups(L return ret; } - private static void decompressSingleThread(MatrixBlock ret, List filteredGroups, int rlen, int blklen, - double[] constV, double eps, long nonZeros, boolean overlapping) { + private static void decompressSparseSingleThread(MatrixBlock ret, List filteredGroups, int rlen, + int blklen) { + final SparseBlock sb = ret.getSparseBlock(); + for(int i = 0; i < rlen; i += blklen) { + final int rl = i; + final int ru = Math.min(i + blklen, rlen); + for(AColGroup grp : filteredGroups) + grp.decompressToSparseBlock(ret.getSparseBlock(), rl, ru); + for(int j = rl; j < ru; j++) + if(!sb.isEmpty(j)) + sb.sort(j); + } + + } + + private static void decompressDenseSingleThread(MatrixBlock ret, List filteredGroups, int rlen, + int blklen, double[] constV, double eps, long nonZeros, boolean overlapping) { for(int i = 0; i < rlen; i += blklen) { final int rl = i; final int ru = Math.min(i + blklen, rlen); for(AColGroup grp : filteredGroups) - grp.decompressToBlock(ret, rl, ru); + grp.decompressToDenseBlock(ret.getDenseBlock(), rl, ru); if(constV != null && !ret.isInSparseFormat()) addVector(ret, constV, eps, rl, ru); } - ret.setNonZeros(nonZeros == -1 || overlapping ? 
ret.recomputeNonZeros() : nonZeros); } - private static void decompressMultiThread(MatrixBlock ret, List filteredGroups, int rlen, int blklen, + private static void decompressDenseMultiThread(MatrixBlock ret, List filteredGroups, int rlen, int blklen, double[] constV, double eps, boolean overlapping, int k) { try { final ExecutorService pool = CommonThreadPool.get(k); - final ArrayList tasks = new ArrayList<>(); - for(int i = 0; i * blklen < rlen; i++) - tasks.add(new DecompressTask(filteredGroups, ret, eps, i * blklen, Math.min((i + 1) * blklen, rlen), - overlapping, constV)); - List> rtasks = pool.invokeAll(tasks); - pool.shutdown(); + final ArrayList tasks = new ArrayList<>(); + for(int i = 0; i < rlen; i += blklen) + tasks.add( + new DecompressDenseTask(filteredGroups, ret, eps, i, Math.min(i + blklen, rlen), overlapping, constV)); long nnz = 0; - for(Future rt : rtasks) + for(Future rt : pool.invokeAll(tasks)) nnz += rt.get(); + pool.shutdown(); ret.setNonZeros(nnz); } catch(InterruptedException | ExecutionException ex) { @@ -217,23 +252,21 @@ private static void decompressMultiThread(MatrixBlock ret, List filte } } - private static void sortGroups(List groups, boolean overlapping) { - if(overlapping) { - // add a bit of stability in decompression - Comparator comp = Comparator.comparing(x -> effect(x)); - groups.sort(comp); - } - } + private static void decompressSparseMultiThread(MatrixBlock ret, List filteredGroups, int rlen, + int blklen, int k) { + try { + final ExecutorService pool = CommonThreadPool.get(k); + final ArrayList tasks = new ArrayList<>(); + for(int i = 0; i < rlen; i += blklen) + tasks.add(new DecompressSparseTask(filteredGroups, ret, i, Math.min(i + blklen, rlen))); - /** - * Calculate an effect value for a column group. This is used to sort the groups before decompression to decompress - * the columns that have the smallest effect first. - * - * @param x A Group - * @return A Effect double value. - */ - private static double effect(AColGroup x) { - return (x instanceof ColGroupUncompressed) ? 
-Double.MAX_VALUE : -Math.max(x.getMax(), Math.abs(x.getMin())); + for(Future rt : pool.invokeAll(tasks)) + rt.get(); + pool.shutdown(); + } + catch(InterruptedException | ExecutionException ex) { + throw new DMLCompressionException("Parallel decompression failed", ex); + } } /** @@ -259,7 +292,7 @@ private static double getEps(double[] constV) { } } - private static class DecompressTask implements Callable { + private static class DecompressDenseTask implements Callable { private final List _colGroups; private final MatrixBlock _ret; private final double _eps; @@ -268,7 +301,7 @@ private static class DecompressTask implements Callable { private final double[] _constV; private final boolean _overlapping; - protected DecompressTask(List colGroups, MatrixBlock ret, double eps, int rl, int ru, + protected DecompressDenseTask(List colGroups, MatrixBlock ret, double eps, int rl, int ru, boolean overlapping, double[] constV) { _colGroups = colGroups; _ret = ret; @@ -282,7 +315,7 @@ protected DecompressTask(List colGroups, MatrixBlock ret, double eps, @Override public Long call() { for(AColGroup grp : _colGroups) - grp.decompressToBlock(_ret, _rl, _ru); + grp.decompressToDenseBlock(_ret.getDenseBlock(), _rl, _ru); if(_constV != null) addVector(_ret, _constV, _eps, _rl, _ru); @@ -291,6 +324,31 @@ public Long call() { } } + private static class DecompressSparseTask implements Callable { + private final List _colGroups; + private final MatrixBlock _ret; + private final int _rl; + private final int _ru; + + protected DecompressSparseTask(List colGroups, MatrixBlock ret, int rl, int ru) { + _colGroups = colGroups; + _ret = ret; + _rl = rl; + _ru = ru; + } + + @Override + public Object call() { + final SparseBlock sb = _ret.getSparseBlock(); + for(AColGroup grp : _colGroups) + grp.decompressToSparseBlock(_ret.getSparseBlock(), _rl, _ru); + for(int i = _rl; i < _ru; i++) + if(!sb.isEmpty(i)) + sb.sort(i); + return null; + } + } + /** * Add the rowV vector to each row in ret. * diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java index 919f98a8db6..8b197b3ac3d 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java @@ -121,8 +121,8 @@ public static void leftMultByTransposeSelf(CompressedMatrixBlock cmb, MatrixBloc final List groups = cmb.getColGroups(); final int numColumns = cmb.getNumColumns(); final int numRows = cmb.getNumRows(); - final boolean containsSDC = CLALibUtils.containsSDCOrConst(groups); - final double[] constV = containsSDC ? new double[numColumns] : null; + final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups); + final double[] constV = shouldFilter ? new double[numColumns] : null; final List filteredGroups = CLALibUtils.filterGroups(groups, constV); // TODO add parallel again @@ -177,11 +177,11 @@ private static MatrixBlock leftMultByCompressedTransposedMatrix(CompressedMatrix final List rightCG = right.getColGroups(); final List leftCG = left.getColGroups(); - final boolean containsRight = CLALibUtils.containsSDCOrConst(rightCG); + final boolean containsRight = CLALibUtils.shouldPreFilter(rightCG); double[] cR = containsRight ? 
new double[cr] : null; final List fRight = CLALibUtils.filterGroups(rightCG, cR); - final boolean containsLeft = CLALibUtils.containsSDCOrConst(leftCG); + final boolean containsLeft = CLALibUtils.shouldPreFilter(leftCG); double[] cL = containsLeft ? new double[rl] : null; final List fLeft = CLALibUtils.filterGroups(leftCG, cL); @@ -246,11 +246,11 @@ private static MatrixBlock leftMultByMatrix(List colGroups, MatrixBlo } final int numColumnsOut = ret.getNumColumns(); - final boolean containsSDC = CLALibUtils.containsSDCOrConst(colGroups); + final boolean shouldFilter = CLALibUtils.shouldPreFilter(colGroups); final int lr = that.getNumRows(); // a constant colgroup summing the default values. - double[] constV = containsSDC ? new double[numColumnsOut] : null; + double[] constV = shouldFilter ? new double[numColumnsOut] : null; final List filteredGroups = CLALibUtils.filterGroups(colGroups, constV); if(colGroups == filteredGroups) constV = null; @@ -258,9 +258,9 @@ private static MatrixBlock leftMultByMatrix(List colGroups, MatrixBlo if(!filteredGroups.isEmpty()) { if(k == 1) - rowSums = leftMultByMatrixPrimitive(filteredGroups, that, ret, 0, lr, containsSDC ? new double[lr] : null); + rowSums = leftMultByMatrixPrimitive(filteredGroups, that, ret, 0, lr, shouldFilter ? new double[lr] : null); else - rowSums = leftMultByMatrixParallel(filteredGroups, that, ret, containsSDC, overlapping, k); + rowSums = leftMultByMatrixParallel(filteredGroups, that, ret, shouldFilter, overlapping, k); } else if(constV != null) rowSums = that.rowSum(k).getDenseBlockValues(); @@ -412,18 +412,19 @@ private static void leftMultByMatrixPrimitiveSparse(List colGroups, M int rl, int ru, double[] rowSum) { for(int i = rl; i < ru; i++) { + final SparseBlock sb = that.getSparseBlock(); + if(sb.isEmpty(i)) + continue; for(int j = 0; j < colGroups.size(); j++) { colGroups.get(j).leftMultByMatrix(that, ret, i, i + 1); } if(rowSum != null) { - final SparseBlock sb = that.getSparseBlock(); - if(!sb.isEmpty(i)) { - final int apos = sb.pos(i); - final int alen = sb.size(i) + apos; - final double[] aval = sb.values(i); - for(int j = apos; j < alen; j++) - rowSum[i] += aval[j]; - } + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final double[] aval = sb.values(i); + for(int j = apos; j < alen; j++) + rowSum[i] += aval[j]; + } } } @@ -440,8 +441,8 @@ private static void leftMultByMatrixPrimitiveDense(List colGroups, Ma // The number of column groups to process together // the value should ideally be set so that the colGroups fits into cache together with a row block. // currently we only try to avoid having a dangling small number of column groups in the last block. - final int colGroupBlocking = preAggCGs.size() % 16 < 4 ? 20 : 16; - + final int colGroupBlocking = preAggCGs.size();// % 16 < 4 ? 
20 : 16; + // final int colGroupBlocking = 3; // Allocate pre Aggregate Array List final MatrixBlock[] preAgg = populatePreAggregate(colGroupBlocking); @@ -461,27 +462,13 @@ private static void leftMultByMatrixPrimitiveDense(List colGroups, Ma preAgg[j % colGroupBlocking].reset(rowBlockSize, nVals, false); } - int colBlockSize = 32000; - // For each row block for(int h = rl; h < ru; h += rowBlockSize) { - // For each column block final int rowUpper = Math.min(h + rowBlockSize, ru); - for(int i = 0; i < lc; i += colBlockSize) { - final int colUpper = Math.min(i + colBlockSize, lc); - // Pre Aggregate each column group in block - for(int j = g; j < gEnd && j < preAggCGs.size(); j++) { - preAggCGs.get(j).preAggregateDense(that, preAgg[j % colGroupBlocking], h, rowUpper, i, colUpper); - } - if(rowSum != null) { - final double[] thatV = that.getDenseBlockValues(); - for(int r = h; r < rowUpper; r++) { - final int rowOff = r * lc; - for(int c = rowOff + i; c < rowOff + colUpper; c++) - rowSum[r] += thatV[c]; - } - } - } + if(rowSum != null) + preAggregateWithRowSums(that, h, rowUpper, preAggCGs, g, gEnd, preAgg, rowSum); + else + preAggregate(that, h, rowUpper, preAggCGs, g, gEnd, preAgg); // Multiply out the preAggregate to the output matrix. for(int j = g; j < gEnd && j < preAggCGs.size(); j++) { @@ -507,6 +494,42 @@ private static void leftMultByMatrixPrimitiveDense(List colGroups, Ma } } + private static void preAggregateWithRowSums(MatrixBlock that, int rl, int ru, List preAggCGs, int g, + int gEnd, MatrixBlock[] preAgg, double[] rowSum) { + final int lc = that.getNumColumns(); + final int colBlockSize = 25000; + final int colGroupBlocking = preAgg.length; + // For each column block + for(int i = 0; i < lc; i += colBlockSize) { + final int colUpper = Math.min(i + colBlockSize, lc); + // Pre Aggregate each column group in block + for(int j = g; j < gEnd && j < colGroupBlocking; j++) + preAggCGs.get(j).preAggregateDense(that, preAgg[j % colGroupBlocking], rl, ru, i, colUpper); + + final double[] thatV = that.getDenseBlockValues(); + for(int r = rl; r < ru; r++) { + final int rowOff = r * lc; + for(int c = rowOff + i; c < rowOff + colUpper; c++) + rowSum[r] += thatV[c]; + } + + } + } + + private static void preAggregate(MatrixBlock that, int rl, int ru, List preAggCGs, int g, int gEnd, + MatrixBlock[] preAgg) { + + final int lc = that.getNumColumns(); + final int colBlockSize = 25000; + final int colGroupBlocking = preAgg.length; + for(int i = 0; i < lc; i += colBlockSize) { + final int colUpper = Math.min(i + colBlockSize, lc); + // Pre Aggregate each column group in block + for(int j = g; j < gEnd && j < colGroupBlocking; j++) + preAggCGs.get(j).preAggregateDense(that, preAgg[j % colGroupBlocking], rl, ru, i, colUpper); + } + } + private static MatrixBlock[] populatePreAggregate(int colGroupBlocking) { final MatrixBlock[] preAgg = new MatrixBlock[colGroupBlocking]; // populate the preAgg array. 
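For intuition, the row-sum fusion implemented by preAggregateWithRowSums amounts to the loop nest below. This is a minimal sketch over a plain row-major array: thatV, rowSum, and the colBlockSize constant mirror the code above, while the method itself and its name are hypothetical.

static void rowSumsBlocked(double[] thatV, int nRows, int nCols, double[] rowSum) {
	// scan one column block at a time, so this pass touches the same cache-resident
	// region of the input as the pre-aggregation of that block
	final int colBlockSize = 25000;
	for(int i = 0; i < nCols; i += colBlockSize) {
		final int colUpper = Math.min(i + colBlockSize, nCols);
		for(int r = 0; r < nRows; r++) {
			final int rowOff = r * nCols;
			for(int c = rowOff + i; c < rowOff + colUpper; c++)
				rowSum[r] += thatV[c];
		}
	}
}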
diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibRightMultBy.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibRightMultBy.java index 52ad0da3e4d..3ebdd3a00e3 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibRightMultBy.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibRightMultBy.java @@ -92,9 +92,9 @@ private static MatrixBlock rightMultByMatrixOverlapping(CompressedMatrixBlock m1 final List retCg = new ArrayList<>(); final CompressedMatrixBlock ret = new CompressedMatrixBlock(rl, cr); - final boolean containsSDC = CLALibUtils.containsSDCOrConst(colGroups); + final boolean shouldFilter = CLALibUtils.shouldPreFilter(colGroups); - double[] constV = containsSDC ? new double[rr] : null; + double[] constV = shouldFilter ? new double[rr] : null; final List filteredGroups = CLALibUtils.filterGroups(colGroups, constV); if(colGroups == filteredGroups) constV = null; diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibSlice.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibSlice.java new file mode 100644 index 00000000000..94865036b42 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibSlice.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.compress.lib; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysds.runtime.compress.CompressedMatrixBlock; +import org.apache.sysds.runtime.compress.colgroup.AColGroup; +import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory; +import org.apache.sysds.runtime.data.DenseBlock; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; + +public class CLALibSlice { + + protected static final Log LOG = LogFactory.getLog(CLALibSlice.class.getName()); + + public static MatrixBlock slice(CompressedMatrixBlock cmb, int rl, int ru, int cl, int cu, boolean deep) { + if(rl == ru && cl == cu) + return sliceSingle(cmb, rl, cl); + else if(rl == 0 && ru == cmb.getNumRows() - 1) + return sliceColumns(cmb, cl, cu); + else if(cl == 0 && cu == cmb.getNumColumns() - 1) + return sliceRows(cmb, rl, ru); + else + return sliceInternal(cmb, rl, ru, cl, cu); + } + + private static MatrixBlock sliceInternal(CompressedMatrixBlock cmb, int rl, int ru, int cl, int cu) { + // In the case where an internal matrix is sliced out, then first slice out the + // columns to a compressed intermediate. + // Then call slice recursively, to do the row slice. + // Since we do not copy the index structure but simply maintain a pointer to the + // original, this is fine.
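+ // Only the final row slice materializes values; the column-slice intermediate
+ // keeps pointing at the original index structures.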
+ return sliceRows(sliceColumns(cmb, cl, cu), rl, ru); + } + + private static MatrixBlock sliceRows(CompressedMatrixBlock cmb, int rl, int ru) { + final int nCol = cmb.getNumColumns(); + final int rue = ru + 1; + MatrixBlock tmp = new MatrixBlock(rue - rl, nCol, false).allocateDenseBlock(); + DenseBlock db = tmp.getDenseBlock(); + final List groups = cmb.getColGroups(); + final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups); + if(shouldFilter) { + final double[] constV = new double[nCol]; + final List filteredGroups = CLALibUtils.filterGroups(groups, constV); + for(AColGroup g : filteredGroups) + g.decompressToDenseBlock(db, rl, rue, -rl, 0); + AColGroup cRet = ColGroupFactory.genColGroupConst(constV); + cRet.decompressToDenseBlock(db, rl, rue, -rl, 0); + } + else + for(AColGroup g : groups) + g.decompressToDenseBlock(db, rl, rue, -rl, 0); + + tmp.recomputeNonZeros(); + tmp.examSparsity(); + return tmp; + } + + private static MatrixBlock sliceSingle(CompressedMatrixBlock cmb, int row, int col) { + // get a single index, and return in a matrixBlock + MatrixBlock tmp = new MatrixBlock(1, 1, 0); + tmp.appendValue(0, 0, cmb.getValue(row, col)); + return tmp; + } + + private static CompressedMatrixBlock sliceColumns(CompressedMatrixBlock cmb, int cl, int cu) { + final int cue = cu + 1; + final CompressedMatrixBlock ret = new CompressedMatrixBlock(cmb.getNumRows(), cue - cl); + + final List newColGroups = new ArrayList<>(); + for(AColGroup grp : cmb.getColGroups()) { + final AColGroup slice = grp.sliceColumns(cl, cue); + if(slice != null) + newColGroups.add(slice); + } + + ret.allocateColGroupList(newColGroups); + ret.recomputeNonZeros(); + ret.setOverlapping(cmb.isOverlapping()); + return ret; + } + +} diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibUtils.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibUtils.java index d6965173600..0141a8d802b 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibUtils.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibUtils.java @@ -23,29 +23,18 @@ import java.util.Arrays; import java.util.List; +import org.apache.commons.lang.NotImplementedException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.compress.CompressedMatrixBlock; import org.apache.sysds.runtime.compress.colgroup.AColGroup; +import org.apache.sysds.runtime.compress.colgroup.AMorphingMMColGroup; import org.apache.sysds.runtime.compress.colgroup.ColGroupConst; import org.apache.sysds.runtime.compress.colgroup.ColGroupEmpty; import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory; -import org.apache.sysds.runtime.compress.colgroup.ColGroupSDC; -import org.apache.sysds.runtime.compress.colgroup.ColGroupSDCSingle; public final class CLALibUtils { - // private static final Log LOG = LogFactory.getLog(CLALibUtils.class.getName()); - - /** - * Helper method to determine if the column groups contains SDC - * - * @param groups The ColumnGroups to analyze - * @return A Boolean saying it there is >= 2 SDC Groups. - */ - protected static boolean containsSDC(List groups) { - for(AColGroup g : groups) - if(g instanceof ColGroupSDC || g instanceof ColGroupSDCSingle) - return true; - return false; - } + protected static final Log LOG = LogFactory.getLog(CLALibUtils.class.getName()); /** * Helper method to determine if the column groups contains SDC or Constant groups. 
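The consumer-side pattern of these helpers, pieced together from the call sites in this patch, looks roughly as follows. shouldPreFilter, filterGroups, genColGroupConst, and decompressToDenseBlock are the real APIs introduced or renamed here; the surrounding variables (cmb, db, nRows) are assumed to be in scope, so this is a sketch rather than library code.

List<AColGroup> groups = cmb.getColGroups();
if(CLALibUtils.shouldPreFilter(groups)) {
	// morphing groups (e.g. SDC) are replaced by zero-default residuals while
	// their common values accumulate into constV
	final double[] constV = new double[cmb.getNumColumns()];
	final List<AColGroup> filtered = CLALibUtils.filterGroups(groups, constV);
	for(AColGroup g : filtered)
		g.decompressToDenseBlock(db, 0, nRows);
	// the shared constants are applied exactly once, as a single constant group
	ColGroupFactory.genColGroupConst(constV).decompressToDenseBlock(db, 0, nRows);
}
else
	for(AColGroup g : groups)
		g.decompressToDenseBlock(db, 0, nRows);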
@@ -53,37 +42,13 @@ protected static boolean containsSDC(List groups) { * @param groups The ColumnGroups to analyze * @return A Boolean saying there is SDC groups or Constant groups. */ - protected static boolean containsSDCOrConst(List groups) { + protected static boolean shouldPreFilter(List groups) { for(AColGroup g : groups) - if(g instanceof ColGroupSDC || g instanceof ColGroupSDCSingle || g instanceof ColGroupConst) + if(g instanceof AMorphingMMColGroup || g instanceof ColGroupConst) return true; return false; } - /** - * Helper method to filter out SDC Groups, to add their common value to the ConstV. This allows exploitation of the - * common values in the SDC Groups. - * - * @param groups The Column Groups - * @param constV The Constant vector to add common values to. - * @return The Filtered list of Column groups containing no SDC Groups but only SDCZero groups. - */ - protected static List filterSDCGroups(List groups, double[] constV) { - if(constV == null) - return groups; - - final List filteredGroups = new ArrayList<>(); - for(AColGroup g : groups) { - if(g instanceof ColGroupSDC) - filteredGroups.add(((ColGroupSDC) g).extractCommon(constV)); - else if(g instanceof ColGroupSDCSingle) - filteredGroups.add(((ColGroupSDCSingle) g).extractCommon(constV)); - else - filteredGroups.add(g); - } - return returnGroupIfFiniteNumbers(groups, filteredGroups, constV); - } - /** * Helper method to filter out SDC Groups and remove all constant groups, to reduce computation. * @@ -97,10 +62,8 @@ protected static List filterGroups(List groups, double[] c final List filteredGroups = new ArrayList<>(); for(AColGroup g : groups) { - if(g instanceof ColGroupSDC) - filteredGroups.add(((ColGroupSDC) g).extractCommon(constV)); - else if(g instanceof ColGroupSDCSingle) - filteredGroups.add(((ColGroupSDCSingle) g).extractCommon(constV)); + if(g instanceof AMorphingMMColGroup) + filteredGroups.add(((AMorphingMMColGroup) g).extractCommon(constV)); else if(g instanceof ColGroupEmpty) continue; else if(g instanceof ColGroupConst) @@ -115,7 +78,8 @@ private static List returnGroupIfFiniteNumbers(List groups double[] constV) { for(double v : constV) if(!Double.isFinite(v)) - return groups; + throw new NotImplementedException(); + // return groups; return filteredGroups; } diff --git a/src/test/java/org/apache/sysds/test/TestUtils.java b/src/test/java/org/apache/sysds/test/TestUtils.java index 125de369696..a0ba5bf418a 100644 --- a/src/test/java/org/apache/sysds/test/TestUtils.java +++ b/src/test/java/org/apache/sysds/test/TestUtils.java @@ -918,7 +918,7 @@ private static void compareMatricesBitAvgDistanceSparse(SparseBlock sbe, SparseB continue; if(sba.size(i) != sbe.size(i)) - fail(message+"\nNumber of values are not equal in row: " + i); + fail(message+"\nNumber of values are not equal in row: " + i +"\nactual:"+ sba.get(i) +"\nexpected:"+ sbe.get(i)); final double[] e = sbe.values(i); final double[] a = sba.values(i); diff --git a/src/test/java/org/apache/sysds/test/component/compress/CompressedMatrixTest.java b/src/test/java/org/apache/sysds/test/component/compress/CompressedMatrixTest.java index fee58b97b89..b914dd7a301 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/CompressedMatrixTest.java +++ b/src/test/java/org/apache/sysds/test/component/compress/CompressedMatrixTest.java @@ -171,6 +171,17 @@ public void testUnaryOperators(AggType aggType, boolean inCP) { testUnaryOperators(aggType, auop, inCP); } + @Test + public void testNonZeros() { + if(!(cmb instanceof 
CompressedMatrixBlock)) + return; // Input was not compressed, so just pass the test + if(!(cmb.getNonZeros() >= mb.getNonZeros())) { + fail(bufferedToString + "\nIncorrect number of non zeros: compression should guarantee greater than or equal, but got " + + cmb.getNonZeros() + " while expecting at least " + mb.getNonZeros()); + } + + } + @Test public void testSerialization() { try { diff --git a/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java b/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java index 16ca8ad8246..34a800f4262 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java +++ b/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java @@ -95,8 +95,8 @@ public abstract class CompressedTestBase extends TestBase { protected static ValueRange[] usedValueRanges = new ValueRange[] {ValueRange.BOOLEAN, ValueRange.SMALL, ValueRange.NEGATIVE}; - protected static OverLapping[] overLapping = new OverLapping[] {OverLapping.PLUS_LARGE, OverLapping.MATRIX, - OverLapping.NONE, OverLapping.APPEND_CONST, OverLapping.C_BIND_SELF}; + protected static OverLapping[] overLapping = new OverLapping[] {OverLapping.PLUS_LARGE, OverLapping.PLUS_ROW_VECTOR, + OverLapping.MATRIX, OverLapping.NONE, OverLapping.APPEND_CONST, OverLapping.C_BIND_SELF}; protected static CompressionSettingsBuilder[] usedCompressionSettings = new CompressionSettingsBuilder[] { // CLA TESTS! @@ -264,11 +264,24 @@ else if(ov == OverLapping.SQUASH) { cmb = ((CompressedMatrixBlock) cmb).squash(_k); } } - if(ov == OverLapping.PLUS || ov == OverLapping.PLUS_LARGE) { - ScalarOperator sop = ov == OverLapping.PLUS_LARGE ? new LeftScalarOperator(Plus.getPlusFnObject(), - -3142151) : new LeftScalarOperator(Plus.getPlusFnObject(), 5); - mb = mb.scalarOperations(sop, new MatrixBlock()); - cmb = cmb.scalarOperations(sop, new MatrixBlock()); + if(cmb instanceof CompressedMatrixBlock) { + + if(ov == OverLapping.PLUS || ov == OverLapping.PLUS_LARGE) { + ScalarOperator sop = ov == OverLapping.PLUS_LARGE ?
new LeftScalarOperator(Plus.getPlusFnObject(), + -3142151) : new LeftScalarOperator(Plus.getPlusFnObject(), 5); + mb = mb.scalarOperations(sop, new MatrixBlock()); + cmb = cmb.scalarOperations(sop, new MatrixBlock()); + } + else if(ov == OverLapping.PLUS_ROW_VECTOR) { + + MatrixBlock v = TestUtils.generateTestMatrixBlock(1, cols, -1, 1, 1.0, 4); + BinaryOperator bop = new BinaryOperator(Plus.getPlusFnObject(), _k); + mb = mb.binaryOperations(bop, v, null); + cmb = cmb.binaryOperations(bop, v, null); + lossyTolerance = lossyTolerance + 2; + } + if(!(cmb instanceof CompressedMatrixBlock)) + fail("Invalid construction, should result in compressed MatrixBlock"); } } @@ -285,6 +298,7 @@ else if(ov == OverLapping.SQUASH) { matrixRowsCols = null; } TestUtils.assertEqualColsAndRows(mb, cmb, bufferedToString); + } catch(Exception e) { e.printStackTrace(); @@ -375,7 +389,7 @@ public void testDecompress() { try { if(!(cmb instanceof CompressedMatrixBlock)) return; // Input was not compressed then just pass test - + ((CompressedMatrixBlock) cmb).clearSoftReferenceToDecompressed(); MatrixBlock decompressedMatrixBlock = ((CompressedMatrixBlock) cmb).decompress(_k); compareResultMatrices(mb, decompressedMatrixBlock, 1); assertEquals(bufferedToString, mb.getNonZeros(), decompressedMatrixBlock.getNonZeros()); @@ -902,10 +916,13 @@ public void testSlice(int rl, int ru, int cl, int cu) { try { if(!(cmb instanceof CompressedMatrixBlock) || rows * cols > 10000) return; - MatrixBlock ret2 = cmb.slice(rl, ru, cl, cu); - MatrixBlock ret1 = mb.slice(rl, ru, cl, cu); - if(!(ret2 instanceof CompressedMatrixBlock)) - assertEquals(ret1.getNonZeros(), ret2.getNonZeros()); + final MatrixBlock ret2 = cmb.slice(rl, ru, cl, cu); + final MatrixBlock ret1 = mb.slice(rl, ru, cl, cu); + final long nnz1 = ret1.getNonZeros(); + final long nnz2 = ret2.getNonZeros(); + if(!(ret2 instanceof CompressedMatrixBlock) && nnz1 != nnz2) + fail(bufferedToString + "\nNot same number of non zeros " + nnz1 + " != " + nnz2); + compareResultMatrices(ret1, ret2, 1); } catch(Exception e) { diff --git a/src/test/java/org/apache/sysds/test/component/compress/ExtendedMatrixTests.java b/src/test/java/org/apache/sysds/test/component/compress/ExtendedMatrixTests.java index 3c5be85ee96..924f5ef374b 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/ExtendedMatrixTests.java +++ b/src/test/java/org/apache/sysds/test/component/compress/ExtendedMatrixTests.java @@ -76,12 +76,15 @@ public static Collection data() { SparsityType st = SparsityType.FULL; ValueType vt = ValueType.RLE_COMPRESSIBLE; ValueRange vr = ValueRange.SMALL; - MatrixTypology mt = MatrixTypology.SMALL; + MatrixTypology mt = MatrixTypology.LARGE; OverLapping ov = OverLapping.NONE; for(CompressionSettingsBuilder cs : usedCompressionSettings) tests.add(new Object[] {st, vt, vr, cs, mt, ov, 1, null}); + ov = OverLapping.PLUS_ROW_VECTOR; + for(CompressionSettingsBuilder cs : usedCompressionSettings) + tests.add(new Object[] {st, vt, vr, cs, mt, ov, 1, null}); return tests; } @@ -132,7 +135,7 @@ public void testSum() { else if(OverLapping.effectOnOutput(overlappingType)) assertTrue(bufferedToString, TestUtils.getPercentDistance(ret2, ret1, true) > .99); else - TestUtils.compareScalarBitsJUnit(ret2, ret1, 3, bufferedToString); // Should be exactly same value + TestUtils.compareScalarBitsJUnit(ret2, ret1, 100, bufferedToString); // Should be exactly same value } diff --git a/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java 
b/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java index a416b547e1f..7341bc5a3d8 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java +++ b/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java @@ -34,8 +34,7 @@ public enum ValueType { RAND_ROUND, // Values rounded to nearest whole numbers. OLE_COMPRESSIBLE, // Ideal inputs for OLE Compression. RLE_COMPRESSIBLE, // Ideal inputs for RLE Compression. - ONE_HOT, - UNBALANCED_SPARSE, // An input where some columns are super dense and some very sparse + ONE_HOT, UNBALANCED_SPARSE, // An input where some columns are super dense and some very sparse } public enum MatrixTypology { @@ -55,7 +54,8 @@ public enum ValueRange { } public enum OverLapping { - COL, MATRIX, NONE, MATRIX_PLUS, MATRIX_MULT_NEGATIVE, SQUASH, PLUS, APPEND_EMPTY, APPEND_CONST, PLUS_LARGE, C_BIND_SELF; + COL, MATRIX, NONE, MATRIX_PLUS, MATRIX_MULT_NEGATIVE, SQUASH, PLUS, APPEND_EMPTY, APPEND_CONST, PLUS_LARGE, + C_BIND_SELF, PLUS_ROW_VECTOR; public static boolean effectOnOutput(OverLapping opcode) { switch(opcode) { diff --git a/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingPreAggregateTests.java b/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingPreAggregateTests.java index 795803f9275..6956e0e37c5 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingPreAggregateTests.java +++ b/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingPreAggregateTests.java @@ -19,16 +19,21 @@ package org.apache.sysds.test.component.compress.mapping; +import static org.junit.Assert.fail; + import java.util.ArrayList; import java.util.Collection; import java.util.Random; +import org.apache.commons.lang.NotImplementedException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData; import org.apache.sysds.runtime.compress.colgroup.mapping.MapToByte; import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory; import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory.MAP_TYPE; +import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; +import org.apache.sysds.runtime.compress.colgroup.offset.OffsetByte; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.test.TestUtils; import org.junit.Test; @@ -46,6 +51,9 @@ public class MappingPreAggregateTests { public final int size; private AMapToData m; private MapToByte ref; + private AOffset o; + private final MatrixBlock mb; // matrix block to preAggregate from. 
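+ // reference pre-aggregate result, computed once from the byte map (ref) in the constructor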
+ private final double[] preRef; @Parameters public static Collection data() { @@ -67,6 +75,11 @@ public MappingPreAggregateTests(int seed, MAP_TYPE type, int size) { this.type = type; this.size = size; genBitMap(seed); + + mb = TestUtils.generateTestMatrixBlock(2, size, 0, 100, 1.0, seed); + preRef = new double[m.getUnique()]; + o = OneOffset.create(size); + ref.preAggregateDense(mb, preRef, 0, 1, 0, size); } protected AMapToData genBitMap(int seed) { @@ -85,20 +98,69 @@ protected AMapToData genBitMap(int seed) { @Test public void testPreAggregateDense() { - int nUnique = m.getUnique(); - int size = m.size(); + try { + final int size = m.size(); + MatrixBlock mb = TestUtils.generateTestMatrixBlock(1, size, 0, 100, 1.0, seed); + double[] pre = new double[m.getUnique()]; + m.preAggregateDense(mb, pre, 0, 1, 0, size); + TestUtils.compareMatrices(preRef, pre, 0.00001); + } + catch(Exception e) { + e.printStackTrace(); + fail(e.toString()); + } + } - MatrixBlock mb = TestUtils.generateTestMatrixBlock(1, size, 0, 100, 1.0, seed); - MatrixBlock pre = new MatrixBlock(1, nUnique, false); - pre.allocateDenseBlock(); + @Test + public void testPreAggregateDenseWithIndexes() { + switch(type) { + case BIT: + case INT: + return; + default: + try { + final int size = m.size(); + MatrixBlock mb = TestUtils.generateTestMatrixBlock(1, size, 0, 100, 1.0, seed); + double[] pre = new double[m.getUnique()]; + m.preAggregateDense(mb, pre, 0, 1, 0, size, o); + TestUtils.compareMatrices(preRef, pre, 0.00001); + } + catch(Exception e) { + e.printStackTrace(); + fail(e.toString()); + } + } + } - m.preAggregateDense(mb, pre, 0, 1, 0, 100); + @Test(expected = NotImplementedException.class) + public void testPreAggregateDenseWithIndexesExceptionExpected() { + switch(type) { + case BIT: + case INT: + m.preAggregateDense(mb, null, 0, 1, 0, size, o); + default: + throw new NotImplementedException(); + } + } + + @Test(expected = NotImplementedException.class) + public void testPreAggregateDenseExceptionExpected() { + m.preAggregateDense(mb, null, 0, 2, 0, size); + } - MatrixBlock preRef = new MatrixBlock(1, nUnique, false); - preRef.allocateDenseBlock(); - - ref.preAggregateDense(mb, preRef, 0, 1,0,100); + private static class OneOffset extends OffsetByte { - TestUtils.compareMatrices(preRef, pre, 0, "preaggregate not same with different maps"); + private OneOffset(byte[] offsets, int offsetToFirst, int offsetToLast) { + super(offsets, offsetToFirst, offsetToLast); + } + + protected static OneOffset create(int length) { + int offsetToFirst = 0; + int offsetToLast = length - 1; + byte[] offsets = new byte[length - 1]; + for(int i = 0; i < offsets.length; i++) + offsets[i] = 1; + return new OneOffset(offsets, offsetToFirst, offsetToLast); + } } } diff --git a/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingTests.java b/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingTests.java index 2bb813831c5..8509d3e46c8 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingTests.java +++ b/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingTests.java @@ -64,7 +64,7 @@ public static Collection data() { tests.add(new Object[] {4, t, 63, false}); tests.add(new Object[] {3, t, 64, false}); tests.add(new Object[] {3, t, 65, false}); - tests.add(new Object[] {5, t, 64+63, false}); + tests.add(new Object[] {5, t, 64 + 63, false}); tests.add(new Object[] {5, t, 1234, false}); tests.add(new Object[] {5, t, 13, true}); } @@ -107,6 +107,7 @@ protected static 
AMapToData genMap(AMapToData m, int[] expected, int max, boolea // to make sure that the bit set is actually filled. m.set(size - 1, max); + expected[size - 1] = max; return m; } @@ -205,6 +206,32 @@ public void replaceMax() { } } + @Test + public void getCountsWithDefault() { + switch(type) { + case CHAR: + case BIT: + case INT: + return; + default: + int[] counts = m.getCounts(new int[m.getUnique() + 1], size + 10); + if(10 != counts[m.getUnique()]) { + fail("Incorrect number of unique values:" + m + "\n" + Arrays.toString(counts)); + } + } + } + + @Test + public void getCountsNoDefault() { + switch(type) { + case CHAR: + case INT: + return; + default: + m.getCounts(new int[m.getUnique()], size); + } + } + @Test public void replaceMin() { int max = m.getUpperBoundValue(); @@ -217,6 +244,17 @@ public void replaceMin() { } } + @Test + public void getUnique() { + switch(type) { + case INT: + return; + default: + int u = m.getUnique(); + assertEquals(m.getUpperBoundValue() + 1, u); + } + } + @Test public void testInMemorySize() { long inMemorySize = m.getInMemorySize(); diff --git a/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetNegativeTests.java b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetNegativeTests.java deleted file mode 100644 index ebf81a3ce14..00000000000 --- a/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetNegativeTests.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.test.component.compress.offset; - -import static org.junit.Assert.fail; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; - -import org.apache.commons.lang.NotImplementedException; -import org.apache.sysds.runtime.compress.colgroup.offset.AIterator; -import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; -import org.apache.sysds.runtime.compress.colgroup.offset.OffsetByte; -import org.apache.sysds.runtime.compress.colgroup.offset.OffsetChar; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import org.junit.runners.Parameterized.Parameters; - -@RunWith(value = Parameterized.class) -public class OffsetNegativeTests { - - private enum TYPE { - BYTE, CHAR - } - - @Parameterized.Parameter - public int[] data; - @Parameterized.Parameter(1) - public TYPE type; - - @Parameters - public static Collection data() { - ArrayList tests = new ArrayList<>(); - // It is assumed that the input is in sorted order, all values are positive and there are no duplicates. 
- for(TYPE t : TYPE.values()) { - tests.add(new Object[] {new int[] {1, 1,}, t}); - tests.add(new Object[] {new int[] {2, 2, 2, 2}, t}); - tests.add(new Object[] {new int[] {1, 2, 3, 4, 5, 5}, t}); - tests.add(new Object[] {null, t}); - tests.add(new Object[] {new int[] {}, t}); - - } - return tests; - } - - @Test(expected = Exception.class) - public void testConstruction() { - switch(type) { - case BYTE: - testConstruction(new OffsetByte(data)); - break; - case CHAR: - testConstruction(new OffsetChar(data)); - break; - default: - throw new NotImplementedException("not implemented"); - } - - } - - public void testConstruction(AOffset o) { - AIterator i = o.getIterator(); - for(int j = 0; j < data.length; j++) { - - if(data[j] != i.value()) - fail("incorrect result using : " + o.getClass().getSimpleName() + " expected: " + Arrays.toString(data) - + " but was :" + o.toString()); - if(i.hasNext()) - i.next(); - } - } - -} diff --git a/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetSingleTests.java b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetSingleTests.java index 3fe8393d475..5ec39127e15 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetSingleTests.java +++ b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetSingleTests.java @@ -19,23 +19,29 @@ package org.apache.sysds.test.component.compress.offset; +import static org.junit.Assert.assertTrue; + +import org.apache.commons.lang.NotImplementedException; +import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory; import org.junit.Test; public class OffsetSingleTests { - @Test(expected = RuntimeException.class) - public void testInvalidSize_01() { - OffsetFactory.estimateInMemorySize(-1, 100); + @Test + public void testEmptyEstimateMemory() { + assertTrue(OffsetFactory.estimateInMemorySize(0, 10000) < 10); } - @Test(expected = RuntimeException.class) - public void testInvalidSize_02() { - OffsetFactory.estimateInMemorySize(10, -1); + @Test(expected = NotImplementedException.class) + public void testNotImplementedMultirowAggregationChar() { + AOffset a = OffsetFactory.createOffset(new int[] {1, 2, 3, 4, 5}); + a.preAggregateDenseMap(null, null, 0, 2, 0, 5, -1, (char[]) null); } - @Test(expected = RuntimeException.class) - public void testInvalidCreation() { - OffsetFactory.create(new int[] {1, 2, 3, -1}); + @Test(expected = NotImplementedException.class) + public void testNotImplementedMultirowAggregationByte() { + AOffset a = OffsetFactory.createOffset(new int[] {1, 2, 3, 4, 5}); + a.preAggregateDenseMap(null, null, 0, 2, 0, 5, -1, (byte[]) null); } } diff --git a/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestPreAggregate.java b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestPreAggregate.java new file mode 100644 index 00000000000..f7cec1f1407 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestPreAggregate.java @@ -0,0 +1,460 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.component.compress.offset; + +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; + +import org.apache.commons.lang.NotImplementedException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.commons.math3.util.Precision; +import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; +import org.apache.sysds.runtime.compress.colgroup.offset.OffsetByte; +import org.apache.sysds.runtime.compress.colgroup.offset.OffsetChar; +import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory.OFF_TYPE; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.test.TestUtils; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +@RunWith(value = Parameterized.class) +public class OffsetTestPreAggregate { + protected static final Log LOG = LogFactory.getLog(OffsetTestPreAggregate.class.getName()); + + private static final double eps = 0.00001; + + private final int[] data; + private final AOffset a; + + private final MatrixBlock leftM; + + // Per-row sums of the left-matrix values at the data indexes. + private final double[] s; + + @Parameters + public static Collection<Object[]> data() { + ArrayList<Object[]> tests = new ArrayList<>(); + // It is assumed that the input is in sorted order, all values are positive and there are no duplicates. + // Note that each test allocates a matrix of two rows whose column count is derived from the last value, + // so do not make it too large.
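+ // The multiples of 254, 255, and 256 below probe the encoding boundaries:
+ // presumably OffsetByte stores each gap as one unsigned byte and needs filler
+ // entries for gaps above its maximum, while OffsetChar hits the same limit at
+ // Character.MAX_VALUE (an assumption read off these test values, not a
+ // statement of the exact SystemDS encoding).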
+ for(OFF_TYPE t : OFF_TYPE.values()) { + tests.add(new Object[] {new int[] {1, 2}, t}); + tests.add(new Object[] {new int[] {2, 142}, t}); + tests.add(new Object[] {new int[] {142, 421}, t}); + tests.add(new Object[] {new int[] {1, 1023}, t}); + tests.add(new Object[] {new int[] {1023, 1024}, t}); + tests.add(new Object[] {new int[] {1023}, t}); + tests.add(new Object[] {new int[] {0, 1, 2, 3, 4, 5}, t}); + tests.add(new Object[] {new int[] {0}, t}); + tests.add(new Object[] {new int[] {0, 256}, t}); + tests.add(new Object[] {new int[] {0, 254}, t}); + tests.add(new Object[] {new int[] {0, 256 * 2}, t}); + tests.add(new Object[] {new int[] {0, 255 * 2}, t}); + tests.add(new Object[] {new int[] {0, 254 * 2}, t}); + tests.add(new Object[] {new int[] {0, 510, 765}, t}); + tests.add(new Object[] {new int[] {0, 254 * 3}, t}); + tests.add(new Object[] {new int[] {0, 255, 255 * 2, 255 * 3}, t}); + tests.add(new Object[] {new int[] {0, 255 * 2, 255 * 3}, t}); + tests.add(new Object[] {new int[] {0, 255 * 2, 255 * 3, 255 * 10}, t}); + tests.add(new Object[] {new int[] {0, 255 * 3}, t}); + tests.add(new Object[] {new int[] {0, 255 * 4}, t}); + tests.add(new Object[] {new int[] {0, 256 * 3}, t}); + tests.add(new Object[] {new int[] {255 * 3, 255 * 5}, t}); + tests.add(new Object[] {new int[] {0, 1, 2, 3, 255 * 4, 1500}, t}); + tests.add(new Object[] {new int[] {0, 1, 2, 3, 4, 5}, t}); + tests.add(new Object[] {new int[] {0, 1, 2, 3, 4, 5, 125, 142, 161, 1661, 2314}, t}); + tests.add(new Object[] {new int[] {51, 4251, Character.MAX_VALUE}, t}); + } + return tests; + } + + public OffsetTestPreAggregate(int[] data, OFF_TYPE type) { + this.data = data; + switch(type) { + case BYTE: + this.a = new OffsetByte(data); + break; + case CHAR: + this.a = new OffsetChar(data); + break; + default: + throw new NotImplementedException("not implemented"); + } + this.leftM = TestUtils.generateTestMatrixBlock(2, data[data.length - 1] + 100, -1, 100, 1.0, 1342); + this.s = sumIndexes(); + } + + @Test + public void testToString() { + String obs = getString(a); + String vs = Arrays.toString(data); + if(!obs.equals(vs)) + fail("The strings are not equivalent "); + } + + @Test + public void preAggByteMapFirstRowByte() { + preAggMapRowByte(0); + } + + @Test + public void preAggByteMapSecondRowByte() { + preAggMapRowByte(1); + } + + @Test + public void preAggByteMapFirstRowChar() { + preAggMapRowChar(0); + } + + @Test + public void preAggByteMapSecondRowChar() { + preAggMapRowChar(1); + } + + private void preAggMapRowChar(int row) { + double[] preAV = new double[1]; + char[] m = new char[data.length]; + a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, leftM.getNumColumns(), 0, m); + verifyPreAggMapRowByte(preAV, row); + } + + private void preAggMapRowByte(int row) { + double[] preAV = new double[1]; + byte[] m = new byte[data.length]; + a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, leftM.getNumColumns(), 0, m); + + verifyPreAggMapRowByte(preAV, row); + } + + private void verifyPreAggMapRowByte(double[] preAV, int row) { + + if(preAV[0] != s[row]) + fail("The preaggregate result is not the sum! 
: " + preAV[0] + " vs " + s[row]); + } + + @Test + public void preAggByteMapFirstRowByteAll1() { + preAggMapRowByteAll1(0); + } + + @Test + public void preAggByteMapSecondRowByteAll1() { + preAggMapRowByteAll1(1); + } + + @Test + public void preAggByteMapFirstRowCharAll1() { + preAggMapRowCharAll1(0); + } + + @Test + public void preAggByteMapSecondRowCharAll1() { + preAggMapRowCharAll1(1); + } + + private void preAggMapRowCharAll1(int row) { + double[] preAV = new double[2]; + char[] m = new char[data.length]; + fill(m, (char) 1); + a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, leftM.getNumColumns(), 0, m); + verifyPreAggMapRowAllBytes1(preAV, row); + } + + private void preAggMapRowByteAll1(int row) { + double[] preAV = new double[2]; + byte[] m = new byte[data.length]; + fill(m, (byte) 1); + a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, leftM.getNumColumns(), 0, m); + verifyPreAggMapRowAllBytes1(preAV, row); + } + + private void verifyPreAggMapRowAllBytes1(double[] preAV, int row) { + if(preAV[0] != 0) + fail("aggregate to wrong index"); + if(preAV[1] != s[row]) + fail("The preaggregate result is not the sum! : " + preAV[0] + " vs " + s[row]); + } + + @Test + public void preAggByteMapFirstRowByteOne1() { + preAggMapRowByteOne1(0); + } + + @Test + public void preAggByteMapSecondRowByteOne1() { + preAggMapRowByteOne1(1); + } + + @Test + public void preAggByteMapFirstRowCharOne1() { + preAggMapRowCharOne1(0); + } + + @Test + public void preAggByteMapSecondRowCharOne1() { + preAggMapRowCharOne1(1); + } + + private void preAggMapRowCharOne1(int row) { + if(data.length > 1) { + double[] preAV = new double[2]; + char[] m = new char[data.length]; + m[1] = 1; + a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, leftM.getNumColumns(), 0, m); + verifyPreAggMapRowOne1(preAV, row); + } + } + + private void preAggMapRowByteOne1(int row) { + if(data.length > 1) { + double[] preAV = new double[2]; + byte[] m = new byte[data.length]; + m[1] = 1; + a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, leftM.getNumColumns(), 0, m); + verifyPreAggMapRowOne1(preAV, row); + } + } + + private void verifyPreAggMapRowOne1(double[] preAV, int row) { + double v = leftM.getValue(row, data[1]); + if(preAV[1] != v) + fail("aggregate to wrong index"); + if(!Precision.equals(preAV[0], s[row] - v, eps)) + fail("The preaggregate result is not the sum! 
: " + preAV[0] + " vs " + (s[row] - v)); + } + + @Test + public void preAggByteMapFirstSubOfRowByte() { + preAggMapSubOfRowByte(0); + } + + @Test + public void preAggByteMapSecondSubOfRowByte() { + preAggMapSubOfRowByte(1); + } + + @Test + public void preAggByteMapFirstSubOfRowChar() { + preAggMapSubOfRowChar(0); + } + + @Test + public void preAggByteMapSecondSubOfRowChar() { + preAggMapSubOfRowChar(1); + } + + private void preAggMapSubOfRowChar(int row) { + if(data.length > 2) { + double[] preAV = new double[2]; + char[] m = new char[data.length]; + m[1] = 1; + a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, data[data.length - 1], 0, m); + verifyPreAggMapSubOfRow(preAV, row); + } + } + + private void preAggMapSubOfRowByte(int row) { + if(data.length > 2) { + double[] preAV = new double[2]; + byte[] m = new byte[data.length]; + m[1] = 1; + a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, data[data.length - 1], 0, m); + verifyPreAggMapSubOfRow(preAV, row); + } + } + + private void verifyPreAggMapSubOfRow(double[] preAV, int row) { + double v = leftM.getValue(row, data[1]); + double v2 = leftM.getValue(row, data[data.length - 1]); + if(preAV[1] != v) + fail("aggregate to wrong index"); + if(!Precision.equals(preAV[0], s[row] - v - v2, eps)) + fail("The preaggregate result is not the sum! : " + preAV[0] + " vs " + (s[row] - v - v2)); + } + + @Test + public void preAggByteMapFirstSubOfRowV2Byte() { + preAggMapSubOfRowV2Byte(0, 2); + } + + @Test + public void preAggByteMapSecondSubOfRowV2Byte() { + preAggMapSubOfRowV2Byte(1, 2); + } + + @Test + public void preAggByteMapFirstSubOfRowV2Char() { + preAggMapSubOfRowV2Char(0, 2); + } + + @Test + public void preAggByteMapSecondSubOfRowV2Char() { + preAggMapSubOfRowV2Char(1, 2); + } + + @Test + public void preAggByteMapFirstSubOfRowV2ByteV2() { + preAggMapSubOfRowV2Byte(0, 244); + } + + @Test + public void preAggByteMapSecondSubOfRowV2ByteV2() { + preAggMapSubOfRowV2Byte(1, 244); + } + + @Test + public void preAggByteMapFirstSubOfRowV2CharV2() { + preAggMapSubOfRowV2Char(0, 244); + } + + @Test + public void preAggByteMapSecondSubOfRowV2CharV2() { + preAggMapSubOfRowV2Char(1, 244); + } + + private void preAggMapSubOfRowV2Char(int row, int nVal) { + if(data.length > 3) { + double[] preAV = new double[2]; + char[] m = new char[data.length]; + m[1] = 1; + a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, data[data.length - 2], nVal, m); + verifyPreAggMapSubOfRowV2Byte(preAV, row); + } + } + + private void preAggMapSubOfRowV2Byte(int row, int nVal) { + if(data.length > 3) { + double[] preAV = new double[2]; + byte[] m = new byte[data.length]; + m[1] = 1; + a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, data[data.length - 2], nVal, m); + verifyPreAggMapSubOfRowV2Byte(preAV, row); + } + } + + private void verifyPreAggMapSubOfRowV2Byte(double[] preAV, int row) { + double v = leftM.getValue(row, data[1]); + double v2 = leftM.getValue(row, data[data.length - 1]) + leftM.getValue(row, data[data.length - 2]); + if(preAV[1] != v) + fail("aggregate to wrong index"); + if(!Precision.equals(preAV[0], s[row] - v - v2, eps)) + fail("The preaggregate result is not the sum! 
: " + preAV[0] + " vs " + (s[row] - v - v2)); + } + + @Test + public void preAggByteMapFirstOutOfRangeBeforeByte() { + preAggMapOutOfRangeBeforeByte(0); + } + + @Test + public void preAggByteMapSecondOutOfRangeBeforeByte() { + preAggMapOutOfRangeBeforeByte(1); + } + + @Test + public void preAggByteMapFirstOutOfRangeBeforeChar() { + preAggMapOutOfRangeBeforeChar(0); + } + + @Test + public void preAggByteMapSecondOutOfRangeBeforeChar() { + preAggMapOutOfRangeBeforeChar(1); + } + + private void preAggMapOutOfRangeBeforeChar(int row) { + double[] preAV = null; // should never be accessed, therefore we pass null here. + char[] m = new char[data.length]; + a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, -412, data[0] - 1, 0, m); + } + + private void preAggMapOutOfRangeBeforeByte(int row) { + double[] preAV = null; // should never be accessed, therefore we pass null here. + byte[] m = new byte[data.length]; + a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, -412, data[0] - 1, 0, m); + } + + @Test + public void preAggByteMapFirstOutOfRangeAfterByte() { + preAggMapOutOfRangeAfterByte(0); + } + + @Test + public void preAggByteMapSecondOutOfRangeAfterByte() { + preAggMapOutOfRangeAfterByte(1); + } + + @Test + public void preAggByteMapFirstOutOfRangeAfterChar() { + preAggMapOutOfRangeAfterChar(0); + } + + @Test + public void preAggByteMapSecondOutOfRangeAfterChar() { + preAggMapOutOfRangeAfterChar(1); + } + + private void preAggMapOutOfRangeAfterChar(int row) { + double[] preAV = null; // should never be accessed, therefore we pass null here. + char[] m = new char[data.length]; + int id = data[data.length - 1] + 10; + a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, id, id + 10, 0, m); + } + + private void preAggMapOutOfRangeAfterByte(int row) { + double[] preAV = null; // should never be accessed, therefore we pass null here.
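+ // The queried range lies strictly after the last offset, so the aggregation
+ // should return without ever writing to preAV; passing null therefore doubles
+ // as a check that nothing is accumulated.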
+ byte[] m = new byte[data.length]; + int id = data[data.length - 1] + 10; + a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, id, id + 10, 0, m); + } + + private final double[] sumIndexes() { + double[] lmv = leftM.getDenseBlockValues(); + double[] ret = new double[leftM.getNumRows()]; + for(int j = 0; j < leftM.getNumRows(); j++) { + final int off = j * leftM.getNumColumns(); + for(int i = 0; i < data.length; i++) + ret[j] += lmv[data[i] + off]; + } + return ret; + } + + private final void fill(byte[] a, byte v) { + for(int i = 0; i < a.length; i++) + a[i] = v; + } + + private final void fill(char[] a, char v) { + for(int i = 0; i < a.length; i++) + a[i] = v; + } + + private String getString(AOffset a) { + String os = a.toString(); + return os.substring(os.indexOf("["), os.length()); + } + +} diff --git a/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTests.java b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTests.java index 0fca9742217..a7c03284143 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTests.java +++ b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTests.java @@ -20,6 +20,8 @@ package org.apache.sysds.test.component.compress.offset; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -33,6 +35,8 @@ import java.util.Collection; import org.apache.commons.lang.NotImplementedException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.colgroup.offset.AIterator; import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; @@ -47,8 +51,9 @@ @RunWith(value = Parameterized.class) public class OffsetTests { + protected static final Log LOG = LogFactory.getLog(OffsetTests.class.getName()); - private static final long sizeTolerance = 265; + private static final long sizeTolerance = 100; public int[] data; public OFF_TYPE type; @@ -72,17 +77,25 @@ public static Collection data() { tests.add(new Object[] {new int[] {0, 256}, t}); tests.add(new Object[] {new int[] {0, 254}, t}); tests.add(new Object[] {new int[] {0, Character.MAX_VALUE}, t}); + tests.add(new Object[] {new int[] {0, Character.MAX_VALUE, ((int) Character.MAX_VALUE) * 2}, t}); + tests.add(new Object[] {new int[] {2, Character.MAX_VALUE + 2}, t}); tests.add(new Object[] {new int[] {0, ((int) Character.MAX_VALUE) + 1}, t}); tests.add(new Object[] {new int[] {0, ((int) Character.MAX_VALUE) - 1}, t}); tests.add(new Object[] {new int[] {0, 256 * 2}, t}); tests.add(new Object[] {new int[] {0, 255 * 2}, t}); tests.add(new Object[] {new int[] {0, 254 * 2}, t}); + tests.add(new Object[] {new int[] {0, 510, 765}, t}); tests.add(new Object[] {new int[] {0, 254 * 3}, t}); + tests.add(new Object[] {new int[] {0, 255, 255 * 2, 255 * 3}, t}); + tests.add(new Object[] {new int[] {0, 255 * 2, 255 * 3}, t}); + tests.add(new Object[] {new int[] {0, 255 * 2, 255 * 3, 255 * 10}, t}); tests.add(new Object[] {new int[] {0, 255 * 3}, t}); + tests.add(new Object[] {new int[] {0, 255 * 4}, t}); tests.add(new Object[] {new int[] {0, 256 * 3}, t}); tests.add(new Object[] {new int[] {255 * 3, 255 * 5}, t}); tests.add(new Object[] {new int[] {1000000, 1000000 + 255 * 5}, t}); tests.add(new Object[] {new int[] {100000000, 100000000 + 255 * 
5}, t}); + tests.add(new Object[] {new int[] {100000000, 100001275, 100001530}, t}); tests.add(new Object[] {new int[] {0, 1, 2, 3, 255 * 4, 1500}, t}); tests.add(new Object[] {new int[] {0, 1, 2, 3, 4, 5}, t}); tests.add(new Object[] {new int[] {2458248, 2458249, 2458253, 2458254, 2458256, 2458257, 2458258, 2458262, @@ -109,8 +122,7 @@ public OffsetTests(int[] data, OFF_TYPE type) { @Test public void testConstruction() { try { - AIterator i = o.getIterator(); - compare(i, data); + compare(o, data); } catch(Exception e) { e.printStackTrace(); @@ -118,6 +130,30 @@ public void testConstruction() { } } + @Test + public void testCacheExists() { + if(data.length > 2) { + AIterator i = o.getIterator(); + i.next(); + o.cacheIterator(i, data[1]); + AIterator ii = o.getIterator(data[1]); + assertTrue(ii.equals(i)); + ii.next(); + assertFalse(ii.equals(i)); + } + } + + @Test + public void testCacheDontExists() { + if(data.length > 2) { + AIterator i = o.getIterator(); + i.next(); + o.cacheIterator(i, data[1]); + AIterator ii = o.getIterator(data[2]); + assertFalse(ii.equals(i)); + } + } + @Test public void testSerialization() { try { @@ -131,9 +167,7 @@ public void testSerialization() { DataInputStream fis = new DataInputStream(bis); AOffset n = OffsetFactory.readIn(fis); - - AIterator i = n.getIterator(); - compare(i, data); + compare(n, data); } catch(IOException e) { throw new RuntimeException("Error in io", e); @@ -170,23 +204,25 @@ public void testOnDiskSizeInBytes() { } @Test - public void testInMemoryEstimateIsSameAsActualOrSmaller() { + public void testInMemoryEstimateIsSameAsActualOrLarger() { try { - long inMemorySize = o.getInMemorySize(); + final long inMemorySize = o.getInMemorySize(); long estimatedSize; switch(type) { case BYTE: - estimatedSize = OffsetByte.getInMemorySize(data.length); + estimatedSize = OffsetByte.estimateInMemorySize(data.length, data[data.length - 1] - data[0]); break; case CHAR: - estimatedSize = OffsetChar.getInMemorySize(data.length); + estimatedSize = OffsetChar.estimateInMemorySize(data.length, data[data.length - 1] - data[0]); break; default: throw new DMLCompressionException("Unknown input"); } - final String errorMessage = "in memory size: " + inMemorySize + " is not smaller than estimate: " - + estimatedSize + " with tolerance " + sizeTolerance; - assertTrue(errorMessage, inMemorySize - sizeTolerance <= estimatedSize); + if(!(inMemorySize <= estimatedSize + sizeTolerance)) { + + fail("in memory size: " + inMemorySize + " is not smaller than estimate: " + estimatedSize + + " with tolerance " + sizeTolerance); + } } catch(Exception e) { e.printStackTrace(); @@ -194,15 +230,174 @@ public void testInMemoryEstimateIsSameAsActualOrSmaller() { } } - private void compare(AIterator i, int[] v) { - for(int j = 0; j < v.length; j++) { + @Test + public void testSkipToContainedIndex() { + try { + assertEquals(data[data.length - 1], o.getIterator().skipTo(data[data.length - 1])); + } + catch(Exception e) { + e.printStackTrace(); + fail("Failed skipping to last index"); + } + } + @Test + public void testSkipToContainedIndexPlusOne() { + try { + assertNotEquals(data[data.length - 1] + 1, o.getIterator().skipTo(data[data.length - 1])); + } + catch(Exception e) { + e.printStackTrace(); + fail("Failed skipping to last index"); + } + } + + @Test + public void testSkipToContainedIndexPlusN() { + try { + if(data.length > 1) + assertTrue(data[1] <= o.getIterator().skipTo(data[1] + 1)); + } + catch(Exception e) { + e.printStackTrace(); + fail("Failed skipping to last index"); + } 
+ } + + @Test + public void testSkipToContainedIndexMinusOne() { + try { + int v = data[data.length - 1]; + int maxDiff = 1; + assertTrue(v <= o.getIterator().skipTo(v - 1) + maxDiff); + } + catch(Exception e) { + e.printStackTrace(); + fail("Failed skipping to last index"); + } + } + + @Test + public void testSkipToContainedIndexMinusN() { + try { + int v = data[data.length - 1]; + int maxDiff = 142; + assertTrue(v <= o.getIterator().skipTo(v - 1) + maxDiff); + } + catch(Exception e) { + e.printStackTrace(); + fail("Failed skipping to last index"); + } + } + + @Test + public void testToString() { + String os = o.toString(); + os = os.substring(os.indexOf("["), os.length()); + String vs = Arrays.toString(data); + if(!os.equals(vs)) { + fail("The two array string are not equivalent with " + type + "\n" + os + " : " + vs); + } + } + + @Test + public void testIsNotOverFirstDataPoint() { + assertFalse(o.getIterator().isNotOver(data[0])); + } + + @Test + public void testIsNotOverSecondDataPointOnInit() { + if(data.length > 1) + assertTrue(o.getIterator().isNotOver(data[1])); + } + + @Test + public void testIsNotOverSecondDataPointOnInitToSecond() { + if(data.length > 1) + assertFalse(o.getIterator(data[1]).isNotOver(data[1])); + } + + @Test + public void testIsOverFirstDataPointOnInitToSecond() { + if(data.length > 1) + assertFalse(o.getIterator(data[1]).isNotOver(data[0])); + } + + @Test + public void testGetDataIndexOnInit() { + assertTrue(o.getIterator().getDataIndex() == 0); + } + + @Test + public void testGetDataIndexOnInitSkipToFirst() { + if(data.length > 1) + assertTrue(o.getIterator(data[1]).getDataIndex() == 1); + } + + @Test + public void testGetDataIndexOnInitSkipToN() { + if(data.length > 3) + assertTrue(o.getIterator(data[2]).getDataIndex() == 2); + } + + @Test + public void testGetDataAfterNext() { + if(data.length > 1) + testGetDataAfterNextN(o.getIterator()); + } + + @Test + public void testGetDataAfterNext2() { + if(data.length > 2) + testGetDataAfterNextN(o.getIterator(2)); + } + + public void testGetDataAfterNextN(AIterator it) { + int d = it.getDataIndex(); + it.next(); + assertEquals(d + 1, it.getDataIndex()); + } + + @Test + public void testGetDataAfterNextComb() { + if(data.length > 1) + testGetDataAfterNextCombN(o.getIterator()); + } + + @Test + public void testGetDataAfterNextComb2() { + if(data.length > 2) + testGetDataAfterNextCombN(o.getIterator(2)); + } + + public void testGetDataAfterNextCombN(AIterator it) { + int d = it.getDataIndexAndIncrement(); + assertEquals(d + 1, it.getDataIndex()); + } + + @Test + public void testGetUnreasonablyHighSkip() { + assertTrue(o.getIterator(Integer.MAX_VALUE - 1000) == null); + } + + @Test + public void testCacheNullIterator() { + o.cacheIterator(null, 21415); + } + + protected static void compare(AOffset o, int[] v) { + AIterator i = o.getIterator(); + if(v[0] != i.value()) + fail("incorrect result using : " + o.getClass().getSimpleName() + " expected: " + Arrays.toString(v) + + " but was :" + o.toString()); + for(int j = 1; j < v.length; j++) { + i.next(); if(v[j] != i.value()) fail("incorrect result using : " + o.getClass().getSimpleName() + " expected: " + Arrays.toString(v) + " but was :" + o.toString()); - if(i.hasNext()) - i.next(); } + if(i.getOffsetsIndex() != o.getOffsetsLength()) + fail("The allocated offsets are longer than needed: idx " + i.getOffsetsIndex() + " vs len " + + o.getOffsetsLength() + "\n" + Arrays.toString(v)); } - } diff --git 
a/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestsDefaultConstructor.java b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestsDefaultConstructor.java new file mode 100644 index 00000000000..ea9017df549 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestsDefaultConstructor.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.component.compress.offset; + +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.Collection; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; +import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +@RunWith(value = Parameterized.class) +public class OffsetTestsDefaultConstructor { + protected static final Log LOG = LogFactory.getLog(OffsetTestsDefaultConstructor.class.getName()); + + private static final long sizeTolerance = 100; + + public int[] data; + private AOffset o; + + @Parameters + public static Collection data() { + ArrayList tests = new ArrayList<>(); + // It is assumed that the input is in sorted order, all values are positive and there are no duplicates. 
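+ // OffsetFactory.createOffset is expected to pick the tightest concrete
+ // encoding (byte or char deltas) from the index gaps, so the cases below
+ // mirror the explicit OffsetByte/OffsetChar boundary patterns from OffsetTests.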
+ + tests.add(new Object[] {new int[] {1, 2}}); + tests.add(new Object[] {new int[] {2, 142}}); + tests.add(new Object[] {new int[] {142, 421}}); + tests.add(new Object[] {new int[] {1, 1023}}); + tests.add(new Object[] {new int[] {1023, 1024}}); + tests.add(new Object[] {new int[] {1023}}); + tests.add(new Object[] {new int[] {0, 1, 2, 3, 4, 5}}); + tests.add(new Object[] {new int[] {0}}); + tests.add(new Object[] {new int[] {Character.MAX_VALUE, ((int) Character.MAX_VALUE) + 1}}); + tests.add(new Object[] {new int[] {Character.MAX_VALUE, ((int) Character.MAX_VALUE) * 2}}); + tests.add(new Object[] {new int[] {0, 256}}); + tests.add(new Object[] {new int[] {0, 254}}); + tests.add(new Object[] {new int[] {0, Character.MAX_VALUE}}); + tests.add(new Object[] {new int[] {0, Character.MAX_VALUE, ((int) Character.MAX_VALUE) * 2}}); + tests.add(new Object[] {new int[] {2, Character.MAX_VALUE + 2}}); + tests.add(new Object[] {new int[] {0, ((int) Character.MAX_VALUE) + 1}}); + tests.add(new Object[] {new int[] {0, ((int) Character.MAX_VALUE) - 1}}); + tests.add(new Object[] {new int[] {0, 256 * 2}}); + tests.add(new Object[] {new int[] {0, 255 * 2}}); + tests.add(new Object[] {new int[] {0, 254 * 2}}); + tests.add(new Object[] {new int[] {0, 510, 765}}); + tests.add(new Object[] {new int[] {0, 120, 230}}); + tests.add(new Object[] {new int[] {1000, 1120, 1230}}); + tests.add(new Object[] {new int[] {0, 254 * 3}}); + tests.add(new Object[] {new int[] {0, 255, 255 * 2, 255 * 3}}); + tests.add(new Object[] {new int[] {0, 255 * 2, 255 * 3}}); + tests.add(new Object[] {new int[] {0, 255 * 2, 255 * 3, 255 * 10}}); + tests.add(new Object[] {new int[] {0, 255 * 3}}); + tests.add(new Object[] {new int[] {0, 255 * 4}}); + tests.add(new Object[] {new int[] {0, 256 * 3}}); + tests.add(new Object[] {new int[] {255 * 3, 255 * 5}}); + tests.add(new Object[] {new int[] {1000000, 1000000 + 255 * 5}}); + tests.add(new Object[] {new int[] {100000000, 100000000 + 255 * 5}}); + tests.add(new Object[] {new int[] {100000000, 100001275, 100001530}}); + tests.add(new Object[] {new int[] {0, 1, 2, 3, 255 * 4, 1500}}); + tests.add(new Object[] {new int[] {0, 1, 2, 3, 4, 5}}); + tests.add(new Object[] {new int[] {2458248, 2458249, 2458253, 2458254, 2458256, 2458257, 2458258, 2458262, + 2458264, 2458266, 2458267, 2458271, 2458272, 2458275, 2458276, 2458281}}); + + return tests; + } + + public OffsetTestsDefaultConstructor(int[] data) { + this.data = data; + this.o = OffsetFactory.createOffset(data); + } + + @Test + public void testConstruction() { + try { + OffsetTests.compare(o, data); + } + catch(Exception e) { + e.printStackTrace(); + throw e; + } + } + + @Test + public void testMemoryEstimate(){ + final long est = OffsetFactory.estimateInMemorySize(data.length, data[data.length -1]); + final long act = o.getInMemorySize(); + + if(!( act <= est + sizeTolerance)) + fail("In memory is not smaller than estimate " + est + " " + act); + } +}
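For orientation, a minimal end-to-end sketch of the offset API these new tests exercise, built only from calls that appear in the tests above (OffsetFactory.createOffset, AOffset.getIterator, AIterator.value/next, and the factory's in-memory size estimate); the printed values are illustrative, not asserted by any test:

import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;

public class OffsetUsageSketch {
	public static void main(String[] args) {
		// Sorted, non-negative, duplicate-free row indexes, per the tests' assumption.
		final int[] indexes = new int[] {1, 142, 421};
		// The factory selects the concrete delta encoding (byte or char offsets).
		final AOffset off = OffsetFactory.createOffset(indexes);
		// The iterator starts on the first index; value() reads it, next() advances.
		final AIterator it = off.getIterator();
		for(int i = 0;; i++) {
			System.out.println(it.value()); // 1, then 142, then 421
			if(i == indexes.length - 1)
				break;
			it.next();
		}
		// The estimate should upper-bound the actual size (within tolerance).
		final long est = OffsetFactory.estimateInMemorySize(indexes.length, indexes[indexes.length - 1]);
		System.out.println(off.getInMemorySize() + " <= ~" + est);
	}
}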