diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java index c8bdd0a45cb..548e5e931b4 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java @@ -55,6 +55,7 @@ import org.apache.sysds.runtime.compress.lib.CLALibReExpand; import org.apache.sysds.runtime.compress.lib.CLALibRightMultBy; import org.apache.sysds.runtime.compress.lib.CLALibScalar; +import org.apache.sysds.runtime.compress.lib.CLALibSlice; import org.apache.sysds.runtime.compress.lib.CLALibSquash; import org.apache.sysds.runtime.compress.lib.CLALibUnary; import org.apache.sysds.runtime.controlprogram.caching.CacheBlock; @@ -691,61 +692,14 @@ public void setOverlapping(boolean overlapping) { @Override public MatrixBlock slice(int rl, int ru, int cl, int cu, boolean deep, CacheBlock ret) { validateSliceArgument(rl, ru, cl, cu); - MatrixBlock tmp; - if(rl == ru && cl == cu) { - // get a single index, and return in a matrixBlock - tmp = new MatrixBlock(1, 1, 0); - tmp.appendValue(0, 0, getValue(rl, cl)); - return tmp; - } - else if(rl == 0 && ru == getNumRows() - 1) { - tmp = sliceColumns(cl, cu); - tmp.recomputeNonZeros(); - return tmp; - } - else if(cl == 0 && cu == getNumColumns() - 1) { - // Row Slice. Potential optimization if the slice contains enough rows. - // +1 since the implementation arguments for slice is inclusive values for ru - // and cu. It is not inclusive in decompression, and construction of MatrixBlock. - tmp = new MatrixBlock(ru + 1 - rl, getNumColumns(), false).allocateDenseBlock(); - for(AColGroup g : getColGroups()) - g.decompressToBlock(tmp, rl, ru + 1, -rl, 0); - tmp.recomputeNonZeros(); - tmp.examSparsity(); - return tmp; - } - else { - // In the case where an internal matrix is sliced out, then first slice out the - // columns to an compressed intermediate. - tmp = sliceColumns(cl, cu); - // Then call slice recursively, to do the row slice. - // Since we do not copy the index structure but simply maintain a pointer to the - // original this is fine. - tmp = tmp.slice(rl, ru, 0, tmp.getNumColumns() - 1, ret); - return tmp; - } - } - - private CompressedMatrixBlock sliceColumns(int cl, int cu) { - CompressedMatrixBlock ret = new CompressedMatrixBlock(this.getNumRows(), cu + 1 - cl); - List newColGroups = new ArrayList<>(); - for(AColGroup grp : getColGroups()) { - AColGroup slice = grp.sliceColumns(cl, cu + 1); - if(slice != null) - newColGroups.add(slice); - } - ret.allocateColGroupList(newColGroups); - ret.recomputeNonZeros(); - ret.overlappingColGroups = this.isOverlapping(); - return ret; + return CLALibSlice.slice(this, rl, ru, cl, cu, deep); } @Override public void slice(ArrayList outlist, IndexRange range, int rowCut, int colCut, int blen, int boundaryRlen, int boundaryClen) { - printDecompressWarning( + MatrixBlock tmp = getUncompressed( "slice for distribution to spark. 
(Could be implemented such that it does not decompress)");
-		MatrixBlock tmp = getUncompressed();
 		tmp.slice(outlist, range, rowCut, colCut, blen, boundaryRlen, boundaryClen);
 	}

diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
index 42ea6a711e9..97f6f0975d1 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
@@ -250,10 +250,6 @@ else if(mb.isEmpty()) {
 		if(res == null)
 			return abortCompression();

-		if(compSettings.isInSparkInstruction) {
-			// clear soft reference to uncompressed block in case of spark.
-			res.clearSoftReferenceToDecompressed();
-		}
 		return new ImmutablePair<>(res, _stats);
 	}

diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
index 27a29cb945d..d46611aa96a 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
@@ -48,7 +48,7 @@ public abstract class AColGroup implements Serializable {
 	/** Public super types of compression ColGroups supported */
 	public enum CompressionType {
-		UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC
+		UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, PFOR,
 	}

 	/**
@@ -57,7 +57,7 @@ public enum CompressionType {
 	 * Protected such that outside the ColGroup package it should be unknown which specific subtype is used.
 	 */
 	protected enum ColGroupType {
-		UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCSingle, SDCSingleZeros, SDCZeros;
+		UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCSingle, SDCSingleZeros, SDCZeros, PFOR;
 	}

 	/** The ColGroup Indexes contained in the ColGroup */
@@ -132,14 +132,27 @@ public long estimateInMemorySize() {
 	}

 	/**
-	 * Decompress the contents of the column group into the target matrix,.
+	 * Decompress a range of rows into a sparse block.
 	 *
-	 * @param target A matrix block where the columns covered by this column group have not yet been filled in.
-	 * @param rl     Row to start decompression from
-	 * @param ru     Row to end decompression at (not inclusive)
+	 * Note that this uses append, so the sparse column indexes need to be sorted afterwards.
+	 *
+	 * @param sb Sparse target block
+	 * @param rl Row to start at
+	 * @param ru Row to end at
+	 */
+	public final void decompressToSparseBlock(SparseBlock sb, int rl, int ru) {
+		decompressToSparseBlock(sb, rl, ru, 0, 0);
+	}
+
+	/**
+	 * Decompress a range of rows into a dense block.
+	 *
+	 * @param db Dense target block
+	 * @param rl Row to start at
+	 * @param ru Row to end at
 	 */
-	public final void decompressToBlock(MatrixBlock target, int rl, int ru) {
-		decompressToBlock(target, rl, ru, 0, 0);
+	public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) {
+		decompressToDenseBlock(db, rl, ru, 0, 0);
 	}

 	/**
@@ -326,33 +339,29 @@ public double get(int r, int c) {
 	protected abstract ColGroupType getColGroupType();

 	/**
-	 * Decompress the contents of the column group without counting non zeros
+	 * Decompress into the DenseBlock. (no NNZ handling)
 	 *
-	 * The offsets helps us decompress into specific target areas of the output matrix.
-	 *
-	 * If OffR and OffC is 0, then decompression output starts at row offset equal to rl,
+	 * @param db   Target DenseBlock
+	 * @param rl   Row to start decompression from
+	 * @param ru   Row to end decompression at
+	 * @param offR Row offset into the target to decompress
+	 * @param offC Column offset into the target to decompress
+	 */
+	public abstract void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC);
+
+	/**
+	 * Decompress into the SparseBlock. (no NNZ handling)
 	 *
-	 * If for instance a MiniBatch of rows 10 to 15, then target would be 5 rows high and arguments would look like:
-	 *
-	 * cg.decompressToBlock(target, 10, 15, -10, 0)
+	 * Note this method allows calls to append, since it is assumed that the sparse column indexes are sorted
+	 * afterwards.
 	 *
-	 * @param target a matrix block where the columns covered by this column group have not yet been filled in.
-	 * @param rl     Row to start decompression at.
-	 * @param ru     Row to end decompression at (not inclusive).
-	 * @param offR   RowOffset into target to assign from.
-	 * @param offC   ColumnOffset into the target matrix to assign from.
+	 * @param sb   Target SparseBlock
+	 * @param rl   Row to start decompression from
+	 * @param ru   Row to end decompression at
+	 * @param offR Row offset into the target to decompress
+	 * @param offC Column offset into the target to decompress
 	 */
-	public final void decompressToBlock(MatrixBlock target, int rl, int ru, int offR, int offC){
-		if(target.isInSparseFormat())
-			decompressToSparseBlock(target.getSparseBlock(), rl, ru, offR, offC);
-		else
-			decompressToDenseBlock(target.getDenseBlock(), rl, ru, offR, offC);
-	}
-
-
-	protected abstract void decompressToDenseBlock(DenseBlock db, int rl, int ru,int offR, int offC);
-
-	protected abstract void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC);
+	public abstract void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC);

 	/**
 	 * Right matrix multiplication with this column group.
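To make the new offR/offC semantics concrete, here is a sketch of a caller that decompresses a row range into a smaller target block, mirroring the row-slice path removed from CompressedMatrixBlock.slice above. The helper name decompressRange is hypothetical; the calls themselves follow the signatures introduced in this diff.

```java
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
import org.apache.sysds.runtime.compress.colgroup.AColGroup;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;

public class DecompressRangeExample {
	/** Decompress rows [rl, ru) of cmb into a fresh (ru - rl) x nCol dense block. */
	public static MatrixBlock decompressRange(CompressedMatrixBlock cmb, int rl, int ru) {
		MatrixBlock target = new MatrixBlock(ru - rl, cmb.getNumColumns(), false).allocateDenseBlock();
		// offR = -rl maps compressed row rl to target row 0, e.g. rows 10..15 decompress
		// with offR = -10 into a 5-row target; offC = 0 keeps the original column positions.
		for(AColGroup g : cmb.getColGroups())
			g.decompressToDenseBlock(target.getDenseBlock(), rl, ru, -rl, 0);
		target.recomputeNonZeros();
		return target;
	}
}
```

Splitting the old decompressToBlock into the dense and sparse variants moves the format branch out of every column group and, for the sparse path, makes the append-then-sort contract explicit at the interface.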
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupCompressed.java index 106a2df0677..90cd5c94e9a 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupCompressed.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupCompressed.java @@ -55,11 +55,15 @@ protected AColGroupCompressed(int[] colIndices) { protected abstract void computeColMxx(double[] c, Builtin builtin); - protected abstract void computeSum(double[] c, int nRows, boolean square); + protected abstract void computeSum(double[] c, int nRows); - protected abstract void computeRowSums(double[] c, boolean square, int rl, int ru); + protected abstract void computeRowSums(double[] c, int rl, int ru); - protected abstract void computeColSums(double[] c, int nRows, boolean square); + protected abstract void computeSumSq(double[] c, int nRows); + + protected abstract void computeRowSumsSq(double[] c, int rl, int ru); + + protected abstract void computeColSumsSq(double[] c, int nRows); protected abstract void computeRowMxx(double[] c, Builtin builtin, int rl, int ru); @@ -79,22 +83,27 @@ public double getMax() { return computeMxx(Double.NEGATIVE_INFINITY, Builtin.getBuiltinFnObject(BuiltinCode.MAX)); } - @Override - public void computeColSums(double[] c, int nRows) { - computeColSums(c, nRows, false); - } - @Override public final void unaryAggregateOperations(AggregateUnaryOperator op, double[] c, int nRows, int rl, int ru) { final ValueFunction fn = op.aggOp.increOp.fn; if(fn instanceof Plus || fn instanceof KahanPlus || fn instanceof KahanPlusSq) { boolean square = fn instanceof KahanPlusSq; - if(op.indexFn instanceof ReduceAll) - computeSum(c, nRows, square); - else if(op.indexFn instanceof ReduceCol) - computeRowSums(c, square, rl, ru); - else if(op.indexFn instanceof ReduceRow) - computeColSums(c, nRows, square); + if(square){ + if(op.indexFn instanceof ReduceAll) + computeSumSq(c, nRows); + else if(op.indexFn instanceof ReduceCol) + computeRowSumsSq(c, rl, ru); + else if(op.indexFn instanceof ReduceRow) + computeColSumsSq(c, nRows); + } + else{ + if(op.indexFn instanceof ReduceAll) + computeSum(c, nRows); + else if(op.indexFn instanceof ReduceCol) + computeRowSums(c, rl, ru); + else if(op.indexFn instanceof ReduceRow) + computeColSums(c, nRows); + } } else if(fn instanceof Multiply) { if(op.indexFn instanceof ReduceAll) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupValue.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupValue.java index 067fa6f20f9..34abf61b05d 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupValue.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupValue.java @@ -38,7 +38,6 @@ import org.apache.sysds.runtime.functionobjects.Builtin; import org.apache.sysds.runtime.matrix.data.LibMatrixMult; import org.apache.sysds.runtime.matrix.data.MatrixBlock; -import org.apache.sysds.runtime.matrix.operators.ScalarOperator; /** * Base class for column groups encoded with value dictionary. This include column groups such as DDC OLE and RLE. 
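With the boolean square flag removed from the aggregate interface, each specialization is now its own method. Below is a minimal, dependency-free sketch (hypothetical names, plain arrays standing in for ADictionary and the row mapping) of the pattern computeRowSums and computeRowSumsSq instantiate for dictionary-encoded groups: precompute one aggregate per distinct tuple, then do a single lookup per row.

```java
public class RowAggSketch {
	/** dictVals holds nVals x nCol tuples row-major; rowMap maps each row to its tuple. */
	static void rowSums(double[] c, double[] dictVals, int nCol, int[] rowMap, int rl, int ru) {
		final int nVals = dictVals.length / nCol;
		final double[] tupleSum = new double[nVals]; // analogue of sumAllRowsToDouble
		for(int k = 0; k < nVals; k++)
			for(int j = 0; j < nCol; j++)
				tupleSum[k] += dictVals[k * nCol + j];
		for(int r = rl; r < ru; r++)
			c[r] += tupleSum[rowMap[r]]; // one lookup per row
	}

	static void rowSumsSq(double[] c, double[] dictVals, int nCol, int[] rowMap, int rl, int ru) {
		final int nVals = dictVals.length / nCol;
		final double[] tupleSumSq = new double[nVals]; // analogue of sumAllRowsToDoubleSq
		for(int k = 0; k < nVals; k++)
			for(int j = 0; j < nCol; j++) {
				final double v = dictVals[k * nCol + j];
				tupleSumSq[k] += v * v;
			}
		for(int r = rl; r < ru; r++)
			c[r] += tupleSumSq[rowMap[r]];
	}
}
```

Separating the variants removes a branch from the inner dispatch and lets each column group override only the aggregates it can actually accelerate.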
@@ -171,7 +170,7 @@ protected abstract void decompressToSparseBlockDenseDictionary(SparseBlock ret, double[] values); @Override - public final int getNumValues() { + public int getNumValues() { return _dict.getNumberOfValues(_colIndexes.length); } @@ -286,15 +285,14 @@ private double[] rightMMPreAggSparse(int numVals, SparseBlock b, int[] aggregate } @Override - protected final double computeMxx(double c, Builtin builtin) { + protected double computeMxx(double c, Builtin builtin) { if(_zeros) c = builtin.execute(c, 0); return _dict.aggregate(c, builtin); - } @Override - protected final void computeColMxx(double[] c, Builtin builtin) { + protected void computeColMxx(double[] c, Builtin builtin) { if(_zeros) for(int x = 0; x < _colIndexes.length; x++) c[_colIndexes[x]] = builtin.execute(c[_colIndexes[x]], 0); @@ -302,40 +300,6 @@ protected final void computeColMxx(double[] c, Builtin builtin) { _dict.aggregateCols(c, builtin, _colIndexes); } - /** - * Method for use by subclasses. Applies a scalar operation to the value metadata stored in the dictionary. - * - * @param op scalar operation to perform - * @return transformed copy of value metadata for this column group - */ - protected final ADictionary applyScalarOp(ScalarOperator op) { - return _dict.clone().inplaceScalarOp(op); - } - - /** - * Method for use by subclasses. Applies a scalar operation to the value metadata stored in the dictionary. This - * specific method is used in cases where an new entry is to be added in the dictionary. - * - * Method should only be called if the newVal is not 0! Also the newVal should already have the operator applied. - * - * @param op The Operator to apply to the underlying data. - * @param newVal The new Value to append to the underlying data. - * @param numCols The number of columns in the ColGroup, to specify how many copies of the newVal should be appended. - * @return The new Dictionary containing the values. 
- */ - protected final ADictionary applyScalarOp(ScalarOperator op, double newVal, int numCols) { - return _dict.applyScalarOp(op, newVal, numCols); - } - - protected static double[] allocDVector(int len, boolean reset) { - return new double[len]; - } - - protected static int[] allocIVector(int len, boolean reset) { - LOG.error("deprecated allocIVector"); - return new int[len + 1]; - } - @Override public void readFields(DataInput in) throws IOException { super.readFields(in); @@ -362,16 +326,23 @@ public long getExactSizeOnDisk() { public abstract int[] getCounts(int[] out); @Override - protected final void computeSum(double[] c, int nRows, boolean square) { - if(square) - c[0] += _dict.sumsq(getCounts(), _colIndexes.length); - else - c[0] += _dict.sum(getCounts(), _colIndexes.length); + protected void computeSum(double[] c, int nRows) { + c[0] += _dict.sum(getCounts(), _colIndexes.length); + } + + @Override + public void computeColSums(double[] c, int nRows) { + _dict.colSum(c, getCounts(), _colIndexes); + } + + @Override + protected void computeSumSq(double[] c, int nRows) { + c[0] += _dict.sumSq(getCounts(), _colIndexes.length); } @Override - protected final void computeColSums(double[] c, int nRows, boolean square) { - _dict.colSum(c, getCounts(), _colIndexes, square); + protected void computeColSumsSq(double[] c, int nRows) { + _dict.colSumSq(c, getCounts(), _colIndexes); } @Override @@ -425,7 +396,7 @@ public AColGroupValue copy() { } @Override - protected final AColGroup sliceSingleColumn(int idx) { + protected AColGroup sliceSingleColumn(int idx) { final AColGroupValue ret = (AColGroupValue) copy(); ret._colIndexes = new int[] {0}; if(_colIndexes.length == 1) @@ -437,7 +408,7 @@ protected final AColGroup sliceSingleColumn(int idx) { } @Override - protected final AColGroup sliceMultiColumns(int idStart, int idEnd, int[] outputCols) { + protected AColGroup sliceMultiColumns(int idStart, int idEnd, int[] outputCols) { final AColGroupValue ret = (AColGroupValue) copy(); ret._dict = ret._dict.sliceOutColumnRange(idStart, idEnd, _colIndexes.length); ret._colIndexes = outputCols; @@ -445,20 +416,20 @@ protected final AColGroup sliceMultiColumns(int idStart, int idEnd, int[] output } @Override - protected final void tsmm(double[] result, int numColumns, int nRows) { + protected void tsmm(double[] result, int numColumns, int nRows) { final int[] counts = getCounts(); tsmm(result, numColumns, counts, _dict, _colIndexes); } @Override - public final boolean containsValue(double pattern) { + public boolean containsValue(double pattern) { if(pattern == 0 && _zeros) return true; return _dict.containsValue(pattern); } @Override - public final long getNumberNonZeros(int nRows) { + public long getNumberNonZeros(int nRows) { int[] counts = getCounts(); return _dict.getNumberNonZeros(counts, _colIndexes.length); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AMorphingMMColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AMorphingMMColGroup.java new file mode 100644 index 00000000000..26c055de9d7 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AMorphingMMColGroup.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+/**
+ * Abstract class for column group types that, for performance reasons, do not perform matrix multiplication or
+ * decompression themselves, but instead morph into another column group type that carries out the operation.
+ */
+public abstract class AMorphingMMColGroup extends AColGroupValue {
+
+	/**
+	 * Constructor for serialization
+	 *
+	 * @param numRows Number of rows contained
+	 */
+	protected AMorphingMMColGroup(int numRows) {
+		super(numRows);
+	}
+
+	/**
+	 * Constructor for a morphing column group that stores its distinct tuples in an ADictionary.
+	 *
+	 * @param colIndices   The Column indexes
+	 * @param numRows      The number of rows contained in this group
+	 * @param dict         The dictionary to contain the distinct tuples
+	 * @param cachedCounts The cached counts of the distinct tuples (can be null since it should be possible to
+	 *                     reconstruct the counts on demand)
+	 */
+	protected AMorphingMMColGroup(int[] colIndices, int numRows, ADictionary dict, int[] cachedCounts) {
+		super(colIndices, numRows, dict, cachedCounts);
+	}
+
+	@Override
+	protected final void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
+		SparseBlock sb) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	protected final void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
+		double[] values) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	protected final void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
+		SparseBlock sb) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	protected final void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
+		double[] values) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	public final void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	public final void leftMultByAColGroup(AColGroup lhs, MatrixBlock result) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	public final void tsmmAColGroup(AColGroup other, MatrixBlock result) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	protected final void tsmm(double[] result, int numColumns, int nRows) {
+		throw new DMLCompressionException("This method 
should never be called"); + } + + public abstract AColGroup extractCommon(double[] constV); +} diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/APreAgg.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/APreAgg.java index 2a15a2110bb..9d1b1e3712a 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/APreAgg.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/APreAgg.java @@ -297,7 +297,7 @@ private boolean shouldPreAggregateLeft(APreAgg lhs) { private static MatrixBlock allocatePreAggregate(MatrixBlock m, int numVals, int rl, int ru) { final int lhsRows = ru - rl; - final double[] vals = allocDVector(lhsRows * numVals, true); + final double[] vals = new double[lhsRows * numVals]; final DenseBlock retB = new DenseBlockFP64(new int[] {lhsRows, numVals}, vals); return new MatrixBlock(lhsRows, numVals, retB); } @@ -318,16 +318,12 @@ private static void tsmmDictionaryWithScaling(final ADictionary dict, final int[ if(mb.isEmpty()) return; else if(mb.isInSparseFormat()) - throw new NotImplementedException(); - else { - final double[] values = mb.getDenseBlockValues(); - MMDictsDenseDenseWithScaling(values, values, rows, cols, counts, ret); - } - } - else { - final double[] values = dict.getValues(); - MMDictsDenseDenseWithScaling(values, values, rows, cols, counts, ret); + TSMMDictsSparseWithScaling(mb.getSparseBlock(), rows, cols, counts, ret); + else + TSMMDictsDenseWithScaling(mb.getDenseBlockValues(), rows, cols, counts, ret); } + else + TSMMDictsDenseWithScaling(dict.getValues(), rows, cols, counts, ret); } /** @@ -416,9 +412,9 @@ private static void MMDictsDenseDense(double[] left, double[] right, int[] rowsL } } - private static void MMDictsDenseDenseWithScaling(double[] left, double[] right, int[] rowsLeft, int[] colsRight, - int[] scaling, MatrixBlock result) { - final int commonDim = Math.min(left.length / rowsLeft.length, right.length / colsRight.length); + private static void TSMMDictsDenseWithScaling(double[] dv, int[] rowsLeft, int[] colsRight, int[] scaling, + MatrixBlock result) { + final int commonDim = Math.min(dv.length / rowsLeft.length, dv.length / colsRight.length); final int resCols = result.getNumColumns(); final double[] resV = result.getDenseBlockValues(); for(int k = 0; k < commonDim; k++) { @@ -427,10 +423,34 @@ private static void MMDictsDenseDenseWithScaling(double[] left, double[] right, final int scale = scaling[k]; for(int i = 0; i < rowsLeft.length; i++) { final int offOut = rowsLeft[i] * resCols; - final double vl = left[offL + i] * scale; + final double vl = dv[offL + i] * scale; if(vl != 0) for(int j = 0; j < colsRight.length; j++) - resV[offOut + colsRight[j]] += vl * right[offR + j]; + resV[offOut + colsRight[j]] += vl * dv[offR + j]; + } + } + } + + private static void TSMMDictsSparseWithScaling(SparseBlock sb, int[] rowsLeft, int[] colsRight, int[] scaling, + MatrixBlock result) { + + final int commonDim = sb.numRows(); + final int resCols = result.getNumColumns(); + final double[] resV = result.getDenseBlockValues(); + + for(int k = 0; k < commonDim; k++) { + if(sb.isEmpty(k)) + continue; + final int apos = sb.pos(k); + final int alen = sb.size(k) + apos; + final int[] aix = sb.indexes(k); + final double[] avals = sb.values(k); + final int scale = scaling[k]; + for(int i = apos; i < alen; i++) { + final double v = avals[i] * scale; + final int offOut = rowsLeft[aix[i]] * resCols; + for(int j = 0; j < alen; j++) + resV[offOut + colsRight[aix[j]]] += v * avals[j]; } } } diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
index 86335b983fa..afe43da66a8 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
@@ -54,21 +54,29 @@ protected ColGroupConst() {
 	 * @param colIndices The Colum indexes for the column group.
 	 * @param dict       The dictionary containing one tuple for the entire compression.
 	 */
-	protected ColGroupConst(int[] colIndices, ADictionary dict) {
+	private ColGroupConst(int[] colIndices, ADictionary dict) {
 		super(colIndices);
 		this._dict = dict;
 	}

-	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
-		double vals = _dict.sumAllRowsToDouble(square, _colIndexes.length)[0];
-		for(int rix = rl; rix < ru; rix++)
-			c[rix] += vals;
+	/**
+	 * Creation method for ColGroupConst. This factory ensures that an Empty column group is constructed if the
+	 * dictionary input is null.
+	 *
+	 * @param colIndices The column indexes in the column group
+	 * @param dict       The dictionary to use
+	 * @return A column group, either Const or Empty.
+	 */
+	protected static AColGroup create(int[] colIndices, ADictionary dict) {
+		if(dict == null)
+			return new ColGroupEmpty(colIndices);
+		else
+			return new ColGroupConst(colIndices, dict);
 	}

 	@Override
 	protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
-		double value = _dict.aggregateTuples(builtin, _colIndexes.length)[0];
+		double value = _dict.aggregateRows(builtin, _colIndexes.length)[0];
 		for(int i = rl; i < ru; i++)
 			c[i] = builtin.execute(c[i], value);
 	}
@@ -108,19 +116,17 @@ public double getIdx(int r, int colIdx) {

 	@Override
 	public AColGroup scalarOperation(ScalarOperator op) {
-		return new ColGroupConst(_colIndexes, _dict.clone().inplaceScalarOp(op));
+		return create(_colIndexes, _dict.applyScalarOp(op));
 	}

 	@Override
 	public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) {
-		ADictionary ret = _dict.binOpLeft(op, v, _colIndexes);
-		return new ColGroupConst(_colIndexes, ret);
+		return create(_colIndexes, _dict.binOpLeft(op, v, _colIndexes));
 	}

 	@Override
 	public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) {
-		ADictionary ret = _dict.binOpRight(op, v, _colIndexes);
-		return new ColGroupConst(_colIndexes, ret);
+		return create(_colIndexes, _dict.binOpRight(op, v, _colIndexes));
 	}

 	/**
@@ -131,13 +137,12 @@ public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSa
 	 */
 	public void addToCommon(double[] constV) {
 		final double[] values = _dict.getValues();
-		if(values != null && constV != null)
-			for(int i = 0; i < _colIndexes.length; i++)
-				constV[_colIndexes[i]] += values[i];
+		for(int i = 0; i < _colIndexes.length; i++)
+			constV[_colIndexes[i]] += values[i];
 	}

 	public double[] getValues() {
-		return _dict != null ? 
_dict.getValues() : null; + return _dict.getValues(); } @Override @@ -151,17 +156,38 @@ protected void computeColMxx(double[] c, Builtin builtin) { } @Override - protected void computeSum(double[] c, int nRows, boolean square) { - if(_dict != null) - if(square) - c[0] += _dict.sumsq(new int[] {nRows}, _colIndexes.length); - else - c[0] += _dict.sum(new int[] {nRows}, _colIndexes.length); + protected void computeSum(double[] c, int nRows) { + c[0] += _dict.sum(new int[] {nRows}, _colIndexes.length); + } + + @Override + public void computeColSums(double[] c, int nRows) { + _dict.colSum(c, new int[] {nRows}, _colIndexes); + } + + @Override + protected void computeSumSq(double[] c, int nRows) { + + c[0] += _dict.sumSq(new int[] {nRows}, _colIndexes.length); + } + + @Override + protected void computeColSumsSq(double[] c, int nRows) { + _dict.colSumSq(c, new int[] {nRows}, _colIndexes); } @Override - protected void computeColSums(double[] c, int nRows, boolean square) { - _dict.colSum(c, new int[] {nRows}, _colIndexes, square); + protected void computeRowSums(double[] c, int rl, int ru) { + double vals = _dict.sumAllRowsToDouble(_colIndexes.length)[0]; + for(int rix = rl; rix < ru; rix++) + c[rix] += vals; + } + + @Override + protected void computeRowSumsSq(double[] c, int rl, int ru) { + double vals = _dict.sumAllRowsToDoubleSq(_colIndexes.length)[0]; + for(int rix = rl; rix < ru; rix++) + c[rix] += vals; } @Override @@ -183,11 +209,13 @@ public AColGroup rightMultByMatrix(MatrixBlock right) { final int cr = right.getNumColumns(); if(_colIndexes.length == rr) { MatrixBlock left = forceValuesToMatrixBlock(); + if(left.isEmpty()) + return null; MatrixBlock ret = new MatrixBlock(1, cr, false); LibMatrixMult.matrixMult(left, right, ret); - ADictionary d = new MatrixBlockDictionary(ret); if(ret.isEmpty()) return null; + ADictionary d = new MatrixBlockDictionary(ret); return ColGroupFactory.genColGroupConst(cr, d); } else { @@ -202,7 +230,7 @@ public void tsmm(double[] result, int numColumns, int nRows) { @Override public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) { - throw new NotImplementedException(); + throw new DMLCompressionException("Should not be called"); } @Override @@ -223,19 +251,19 @@ protected AColGroup sliceSingleColumn(int idx) { return new ColGroupEmpty(colIndexes); else { ADictionary retD = new Dictionary(new double[] {_dict.getValue(idx)}); - return new ColGroupConst(colIndexes, retD); + return create(colIndexes, retD); } } @Override protected AColGroup sliceMultiColumns(int idStart, int idEnd, int[] outputCols) { ADictionary retD = _dict.sliceOutColumnRange(idStart, idEnd, _colIndexes.length); - return new ColGroupConst(outputCols, retD); + return create(outputCols, retD); } @Override public AColGroup copy() { - return new ColGroupConst(_colIndexes, _dict.clone()); + return create(_colIndexes, _dict.clone()); } @Override @@ -251,7 +279,7 @@ public long getNumberNonZeros(int nRows) { @Override public AColGroup replace(double pattern, double replace) { ADictionary replaced = _dict.replace(pattern, replace, _colIndexes.length); - return new ColGroupConst(_colIndexes, replaced); + return create(_colIndexes, replaced); } @Override @@ -269,9 +297,7 @@ public void write(DataOutput out) throws IOException { @Override public long getExactSizeOnDisk() { long ret = super.getExactSizeOnDisk(); - if(_dict != null) - ret += _dict.getExactSizeOnDisk(); - + ret += _dict.getExactSizeOnDisk(); return ret; } diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java index b6d42312b98..82faecde164 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java @@ -67,26 +67,13 @@ public CompressionType getCompType() { protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, SparseBlock sb) { throw new NotImplementedException(); - // for(int i = rl; i < ru; i++, offT++) { - // final int rowIndex = _data.getIndex(i); - // if(sb.isEmpty(rowIndex)) - // continue; - // final double[] c = db.values(offT); - // final int off = db.pos(offT); - // final int apos = sb.pos(rowIndex); - // final int alen = sb.size(rowIndex) + apos; - // final double[] avals = sb.values(rowIndex); - // final int[] aix = sb.indexes(rowIndex); - // for(int j = apos; j < alen; j++) - // c[off + _colIndexes[aix[j]]] += avals[j]; - // } } @Override protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, double[] values) { final int nCol = _colIndexes.length; - for(int i = rl,offT = rl + offR; i < ru; i++, offT++) { + for(int i = rl, offT = rl + offR; i < ru; i++, offT++) { final double[] c = db.values(offT); final int off = db.pos(offT) + offC; final int rowIndex = _data.getIndex(i) * nCol; @@ -118,8 +105,15 @@ public double getIdx(int r, int colIdx) { } @Override - protected void computeRowSums(double[] c, boolean square, int rl, int ru) { - double[] vals = _dict.sumAllRowsToDouble(square, _colIndexes.length); + protected void computeRowSums(double[] c, int rl, int ru) { + double[] vals = _dict.sumAllRowsToDouble(_colIndexes.length); + for(int rix = rl; rix < ru; rix++) + c[rix] += vals[_data.getIndex(rix)]; + } + + @Override + protected void computeRowSumsSq(double[] c, int rl, int ru) { + double[] vals = _dict.sumAllRowsToDoubleSq(_colIndexes.length); for(int rix = rl; rix < ru; rix++) c[rix] += vals[_data.getIndex(rix)]; } @@ -127,7 +121,7 @@ protected void computeRowSums(double[] c, boolean square, int rl, int ru) { @Override protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { final int nCol = getNumCols(); - double[] preAggregatedRows = _dict.aggregateTuples(builtin, nCol); + double[] preAggregatedRows = _dict.aggregateRows(builtin, nCol); for(int i = rl; i < ru; i++) c[i] = builtin.execute(c[i], preAggregatedRows[_data.getIndex(i)]); } @@ -151,7 +145,7 @@ public void preAggregate(final MatrixBlock m, final MatrixBlock preAgg, final in @Override public void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru, int cl, int cu) { - _data.preAggregateDense(m, preAgg, rl, ru, cl, cu); + _data.preAggregateDense(m, preAgg.getDenseBlockValues(), rl, ru, cl, cu); } private void preAggregateSparse(SparseBlock sb, MatrixBlock preAgg, int rl, int ru) { @@ -181,11 +175,14 @@ public void preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) { public void preAggregateThatSDCZerosStructure(ColGroupSDCZeros that, Dictionary ret) { final AIterator itThat = that._indexes.getIterator(); final int nCol = that._colIndexes.length; - - while(itThat.hasNext()) { + final int finalOff = that._indexes.getOffsetToLast(); + while(true) { final int to = _data.getIndex(itThat.value()); - final int fr = that._data.getIndex(itThat.getDataIndexAndIncrement()); + final int fr = that._data.getIndex(itThat.getDataIndex()); 
that._dict.addToEntry(ret, fr, to, nCol); + if(itThat.value() == finalOff) + break; + itThat.next(); } } @@ -193,9 +190,12 @@ public void preAggregateThatSDCZerosStructure(ColGroupSDCZeros that, Dictionary public void preAggregateThatSDCSingleZerosStructure(ColGroupSDCSingleZeros that, Dictionary ret) { final AIterator itThat = that._indexes.getIterator(); final int nCol = that._colIndexes.length; - while(itThat.hasNext()) { + final int finalOff = that._indexes.getOffsetToLast(); + while(true) { final int to = _data.getIndex(itThat.value()); that._dict.addToEntry(ret, 0, to, nCol); + if(itThat.value() == finalOff) + break; itThat.next(); } } @@ -219,7 +219,7 @@ public long estimateInMemorySize() { @Override public AColGroup scalarOperation(ScalarOperator op) { - return new ColGroupDDC(_colIndexes, _numRows, applyScalarOp(op), _data, getCachedCounts()); + return new ColGroupDDC(_colIndexes, _numRows, _dict.applyScalarOp(op), _data, getCachedCounts()); } @Override diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java index ec20674c43c..a75f046eb84 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java @@ -19,6 +19,8 @@ package org.apache.sysds.runtime.compress.colgroup; +import java.util.Arrays; + import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary; import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.data.SparseBlock; @@ -69,7 +71,7 @@ public void decompressToDenseBlock(DenseBlock target, int rl, int ru, int offR, } @Override - public void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC){ + public void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC) { // do nothing. 
} @@ -80,10 +82,12 @@ public double getIdx(int r, int colIdx) { @Override public AColGroup scalarOperation(ScalarOperator op) { - double val0 = op.executeScalar(0); - if(val0 == 0) + final double v = op.executeScalar(0); + if(v == 0) return this; - return new ColGroupConst(_colIndexes, new Dictionary(new double[_colIndexes.length]).inplaceScalarOp(op)); + double[] retV = new double[_colIndexes.length]; + Arrays.fill(retV, v); + return ColGroupConst.create(_colIndexes, new Dictionary(retV)); } @Override @@ -99,7 +103,7 @@ public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSaf if(allZero) return this; - return new ColGroupConst(_colIndexes, new Dictionary(retVals)); + return ColGroupConst.create(_colIndexes, new Dictionary(retVals)); } @Override @@ -111,10 +115,10 @@ public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSa final int lenV = _colIndexes.length; boolean allZero = true; for(int i = 0; i < lenV; i++) - allZero = 0 == (retVals[i] = fn.execute(0, v[_colIndexes[i]])) && allZero ; + allZero = 0 == (retVals[i] = fn.execute(0, v[_colIndexes[i]])) && allZero; if(allZero) return this; - return new ColGroupConst(_colIndexes, new Dictionary(retVals)); + return ColGroupConst.create(_colIndexes, new Dictionary(retVals)); } @Override @@ -185,11 +189,6 @@ public final double getMax() { return 0; } - @Override - public void computeColSums(double[] c, int nRows) { - // do nothing - } - @Override protected double computeMxx(double c, Builtin builtin) { return builtin.execute(c, 0); @@ -202,17 +201,32 @@ protected void computeColMxx(double[] c, Builtin builtin) { } @Override - protected void computeSum(double[] c, int nRows, boolean square) { + protected void computeSum(double[] c, int nRows) { + // do nothing + } + + @Override + protected void computeRowSums(double[] c, int rl, int ru) { + // do nothing + } + + @Override + public void computeColSums(double[] c, int nRows) { + // do nothing + } + + @Override + protected void computeSumSq(double[] c, int nRows) { // do nothing } @Override - protected void computeRowSums(double[] c, boolean square, int rl, int ru) { + protected void computeRowSumsSq(double[] c, int rl, int ru) { // do nothing } @Override - protected void computeColSums(double[] c, int nRows, boolean square) { + protected void computeColSumsSq(double[] c, int nRows) { // do nothing } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index fc0edf67fae..72779342445 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -147,7 +147,7 @@ public static AColGroup genColGroupConst(int[] cols, double[] values) { if(cols.length != values.length) throw new DMLCompressionException("Invalid size of values compared to columns"); ADictionary dict = new Dictionary(values); - return new ColGroupConst(cols, dict); + return ColGroupConst.create(cols, dict); } /** @@ -162,7 +162,7 @@ public static AColGroup genColGroupConst(int numCols, ADictionary dict) { throw new DMLCompressionException( "Invalid construction of const column group with different number of columns in arguments"); final int[] colIndices = Util.genColsIndices(numCols); - return new ColGroupConst(colIndices, dict); + return ColGroupConst.create(colIndices, dict); } private static List genEmpty(MatrixBlock in, CompressionSettings compSettings) { @@ 
-194,7 +194,7 @@ private static List compressColGroupsParallel(MatrixBlock in, Compres if(!tg.isEmpty()) tasks.add(new CompressTask(in, tg, compSettings, Math.max(1, k / 2))); - List ret = new ArrayList<>(csi.getNumberColGroups()); + List ret = new ArrayList<>(); for(Future> t : pool.invokeAll(tasks)) ret.addAll(t.get()); pool.shutdown(); @@ -234,11 +234,17 @@ protected CompressTask(MatrixBlock in, List groups, @Override public Collection call() { - ArrayList res = new ArrayList<>(); - Tmp tmpMap = new Tmp(); - for(CompressedSizeInfoColGroup g : _groups) - res.addAll(compressColGroup(_in, _compSettings, tmpMap, g, _k)); - return res; + try{ + ArrayList res = new ArrayList<>(); + Tmp tmpMap = new Tmp(); + for(CompressedSizeInfoColGroup g : _groups) + res.addAll(compressColGroup(_in, _compSettings, tmpMap, g, _k)); + return res; + } + catch(Exception e){ + e.printStackTrace(); + throw e; + } } } @@ -347,7 +353,7 @@ private static AColGroup compress(int[] colIndexes, int rlen, ABitmap ubm, Compr final IntArrayList[] of = ubm.getOffsetList(); if(of.length == 1 && of[0].size() == rlen) // If this always constant - return new ColGroupConst(colIndexes, DictionaryFactory.create(ubm)); + return ColGroupConst.create(colIndexes, DictionaryFactory.create(ubm)); switch(compType) { case DDC: @@ -490,7 +496,7 @@ private static AColGroup compressSDC(int[] colIndexes, int rlen, ABitmap ubm, Co ADictionary dict = DictionaryFactory.create(ubm, tupleSparsity); if(ubm.getNumValues() == 1) { if(numZeros >= largestOffset) { - final AOffset off = OffsetFactory.create(ubm.getOffsetList()[0].extractValues(true)); + final AOffset off = OffsetFactory.createOffset(ubm.getOffsetList()[0].extractValues(true)); return new ColGroupSDCSingleZeros(colIndexes, rlen, dict, off, null); } else { @@ -510,7 +516,7 @@ private static AColGroup setupMultiValueZeroColGroup(int[] colIndexes, int rlen, CompressionSettings cs) { IntArrayList[] offsets = ubm.getOffsetList(); AInsertionSorter s = InsertionSorterFactory.create(rlen, offsets, cs.sdcSortType); - AOffset indexes = OffsetFactory.create(s.getIndexes()); + AOffset indexes = OffsetFactory.createOffset(s.getIndexes()); AMapToData data = s.getData(); int[] counts = new int[offsets.length + 1]; int sum = 0; @@ -519,18 +525,16 @@ private static AColGroup setupMultiValueZeroColGroup(int[] colIndexes, int rlen, sum += counts[i]; } counts[offsets.length] = rlen - sum; - AColGroupValue ret = new ColGroupSDCZeros(colIndexes, rlen, dict, indexes, data, counts); - return ret; + return ColGroupSDCZeros.create(colIndexes, rlen, dict, indexes, data, counts); } private static AColGroup setupMultiValueColGroup(int[] colIndexes, int numZeros, int rlen, ABitmap ubm, int largestIndex, ADictionary dict, CompressionSettings cs) { IntArrayList[] offsets = ubm.getOffsetList(); AInsertionSorter s = InsertionSorterFactory.createNegative(rlen, offsets, largestIndex, cs.sdcSortType); - AOffset indexes = OffsetFactory.create(s.getIndexes()); + AOffset indexes = OffsetFactory.createOffset(s.getIndexes()); AMapToData _data = s.getData(); - AColGroupValue ret = new ColGroupSDC(colIndexes, rlen, dict, indexes, _data, null); - return ret; + return ColGroupSDC.create(colIndexes, rlen, dict, indexes, _data, null); } private static AColGroup setupSingleValueSDCColGroup(int[] colIndexes, int rlen, ABitmap ubm, ADictionary dict) { @@ -548,7 +552,7 @@ private static AColGroup setupSingleValueSDCColGroup(int[] colIndexes, int rlen, while(v < rlen) indexes[p++] = v++; - AOffset off = OffsetFactory.create(indexes); + 
AOffset off = OffsetFactory.createOffset(indexes); return new ColGroupSDCSingle(colIndexes, rlen, dict, off, null); } @@ -635,14 +639,14 @@ private static AColGroup compressSDCFromSparseTransposedBlock(MatrixBlock mb, in } counts[entries.size()] = rlen - sum; - final AOffset offsets = OffsetFactory.create(sb.indexes(sbRow), apos, alen); + final AOffset offsets = OffsetFactory.createOffset(sb.indexes(sbRow), apos, alen); if(entries.size() <= 1) return new ColGroupSDCSingleZeros(cols, rlen, new Dictionary(dict), offsets, counts); else { final AMapToData mapToData = MapToFactory.create((alen - apos), entries.size()); for(int j = apos; j < alen; j++) mapToData.set(j - apos, map.get(vals[j])); - return new ColGroupSDCZeros(cols, rlen, new Dictionary(dict), offsets, mapToData, counts); + return ColGroupSDCZeros.create(cols, rlen, new Dictionary(dict), offsets, mapToData, counts); } } else { diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java index f8edbcb1975..184ca1a69c2 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java @@ -118,6 +118,8 @@ private static AColGroup constructColGroup(ColGroupType ctype, int nRows){ return new ColGroupSDCSingleZeros(nRows); case SDCZeros: return new ColGroupSDCZeros(nRows); + case PFOR: + return new ColGroupPFOR(nRows); default: throw new DMLRuntimeException("Unsupported ColGroup Type used: " + ctype); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java index a303d98910c..64dd626bc17 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java @@ -23,7 +23,6 @@ import org.apache.commons.lang.NotImplementedException; import org.apache.sysds.runtime.compress.CompressionSettings; -import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.data.SparseBlock; @@ -66,7 +65,8 @@ public ColGroupType getColGroupType() { } @Override - protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, double[] values) { + protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, + double[] values) { throw new NotImplementedException(); // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ; // final int numCols = getNumCols(); @@ -79,33 +79,34 @@ protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int // double[] c = target.getDenseBlockValues(); // // cache conscious append via horizontal scans // for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz) { - // for(int k = 0, off = 0; k < numVals; k++, off += numCols) { - // int boff = _ptr[k]; - // int blen = len(k); - // int bix = apos[k]; - - // if(bix >= blen) - // continue; - // int pos = boff + bix; - // int len = _data[pos]; - // int i = 1; - // int row = bi + _data[pos + 1]; - // while(i <= len && row < rl) - // row = bi + _data[pos + i++]; - - // for(; i <= len && row < ru; i++) { - // row = bi + _data[pos + i]; - // int rc = (row - offOut) * targetCols; - // for(int j = 0; j < numCols; j++) - // c[rc + 
_colIndexes[j]] += values[off + j]; - // } - // apos[k] += len + 1; - // } + // for(int k = 0, off = 0; k < numVals; k++, off += numCols) { + // int boff = _ptr[k]; + // int blen = len(k); + // int bix = apos[k]; + + // if(bix >= blen) + // continue; + // int pos = boff + bix; + // int len = _data[pos]; + // int i = 1; + // int row = bi + _data[pos + 1]; + // while(i <= len && row < rl) + // row = bi + _data[pos + i++]; + + // for(; i <= len && row < ru; i++) { + // row = bi + _data[pos + i]; + // int rc = (row - offOut) * targetCols; + // for(int j = 0; j < numCols; j++) + // c[rc + _colIndexes[j]] += values[off + j]; + // } + // apos[k] += len + 1; + // } // } } @Override - protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, SparseBlock values) { + protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, + SparseBlock values) { throw new NotImplementedException(); } @@ -148,7 +149,7 @@ public AColGroup scalarOperation(ScalarOperator op) { // fast path: sparse-safe operations // Note that bitmaps don't change and are shallow-copied if(op.sparseSafe || val0 == 0 || !_zeros) { - return new ColGroupOLE(_colIndexes, _numRows, _zeros, applyScalarOp(op), _data, _ptr, getCachedCounts()); + return new ColGroupOLE(_colIndexes, _numRows, _zeros, _dict.applyScalarOp(op), _data, _ptr, getCachedCounts()); } // slow path: sparse-unsafe operations (potentially create new bitmap) // note: for efficiency, we currently don't drop values that become 0 @@ -156,10 +157,10 @@ public AColGroup scalarOperation(ScalarOperator op) { int[] loff = computeOffsets(lind); if(loff.length == 0) { // empty offset list: go back to fast path - return new ColGroupOLE(_colIndexes, _numRows, false, applyScalarOp(op), _data, _ptr, getCachedCounts()); + return new ColGroupOLE(_colIndexes, _numRows, false, _dict.applyScalarOp(op), _data, _ptr, getCachedCounts()); } - ADictionary rvalues = applyScalarOp(op, val0, getNumCols()); + ADictionary rvalues = _dict.applyScalarOp(op, val0, getNumCols()); char[] lbitmap = genOffsetBitmap(loff, loff.length); char[] rbitmaps = Arrays.copyOf(_data, _data.length + lbitmap.length); System.arraycopy(lbitmap, 0, rbitmaps, _data.length, lbitmap.length); @@ -216,69 +217,74 @@ public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSa // } @Override - protected void computeRowSums(double[] c, boolean square, int rl, int ru) { + protected void computeRowSums(double[] c, int rl, int ru) { + throw new NotImplementedException(); + // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ; + // final int numVals = getNumValues(); - final int blksz = CompressionSettings.BITMAP_BLOCK_SZ; - final int numVals = getNumValues(); + // if(numVals > 1 && _numRows > blksz) { + // final int blksz2 = CompressionSettings.BITMAP_BLOCK_SZ; - if(numVals > 1 && _numRows > blksz) { - final int blksz2 = CompressionSettings.BITMAP_BLOCK_SZ; - - // step 1: prepare position and value arrays - int[] apos = skipScan(numVals, rl); - double[] aval = _dict.sumAllRowsToDouble(square, _colIndexes.length); - - // step 2: cache conscious row sums via horizontal scans - for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz2) { - int bimax = Math.min(bi + blksz2, ru); - - // horizontal segment scan, incl pos maintenance - for(int k = 0; k < numVals; k++) { - int boff = _ptr[k]; - int blen = len(k); - double val = aval[k]; - int bix = apos[k]; - - for(int ii = bi; ii < bimax && bix < blen; ii += blksz) { - // prepare 
length, start, and end pos - int len = _data[boff + bix]; - - // compute partial results - for(int i = 1; i <= len; i++) { - int rix = ii + _data[boff + bix + i]; - if(rix >= _numRows) - throw new DMLCompressionException("Invalid row " + rix); - c[rix] += val; - } - bix += len + 1; - } + // // step 1: prepare position and value arrays + // int[] apos = skipScan(numVals, rl); + // double[] aval = _dict.sumAllRowsToDouble(square, _colIndexes.length); + + // // step 2: cache conscious row sums via horizontal scans + // for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz2) { + // int bimax = Math.min(bi + blksz2, ru); + + // // horizontal segment scan, incl pos maintenance + // for(int k = 0; k < numVals; k++) { + // int boff = _ptr[k]; + // int blen = len(k); + // double val = aval[k]; + // int bix = apos[k]; + + // for(int ii = bi; ii < bimax && bix < blen; ii += blksz) { + // // prepare length, start, and end pos + // int len = _data[boff + bix]; + + // // compute partial results + // for(int i = 1; i <= len; i++) { + // int rix = ii + _data[boff + bix + i]; + // if(rix >= _numRows) + // throw new DMLCompressionException("Invalid row " + rix); + // c[rix] += val; + // } + // bix += len + 1; + // } - apos[k] = bix; - } - } - } - else { - // iterate over all values and their bitmaps - for(int k = 0; k < numVals; k++) { - // prepare value-to-add for entire value bitmap - int boff = _ptr[k]; - int blen = len(k); - double val = _dict.sumRow(k, square, _colIndexes.length); + // apos[k] = bix; + // } + // } + // } + // else { + // // iterate over all values and their bitmaps + // for(int k = 0; k < numVals; k++) { + // // prepare value-to-add for entire value bitmap + // int boff = _ptr[k]; + // int blen = len(k); + // double val = _dict.sumRow(k, square, _colIndexes.length); + + // // iterate over bitmap blocks and add values + // if(val != 0) { + // int slen; + // int bix = skipScanVal(k, rl); + // for(int off = ((rl + 1) / blksz) * blksz; bix < blen && off < ru; bix += slen + 1, off += blksz) { + // slen = _data[boff + bix]; + // for(int i = 1; i <= slen; i++) { + // int rix = off + _data[boff + bix + i]; + // c[rix] += val; + // } + // } + // } + // } + // } + } - // iterate over bitmap blocks and add values - if(val != 0) { - int slen; - int bix = skipScanVal(k, rl); - for(int off = ((rl + 1) / blksz) * blksz; bix < blen && off < ru; bix += slen + 1, off += blksz) { - slen = _data[boff + bix]; - for(int i = 1; i <= slen; i++) { - int rix = off + _data[boff + bix + i]; - c[rix] += val; - } - } - } - } - } + @Override + protected void computeRowSumsSq(double[] c, int rl, int ru) { + throw new NotImplementedException(); } @Override @@ -413,7 +419,7 @@ else if(_data[boff + bix + blckIx] > offset) private int[] skipScan(int numVals, int rl) { final int blksz = CompressionSettings.BITMAP_BLOCK_SZ; rl = (rl / blksz) * blksz; - int[] ret = allocIVector(numVals, rl == 0); + int[] ret = new int[numVals]; if(rl > 0) { // rl aligned with blksz for(int k = 0; k < numVals; k++) { diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPFOR.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPFOR.java new file mode 100644 index 00000000000..e858addbc27 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPFOR.java @@ -0,0 +1,386 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.commons.lang.NotImplementedException;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
+import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
+import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
+import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.Divide;
+import org.apache.sysds.runtime.functionobjects.Minus;
+import org.apache.sysds.runtime.functionobjects.Multiply;
+import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+
+/**
+ * ColGroup for Patched Frame Of Reference (PFOR).
+ *
+ * This column group fits naturally into the existing collection of compression groups.
+ *
+ * It can be constructed when an SDCZeros group gets a non-zero default value: the natural extension is to transform
+ * the group into a PFOR group, since the default value is then treated as a reference offset, and the dictionary can
+ * be copied with no modifications.
+ */
+public class ColGroupPFOR extends AMorphingMMColGroup {
+
+	private static final long serialVersionUID = 3883228464052204203L;
+
+	/** Sparse row indexes for the rows that hold values from the dictionary */
+	protected AOffset _indexes;
+
+	/** Mapping from each offset to its row in the dictionary. 
*/ + protected transient AMapToData _data; + + /** Reference values in this column group */ + protected double[] _reference; + + /** + * Constructor for serialization + * + * @param numRows Number of rows contained + */ + protected ColGroupPFOR(int numRows) { + super(numRows); + } + + private ColGroupPFOR(int[] colIndices, int numRows, ADictionary dict, AOffset indexes, AMapToData data, + int[] cachedCounts, double[] reference) { + super(colIndices, numRows, dict, cachedCounts); + _data = data; + _indexes = indexes; + _zeros = allZero(reference); + _reference = reference; + } + + protected static AColGroup create(int[] colIndices, int numRows, ADictionary dict, AOffset indexes, AMapToData data, + int[] cachedCounts, double[] reference) { + if(dict == null) { + // either ColGroupEmpty or const + boolean allZero = true; + for(double d : reference) + if(d != 0) { + allZero = false; + break; + } + + if(allZero) + return new ColGroupEmpty(colIndices); + else + return ColGroupFactory.genColGroupConst(colIndices, reference); + } + return new ColGroupPFOR(colIndices, numRows, dict, indexes, data, cachedCounts, reference); + } + + private final static boolean allZero(double[] in) { + for(double v : in) + if(v != 0) + return false; + return true; + } + + @Override + public CompressionType getCompType() { + return CompressionType.PFOR; + } + + @Override + public ColGroupType getColGroupType() { + return ColGroupType.PFOR; + } + + @Override + public int[] getCounts(int[] counts) { + return _data.getCounts(counts, _numRows); + } + + @Override + protected void computeRowSums(double[] c, int rl, int ru) { + // Add reference value sum. + final double refSum = refSum(); + for(int rix = rl; rix < ru; rix++) + c[rix] += refSum; + + final double[] vals = _dict.sumAllRowsToDouble(_colIndexes.length); + ColGroupSDCZeros.computeRowSums(c, rl, ru, vals, _data, _indexes, _numRows); + } + + private final double refSum() { + double ret = 0; + for(double d : _reference) + ret += d; + return ret; + } + + @Override + protected void computeRowSumsSq(double[] c, int rl, int ru) { + final double[] vals = _dict.sumAllRowsToDoubleSq(_reference); + ColGroupSDC.computeRowSumsSq(c, rl, ru, vals, _data, _indexes, _numRows); + } + + @Override + protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { + final double[] vals = _dict.aggregateRows(builtin, _reference); + ColGroupSDC.computeRowMxx(c, builtin, rl, ru, vals, _data, _indexes, _numRows, vals[vals.length - 1]); + } + + @Override + public double getIdx(int r, int colIdx) { + final AIterator it = _indexes.getIterator(r); + final int nCol = _colIndexes.length; + if(it.value() == r) { + final int rowOff = _data.getIndex(it.getDataIndex()) * nCol; + return _dict.getValue(rowOff + colIdx) + _reference[colIdx]; + } + else + return _reference[colIdx]; + } + + @Override + public AColGroup scalarOperation(ScalarOperator op) { + final double[] newRef = new double[_reference.length]; + for(int i = 0; i < _reference.length; i++) + newRef[i] = op.executeScalar(_reference[i]); + if(op.fn instanceof Plus || op.fn instanceof Minus) { + return create(_colIndexes, _numRows, _dict, _indexes, _data, getCachedCounts(), newRef); + } + else if(op.fn instanceof Multiply || op.fn instanceof Divide) { + final ADictionary newDict = _dict.applyScalarOp(op); + return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), newRef); + } + else { + final ADictionary newDict = _dict.applyScalarOp(op, _reference, newRef); + return create(_colIndexes, _numRows, newDict, 
_indexes, _data, getCachedCounts(), newRef); + } + } + + @Override + public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) { + final double[] newRef = new double[_reference.length]; + for(int i = 0; i < _reference.length; i++) + newRef[i] = op.fn.execute(v[_colIndexes[i]], _reference[i]); + + if(op.fn instanceof Plus || op.fn instanceof Minus) + return create(_colIndexes, _numRows, _dict, _indexes, _data, getCachedCounts(), newRef); + else if(op.fn instanceof Multiply || op.fn instanceof Divide) { + final ADictionary newDict = _dict.binOpLeft(op, v, _colIndexes); + return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), newRef); + } + else { + final ADictionary newDict = _dict.binOpLeft(op, v, _colIndexes, _reference, newRef); + return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), newRef); + } + } + + @Override + public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) { + final double[] newRef = new double[_reference.length]; + for(int i = 0; i < _reference.length; i++) + newRef[i] = op.fn.execute(_reference[i], v[_colIndexes[i]]); + if(op.fn instanceof Plus || op.fn instanceof Minus) + return create(_colIndexes, _numRows, _dict, _indexes, _data, getCachedCounts(), newRef); + else if(op.fn instanceof Multiply || op.fn instanceof Divide) { + final ADictionary newDict = _dict.binOpRight(op, v, _colIndexes); + return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), newRef); + } + else { + final ADictionary newDict = _dict.binOpRight(op, v, _colIndexes, _reference, newRef); + return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), newRef); + } + } + + @Override + public void write(DataOutput out) throws IOException { + super.write(out); + _indexes.write(out); + _data.write(out); + for(double d : _reference) + out.writeDouble(d); + } + + @Override + public void readFields(DataInput in) throws IOException { + super.readFields(in); + _indexes = OffsetFactory.readIn(in); + _data = MapToFactory.readIn(in); + _reference = new double[_colIndexes.length]; + for(int i = 0; i < _colIndexes.length; i++) + _reference[i] = in.readDouble(); + } + + @Override + public long getExactSizeOnDisk() { + long ret = super.getExactSizeOnDisk(); + ret += _data.getExactSizeOnDisk(); + ret += _indexes.getExactSizeOnDisk(); + ret += 8 * _colIndexes.length; // reference values. 
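+ // Keep in sync with write()/readFields(): offsets and mapping data, plus one 8-byte double per reference column.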
+ return ret; + } + + @Override + public AColGroup replace(double pattern, double replace) { + boolean patternInReference = false; + for(double d : _reference) + if(pattern == d) { + patternInReference = true; + break; + } + + if(patternInReference) { + throw new NotImplementedException("replace is not implemented for patterns that occur in the reference values"); + // _dict.replace(pattern, replace, _reference, _newReplace); + } + else { + final ADictionary newDict = _dict.replace(pattern, replace, _reference); + return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), _reference); + } + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(super.toString()); + sb.append(String.format("\n%15s ", "Indexes: ")); + sb.append(_indexes.toString()); + sb.append(String.format("\n%15s ", "Data: ")); + sb.append(_data); + sb.append(String.format("\n%15s ", "Reference:")); + sb.append(Arrays.toString(_reference)); + return sb.toString(); + } + + @Override + protected double computeMxx(double c, Builtin builtin) { + return _dict.aggregate(c, builtin, _reference); + } + + @Override + protected void computeColMxx(double[] c, Builtin builtin) { + _dict.aggregateCols(c, builtin, _colIndexes, _reference); + } + + @Override + protected void computeSum(double[] c, int nRows) { + super.computeSum(c, nRows); + final double refSum = refSum(); + c[0] += refSum * nRows; + } + + @Override + public void computeColSums(double[] c, int nRows) { + super.computeColSums(c, nRows); + for(int i = 0; i < _colIndexes.length; i++) + c[_colIndexes[i]] += _reference[i] * nRows; + } + + @Override + protected void computeSumSq(double[] c, int nRows) { + c[0] += _dict.sumSq(getCounts(), _reference); + } + + @Override + protected void computeColSumsSq(double[] c, int nRows) { + _dict.colSumSq(c, getCounts(), _colIndexes, _reference); + } + + @Override + protected void computeProduct(double[] c, int nRows) { + throw new NotImplementedException("Not Implemented PFOR"); + } + + @Override + protected void computeRowProduct(double[] c, int rl, int ru) { + throw new NotImplementedException("Not Implemented PFOR"); + } + + @Override + protected void computeColProduct(double[] c, int nRows) { + throw new NotImplementedException("Not Implemented PFOR"); + } + + @Override + protected AColGroup sliceSingleColumn(int idx) { + ColGroupPFOR ret = (ColGroupPFOR) super.sliceSingleColumn(idx); + // select values from double array. 
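+ // Note: super.sliceSingleColumn is assumed to slice the dictionary and keep the shared offset and mapping structures, so only the single matching reference entry needs to be carried over here.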
+ ret._reference = new double[1]; + ret._reference[0] = _reference[idx]; + return ret; + } + + @Override + protected AColGroup sliceMultiColumns(int idStart, int idEnd, int[] outputCols) { + ColGroupPFOR ret = (ColGroupPFOR) super.sliceMultiColumns(idStart, idEnd, outputCols); + final int len = idEnd - idStart; + ret._reference = new double[len]; + for(int i = 0, ii = idStart; i < len; i++, ii++) + ret._reference[i] = _reference[ii]; + + return ret; + } + + @Override + public boolean containsValue(double pattern) { + if(pattern == 0 && _zeros) + return true; + else if(Double.isNaN(pattern) || Double.isInfinite(pattern)) + return containsInfOrNan(pattern) || _dict.containsValue(pattern); + else + return _dict.containsValue(pattern, _reference); + } + + private boolean containsInfOrNan(double pattern) { + if(Double.isNaN(pattern)) { + for(double d : _reference) + if(Double.isNaN(d)) + return true; + return false; + } + else { + for(double d : _reference) + if(Double.isInfinite(d)) + return true; + return false; + } + } + + @Override + public long getNumberNonZeros(int nRows) { + int[] counts = getCounts(); + return (long) _dict.getNumberNonZeros(counts, _reference, nRows); + } + + @Override + public AColGroup extractCommon(double[] constV) { + for(int i = 0; i < _colIndexes.length; i++) + constV[_colIndexes[i]] += _reference[i]; + return ColGroupSDCZeros.create(_colIndexes, _numRows, _dict, _indexes, _data, getCounts()); + } +} diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java index 3d69b9662aa..3ee843468ec 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java @@ -24,7 +24,6 @@ import java.util.List; import org.apache.commons.lang.NotImplementedException; -import org.apache.sysds.runtime.compress.CompressionSettings; import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.data.SparseBlock; @@ -146,7 +145,7 @@ public AColGroup scalarOperation(ScalarOperator op) { // fast path: sparse-safe operations // Note that bitmaps don't change and are shallow-copied if(op.sparseSafe || val0 == 0 || !_zeros) { - return new ColGroupRLE(_colIndexes, _numRows, _zeros, applyScalarOp(op), _data, _ptr, getCachedCounts()); + return new ColGroupRLE(_colIndexes, _numRows, _zeros, _dict.applyScalarOp(op), _data, _ptr, getCachedCounts()); } // slow path: sparse-unsafe operations (potentially create new bitmap) @@ -154,10 +153,10 @@ public AColGroup scalarOperation(ScalarOperator op) { boolean[] lind = computeZeroIndicatorVector(); int[] loff = computeOffsets(lind); if(loff.length == 0) { // empty offset list: go back to fast path - return new ColGroupRLE(_colIndexes, _numRows, false, applyScalarOp(op), _data, _ptr, getCachedCounts()); + return new ColGroupRLE(_colIndexes, _numRows, false, _dict.applyScalarOp(op), _data, _ptr, getCachedCounts()); } - ADictionary rvalues = applyScalarOp(op, val0, getNumCols()); + ADictionary rvalues = _dict.applyScalarOp(op, val0, getNumCols()); char[] lbitmap = genRLEBitmap(loff, loff.length); char[] rbitmaps = Arrays.copyOf(_data, _data.length + lbitmap.length); @@ -217,73 +216,143 @@ public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSa // } @Override - protected void computeRowSums(double[] c, boolean square, int rl, int ru) { + 
protected void computeRowSums(double[] c, int rl, int ru) { + throw new NotImplementedException(); + // final int numVals = getNumValues(); - final int numVals = getNumValues(); + // if(numVals > 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) { + // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ; + + // // step 1: prepare position and value arrays + + // // current pos / values per RLE list + // int[] astart = new int[numVals]; + // int[] apos = skipScan(numVals, rl, astart); + // double[] aval = _dict.sumAllRowsToDouble(square, _colIndexes.length); + + // // step 2: cache conscious matrix-vector via horizontal scans + // for(int bi = rl; bi < ru; bi += blksz) { + // int bimax = Math.min(bi + blksz, ru); + + // // horizontal segment scan, incl pos maintenance + // for(int k = 0; k < numVals; k++) { + // int boff = _ptr[k]; + // int blen = len(k); + // double val = aval[k]; + // int bix = apos[k]; + // int start = astart[k]; + + // // compute partial results, not aligned + // while(bix < blen) { + // int lstart = _data[boff + bix]; + // int llen = _data[boff + bix + 1]; + // int from = Math.max(bi, start + lstart); + // int to = Math.min(start + lstart + llen, bimax); + // for(int rix = from; rix < to; rix++) + // c[rix] += val; + + // if(start + lstart + llen >= bimax) + // break; + // start += lstart + llen; + // bix += 2; + // } - if(numVals > 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) { - final int blksz = CompressionSettings.BITMAP_BLOCK_SZ; - - // step 1: prepare position and value arrays - - // current pos / values per RLE list - int[] astart = new int[numVals]; - int[] apos = skipScan(numVals, rl, astart); - double[] aval = _dict.sumAllRowsToDouble(square, _colIndexes.length); - - // step 2: cache conscious matrix-vector via horizontal scans - for(int bi = rl; bi < ru; bi += blksz) { - int bimax = Math.min(bi + blksz, ru); - - // horizontal segment scan, incl pos maintenance - for(int k = 0; k < numVals; k++) { - int boff = _ptr[k]; - int blen = len(k); - double val = aval[k]; - int bix = apos[k]; - int start = astart[k]; - - // compute partial results, not aligned - while(bix < blen) { - int lstart = _data[boff + bix]; - int llen = _data[boff + bix + 1]; - int from = Math.max(bi, start + lstart); - int to = Math.min(start + lstart + llen, bimax); - for(int rix = from; rix < to; rix++) - c[rix] += val; - - if(start + lstart + llen >= bimax) - break; - start += lstart + llen; - bix += 2; - } - - apos[k] = bix; - astart[k] = start; - } - } - } - else { - for(int k = 0; k < numVals; k++) { - int boff = _ptr[k]; - int blen = len(k); - double val = _dict.sumRow(k, square, _colIndexes.length); - - if(val != 0.0) { - Pair tmp = skipScanVal(k, rl); - int bix = tmp.getKey(); - int curRunStartOff = tmp.getValue(); - int curRunEnd = tmp.getValue(); - for(; bix < blen && curRunEnd < ru; bix += 2) { - curRunStartOff = curRunEnd + _data[boff + bix]; - curRunEnd = curRunStartOff + _data[boff + bix + 1]; - for(int rix = curRunStartOff; rix < curRunEnd && rix < ru; rix++) - c[rix] += val; - - } - } - } - } + // apos[k] = bix; + // astart[k] = start; + // } + // } + // } + // else { + // for(int k = 0; k < numVals; k++) { + // int boff = _ptr[k]; + // int blen = len(k); + // double val = _dict.sumRow(k, square, _colIndexes.length); + + // if(val != 0.0) { + // Pair tmp = skipScanVal(k, rl); + // int bix = tmp.getKey(); + // int curRunStartOff = tmp.getValue(); + // int curRunEnd = tmp.getValue(); + // for(; bix < blen && curRunEnd < ru; bix += 2) { + // curRunStartOff = 
curRunEnd + _data[boff + bix]; + // curRunEnd = curRunStartOff + _data[boff + bix + 1]; + // for(int rix = curRunStartOff; rix < curRunEnd && rix < ru; rix++) + // c[rix] += val; + + // } + // } + // } + // } + } + + @Override + protected void computeRowSumsSq(double[] c, int rl, int ru) { + throw new NotImplementedException(); + // final int numVals = getNumValues(); + + // if(numVals > 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) { + // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ; + + // // step 1: prepare position and value arrays + + // // current pos / values per RLE list + // int[] astart = new int[numVals]; + // int[] apos = skipScan(numVals, rl, astart); + // double[] aval = _dict.sumAllRowsToDouble(square, _colIndexes.length); + + // // step 2: cache conscious matrix-vector via horizontal scans + // for(int bi = rl; bi < ru; bi += blksz) { + // int bimax = Math.min(bi + blksz, ru); + + // // horizontal segment scan, incl pos maintenance + // for(int k = 0; k < numVals; k++) { + // int boff = _ptr[k]; + // int blen = len(k); + // double val = aval[k]; + // int bix = apos[k]; + // int start = astart[k]; + + // // compute partial results, not aligned + // while(bix < blen) { + // int lstart = _data[boff + bix]; + // int llen = _data[boff + bix + 1]; + // int from = Math.max(bi, start + lstart); + // int to = Math.min(start + lstart + llen, bimax); + // for(int rix = from; rix < to; rix++) + // c[rix] += val; + + // if(start + lstart + llen >= bimax) + // break; + // start += lstart + llen; + // bix += 2; + // } + + // apos[k] = bix; + // astart[k] = start; + // } + // } + // } + // else { + // for(int k = 0; k < numVals; k++) { + // int boff = _ptr[k]; + // int blen = len(k); + // double val = _dict.sumRow(k, square, _colIndexes.length); + + // if(val != 0.0) { + // Pair tmp = skipScanVal(k, rl); + // int bix = tmp.getKey(); + // int curRunStartOff = tmp.getValue(); + // int curRunEnd = tmp.getValue(); + // for(; bix < blen && curRunEnd < ru; bix += 2) { + // curRunStartOff = curRunEnd + _data[boff + bix]; + // curRunEnd = curRunStartOff + _data[boff + bix + 1]; + // for(int rix = curRunStartOff; rix < curRunEnd && rix < ru; rix++) + // c[rix] += val; + + // } + // } + // } + // } } @Override @@ -395,7 +464,7 @@ public double getIdx(int r, int colIdx) { * @return array of positions for all values */ private int[] skipScan(int numVals, int rl, int[] astart) { - int[] apos = allocIVector(numVals, rl == 0); + int[] apos = new int[numVals]; if(rl > 0) { // rl aligned with blksz for(int k = 0; k < numVals; k++) { diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java index fc011e082a1..9aef4313406 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java @@ -23,18 +23,13 @@ import java.io.DataOutput; import java.io.IOException; -import org.apache.commons.lang.NotImplementedException; -import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData; import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory; import org.apache.sysds.runtime.compress.colgroup.offset.AIterator; import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; import 
org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory; -import org.apache.sysds.runtime.data.DenseBlock; -import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.functionobjects.Builtin; -import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.operators.BinaryOperator; import org.apache.sysds.runtime.matrix.operators.ScalarOperator; @@ -46,7 +41,7 @@ * This column group is handy in cases where sparse unsafe operations is executed on very sparse columns. Then the zeros * would be materialized in the group without any overhead. */ -public class ColGroupSDC extends AColGroupValue { +public class ColGroupSDC extends AMorphingMMColGroup { private static final long serialVersionUID = 769993538831949086L; /** * Sparse row indexes for the data @@ -66,7 +61,7 @@ protected ColGroupSDC(int numRows) { super(numRows); } - protected ColGroupSDC(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data, + private ColGroupSDC(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data, int[] cachedCounts) { super(colIndices, numRows, dict, cachedCounts); _indexes = offsets; @@ -74,6 +69,14 @@ protected ColGroupSDC(int[] colIndices, int numRows, ADictionary dict, AOffset o _zeros = false; } + protected static AColGroup create(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data, + int[] cachedCounts) { + if(dict == null) + return new ColGroupEmpty(colIndices); + else + return new ColGroupSDC(colIndices, numRows, dict, offsets, data, cachedCounts); + } + @Override public CompressionType getCompType() { return CompressionType.SDC; @@ -85,183 +88,153 @@ public ColGroupType getColGroupType() { } @Override - protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, - double[] values) { + public double getIdx(int r, int colIdx) { + final AIterator it = _indexes.getIterator(r); final int nCol = _colIndexes.length; - final int offsetToDefault = values.length - nCol; - final AIterator it = _indexes.getIterator(rl); - - int offT = rl + offR; - int i = rl; - for(; i < ru && it.hasNext(); i++, offT++) { - final double[] c = db.values(offT); - final int off = db.pos(offT) + offC; - if(it.value() == i) { - int offset = _data.getIndex(it.getDataIndexAndIncrement()) * nCol; - for(int j = 0; j < nCol; j++) - c[off + _colIndexes[j]] += values[offset + j]; - } - else - for(int j = 0; j < nCol; j++) - c[off + _colIndexes[j]] += values[offsetToDefault + j]; - } - - for(; i < ru; i++, offT++) { - final double[] c = db.values(offT); - final int off = db.pos(offT) + offC; - for(int j = 0; j < nCol; j++) - c[off + _colIndexes[j]] += values[offsetToDefault + j]; - } - - _indexes.cacheIterator(it, ru); - } - - @Override - protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, - SparseBlock sb) { - throw new NotImplementedException(); - // final int offsetToDefault = sb.numRows() - 1; - // final int defApos = sb.pos(offsetToDefault); - // final int defAlen = sb.size(offsetToDefault) + defApos; - // final double[] defAvals = sb.values(offsetToDefault); - // final int[] defAix = sb.indexes(offsetToDefault); - // final DenseBlock db = target.getDenseBlock(); - - // int i = rl; - // AIterator it = _indexes.getIterator(rl); - // for(; i < ru && it.hasNext(); i++, offT++) { - // final double[] c = db.values(offT); - // final int off = db.pos(offT); - // if(it.value() == i) { - // int dictIndex = 
_data.getIndex(it.getDataIndexAndIncrement()); - // if(sb.isEmpty(dictIndex)) - // continue; - // final int apos = sb.pos(dictIndex); - // final int alen = sb.size(dictIndex) + apos; - // final double[] avals = sb.values(dictIndex); - // final int[] aix = sb.indexes(dictIndex); - // for(int j = apos; j < alen; j++) - // c[off + _colIndexes[aix[j]]] += avals[j]; - // } - // else - // for(int j = defApos; j < defAlen; j++) - // c[off + _colIndexes[defAix[j]]] += defAvals[j]; - // } - - // for(; i < ru; i++, offT++) { - // final double[] c = db.values(offT); - // final int off = db.pos(offT); - // for(int j = defApos; j < defAlen; j++) - // c[off + _colIndexes[defAix[j]]] += defAvals[j]; - // } - - // _indexes.cacheIterator(it, ru); + final int rowOff = it.value() == r ? _data.getIndex(it.getDataIndex()) * nCol : getNumValues() * nCol - nCol; + return _dict.getValue(rowOff + colIdx); } @Override - protected void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, - SparseBlock sb) { - throw new NotImplementedException(); - } + protected void computeRowSums(double[] c, int rl, int ru) { - @Override - protected void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, - double[] values) { - final int nCol = _colIndexes.length; - final int offsetToDefault = values.length - nCol; final AIterator it = _indexes.getIterator(rl); - - int offT = rl + offR; - int i = rl; - for(; i < ru && it.hasNext(); i++, offT++) { - // final double[] c = db.values(offT); - // final int off = db.pos(offT) + offC; - if(it.value() == i) { - int offset = _data.getIndex(it.getDataIndexAndIncrement()) * nCol; - for(int j = 0; j < nCol; j++) - ret.append(offT, _colIndexes[j] + offC, values[offset + j]); - // c[off + _colIndexes[j]] += values[offset + j]; + final int numVals = getNumValues(); + int r = rl; + final double[] vals = _dict.sumAllRowsToDouble(_colIndexes.length); + final double def = vals[numVals - 1]; + if(it != null && it.value() > ru) + _indexes.cacheIterator(it, ru); + else if(it != null && ru >= _indexes.getOffsetToLast()) { + final int maxId = _data.size() - 1; + while(true) { + if(it.value() == r) { + c[r] += vals[_data.getIndex(it.getDataIndex())]; + if(it.getDataIndex() < maxId) + it.next(); + else { + r++; + break; + } + } + else + c[r] += def; + r++; } - else - for(int j = 0; j < nCol; j++) - ret.append(offT, _colIndexes[j] + offC, values[offsetToDefault + j]); - // c[off + _colIndexes[j]] += values[offsetToDefault + j]; } - - for(; i < ru; i++, offT++) { - // final double[] c = db.values(offT); - // final int off = db.pos(offT) + offC; - for(int j = 0; j < nCol; j++) - ret.append(offT, _colIndexes[j] + offC, values[offsetToDefault + j]); - // c[off + _colIndexes[j]] += values[offsetToDefault + j]; + else if(it != null) { + while(it.isNotOver(ru)) { + if(it.value() == r) + c[r] += vals[_data.getIndex(it.getDataIndexAndIncrement())]; + else + c[r] += def; + r++; + } + _indexes.cacheIterator(it, ru); } - _indexes.cacheIterator(it, ru); + while(r < ru) { + c[r] += def; + r++; + } } @Override - public double getIdx(int r, int colIdx) { - final AIterator it = _indexes.getIterator(r); - final int nCol = _colIndexes.length; - final int rowOff = it.value() == r ? 
getIndex(it.getDataIndex()) * nCol : getNumValues() * nCol - nCol; - return _dict.getValue(rowOff + colIdx); + protected void computeRowSumsSq(double[] c, int rl, int ru) { + final double[] vals = _dict.sumAllRowsToDoubleSq(_colIndexes.length); + computeRowSumsSq(c, rl, ru, vals, _data, _indexes, _numRows); } - @Override - protected void computeRowSums(double[] c, boolean square, int rl, int ru) { - final int numVals = getNumValues(); - // // pre-aggregate nnz per value tuple - double[] vals = _dict.sumAllRowsToDouble(square, _colIndexes.length); - - int rix = rl; - AIterator it = _indexes.getIterator(rl); - for(; rix < ru && it.hasNext(); rix++) { - if(it.value() != rix) - c[rix] += vals[numVals - 1]; - else { - c[rix] += vals[_data.getIndex(it.getDataIndexAndIncrement())]; + protected static final void computeRowSumsSq(double[] c, int rl, int ru, double[] vals, AMapToData data, + AOffset indexes, int nRows) { + int r = rl; + final AIterator it = indexes.getIterator(rl); + final double def = vals[vals.length - 1]; + if(it != null && it.value() > ru) + indexes.cacheIterator(it, ru); + else if(it != null && ru >= indexes.getOffsetToLast()) { + final int maxId = data.size() - 1; + while(true) { + if(it.value() == r) { + c[r] += vals[data.getIndex(it.getDataIndex())]; + if(it.getDataIndex() < maxId) + it.next(); + else { + r++; + break; + } + } + else + c[r] += def; + r++; } } - for(; rix < ru; rix++) { - c[rix] += vals[numVals - 1]; + else if(it != null) { + while(r < ru) { + if(it.value() == r) + c[r] += vals[data.getIndex(it.getDataIndexAndIncrement())]; + else + c[r] += def; + r++; + } + indexes.cacheIterator(it, ru); } + while(r < ru) { + c[r] += def; + r++; + } } @Override protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { - final int numVals = getNumValues(); - final double[] vals = _dict.aggregateTuples(builtin, _colIndexes.length); - final AIterator it = _indexes.getIterator(rl); - int rix = rl; + final double[] vals = _dict.aggregateRows(builtin, _colIndexes.length); + computeRowMxx(c, builtin, rl, ru, vals, _data, _indexes, _numRows, vals[vals.length - 1]); + } - for(; rix < ru && it.hasNext(); rix++) { - if(it.value() != rix) - c[rix] = builtin.execute(c[rix], vals[numVals - 1]); - else - c[rix] = builtin.execute(c[rix], vals[_data.getIndex(it.getDataIndexAndIncrement())]); + protected static final void computeRowMxx(double[] c, Builtin builtin, int rl, int ru, double[] vals, + AMapToData data, AOffset indexes, int nRows, double def) { + int r = rl; + final AIterator it = indexes.getIterator(rl); + if(it != null && it.value() > ru) + indexes.cacheIterator(it, ru); + else if(it != null && ru >= indexes.getOffsetToLast()) { + final int maxId = data.size() - 1; + while(true) { + if(it.value() == r) { + c[r] = builtin.execute(c[r], vals[data.getIndex(it.getDataIndex())]); + if(it.getDataIndex() < maxId) + it.next(); + else { + r++; + break; + } + } + else + c[r] = builtin.execute(c[r], def); + r++; + } + } + else if(it != null) { + while(r < ru) { + if(it.value() == r) + c[r] = builtin.execute(c[r], vals[data.getIndex(it.getDataIndexAndIncrement())]); + else + c[r] = builtin.execute(c[r], def); + r++; + } + indexes.cacheIterator(it, ru); } - // cover remaining rows with default value - for(; rix < ru; rix++) - c[rix] = builtin.execute(c[rix], vals[numVals - 1]); + while(r < ru) { + c[r] = builtin.execute(c[r], def); + r++; + } } @Override public int[] getCounts(int[] counts) { - final int nonDefaultLength = _data.size(); - // final AIterator it = 
_indexes.getIterator(); - final int defaults = _numRows - nonDefaultLength; - for(int i = 0; i < nonDefaultLength; i++) - counts[_data.getIndex(i)]++; - - counts[counts.length - 1] += defaults; - - return counts; - } - - public int getIndex(int r) { - return _data.getIndex(r); + return _data.getCounts(counts, _numRows); } @Override @@ -274,19 +247,19 @@ public long estimateInMemorySize() { @Override public AColGroup scalarOperation(ScalarOperator op) { - return new ColGroupSDC(_colIndexes, _numRows, applyScalarOp(op), _indexes, _data, getCachedCounts()); + return create(_colIndexes, _numRows, _dict.applyScalarOp(op), _indexes, _data, getCachedCounts()); } @Override public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) { ADictionary ret = _dict.binOpLeft(op, v, _colIndexes); - return new ColGroupSDC(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); + return create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); } @Override public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) { ADictionary ret = _dict.binOpRight(op, v, _colIndexes); - return new ColGroupSDC(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); + return create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); } @Override @@ -311,40 +284,17 @@ public long getExactSizeOnDisk() { return ret; } - public ColGroupSDCZeros extractCommon(double[] constV) { + @Override + public AColGroup extractCommon(double[] constV) { double[] commonV = _dict.getTuple(getNumValues() - 1, _colIndexes.length); if(commonV == null) // The common tuple was all zero. Therefore this column group should never have been SDC. - return new ColGroupSDCZeros(_colIndexes, _numRows, _dict, _indexes, _data, getCounts()); + return ColGroupSDCZeros.create(_colIndexes, _numRows, _dict, _indexes, _data, getCounts()); for(int i = 0; i < _colIndexes.length; i++) constV[_colIndexes[i]] += commonV[i]; ADictionary subtractedDict = _dict.subtractTuple(commonV); - return new ColGroupSDCZeros(_colIndexes, _numRows, subtractedDict, _indexes, _data, getCounts()); - } - - @Override - public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) { - // This method should not be called since if there is a matrix multiplication - // the default value is transformed to be zero, and this column group would be allocated as a - // SDC Zeros version - throw new DMLCompressionException("This method should never be called"); - } - - @Override - public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result) { - // This method should not be called since if there is a matrix multiplication - // the default value is transformed to be zero, and this column group would be allocated as a - // SDC Zeros version - throw new DMLCompressionException("This method should never be called"); - } - - @Override - public void tsmmAColGroup(AColGroup other, MatrixBlock result) { - // This method should not be called since if there is a matrix multiplication - // the default value is transformed to be zero, and this column group would be allocated as a - // SDC Zeros version - throw new DMLCompressionException("This method should never be called"); + return ColGroupSDCZeros.create(_colIndexes, _numRows, subtractedDict, _indexes, _data, getCounts()); } @Override diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java index cb123eca99c..c3f19c5ddad 
100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java @@ -23,16 +23,11 @@ import java.io.DataOutput; import java.io.IOException; -import org.apache.commons.lang.NotImplementedException; -import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; import org.apache.sysds.runtime.compress.colgroup.offset.AIterator; import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory; -import org.apache.sysds.runtime.data.DenseBlock; -import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.functionobjects.Builtin; -import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.operators.BinaryOperator; import org.apache.sysds.runtime.matrix.operators.ScalarOperator; @@ -44,7 +39,7 @@ * This column group is handy in cases where sparse unsafe operations is executed on very sparse columns. Then the zeros * would be materialized in the group without any overhead. */ -public class ColGroupSDCSingle extends AColGroupValue { +public class ColGroupSDCSingle extends AMorphingMMColGroup { private static final long serialVersionUID = 3883228464052204200L; /** * Sparse row indexes for the data @@ -76,126 +71,185 @@ public ColGroupType getColGroupType() { return ColGroupType.SDCSingle; } - @Override - protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, - double[] values) { - final int nCol = _colIndexes.length; - final int offsetToDefault = values.length - nCol; - final AIterator it = _indexes.getIterator(rl); - - int offT = rl + offR; - int i = rl; - for(; i < ru && it.hasNext(); i++, offT++) { - final double[] c = db.values(offT); - final int off = db.pos(offT) + offC; - if(it.value() == i) { - for(int j = 0; j < nCol; j++) - c[off + _colIndexes[j]] += values[j]; - it.next(); - } - else - for(int j = 0; j < nCol; j++) - c[off + _colIndexes[j]] += values[offsetToDefault + j]; - } - - for(; i < ru; i++, offT++) { - final double[] c = db.values(offT); - final int off = db.pos(offT) + offC; - for(int j = 0; j < nCol; j++) - c[off + _colIndexes[j]] += values[offsetToDefault + j]; - } + // @Override + // protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, + // double[] values) { + // final int nCol = _colIndexes.length; + // final int offsetToDefault = values.length - nCol; + // final AIterator it = _indexes.getIterator(rl); + + // int offT = rl + offR; + // int i = rl; + // for(; i < ru && it.hasNext(); i++, offT++) { + // final double[] c = db.values(offT); + // final int off = db.pos(offT) + offC; + // if(it.value() == i) { + // for(int j = 0; j < nCol; j++) + // c[off + _colIndexes[j]] += values[j]; + // it.next(); + // } + // else + // for(int j = 0; j < nCol; j++) + // c[off + _colIndexes[j]] += values[offsetToDefault + j]; + // } + + // for(; i < ru; i++, offT++) { + // final double[] c = db.values(offT); + // final int off = db.pos(offT) + offC; + // for(int j = 0; j < nCol; j++) + // c[off + _colIndexes[j]] += values[offsetToDefault + j]; + // } + + // _indexes.cacheIterator(it, ru); + // } + + // @Override + // protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, + // SparseBlock values) { + // throw new 
NotImplementedException(); + // } + + // @Override + // protected void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, + // SparseBlock sb) { + // throw new NotImplementedException(); + // } + + // @Override + // protected void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, + // double[] values) { + // final int nCol = _colIndexes.length; + // final int offsetToDefault = values.length - nCol; + // final AIterator it = _indexes.getIterator(rl); + + // int offT = rl + offR; + // int i = rl; + // for(; i < ru && it.hasNext(); i++, offT++) { + // if(it.value() == i) { + // for(int j = 0; j < nCol; j++) + // ret.append(offT, _colIndexes[j] + offC, values[j]); + // it.next(); + // } + // else + // for(int j = 0; j < nCol; j++) + // ret.append(offT, _colIndexes[j] + offC, values[offsetToDefault + j]); + // } + + // for(; i < ru; i++, offT++) + // for(int j = 0; j < nCol; j++) + // ret.append(offT, _colIndexes[j] + offC, values[offsetToDefault + j]); + + // _indexes.cacheIterator(it, ru); + // } - _indexes.cacheIterator(it, ru); + @Override + public double getIdx(int r, int colIdx) { + AIterator it = _indexes.getIterator(r); + if(it.value() == r) + return _dict.getValue(colIdx); + else + return _dict.getValue(_colIndexes.length + colIdx); } @Override - protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, - SparseBlock values) { - throw new NotImplementedException(); + protected void computeRowSums(double[] c, int rl, int ru) { + final double[] vals = _dict.sumAllRowsToDouble(_colIndexes.length); + computeRowSums(c, rl, ru, vals); } @Override - protected void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, - SparseBlock sb) { - throw new NotImplementedException(); + protected void computeRowSumsSq(double[] c, int rl, int ru) { + final double[] vals = _dict.sumAllRowsToDoubleSq(_colIndexes.length); + computeRowSums(c, rl, ru, vals); } - @Override - protected void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, - double[] values) { - final int nCol = _colIndexes.length; - final int offsetToDefault = values.length - nCol; + protected void computeRowSums(double[] c, int rl, int ru, double[] vals) { + int r = rl; final AIterator it = _indexes.getIterator(rl); - - int offT = rl + offR; - int i = rl; - for(; i < ru && it.hasNext(); i++, offT++) { - if(it.value() == i) { - for(int j = 0; j < nCol; j++) - ret.append(offT, _colIndexes[j] + offC, values[j]); - it.next(); + final double def = vals[1]; + final double norm = vals[0]; + if(it != null && it.value() > ru) + _indexes.cacheIterator(it, ru); + else if(it != null && ru >= _indexes.getOffsetToLast()) { + final int maxOff = _indexes.getOffsetToLast(); + while(true) { + if(it.value() == r) { + c[r] += norm; + if(it.value() < maxOff) + it.next(); + else { + r++; + break; + } + } + else + c[r] += def; + r++; } - else - for(int j = 0; j < nCol; j++) - ret.append(offT, _colIndexes[j] + offC, values[offsetToDefault + j]); + } + else if(it != null) { + while(r < ru) { + if(it.value() == r) + c[r] += norm; + else + c[r] += def; + r++; + } + _indexes.cacheIterator(it, ru); } - for(; i < ru; i++, offT++) - for(int j = 0; j < nCol; j++) - ret.append(offT, _colIndexes[j] + offC, values[offsetToDefault + j]); - - _indexes.cacheIterator(it, ru); + while(r < ru) { + c[r] += def; + r++; + } } @Override - public double getIdx(int r, int colIdx) 
{ - AIterator it = _indexes.getIterator(r); - if(it.value() == r) - return _dict.getValue(colIdx); - else - return _dict.getValue(_colIndexes.length + colIdx); + protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { + final double[] vals = _dict.aggregateRows(builtin, _colIndexes.length); + computeRowMxx(c, builtin, rl, ru, _indexes, _numRows, vals[1], vals[0]); } - @Override - protected void computeRowSums(double[] c, boolean square, int rl, int ru) { - - // // pre-aggregate nnz per value tuple - final double[] vals = _dict.sumAllRowsToDouble(square, _colIndexes.length); - final AIterator it = _indexes.getIterator(); - - int rix = rl; - it.skipTo(rl); - for(; rix < ru && it.hasNext(); rix++) { - if(it.value() != rix) - c[rix] += vals[1]; - else { - c[rix] += vals[0]; - it.next(); + protected static final void computeRowMxx(double[] c, Builtin builtin, int rl, int ru, AOffset indexes, int nRows, + double def, double norm) { + int r = rl; + final AIterator it = indexes.getIterator(rl); + if(it != null && it.value() > ru) + indexes.cacheIterator(it, ru); + else if(it != null && ru >= indexes.getOffsetToLast()) { + final int maxOff = indexes.getOffsetToLast(); + while(true) { + if(it.value() == r) { + c[r] = builtin.execute(c[r], norm); + if(it.value() < maxOff) + it.next(); + else { + r++; + break; + } + } + else + c[r] = builtin.execute(c[r], def); + r++; } } - for(; rix < ru; rix++) { - c[rix] += vals[1]; - } - } - - @Override - protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { - final double[] vals = _dict.aggregateTuples(builtin, _colIndexes.length); - final AIterator it = _indexes.getIterator(rl); - int rix = rl; - - for(; rix < ru && it.hasNext(); rix++) { - if(it.value() != rix) - c[rix] = builtin.execute(c[rix], vals[1]); - else { - c[rix] = builtin.execute(c[rix], vals[0]); - it.next(); + else if(it != null) { + while(r < ru) { + if(it.value() == r) { + c[r] = builtin.execute(c[r], norm); + it.next(); + } + else + c[r] = builtin.execute(c[r], def); + r++; } + indexes.cacheIterator(it, ru); } - // cover remaining rows with default value - for(; rix < ru; rix++) - c[rix] = builtin.execute(c[rix], vals[1]); + while(r < ru) { + c[r] = builtin.execute(c[r], def); + r++; + } } @Override @@ -214,7 +268,7 @@ public long estimateInMemorySize() { @Override public AColGroup scalarOperation(ScalarOperator op) { - return new ColGroupSDCSingle(_colIndexes, _numRows, applyScalarOp(op), _indexes, getCachedCounts()); + return new ColGroupSDCSingle(_colIndexes, _numRows, _dict.applyScalarOp(op), _indexes, getCachedCounts()); } @Override @@ -248,6 +302,7 @@ public long getExactSizeOnDisk() { return ret; } + @Override public ColGroupSDCSingleZeros extractCommon(double[] constV) { double[] commonV = _dict.getTuple(getNumValues() - 1, _colIndexes.length); @@ -261,30 +316,6 @@ public ColGroupSDCSingleZeros extractCommon(double[] constV) { return new ColGroupSDCSingleZeros(_colIndexes, _numRows, subtractedDict, _indexes, getCachedCounts()); } - @Override - public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) { - // This method should not be called since if there is a matrix multiplication - // the default value is transformed to be zero, and this column group would be allocated as a - // SDC Zeros version - throw new DMLCompressionException("This method should never be called"); - } - - @Override - public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result) { - // This method should not be called since if there is a 
matrix multiplication - // the default value is transformed to be zero, and this column group would be allocated as a - // SDC Zeros version - throw new DMLCompressionException("This method should never be called"); - } - - @Override - public void tsmmAColGroup(AColGroup other, MatrixBlock result) { - // This method should not be called since if there is a matrix multiplication - // the default value is transformed to be zero, and this column group would be allocated as a - // SDC Zeros version - throw new DMLCompressionException("This method should never be called"); - } - @Override public String toString() { StringBuilder sb = new StringBuilder(); diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java index d8edd0d3c7c..534856735b4 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java @@ -46,9 +46,7 @@ */ public class ColGroupSDCSingleZeros extends APreAgg { private static final long serialVersionUID = 8033235615964315078L; - /** - * Sparse row indexes for the data - */ + /** Sparse row indexes for the data */ protected transient AOffset _indexes; /** @@ -80,64 +78,129 @@ public ColGroupType getColGroupType() { @Override protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, double[] values) { - final int nCol = _colIndexes.length; + final AIterator it = _indexes.getIterator(rl); - while(it.hasNext() && it.value() < ru) { - final int row = offR + it.value(); - final double[] c = db.values(row); - final int off = db.pos(row) + offC; - for(int j = 0; j < nCol; j++) - c[off + _colIndexes[j]] += values[j]; - - it.next(); + if(it == null) + return; + else if(it.value() >= ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + final int maxOff = _indexes.getOffsetToLast(); + final int nCol = _colIndexes.length; + while(true) { + final int row = offR + it.value(); + final double[] c = db.values(row); + final int off = db.pos(row); + for(int j = 0; j < nCol; j++) + c[off + _colIndexes[j] + offC] += values[j]; + if(it.value() < maxOff) + it.next(); + else + break; + } } - _indexes.cacheIterator(it, ru); + else { + final int nCol = _colIndexes.length; + while(it.isNotOver(ru)) { + final int row = offR + it.value(); + final double[] c = db.values(row); + final int off = db.pos(row); + for(int j = 0; j < nCol; j++) + c[off + _colIndexes[j] + offC] += values[j]; + + it.next(); + } + _indexes.cacheIterator(it, ru); + } + } @Override protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, - SparseBlock values) { - throw new NotImplementedException(); - // final int offTCorr = offT - rl; - // final DenseBlock db = target.getDenseBlock(); - // final int apos = values.pos(0); - // final int alen = values.size(0) + apos; - // final int[] aix = values.indexes(0); - // final double[] avals = values.values(0); - - // AIterator it = _indexes.getIterator(rl); - // while(it.hasNext() && it.value() < ru) { - // final int idx = offTCorr + it.value(); - // final double[] c = db.values(idx); - // final int off = db.pos(idx); - - // for(int j = apos; j < alen; j++) - // c[off + _colIndexes[aix[j]]] += avals[j]; - - // it.next(); - // } - - // _indexes.cacheIterator(it, ru); + SparseBlock sb) { + final AIterator it = 
_indexes.getIterator(rl); + if(it == null) + return; + else if(it.value() >= ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + throw new NotImplementedException(); + } + else { + final int apos = sb.pos(0); + final int alen = sb.size(0) + apos; + final int[] aix = sb.indexes(0); + final double[] avals = sb.values(0); + while(it.isNotOver(ru)) { + final int row = offR + it.value(); + final double[] c = db.values(row); + final int off = db.pos(row); + for(int j = apos; j < alen; j++) + c[off + _colIndexes[aix[j]] + offC] += avals[j]; + it.next(); + } + _indexes.cacheIterator(it, ru); + } } @Override protected void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, SparseBlock sb) { - throw new NotImplementedException(); + final AIterator it = _indexes.getIterator(rl); + if(it == null) + return; + else if(it.value() >= ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + throw new NotImplementedException(); + } + else { + final int apos = sb.pos(0); + final int alen = sb.size(0) + apos; + final int[] aix = sb.indexes(0); + final double[] avals = sb.values(0); + while(it.isNotOver(ru)) { + final int row = offR + it.value(); + for(int j = apos; j < alen; j++) + ret.append(row, _colIndexes[aix[j]] + offC, avals[j]); + + it.next(); + } + _indexes.cacheIterator(it, ru); + } } @Override protected void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, double[] values) { - final int nCol = _colIndexes.length; final AIterator it = _indexes.getIterator(rl); - while(it.hasNext() && it.value() < ru) { - final int row = offR + it.value(); - for(int j = 0; j < nCol; j++) - ret.append(row, _colIndexes[j] + offC, values[j]); - it.next(); + if(it == null) + return; + else if(it.value() >= ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + final int nCol = _colIndexes.length; + final int lastOff = _indexes.getOffsetToLast(); + while(true) { + final int row = offR + it.value(); + for(int j = 0; j < nCol; j++) + ret.append(row, _colIndexes[j] + offC, values[j]); + if(it.value() == lastOff) + return; + it.next(); + } + } + else { + final int nCol = _colIndexes.length; + while(it.isNotOver(ru)) { + final int row = offR + it.value(); + for(int j = 0; j < nCol; j++) + ret.append(row, _colIndexes[j] + offC, values[j]); + + it.next(); + } + _indexes.cacheIterator(it, ru); } - _indexes.cacheIterator(it, ru); } @Override @@ -150,34 +213,45 @@ public double getIdx(int r, int colIdx) { } @Override - protected void computeRowSums(double[] c, boolean square, int rl, int ru) { - final double vals = _dict.sumAllRowsToDouble(square, _colIndexes.length)[0]; - final AIterator it = _indexes.getIterator(rl); - while(it.hasNext() && it.value() < ru) { - c[it.value()] += vals; - it.next(); - } - + protected void computeRowSums(double[] c, int rl, int ru) { + final double def = _dict.sumAllRowsToDouble(_colIndexes.length)[0]; + computeRowSum(c, rl, ru, def); } @Override - protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { - final double vals = _dict.aggregateTuples(builtin, _colIndexes.length)[0]; - final AIterator it = _indexes.getIterator(rl); - int rix = rl; + protected void computeRowSumsSq(double[] c, int rl, int ru) { + final double def = _dict.sumAllRowsToDoubleSq(_colIndexes.length)[0]; + computeRowSum(c, rl, ru, def); + } - for(; rix < ru && it.hasNext(); rix++) { - if(it.value() != rix) - c[rix] = 
builtin.execute(c[rix], 0); - else { - c[rix] = builtin.execute(c[rix], vals); + protected void computeRowSum(double[] c, int rl, int ru, double def) { + final AIterator it = _indexes.getIterator(rl); + if(it == null) + return; + else if(it.value() > ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + final int maxOff = _indexes.getOffsetToLast(); + while(true) { + c[it.value()] += def; + if(it.value() == maxOff) + break; + it.next(); + } + } + else { + while(it.isNotOver(ru)) { + c[it.value()] += def; it.next(); } + _indexes.cacheIterator(it, ru); } + } - // cover remaining rows - for(; rix < ru; rix++) - c[rix] = builtin.execute(c[rix], 0); + @Override + protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { + final double[] vals = _dict.aggregateRows(builtin, _colIndexes.length); + ColGroupSDCSingle.computeRowMxx(c, builtin, rl, ru, _indexes, _numRows, 0, vals[0]); } @Override @@ -197,66 +271,88 @@ public void preAggregate(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) { @Override public void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru, int cl, int cu) { - final double[] mV = m.getDenseBlockValues(); - final double[] preAV = preAgg.getDenseBlockValues(); - final int numVals = getNumValues(); - final int blockSize = 2000; - for(int block = cl; block < cu; block += blockSize) { - final int blockEnd = Math.min(block + blockSize, cu); - final AIterator itStart = _indexes.getIterator(block); - AIterator it; - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) { - final int offLeft = rowLeft * _numRows; - it = itStart.clone(); - while(it.value() < blockEnd && it.hasNext()) { - final int i = it.value(); - preAV[offOut] += mV[offLeft + i]; - it.next(); - } + + final AIterator it = _indexes.getIterator(cl); + if(it == null) + return; + else if(it.value() > cu) + _indexes.cacheIterator(it, cu); + else if(rl == ru - 1) { + final int maxOff = _indexes.getOffsetToLast(); + final double[] mV = m.getDenseBlockValues(); + final double[] preAV = preAgg.getDenseBlockValues(); + final int offLeft = rl * _numRows; + while(true) { + final int i = it.value(); + preAV[0] += mV[offLeft + i]; + if(i == maxOff) + break; + it.next(); } } + else + throw new NotImplementedException(); + } private void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) { - final double[] preAV = preAgg.getDenseBlockValues(); - final double[] mV = m.getDenseBlockValues(); - final int numVals = getNumValues(); - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) { - final AIterator it = _indexes.getIterator(); - final int offLeft = rowLeft * _numRows; - while(it.hasNext()) { - final int i = it.value(); - preAV[offOut] += mV[offLeft + i]; + final AIterator it = _indexes.getIterator(); + if(rl == ru - 1) { + double ret = 0; + final DenseBlock db = m.getDenseBlock(); + final double[] mV = db.values(rl); + final int off = db.pos(rl); + final int offsetToLast = _indexes.getOffsetToLast(); + while(true) { + ret += mV[off + it.value()]; + if(it.value() == offsetToLast) + break; it.next(); } + + preAgg.setValue(0, 0, ret); } + else + throw new NotImplementedException(); } private void preAggregateSparse(SparseBlock sb, MatrixBlock preAgg, int rl, int ru) { - final double[] preAV = preAgg.getDenseBlockValues(); - final int numVals = getNumValues(); - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) { - if(sb.isEmpty(rowLeft)) - continue; - final AIterator it = 
_indexes.getIterator(); - final int apos = sb.pos(rowLeft); - final int alen = sb.size(rowLeft) + apos; - final int[] aix = sb.indexes(rowLeft); - final double[] avals = sb.values(rowLeft); + final AIterator it = _indexes.getIterator(); + if(rl == ru - 1) { + final int apos = sb.pos(rl); + final int alen = sb.size(rl) + apos; + final int[] aix = sb.indexes(rl); + final double[] avals = sb.values(rl); + final int offsetToLast = _indexes.getOffsetToLast(); + + double ret = 0; int j = apos; - while(it.hasNext() && j < alen) { - final int index = aix[j]; - final int v = it.value(); - if(index < v) - j++; - else if(index == v) { - preAV[offOut] += avals[j++]; + + while(true) { + final int idx = aix[j]; + + if(idx == it.value()) { + ret += avals[j++]; + if(j >= alen || it.value() >= offsetToLast) + break; it.next(); } - else + else if(idx < it.value()) { + j++; + if(j >= alen) + break; + } + else { + if(it.value() >= offsetToLast) + break; it.next(); + } } + + preAgg.setValue(0, 0, ret); } + else + throw new NotImplementedException(); } @Override @@ -271,9 +367,9 @@ public AColGroup scalarOperation(ScalarOperator op) { double val0 = op.executeScalar(0); boolean isSparseSafeOp = op.sparseSafe || val0 == 0; if(isSparseSafeOp) - return new ColGroupSDCSingleZeros(_colIndexes, _numRows, applyScalarOp(op), _indexes, getCachedCounts()); + return new ColGroupSDCSingleZeros(_colIndexes, _numRows, _dict.applyScalarOp(op), _indexes, getCachedCounts()); else { - ADictionary aDictionary = applyScalarOp(op, val0, getNumCols());// swapEntries(); + ADictionary aDictionary = _dict.applyScalarOp(op, val0, getNumCols());// swapEntries(); // ADictionary aDictionary = applyScalarOp(op, val0, getNumCols()); return new ColGroupSDCSingle(_colIndexes, _numRows, aDictionary, _indexes, null); } @@ -336,10 +432,15 @@ public boolean sameIndexStructure(AColGroupCompressed that) { public void preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) { final AIterator itThis = _indexes.getIterator(); final int nCol = that._colIndexes.length; - while(itThis.hasNext()) { + final int finalOffThis = _indexes.getOffsetToLast(); + + while(true) { final int fr = that._data.getIndex(itThis.value()); that._dict.addToEntry(ret, fr, 0, nCol); - itThis.next(); + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); } } @@ -348,26 +449,69 @@ public void preAggregateThatSDCZerosStructure(ColGroupSDCZeros that, Dictionary final AIterator itThat = that._indexes.getIterator(); final AIterator itThis = _indexes.getIterator(); final int nCol = that._colIndexes.length; - - while(itThat.hasNext() && itThis.hasNext()) { - final int v = itThat.value(); - if(v == itThis.skipTo(v)) - that._dict.addToEntry(ret, that.getIndex(itThat.getDataIndex()), 0, nCol); - - itThat.next(); + final int finalOffThis = _indexes.getOffsetToLast(); + final int finalOffThat = that._indexes.getOffsetToLast(); + + while(true) { + if(itThat.value() == itThis.value()) { + that._dict.addToEntry(ret, that._data.getIndex(itThat.getDataIndex()), 0, nCol); + if(itThat.value() >= finalOffThat) + break; + else + itThat.next(); + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); + } + else if(itThat.value() < itThis.value()) { + if(itThat.value() >= finalOffThat) + break; + else + itThat.next(); + } + else { + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); + } } } @Override public void preAggregateThatSDCSingleZerosStructure(ColGroupSDCSingleZeros that, Dictionary ret) { + final int nCol = that._colIndexes.length; final 
AIterator itThat = that._indexes.getIterator(); final AIterator itThis = _indexes.getIterator(); - final int nCol = that._colIndexes.length; - while(itThat.hasNext()) { - final int v = itThat.value(); - if(v == itThis.skipTo(v)) + final int finalOffThis = _indexes.getOffsetToLast(); + final int finalOffThat = that._indexes.getOffsetToLast(); + + while(true) { + if(itThat.value() == itThis.value()) { that._dict.addToEntry(ret, 0, 0, nCol); - itThat.next(); + if(itThat.value() >= finalOffThat) + break; + else + itThat.next(); + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); + } + else if(itThat.value() < itThis.value()) { + if(itThat.value() >= finalOffThat) + break; + else + itThat.next(); + } + else { + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); + } + } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java index a7632dd70ad..8fa9887b2f5 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java @@ -23,13 +23,9 @@ import java.io.DataOutput; import java.io.IOException; -import org.apache.commons.lang.NotImplementedException; -import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary; import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData; -import org.apache.sysds.runtime.compress.colgroup.mapping.MapToByte; -import org.apache.sysds.runtime.compress.colgroup.mapping.MapToChar; import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory; import org.apache.sysds.runtime.compress.colgroup.offset.AIterator; import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; @@ -37,6 +33,7 @@ import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.functionobjects.Builtin; +import org.apache.sysds.runtime.functionobjects.Plus; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.operators.BinaryOperator; import org.apache.sysds.runtime.matrix.operators.ScalarOperator; @@ -72,19 +69,20 @@ protected ColGroupSDCZeros(int numRows) { super(numRows); } - protected ColGroupSDCZeros(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data) { - super(colIndices, numRows, dict, null); + private ColGroupSDCZeros(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data, + int[] cachedCounts) { + super(colIndices, numRows, dict, cachedCounts); _indexes = offsets; _data = data; _zeros = true; } - protected ColGroupSDCZeros(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data, + protected static AColGroup create(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data, int[] cachedCounts) { - super(colIndices, numRows, dict, cachedCounts); - _indexes = offsets; - _data = data; - _zeros = true; + if(dict == null) + return new ColGroupEmpty(colIndices); + else + return new ColGroupSDCZeros(colIndices, numRows, dict, offsets, data, cachedCounts); } @Override @@ -100,129 +98,256 @@ public ColGroupType getColGroupType() { @Override protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, double[] 
values) { - final int nCol = _colIndexes.length; - AIterator it = _indexes.getIterator(rl); - while(it.hasNext() && it.value() < ru) { - final int idx = offR + it.value(); - final double[] c = db.values(idx); - final int off = db.pos(idx) + offC; - final int offDict = getIndex(it.getDataIndexAndIncrement()) * nCol; - for(int j = 0; j < nCol; j++) - c[off + _colIndexes[j]] += values[offDict + j]; + final AIterator it = _indexes.getIterator(rl); + if(it == null) + return; + if(it.value() >= ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + final int lastOff = _indexes.getOffsetToLast(); + final int nCol = _colIndexes.length; + while(true) { + final int idx = offR + it.value(); + final double[] c = db.values(idx); + final int off = db.pos(idx) + offC; + final int offDict = _data.getIndex(it.getDataIndex()) * nCol; + for(int j = 0; j < nCol; j++) + c[off + _colIndexes[j]] += values[offDict + j]; + if(it.value() == lastOff) + return; + it.next(); + } + } + else { + + final int nCol = _colIndexes.length; + while(it.isNotOver(ru)) { + final int idx = offR + it.value(); + final double[] c = db.values(idx); + final int off = db.pos(idx) + offC; + final int offDict = _data.getIndex(it.getDataIndex()) * nCol; + for(int j = 0; j < nCol; j++) + c[off + _colIndexes[j]] += values[offDict + j]; + it.next(); + } + _indexes.cacheIterator(it, ru); } - _indexes.cacheIterator(it, ru); + } @Override protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, SparseBlock sb) { - AIterator it = _indexes.getIterator(rl); - while(it.hasNext() && it.value() < ru) { - final int idx = offR + it.value(); - final int dictIndex = getIndex(it.getDataIndexAndIncrement()); - if(sb.isEmpty(dictIndex)) - continue; - - final double[] c = db.values(idx); - final int off = db.pos(idx) + offC; - final int apos = sb.pos(dictIndex); - final int alen = sb.size(dictIndex) + apos; - final double[] avals = sb.values(dictIndex); - final int[] aix = sb.indexes(dictIndex); - for(int j = apos; j < alen; j++) - c[off + _colIndexes[aix[j]]] += avals[j]; + final AIterator it = _indexes.getIterator(rl); + if(it == null) + return; + else if(it.value() >= ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + final int lastOff = _indexes.getOffsetToLast(); + while(true) { + final int idx = offR + it.value(); + final double[] c = db.values(idx); + final int dx = it.getDataIndex(); + final int dictIndex = _data.getIndex(dx); + if(sb.isEmpty(dictIndex)) { + if(it.value() == lastOff) + return; + it.next(); + continue; + } + + final int off = db.pos(idx) + offC; + final int apos = sb.pos(dictIndex); + final int alen = sb.size(dictIndex) + apos; + final double[] avals = sb.values(dictIndex); + final int[] aix = sb.indexes(dictIndex); + for(int j = apos; j < alen; j++) + c[off + _colIndexes[aix[j]]] += avals[j]; + if(it.value() == lastOff) + return; + it.next(); + } + } + else { + while(it.isNotOver(ru)) { + final int idx = offR + it.value(); + final int dx = it.getDataIndex(); + final int dictIndex = _data.getIndex(dx); + if(sb.isEmpty(dictIndex)) { + it.next(); + continue; + } + + final double[] c = db.values(idx); + final int off = db.pos(idx) + offC; + final int apos = sb.pos(dictIndex); + final int alen = sb.size(dictIndex) + apos; + final double[] avals = sb.values(dictIndex); + final int[] aix = sb.indexes(dictIndex); + for(int j = apos; j < alen; j++) + c[off + _colIndexes[aix[j]]] += avals[j]; + + it.next(); + } + 
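/*
 * The decompressTo* variants in this class all share one traversal shape; a minimal sketch of it,
 * using only the AOffset/AIterator calls that appear in this patch (getIterator, value, next,
 * isNotOver, getOffsetToLast, cacheIterator) and a placeholder processRow for the per-variant body:
 *
 *   final AIterator it = _indexes.getIterator(rl);
 *   if(it == null)                          // no offsets at or after rl
 *     return;
 *   else if(it.value() >= ru)               // first offset is past the range; cache for next block
 *     _indexes.cacheIterator(it, ru);
 *   else if(ru >= _indexes.getOffsetToLast()) {
 *     final int last = _indexes.getOffsetToLast();
 *     while(true) {                         // range contains the last offset: it doubles as the
 *       processRow(it);                     // loop sentinel, removing per-iteration bounds checks
 *       if(it.value() == last)
 *         return;
 *       it.next();
 *     }
 *   }
 *   else {
 *     while(it.isNotOver(ru)) {             // generic middle-of-matrix case: bounded scan,
 *       processRow(it);                     // then cache the iterator position for the next range
 *       it.next();
 *     }
 *     _indexes.cacheIterator(it, ru);
 *   }
 */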
_indexes.cacheIterator(it, ru); } - _indexes.cacheIterator(it, ru); } @Override protected void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, SparseBlock sb) { - AIterator it = _indexes.getIterator(rl); - while(it.hasNext() && it.value() < ru) { - final int row = offR + it.value(); - final int dictIndex = getIndex(it.getDataIndexAndIncrement()); - if(sb.isEmpty(dictIndex)) - continue; - - final int apos = sb.pos(dictIndex); - final int alen = sb.size(dictIndex) + apos; - final double[] avals = sb.values(dictIndex); - final int[] aix = sb.indexes(dictIndex); - for(int j = apos; j < alen; j++) - ret.append(row, _colIndexes[aix[j]] + offC, avals[j] ); + final AIterator it = _indexes.getIterator(rl); + if(it == null) + return; + else if(it.value() >= ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + final int lastOff = _indexes.getOffsetToLast(); + while(true) { + final int row = offR + it.value(); + final int dx = it.getDataIndex(); + final int dictIndex = _data.getIndex(dx); + if(sb.isEmpty(dictIndex)) { + if(it.value() == lastOff) + return; + it.next(); + continue; + } + + final int apos = sb.pos(dictIndex); + final int alen = sb.size(dictIndex) + apos; + final double[] avals = sb.values(dictIndex); + final int[] aix = sb.indexes(dictIndex); + for(int j = apos; j < alen; j++) + ret.append(row, _colIndexes[aix[j]] + offC, avals[j]); + if(it.value() == lastOff) + return; + it.next(); + } + } + else { + while(it.isNotOver(ru)) { + final int row = offR + it.value(); + final int dx = it.getDataIndex(); + final int dictIndex = _data.getIndex(dx); + if(sb.isEmpty(dictIndex)) { + it.next(); + continue; + } + + final int apos = sb.pos(dictIndex); + final int alen = sb.size(dictIndex) + apos; + final double[] avals = sb.values(dictIndex); + final int[] aix = sb.indexes(dictIndex); + for(int j = apos; j < alen; j++) + ret.append(row, _colIndexes[aix[j]] + offC, avals[j]); + it.next(); + } + _indexes.cacheIterator(it, ru); } - _indexes.cacheIterator(it, ru); } @Override protected void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC, double[] values) { - final int nCol = _colIndexes.length; + // LOG.error(ret); + final AIterator it = _indexes.getIterator(rl); + if(it == null) + return; + else if(it.value() >= ru) + _indexes.cacheIterator(it, ru); + else if(ru >= _indexes.getOffsetToLast()) { + final int lastOff = _indexes.getOffsetToLast(); + final int nCol = _colIndexes.length; + while(true) { + final int row = offR + it.value(); + final int dx = it.getDataIndex(); + final int offDict = _data.getIndex(dx) * nCol; + for(int j = 0; j < nCol; j++) + ret.append(row, _colIndexes[j] + offC, values[offDict + j]); + if(it.value() == lastOff) + return; + it.next(); + } + } + else { + + final int nCol = _colIndexes.length; + while(it.isNotOver(ru)) { + final int row = offR + it.value(); + final int dx = it.getDataIndex(); + final int offDict = _data.getIndex(dx) * nCol; + for(int j = 0; j < nCol; j++) + ret.append(row, _colIndexes[j] + offC, values[offDict + j]); - AIterator it = _indexes.getIterator(rl); - while(it.hasNext() && it.value() < ru) { - final int row = offR + it.value(); - final int offDict = getIndex(it.getDataIndexAndIncrement()) * nCol; - for(int j = 0; j < nCol; j++) - ret.append(row, _colIndexes[j] + offC, values[offDict + j]); + it.next(); + } + _indexes.cacheIterator(it, ru); } - _indexes.cacheIterator(it, ru); + } @Override public double getIdx(int r, int colIdx) { 
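/*
 * Lookup semantics: SDCZeros stores offsets only for non-default rows, and its default is zero.
 * So if no iterator exists at r, or the nearest stored offset is not exactly r, the cell was never
 * materialized in the dictionary and the sparse-safe default 0 is returned.
 */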
final AIterator it = _indexes.getIterator(r); + if(it == null || it.value() != r) + return 0; final int nCol = _colIndexes.length; - if(it.value() == r) - return _dict.getValue(getIndex(it.getDataIndex()) * nCol + colIdx); - else - return 0.0; + return _dict.getValue(_data.getIndex(it.getDataIndex()) * nCol + colIdx); } @Override - protected void computeRowSums(double[] c, boolean square, int rl, int ru) { - final double[] vals = _dict.sumAllRowsToDouble(square, _colIndexes.length); - final AIterator it = _indexes.getIterator(rl); - while(it.hasNext() && it.value() < ru) - c[it.value()] += vals[getIndex(it.getDataIndexAndIncrement())]; + protected void computeRowSums(double[] c, int rl, int ru) { + final double[] vals = _dict.sumAllRowsToDouble(_colIndexes.length); + computeRowSums(c, rl, ru, vals); } @Override - protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { - final double[] vals = _dict.aggregateTuples(builtin, _colIndexes.length); - final AIterator it = _indexes.getIterator(rl); - int rix = rl; - - for(; rix < ru && it.hasNext(); rix++) { - if(it.value() != rix) - c[rix] = builtin.execute(c[rix], 0); - else - c[rix] = builtin.execute(c[rix], vals[_data.getIndex(it.getDataIndexAndIncrement())]); + protected void computeRowSumsSq(double[] c, int rl, int ru) { + final double[] vals = _dict.sumAllRowsToDoubleSq(_colIndexes.length); + computeRowSums(c, rl, ru, vals); + } + + protected void computeRowSums(double[] c, int rl, int ru, double[] vals) { + computeRowSums(c, rl, ru, vals, _data, _indexes, _numRows); + } + + protected static final void computeRowSums(double[] c, int rl, int ru, double[] vals, AMapToData data, + AOffset indexes, int nRows) { + final AIterator it = indexes.getIterator(rl); + if(it == null) + return; + else if(it.value() > ru) + indexes.cacheIterator(it, ru); + else if(ru >= indexes.getOffsetToLast()) { + final int maxId = data.size() - 1; + c[it.value()] += vals[data.getIndex(it.getDataIndex())]; + while(it.getDataIndex() < maxId) { + it.next(); + c[it.value()] += vals[data.getIndex(it.getDataIndex())]; + } + } + else { + while(it.isNotOver(ru)) { + c[it.value()] += vals[data.getIndex(it.getDataIndex())]; + it.next(); + } + indexes.cacheIterator(it, ru); } - - // cover remaining rows with default value - for(; rix < ru; rix++) - c[rix] = builtin.execute(c[rix], 0); } @Override - public int[] getCounts(int[] counts) { - final int nonDefaultLength = _data.size(); - // final AIterator it = _indexes.getIterator(); - final int zeros = _numRows - nonDefaultLength; - for(int i = 0; i < nonDefaultLength; i++) - counts[_data.getIndex(i)]++; - - counts[counts.length - 1] += zeros; - - return counts; + protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) { + final double[] vals = _dict.aggregateRows(builtin, _colIndexes.length); + ColGroupSDC.computeRowMxx(c, builtin, rl, ru, vals, _data, _indexes, _numRows, 0); } - public int getIndex(int r) { - return _data.getIndex(r); + @Override + public int[] getCounts(int[] counts) { + return _data.getCounts(counts, _numRows); } @Override @@ -235,82 +360,11 @@ public void preAggregate(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) { @Override public void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru, int cl, int cu) { - - final int numVals = getNumValues(); - if(cl != 0 && cu != preAgg.getNumColumns()) - throw new NotImplementedException("Not implemented preAggregate of sub number of columns"); - if(_data instanceof MapToByte) - preAggregateDenseByte(m, preAgg, 
((MapToByte) _data).getBytes(), rl, ru, cl, cu, _numRows, numVals, _indexes); - else if(_data instanceof MapToChar) - preAggregateDenseChar(m, preAgg, ((MapToChar) _data).getChars(), rl, ru, cl, cu, _numRows, numVals, _indexes); - else - throw new DMLCompressionException("Unsupported map type:" + _data); - - } - - private static void preAggregateDenseByte(final MatrixBlock m, final MatrixBlock preAgg, final byte[] d, - final int rl, final int ru, final int cl, final int cu, final int nRow, final int nVal, AOffset indexes) { - final double[] preAV = preAgg.getDenseBlockValues(); - final double[] mV = m.getDenseBlockValues(); - // multi row iterator. - final AIterator itStart = indexes.getIterator(cl); - AIterator it = null; - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) { - final int offLeft = rowLeft * nRow; - it = itStart.clone(); - while(it.value() < cu && it.hasNext()) { - int i = it.value(); - int index = d[it.getDataIndexAndIncrement()] & 0xFF; - preAV[offOut + index] += mV[offLeft + i]; - } - } - if(it != null && cu < m.getNumColumns()) - indexes.cacheIterator(it, cu); - } - - private static void preAggregateDenseChar(final MatrixBlock m, final MatrixBlock preAgg, final char[] d, - final int rl, final int ru, final int cl, final int cu, final int nRow, final int nVal, AOffset indexes) { - final double[] preAV = preAgg.getDenseBlockValues(); - final double[] mV = m.getDenseBlockValues(); - // multi row iterator. - final AIterator itStart = indexes.getIterator(cl); - AIterator it = null; - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) { - final int offLeft = rowLeft * nRow; - it = itStart.clone(); - while(it.value() < cu && it.hasNext()) { - int i = it.value(); - int index = d[it.getDataIndexAndIncrement()]; - preAV[offOut + index] += mV[offLeft + i]; - } - } - if(it != null && cu < m.getNumColumns()) - indexes.cacheIterator(it, cu); + _data.preAggregateDense(m, preAgg.getDenseBlockValues(), rl, ru, cl, cu, _indexes); } private void preAggregateSparse(SparseBlock sb, MatrixBlock preAgg, int rl, int ru) { - final double[] preAV = preAgg.getDenseBlockValues(); - final int numVals = getNumValues(); - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) { - if(sb.isEmpty(rowLeft)) - continue; - final AIterator it = _indexes.getIterator(); - final int apos = sb.pos(rowLeft); - final int alen = sb.size(rowLeft) + apos; - final int[] aix = sb.indexes(rowLeft); - final double[] avals = sb.values(rowLeft); - int j = apos; - while(it.hasNext() && j < alen) { - final int index = aix[j]; - final int val = it.value(); - if(index < val) - j++; - else if(index == val) - preAV[offOut + _data.getIndex(it.getDataIndexAndIncrement())] += avals[j++]; - else - it.next(); - } - } + _data.preAggregateSparse(sb, preAgg.getDenseBlockValues(), rl, ru, _indexes); } @Override @@ -326,10 +380,10 @@ public AColGroup scalarOperation(ScalarOperator op) { double val0 = op.executeScalar(0); boolean isSparseSafeOp = op.sparseSafe || val0 == 0; if(isSparseSafeOp) - return new ColGroupSDCZeros(_colIndexes, _numRows, applyScalarOp(op), _indexes, _data, getCachedCounts()); + return create(_colIndexes, _numRows, _dict.applyScalarOp(op), _indexes, _data, getCachedCounts()); else { - ADictionary rValues = applyScalarOp(op, val0, getNumCols()); - return new ColGroupSDC(_colIndexes, _numRows, rValues, _indexes, _data, getCachedCounts()); + ADictionary rValues = _dict.applyScalarOp(op, val0, getNumCols()); + return ColGroupSDC.create(_colIndexes, 
_numRows, rValues, _indexes, _data, getCachedCounts()); } } @@ -337,11 +391,15 @@ public AColGroup scalarOperation(ScalarOperator op) { public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) { if(isRowSafe) { ADictionary ret = _dict.binOpLeft(op, v, _colIndexes); - return new ColGroupSDCZeros(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); + return create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); + } + else if(op.fn instanceof Plus) { + double[] def = ColGroupUtils.binaryDefRowLeft(op, v, _colIndexes); + return ColGroupPFOR.create(_colIndexes, _numRows, _dict, _indexes, _data, getCachedCounts(), def); } else { ADictionary ret = _dict.applyBinaryRowOpLeftAppendNewEntry(op, v, _colIndexes); - return new ColGroupSDC(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); + return ColGroupSDC.create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); } } @@ -349,11 +407,15 @@ public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSaf public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) { if(isRowSafe) { ADictionary ret = _dict.binOpRight(op, v, _colIndexes); - return new ColGroupSDCZeros(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); + return create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); + } + else if(op.fn instanceof Plus) { + double[] def = ColGroupUtils.binaryDefRowRight(op, v, _colIndexes); + return ColGroupPFOR.create(_colIndexes, _numRows, _dict, _indexes, _data, getCachedCounts(), def); } else { ADictionary ret = _dict.applyBinaryRowOpRightAppendNewEntry(op, v, _colIndexes); - return new ColGroupSDC(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); + return ColGroupSDC.create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts()); } } @@ -394,10 +456,15 @@ public void preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) { final AIterator itThis = _indexes.getIterator(); final int nCol = that._colIndexes.length; - while(itThis.hasNext()) { + final int finalOffThis = _indexes.getOffsetToLast(); + while(true) { final int fr = that._data.getIndex(itThis.value()); - final int to = getIndex(itThis.getDataIndexAndIncrement()); + final int to = _data.getIndex(itThis.getDataIndex()); that._dict.addToEntry(ret, fr, to, nCol); + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); } } @@ -405,17 +472,37 @@ public void preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) { public void preAggregateThatSDCZerosStructure(ColGroupSDCZeros that, Dictionary ret) { final AIterator itThat = that._indexes.getIterator(); final AIterator itThis = _indexes.getIterator(); + + final int finalOffThis = _indexes.getOffsetToLast(); + final int finalOffThat = that._indexes.getOffsetToLast(); + final int nCol = that._colIndexes.length; - while(itThat.hasNext() && itThis.hasNext()) { + while(true) { if(itThat.value() == itThis.value()) { - final int fr = that.getIndex(itThat.getDataIndexAndIncrement()); - final int to = getIndex(itThis.getDataIndexAndIncrement()); + final int fr = that._data.getIndex(itThat.getDataIndex()); + final int to = _data.getIndex(itThis.getDataIndex()); that._dict.addToEntry(ret, fr, to, nCol); + if(itThat.value() >= finalOffThat) + break; + else + itThat.next(); + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); + } + else if(itThat.value() < itThis.value()) { + if(itThat.value() >= finalOffThat) + break; + else + itThat.next(); + 
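/*
 * This co-iteration is a plain merge of two sorted offset lists; a sketch in array terms, where
 * a/b stand for the offsets of `that`/`this` and add() for the dictionary aggregation:
 *
 *   int i = 0, j = 0;
 *   while(i < a.length && j < b.length) {
 *     if(a[i] == b[j]) { add(i, j); i++; j++; }  // row present in both groups
 *     else if(a[i] < b[j]) i++;                  // advance whichever list is behind
 *     else j++;
 *   }
 *
 * The patch expresses the bounds checks as comparisons against getOffsetToLast() instead of
 * hasNext(), which is why each branch tests the final offset before calling next().
 */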
} + else { + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); } - else if(itThat.value() < itThis.value()) - itThat.next(); - else - itThis.next(); } } @@ -425,16 +512,34 @@ public void preAggregateThatSDCSingleZerosStructure(ColGroupSDCSingleZeros that, final AIterator itThis = _indexes.getIterator(); final int nCol = that._colIndexes.length; - while(itThat.hasNext() && itThis.hasNext()) { + final int finalOffThis = _indexes.getOffsetToLast(); + final int finalOffThat = that._indexes.getOffsetToLast(); + + while(true) { if(itThat.value() == itThis.value()) { - final int to = getIndex(itThis.getDataIndexAndIncrement()); + final int to = _data.getIndex(itThis.getDataIndex()); that._dict.addToEntry(ret, 0, to, nCol); - itThat.next(); + if(itThat.value() >= finalOffThat) + break; + else + itThat.next(); + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); + } + else if(itThat.value() < itThis.value()) { + if(itThat.value() >= finalOffThat) + break; + else + itThat.next(); + } + else { + if(itThis.value() >= finalOffThis) + break; + else + itThis.next(); } - else if(itThat.value() < itThis.value()) - itThat.next(); - else - itThis.next(); } } @@ -448,7 +553,7 @@ public AColGroup replace(double pattern, double replace) { private AColGroup replaceZero(double replace) { ADictionary replaced = _dict.replaceZeroAndExtend(replace, _colIndexes.length); - return new ColGroupSDC(_colIndexes, _numRows, replaced, _indexes, _data, getCachedCounts()); + return ColGroupSDC.create(_colIndexes, _numRows, replaced, _indexes, _data, getCachedCounts()); } @Override diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUtils.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUtils.java new file mode 100644 index 00000000000..f33d2dee293 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUtils.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.compress.colgroup; + +import org.apache.sysds.runtime.functionobjects.ValueFunction; +import org.apache.sysds.runtime.matrix.operators.BinaryOperator; + +public class ColGroupUtils { + + /** + * Calculate the result of performing the binary operation on an empty row to the left + * + * v op empty + * + * @param op The operator + * @param v The values to use on the left side of the operator + * @param colIndexes The column indexes to extract + * @return The result as a double array. 
+ */
+ protected final static double[] binaryDefRowLeft(BinaryOperator op, double[] v, int[] colIndexes) {
+ final ValueFunction fn = op.fn;
+ final int len = colIndexes.length;
+ final double[] ret = new double[len];
+ for(int i = 0; i < len; i++)
+ ret[i] = fn.execute(v[colIndexes[i]], 0);
+ return ret;
+ }
+
+ /**
+ * Calculate the result of performing the binary operation on an empty row to the right
+ *
+ * empty op v
+ *
+ * @param op The operator
+ * @param v The values to use on the right side of the operator
+ * @param colIndexes The column indexes to extract
+ * @return The result as a double array.
+ */
+ protected final static double[] binaryDefRowRight(BinaryOperator op, double[] v, int[] colIndexes) {
+ final ValueFunction fn = op.fn;
+ final int len = colIndexes.length;
+ final double[] ret = new double[len];
+ for(int i = 0; i < len; i++)
+ ret[i] = fn.execute(0, v[colIndexes[i]]);
+ return ret;
+ }
+
+}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
index 79be408c17f..7ee7ed38d8a 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
@@ -70,6 +70,16 @@ public abstract class ADictionary implements Serializable {
 */
 public abstract double aggregate(double init, Builtin fn);
+ /**
+ * Aggregate all the contained values, with a reference offset.
+ *
+ * @param init The initial value, in cases such as Max value this could be -infinity.
+ * @param fn The function to apply to the values
+ * @param reference The reference offset to each value in the dictionary
+ * @return The aggregated value as a double.
+ */
+ public abstract double aggregate(double init, Builtin fn, double[] reference);
+
 /**
 * Aggregate all entries in the rows.
 *
@@ -77,7 +87,57 @@ public abstract class ADictionary implements Serializable {
 * @param nCol The number of columns contained in the dictionary.
 * @return Aggregates for this dictionary tuples.
 */
- public abstract double[] aggregateTuples(Builtin fn, int nCol);
+ public abstract double[] aggregateRows(Builtin fn, int nCol);
+
+ /**
+ * Aggregate all entries in the rows with an offset value reference added.
+ *
+ * @param fn The aggregate function
+ * @param reference The reference offset to each value in the dictionary
+ * @return Aggregates for this dictionary tuples.
+ */
+ public abstract double[] aggregateRows(Builtin fn, double[] reference);
+
+ /**
+ * Aggregates the columns into the target double array provided.
+ *
+ * @param c The target double array, this contains the full number of columns, therefore the colIndexes for
+ * this specific dictionary is needed.
+ * @param fn The function to apply to individual columns
+ * @param colIndexes The mapping to the target columns from the individual columns
+ */
+ public abstract void aggregateCols(double[] c, Builtin fn, int[] colIndexes);
+
+ /**
+ * Aggregates the columns into the target double array provided.
+ *
+ * @param c The target double array, this contains the full number of columns, therefore the colIndexes for
+ * this specific dictionary is needed.
+ * @param fn The function to apply to individual columns
+ * @param reference The reference offset values to add to each cell.
+ * @param colIndexes The mapping to the target columns from the individual columns
+ */
+ public abstract void aggregateCols(double[] c, Builtin fn, int[] colIndexes, double[] reference);
+
+ /**
+ * Allocate a new dictionary and apply the scalar operation to each cell, then return the new dictionary.
+ *
+ * @param op The operator.
+ * @return The new dictionary to return.
+ */
+ public abstract ADictionary applyScalarOp(ScalarOperator op);
+
+ /**
+ * Allocate a new dictionary and apply the scalar operation to each cell, then return the new dictionary.
+ *
+ * outValues[j] = op(this.values[j] + reference[i]) - newReference[i]
+ *
+ * @param op The operator to apply to each cell.
+ * @param reference The reference value to add before the operator.
+ * @param newReference The reference value to subtract after the operator.
+ * @return A new dictionary.
+ */
+ public abstract ADictionary applyScalarOp(ScalarOperator op, double[] reference, double[] newReference);
 /**
 * Applies the scalar operation on the dictionary. Note that this operation modifies the underlying data, and
@@ -109,6 +169,23 @@ public abstract class ADictionary implements Serializable {
 */
 public abstract ADictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes);
+ /**
+ * Apply the binary operator such that each value is offset by the reference before application. Then put the result
+ * into the new dictionary, but offset it by the new reference.
+ *
+ * outValues[j] = op(v[colIndexes[i]], this.values[j] + reference[i]) - newReference[i]
+ *
+ *
+ * @param op The operation to apply on the dictionary values.
+ * @param v The values to use on the left side of the operator.
+ * @param colIndexes The column indexes to use.
+ * @param reference The reference value to add before operator.
+ * @param newReference The reference value to subtract after operator.
+ * @return A new dictionary.
+ */
+ public abstract ADictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes, double[] reference,
+ double[] newReference);
+
 /**
 * Apply binary row operation on the right side.
 *
@@ -119,6 +196,22 @@ public abstract class ADictionary implements Serializable {
 */
 public abstract ADictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes);
+ /**
+ * Apply the binary operator such that each value is offset by the reference before application. Then put the result
+ * into the new dictionary, but offset it by the new reference.
+ *
+ * outValues[j] = op(this.values[j] + reference[i], v[colIndexes[i]]) - newReference[i]
+ *
+ * @param op The operation to apply on the dictionary values.
+ * @param v The values to use on the right side of the operator.
+ * @param colIndexes The column indexes to use.
+ * @param reference The reference value to add before operator.
+ * @param newReference The reference value to subtract after operator.
+ * @return A new dictionary.
+ */
+ public abstract ADictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes, double[] reference,
+ double[] newReference);
+
 /**
 * Apply binary row operation on the left side and allocate a new dictionary.
 *
@@ -131,7 +224,6 @@ public abstract class ADictionary implements Serializable {
 */
 public abstract ADictionary applyBinaryRowOpLeftAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes);
-
 /**
 * Apply binary row operation on this dictionary on the right side.
* @@ -155,16 +247,6 @@ public abstract class ADictionary implements Serializable { */ public abstract ADictionary cloneAndExtend(int len); - /** - * Aggregates the columns into the target double array provided. - * - * @param c The target double array, this contains the full number of columns, therefore the colIndexes for - * this specific dictionary is needed. - * @param fn The function to apply to individual columns - * @param colIndexes The mapping to the target columns from the individual columns - */ - public abstract void aggregateCols(double[] c, Builtin fn, int[] colIndexes); - /** * Write the dictionary to a DataOutput. * @@ -200,21 +282,57 @@ public abstract class ADictionary implements Serializable { * * Note if the number of columns is one the actual dictionaries values are simply returned. * - * @param square If each entry should be squared. + * + * @param nrColumns The number of columns in the ColGroup to know how to get the values from the dictionary. + * @return a double array containing the row sums from this dictionary. + */ + public abstract double[] sumAllRowsToDouble(int nrColumns); + + /** + * Method used as a pre-aggregate of each tuple in the dictionary, to single double values. + * + * Note if the number of columns is one the actual dictionaries values are simply returned. + * * @param nrColumns The number of columns in the ColGroup to know how to get the values from the dictionary. * @return a double array containing the row sums from this dictionary. */ - public abstract double[] sumAllRowsToDouble(boolean square, int nrColumns); + public abstract double[] sumAllRowsToDoubleSq(int nrColumns); + + /** + * Method used as a pre-aggregate of each tuple in the dictionary, to single double values. + * + * @param reference The reference values to add to each cell. + * @return a double array containing the row sums from this dictionary. + */ + public abstract double[] sumAllRowsToDoubleSq(double[] reference); /** * Sum the values at a specific row. * * @param k The row index to sum - * @param square If each entry should be squared. * @param nrColumns The number of columns * @return The sum of the row. */ - public abstract double sumRow(int k, boolean square, int nrColumns); + public abstract double sumRow(int k, int nrColumns); + + /** + * Sum the values at a specific row. + * + * @param k The row index to sum + * @param nrColumns The number of columns + * @return The sum of the row. + */ + public abstract double sumRowSq(int k, int nrColumns); + + /** + * Sum the values at a specific row, with a reference array to scale the values. + * + * @param k The row index to sum + * @param nrColumns The number of columns + * @param reference The reference vector to add to each cell processed. + * @return The sum of the row. + */ + public abstract double sumRowSq(int k, int nrColumns, double[] reference); /** * get the column sum of this dictionary only. @@ -232,9 +350,29 @@ public abstract class ADictionary implements Serializable { * @param counts The counts of the individual tuples. * @param colIndexes The columns indexes of the parent column group, this indicate where to put the column sum into * the c output. 
- * @param square Specify if the values should be squared */ - public abstract void colSum(double[] c, int[] counts, int[] colIndexes, boolean square); + public abstract void colSum(double[] c, int[] counts, int[] colIndexes); + + /** + * Get the column sum of the values contained in the dictionary + * + * @param c The output array allocated to contain all column groups output. + * @param counts The counts of the individual tuples. + * @param colIndexes The columns indexes of the parent column group, this indicate where to put the column sum into + * the c output. + */ + public abstract void colSumSq(double[] c, int[] counts, int[] colIndexes); + + /** + * Get the column sum of the values contained in the dictionary with an offset reference value added to each cell. + * + * @param c The output array allocated to contain all column groups output. + * @param counts The counts of the individual tuples. + * @param colIndexes The columns indexes of the parent column group, this indicate where to put the column sum into + * the c output. + * @param reference The reference values to add to each cell. + */ + public abstract void colSumSq(double[] c, int[] counts, int[] colIndexes, double[] reference); /** * Get the sum of the values contained in the dictionary @@ -252,7 +390,16 @@ public abstract class ADictionary implements Serializable { * @param nCol The number of columns contained * @return The square sum scaled by the counts provided. */ - public abstract double sumsq(int[] counts, int nCol); + public abstract double sumSq(int[] counts, int nCol); + + /** + * Get the square sum of the values contained in the dictionary with a reference offset on each value. + * + * @param counts The counts of the individual tuples + * @param reference The reference value + * @return The square sum scaled by the counts and reference. + */ + public abstract double sumSq(int[] counts, double[] reference); /** * Get a string representation of the dictionary, that considers the layout of the data. @@ -298,6 +445,15 @@ public abstract class ADictionary implements Serializable { */ public abstract boolean containsValue(double pattern); + /** + * Detect if the dictionary contains a specific value with reference offset. + * + * @param pattern The pattern/ value to search for + * @param reference The reference double array. + * @return true if the value is contained else false. + */ + public abstract boolean containsValue(double pattern, double[] reference); + /** * Calculate the number of non zeros in the dictionary. The number of non zeros should be scaled with the counts * given. This gives the exact number of non zero values in the parent column group. @@ -308,6 +464,20 @@ public abstract class ADictionary implements Serializable { */ public abstract long getNumberNonZeros(int[] counts, int nCol); + /** + * Calculate the number of non zeros in the dictionary. + * + * Each value in the dictionary should be added to the reference value. + * + * The number of non zeros should be scaled with the given counts. + * + * @param counts The Counts of each dict entry. + * @param reference The reference vector. + * @param nRows The number of rows in the input. + * @return The NonZero Count. 
+ */ + public abstract long getNumberNonZeros(int[] counts, double[] reference, int nRows); + /** * Copies and adds the dictionary entry from this dictionary to the d dictionary * @@ -380,6 +550,8 @@ public abstract ADictionary preaggValuesFromDense(final int numVals, final int[] */ public abstract ADictionary replace(double pattern, double replace, int nCol); + public abstract ADictionary replace(double pattern, double replace, double[] reference); + public abstract ADictionary replaceZeroAndExtend(double replace, int nCol); public abstract double product(int[] counts, int nCol); diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java index 3707de70fd0..8f9a91b287e 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java @@ -80,7 +80,19 @@ public double aggregate(double init, Builtin fn) { } @Override - public double[] aggregateTuples(Builtin fn, final int nCol) { + public double aggregate(double init, Builtin fn, double[] reference) { + final int nCol = reference.length; + double ret = init; + for(int i = 0; i < _values.length; i++) + ret = fn.execute(ret, _values[i] + reference[i % nCol]); + + for(int i = 0; i < nCol; i++) + ret = fn.execute(ret, reference[i]); + return ret; + } + + @Override + public double[] aggregateRows(Builtin fn, int nCol) { if(nCol == 1) return _values; final int nRows = _values.length / nCol; @@ -94,9 +106,48 @@ public double[] aggregateTuples(Builtin fn, final int nCol) { return res; } + @Override + public double[] aggregateRows(Builtin fn, double[] reference) { + final int nCol = reference.length; + final int nRows = _values.length / nCol; + double[] res = new double[nRows + 1]; + int off = 0; + for(int i = 0; i < nRows; i++) { + res[i] = _values[off++] + reference[0]; + for(int j = 1; j < nCol; j++) + res[i] = fn.execute(res[i], _values[off++] + reference[j]); + } + res[nRows] = reference[0]; + for(int i = 0; i < nCol; i++) + res[nRows] = fn.execute(res[nRows], reference[i]); + return res; + } + + @Override + public Dictionary applyScalarOp(ScalarOperator op) { + final double[] retV = new double[_values.length]; + for(int i = 0; i < _values.length; i++) + retV[i] = op.executeScalar(_values[i]); + return new Dictionary(retV); + } + + @Override + public Dictionary applyScalarOp(ScalarOperator op, double[] reference, double[] newReference) { + final double[] retV = new double[_values.length]; + final int nCol = reference.length; + final int nRow = _values.length / nCol; + int off = 0; + for(int i = 0; i < nRow; i++) { + for(int j = 0; j < nCol; j++) { + retV[off] = op.executeScalar(_values[off] + reference[j]) - newReference[j]; + off++; + } + } + return new Dictionary(retV); + } + @Override public Dictionary inplaceScalarOp(ScalarOperator op) { - // in-place modification of the dictionary int len = size(); for(int i = 0; i < len; i++) _values[i] = op.executeScalar(_values[i]); @@ -125,6 +176,23 @@ public Dictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes) { return new Dictionary(retVals); } + @Override + public Dictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes, double[] reference, + double[] newReference) { + final ValueFunction fn = op.fn; + final double[] retV = new double[_values.length]; + final int nCol = reference.length; + final int nRow = _values.length / nCol; + 
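/*
 * Worked example of the reference/newReference algebra used below: a stored delta d materializes
 * as d + reference[j], and the result is re-normalized into delta form against newReference[j]:
 *
 *   out = fn(d + reference[j], v[colIndexes[j]]) - newReference[j]
 *
 * e.g. d = 2, reference[j] = 3, v[colIndexes[j]] = 10, fn = plus, newReference[j] = 13:
 *   fn(2 + 3, 10) - 13 = 2, so the stored delta is unchanged while the materialized
 * value moves from 5 to 15, consistent with the new reference.
 */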
int off = 0; + for(int i = 0; i < nRow; i++) { + for(int j = 0; j < nCol; j++) { + retV[off] = fn.execute(_values[off] + reference[j], v[colIndexes[j]]) - newReference[j]; + off++; + } + } + return new Dictionary(retV); + } + @Override public final Dictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes) { final ValueFunction fn = op.fn; @@ -136,9 +204,26 @@ public final Dictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexe return new Dictionary(retVals); } + @Override + public Dictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes, double[] reference, + double[] newReference) { + final ValueFunction fn = op.fn; + final double[] retV = new double[_values.length]; + final int nCol = reference.length; + final int nRow = _values.length / nCol; + int off = 0; + for(int i = 0; i < nRow; i++) { + for(int j = 0; j < nCol; j++) { + retV[off] = fn.execute(v[colIndexes[j]], _values[off] + reference[j]) - newReference[j]; + off++; + } + } + return new Dictionary(retV); + } + @Override public Dictionary applyBinaryRowOpRightAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) { - ValueFunction fn = op.fn; + final ValueFunction fn = op.fn; final int len = size(); final int lenV = colIndexes.length; final double[] values = new double[len + lenV]; @@ -152,7 +237,7 @@ public Dictionary applyBinaryRowOpRightAppendNewEntry(BinaryOperator op, double[ @Override public final Dictionary applyBinaryRowOpLeftAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) { - ValueFunction fn = op.fn; + final ValueFunction fn = op.fn; final int len = size(); final int lenV = colIndexes.length; final double[] values = new double[len + lenV]; @@ -207,34 +292,67 @@ public int getNumberOfValues(int nCol) { } @Override - public double[] sumAllRowsToDouble(boolean square, int nrColumns) { - if(nrColumns == 1 && !square) + public double[] sumAllRowsToDouble(int nrColumns) { + if(nrColumns == 1) return getValues(); // shallow copy of values // pre-aggregate value tuple final int numVals = getNumberOfValues(nrColumns); double[] ret = new double[numVals]; - for(int k = 0; k < numVals; k++) { - ret[k] = sumRow(k, square, nrColumns); - } + for(int k = 0; k < numVals; k++) + ret[k] = sumRow(k, nrColumns); + + return ret; + } + + @Override + public double[] sumAllRowsToDoubleSq(int nrColumns) { + // pre-aggregate value tuple + final int numVals = getNumberOfValues(nrColumns); + double[] ret = new double[numVals]; + for(int k = 0; k < numVals; k++) + ret[k] = sumRowSq(k, nrColumns); return ret; } @Override - public double sumRow(int k, boolean square, int nrColumns) { + public double[] sumAllRowsToDoubleSq(double[] reference) { + final int nCol = reference.length; + final int numVals = getNumberOfValues(nCol); + double[] ret = new double[numVals + 1]; + for(int k = 0; k < numVals; k++) + ret[k] = sumRowSq(k, nCol, reference); + for(int i = 0; i < nCol; i++) + ret[numVals] += reference[i] * reference[i]; + return ret; + } - int valOff = k * nrColumns; + @Override + public double sumRow(int k, int nrColumns) { + final int valOff = k * nrColumns; double res = 0.0; - if(!square) { - for(int i = 0; i < nrColumns; i++) { - res += _values[valOff + i]; - } - } - else { - // kSquare - for(int i = 0; i < nrColumns; i++) - res += _values[valOff + i] * _values[valOff + i]; + for(int i = 0; i < nrColumns; i++) + res += _values[valOff + i]; + return res; + } + + @Override + public double sumRowSq(int k, int nrColumns) { + final int valOff = k * nrColumns; + double res = 0.0; + for(int i = 0; 
i < nrColumns; i++) + res += _values[valOff + i] * _values[valOff + i]; + return res; + } + + @Override + public double sumRowSq(int k, int nrColumns, double[] reference) { + final int valOff = k * nrColumns; + double res = 0.0; + for(int i = 0; i < nrColumns; i++) { + final double v = _values[valOff + i] + reference[i]; + res += v * v; } return res; } @@ -252,44 +370,89 @@ public double[] colSum(int[] counts, int nCol) { } @Override - public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) { - for(int k = 0; k < _values.length / colIndexes.length; k++) { + public void colSum(double[] c, int[] counts, int[] colIndexes) { + final int nCol = colIndexes.length; + for(int k = 0; k < _values.length / nCol; k++) { final int cntk = counts[k]; - for(int j = 0; j < colIndexes.length; j++) { - double v = _values[k * colIndexes.length + j]; - if(square) - c[colIndexes[j]] += v * v * cntk; - else - c[colIndexes[j]] += v * cntk; + final int off = k * nCol; + for(int j = 0; j < nCol; j++) + c[colIndexes[j]] += _values[off + j] * cntk; + } + } + + @Override + public void colSumSq(double[] c, int[] counts, int[] colIndexes) { + final int nCol = colIndexes.length; + final int nRow = _values.length / nCol; + int off = 0; + for(int k = 0; k < nRow; k++) { + final int cntk = counts[k]; + for(int j = 0; j < nCol; j++) { + final double v = _values[off++]; + c[colIndexes[j]] += v * v * cntk; } } + } + @Override + public void colSumSq(double[] c, int[] counts, int[] colIndexes, double[] reference) { + final int nCol = colIndexes.length; + final int nRow = _values.length / nCol; + int off = 0; + for(int k = 0; k < nRow; k++) { + final int cntk = counts[k]; + for(int j = 0; j < nCol; j++) { + final double v = _values[off++] + reference[j]; + c[colIndexes[j]] += v * v * cntk; + } + } + for(int i = 0; i < nCol; i++) + c[colIndexes[i]] += reference[i] * reference[i] * counts[nRow]; } @Override - public double sum(int[] counts, int ncol) { + public double sum(int[] counts, int nCol) { double out = 0; int valOff = 0; - for(int k = 0; k < _values.length / ncol; k++) { + for(int k = 0; k < _values.length / nCol; k++) { int countK = counts[k]; - for(int j = 0; j < ncol; j++) { - out += getValue(valOff++) * countK; + for(int j = 0; j < nCol; j++) { + out += _values[valOff++] * countK; } } return out; } @Override - public double sumsq(int[] counts, int ncol) { + public double sumSq(int[] counts, int nCol) { double out = 0; int valOff = 0; - for(int k = 0; k < _values.length / ncol; k++) { - int countK = counts[k]; - for(int j = 0; j < ncol; j++) { - double val = getValue(valOff++); + for(int k = 0; k < _values.length / nCol; k++) { + final int countK = counts[k]; + for(int j = 0; j < nCol; j++) { + final double val = _values[valOff++]; + out += val * val * countK; + } + } + return out; + } + + @Override + public double sumSq(int[] counts, double[] reference) { + final int nCol = reference.length; + final int nRow = _values.length / nCol; + double out = 0; + int valOff = 0; + for(int k = 0; k < nRow; k++) { + final int countK = counts[k]; + for(int j = 0; j < nCol; j++) { + final double val = _values[valOff++] + reference[j]; out += val * val * countK; } } + for(int i = 0; i < nCol; i++) + out += reference[i] * reference[i] * counts[nRow]; + return out; } @@ -383,6 +546,15 @@ public boolean containsValue(double pattern) { return false; } + @Override + public boolean containsValue(double pattern, double[] reference) { + final int nCol = reference.length; + for(int i = 0; i < _values.length; i++) + 
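/*
 * With a reference, _values holds deltas: cell i materializes as _values[i] + reference[i % nCol],
 * so the pattern is tested against the offset value, not the raw delta.
 */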
if(_values[i] + reference[i % nCol] == pattern) + return true; + return false; + } + @Override public long getNumberNonZeros(int[] counts, int nCol) { long nnz = 0; @@ -399,6 +571,27 @@ public long getNumberNonZeros(int[] counts, int nCol) { return nnz; } + @Override + public long getNumberNonZeros(int[] counts, double[] reference, int nRows) { + long nnz = 0; + final int nCol = reference.length; + final int nRow = _values.length / nCol; + for(int i = 0; i < nRow; i++) { + long rowCount = 0; + final int off = i * nCol; + for(int j = off, jj = 0; j < off + nCol; j++, jj++) { + if(_values[j] + reference[jj] != 0) + rowCount++; + } + nnz += rowCount * counts[i]; + } + for(int i = 0; i < nCol; i++) + if(reference[i] != 0) + nnz += counts[nRow]; + + return nnz; + } + @Override public void addToEntry(Dictionary d, int fr, int to, int nCol) { final int sf = nCol * fr; // start from @@ -446,12 +639,22 @@ public MatrixBlockDictionary getMBDict(int nCol) { @Override public void aggregateCols(double[] c, Builtin fn, int[] colIndexes) { - int ncol = colIndexes.length; - int vlen = size() / ncol; - for(int k = 0; k < vlen; k++) - for(int j = 0, valOff = k * ncol; j < ncol; j++) - c[colIndexes[j]] = fn.execute(c[colIndexes[j]], getValue(valOff + j)); + final int nCol = colIndexes.length; + final int rlen = _values.length / nCol; + for(int k = 0; k < rlen; k++) + for(int j = 0, valOff = k * nCol; j < nCol; j++) + c[colIndexes[j]] = fn.execute(c[colIndexes[j]], _values[valOff + j]); + } + @Override + public void aggregateCols(double[] c, Builtin fn, int[] colIndexes, double[] reference) { + final int nCol = reference.length; + final int rlen = _values.length / nCol; + for(int k = 0; k < rlen; k++) + for(int j = 0, valOff = k * nCol; j < nCol; j++) + c[colIndexes[j]] = fn.execute(c[colIndexes[j]], _values[valOff + j] + reference[j]); + for(int i = 0; i < nCol; i++) + c[colIndexes[i]] = fn.execute(c[colIndexes[i]], reference[i]); } @Override @@ -488,10 +691,23 @@ public ADictionary replace(double pattern, double replace, int nCol) { double[] retV = new double[_values.length]; for(int i = 0; i < _values.length; i++) { final double v = _values[i]; - if(v == pattern) - retV[i] = replace; - else - retV[i] = v; + retV[i] = v == pattern ? replace : v; + } + return new Dictionary(retV); + } + + @Override + public ADictionary replace(double pattern, double replace, double[] reference) { + final double[] retV = new double[_values.length]; + final int nCol = reference.length; + final int nRow = _values.length / nCol; + int off = 0; + for(int i = 0; i < nRow; i++) { + for(int j = 0; j < nCol; j++) { + final double v = _values[off]; + retV[off++] = v + reference[j] == pattern ? 
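/* a match must materialize as `replace`, so the stored delta becomes replace - reference[j] */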
replace - reference[j] : v; + + } } return new Dictionary(retV); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java index 1db433c5c29..982c3c903c6 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java @@ -171,6 +171,8 @@ public static ADictionary moveFrequentToLastDictionaryEntry(ADictionary dict, AB else if(mb.isInSparseFormat()) { MatrixBlockDictionary mbdn = moveToLastDictionaryEntrySparse(mb.getSparseBlock(), largestIndex, zeros, nCol, largestIndexSize); + if(mbdn == null) + return null; MatrixBlock mbn = mbdn.getMatrixBlock(); mbn.setNonZeros(mb.getNonZeros()); if(mbn.getNonZeros() == 0) @@ -196,6 +198,8 @@ private static MatrixBlockDictionary moveToLastDictionaryEntrySparse(SparseBlock for(int i = indexToMove + 1; i < sb.numRows(); i++) sb.set(i - 1, sb.get(i), false); sb.set(sb.numRows() - 1, swap, false); + if(ret.isEmpty()) + return null; return new MatrixBlockDictionary(ret); } @@ -214,6 +218,8 @@ private static MatrixBlockDictionary moveToLastDictionaryEntrySparse(SparseBlock for(int i = indexToMove + 1; i < sb.numRows(); i++) retB.set(i - 1, sb.get(i), false); } + if(ret.isEmpty()) + return null; return new MatrixBlockDictionary(ret); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java index b3fa6f7e09f..b9fc6868ea6 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java @@ -25,6 +25,7 @@ import java.util.Arrays; import org.apache.commons.lang.NotImplementedException; +import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.utils.Util; import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.data.DenseBlockFP64; @@ -45,10 +46,14 @@ public class MatrixBlockDictionary extends ADictionary { public MatrixBlockDictionary(double[] values, int nCol) { _data = Util.matrixBlockFromDenseArray(values, nCol); + if(_data.isEmpty()) + throw new DMLCompressionException("Invalid construction of empty dictionary"); } public MatrixBlockDictionary(MatrixBlock data) { _data = data; + if(_data.isEmpty()) + throw new DMLCompressionException("Invalid construction of empty dictionary"); } public MatrixBlock getMatrixBlock() { @@ -93,7 +98,45 @@ else if(fn.getBuiltinCode() == BuiltinCode.MIN) } @Override - public double[] aggregateTuples(Builtin fn, int nCol) { + public double aggregate(double init, Builtin fn, double[] reference) { + final int nCol = reference.length; + final int nRows = _data.getNumRows(); + double ret = init; + + for(int i = 0; i < nCol; i++) + ret = fn.execute(ret, reference[i]); + + if(!_data.isEmpty() && _data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < nRows; i++) { + if(sb.isEmpty(i)) + continue; + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + for(int k = apos; k < alen; k++) { + final double v = avals[k] + reference[aix[k]]; + ret = fn.execute(ret, v); + } + } + } + else 
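/*
 * Note on the sparse branch above: ret was pre-folded with every bare reference[i], which is exactly
 * the materialized value of an implicitly-zero cell, so the scan only needs the stored entries.
 * This is sound because fn is an order statistic (Builtin min/max), where folding a value once
 * covers any number of occurrences; it would undercount for additive aggregates.
 */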
if(!_data.isEmpty()) { + final double[] values = _data.getDenseBlockValues(); + int off = 0; + for(int k = 0; k < nRows; k++) { + for(int j = 0; j < _data.getNumColumns(); j++) { + final double v = values[off++] + reference[j]; + ret = fn.execute(ret, v); + } + } + } + + return ret; + } + + @Override + public double[] aggregateRows(Builtin fn, int nCol) { double[] ret = new double[_data.getNumRows()]; if(_data.isEmpty()) return ret; @@ -129,6 +172,53 @@ else if(nCol == 1) return ret; } + @Override + public double[] aggregateRows(Builtin fn, double[] reference) { + final int nCol = reference.length; + final int nRows = _data.getNumRows(); + final double[] ret = new double[nRows + 1]; + + ret[nRows] = reference[0]; + for(int i = 1; i < nCol; i++) + ret[nRows] = fn.execute(ret[nRows], reference[i]); + + if(!_data.isEmpty() && _data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < nRows; i++) { + if(sb.isEmpty(i)) + ret[i] = ret[nRows]; + else { + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + int k = apos; + int j = 1; + ret[i] = (aix[k] == 0) ? avals[k++] + reference[0] : reference[0]; + for(; j < _data.getNumColumns() && k < alen; j++) { + final double v = aix[k] == j ? avals[k++] + reference[j] : reference[j]; + ret[i] = fn.execute(ret[i], v); + } + for(; j < _data.getNumColumns(); j++) + ret[i] = fn.execute(ret[i], reference[j]); + } + } + } + else if(!_data.isEmpty()) { + final double[] values = _data.getDenseBlockValues(); + int off = 0; + for(int k = 0; k < nRows; k++) { + ret[k] = values[off++] + reference[0]; + for(int j = 1; j < _data.getNumColumns(); j++) { + final double v = values[off++] + reference[j]; + ret[k] = fn.execute(ret[k], v); + } + } + } + + return ret; + } + @Override public void aggregateCols(double[] c, Builtin fn, int[] colIndexes) { if(_data.isEmpty()) { @@ -172,9 +262,102 @@ else if(_data.isInSparseFormat()) { } @Override - public ADictionary inplaceScalarOp(ScalarOperator op) { + public void aggregateCols(double[] c, Builtin fn, int[] colIndexes, double[] reference) { + final int nCol = _data.getNumColumns(); + final int nRow = _data.getNumRows(); + + for(int j = 0; j < colIndexes.length; j++) { + final int idx = colIndexes[j]; + c[idx] = fn.execute(c[idx], reference[j]); + } + if(!_data.isEmpty() && _data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < nRow; i++) { + if(sb.isEmpty(i)) + continue; + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final double[] avals = sb.values(i); + final int[] aix = sb.indexes(i); + // This is a cool trick but it only works with min / max. 
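/*
 * The trick: c[colIndexes[j]] was seeded with fn(c, reference[j]) for every column, which equals
 * the contribution of any implicitly-zero sparse cell. Skipping empty rows and missing entries is
 * therefore safe for idempotent functions (fn(fn(x, r), r) == fn(x, r)) such as min/max, but not
 * for sums.
 */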
+ for(int k = apos; k < alen; k++) { + final int idx = colIndexes[aix[k]]; + c[idx] = fn.execute(c[idx], avals[k] + reference[aix[k]]); + } + } + } + else if(!_data.isEmpty()) { + final double[] values = _data.getDenseBlockValues(); + int off = 0; + for(int k = 0; k < nRow; k++) { + for(int j = 0; j < nCol; j++) { + final int idx = colIndexes[j]; + c[idx] = fn.execute(c[idx], values[off++] + reference[j]); + } + } + } + } + + @Override + public ADictionary applyScalarOp(ScalarOperator op) { MatrixBlock res = _data.scalarOperations(op, new MatrixBlock()); - return new MatrixBlockDictionary(res); + if(res.isEmpty()) + return null; + else + return new MatrixBlockDictionary(res); + } + + @Override + public ADictionary applyScalarOp(ScalarOperator op, double[] reference, double[] newReference) { + final int nCol = _data.getNumColumns(); + final int nRow = _data.getNumRows(); + final MatrixBlock ret = new MatrixBlock(nRow, nCol, false); + ret.allocateDenseBlock(); + final double[] retV = ret.getDenseBlockValues(); + int off = 0; + if(_data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < nRow; i++) { + if(sb.isEmpty(i)) + for(int j = 0; j < nCol; j++) + retV[off++] = op.executeScalar(reference[j]) - newReference[j]; + else { + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + int j = 0; + for(int k = apos; j < nCol && k < alen; j++) { + final double v = aix[k] == j ? avals[k++] + reference[j] : reference[j]; + retV[off++] = op.executeScalar(v) - newReference[j]; + } + for(; j < nCol; j++) + retV[off++] = op.executeScalar(reference[j]) - newReference[j]; + } + } + } + else { + final double[] values = _data.getDenseBlockValues(); + for(int i = 0; i < nRow; i++) { + for(int j = 0; j < nCol; j++) { + retV[off] = op.executeScalar(values[off] + reference[j]) - newReference[j]; + off++; + } + } + } + + ret.recomputeNonZeros(); + ret.examSparsity(); + if(ret.isEmpty()) + return null; + else + return new MatrixBlockDictionary(ret); + + } + + @Override + public ADictionary inplaceScalarOp(ScalarOperator op) { + throw new NotImplementedException(); } @Override @@ -182,15 +365,16 @@ public ADictionary applyScalarOp(ScalarOperator op, double newVal, int numCols) MatrixBlock res = _data.scalarOperations(op, new MatrixBlock()); final int lastRow = res.getNumRows(); MatrixBlock res2 = new MatrixBlock(lastRow + 1, res.getNumColumns(), true); - if(res.isEmpty()) { + if(res.isEmpty()) for(int i = 0; i < numCols; i++) res2.appendValue(lastRow, i, newVal); - return new MatrixBlockDictionary(res2); - } - else { + else res.append(new MatrixBlock(1, numCols, newVal), res2, false); + + if(res2.isEmpty()) + return null; + else return new MatrixBlockDictionary(res2); - } } @Override @@ -199,6 +383,12 @@ public ADictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes) { return new MatrixBlockDictionary(rowVector.binaryOperations(op, _data, null)); } + @Override + public Dictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes, double[] reference, + double[] newReference) { + throw new NotImplementedException(); + } + @Override public ADictionary applyBinaryRowOpLeftAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) { MatrixBlock rowVector = Util.extractValues(v, colIndexes); @@ -212,6 +402,12 @@ public ADictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes) { return new MatrixBlockDictionary(_data.binaryOperations(op, rowVector, null)); 
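/*
 * Util.extractValues is assumed here to gather v at this group's columns into a 1 x nCol row
 * vector, roughly:
 *
 *   double[] row = new double[colIndexes.length];
 *   for(int j = 0; j < colIndexes.length; j++)
 *     row[j] = v[colIndexes[j]];
 *
 * binaryOperations then broadcasts that row vector across every dictionary tuple.
 */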
} + @Override + public Dictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes, double[] reference, + double[] newReference) { + throw new NotImplementedException(); + } + @Override public ADictionary applyBinaryRowOpRightAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) { MatrixBlock rowVector = Util.extractValues(v, colIndexes); @@ -242,7 +438,7 @@ public int getNumberOfValues(int ncol) { } @Override - public double[] sumAllRowsToDouble(boolean square, int nrColumns) { + public double[] sumAllRowsToDouble(int nrColumns) { double[] ret = new double[_data.getNumRows()]; if(_data.isEmpty()) @@ -255,7 +451,7 @@ else if(_data.isInSparseFormat()) { final int alen = sb.size(i) + apos; final double[] avals = sb.values(i); for(int j = apos; j < alen; j++) { - ret[i] += (square) ? avals[j] * avals[j] : avals[j]; + ret[i] += avals[j]; } } } @@ -266,7 +462,7 @@ else if(_data.isInSparseFormat()) { for(int k = 0; k < _data.getNumRows(); k++) { for(int j = 0; j < _data.getNumColumns(); j++) { final double v = values[off++]; - ret[k] += (square) ? v * v : v; + ret[k] += v; } } } @@ -274,7 +470,95 @@ else if(_data.isInSparseFormat()) { } @Override - public double sumRow(int k, boolean square, int nrColumns) { + public double[] sumAllRowsToDoubleSq(int nrColumns) { + final double[] ret = new double[_data.getNumRows()]; + + if(_data.isEmpty()) + return ret; + else if(_data.isInSparseFormat()) { + SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < _data.getNumRows(); i++) { + if(!sb.isEmpty(i)) { + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final double[] avals = sb.values(i); + for(int j = apos; j < alen; j++) { + ret[i] += avals[j] * avals[j]; + } + } + } + } + else { + double[] values = _data.getDenseBlockValues(); + int off = 0; + for(int k = 0; k < _data.getNumRows(); k++) { + for(int j = 0; j < _data.getNumColumns(); j++) { + final double v = values[off++]; + ret[k] += v * v; + } + } + } + return ret; + } + + @Override + public double[] sumAllRowsToDoubleSq(double[] reference) { + final int nCol = reference.length; + final int numVals = _data.getNumRows(); + final double[] ret = new double[numVals + 1]; + + final int finalIndex = numVals; + for(int i = 0; i < nCol; i++) + ret[finalIndex] += reference[i] * reference[i]; + + if(!_data.isEmpty() && _data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < numVals; i++) { + if(sb.isEmpty(i)) + ret[i] = ret[finalIndex]; + else { + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + int k = apos; + int j = 0; + for(; j < _data.getNumColumns() && k < alen; j++) { + final double v = aix[k] == j ? 
avals[k++] + reference[j] : reference[j];
+						ret[i] += v * v;
+					}
+					for(; j < _data.getNumColumns(); j++)
+						ret[i] += reference[j] * reference[j];
+				}
+
+			}
+		}
+		else if(!_data.isEmpty()) {
+			double[] values = _data.getDenseBlockValues();
+			int off = 0;
+			for(int k = 0; k < numVals; k++) {
+				for(int j = 0; j < _data.getNumColumns(); j++) {
+					final double v = values[off++] + reference[j];
+					ret[k] += v * v;
+				}
+			}
+		}
+
+		return ret;
+	}
+
+	@Override
+	public double sumRow(int k, int nrColumns) {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public double sumRowSq(int k, int nrColumns) {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public double sumRowSq(int k, int nrColumns, double[] reference) {
 		throw new NotImplementedException();
 	}
 
@@ -314,7 +598,40 @@ public double[] colSum(int[] counts, int nCol) {
 	}
 
 	@Override
-	public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) {
+	public void colSum(double[] c, int[] counts, int[] colIndexes) {
+		if(_data.isEmpty())
+			return;
+		if(_data.isInSparseFormat()) {
+			SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < _data.getNumRows(); i++) {
+				if(!sb.isEmpty(i)) {
+					final int count = counts[i];
+					final int apos = sb.pos(i);
+					final int alen = sb.size(i) + apos;
+					final int[] aix = sb.indexes(i);
+					final double[] avals = sb.values(i);
+					for(int j = apos; j < alen; j++) {
+						c[colIndexes[aix[j]]] += count * avals[j];
+					}
+				}
+			}
+		}
+		else {
+			double[] values = _data.getDenseBlockValues();
+			int off = 0;
+			for(int k = 0; k < _data.getNumRows(); k++) {
+				final int countK = counts[k];
+				for(int j = 0; j < _data.getNumColumns(); j++) {
+					final double v = values[off++];
+					c[colIndexes[j]] += v * countK;
+				}
+			}
+		}
+	}
+
+	@Override
+	public void colSumSq(double[] c, int[] counts, int[] colIndexes) {
 		if(_data.isEmpty())
 			return;
 		if(_data.isInSparseFormat()) {
@@ -328,7 +645,7 @@ public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) {
 			final int[] aix = sb.indexes(i);
 			final double[] avals = sb.values(i);
 			for(int j = apos; j < alen; j++) {
-				c[colIndexes[aix[j]]] += square ? count * avals[j] * avals[j] : count * avals[j];
+				c[colIndexes[aix[j]]] += count * avals[j] * avals[j];
 			}
 		}
 	}
@@ -340,7 +657,50 @@ public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) {
 			final int countK = counts[k];
 			for(int j = 0; j < _data.getNumColumns(); j++) {
 				final double v = values[off++];
-				c[colIndexes[j]] += square ? v * v * countK : v * countK;
+				c[colIndexes[j]] += v * v * countK;
+			}
+		}
+	}
+}
+
+@Override
+public void colSumSq(double[] c, int[] counts, int[] colIndexes, double[] reference) {
+	final int nCol = reference.length;
+	final int nRow = _data.getNumRows();
+	for(int i = 0; i < nCol; i++)
+		c[colIndexes[i]] += reference[i] * reference[i] * counts[nRow];
+
+	if(!_data.isEmpty() && _data.isInSparseFormat()) {
+		final SparseBlock sb = _data.getSparseBlock();
+		for(int i = 0; i < nRow; i++) {
+			final int countK = counts[i];
+			if(sb.isEmpty(i))
+				for(int j = 0; j < nCol; j++)
+					c[colIndexes[j]] += reference[j] * reference[j] * countK;
+			else {
+				final int apos = sb.pos(i);
+				final int alen = sb.size(i) + apos;
+				final int[] aix = sb.indexes(i);
+				final double[] avals = sb.values(i);
+				int k = apos;
+				int j = 0;
+				for(; j < _data.getNumColumns() && k < alen; j++) {
+					final double v = aix[k] == j ?
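Editor's note: colSum and colSumSq never touch the full matrix. Each dictionary tuple i stands for counts[i] identical rows, so column aggregates reduce to a counts-weighted pass over the distinct tuples. A small sketch of that pattern, assuming a plain double[][] dictionary (illustrative only):

public class CountsWeightedSumSketch {
	static double[] colSumSq(double[][] dict, int[] counts, int nCol) {
		double[] c = new double[nCol];
		for(int i = 0; i < dict.length; i++)
			for(int j = 0; j < nCol; j++)
				// each tuple contributes its squared value once per occurrence
				c[j] += dict[i][j] * dict[i][j] * counts[i];
		return c;
	}

	public static void main(String[] args) {
		double[][] dict = {{1, 2}, {3, 4}};
		int[] counts = {3, 1}; // tuple 0 occurs in 3 rows, tuple 1 in 1 row
		// col 0: 1*1*3 + 3*3*1 = 12 ; col 1: 2*2*3 + 4*4*1 = 28
		System.out.println(java.util.Arrays.toString(colSumSq(dict, counts, 2)));
	}
}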
avals[k++] + reference[j] : reference[j]; + c[colIndexes[j]] += v * v * countK; + } + for(; j < _data.getNumColumns(); j++) + c[colIndexes[j]] += reference[j] * reference[j] * countK; + } + } + } + else if(!_data.isEmpty()) { + double[] values = _data.getDenseBlockValues(); + int off = 0; + for(int k = 0; k < nRow; k++) { + final int countK = counts[k]; + for(int j = 0; j < _data.getNumColumns(); j++) { + final double v = values[off++] + reference[j]; + c[colIndexes[j]] += v * v * countK; } } } @@ -380,7 +740,7 @@ public double sum(int[] counts, int ncol) { } @Override - public double sumsq(int[] counts, int ncol) { + public double sumSq(int[] counts, int ncol) { double tmpSum = 0; if(_data.isEmpty()) return tmpSum; @@ -412,6 +772,54 @@ public double sumsq(int[] counts, int ncol) { return tmpSum; } + @Override + public double sumSq(int[] counts, double[] reference) { + final int nCol = reference.length; + final int numVals = _data.getNumRows(); + double ret = 0; + for(int i = 0; i < nCol; i++) + ret += reference[i] * reference[i]; + final double ref = ret; + ret *= counts[numVals]; + + if(!_data.isEmpty() && _data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < numVals; i++) { + final int countK = counts[i]; + if(sb.isEmpty(i)) + ret += ref * countK; + else { + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + int k = apos; + int j = 0; + for(; j < _data.getNumColumns() && k < alen; j++) { + final double v = aix[k] == j ? avals[k++] + reference[j] : reference[j]; + ret += v * v * countK; + } + for(; j < _data.getNumColumns(); j++) + ret += reference[j] * reference[j] * countK; + } + + } + } + else if(!_data.isEmpty()) { + double[] values = _data.getDenseBlockValues(); + int off = 0; + for(int k = 0; k < numVals; k++) { + final int countK = counts[k]; + for(int j = 0; j < _data.getNumColumns(); j++) { + final double v = values[off++] + reference[j]; + ret += v * v * countK; + } + } + } + + return ret; + } + @Override public String getString(int colIndexes) { return _data.toString(); @@ -438,6 +846,53 @@ public boolean containsValue(double pattern) { return _data.containsValue(pattern); } + @Override + public boolean containsValue(double pattern, double[] reference) { + + if(_data.isEmpty()) { + for(double d : reference) + if(pattern == d) + return true; + return false; + } + else if(_data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < _data.getNumRows(); i++) { + if(sb.isEmpty(i)) + continue; + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + int k = apos; + int j = 0; + for(; j < _data.getNumColumns() && k < alen; j++) { + if(aix[k] == j) { + if(reference[j] + avals[k++] == pattern) + return true; + } + else { + if(reference[j] == pattern) + return true; + } + } + for(; j < _data.getNumColumns(); j++) + if(reference[j] == pattern) + return true; + + } + } + else { + final double[] values = _data.getDenseBlockValues(); + final int nCol = reference.length; + for(int i = 0; i < values.length; i++) + if(values[i] + reference[i % nCol] == pattern) + return true; + + } + return false; + } + @Override public long getNumberNonZeros(int[] counts, int nCol) { if(_data.isEmpty()) @@ -449,7 +904,6 @@ public long getNumberNonZeros(int[] counts, int nCol) { for(int i = 0; i < _data.getNumRows(); i++) if(!sb.isEmpty(i)) nnz += 
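Editor's note: the sparse reference branches above all share one two-pointer merge. Column j walks the dense reference while k walks the row's sparse indexes, and the stored delta contributes only where aix[k] == j; a trailing loop handles the columns past the last sparse entry. A self-contained sketch of the pattern as used by containsValue:

public class SparseReferenceMergeSketch {
	static boolean rowContains(int[] aix, double[] avals, double[] reference, double pattern) {
		int k = 0;
		int j = 0;
		for(; j < reference.length && k < aix.length; j++) {
			// merge: a sparse entry adds its delta on top of the reference
			final double v = aix[k] == j ? avals[k++] + reference[j] : reference[j];
			if(v == pattern)
				return true;
		}
		for(; j < reference.length; j++) // tail: only reference values remain
			if(reference[j] == pattern)
				return true;
		return false;
	}

	public static void main(String[] args) {
		// row deltas: col 1 -> 2.0 ; reference (1, 1, 1) ; true row is (1, 3, 1)
		System.out.println(rowContains(new int[] {1}, new double[] {2}, new double[] {1, 1, 1}, 3.0));
	}
}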
sb.size(i) * counts[i]; - } else { double[] values = _data.getDenseBlockValues(); @@ -467,6 +921,64 @@ public long getNumberNonZeros(int[] counts, int nCol) { return nnz; } + @Override + public long getNumberNonZeros(int[] counts, double[] reference, int nRows) { + long nnz = 0; + for(double d : reference) + if(d != 0) + nnz++; + if(_data.isEmpty()) { + // sum counts + return nnz * nRows; + } + else if(_data.isInSparseFormat()) { + SparseBlock sb = _data.getSparseBlock(); + long emptyRowNNZ = nnz; + nnz *= counts[counts.length - 1]; // multiply count with the common value count in reference. + for(int i = 0; i < _data.getNumRows(); i++) { + if(sb.isEmpty(i)) + nnz += emptyRowNNZ * counts[i]; + else { + int countThis = 0; + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + int k = apos; + int j = 0; + for(; j < _data.getNumColumns() && k < alen; j++) { + if(aix[k] == j) { + if(reference[j] + avals[k++] != 0) + countThis++; + } + else { + if(reference[j] != 0) + countThis++; + } + } + for(; j < _data.getNumColumns(); j++) + if(reference[j] != 0) + countThis++; + + nnz += countThis * counts[i]; + } + } + } + else { + nnz *= counts[counts.length - 1]; // multiply count with the common value count in reference. + final double[] values = _data.getDenseBlockValues(); + int off = 0; + for(int i = 0; i < _data.getNumRows(); i++) { + int countThisTuple = 0; + for(int j = 0; j < _data.getNumColumns(); j++) + if(values[off++] + reference[j] != 0) + countThisTuple++; + nnz += countThisTuple * counts[i]; + } + } + return nnz; + } + @Override public void addToEntry(Dictionary d, int fr, int to, int nCol) { double[] v = d.getValues(); @@ -529,6 +1041,8 @@ public ADictionary subtractTuple(double[] tuple) { MatrixBlock rowVector = new MatrixBlock(1, tuple.length, b); MatrixBlock res = new MatrixBlock(_data.getNumColumns(), _data.getNumRows(), _data.isInSparseFormat()); _data.binaryOperations(new BinaryOperator(Minus.getMinusFnObject()), rowVector, res); + if(res.isEmpty()) + return null; return new MatrixBlockDictionary(res); } @@ -645,7 +1159,7 @@ else if(_data.isInSparseFormat()) { DenseBlock dictV = new DenseBlockFP64(new int[] {numVals, aggregateColumns.length}, ret); MatrixBlock dictM = new MatrixBlock(numVals, aggregateColumns.length, dictV); - dictM.getNonZeros(); + dictM.recomputeNonZeros(); dictM.examSparsity(); return new MatrixBlockDictionary(dictM); @@ -653,16 +1167,66 @@ else if(_data.isInSparseFormat()) { @Override public ADictionary replace(double pattern, double replace, int nCol) { - MatrixBlock ret = _data.replaceOperations(new MatrixBlock(), pattern, replace); + final MatrixBlock ret = _data.replaceOperations(new MatrixBlock(), pattern, replace); + if(ret.isEmpty()) + return null; return new MatrixBlockDictionary(ret); } + @Override + public ADictionary replace(double pattern, double replace, double[] reference) { + final int nRow = _data.getNumRows(); + final int nCol = _data.getNumColumns(); + final MatrixBlock ret = new MatrixBlock(nRow, nCol, false); + ret.allocateDenseBlock(); + final double[] retV = ret.getDenseBlockValues(); + int off = 0; + if(_data.isInSparseFormat()) { + final SparseBlock sb = _data.getSparseBlock(); + for(int i = 0; i < nRow; i ++){ + if(sb.isEmpty(i)) + for(int j = 0; j < nCol; j++) + retV[off++] = pattern == reference[j] ? 
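Editor's note: getNumberNonZeros with a reference follows the same counts-weighted idea. The non-zeros of each materialized tuple are counted once and scaled by its occurrence count, and the reference's own non-zeros cover the rows matching no tuple. A sketch with hypothetical names:

public class NnzWithReferenceSketch {
	static long nnz(double[][] dict, int[] counts, double[] reference) {
		long refNnz = 0;
		for(double d : reference)
			if(d != 0)
				refNnz++;
		// counts[dict.length] holds the number of rows matching no tuple
		long nnz = refNnz * counts[dict.length];
		for(int i = 0; i < dict.length; i++) {
			int tupleNnz = 0;
			for(int j = 0; j < reference.length; j++)
				if(dict[i][j] + reference[j] != 0)
					tupleNnz++;
			nnz += (long) tupleNnz * counts[i];
		}
		return nnz;
	}

	public static void main(String[] args) {
		double[][] dict = {{-1, 0}}; // one tuple that cancels column 0
		double[] ref = {1, 0};
		int[] counts = {2, 3}; // tuple in 2 rows, 3 default rows
		System.out.println(nnz(dict, counts, ref)); // 0*2 + 1*3 = 3
	}
}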
replace - reference[j] : 0; + else{ + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final int[] aix = sb.indexes(i); + final double[] avals = sb.values(i); + int j = 0; + for(int k = apos; j < nCol && k < alen; j++){ + final double v = aix[k] == j ? avals[k++] + reference[j] : reference[j]; + retV[off++] = pattern == v ? replace - reference[j] : v - reference[j]; + } + for(; j < nCol; j++) + retV[off++] = pattern == reference[j] ? replace - reference[j] : 0; + } + } + } + else { + final double[] values = _data.getDenseBlockValues(); + for(int i = 0; i < nRow; i++) { + for(int j = 0; j < nCol; j++) { + final double v = values[off]; + retV[off++] = pattern == v + reference[j] ? replace - reference[j] : v; + } + } + } + + ret.recomputeNonZeros(); + ret.examSparsity(); + if(ret.isEmpty()) + return null; + else + return new MatrixBlockDictionary(ret); + + } + @Override public ADictionary replaceZeroAndExtend(double replace, int nCol) { final int nRows = _data.getNumRows(); final int nCols = _data.getNumColumns(); final long nonZerosOut = (nRows + 1) * nCols; - final MatrixBlock ret = new MatrixBlock(_data.getNumRows() + 1, _data.getNumColumns(), false); + final MatrixBlock ret = new MatrixBlock(nRows + 1, nCols, false); ret.allocateBlock(); ret.setNonZeros(nonZerosOut); final double[] retValues = ret.getDenseBlockValues(); diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java index bfab5275c79..879892a3745 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java @@ -101,7 +101,12 @@ public double aggregate(double init, Builtin fn) { } @Override - public double[] aggregateTuples(Builtin fn, final int nCol) { + public double aggregate(double init, Builtin fn, double[] reference) { + throw new NotImplementedException(); + } + + @Override + public double[] aggregateRows(Builtin fn, final int nCol) { if(nCol == 1) return getValues(); final int nRows = _values.length / nCol; @@ -115,6 +120,11 @@ public double[] aggregateTuples(Builtin fn, final int nCol) { return res; } + @Override + public double[] aggregateRows(Builtin fn, double[] reference) { + throw new NotImplementedException(); + } + @Override public QDictionary inplaceScalarOp(ScalarOperator op) { if(_values == null) @@ -154,6 +164,11 @@ else if(op.fn instanceof Plus) { return this; } + @Override + public QDictionary applyScalarOp(ScalarOperator op) { + throw new NotImplementedException(); + } + @Override public QDictionary applyScalarOp(ScalarOperator op, double newVal, int numCols) { double[] temp = getValues(); @@ -219,39 +234,60 @@ public int getNumberOfValues(int nCol) { } @Override - public double[] sumAllRowsToDouble(boolean square, int nrColumns) { - if(nrColumns == 1 && !square) + public double[] sumAllRowsToDouble(int nrColumns) { + if(nrColumns == 1) return getValues(); // shallow copy of values final int numVals = getNumberOfValues(nrColumns); double[] ret = new double[numVals]; - for(int k = 0; k < numVals; k++) { - ret[k] = sumRow(k, square, nrColumns); - } + for(int k = 0; k < numVals; k++) + ret[k] = sumRow(k, nrColumns); return ret; } @Override - public double sumRow(int k, boolean square, int nrColumns) { + public double[] sumAllRowsToDoubleSq(int nrColumns) { + final int numVals = getNumberOfValues(nrColumns); + double[] ret = new 
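Editor's note: the replace variant keeps everything in delta space. Equality against the pattern is tested on the materialized value delta + reference[j], but what is written back is replace - reference[j], so that adding the reference later reproduces the replacement. A one-cell sketch of the bookkeeping:

public class ReplaceWithReferenceSketch {
	static double replaceCell(double delta, double ref, double pattern, double replace) {
		final double v = delta + ref; // materialized value
		return v == pattern ? replace - ref : delta; // store back in delta space
	}

	public static void main(String[] args) {
		// reference 5, stored delta 2 -> true value 7; replacing 7 with 0
		// stores -5, so 0 is reproduced when the reference is added back.
		System.out.println(replaceCell(2, 5, 7, 0)); // -5
	}
}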
double[numVals]; + for(int k = 0; k < numVals; k++) + ret[k] = sumRowSq(k, nrColumns); + return ret; + } + + @Override + public double[] sumAllRowsToDoubleSq(double[] reference) { + throw new NotImplementedException(); + } + + @Override + public double sumRow(int k, int nrColumns) { if(_values == null) return 0; int valOff = k * nrColumns; - if(!square) { - int res = 0; - for(int i = 0; i < nrColumns; i++) { - res += _values[valOff + i]; - } - return res * _scale; - } - else { - // kSquare - double res = 0.0; - for(int i = 0; i < nrColumns; i++) - res += (int) (_values[valOff + i] * _values[valOff + i]) * _scale * _scale; - return res; + int res = 0; + for(int i = 0; i < nrColumns; i++) { + res += _values[valOff + i]; } + return res * _scale; + + } + + @Override + public double sumRowSq(int k, int nrColumns) { + if(_values == null) + return 0; + int valOff = k * nrColumns; + double res = 0.0; + for(int i = 0; i < nrColumns; i++) + res += (int) (_values[valOff + i] * _values[valOff + i]) * _scale * _scale; + return res; + } + + @Override + public double sumRowSq(int k, int nrColumns, double[] reference) { + throw new NotImplementedException(); } @Override @@ -260,17 +296,32 @@ public double[] colSum(int[] counts, int nCol) { } @Override - public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) { + public void colSum(double[] c, int[] counts, int[] colIndexes) { throw new NotImplementedException("Not Implemented"); } + @Override + public void colSumSq(double[] c, int[] counts, int[] colIndexes) { + throw new NotImplementedException("Not Implemented"); + } + + @Override + public void colSumSq(double[] c, int[] counts, int[] colIndexes, double[] reference) { + throw new NotImplementedException(); + } + @Override public double sum(int[] counts, int ncol) { throw new NotImplementedException("Not Implemented"); } @Override - public double sumsq(int[] counts, int ncol) { + public double sumSq(int[] counts, int ncol) { + throw new NotImplementedException("Not Implemented"); + } + + @Override + public double sumSq(int[] counts, double[] reference) { throw new NotImplementedException("Not Implemented"); } @@ -341,6 +392,11 @@ public boolean containsValue(double pattern) { throw new NotImplementedException("Not contains value on Q Dictionary"); } + @Override + public boolean containsValue(double pattern, double[] reference) { + throw new NotImplementedException(); + } + @Override public long getNumberNonZeros(int[] counts, int nCol) { long nnz = 0; @@ -357,6 +413,11 @@ public long getNumberNonZeros(int[] counts, int nCol) { return nnz; } + @Override + public long getNumberNonZeros(int[] counts, double[] reference, int nRows) { + throw new NotImplementedException("not implemented yet"); + } + @Override public void addToEntry(Dictionary d, int fr, int to, int nCol) { throw new NotImplementedException("Not implemented yet"); @@ -387,6 +448,11 @@ public void aggregateCols(double[] c, Builtin fn, int[] colIndexes) { throw new NotImplementedException(); } + @Override + public void aggregateCols(double[] c, Builtin fn, int[] colIndexes, double[] reference) { + throw new NotImplementedException(); + } + @Override public ADictionary scaleTuples(int[] scaling, int nCol) { throw new NotImplementedException(); @@ -403,6 +469,11 @@ public ADictionary replace(double pattern, double replace, int nCol) { throw new NotImplementedException(); } + @Override + public ADictionary replace(double pattern, double replace, double[] reference) { + throw new NotImplementedException(); + } + @Override 
public ADictionary replaceZeroAndExtend(double replace, int nCol) { throw new NotImplementedException(); @@ -420,25 +491,38 @@ public void colProduct(double[] res, int[] counts, int[] colIndexes) { @Override public ADictionary applyBinaryRowOpLeftAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) { - // TODO Auto-generated method stub - return null; + throw new NotImplementedException(); } @Override public ADictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes) { - // TODO Auto-generated method stub - return null; + throw new NotImplementedException(); } @Override public ADictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes) { - // TODO Auto-generated method stub - return null; + throw new NotImplementedException(); } @Override public ADictionary applyBinaryRowOpRightAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) { - // TODO Auto-generated method stub - return null; + throw new NotImplementedException(); + } + + @Override + public ADictionary applyScalarOp(ScalarOperator op, double[] reference, double[] newReference) { + throw new NotImplementedException(); + } + + @Override + public ADictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes, double[] reference, + double[] newReference) { + throw new NotImplementedException(); + } + + @Override + public ADictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes, double[] reference, + double[] newReference) { + throw new NotImplementedException(); } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/MaterializeSort.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/MaterializeSort.java index 2d9c5b84308..341268b763b 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/MaterializeSort.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/MaterializeSort.java @@ -24,7 +24,7 @@ import org.apache.sysds.runtime.compress.utils.IntArrayList; public class MaterializeSort extends AInsertionSorter { - public static int CACHE_BLOCK = 1000; + public static int CACHE_BLOCK = 50000; /** a dense mapToData, that have a value for each row in the input. */ private final AMapToData md; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java index d3310fee72b..953ea49d858 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java @@ -25,12 +25,13 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; +import org.apache.sysds.runtime.data.DenseBlock; +import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; public abstract class AMapToData implements Serializable { - private static final long serialVersionUID = 100512759972844714L; - protected static final Log LOG = LogFactory.getLog(AMapToData.class.getName()); /** Number of unique values inside this map. */ @@ -44,7 +45,10 @@ public abstract class AMapToData implements Serializable { * @param nUnique number of unique values. 
 	 */
 	protected AMapToData(int nUnique) {
-		this.nUnique = nUnique;
+		if(nUnique + 1 < 0)
+			this.nUnique = Integer.MAX_VALUE;
+		else
+			this.nUnique = nUnique + 1;
 	}
 
 	/**
@@ -145,14 +149,63 @@ protected final void setUnique(int nUnique) {
 	/**
-	 * Pre aggregate a dense matrix m into pre, subject to only including a row segment and column segment.
+	 * Pre-aggregate a dense matrix m into preAV, subject to only including a row segment and column segment.
 	 *
-	 * @param m   The dense matrix values to preaggregate
-	 * @param pre The preAggregate to populate with the summed values of m
-	 * @param rl  The row start in m
-	 * @param ru  The row end in m
-	 * @param cl  The column start in m
-	 * @param cu  The column end in m
+	 * @param m     The dense matrix values to preaggregate
+	 * @param preAV The pre-aggregate double array to populate with the summed values of m
+	 * @param rl    The row start in m
+	 * @param ru    The row end in m
+	 * @param cl    The column start in m
+	 * @param cu    The column end in m
+	 */
+	public final void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu) {
+		final DenseBlock db = m.getDenseBlock();
+		if(rl == ru - 1) {
+			final double[] mV = db.values(rl);
+			final int off = db.pos(rl);
+			preAggregateDenseToRow(mV, off, preAV, cl, cu);
+		}
+		else {
+			preAggregateDenseRows(m, preAV, rl, ru, cl, cu);
+		}
+	}
+
+	protected abstract void preAggregateDenseToRow(double[] mV, int off, double[] preAV, int cl, int cu);
+
+	protected void preAggregateDenseRows(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu) {
+		LOG.warn("Inefficient multi-row pre-aggregation implementation in use");
+		final int nRow = m.getNumColumns();
+		final int nVal = getUnique() - 1;
+		final double[] mV = m.getDenseBlockValues();
+		final int blockSize = 4000;
+		for(int block = cl; block < cu; block += blockSize) {
+			final int blockEnd = Math.min(block + blockSize, nRow);
+			for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) {
+				final int offLeft = rowLeft * nRow;
+				for(int rc = block; rc < blockEnd; rc++) {
+					final int idx = getIndex(rc);
+					preAV[offOut + idx] += mV[offLeft + rc];
+				}
+			}
+		}
+	}
+
+	public abstract void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu,
+		AOffset indexes);
+
+	public abstract void preAggregateSparse(SparseBlock sb, double[] preAV, int rl, int ru, AOffset indexes);
+
+	/**
+	 * Get the number of occurrences of each unique value contained in this map.
+	 *
+	 * @param counts The count array to populate and return; its last entry accumulates the rows not materialized in
+	 *               this map.
+	 * @param nRows  The number of rows in the calling column group.
+	 * @return The populated counts array.
 	 */
-	public abstract void preAggregateDense(MatrixBlock m, MatrixBlock pre, int rl, int ru, int cl, int cu);
+	public int[] getCounts(int[] counts, int nRows) {
+		final int nonDefaultLength = size();
+		for(int i = 0; i < nonDefaultLength; i++)
+			counts[getIndex(i)]++;
+		counts[counts.length - 1] += nRows - nonDefaultLength;
+		return counts;
+	}
 
 	/**
 	 * Copy the values in this map into another mapping object.
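Editor's note: the nUnique + 1 stored by the constructor and the last slot written by getCounts follow one convention: the extra entry counts the rows that never appear in the mapping, i.e. the default tuple of an SDC-style group. A sketch of that convention with hypothetical names:

public class DefaultCountSketch {
	static int[] getCounts(int[] mapData, int nUnique, int nRows) {
		int[] counts = new int[nUnique + 1];
		for(int d : mapData) // one entry per materialized row
			counts[d]++;
		counts[nUnique] = nRows - mapData.length; // remaining rows are default
		return counts;
	}

	public static void main(String[] args) {
		// 2 distinct tuples over 3 materialized rows, 5 rows total
		System.out.println(java.util.Arrays.toString(getCounts(new int[] {0, 1, 0}, 2, 5)));
		// -> [2, 1, 2]
	}
}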
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java
index 678ee65619e..af81dc338a9 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java
@@ -24,7 +24,10 @@
 import java.io.IOException;
 import java.util.BitSet;
 
+import org.apache.commons.lang.NotImplementedException;
 import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory.MAP_TYPE;
+import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
+import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.utils.MemoryEstimates;
 
@@ -118,24 +121,26 @@ public static MapToBit readFields(DataInput in) throws IOException {
 	}
 
 	@Override
-	public void preAggregateDense(MatrixBlock m, MatrixBlock pre, int rl, int ru, int cl, int cu) {
-		final int nRow = m.getNumColumns();
-		final int nVal = pre.getNumColumns();
-		final double[] preAV = pre.getDenseBlockValues();
-		final double[] mV = m.getDenseBlockValues();
-		final int blockSize = 4000;
-		for(int block = cl; block < cu; block += blockSize) {
-			final int blockEnd = Math.min(block + blockSize, nRow);
-			for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) {
-				final int offLeft = rowLeft * nRow;
-				for(int rc = block; rc < blockEnd; rc++)
-					preAV[_data.get(rc) ? offOut + 1 : offOut] += mV[offLeft + rc];
-			}
-		}
+	protected void preAggregateDenseToRow(double[] mV, int off, double[] preAV, int cl, int cu) {
+		off += cl;
+		for(int rc = cl; rc < cu; rc++, off++)
+			preAV[_data.get(rc) ? 1 : 0] += mV[off];
+	}
+
+	@Override
+	public void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu,
+		AOffset indexes) {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public void preAggregateSparse(SparseBlock sb, double[] preAV, int rl, int ru, AOffset indexes) {
+		throw new NotImplementedException();
 	}
 
 	@Override
 	public int getUpperBoundValue() {
 		return 1;
 	}
+
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java
index 5bd1e645b47..537c45836f6 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java
@@ -25,6 +25,8 @@
 import java.util.Arrays;
 
 import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory.MAP_TYPE;
+import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
+import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.utils.MemoryEstimates;
 
@@ -98,9 +100,9 @@ public static MapToByte readFields(DataInput in) throws IOException {
 		return new MapToByte(unique, data);
 	}
 
-	public byte[] getBytes() {
-		return _data;
-	}
 
 	@Override
 	public void replace(int v, int r) {
@@ -125,24 +127,27 @@ public void copy(AMapToData d) {
 	}
 
 	@Override
-	public void preAggregateDense(MatrixBlock m, MatrixBlock pre, int rl, int ru, int cl, int cu) {
-		final int nRow = m.getNumColumns();
-		final int nVal = pre.getNumColumns();
-		final double[] preAV = pre.getDenseBlockValues();
-		final double[] mV = m.getDenseBlockValues();
-		final int blockSize = 4000;
-		for(int block =
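Editor's note: for MapToBit the single-row kernel collapses to two accumulators, one per bit value. A standalone sketch mirroring the loop above, using java.util.BitSet as the backing structure like MapToBit does:

import java.util.BitSet;

public class BitPreAggregateSketch {
	static double[] preAggregateRow(double[] mV, BitSet data, int cl, int cu) {
		final double[] preAV = new double[2];
		for(int rc = cl; rc < cu; rc++)
			preAV[data.get(rc) ? 1 : 0] += mV[rc]; // slot picked by the bit
		return preAV;
	}

	public static void main(String[] args) {
		BitSet bs = new BitSet();
		bs.set(1);
		bs.set(3);
		double[] row = {1, 2, 3, 4};
		// slot 0 <- 1 + 3, slot 1 <- 2 + 4
		System.out.println(java.util.Arrays.toString(preAggregateRow(row, bs, 0, 4)));
	}
}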
cl; block < cu; block += blockSize) { - final int blockEnd = Math.min(block + blockSize, nRow); - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) { - final int offLeft = rowLeft * nRow; - for(int rc = block; rc < blockEnd; rc++) { - final int idx = _data[rc] & 0xFF; - preAV[offOut + idx] += mV[offLeft + rc]; - } - } + protected void preAggregateDenseToRow(double[] mV, int off, double[] preAV, int cl, int cu) { + if(getUnique() < 127) { + for(int rc = cl; rc < cu; rc++) + preAV[_data[rc]] += mV[off + rc]; + } + else { + for(int rc = cl; rc < cu; rc++) + preAV[_data[rc] & 0xFF] += mV[off + rc]; } } + @Override + public final void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, AOffset indexes) { + indexes.preAggregateDenseMap(m, preAV, rl, ru, cl, cu, getUnique(), _data); + } + + @Override + public void preAggregateSparse(SparseBlock sb, double[] preAV, int rl, int ru, AOffset indexes){ + indexes.preAggregateSparseMap(sb, preAV, rl, ru, getUnique(), _data); + } + @Override public int getUpperBoundValue() { return 255; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java index d1fc0125a2a..249bc6ba50c 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java @@ -25,6 +25,8 @@ import java.util.Arrays; import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory.MAP_TYPE; +import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; +import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.utils.MemoryEstimates; @@ -107,29 +109,50 @@ public static MapToChar readFields(DataInput in) throws IOException { return new MapToChar(unique, data); } - public char[] getChars() { + protected char[] getChars() { return _data; } + private void preAggregateDenseToRowBy8(double[] mV, double[] preAV, int cl, int cu, int off) { + int h = (cu - cl) % 8; + off += cl; + for(int rc = cl; rc < cl + h; rc++, off++) + preAV[_data[rc]] += mV[off]; + for(int rc = cl + h; rc < cu; rc += 8, off += 8) { + int id1 = _data[rc], id2 = _data[rc + 1], id3 = _data[rc + 2], id4 = _data[rc + 3], id5 = _data[rc + 4], + id6 = _data[rc + 5], id7 = _data[rc + 6], id8 = _data[rc + 7]; + preAV[id1] += mV[off]; + preAV[id2] += mV[off + 1]; + preAV[id3] += mV[off + 2]; + preAV[id4] += mV[off + 3]; + preAV[id5] += mV[off + 4]; + preAV[id6] += mV[off + 5]; + preAV[id7] += mV[off + 6]; + preAV[id8] += mV[off + 7]; + } + } + @Override - public void preAggregateDense(MatrixBlock m, MatrixBlock pre, int rl, int ru, int cl, int cu) { - final int nRow = m.getNumColumns(); - final int nVal = pre.getNumColumns(); - final double[] preAV = pre.getDenseBlockValues(); - final double[] mV = m.getDenseBlockValues(); - final int blockSize = 4000; - for(int block = cl; block < cu; block += blockSize) { - final int blockEnd = Math.min(block + blockSize, nRow); - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) { - final int offLeft = rowLeft * nRow; - for(int rc = block; rc < blockEnd; rc++) { - final int idx = _data[rc]; - preAV[offOut + idx] += mV[offLeft + rc]; - } - } + protected void preAggregateDenseToRow(double[] mV, int off, double[] preAV, int cl, int cu) { + if(cu - cl > 1000) + preAggregateDenseToRowBy8(mV, preAV, cl, cu, off); + else { 
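Editor's note: preAggregateDenseToRowBy8 peels (cu - cl) % 8 iterations up front so the main loop runs a fixed eight cells per trip; the explicit unrolling is presumably there to keep the accumulation pipeline busy on long rows (an assumption, the patch does not state the motivation). A compact sketch of the remainder-first shape, with the eight explicit statements folded into an inner loop for brevity:

public class UnrolledAccumulateSketch {
	static void accumulate(double[] preAV, char[] map, double[] mV, int cl, int cu) {
		final int h = (cu - cl) % 8;
		for(int rc = cl; rc < cl + h; rc++) // peeled remainder first
			preAV[map[rc]] += mV[rc];
		for(int rc = cl + h; rc < cu; rc += 8) // body handles 8 cells per trip
			for(int i = 0; i < 8; i++) // stands in for the eight unrolled lines
				preAV[map[rc + i]] += mV[rc + i];
	}

	public static void main(String[] args) {
		double[] preAV = new double[2];
		char[] map = new char[10]; // all zeros -> everything lands in slot 0
		double[] mV = new double[10];
		java.util.Arrays.fill(mV, 1.0);
		accumulate(preAV, map, mV, 0, 10);
		System.out.println(preAV[0]); // 10.0
	}
}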
+ off += cl; + for(int rc = cl; rc < cu; rc++, off++) + preAV[_data[rc]] += mV[off]; } } + @Override + public void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, AOffset indexes) { + indexes.preAggregateDenseMap(m, preAV, rl, ru, cl, cu, getUnique(), _data); + } + + @Override + public void preAggregateSparse(SparseBlock sb, double[] preAV, int rl, int ru, AOffset indexes) { + indexes.preAggregateSparseMap(sb, preAV, rl, ru, getUnique(), _data); + } + @Override public int getUpperBoundValue() { return Character.MAX_VALUE; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java index 8a706880e96..de8d95f6a3d 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java @@ -79,7 +79,7 @@ public static AMapToData resize(AMapToData d, int numTuples) { AMapToData ret; if(d instanceof MapToBit) return d; - else if(numTuples <= 1) + else if(numTuples <= 2) ret = new MapToBit(numTuples, size); else if(d instanceof MapToByte) return d; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java index b991ccb7e0f..6a518573a54 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java @@ -24,7 +24,10 @@ import java.io.IOException; import java.util.Arrays; +import org.apache.commons.lang.NotImplementedException; import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory.MAP_TYPE; +import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; +import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.utils.MemoryEstimates; @@ -106,22 +109,20 @@ public static MapToInt readFields(DataInput in) throws IOException { } @Override - public void preAggregateDense(MatrixBlock m, MatrixBlock pre, int rl, int ru, int cl, int cu) { - final int nRow = m.getNumColumns(); - final int nVal = pre.getNumColumns(); - final double[] preAV = pre.getDenseBlockValues(); - final double[] mV = m.getDenseBlockValues(); - final int blockSize = 4000; - for(int block = cl; block < cu; block += blockSize) { - final int blockEnd = Math.min(block + blockSize, nRow); - for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) { - final int offLeft = rowLeft * nRow; - for(int rc = block; rc < blockEnd; rc++) { - final int idx = _data[rc]; - preAV[offOut + idx] += mV[offLeft + rc]; - } - } - } + protected void preAggregateDenseToRow(double[] mV, int off, double[] preAV, int cl, int cu) { + off += cl; + for(int rc = cl; rc < cu; rc++, off++) + preAV[_data[rc]] += mV[off]; + } + + @Override + public void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, AOffset indexes) { + throw new NotImplementedException(); + } + + @Override + public void preAggregateSparse(SparseBlock sb, double[] preAV, int rl, int ru, AOffset indexes) { + throw new NotImplementedException(); } @Override diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AIterator.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AIterator.java index 17a502629d5..1c7e81e2057 100644 --- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AIterator.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AIterator.java
@@ -49,13 +49,6 @@ protected AIterator(int index, int dataIndex, int offset) {
 	 */
 	public abstract void next();
 
-	/**
-	 * Get a boolean specifying if the iterator is done
-	 *
-	 * @return A boolean that is true if there are more values contained in the Iterator.
-	 */
-	public abstract boolean hasNext();
-
 	/**
 	 * Get the current index value, note this correspond to a row index in the original matrix.
 	 *
@@ -66,25 +59,38 @@ public int value() {
 	}
 
 	/**
-	 * Get the current index value and increment the pointers
+	 * Find out if the current offset does not exceed the given upper bound.
 	 *
-	 * @return The current value pointed at.
+	 * @param ub The upper bound that the offset must stay below
+	 * @return true if the current offset is strictly below ub.
 	 */
-	public int valueAndIncrement() {
-		int x = offset;
-		next();
-		return x;
+	public boolean isNotOver(int ub) {
+		return offset < ub;
 	}
 
 	/**
 	 * Get the current data index associated with the index returned from value.
 	 *
-	 * @return The data Index.
+	 * This index points to a position in the mapToData object, which in turn can be used to look up the dictionary
+	 * entry in ADictionary.
+	 *
+	 * @return The Data Index.
 	 */
 	public int getDataIndex() {
 		return dataIndex;
 	}
 
+	/**
+	 * Get the current offset index, which points into the underlying offsets list.
+	 *
+	 * This is available for debugging purposes and is not intended for use by calling classes.
+	 *
+	 * @return The Offsets Index.
+	 */
+	public int getOffsetsIndex() {
+		return index;
+	}
+
 	/**
 	 * Get the current data index and increment the pointers using the next operator.
 	 *
@@ -99,17 +105,23 @@ public int getDataIndexAndIncrement() {
 	/**
 	 * Skip values until index is achieved.
 	 *
-	 * @param index The index to skip to.
-	 * @return the index that follows or are equal to the skip to index.
+	 * @param idx The index to skip to.
+	 * @return The first offset that is greater than or equal to idx.
 	 */
-	public int skipTo(int index) {
-		while(hasNext() && offset < index)
-			next();
-		return offset;
-	}
+	public abstract int skipTo(int idx);
 
 	/**
 	 * Copy the iterator with the current values.
 	 */
 	public abstract AIterator clone();
+
+	/**
+	 * Unsafe version of equals; it should only compare iterators stemming from the same Offset object.
+	 *
+	 * @param o The iterator to compare against this one
+	 * @return true if both iterators point to the same position.
+	 */
+	public boolean equals(AIterator o) {
+		return o.index == this.index;
+	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java
index 27816009a25..2f51e7f7442 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java
@@ -21,12 +21,13 @@
 import java.io.DataOutput;
 import java.io.IOException;
 import java.io.Serializable;
-import java.lang.ref.SoftReference;
-import java.util.HashMap;
-import java.util.Map;
 
+import org.apache.commons.lang.NotImplementedException;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 
 /**
  * Offset list encoder interface.
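Editor's note: the revised AIterator contract drops hasNext(): next() always advances, skipTo(idx) stops at the first offset greater than or equal to idx, and callers terminate against getOffsetToLast() instead. A toy iterator over a plain int[] illustrating the calling pattern (not the real class hierarchy):

public class ToyOffsetIterator {
	private final int[] offsets; // materialized row indexes
	private int index = 0;

	ToyOffsetIterator(int[] offsets) {
		this.offsets = offsets;
	}

	int value() {
		return offsets[index];
	}

	void next() {
		index++;
	}

	int skipTo(int idx) {
		while(offsets[index] < idx && index < offsets.length - 1)
			next();
		return offsets[index];
	}

	public static void main(String[] args) {
		ToyOffsetIterator it = new ToyOffsetIterator(new int[] {3, 10, 300});
		System.out.println(it.skipTo(5)); // 10: first offset >= 5
		final int last = 300; // what getOffsetToLast() would return
		while(it.value() < last) // loop shape used instead of hasNext()
			it.next();
		System.out.println(it.value()); // 300
	}
}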
@@ -39,9 +40,14 @@
  */
 public abstract class AOffset implements Serializable {
 
-	private static final long serialVersionUID = -4143271285905723425L;
 	protected static final Log LOG = LogFactory.getLog(AOffset.class.getName());
-	protected SoftReference<Map<Integer, AIterator>> skipIterators;
+
+	private ThreadLocal<OffsetCache> cacheRow = new ThreadLocal<OffsetCache>() {
+		@Override
+		protected OffsetCache initialValue() {
+			return null;
+		}
+	};
 
 	/**
 	 * Get an iterator of the offsets.
@@ -57,16 +63,23 @@ public abstract class AOffset implements Serializable {
 	 * @return AIterator that iterate through index and dictionary offset values.
 	 */
 	public AIterator getIterator(int row) {
-		if(skipIterators != null) {
-			Map<Integer, AIterator> sk = skipIterators.get();
-			AIterator it = sk.getOrDefault(row, null);
-			if(it != null)
-				return it.clone();
-		}
-		AIterator it = getIterator();
+		if(row <= getOffsetToFirst())
+			return getIterator();
+		else if(row >= getOffsetToLast())
+			return null;
+
+		// try the cache first.
+		OffsetCache c = cacheRow.get();
+		if(c != null && c.row == row)
+			return c.it.clone();
+
+		// Use the cached iterator if it is closer to the queried row.
+		AIterator it = c != null && c.row < row ? c.it.clone() : getIterator();
 		it.skipTo(row);
+		// cache this new iterator.
 		cacheIterator(it.clone(), row);
 		return it;
 	}
 
 	/**
@@ -76,14 +89,18 @@ public AIterator getIterator(int row) {
 	 * @param row The row index to cache the iterator as.
 	 */
 	public void cacheIterator(AIterator it, int row) {
-		if(skipIterators != null) {
-			Map<Integer, AIterator> sk = skipIterators.get();
-			sk.put(row, it);
+		if(it == null)
+			return;
+		OffsetCache c = cacheRow.get();
+		if(c == null) {
+			c = new OffsetCache();
+			c.it = it;
+			c.row = row;
+			cacheRow.set(c);
 		}
 		else {
-			Map<Integer, AIterator> nsk = new HashMap<>();
-			nsk.put(row, it.clone());
-			skipIterators = new SoftReference<>(nsk);
+			c.it = it;
+			c.row = row;
 		}
 	}
 
@@ -98,6 +115,20 @@ public void cacheIterator(AIterator it, int row) {
 	 */
 	public abstract void write(DataOutput out) throws IOException;
 
+	/**
+	 * Get the offset to the first index
+	 *
+	 * @return The first index offset
+	 */
+	public abstract int getOffsetToFirst();
+
+	/**
+	 * Get the offset to the last value
+	 *
+	 * @return The last value's offset
+	 */
+	public abstract int getOffsetToLast();
+
 	/**
 	 * Get the in memory size of the Offset object
 	 *
@@ -119,17 +150,200 @@ public void cacheIterator(AIterator it, int row) {
 	 */
 	public abstract int getSize();
 
+	/**
+	 * Get the length of the underlying offsets list.
+	 *
+	 * @return The number of offsets.
+	 */
+	public abstract int getOffsetsLength();
+
+	public final void preAggregateDenseMap(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, int nVal,
+		char[] data) {
+		// iterator skipped ahead to the start of the column range.
+		final AIterator it = getIterator(cl);
+		if(it == null)
+			return;
+		else if(it.offset > cu)
+			cacheIterator(it, cu); // cache this iterator.
+		else if(rl == ru - 1) {
+			final DenseBlock db = m.getDenseBlock();
+			final double[] mV = db.values(rl);
+			final int off = db.pos(rl);
+			preAggregateDenseMapRow(mV, off, preAV, cu, nVal, data, it);
+		}
+		else {
+			final DenseBlock db = m.getDenseBlock();
+			preAggregateDenseMapRows(db, preAV, rl, ru, cl, cu, nVal, data);
+		}
+	}
+
+	public final void preAggregateDenseMap(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, int nVal,
+		byte[] data) {
+		// iterator skipped ahead to the start of the column range.
+		final AIterator it = getIterator(cl);
+		if(it == null)
+			return;
+		else if(it.offset > cu)
+			cacheIterator(it, cu); // cache this iterator.
+ else if(rl == ru - 1) { + final DenseBlock db = m.getDenseBlock(); + final double[] mV = db.values(rl); + final int off = db.pos(rl); + preAggregateDenseMapRow(mV, off, preAV, cu, nVal, data, it); + } + else { + final DenseBlock db = m.getDenseBlock(); + preAggregateDenseMapRows(db, preAV, rl, ru, cl, cu, nVal, data); + } + } + + protected abstract void preAggregateDenseMapRow(double[] mV, int off, double[] preAV, int cu, int nVal, byte[] data, + AIterator it); + + protected abstract void preAggregateDenseMapRow(double[] mV, int off, double[] preAV, int cu, int nVal, char[] data, + AIterator it); + + protected void preAggregateDenseMapRows(DenseBlock db, double[] preAV, int rl, int ru, int cl, int cu, int nVal, + char[] data) { + + LOG.warn("Inefficient implementation of Preaggregate DenseMap multi row."); + throw new NotImplementedException("MultiRow Preaggregation not supported yet"); + } + + protected void preAggregateDenseMapRows(DenseBlock db, double[] preAV, int rl, int ru, int cl, int cu, int nVal, + byte[] data) { + LOG.warn("Inefficient implementation of Preaggregate DenseMap multi row."); + throw new NotImplementedException("MultiRow Preaggregation not supported yet"); + } + + public final void preAggregateSparseMap(SparseBlock sb, double[] preAV, int rl, int ru, int nVal, char[] data) { + final AIterator it = getIterator(); + if(rl == ru - 1) + preAggregateSparseMapRow(sb, preAV, rl, nVal, data, it); + else + throw new NotImplementedException("MultiRow Preaggregation not supported yet"); + } + + public final void preAggregateSparseMap(SparseBlock sb, double[] preAV, int rl, int ru, int nVal, byte[] data) { + final AIterator it = getIterator(); + if(rl == ru - 1) + preAggregateSparseMapRow(sb, preAV, rl, nVal, data, it); + else + throw new NotImplementedException("MultiRow Preaggregation not supported yet"); + } + + public final void preAggregateSparseMap(SparseBlock sb, double[] preAV, int rl, int ru, int cl, int cu, int nVal, + char[] data) { + // multi row iterator. + final AIterator it = getIterator(cl); + if(it == null) + return; + else if(it.offset > cu) + cacheIterator(it, cu); // cache this iterator. + else if(rl == ru - 1) + preAggregateSparseMapRow(sb, preAV, rl, nVal, data, it); + else + throw new NotImplementedException("MultiRow Preaggregation not supported yet"); + + } + + public final void preAggregateSparseMap(SparseBlock sb, double[] preAV, int rl, int ru, int cl, int cu, int nVal, + byte[] data) { + // multi row iterator. + final AIterator it = getIterator(cl); + if(it == null) + return; + else if(it.offset > cu) + cacheIterator(it, cu); // cache this iterator. 
+ else if(rl == ru - 1) + preAggregateSparseMapRow(sb, preAV, rl, nVal, data, it); + else + throw new NotImplementedException("MultiRow Preaggregation not supported yet"); + + } + + protected void preAggregateSparseMapRow(SparseBlock sb, double[] preAV, int r, int nVal, byte[] data, AIterator it) { + final int apos = sb.pos(r); + final int alen = sb.size(r) + apos; + final int[] aix = sb.indexes(r); + final double[] avals = sb.values(r); + + final int maxId = data.length - 1; + + int j = apos; + while(true) { + final int idx = aix[j]; + if(idx == it.offset) { + preAV[data[it.dataIndex] & 0xFF] += avals[j++]; + if(j >= alen || it.dataIndex >= maxId) + break; + it.next(); + } + else if(idx < it.offset) { + j++; + if(j >= alen) + break; + } + else { + if(it.dataIndex >= maxId) + break; + it.next(); + } + } + } + + protected void preAggregateSparseMapRow(SparseBlock sb, double[] preAV, int r, int nVal, char[] data, AIterator it) { + final int apos = sb.pos(r); + final int alen = sb.size(r) + apos; + final int[] aix = sb.indexes(r); + final double[] avals = sb.values(r); + + final int maxId = data.length - 1; + + int j = apos; + while(true) { + final int idx = aix[j]; + if(idx == it.offset) { + preAV[data[it.dataIndex]] += avals[j++]; + if(j >= alen || it.dataIndex >= maxId) + break; + it.next(); + } + else if(idx < it.offset) { + j++; + if(j >= alen) + break; + } + else { + if(it.dataIndex >= maxId) + break; + it.next(); + } + } + } + @Override public String toString() { StringBuilder sb = new StringBuilder(); - AIterator i = getIterator(); sb.append(this.getClass().getSimpleName()); - sb.append(" ["); - sb.append(i.valueAndIncrement()); - - while(i.hasNext()) - sb.append(", " + i.valueAndIncrement()); + final AIterator it = getIterator(); + final int last = getOffsetToLast(); + sb.append("["); + while(it.offset < last) { + sb.append(it.offset); + sb.append(", "); + it.next(); + } + sb.append(it.offset); sb.append("]"); return sb.toString(); } + + protected static class OffsetCache { + protected AIterator it = null; + protected int row = -1; + + protected OffsetCache() { + } + } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetByte.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetByte.java index 29133cbd758..ebb29df1900 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetByte.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetByte.java @@ -21,18 +21,18 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; -import java.util.Arrays; -import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.utils.MemoryEstimates; public class OffsetByte extends AOffset { private static final long serialVersionUID = -4716104973912491790L; + private static final int maxV = 255; - private final static int maxV = 255; private final byte[] offsets; private final int offsetToFirst; + private final int offsetToLast; + private final boolean noOverHalf; public OffsetByte(int[] indexes) { this(indexes, 0, indexes.length); @@ -41,21 +41,22 @@ public OffsetByte(int[] indexes) { public OffsetByte(int[] indexes, int apos, int alen) { int endSize = 0; offsetToFirst = indexes[apos]; + offsetToLast = indexes[alen - 1]; int ov = offsetToFirst; + // find the size of the array for(int i = apos + 1; i < alen; i++) { final int nv = indexes[i]; - endSize += 1 + (nv - ov) / maxV; + endSize += 1 + (nv - ov - 1) / maxV; ov = nv; } offsets = new 
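Editor's note: both preAggregateSparseMapRow variants are a sorted merge. The sparse row's column indexes and the group's offsets are walked in lock-step, and a value is accumulated only where they coincide. The same logic over plain arrays:

public class SparseOffsetMergeSketch {
	static double[] merge(int[] aix, double[] avals, int[] offsets, int[] map, int nVal) {
		final double[] preAV = new double[nVal];
		int j = 0; // position in the sparse row
		int k = 0; // position in the offsets
		while(j < aix.length && k < offsets.length) {
			if(aix[j] == offsets[k])
				preAV[map[k++]] += avals[j++]; // indexes coincide: accumulate
			else if(aix[j] < offsets[k])
				j++; // sparse entry not covered by the group
			else
				k++; // offset has no value in this sparse row
		}
		return preAV;
	}

	public static void main(String[] args) {
		// sparse row: cols {2, 5, 9}; group offsets: {5, 9}; map -> dict entries {0, 1}
		double[] r = merge(new int[] {2, 5, 9}, new double[] {1, 2, 3}, new int[] {5, 9}, new int[] {0, 1}, 2);
		System.out.println(java.util.Arrays.toString(r)); // [2.0, 3.0]
	}
}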
byte[endSize]; ov = offsetToFirst; int p = 0; + // populate the array for(int i = apos + 1; i < alen; i++) { final int nv = indexes[i]; final int offsetSize = nv - ov; - if(offsetSize == 0) - throw new DMLCompressionException("Invalid difference between cells :\n" + Arrays.toString(indexes)); final int div = offsetSize / maxV; final int mod = offsetSize % maxV; if(mod == 0) { @@ -69,11 +70,30 @@ public OffsetByte(int[] indexes, int apos, int alen) { ov = nv; } + boolean noOverHalf = true; + for(byte b : offsets) + if(b < 0) { + noOverHalf = false; + break; + } + this.noOverHalf = noOverHalf; } - private OffsetByte(byte[] offsets, int offsetToFirst) { + protected OffsetByte(byte[] offsets, int offsetToFirst, int offsetToLast) { this.offsets = offsets; this.offsetToFirst = offsetToFirst; + this.offsetToLast = offsetToLast; + this.noOverHalf = getNoOverHalf(); + } + + private boolean getNoOverHalf() { + boolean noOverHalf = true; + for(byte b : offsets) + if(b < 0) { + noOverHalf = false; + break; + } + return noOverHalf; } @Override @@ -92,7 +112,9 @@ public void write(DataOutput out) throws IOException { @Override public long getInMemorySize() { - return getInMemorySize(offsets.length); + long size = 16 + 4 + 4 + 8; // object header plus ints plus reference + size += MemoryEstimates.byteArrayCost(offsets.length); + return size; } @Override @@ -103,29 +125,288 @@ public long getExactSizeOnDisk() { @Override public int getSize() { int size = 1; - for(byte b : offsets) { + for(byte b : offsets) if(b != 0) size++; - } + return size; } - public static long getInMemorySize(int length) { - long size = 16 + 4 + 8; // object header plus int plus reference - size += MemoryEstimates.byteArrayCost(length); + @Override + public int getOffsetToFirst() { + return offsetToFirst; + } + + @Override + public int getOffsetToLast() { + return offsetToLast; + } + + @Override + public int getOffsetsLength() { + return offsets.length; + } + + public static long estimateInMemorySize(int nOffs, int nRows) { + long size = 16 + 4 + 4 + 8; // object header plus int plus reference + size += MemoryEstimates.byteArrayCost(Math.max(nOffs, nRows / maxV)); return size; } public static OffsetByte readFields(DataInput in) throws IOException { - int offsetToFirst = in.readInt(); - int offsetsLength = in.readInt(); - byte[] offsets = new byte[offsetsLength]; + final int offsetToFirst = in.readInt(); + final int offsetsLength = in.readInt(); + + final byte[] offsets = new byte[offsetsLength]; + int offsetToLast = offsetToFirst; for(int i = 0; i < offsetsLength; i++) { offsets[i] = in.readByte(); + offsetToLast += offsets[i] & 0xFF; + } + return new OffsetByte(offsets, offsetToFirst, offsetToLast); + } + + @Override + protected final void preAggregateDenseMapRow(double[] mV, int off, double[] preAV, int cu, int nVal, byte[] data, + AIterator it) { + IterateByteOffset itb = (IterateByteOffset) it; + final boolean noZero = offsets.length == data.length - 1; + if(cu < offsetToLast + 1) { + if(noOverHalf && noZero && nVal < 127) + preAggregateDenseByteMapRowBelowEndAndNoZeroNoOverHalfAlsoData(mV, off, preAV, cu, data, itb); + else if(noOverHalf && noZero) + preAggregateDenseByteMapRowBelowEndAndNoZeroNoOverHalf(mV, off, preAV, cu, data, itb); + else if(noZero) + preAggregateDenseByteMapRowBelowEndAndNoZero(mV, off, preAV, cu, data, itb); + else + preAggregateDenseByteMapRowBelowEnd(mV, off, preAV, cu, data, itb); + cacheIterator(itb, cu); + } + else if(noZero) + preAggregateDenseByteMapRowNoZero(mV, off, preAV, data, itb); + else + 
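Editor's note: the constructor's size and population loops implement a byte delta encoding. Each entry stores the gap to the previous offset, and gaps above 255 are split into zero filler bytes (each meaning "advance 255, emit nothing") plus a remainder. A standalone sketch with a worked example:

public class ByteOffsetEncodeSketch {
	static byte[] encode(int[] indexes) {
		final int maxV = 255;
		int size = 0;
		for(int i = 1; i < indexes.length; i++) // same size formula as above
			size += 1 + (indexes[i] - indexes[i - 1] - 1) / maxV;
		final byte[] offsets = new byte[size];
		int p = 0;
		for(int i = 1; i < indexes.length; i++) {
			final int gap = indexes[i] - indexes[i - 1];
			final int div = gap / maxV;
			final int mod = gap % maxV;
			if(mod == 0) { // exact multiple: one fewer filler, then a full 255 step
				for(int j = 0; j < div - 1; j++)
					offsets[p++] = 0;
				offsets[p++] = (byte) maxV;
			}
			else {
				for(int j = 0; j < div; j++)
					offsets[p++] = 0;
				offsets[p++] = (byte) mod;
			}
		}
		return offsets;
	}

	public static void main(String[] args) {
		// offsets {3, 10, 300}: gaps 7 and 290 -> bytes [7, 0, 35]
		System.out.println(java.util.Arrays.toString(encode(new int[] {3, 10, 300})));
	}
}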
preAggregateDenseByteMapRow(mV, off, preAV, data, itb); + + } + + private final void preAggregateDenseByteMapRow(double[] mV, int off, double[] preAV, byte[] data, + IterateByteOffset it) { + final int maxId = data.length - 1; + + int offset = it.offset + off; + int index = it.index; + int dataIndex = it.dataIndex; + + preAV[data[dataIndex] & 0xFF] += mV[offset]; + while(dataIndex < maxId) { + byte v = offsets[index]; + while(v == 0) { + offset += maxV; + index++; + v = offsets[index]; + } + offset += v & 0xFF; + index++; + dataIndex++; + preAV[data[dataIndex] & 0xFF] += mV[offset]; + } + } + + private final void preAggregateDenseByteMapRowNoZero(double[] mV, int off, double[] preAV, byte[] data, + IterateByteOffset it) { + + int offset = it.offset + off; + int index = it.index; + + while(index < offsets.length) { + preAV[data[index] & 0xFF] += mV[offset]; + offset += offsets[index++] & 0xFF; + } + // process straggler index. + preAV[data[index] & 0xFF] += mV[offset]; + } + + private void preAggregateDenseByteMapRowBelowEnd(double[] mV, int off, double[] preAV, int cu, byte[] data, + IterateByteOffset it) { + + cu += off; + it.offset += off; + while(it.offset < cu) { + preAV[data[it.dataIndex] & 0xFF] += mV[it.offset]; + byte v = offsets[it.index]; + while(v == 0) { + it.offset += maxV; + it.index++; + v = offsets[it.index]; + } + it.offset += v & 0xFF; + it.index++; + it.dataIndex++; + } + it.offset -= off; + } + + private void preAggregateDenseByteMapRowBelowEndAndNoZero(double[] mV, int off, double[] preAV, int cu, byte[] data, + IterateByteOffset it) { + + int offset = it.offset + off; + int index = it.index; + + cu += off; + + while(offset < cu) { + preAV[data[index] & 0xFF] += mV[offset]; + offset += offsets[index++] & 0xFF; + } + + it.offset = offset - off; + it.dataIndex = index; + it.index = index; + } + + private final void preAggregateDenseByteMapRowBelowEndAndNoZeroNoOverHalf(double[] mV, int off, double[] preAV, + int cu, byte[] data, IterateByteOffset it) { + int offset = it.offset + off; + int index = it.index; + + cu += off; + + while(offset < cu) { + preAV[data[index] & 0xFF] += mV[offset]; + offset += offsets[index++]; + } + + it.offset = offset - off; + it.dataIndex = index; + it.index = index; + } + + private final void preAggregateDenseByteMapRowBelowEndAndNoZeroNoOverHalfAlsoData(double[] mV, int off, + double[] preAV, int cu, byte[] data, IterateByteOffset it) { + int offset = it.offset + off; + int index = it.index; + + cu += off; + + while(offset < cu) { + preAV[data[index]] += mV[offset]; + offset += offsets[index++]; + } + + it.offset = offset - off; + it.dataIndex = index; + it.index = index; + } + + @Override + protected final void preAggregateDenseMapRow(double[] mV, int off, double[] preAV, int cu, int nVal, char[] data, + AIterator it) { + IterateByteOffset itb = (IterateByteOffset) it; + final boolean noZero = offsets.length == data.length - 1; + if(cu < offsetToLast + 1) { + if(noOverHalf && noZero) + preAggregateDenseCharMapRowBelowEndAndNoZeroNoOverHalf(mV, off, preAV, cu, data, itb); + else if(noZero) + preAggregateDenseCharMapRowBelowEndAndNoZero(mV, off, preAV, cu, data, itb); + else + preAggregateDenseCharMapRowBelowEnd(mV, off, preAV, cu, data, itb); + cacheIterator(itb, cu); + } + else if(noZero) + preAggregateDenseCharMapRowNoZero(mV, off, preAV, data, itb); + else + preAggregateDenseCharMapRow(mV, off, preAV, data, itb); + } + + private void preAggregateDenseCharMapRow(double[] mV, int off, double[] preAV, char[] data, IterateByteOffset it) { + 
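Editor's note: the kernel selection above is driven by two precomputed flags; as far as the code shows, noZero (offsets.length == data.length - 1) means no filler bytes exist so a single counter can drive both the offsets and the data array, and noOverHalf means every delta fits in 0..127 so the & 0xFF widening mask can be dropped. The mask matters because Java bytes are signed:

public class ByteMaskSketch {
	public static void main(String[] args) {
		byte delta = (byte) 200; // a gap above 127 is stored with the sign bit set
		System.out.println(delta); // -56: using it directly would walk backwards
		System.out.println(delta & 0xFF); // 200: the mask restores the unsigned gap
	}
}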
final int maxId = data.length - 1; + int offset = it.offset + off; + int index = it.index; + int dataIndex = it.dataIndex; + + preAV[data[dataIndex]] += mV[offset]; + while(dataIndex < maxId) { + byte v = offsets[index]; + while(v == 0) { + offset += maxV; + index++; + v = offsets[index]; + } + offset += v & 0xff; + index++; + dataIndex++; + preAV[data[dataIndex]] += mV[offset]; } - return new OffsetByte(offsets, offsetToFirst); } + private void preAggregateDenseCharMapRowNoZero(double[] mV, int off, double[] preAV, char[] data, + IterateByteOffset it) { + + int offset = it.offset + off; + int index = it.index; + while(index < offsets.length) { + preAV[data[index]] += mV[offset]; + offset += offsets[index++] & 0xFF; + } + preAV[data[index]] += mV[offset]; + } + + private void preAggregateDenseCharMapRowBelowEnd(double[] mV, int off, double[] preAV, int cu, char[] data, + IterateByteOffset it) { + + cu += off; + it.offset += off; + while(it.offset < cu) { + preAV[data[it.dataIndex]] += mV[it.offset]; + byte v = offsets[it.index]; + while(v == 0) { + it.offset += maxV; + it.index++; + v = offsets[it.index]; + } + it.offset += v & 0xFF; + it.index++; + it.dataIndex++; + } + it.offset -= off; + } + + private void preAggregateDenseCharMapRowBelowEndAndNoZero(double[] mV, int off, double[] preAV, int cu, char[] data, + IterateByteOffset it) { + int offset = it.offset + off; + int index = it.index; + + cu += off; + + while(offset < cu) { + preAV[data[index]] += mV[offset]; + offset += offsets[index++] & 0xFF; + } + + it.offset = offset - off; + it.dataIndex = index; + it.index = index; + } + + private final void preAggregateDenseCharMapRowBelowEndAndNoZeroNoOverHalf(double[] mV, int off, double[] preAV, + int cu, char[] data, IterateByteOffset it) { + int offset = it.offset + off; + int index = it.index; + + cu += off; + + while(offset < cu) { + preAV[data[index]] += mV[offset]; + offset += offsets[index++]; + } + + it.offset = offset - off; + it.dataIndex = index; + it.index = index; + } + + + private class IterateByteOffset extends AIterator { private IterateByteOffset() { @@ -138,26 +419,22 @@ private IterateByteOffset(int index, int dataIndex, int offset) { @Override public void next() { - if(index >= offsets.length) { - index++; - dataIndex++; - return; - } - - final byte v = offsets[index++]; - if(v == 0) { + byte v = offsets[index]; + while(v == 0) { offset += maxV; - next(); - } - else { - dataIndex++; - offset += v & 0xFF; + index++; + v = offsets[index]; } + offset += v & 0xFF; + index++; + dataIndex++; } @Override - public boolean hasNext() { - return index <= offsets.length; + public int skipTo(int idx) { + while(offset < idx && index < offsets.length) + next(); + return offset; } @Override @@ -165,4 +442,5 @@ public IterateByteOffset clone() { return new IterateByteOffset(index, dataIndex, offset); } } + } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetChar.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetChar.java index c1c2930c850..dda7ab9e1da 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetChar.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetChar.java @@ -21,19 +21,17 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; -import java.util.Arrays; -import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.utils.MemoryEstimates; public class OffsetChar extends AOffset { private static final 
long serialVersionUID = -1192266421395964882L; - - private final static int maxV = (int) Character.MAX_VALUE; + private static final int maxV = (int) Character.MAX_VALUE; private final char[] offsets; private final int offsetToFirst; + private final int offsetToLast; public OffsetChar(int[] indexes) { this(indexes, 0, indexes.length); @@ -42,21 +40,20 @@ public OffsetChar(int[] indexes) { public OffsetChar(int[] indexes, int apos, int alen) { int endSize = 0; offsetToFirst = indexes[apos]; + offsetToLast = indexes[alen - 1]; int ov = offsetToFirst; - for(int i = apos+1; i < alen; i++) { + for(int i = apos + 1; i < alen; i++) { final int nv = indexes[i]; - endSize += 1 + (nv - ov) / maxV; + endSize += 1 + (nv - ov - 1) / maxV; ov = nv; } offsets = new char[endSize]; ov = offsetToFirst; int p = 0; - for(int i = apos+1; i < alen; i++) { + for(int i = apos + 1; i < alen; i++) { final int nv = indexes[i]; final int offsetSize = (nv - ov); - if(offsetSize == 0) - throw new DMLCompressionException("Invalid difference between cells :\n" + Arrays.toString(indexes)); final int div = offsetSize / maxV; final int mod = offsetSize % maxV; if(mod == 0) { @@ -72,9 +69,10 @@ public OffsetChar(int[] indexes, int apos, int alen) { } } - private OffsetChar(char[] offsets, int offsetToFirst) { + private OffsetChar(char[] offsets, int offsetToFirst, int offsetToLast) { this.offsets = offsets; this.offsetToFirst = offsetToFirst; + this.offsetToLast = offsetToLast; } @Override @@ -93,7 +91,9 @@ public void write(DataOutput out) throws IOException { @Override public long getInMemorySize() { - return getInMemorySize(offsets.length); + long size = 16 + 4 + 8; // object header plus int plus reference + size += MemoryEstimates.charArrayCost(offsets.length); + return size; } @Override @@ -111,22 +111,69 @@ public int getSize() { return size; } + @Override + public int getOffsetToFirst() { + return offsetToFirst; + } + + @Override + public int getOffsetToLast() { + return offsetToLast; + } + + @Override + public int getOffsetsLength() { + return offsets.length; + } + public static OffsetChar readFields(DataInput in) throws IOException { - int offsetToFirst = in.readInt(); - int offsetsLength = in.readInt(); - char[] offsets = new char[offsetsLength]; + final int offsetToFirst = in.readInt(); + final int offsetsLength = in.readInt(); + final char[] offsets = new char[offsetsLength]; + int offsetToLast = offsetToFirst; for(int i = 0; i < offsetsLength; i++) { offsets[i] = in.readChar(); + offsetToLast += offsets[i]; } - return new OffsetChar(offsets, offsetToFirst); + return new OffsetChar(offsets, offsetToFirst, offsetToLast); } - public static long getInMemorySize(int length) { + public static long estimateInMemorySize(int nOffs, int nRows) { long size = 16 + 4 + 8; // object header plus int plus reference - size += MemoryEstimates.charArrayCost(length - 1); + size += MemoryEstimates.charArrayCost(Math.max(nOffs, nRows / maxV)); return size; } + @Override + protected final void preAggregateDenseMapRow(double[] mV, int off, double[] preAV, int cu, int nVal, byte[] data, + AIterator it) { + final int maxId = data.length - 1; + while(it.isNotOver(cu)) { + final int dx = it.getDataIndex(); + preAV[data[dx] & 0xFF] += mV[off + it.value()]; + if(dx < maxId) + it.next(); + else + break; + } + cacheIterator(it, cu); + } + + @Override + protected final void preAggregateDenseMapRow(double[] mV, int off, double[] preAV, int cu, int nVal, char[] data, + AIterator it) { + final int maxId = data.length - 1; + while(it.isNotOver(cu)) 
{ + final int dx = it.getDataIndex(); + preAV[data[dx]] += mV[off + it.value()]; + if(dx < maxId) + it.next(); + else + break; + } + cacheIterator(it, cu); + } + private class IterateCharOffset extends AIterator { private IterateCharOffset() { @@ -139,25 +186,27 @@ private IterateCharOffset(int index, int dataIndex, int offset) { @Override public void next() { - if(index >= offsets.length) { - index++; - dataIndex++; - return; - } - final char v = offsets[index++]; - if(v == 0) { + char v = offsets[index]; + while(v == 0) { offset += maxV; - next(); - } - else { - dataIndex++; - offset += v; + index++; + v = offsets[index]; } + offset += v; + index++; + dataIndex++; } @Override - public boolean hasNext() { - return index <= offsets.length; + public int value() { + return offset; + } + + @Override + public int skipTo(int idx) { + while(offset < idx && index < offsets.length) + next(); + return offset; } @Override @@ -165,5 +214,4 @@ public IterateCharOffset clone() { return new IterateCharOffset(index, dataIndex, offset); } } - } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetFactory.java index d54be828985..60f8231f531 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetFactory.java @@ -22,12 +22,11 @@ import java.io.DataInput; import java.io.IOException; -import org.apache.sysds.runtime.compress.DMLCompressionException; - public interface OffsetFactory { // static final Log LOG = LogFactory.getLog(OffsetFactory.class.getName()); + /** The specific underlying types of offsets. */ public enum OFF_TYPE { BYTE, CHAR } @@ -35,11 +34,14 @@ public enum OFF_TYPE { /** * Main factory pattern creator for Offsets. * + * Note this creator is unsafe in the sense that it assumes the input index list contains only sequential, + * strictly incrementing, non-duplicate values. + * * @param indexes List of indexes, that is assumed to be sorted and have no duplicates * @return AOffset object containing offsets to the next value. */ - public static AOffset create(int[] indexes) { - return create(indexes, 0, indexes.length); + public static AOffset createOffset(int[] indexes) { + return createOffset(indexes, 0, indexes.length); } /** @@ -48,18 +50,22 @@ public static AOffset create(int[] indexes) { * This is useful if the input is created from a CSR matrix, since it allows us to not reallocate the indexes[] but * use the shared indexes from the entire CSR representation. * + * Note this creator is unsafe in the sense that it assumes the input indexes in the range from apos to alen + * contain only sequential, strictly incrementing, non-duplicate values. + * * @param indexes The indexes from which to take the offsets. * @param apos The position to start looking from in the indexes. * @param alen The position to end looking at in the indexes. * @return A new Offset.
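+ * + * A hypothetical example (names assumed here, not part of this patch): the shared index array of CSR row r can + * be wrapped via {@code createOffset(sb.indexes(r), sb.pos(r), sb.pos(r) + sb.size(r))} without reallocation.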
*/ - public static AOffset create(int[] indexes, int apos, int alen) { + public static AOffset createOffset(int[] indexes, int apos, int alen) { + final int minValue = indexes[apos]; final int maxValue = indexes[alen - 1]; - if(maxValue < 0) - throw new DMLCompressionException("Invalid sizes given"); + final int range = maxValue - minValue; final int endLength = alen - apos; - final float avgDist = (float) maxValue / endLength; - if(avgDist < 256) + final long byteSize = OffsetByte.estimateInMemorySize(endLength, range); + final long charSize = OffsetChar.estimateInMemorySize(endLength, range); + if(byteSize < charSize) return new OffsetByte(indexes, apos, alen); else return new OffsetChar(indexes, apos, alen); @@ -96,16 +102,14 @@ public static AOffset readIn(DataInput in) throws IOException { * @return The estimated size of an offset given the number of offsets and rows. */ public static long estimateInMemorySize(int size, int nRows) { - if(size < 0 || nRows < 0) - throw new DMLCompressionException("Invalid sizes given: " + size + " " + nRows); - else if(size == 0) + if(size == 0) return 8; // If this is the case, then the compression results in constant col groups else { final int avgDiff = nRows / size; if(avgDiff < 256) - return OffsetByte.getInMemorySize(size - 1); + return OffsetByte.estimateInMemorySize(size - 1, nRows); else - return OffsetChar.getInMemorySize(size - 1); + return OffsetChar.estimateInMemorySize(size - 1, nRows); } } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibAppend.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibAppend.java index 6ca2619a160..68eca8045af 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibAppend.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibAppend.java @@ -70,6 +70,7 @@ public static MatrixBlock append(CompressedMatrixBlock left, CompressedMatrixBlo ret = appendColGroups(ret, left.getColGroups(), right.getColGroups(), left.getNumColumns()); + ret.setOverlapping(left.isOverlapping() || right.isOverlapping()); double compressedSize = ret.getInMemorySize(); double uncompressedSize = MatrixBlock.estimateSizeInMemory(m, n, ret.getSparsity()); @@ -85,24 +86,20 @@ public static MatrixBlock append(CompressedMatrixBlock left, CompressedMatrixBlo } private static MatrixBlock appendRightEmpty(CompressedMatrixBlock left, MatrixBlock right, int m, int n) { - CompressedMatrixBlock ret = new CompressedMatrixBlock(m, n); - List newGroup = new ArrayList<>(1); newGroup.add(ColGroupEmpty.generate(right.getNumColumns())); ret = appendColGroups(ret, left.getColGroups(), newGroup, left.getNumColumns()); - + ret.setOverlapping(left.isOverlapping()); return ret; } private static MatrixBlock appendLeftEmpty(MatrixBlock left, CompressedMatrixBlock right, int m, int n) { - CompressedMatrixBlock ret = new CompressedMatrixBlock(m, n); - List newGroup = new ArrayList<>(1); newGroup.add(ColGroupEmpty.generate(left.getNumColumns())); ret = appendColGroups(ret, newGroup, right.getColGroups(), left.getNumColumns()); - + ret.setOverlapping(right.isOverlapping()); return ret; } diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibBinaryCellOp.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibBinaryCellOp.java index e4c33330cd9..a045fa2362c 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibBinaryCellOp.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibBinaryCellOp.java @@ -158,8 +158,8 @@ private static CompressedMatrixBlock 
setupCompressedReturnMatrixBlock(Compressed return ret; } - private static MatrixBlock rowBinCellOp(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBlock ret, - BinaryOperator op, boolean left) { + private static MatrixBlock rowBinCellOp(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op, + boolean left) { CompressedMatrixBlock cRet = setupCompressedReturnMatrixBlock(m1, ret); if(isValidForOverlappingBinaryCellOperations(m1, op)) overlappingBinaryCellOp(m1, m2, cRet, op, left); @@ -333,32 +333,42 @@ protected static CompressedMatrixBlock binaryMVPlusStack(CompressedMatrixBlock m private static MatrixBlock binaryMVCol(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left) { - MatrixBlock ret = new MatrixBlock(m1.getNumRows(), m1.getNumColumns(), false, -1).allocateBlock(); + final int nCols = m1.getNumColumns(); + final int nRows = m1.getNumRows(); + // Pre filter. + final List groups = m1.getColGroups(); + final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups); + if(shouldFilter) { + CompressedMatrixBlock mf1 = new CompressedMatrixBlock(m1); + double[] constV = new double[nCols]; + final List filteredGroups = CLALibUtils.filterGroups(groups, constV); + filteredGroups.add(ColGroupFactory.genColGroupConst(constV)); + mf1.allocateColGroupList(filteredGroups); + m1 = mf1; + } + MatrixBlock ret = new MatrixBlock(nRows, nCols, false, -1).allocateBlock(); - final int blkz = CompressionSettings.BITMAP_BLOCK_SZ / m1.getNumColumns() * 5; + final int blkz = CompressionSettings.BITMAP_BLOCK_SZ / nCols * 5; final int k = op.getNumThreads(); long nnz = 0; if(k <= 1) { - for(int i = 0; i * blkz < m1.getNumRows(); i++) { + for(int i = 0; i < nRows; i += blkz) { if(left) - nnz += new BinaryMVColLeftTask(m1, m2, ret, i * blkz, Math.min(m1.getNumRows(), (i + 1) * blkz), op) - .call(); + nnz += new BinaryMVColLeftTask(m1, m2, ret, i, Math.min(nRows, i + blkz), op).call(); else - nnz += new BinaryMVColTask(m1, m2, ret, i * blkz, Math.min(m1.getNumRows(), (i + 1) * blkz), op).call(); + nnz += new BinaryMVColTask(m1, m2, ret, i, Math.min(nRows, i + blkz), op).call(); } } else { ExecutorService pool = CommonThreadPool.get(op.getNumThreads()); ArrayList> tasks = new ArrayList<>(); try { - for(int i = 0; i * blkz < m1.getNumRows(); i++) { + for(int i = 0; i < nRows; i += blkz) { if(left) - tasks.add( - new BinaryMVColLeftTask(m1, m2, ret, i * blkz, Math.min(m1.getNumRows(), (i + 1) * blkz), op)); + tasks.add(new BinaryMVColLeftTask(m1, m2, ret, i, Math.min(nRows, i + blkz), op)); else - tasks.add(new BinaryMVColTask(m1, m2, ret, i * blkz, Math.min(m1.getNumRows(), (i + 1) * blkz), op)); - + tasks.add(new BinaryMVColTask(m1, m2, ret, i, Math.min(nRows, i + blkz), op)); } for(Future f : pool.invokeAll(tasks)) nnz += f.get(); @@ -396,7 +406,7 @@ protected BinaryMVColTask(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBlock public Integer call() { // unsafe decompress, since we count nonzeros afterwards. for(AColGroup g : _m1.getColGroups()) - g.decompressToBlock(_ret, _rl, _ru); + g.decompressToDenseBlock(_ret.getDenseBlock(), _rl, _ru); if(_m2.isInSparseFormat()) throw new NotImplementedException("Not Implemented sparse Format execution for MM."); @@ -440,7 +450,7 @@ protected BinaryMVColLeftTask(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBl public Integer call() { // unsafe decompress, since we count nonzeros afterwards. 
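// Each group writes only the columns it covers into the shared dense block, so after this loop the // row range [_rl,_ru) is fully materialized before the binary operation is applied.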
for(AColGroup g : _m1.getColGroups()) - g.decompressToBlock(_ret, _rl, _ru); + g.decompressToDenseBlock(_ret.getDenseBlock(), _rl, _ru); if(_m2.isInSparseFormat()) throw new NotImplementedException("Not Implemented sparse Format execution for MM."); diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java index 4a39eac1e89..49fdfe281c9 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java @@ -563,7 +563,7 @@ private MatrixBlock getTmp() { private MatrixBlock decompressToTemp() { MatrixBlock tmp = getTmp(); for(AColGroup g : _m1.getColGroups()) - g.decompressToBlock(tmp, _rl, _ru, -_rl, 0); + g.decompressToDenseBlock(tmp.getDenseBlock(), _rl, _ru, -_rl, 0); tmp.setNonZeros(_rl + _ru); return tmp; } diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java index 558ca7b3cd0..a646f8f4564 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java @@ -20,7 +20,6 @@ package org.apache.sysds.runtime.compress.lib; import java.util.ArrayList; -import java.util.Comparator; import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; @@ -31,13 +30,13 @@ import org.apache.commons.logging.LogFactory; import org.apache.sysds.api.DMLScript; import org.apache.sysds.runtime.compress.CompressedMatrixBlock; -import org.apache.sysds.runtime.compress.CompressionSettings; import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.colgroup.AColGroup; import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory; import org.apache.sysds.runtime.compress.colgroup.ColGroupUncompressed; import org.apache.sysds.runtime.controlprogram.parfor.stat.Timing; import org.apache.sysds.runtime.data.DenseBlock; +import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.util.CommonThreadPool; import org.apache.sysds.utils.DMLCompressionStatistics; @@ -69,7 +68,7 @@ public static void decompressTo(CompressedMatrixBlock cmb, MatrixBlock ret, int else if(outSparse) decompressToSparseBlock(cmb, ret, rowOffset, colOffset); else - decompressToDenseBlock(cmb, ret, rowOffset, colOffset); + decompressToDenseBlock(cmb, ret.getDenseBlock(), rowOffset, colOffset); if(DMLScript.STATISTICS) { final double t = time.stop(); @@ -81,29 +80,37 @@ else if(outSparse) private static void decompressToSparseBlock(CompressedMatrixBlock cmb, MatrixBlock ret, int rowOffset, int colOffset) { - final List groups = new ArrayList<>(cmb.getColGroups()); - final int nRows = cmb.getNumRows(); - for(AColGroup g : groups) - g.decompressToBlock(ret, 0, nRows, rowOffset, colOffset); + final SparseBlock sb = ret.getSparseBlock(); + final List groups = cmb.getColGroups(); + final int nRows = cmb.getNumRows(); + final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups); + if(shouldFilter) { + final MatrixBlock tmp = cmb.getUncompressed("Decompression to put into Sparse Block"); + tmp.putInto(ret, rowOffset, colOffset, false); + } + else + for(AColGroup g : groups) + g.decompressToSparseBlock(sb, 0, nRows, rowOffset, colOffset); } - private static void 
decompressToDenseBlock(CompressedMatrixBlock cmb, MatrixBlock ret, int rowOffset, - int colOffset) { - final List groups = new ArrayList<>(cmb.getColGroups()); + private static void decompressToDenseBlock(CompressedMatrixBlock cmb, DenseBlock ret, int rowOffset, int colOffset) { + final List groups = cmb.getColGroups(); // final int nCols = cmb.getNumColumns(); final int nRows = cmb.getNumRows(); - final boolean containsSDC = CLALibUtils.containsSDCOrConst(groups); - double[] constV = containsSDC ? new double[cmb.getNumColumns()] : null; - final List filteredGroups = containsSDC ? CLALibUtils.filterGroups(groups, constV) : groups; - - for(AColGroup g : filteredGroups) - g.decompressToBlock(ret, 0, nRows, rowOffset, colOffset); - - if(constV != null) { + final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups); + if(shouldFilter) { + final double[] constV = new double[cmb.getNumColumns()]; + final List filteredGroups = CLALibUtils.filterGroups(groups, constV); + for(AColGroup g : filteredGroups) + g.decompressToDenseBlock(ret, 0, nRows, rowOffset, colOffset); AColGroup cRet = ColGroupFactory.genColGroupConst(constV); - cRet.decompressToBlock(ret, 0, nRows, rowOffset, colOffset); + cRet.decompressToDenseBlock(ret, 0, nRows, rowOffset, colOffset); + } + else { + for(AColGroup g : groups) + g.decompressToDenseBlock(ret, 0, nRows, rowOffset, colOffset); } } @@ -122,34 +129,49 @@ private static MatrixBlock decompressExecute(CompressedMatrixBlock cmb, int k) { ret.setNonZeros(ret.recomputeNonZeros()); return ret; // if uncompressedColGroup is only colGroup. } - else if(ret == null) { - ret = new MatrixBlock(nRows, nCols, false, -1); - ret.allocateDenseBlock(); - } - final int block = (int) Math.ceil((double) (CompressionSettings.BITMAP_BLOCK_SZ) / nCols); - final int blklen = block > 1000 ? block + 1000 - block % 1000 : Math.max(64, block); + final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups); + double[] constV = shouldFilter ? new double[nCols] : null; + final List filteredGroups = shouldFilter ? CLALibUtils.filterGroups(groups, constV) : groups; + + if(ret == null) { // There was no uncompressed group that fit the entire matrix. + final boolean sparse = !shouldFilter && !overlapping && + MatrixBlock.evalSparseFormatInMemory(nRows, nCols, nonZeros); + ret = new MatrixBlock(nRows, nCols, sparse); + if(sparse) + ret.allocateSparseRowsBlock(); + else + ret.allocateDenseBlock(); + } - final boolean containsSDC = CLALibUtils.containsSDCOrConst(groups); - double[] constV = containsSDC ? new double[ret.getNumColumns()] : null; - final List filteredGroups = containsSDC ? CLALibUtils.filterGroups(groups, constV) : groups; - if(LOG.isTraceEnabled()) - LOG.debug("Decompressing with block size: " + blklen); + // final int block = (int) Math.ceil((double) (CompressionSettings.BITMAP_BLOCK_SZ) / nCols); + // final int blklen = Math.max(block, 64); + final int blklen = 32; - sortGroups(filteredGroups, overlapping); + // final int blklen = block > 1000 ? 
block + 1000 - block % 1000 : Math.max(64, block); // check if we are using filtered groups, and if we are not force constV to null if(groups == filteredGroups) constV = null; final double eps = getEps(constV); - if(k == 1) - decompressSingleThread(ret, filteredGroups, nRows, blklen, constV, eps, nonZeros, overlapping); - else - decompressMultiThread(ret, filteredGroups, nRows, blklen, constV, eps, overlapping, k); - if(overlapping) - ret.recomputeNonZeros(); + if(k == 1) { + if(ret.isInSparseFormat()) { + decompressSparseSingleThread(ret, filteredGroups, nRows, blklen); + ret.setNonZeros(nonZeros); + } + else { + decompressDenseSingleThread(ret, filteredGroups, nRows, blklen, constV, eps, nonZeros, overlapping); + ret.setNonZeros(nonZeros == -1 || overlapping ? ret.recomputeNonZeros() : nonZeros); + } + } + else if(ret.isInSparseFormat()) { + decompressSparseMultiThread(ret, filteredGroups, nRows, blklen, k); + ret.setNonZeros(nonZeros); + } + else + decompressDenseMultiThread(ret, filteredGroups, nRows, blklen, constV, eps, overlapping, k); ret.examSparsity(); return ret; @@ -183,33 +205,46 @@ private static MatrixBlock getUncompressedColGroupAndRemoveFromListOfColGroups(L return ret; } - private static void decompressSingleThread(MatrixBlock ret, List filteredGroups, int rlen, int blklen, - double[] constV, double eps, long nonZeros, boolean overlapping) { + private static void decompressSparseSingleThread(MatrixBlock ret, List filteredGroups, int rlen, + int blklen) { + final SparseBlock sb = ret.getSparseBlock(); + for(int i = 0; i < rlen; i += blklen) { + final int rl = i; + final int ru = Math.min(i + blklen, rlen); + for(AColGroup grp : filteredGroups) + grp.decompressToSparseBlock(ret.getSparseBlock(), rl, ru); + for(int j = rl; j < ru; j++) + if(!sb.isEmpty(j)) + sb.sort(j); + } + + } + + private static void decompressDenseSingleThread(MatrixBlock ret, List filteredGroups, int rlen, + int blklen, double[] constV, double eps, long nonZeros, boolean overlapping) { for(int i = 0; i < rlen; i += blklen) { final int rl = i; final int ru = Math.min(i + blklen, rlen); for(AColGroup grp : filteredGroups) - grp.decompressToBlock(ret, rl, ru); + grp.decompressToDenseBlock(ret.getDenseBlock(), rl, ru); if(constV != null && !ret.isInSparseFormat()) addVector(ret, constV, eps, rl, ru); } - ret.setNonZeros(nonZeros == -1 || overlapping ? 
ret.recomputeNonZeros() : nonZeros); } - private static void decompressMultiThread(MatrixBlock ret, List filteredGroups, int rlen, int blklen, + private static void decompressDenseMultiThread(MatrixBlock ret, List filteredGroups, int rlen, int blklen, double[] constV, double eps, boolean overlapping, int k) { try { final ExecutorService pool = CommonThreadPool.get(k); - final ArrayList tasks = new ArrayList<>(); - for(int i = 0; i * blklen < rlen; i++) - tasks.add(new DecompressTask(filteredGroups, ret, eps, i * blklen, Math.min((i + 1) * blklen, rlen), - overlapping, constV)); - List> rtasks = pool.invokeAll(tasks); - pool.shutdown(); + final ArrayList tasks = new ArrayList<>(); + for(int i = 0; i < rlen; i += blklen) + tasks.add( + new DecompressDenseTask(filteredGroups, ret, eps, i, Math.min(i + blklen, rlen), overlapping, constV)); long nnz = 0; - for(Future rt : rtasks) + for(Future rt : pool.invokeAll(tasks)) nnz += rt.get(); + pool.shutdown(); ret.setNonZeros(nnz); } catch(InterruptedException | ExecutionException ex) { @@ -217,23 +252,21 @@ private static void decompressMultiThread(MatrixBlock ret, List filte } } - private static void sortGroups(List groups, boolean overlapping) { - if(overlapping) { - // add a bit of stability in decompression - Comparator comp = Comparator.comparing(x -> effect(x)); - groups.sort(comp); - } - } + private static void decompressSparseMultiThread(MatrixBlock ret, List filteredGroups, int rlen, + int blklen, int k) { + try { + final ExecutorService pool = CommonThreadPool.get(k); + final ArrayList tasks = new ArrayList<>(); + for(int i = 0; i < rlen; i += blklen) + tasks.add(new DecompressSparseTask(filteredGroups, ret, i, Math.min(i + blklen, rlen))); - /** - * Calculate an effect value for a column group. This is used to sort the groups before decompression to decompress - * the columns that have the smallest effect first. - * - * @param x A Group - * @return A Effect double value. - */ - private static double effect(AColGroup x) { - return (x instanceof ColGroupUncompressed) ? 
-Double.MAX_VALUE : -Math.max(x.getMax(), Math.abs(x.getMin())); + for(Future rt : pool.invokeAll(tasks)) + rt.get(); + pool.shutdown(); + } + catch(InterruptedException | ExecutionException ex) { + throw new DMLCompressionException("Parallel decompression failed", ex); + } } /** @@ -259,7 +292,7 @@ private static double getEps(double[] constV) { } } - private static class DecompressTask implements Callable { + private static class DecompressDenseTask implements Callable { private final List _colGroups; private final MatrixBlock _ret; private final double _eps; @@ -268,7 +301,7 @@ private static class DecompressTask implements Callable { private final double[] _constV; private final boolean _overlapping; - protected DecompressTask(List colGroups, MatrixBlock ret, double eps, int rl, int ru, + protected DecompressDenseTask(List colGroups, MatrixBlock ret, double eps, int rl, int ru, boolean overlapping, double[] constV) { _colGroups = colGroups; _ret = ret; @@ -282,7 +315,7 @@ protected DecompressTask(List colGroups, MatrixBlock ret, double eps, @Override public Long call() { for(AColGroup grp : _colGroups) - grp.decompressToBlock(_ret, _rl, _ru); + grp.decompressToDenseBlock(_ret.getDenseBlock(), _rl, _ru); if(_constV != null) addVector(_ret, _constV, _eps, _rl, _ru); @@ -291,6 +324,31 @@ public Long call() { } } + private static class DecompressSparseTask implements Callable { + private final List _colGroups; + private final MatrixBlock _ret; + private final int _rl; + private final int _ru; + + protected DecompressSparseTask(List colGroups, MatrixBlock ret, int rl, int ru) { + _colGroups = colGroups; + _ret = ret; + _rl = rl; + _ru = ru; + } + + @Override + public Object call() { + final SparseBlock sb = _ret.getSparseBlock(); + for(AColGroup grp : _colGroups) + grp.decompressToSparseBlock(_ret.getSparseBlock(), _rl, _ru); + for(int i = _rl; i < _ru; i++) + if(!sb.isEmpty(i)) + sb.sort(i); + return null; + } + } + /** * Add the rowV vector to each row in ret. * diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java index 919f98a8db6..8b197b3ac3d 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java @@ -121,8 +121,8 @@ public static void leftMultByTransposeSelf(CompressedMatrixBlock cmb, MatrixBloc final List groups = cmb.getColGroups(); final int numColumns = cmb.getNumColumns(); final int numRows = cmb.getNumRows(); - final boolean containsSDC = CLALibUtils.containsSDCOrConst(groups); - final double[] constV = containsSDC ? new double[numColumns] : null; + final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups); + final double[] constV = shouldFilter ? new double[numColumns] : null; final List filteredGroups = CLALibUtils.filterGroups(groups, constV); // TODO add parallel again @@ -177,11 +177,11 @@ private static MatrixBlock leftMultByCompressedTransposedMatrix(CompressedMatrix final List rightCG = right.getColGroups(); final List leftCG = left.getColGroups(); - final boolean containsRight = CLALibUtils.containsSDCOrConst(rightCG); + final boolean containsRight = CLALibUtils.shouldPreFilter(rightCG); double[] cR = containsRight ? 
new double[cr] : null; final List fRight = CLALibUtils.filterGroups(rightCG, cR); - final boolean containsLeft = CLALibUtils.containsSDCOrConst(leftCG); + final boolean containsLeft = CLALibUtils.shouldPreFilter(leftCG); double[] cL = containsLeft ? new double[rl] : null; final List fLeft = CLALibUtils.filterGroups(leftCG, cL); @@ -246,11 +246,11 @@ private static MatrixBlock leftMultByMatrix(List colGroups, MatrixBlo } final int numColumnsOut = ret.getNumColumns(); - final boolean containsSDC = CLALibUtils.containsSDCOrConst(colGroups); + final boolean shouldFilter = CLALibUtils.shouldPreFilter(colGroups); final int lr = that.getNumRows(); // a constant colgroup summing the default values. - double[] constV = containsSDC ? new double[numColumnsOut] : null; + double[] constV = shouldFilter ? new double[numColumnsOut] : null; final List filteredGroups = CLALibUtils.filterGroups(colGroups, constV); if(colGroups == filteredGroups) constV = null; @@ -258,9 +258,9 @@ private static MatrixBlock leftMultByMatrix(List colGroups, MatrixBlo if(!filteredGroups.isEmpty()) { if(k == 1) - rowSums = leftMultByMatrixPrimitive(filteredGroups, that, ret, 0, lr, containsSDC ? new double[lr] : null); + rowSums = leftMultByMatrixPrimitive(filteredGroups, that, ret, 0, lr, shouldFilter ? new double[lr] : null); else - rowSums = leftMultByMatrixParallel(filteredGroups, that, ret, containsSDC, overlapping, k); + rowSums = leftMultByMatrixParallel(filteredGroups, that, ret, shouldFilter, overlapping, k); } else if(constV != null) rowSums = that.rowSum(k).getDenseBlockValues(); @@ -412,18 +412,19 @@ private static void leftMultByMatrixPrimitiveSparse(List colGroups, M int rl, int ru, double[] rowSum) { for(int i = rl; i < ru; i++) { + final SparseBlock sb = that.getSparseBlock(); + if(sb.isEmpty(i)) + continue; for(int j = 0; j < colGroups.size(); j++) { colGroups.get(j).leftMultByMatrix(that, ret, i, i + 1); } if(rowSum != null) { - final SparseBlock sb = that.getSparseBlock(); - if(!sb.isEmpty(i)) { - final int apos = sb.pos(i); - final int alen = sb.size(i) + apos; - final double[] aval = sb.values(i); - for(int j = apos; j < alen; j++) - rowSum[i] += aval[j]; - } + final int apos = sb.pos(i); + final int alen = sb.size(i) + apos; + final double[] aval = sb.values(i); + for(int j = apos; j < alen; j++) + rowSum[i] += aval[j]; + } } } @@ -440,8 +441,8 @@ private static void leftMultByMatrixPrimitiveDense(List colGroups, Ma // The number of column groups to process together // the value should ideally be set so that the colGroups fits into cache together with a row block. // currently we only try to avoid having a dangling small number of column groups in the last block. - final int colGroupBlocking = preAggCGs.size() % 16 < 4 ? 20 : 16; - + final int colGroupBlocking = preAggCGs.size();// % 16 < 4 ? 
20 : 16; + // final int colGroupBlocking = 3; // Allocate pre Aggregate Array List final MatrixBlock[] preAgg = populatePreAggregate(colGroupBlocking); @@ -461,27 +462,13 @@ private static void leftMultByMatrixPrimitiveDense(List colGroups, Ma preAgg[j % colGroupBlocking].reset(rowBlockSize, nVals, false); } - int colBlockSize = 32000; - // For each row block for(int h = rl; h < ru; h += rowBlockSize) { - // For each column block final int rowUpper = Math.min(h + rowBlockSize, ru); - for(int i = 0; i < lc; i += colBlockSize) { - final int colUpper = Math.min(i + colBlockSize, lc); - // Pre Aggregate each column group in block - for(int j = g; j < gEnd && j < preAggCGs.size(); j++) { - preAggCGs.get(j).preAggregateDense(that, preAgg[j % colGroupBlocking], h, rowUpper, i, colUpper); - } - if(rowSum != null) { - final double[] thatV = that.getDenseBlockValues(); - for(int r = h; r < rowUpper; r++) { - final int rowOff = r * lc; - for(int c = rowOff + i; c < rowOff + colUpper; c++) - rowSum[r] += thatV[c]; - } - } - } + if(rowSum != null) + preAggregateWithRowSums(that, h, rowUpper, preAggCGs, g, gEnd, preAgg, rowSum); + else + preAggregate(that, h, rowUpper, preAggCGs, g, gEnd, preAgg); // Multiply out the preAggregate to the output matrix. for(int j = g; j < gEnd && j < preAggCGs.size(); j++) { @@ -507,6 +494,42 @@ private static void leftMultByMatrixPrimitiveDense(List colGroups, Ma } } + private static void preAggregateWithRowSums(MatrixBlock that, int rl, int ru, List preAggCGs, int g, + int gEnd, MatrixBlock[] preAgg, double[] rowSum) { + final int lc = that.getNumColumns(); + final int colBlockSize = 25000; + final int colGroupBlocking = preAgg.length; + // For each column block + for(int i = 0; i < lc; i += colBlockSize) { + final int colUpper = Math.min(i + colBlockSize, lc); + // Pre Aggregate each column group in block + for(int j = g; j < gEnd && j < colGroupBlocking; j++) + preAggCGs.get(j).preAggregateDense(that, preAgg[j % colGroupBlocking], rl, ru, i, colUpper); + + final double[] thatV = that.getDenseBlockValues(); + for(int r = rl; r < ru; r++) { + final int rowOff = r * lc; + for(int c = rowOff + i; c < rowOff + colUpper; c++) + rowSum[r] += thatV[c]; + } + + } + } + + private static void preAggregate(MatrixBlock that, int rl, int ru, List preAggCGs, int g, int gEnd, + MatrixBlock[] preAgg) { + + final int lc = that.getNumColumns(); + final int colBlockSize = 25000; + final int colGroupBlocking = preAgg.length; + for(int i = 0; i < lc; i += colBlockSize) { + final int colUpper = Math.min(i + colBlockSize, lc); + // Pre Aggregate each column group in block + for(int j = g; j < gEnd && j < colGroupBlocking; j++) + preAggCGs.get(j).preAggregateDense(that, preAgg[j % colGroupBlocking], rl, ru, i, colUpper); + } + } + private static MatrixBlock[] populatePreAggregate(int colGroupBlocking) { final MatrixBlock[] preAgg = new MatrixBlock[colGroupBlocking]; // populate the preAgg array. 
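The two preAggregate helpers above keep the core CLA left-multiply strategy: first aggregate the dense left-hand row by dictionary code, then multiply the small pre-aggregate with the dictionary values. A minimal standalone sketch of that idea for a single row and a single column group follows; map (the row-to-code mapping) and dict (a row-major dictionary) are simplifications assumed for this sketch, not the patch's API.

public final class PreAggregateSketch {
	// Computes out[j] += sum_r row[r] * dict[map[r]][j] in two phases.
	public static void leftMultRow(double[] row, int[] map, double[] dict, int nGroupCols, double[] out) {
		final int nUnique = dict.length / nGroupCols;
		final double[] preAgg = new double[nUnique];
		// Phase 1: one pass over the dense row, grouping (pre-aggregating) values by dictionary code.
		for(int r = 0; r < row.length; r++)
			preAgg[map[r]] += row[r];
		// Phase 2: nUnique * nGroupCols multiplications instead of row.length * nGroupCols.
		for(int u = 0; u < nUnique; u++)
			for(int j = 0; j < nGroupCols; j++)
				out[j] += preAgg[u] * dict[u * nGroupCols + j];
	}
}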
diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibRightMultBy.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibRightMultBy.java index 52ad0da3e4d..3ebdd3a00e3 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibRightMultBy.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibRightMultBy.java @@ -92,9 +92,9 @@ private static MatrixBlock rightMultByMatrixOverlapping(CompressedMatrixBlock m1 final List retCg = new ArrayList<>(); final CompressedMatrixBlock ret = new CompressedMatrixBlock(rl, cr); - final boolean containsSDC = CLALibUtils.containsSDCOrConst(colGroups); + final boolean shouldFilter = CLALibUtils.shouldPreFilter(colGroups); - double[] constV = containsSDC ? new double[rr] : null; + double[] constV = shouldFilter ? new double[rr] : null; final List filteredGroups = CLALibUtils.filterGroups(colGroups, constV); if(colGroups == filteredGroups) constV = null; diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibSlice.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibSlice.java new file mode 100644 index 00000000000..94865036b42 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibSlice.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.compress.lib; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysds.runtime.compress.CompressedMatrixBlock; +import org.apache.sysds.runtime.compress.colgroup.AColGroup; +import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory; +import org.apache.sysds.runtime.data.DenseBlock; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; + +public class CLALibSlice { + + protected static final Log LOG = LogFactory.getLog(CLALibSlice.class.getName()); + + public static MatrixBlock slice(CompressedMatrixBlock cmb, int rl, int ru, int cl, int cu, boolean deep) { + if(rl == ru && cl == cu) + return sliceSingle(cmb, rl, cl); + else if(rl == 0 && ru == cmb.getNumRows() - 1) + return sliceColumns(cmb, cl, cu); + else if(cl == 0 && cu == cmb.getNumColumns() - 1) + return sliceRows(cmb, rl, ru); + else + return sliceInternal(cmb, rl, ru, cl, cu); + } + + private static MatrixBlock sliceInternal(CompressedMatrixBlock cmb, int rl, int ru, int cl, int cu) { + // In the case where an internal matrix is sliced out, then first slice out the + // columns to an compressed intermediate. + // Then call slice recursively, to do the row slice. + // Since we do not copy the index structure but simply maintain a pointer to the + // original this is fine. 
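+ // For example, slice(cmb, 2, 4, 1, 3): sliceColumns keeps columns 1..3 compressed, and sliceRows then + // decompresses rows 2..4 (both bounds inclusive).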
+ return sliceRows(sliceColumns(cmb, cl, cu), rl, ru); + } + + private static MatrixBlock sliceRows(CompressedMatrixBlock cmb, int rl, int ru) { + final int nCol = cmb.getNumColumns(); + final int rue = ru + 1; + MatrixBlock tmp = new MatrixBlock(rue - rl, nCol, false).allocateDenseBlock(); + DenseBlock db = tmp.getDenseBlock(); + final List groups = cmb.getColGroups(); + final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups); + if(shouldFilter) { + final double[] constV = new double[nCol]; + final List filteredGroups = CLALibUtils.filterGroups(groups, constV); + for(AColGroup g : filteredGroups) + g.decompressToDenseBlock(db, rl, rue, -rl, 0); + AColGroup cRet = ColGroupFactory.genColGroupConst(constV); + cRet.decompressToDenseBlock(db, rl, rue, -rl, 0); + } + else + for(AColGroup g : groups) + g.decompressToDenseBlock(db, rl, rue, -rl, 0); + + tmp.recomputeNonZeros(); + tmp.examSparsity(); + return tmp; + } + + private static MatrixBlock sliceSingle(CompressedMatrixBlock cmb, int row, int col) { + // get a single index, and return in a matrixBlock + MatrixBlock tmp = new MatrixBlock(1, 1, 0); + tmp.appendValue(0, 0, cmb.getValue(row, col)); + return tmp; + } + + private static CompressedMatrixBlock sliceColumns(CompressedMatrixBlock cmb, int cl, int cu) { + final int cue = cu + 1; + final CompressedMatrixBlock ret = new CompressedMatrixBlock(cmb.getNumRows(), cue - cl); + + final List newColGroups = new ArrayList<>(); + for(AColGroup grp : cmb.getColGroups()) { + final AColGroup slice = grp.sliceColumns(cl, cue); + if(slice != null) + newColGroups.add(slice); + } + + ret.allocateColGroupList(newColGroups); + ret.recomputeNonZeros(); + ret.setOverlapping(cmb.isOverlapping()); + return ret; + } + +} diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibUtils.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibUtils.java index d6965173600..0141a8d802b 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibUtils.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibUtils.java @@ -23,29 +23,18 @@ import java.util.Arrays; import java.util.List; +import org.apache.commons.lang.NotImplementedException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.compress.CompressedMatrixBlock; import org.apache.sysds.runtime.compress.colgroup.AColGroup; +import org.apache.sysds.runtime.compress.colgroup.AMorphingMMColGroup; import org.apache.sysds.runtime.compress.colgroup.ColGroupConst; import org.apache.sysds.runtime.compress.colgroup.ColGroupEmpty; import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory; -import org.apache.sysds.runtime.compress.colgroup.ColGroupSDC; -import org.apache.sysds.runtime.compress.colgroup.ColGroupSDCSingle; public final class CLALibUtils { - // private static final Log LOG = LogFactory.getLog(CLALibUtils.class.getName()); - - /** - * Helper method to determine if the column groups contains SDC - * - * @param groups The ColumnGroups to analyze - * @return A Boolean saying it there is >= 2 SDC Groups. - */ - protected static boolean containsSDC(List groups) { - for(AColGroup g : groups) - if(g instanceof ColGroupSDC || g instanceof ColGroupSDCSingle) - return true; - return false; - } + protected static final Log LOG = LogFactory.getLog(CLALibUtils.class.getName()); /** * Helper method to determine if the column groups contains SDC or Constant groups. 
@@ -53,37 +42,13 @@ protected static boolean containsSDC(List groups) { * @param groups The ColumnGroups to analyze * @return A Boolean saying there is SDC groups or Constant groups. */ - protected static boolean containsSDCOrConst(List groups) { + protected static boolean shouldPreFilter(List groups) { for(AColGroup g : groups) - if(g instanceof ColGroupSDC || g instanceof ColGroupSDCSingle || g instanceof ColGroupConst) + if(g instanceof AMorphingMMColGroup || g instanceof ColGroupConst) return true; return false; } - /** - * Helper method to filter out SDC Groups, to add their common value to the ConstV. This allows exploitation of the - * common values in the SDC Groups. - * - * @param groups The Column Groups - * @param constV The Constant vector to add common values to. - * @return The Filtered list of Column groups containing no SDC Groups but only SDCZero groups. - */ - protected static List filterSDCGroups(List groups, double[] constV) { - if(constV == null) - return groups; - - final List filteredGroups = new ArrayList<>(); - for(AColGroup g : groups) { - if(g instanceof ColGroupSDC) - filteredGroups.add(((ColGroupSDC) g).extractCommon(constV)); - else if(g instanceof ColGroupSDCSingle) - filteredGroups.add(((ColGroupSDCSingle) g).extractCommon(constV)); - else - filteredGroups.add(g); - } - return returnGroupIfFiniteNumbers(groups, filteredGroups, constV); - } - /** * Helper method to filter out SDC Groups and remove all constant groups, to reduce computation. * @@ -97,10 +62,8 @@ protected static List filterGroups(List groups, double[] c final List filteredGroups = new ArrayList<>(); for(AColGroup g : groups) { - if(g instanceof ColGroupSDC) - filteredGroups.add(((ColGroupSDC) g).extractCommon(constV)); - else if(g instanceof ColGroupSDCSingle) - filteredGroups.add(((ColGroupSDCSingle) g).extractCommon(constV)); + if(g instanceof AMorphingMMColGroup) + filteredGroups.add(((AMorphingMMColGroup) g).extractCommon(constV)); else if(g instanceof ColGroupEmpty) continue; else if(g instanceof ColGroupConst) @@ -115,7 +78,8 @@ private static List returnGroupIfFiniteNumbers(List groups double[] constV) { for(double v : constV) if(!Double.isFinite(v)) - return groups; + throw new NotImplementedException(); + // return groups; return filteredGroups; } diff --git a/src/test/java/org/apache/sysds/test/TestUtils.java b/src/test/java/org/apache/sysds/test/TestUtils.java index 125de369696..a0ba5bf418a 100644 --- a/src/test/java/org/apache/sysds/test/TestUtils.java +++ b/src/test/java/org/apache/sysds/test/TestUtils.java @@ -918,7 +918,7 @@ private static void compareMatricesBitAvgDistanceSparse(SparseBlock sbe, SparseB continue; if(sba.size(i) != sbe.size(i)) - fail(message+"\nNumber of values are not equal in row: " + i); + fail(message+"\nNumber of values are not equal in row: " + i +"\nactual:"+ sba.get(i) +"\nexpected:"+ sbe.get(i)); final double[] e = sbe.values(i); final double[] a = sba.values(i); diff --git a/src/test/java/org/apache/sysds/test/component/compress/CompressedMatrixTest.java b/src/test/java/org/apache/sysds/test/component/compress/CompressedMatrixTest.java index fee58b97b89..b914dd7a301 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/CompressedMatrixTest.java +++ b/src/test/java/org/apache/sysds/test/component/compress/CompressedMatrixTest.java @@ -171,6 +171,17 @@ public void testUnaryOperators(AggType aggType, boolean inCP) { testUnaryOperators(aggType, auop, inCP); } + @Test + public void testNonZeros() { + if(!(cmb instanceof 
CompressedMatrixBlock)) + return; // Input was not compressed then just pass test + if(!(cmb.getNonZeros() >= mb.getNonZeros())) { + fail(bufferedToString + "\nIncorrect number of non Zeros should guarantee greater than or equals but are " + + cmb.getNonZeros() + " and should be: " + mb.getNonZeros()); + } + + } + @Test public void testSerialization() { try { diff --git a/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java b/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java index 16ca8ad8246..34a800f4262 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java +++ b/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java @@ -95,8 +95,8 @@ public abstract class CompressedTestBase extends TestBase { protected static ValueRange[] usedValueRanges = new ValueRange[] {ValueRange.BOOLEAN, ValueRange.SMALL, ValueRange.NEGATIVE}; - protected static OverLapping[] overLapping = new OverLapping[] {OverLapping.PLUS_LARGE, OverLapping.MATRIX, - OverLapping.NONE, OverLapping.APPEND_CONST, OverLapping.C_BIND_SELF}; + protected static OverLapping[] overLapping = new OverLapping[] {OverLapping.PLUS_LARGE, OverLapping.PLUS_ROW_VECTOR, + OverLapping.MATRIX, OverLapping.NONE, OverLapping.APPEND_CONST, OverLapping.C_BIND_SELF}; protected static CompressionSettingsBuilder[] usedCompressionSettings = new CompressionSettingsBuilder[] { // CLA TESTS! @@ -264,11 +264,24 @@ else if(ov == OverLapping.SQUASH) { cmb = ((CompressedMatrixBlock) cmb).squash(_k); } } - if(ov == OverLapping.PLUS || ov == OverLapping.PLUS_LARGE) { - ScalarOperator sop = ov == OverLapping.PLUS_LARGE ? new LeftScalarOperator(Plus.getPlusFnObject(), - -3142151) : new LeftScalarOperator(Plus.getPlusFnObject(), 5); - mb = mb.scalarOperations(sop, new MatrixBlock()); - cmb = cmb.scalarOperations(sop, new MatrixBlock()); + if(cmb instanceof CompressedMatrixBlock) { + + if(ov == OverLapping.PLUS || ov == OverLapping.PLUS_LARGE) { + ScalarOperator sop = ov == OverLapping.PLUS_LARGE ? 
new LeftScalarOperator(Plus.getPlusFnObject(), + -3142151) : new LeftScalarOperator(Plus.getPlusFnObject(), 5); + mb = mb.scalarOperations(sop, new MatrixBlock()); + cmb = cmb.scalarOperations(sop, new MatrixBlock()); + } + else if(ov == OverLapping.PLUS_ROW_VECTOR) { + + MatrixBlock v = TestUtils.generateTestMatrixBlock(1, cols, -1, 1, 1.0, 4); + BinaryOperator bop = new BinaryOperator(Plus.getPlusFnObject(), _k); + mb = mb.binaryOperations(bop, v, null); + cmb = cmb.binaryOperations(bop, v, null); + lossyTolerance = lossyTolerance + 2; + } + if(!(cmb instanceof CompressedMatrixBlock)) + fail("Invalid construction, should result in compressed MatrixBlock"); } } @@ -285,6 +298,7 @@ else if(ov == OverLapping.SQUASH) { matrixRowsCols = null; } TestUtils.assertEqualColsAndRows(mb, cmb, bufferedToString); + } catch(Exception e) { e.printStackTrace(); @@ -375,7 +389,7 @@ public void testDecompress() { try { if(!(cmb instanceof CompressedMatrixBlock)) return; // Input was not compressed then just pass test - + ((CompressedMatrixBlock) cmb).clearSoftReferenceToDecompressed(); MatrixBlock decompressedMatrixBlock = ((CompressedMatrixBlock) cmb).decompress(_k); compareResultMatrices(mb, decompressedMatrixBlock, 1); assertEquals(bufferedToString, mb.getNonZeros(), decompressedMatrixBlock.getNonZeros()); @@ -902,10 +916,13 @@ public void testSlice(int rl, int ru, int cl, int cu) { try { if(!(cmb instanceof CompressedMatrixBlock) || rows * cols > 10000) return; - MatrixBlock ret2 = cmb.slice(rl, ru, cl, cu); - MatrixBlock ret1 = mb.slice(rl, ru, cl, cu); - if(!(ret2 instanceof CompressedMatrixBlock)) - assertEquals(ret1.getNonZeros(), ret2.getNonZeros()); + final MatrixBlock ret2 = cmb.slice(rl, ru, cl, cu); + final MatrixBlock ret1 = mb.slice(rl, ru, cl, cu); + final long nnz1 = ret1.getNonZeros(); + final long nnz2 = ret2.getNonZeros(); + if(!(ret2 instanceof CompressedMatrixBlock) && nnz1 != nnz2) + fail(bufferedToString + "\nNot same number of non zeros " + nnz1 + " != " + nnz2); + compareResultMatrices(ret1, ret2, 1); } catch(Exception e) { diff --git a/src/test/java/org/apache/sysds/test/component/compress/ExtendedMatrixTests.java b/src/test/java/org/apache/sysds/test/component/compress/ExtendedMatrixTests.java index 3c5be85ee96..924f5ef374b 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/ExtendedMatrixTests.java +++ b/src/test/java/org/apache/sysds/test/component/compress/ExtendedMatrixTests.java @@ -76,12 +76,15 @@ public static Collection data() { SparsityType st = SparsityType.FULL; ValueType vt = ValueType.RLE_COMPRESSIBLE; ValueRange vr = ValueRange.SMALL; - MatrixTypology mt = MatrixTypology.SMALL; + MatrixTypology mt = MatrixTypology.LARGE; OverLapping ov = OverLapping.NONE; for(CompressionSettingsBuilder cs : usedCompressionSettings) tests.add(new Object[] {st, vt, vr, cs, mt, ov, 1, null}); + ov = OverLapping.PLUS_ROW_VECTOR; + for(CompressionSettingsBuilder cs : usedCompressionSettings) + tests.add(new Object[] {st, vt, vr, cs, mt, ov, 1, null}); return tests; } @@ -132,7 +135,7 @@ public void testSum() { else if(OverLapping.effectOnOutput(overlappingType)) assertTrue(bufferedToString, TestUtils.getPercentDistance(ret2, ret1, true) > .99); else - TestUtils.compareScalarBitsJUnit(ret2, ret1, 3, bufferedToString); // Should be exactly same value + TestUtils.compareScalarBitsJUnit(ret2, ret1, 100, bufferedToString); // Should be exactly same value } diff --git a/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java 
b/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java index a416b547e1f..7341bc5a3d8 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java +++ b/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java @@ -34,8 +34,7 @@ public enum ValueType { RAND_ROUND, // Values rounded to nearest whole numbers. OLE_COMPRESSIBLE, // Ideal inputs for OLE Compression. RLE_COMPRESSIBLE, // Ideal inputs for RLE Compression. - ONE_HOT, - UNBALANCED_SPARSE, // An input where some columns are super dense and some very sparse + ONE_HOT, UNBALANCED_SPARSE, // An input where some columns are super dense and some very sparse } public enum MatrixTypology { @@ -55,7 +54,8 @@ public enum ValueRange { } public enum OverLapping { - COL, MATRIX, NONE, MATRIX_PLUS, MATRIX_MULT_NEGATIVE, SQUASH, PLUS, APPEND_EMPTY, APPEND_CONST, PLUS_LARGE, C_BIND_SELF; + COL, MATRIX, NONE, MATRIX_PLUS, MATRIX_MULT_NEGATIVE, SQUASH, PLUS, APPEND_EMPTY, APPEND_CONST, PLUS_LARGE, + C_BIND_SELF, PLUS_ROW_VECTOR; public static boolean effectOnOutput(OverLapping opcode) { switch(opcode) { diff --git a/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingPreAggregateTests.java b/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingPreAggregateTests.java index 795803f9275..6956e0e37c5 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingPreAggregateTests.java +++ b/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingPreAggregateTests.java @@ -19,16 +19,21 @@ package org.apache.sysds.test.component.compress.mapping; +import static org.junit.Assert.fail; + import java.util.ArrayList; import java.util.Collection; import java.util.Random; +import org.apache.commons.lang.NotImplementedException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData; import org.apache.sysds.runtime.compress.colgroup.mapping.MapToByte; import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory; import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory.MAP_TYPE; +import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; +import org.apache.sysds.runtime.compress.colgroup.offset.OffsetByte; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.test.TestUtils; import org.junit.Test; @@ -46,6 +51,9 @@ public class MappingPreAggregateTests { public final int size; private AMapToData m; private MapToByte ref; + private AOffset o; + private final MatrixBlock mb; // matrix block to preAggregate from. 
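+ // reference pre-aggregate result, computed once from the MapToByte reference map in the constructor and + // compared against each parameterized map type.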
+ private final double[] preRef; @Parameters public static Collection data() { @@ -67,6 +75,11 @@ public MappingPreAggregateTests(int seed, MAP_TYPE type, int size) { this.type = type; this.size = size; genBitMap(seed); + + mb = TestUtils.generateTestMatrixBlock(2, size, 0, 100, 1.0, seed); + preRef = new double[m.getUnique()]; + o = OneOffset.create(size); + ref.preAggregateDense(mb, preRef, 0, 1, 0, size); } protected AMapToData genBitMap(int seed) { @@ -85,20 +98,69 @@ protected AMapToData genBitMap(int seed) { @Test public void testPreAggregateDense() { - int nUnique = m.getUnique(); - int size = m.size(); + try { + final int size = m.size(); + MatrixBlock mb = TestUtils.generateTestMatrixBlock(1, size, 0, 100, 1.0, seed); + double[] pre = new double[m.getUnique()]; + m.preAggregateDense(mb, pre, 0, 1, 0, size); + TestUtils.compareMatrices(preRef, pre, 0.00001); + } + catch(Exception e) { + e.printStackTrace(); + fail(e.toString()); + } + } - MatrixBlock mb = TestUtils.generateTestMatrixBlock(1, size, 0, 100, 1.0, seed); - MatrixBlock pre = new MatrixBlock(1, nUnique, false); - pre.allocateDenseBlock(); + @Test + public void testPreAggregateDenseWithIndexes() { + switch(type) { + case BIT: + case INT: + return; + default: + try { + final int size = m.size(); + MatrixBlock mb = TestUtils.generateTestMatrixBlock(1, size, 0, 100, 1.0, seed); + double[] pre = new double[m.getUnique()]; + m.preAggregateDense(mb, pre, 0, 1, 0, size, o); + TestUtils.compareMatrices(preRef, pre, 0.00001); + } + catch(Exception e) { + e.printStackTrace(); + fail(e.toString()); + } + } + } - m.preAggregateDense(mb, pre, 0, 1, 0, 100); + @Test(expected = NotImplementedException.class) + public void testPreAggregateDenseWithIndexesExceptionExpected() { + switch(type) { + case BIT: + case INT: + m.preAggregateDense(mb, null, 0, 1, 0, size, o); + default: + throw new NotImplementedException(); + } + } + + @Test(expected = NotImplementedException.class) + public void testPreAggregateDenseExceptionExpected() { + m.preAggregateDense(mb, null, 0, 2, 0, size); + } - MatrixBlock preRef = new MatrixBlock(1, nUnique, false); - preRef.allocateDenseBlock(); - - ref.preAggregateDense(mb, preRef, 0, 1,0,100); + private static class OneOffset extends OffsetByte { - TestUtils.compareMatrices(preRef, pre, 0, "preaggregate not same with different maps"); + private OneOffset(byte[] offsets, int offsetToFirst, int offsetToLast) { + super(offsets, offsetToFirst, offsetToLast); + } + + protected static OneOffset create(int length) { + int offsetToFirst = 0; + int offsetToLast = length - 1; + byte[] offsets = new byte[length - 1]; + for(int i = 0; i < offsets.length; i++) + offsets[i] = 1; + return new OneOffset(offsets, offsetToFirst, offsetToLast); + } } } diff --git a/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingTests.java b/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingTests.java index 2bb813831c5..8509d3e46c8 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingTests.java +++ b/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingTests.java @@ -64,7 +64,7 @@ public static Collection data() { tests.add(new Object[] {4, t, 63, false}); tests.add(new Object[] {3, t, 64, false}); tests.add(new Object[] {3, t, 65, false}); - tests.add(new Object[] {5, t, 64+63, false}); + tests.add(new Object[] {5, t, 64 + 63, false}); tests.add(new Object[] {5, t, 1234, false}); tests.add(new Object[] {5, t, 13, true}); } @@ -107,6 +107,7 @@ protected static 
diff --git a/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingTests.java b/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingTests.java
index 2bb813831c5..8509d3e46c8 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingTests.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/mapping/MappingTests.java
@@ -64,7 +64,7 @@ public static Collection<Object[]> data() {
 		tests.add(new Object[] {4, t, 63, false});
 		tests.add(new Object[] {3, t, 64, false});
 		tests.add(new Object[] {3, t, 65, false});
-		tests.add(new Object[] {5, t, 64+63, false});
+		tests.add(new Object[] {5, t, 64 + 63, false});
 		tests.add(new Object[] {5, t, 1234, false});
 		tests.add(new Object[] {5, t, 13, true});
 	}
@@ -107,6 +107,7 @@ protected static AMapToData genMap(AMapToData m, int[] expected, int max, boolea
 		// to make sure that the bit set is actually filled.
 		m.set(size - 1, max);
+		expected[size - 1] = max;
 		return m;
 	}
 
@@ -205,6 +206,32 @@ public void replaceMax() {
 		}
 	}
 
+	@Test
+	public void getCountsWithDefault() {
+		switch(type) {
+			case CHAR:
+			case BIT:
+			case INT:
+				return;
+			default:
+				int[] counts = m.getCounts(new int[m.getUnique() + 1], size + 10);
+				if(10 != counts[m.getUnique()]) {
+					fail("Incorrect count in the default bucket:" + m + "\n" + Arrays.toString(counts));
+				}
+		}
+	}
+
+	@Test
+	public void getCountsNoDefault() {
+		switch(type) {
+			case CHAR:
+			case INT:
+				return;
+			default:
+				m.getCounts(new int[m.getUnique()], size);
+		}
+	}
+
 	@Test
 	public void replaceMin() {
 		int max = m.getUpperBoundValue();
@@ -217,6 +244,17 @@ public void replaceMin() {
 		}
 	}
 
+	@Test
+	public void getUnique() {
+		switch(type) {
+			case INT:
+				return;
+			default:
+				int u = m.getUnique();
+				assertEquals(m.getUpperBoundValue() + 1, u);
+		}
+	}
+
 	@Test
 	public void testInMemorySize() {
 		long inMemorySize = m.getInMemorySize();
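The getCountsWithDefault test above relies on a convention worth spelling out: when counts are requested over more rows than the map covers, the surplus rows land in a single trailing "default" bucket. A hedged sketch of that bookkeeping (assumed semantics, not the real AMapToData.getCounts implementation):

public class CountsSketch {
	// Sketch: per-value counts plus one trailing default bucket for rows
	// beyond the end of the map (assumption matching the test above).
	static int[] getCounts(int[] map, int nUnique, int nRows) {
		int[] counts = new int[nUnique + 1];
		for(int v : map)
			counts[v]++;
		counts[nUnique] = nRows - map.length; // e.g. size + 10 rows -> 10 here
		return counts;
	}

	public static void main(String[] args) {
		int[] counts = getCounts(new int[] {0, 1, 1}, 2, 13);
		System.out.println(counts[2]); // 10 rows fall into the default bucket
	}
}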
diff --git a/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetNegativeTests.java b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetNegativeTests.java
deleted file mode 100644
index ebf81a3ce14..00000000000
--- a/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetNegativeTests.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysds.test.component.compress.offset;
-
-import static org.junit.Assert.fail;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-
-import org.apache.commons.lang.NotImplementedException;
-import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
-import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
-import org.apache.sysds.runtime.compress.colgroup.offset.OffsetByte;
-import org.apache.sysds.runtime.compress.colgroup.offset.OffsetChar;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-
-@RunWith(value = Parameterized.class)
-public class OffsetNegativeTests {
-
-	private enum TYPE {
-		BYTE, CHAR
-	}
-
-	@Parameterized.Parameter
-	public int[] data;
-	@Parameterized.Parameter(1)
-	public TYPE type;
-
-	@Parameters
-	public static Collection<Object[]> data() {
-		ArrayList<Object[]> tests = new ArrayList<>();
-		// It is assumed that the input is in sorted order, all values are positive and there are no duplicates.
-		for(TYPE t : TYPE.values()) {
-			tests.add(new Object[] {new int[] {1, 1,}, t});
-			tests.add(new Object[] {new int[] {2, 2, 2, 2}, t});
-			tests.add(new Object[] {new int[] {1, 2, 3, 4, 5, 5}, t});
-			tests.add(new Object[] {null, t});
-			tests.add(new Object[] {new int[] {}, t});
-
-		}
-		return tests;
-	}
-
-	@Test(expected = Exception.class)
-	public void testConstruction() {
-		switch(type) {
-			case BYTE:
-				testConstruction(new OffsetByte(data));
-				break;
-			case CHAR:
-				testConstruction(new OffsetChar(data));
-				break;
-			default:
-				throw new NotImplementedException("not implemented");
-		}
-
-	}
-
-	public void testConstruction(AOffset o) {
-		AIterator i = o.getIterator();
-		for(int j = 0; j < data.length; j++) {
-
-			if(data[j] != i.value())
-				fail("incorrect result using : " + o.getClass().getSimpleName() + " expected: " + Arrays.toString(data)
-					+ " but was :" + o.toString());
-			if(i.hasNext())
-				i.next();
-		}
-	}
-
-}
diff --git a/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetSingleTests.java b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetSingleTests.java
index 3fe8393d475..5ec39127e15 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetSingleTests.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetSingleTests.java
@@ -19,23 +19,29 @@
 
 package org.apache.sysds.test.component.compress.offset;
 
+import static org.junit.Assert.assertTrue;
+
+import org.apache.commons.lang.NotImplementedException;
+import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
 import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
 import org.junit.Test;
 
 public class OffsetSingleTests {
 
-	@Test(expected = RuntimeException.class)
-	public void testInvalidSize_01() {
-		OffsetFactory.estimateInMemorySize(-1, 100);
+	@Test
+	public void testEmptyEstimateMemory() {
+		assertTrue(OffsetFactory.estimateInMemorySize(0, 10000) < 10);
 	}
 
-	@Test(expected = RuntimeException.class)
-	public void testInvalidSize_02() {
-		OffsetFactory.estimateInMemorySize(10, -1);
+	@Test(expected = NotImplementedException.class)
+	public void testNotImplementedMultirowAggregationChar() {
+		AOffset a = OffsetFactory.createOffset(new int[] {1, 2, 3, 4, 5});
+		a.preAggregateDenseMap(null, null, 0, 2, 0, 5, -1, (char[]) null);
 	}
 
-	@Test(expected = RuntimeException.class)
-	public void testInvalidCreation() {
-		OffsetFactory.create(new int[] {1, 2, 3, -1});
+	@Test(expected = NotImplementedException.class)
+	public void testNotImplementedMultirowAggregationByte() {
+		AOffset a = OffsetFactory.createOffset(new int[] {1, 2, 3, 4, 5});
+		a.preAggregateDenseMap(null, null, 0, 2, 0, 5, -1, (byte[]) null);
 	}
 }
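Both the removed negative tests and the new pre-aggregation tests operate on the byte/char offset encodings: a first absolute index followed by small positive deltas, exactly what the OneOffset helper earlier builds with all-ones deltas. A sketch of recovering absolute indexes from such an encoding (assumed layout; the real OffsetByte additionally handles gaps larger than a single byte, which this sketch ignores):

public class OffsetDecodeSketch {
	// Sketch: absolute offsets from a first index plus unsigned byte deltas
	// (assumed layout, mirroring the OneOffset(byte[], first, last) helper).
	static int[] decode(int offsetToFirst, byte[] deltas) {
		int[] out = new int[deltas.length + 1];
		out[0] = offsetToFirst;
		for(int i = 0; i < deltas.length; i++)
			out[i + 1] = out[i] + (deltas[i] & 0xFF); // deltas are unsigned
		return out;
	}

	public static void main(String[] args) {
		// all-ones deltas, as in OneOffset.create(4): indexes 0, 1, 2, 3
		System.out.println(java.util.Arrays.toString(decode(0, new byte[] {1, 1, 1})));
	}
}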
diff --git a/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestPreAggregate.java b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestPreAggregate.java
new file mode 100644
index 00000000000..f7cec1f1407
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestPreAggregate.java
@@ -0,0 +1,460 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.component.compress.offset;
+
+import static org.junit.Assert.fail;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.apache.commons.lang.NotImplementedException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.math3.util.Precision;
+import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
+import org.apache.sysds.runtime.compress.colgroup.offset.OffsetByte;
+import org.apache.sysds.runtime.compress.colgroup.offset.OffsetChar;
+import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory.OFF_TYPE;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+@RunWith(value = Parameterized.class)
+public class OffsetTestPreAggregate {
+	protected static final Log LOG = LogFactory.getLog(OffsetTestPreAggregate.class.getName());
+
+	private static final double eps = 0.00001;
+
+	private final int[] data;
+	private final AOffset a;
+
+	private final MatrixBlock leftM;
+
+	// Row sums over the offset indexes, one entry per row of leftM.
+	private final double[] s;
+
+	@Parameters
+	public static Collection<Object[]> data() {
+		ArrayList<Object[]> tests = new ArrayList<>();
+		// It is assumed that the input is in sorted order, all values are positive and there are no duplicates.
+		// Note that each test allocates a matrix of two rows and (last offset + 100) columns,
+		// so don't make the offsets too large.
+		for(OFF_TYPE t : OFF_TYPE.values()) {
+			tests.add(new Object[] {new int[] {1, 2}, t});
+			tests.add(new Object[] {new int[] {2, 142}, t});
+			tests.add(new Object[] {new int[] {142, 421}, t});
+			tests.add(new Object[] {new int[] {1, 1023}, t});
+			tests.add(new Object[] {new int[] {1023, 1024}, t});
+			tests.add(new Object[] {new int[] {1023}, t});
+			tests.add(new Object[] {new int[] {0, 1, 2, 3, 4, 5}, t});
+			tests.add(new Object[] {new int[] {0}, t});
+			tests.add(new Object[] {new int[] {0, 256}, t});
+			tests.add(new Object[] {new int[] {0, 254}, t});
+			tests.add(new Object[] {new int[] {0, 256 * 2}, t});
+			tests.add(new Object[] {new int[] {0, 255 * 2}, t});
+			tests.add(new Object[] {new int[] {0, 254 * 2}, t});
+			tests.add(new Object[] {new int[] {0, 510, 765}, t});
+			tests.add(new Object[] {new int[] {0, 254 * 3}, t});
+			tests.add(new Object[] {new int[] {0, 255, 255 * 2, 255 * 3}, t});
+			tests.add(new Object[] {new int[] {0, 255 * 2, 255 * 3}, t});
+			tests.add(new Object[] {new int[] {0, 255 * 2, 255 * 3, 255 * 10}, t});
+			tests.add(new Object[] {new int[] {0, 255 * 3}, t});
+			tests.add(new Object[] {new int[] {0, 255 * 4}, t});
+			tests.add(new Object[] {new int[] {0, 256 * 3}, t});
+			tests.add(new Object[] {new int[] {255 * 3, 255 * 5}, t});
+			tests.add(new Object[] {new int[] {0, 1, 2, 3, 255 * 4, 1500}, t});
+			tests.add(new Object[] {new int[] {0, 1, 2, 3, 4, 5}, t});
+			tests.add(new Object[] {new int[] {0, 1, 2, 3, 4, 5, 125, 142, 161, 1661, 2314}, t});
+			tests.add(new Object[] {new int[] {51, 4251, Character.MAX_VALUE}, t});
+		}
+		return tests;
+	}
+
+	public OffsetTestPreAggregate(int[] data, OFF_TYPE type) {
+		this.data = data;
+		switch(type) {
+			case BYTE:
+				this.a = new OffsetByte(data);
+				break;
+			case CHAR:
+				this.a = new OffsetChar(data);
+				break;
+			default:
+				throw new NotImplementedException("not implemented");
+		}
+		this.leftM = TestUtils.generateTestMatrixBlock(2, data[data.length - 1] + 100, -1, 100, 1.0, 1342);
+		this.s = sumIndexes();
+	}
+
+	@Test
+	public void testToString() {
+		String obs = getString(a);
+		String vs = Arrays.toString(data);
+		if(!obs.equals(vs))
+			fail("The strings are not equivalent: " + obs + " vs " + vs);
+	}
+
+	@Test
+	public void preAggByteMapFirstRowByte() {
+		preAggMapRowByte(0);
+	}
+
+	@Test
+	public void preAggByteMapSecondRowByte() {
+		preAggMapRowByte(1);
+	}
+
+	@Test
+	public void preAggByteMapFirstRowChar() {
+		preAggMapRowChar(0);
+	}
+
+	@Test
+	public void preAggByteMapSecondRowChar() {
+		preAggMapRowChar(1);
+	}
+
+	private void preAggMapRowChar(int row) {
+		double[] preAV = new double[1];
+		char[] m = new char[data.length];
+		a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, leftM.getNumColumns(), 0, m);
+		verifyPreAggMapRowByte(preAV, row);
+	}
+
+	private void preAggMapRowByte(int row) {
+		double[] preAV = new double[1];
+		byte[] m = new byte[data.length];
+		a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, leftM.getNumColumns(), 0, m);
+		verifyPreAggMapRowByte(preAV, row);
+	}
+
+	private void verifyPreAggMapRowByte(double[] preAV, int row) {
+		if(preAV[0] != s[row])
+			fail("The preaggregate result is not the sum! : " + preAV[0] + " vs " + s[row]);
+	}
+
+	@Test
+	public void preAggByteMapFirstRowByteAll1() {
+		preAggMapRowByteAll1(0);
+	}
+
+	@Test
+	public void preAggByteMapSecondRowByteAll1() {
+		preAggMapRowByteAll1(1);
+	}
+
+	@Test
+	public void preAggByteMapFirstRowCharAll1() {
+		preAggMapRowCharAll1(0);
+	}
+
+	@Test
+	public void preAggByteMapSecondRowCharAll1() {
+		preAggMapRowCharAll1(1);
+	}
+
+	private void preAggMapRowCharAll1(int row) {
+		double[] preAV = new double[2];
+		char[] m = new char[data.length];
+		fill(m, (char) 1);
+		a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, leftM.getNumColumns(), 0, m);
+		verifyPreAggMapRowAllBytes1(preAV, row);
+	}
+
+	private void preAggMapRowByteAll1(int row) {
+		double[] preAV = new double[2];
+		byte[] m = new byte[data.length];
+		fill(m, (byte) 1);
+		a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, leftM.getNumColumns(), 0, m);
+		verifyPreAggMapRowAllBytes1(preAV, row);
+	}
+
+	private void verifyPreAggMapRowAllBytes1(double[] preAV, int row) {
+		if(preAV[0] != 0)
+			fail("aggregated to the wrong index");
+		if(preAV[1] != s[row])
+			fail("The preaggregate result is not the sum! : " + preAV[1] + " vs " + s[row]);
+	}
+
+	@Test
+	public void preAggByteMapFirstRowByteOne1() {
+		preAggMapRowByteOne1(0);
+	}
+
+	@Test
+	public void preAggByteMapSecondRowByteOne1() {
+		preAggMapRowByteOne1(1);
+	}
+
+	@Test
+	public void preAggByteMapFirstRowCharOne1() {
+		preAggMapRowCharOne1(0);
+	}
+
+	@Test
+	public void preAggByteMapSecondRowCharOne1() {
+		preAggMapRowCharOne1(1);
+	}
+
+	private void preAggMapRowCharOne1(int row) {
+		if(data.length > 1) {
+			double[] preAV = new double[2];
+			char[] m = new char[data.length];
+			m[1] = 1;
+			a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, leftM.getNumColumns(), 0, m);
+			verifyPreAggMapRowOne1(preAV, row);
+		}
+	}
+
+	private void preAggMapRowByteOne1(int row) {
+		if(data.length > 1) {
+			double[] preAV = new double[2];
+			byte[] m = new byte[data.length];
+			m[1] = 1;
+			a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, leftM.getNumColumns(), 0, m);
+			verifyPreAggMapRowOne1(preAV, row);
+		}
+	}
+
+	private void verifyPreAggMapRowOne1(double[] preAV, int row) {
+		double v = leftM.getValue(row, data[1]);
+		if(preAV[1] != v)
+			fail("aggregated to the wrong index");
+		if(!Precision.equals(preAV[0], s[row] - v, eps))
+			fail("The preaggregate result is not the sum! : " + preAV[0] + " vs " + (s[row] - v));
+	}
+
+	@Test
+	public void preAggByteMapFirstSubOfRowByte() {
+		preAggMapSubOfRowByte(0);
+	}
+
+	@Test
+	public void preAggByteMapSecondSubOfRowByte() {
+		preAggMapSubOfRowByte(1);
+	}
+
+	@Test
+	public void preAggByteMapFirstSubOfRowChar() {
+		preAggMapSubOfRowChar(0);
+	}
+
+	@Test
+	public void preAggByteMapSecondSubOfRowChar() {
+		preAggMapSubOfRowChar(1);
+	}
+
+	private void preAggMapSubOfRowChar(int row) {
+		if(data.length > 2) {
+			double[] preAV = new double[2];
+			char[] m = new char[data.length];
+			m[1] = 1;
+			a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, data[data.length - 1], 0, m);
+			verifyPreAggMapSubOfRow(preAV, row);
+		}
+	}
+
+	private void preAggMapSubOfRowByte(int row) {
+		if(data.length > 2) {
+			double[] preAV = new double[2];
+			byte[] m = new byte[data.length];
+			m[1] = 1;
+			a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, data[data.length - 1], 0, m);
+			verifyPreAggMapSubOfRow(preAV, row);
+		}
+	}
+
+	private void verifyPreAggMapSubOfRow(double[] preAV, int row) {
+		double v = leftM.getValue(row, data[1]);
+		double v2 = leftM.getValue(row, data[data.length - 1]);
+		if(preAV[1] != v)
+			fail("aggregated to the wrong index");
+		if(!Precision.equals(preAV[0], s[row] - v - v2, eps))
+			fail("The preaggregate result is not the sum! : " + preAV[0] + " vs " + (s[row] - v - v2));
+	}
+
+	@Test
+	public void preAggByteMapFirstSubOfRowV2Byte() {
+		preAggMapSubOfRowV2Byte(0, 2);
+	}
+
+	@Test
+	public void preAggByteMapSecondSubOfRowV2Byte() {
+		preAggMapSubOfRowV2Byte(1, 2);
+	}
+
+	@Test
+	public void preAggByteMapFirstSubOfRowV2Char() {
+		preAggMapSubOfRowV2Char(0, 2);
+	}
+
+	@Test
+	public void preAggByteMapSecondSubOfRowV2Char() {
+		preAggMapSubOfRowV2Char(1, 2);
+	}
+
+	@Test
+	public void preAggByteMapFirstSubOfRowV2ByteV2() {
+		preAggMapSubOfRowV2Byte(0, 244);
+	}
+
+	@Test
+	public void preAggByteMapSecondSubOfRowV2ByteV2() {
+		preAggMapSubOfRowV2Byte(1, 244);
+	}
+
+	@Test
+	public void preAggByteMapFirstSubOfRowV2CharV2() {
+		preAggMapSubOfRowV2Char(0, 244);
+	}
+
+	@Test
+	public void preAggByteMapSecondSubOfRowV2CharV2() {
+		preAggMapSubOfRowV2Char(1, 244);
+	}
+
+	private void preAggMapSubOfRowV2Char(int row, int nVal) {
+		if(data.length > 3) {
+			double[] preAV = new double[2];
+			char[] m = new char[data.length];
+			m[1] = 1;
+			a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, data[data.length - 2], nVal, m);
+			verifyPreAggMapSubOfRowV2Byte(preAV, row);
+		}
+	}
+
+	private void preAggMapSubOfRowV2Byte(int row, int nVal) {
+		if(data.length > 3) {
+			double[] preAV = new double[2];
+			byte[] m = new byte[data.length];
+			m[1] = 1;
+			a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, 0, data[data.length - 2], nVal, m);
+			verifyPreAggMapSubOfRowV2Byte(preAV, row);
+		}
+	}
+
+	private void verifyPreAggMapSubOfRowV2Byte(double[] preAV, int row) {
+		double v = leftM.getValue(row, data[1]);
+		double v2 = leftM.getValue(row, data[data.length - 1]) + leftM.getValue(row, data[data.length - 2]);
+		if(preAV[1] != v)
+			fail("aggregated to the wrong index");
+		if(!Precision.equals(preAV[0], s[row] - v - v2, eps))
+			fail("The preaggregate result is not the sum! : " + preAV[0] + " vs " + (s[row] - v - v2));
+	}
+
+	@Test
+	public void preAggByteMapFirstOutOfRangeBeforeByte() {
+		preAggMapOutOfRangeBeforeByte(0);
+	}
+
+	@Test
+	public void preAggByteMapSecondOutOfRangeBeforeByte() {
+		preAggMapOutOfRangeBeforeByte(1);
+	}
+
+	@Test
+	public void preAggByteMapFirstOutOfRangeBeforeChar() {
+		preAggMapOutOfRangeBeforeChar(0);
+	}
+
+	@Test
+	public void preAggByteMapSecondOutOfRangeBeforeChar() {
+		preAggMapOutOfRangeBeforeChar(1);
+	}
+
+	private void preAggMapOutOfRangeBeforeChar(int row) {
+		double[] preAV = null; // the aggregate target must never be accessed for an out-of-range request, so null is passed here.
+		char[] m = new char[data.length];
+		a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, -412, data[0] - 1, 0, m);
+	}
+
+	private void preAggMapOutOfRangeBeforeByte(int row) {
+		double[] preAV = null; // the aggregate target must never be accessed for an out-of-range request, so null is passed here.
+		byte[] m = new byte[data.length];
+		a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, -412, data[0] - 1, 0, m);
+	}
+
+	@Test
+	public void preAggByteMapFirstOutOfRangeAfterByte() {
+		preAggMapOutOfRangeAfterByte(0);
+	}
+
+	@Test
+	public void preAggByteMapSecondOutOfRangeAfterByte() {
+		preAggMapOutOfRangeAfterByte(1);
+	}
+
+	@Test
+	public void preAggByteMapFirstOutOfRangeAfterChar() {
+		preAggMapOutOfRangeAfterChar(0);
+	}
+
+	@Test
+	public void preAggByteMapSecondOutOfRangeAfterChar() {
+		preAggMapOutOfRangeAfterChar(1);
+	}
+
+	private void preAggMapOutOfRangeAfterChar(int row) {
+		double[] preAV = null; // the aggregate target must never be accessed for an out-of-range request, so null is passed here.
+		char[] m = new char[data.length];
+		int id = data[data.length - 1] + 10;
+		a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, id, id + 10, 0, m);
+	}
+
+	private void preAggMapOutOfRangeAfterByte(int row) {
+		double[] preAV = null; // the aggregate target must never be accessed for an out-of-range request, so null is passed here.
+		byte[] m = new byte[data.length];
+		int id = data[data.length - 1] + 10;
+		a.preAggregateDenseMap(this.leftM, preAV, row, 1 + row, id, id + 10, 0, m);
+	}
+
+	private final double[] sumIndexes() {
+		double[] lmv = leftM.getDenseBlockValues();
+		double[] ret = new double[leftM.getNumRows()];
+		for(int j = 0; j < leftM.getNumRows(); j++) {
+			final int off = j * leftM.getNumColumns();
+			for(int i = 0; i < data.length; i++)
+				ret[j] += lmv[data[i] + off];
+		}
+		return ret;
+	}
+
+	private final void fill(byte[] a, byte v) {
+		for(int i = 0; i < a.length; i++)
+			a[i] = v;
+	}
+
+	private final void fill(char[] a, char v) {
+		for(int i = 0; i < a.length; i++)
+			a[i] = v;
+	}
+
+	private String getString(AOffset a) {
+		String os = a.toString();
+		return os.substring(os.indexOf("["), os.length());
+	}
+
+}
diff --git a/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTests.java b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTests.java
index 0fca9742217..a7c03284143 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTests.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTests.java
@@ -20,6 +20,8 @@ package org.apache.sysds.test.component.compress.offset;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
@@ -33,6 +35,8 @@ import java.util.Collection;
 
 import org.apache.commons.lang.NotImplementedException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.compress.DMLCompressionException;
 import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
 import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
@@ -47,8 +51,9 @@
 
 @RunWith(value = Parameterized.class)
 public class OffsetTests {
+	protected static final Log LOG = LogFactory.getLog(OffsetTests.class.getName());
 
-	private static final long sizeTolerance = 265;
+	private static final long sizeTolerance = 100;
 
 	public int[] data;
 	public OFF_TYPE type;
@@ -72,17 +77,25 @@ public static Collection<Object[]> data() {
 		tests.add(new Object[] {new int[] {0, 256}, t});
 		tests.add(new Object[] {new int[] {0, 254}, t});
 		tests.add(new Object[] {new int[] {0, Character.MAX_VALUE}, t});
+		tests.add(new Object[] {new int[] {0, Character.MAX_VALUE, ((int) Character.MAX_VALUE) * 2}, t});
+		tests.add(new Object[] {new int[] {2, Character.MAX_VALUE + 2}, t});
 		tests.add(new Object[] {new int[] {0, ((int) Character.MAX_VALUE) + 1}, t});
 		tests.add(new Object[] {new int[] {0, ((int) Character.MAX_VALUE) - 1}, t});
 		tests.add(new Object[] {new int[] {0, 256 * 2}, t});
 		tests.add(new Object[] {new int[] {0, 255 * 2}, t});
 		tests.add(new Object[] {new int[] {0, 254 * 2}, t});
+		tests.add(new Object[] {new int[] {0, 510, 765}, t});
 		tests.add(new Object[] {new int[] {0, 254 * 3}, t});
+		tests.add(new Object[] {new int[] {0, 255, 255 * 2, 255 * 3}, t});
+		tests.add(new Object[] {new int[] {0, 255 * 2, 255 * 3}, t});
+		tests.add(new Object[] {new int[] {0, 255 * 2, 255 * 3, 255 * 10}, t});
 		tests.add(new Object[] {new int[] {0, 255 * 3}, t});
+		tests.add(new Object[] {new int[] {0, 255 * 4}, t});
 		tests.add(new Object[] {new int[] {0, 256 * 3}, t});
 		tests.add(new Object[] {new int[] {255 * 3, 255 * 5}, t});
 		tests.add(new Object[] {new int[] {1000000, 1000000 + 255 * 5}, t});
 		tests.add(new Object[] {new int[] {100000000, 100000000 + 255 * 5}, t});
+		tests.add(new Object[] {new int[] {100000000, 100001275, 100001530}, t});
 		tests.add(new Object[] {new int[] {0, 1, 2, 3, 255 * 4, 1500}, t});
 		tests.add(new Object[] {new int[] {0, 1, 2, 3, 4, 5}, t});
 		tests.add(new Object[] {new int[] {2458248, 2458249, 2458253, 2458254, 2458256, 2458257, 2458258, 2458262,
@@ -109,8 +122,7 @@ public OffsetTests(int[] data, OFF_TYPE type) {
 	@Test
 	public void testConstruction() {
 		try {
-			AIterator i = o.getIterator();
-			compare(i, data);
+			compare(o, data);
 		}
 		catch(Exception e) {
 			e.printStackTrace();
@@ -118,6 +130,30 @@ public void testConstruction() {
 		}
 	}
 
+	@Test
+	public void testCacheExists() {
+		if(data.length > 2) {
+			AIterator i = o.getIterator();
+			i.next();
+			o.cacheIterator(i, data[1]);
+			AIterator ii = o.getIterator(data[1]);
+			assertTrue(ii.equals(i));
+			ii.next();
+			assertFalse(ii.equals(i));
+		}
+	}
+
+	@Test
+	public void testCacheDontExists() {
+		if(data.length > 2) {
+			AIterator i = o.getIterator();
+			i.next();
+			o.cacheIterator(i, data[1]);
+			AIterator ii = o.getIterator(data[2]);
+			assertFalse(ii.equals(i));
+		}
+	}
+
 	@Test
 	public void testSerialization() {
 		try {
@@ -131,9 +167,7 @@ public void testSerialization() {
 			DataInputStream fis = new DataInputStream(bis);
 
 			AOffset n = OffsetFactory.readIn(fis);
-
-			AIterator i = n.getIterator();
-			compare(i, data);
+			compare(n, data);
 		}
 		catch(IOException e) {
 			throw new RuntimeException("Error in io", e);
@@ -170,23 +204,25 @@ public void testOnDiskSizeInBytes() {
 	}
 
 	@Test
-	public void testInMemoryEstimateIsSameAsActualOrSmaller() {
+	public void testInMemoryEstimateIsSameAsActualOrLarger() {
 		try {
-			long inMemorySize = o.getInMemorySize();
+			final long inMemorySize = o.getInMemorySize();
 			long estimatedSize;
 			switch(type) {
 				case BYTE:
-					estimatedSize = OffsetByte.getInMemorySize(data.length);
+					estimatedSize = OffsetByte.estimateInMemorySize(data.length, data[data.length - 1] - data[0]);
 					break;
 				case CHAR:
-					estimatedSize = OffsetChar.getInMemorySize(data.length);
+					estimatedSize = OffsetChar.estimateInMemorySize(data.length, data[data.length - 1] - data[0]);
 					break;
 				default:
 					throw new DMLCompressionException("Unknown input");
 			}
-			final String errorMessage = "in memory size: " + inMemorySize + " is not smaller than estimate: "
-				+ estimatedSize + " with tolerance " + sizeTolerance;
-			assertTrue(errorMessage, inMemorySize - sizeTolerance <= estimatedSize);
+			if(!(inMemorySize <= estimatedSize + sizeTolerance)) {
+				fail("in memory size: " + inMemorySize + " is larger than the estimate: " + estimatedSize
+					+ " with tolerance " + sizeTolerance);
+			}
 		}
 		catch(Exception e) {
 			e.printStackTrace();
@@ -194,15 +230,174 @@ public void testInMemoryEstimateIsSameAsActualOrSmaller() {
 		}
 	}
 
-	private void compare(AIterator i, int[] v) {
-		for(int j = 0; j < v.length; j++) {
+	@Test
+	public void testSkipToContainedIndex() {
+		try {
+			assertEquals(data[data.length - 1], o.getIterator().skipTo(data[data.length - 1]));
+		}
+		catch(Exception e) {
+			e.printStackTrace();
+			fail("Failed skipping to last index");
+		}
+	}
+
+	@Test
+	public void testSkipToContainedIndexPlusOne() {
+		try {
+			assertNotEquals(data[data.length - 1] + 1, o.getIterator().skipTo(data[data.length - 1]));
+		}
+		catch(Exception e) {
+			e.printStackTrace();
+			fail("Failed skipping to last index");
+		}
+	}
+
+	@Test
+	public void testSkipToContainedIndexPlusN() {
+		try {
+			if(data.length > 1)
+				assertTrue(data[1] <= o.getIterator().skipTo(data[1] + 1));
+		}
+		catch(Exception e) {
+			e.printStackTrace();
+			fail("Failed skipping past a contained index");
+		}
+	}
+
+	@Test
+	public void testSkipToContainedIndexMinusOne() {
+		try {
+			int v = data[data.length - 1];
+			int maxDiff = 1;
+			assertTrue(v <= o.getIterator().skipTo(v - 1) + maxDiff);
+		}
+		catch(Exception e) {
+			e.printStackTrace();
+			fail("Failed skipping to last index");
+		}
+	}
+
+	@Test
+	public void testSkipToContainedIndexMinusN() {
+		try {
+			int v = data[data.length - 1];
+			int maxDiff = 142;
+			assertTrue(v <= o.getIterator().skipTo(v - 1) + maxDiff);
+		}
+		catch(Exception e) {
+			e.printStackTrace();
+			fail("Failed skipping to last index");
+		}
+	}
+
+	@Test
+	public void testToString() {
+		String os = o.toString();
+		os = os.substring(os.indexOf("["), os.length());
+		String vs = Arrays.toString(data);
+		if(!os.equals(vs)) {
+			fail("The two array strings are not equivalent with " + type + "\n" + os + " : " + vs);
+		}
+	}
+
+	@Test
+	public void testIsNotOverFirstDataPoint() {
+		assertFalse(o.getIterator().isNotOver(data[0]));
+	}
+
+	@Test
+	public void testIsNotOverSecondDataPointOnInit() {
+		if(data.length > 1)
+			assertTrue(o.getIterator().isNotOver(data[1]));
+	}
+
+	@Test
+	public void testIsNotOverSecondDataPointOnInitToSecond() {
+		if(data.length > 1)
+			assertFalse(o.getIterator(data[1]).isNotOver(data[1]));
+	}
+
+	@Test
+	public void testIsOverFirstDataPointOnInitToSecond() {
+		if(data.length > 1)
+			assertFalse(o.getIterator(data[1]).isNotOver(data[0]));
+	}
+
+	@Test
+	public void testGetDataIndexOnInit() {
+		assertTrue(o.getIterator().getDataIndex() == 0);
+	}
+
+	@Test
+	public void testGetDataIndexOnInitSkipToFirst() {
+		if(data.length > 1)
+			assertTrue(o.getIterator(data[1]).getDataIndex() == 1);
+	}
+
+	@Test
+	public void testGetDataIndexOnInitSkipToN() {
+		if(data.length > 3)
+			assertTrue(o.getIterator(data[2]).getDataIndex() == 2);
+	}
+
+	@Test
+	public void testGetDataAfterNext() {
+		if(data.length > 1)
+			testGetDataAfterNextN(o.getIterator());
+	}
+
+	@Test
+	public void testGetDataAfterNext2() {
+		if(data.length > 2)
+			testGetDataAfterNextN(o.getIterator(2));
+	}
+
+	public void testGetDataAfterNextN(AIterator it) {
+		int d = it.getDataIndex();
+		it.next();
+		assertEquals(d + 1, it.getDataIndex());
+	}
+
+	@Test
+	public void testGetDataAfterNextComb() {
+		if(data.length > 1)
+			testGetDataAfterNextCombN(o.getIterator());
+	}
+
+	@Test
+	public void testGetDataAfterNextComb2() {
+		if(data.length > 2)
+			testGetDataAfterNextCombN(o.getIterator(2));
+	}
+
+	public void testGetDataAfterNextCombN(AIterator it) {
+		int d = it.getDataIndexAndIncrement();
+		assertEquals(d + 1, it.getDataIndex());
+	}
+
+	@Test
+	public void testGetUnreasonablyHighSkip() {
+		assertTrue(o.getIterator(Integer.MAX_VALUE - 1000) == null);
+	}
+
+	@Test
+	public void testCacheNullIterator() {
+		o.cacheIterator(null, 21415);
+	}
+
+	protected static void compare(AOffset o, int[] v) {
+		AIterator i = o.getIterator();
+		if(v[0] != i.value())
+			fail("incorrect result using : " + o.getClass().getSimpleName() + " expected: " + Arrays.toString(v)
+				+ " but was :" + o.toString());
+		for(int j = 1; j < v.length; j++) {
+			i.next();
 			if(v[j] != i.value())
 				fail("incorrect result using : " + o.getClass().getSimpleName() + " expected: " + Arrays.toString(v)
 					+ " but was :" + o.toString());
-			if(i.hasNext())
-				i.next();
 		}
+		if(i.getOffsetsIndex() != o.getOffsetsLength())
+			fail("The allocated offsets are longer than needed: idx " + i.getOffsetsIndex() + " vs len "
+				+ o.getOffsetsLength() + "\n" + Arrays.toString(v));
 	}
-}
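The skipTo assertions above deliberately pin down only loose bounds; all of them are satisfied if skipTo(k) advances to the first stored offset at or after k, parking on the last offset otherwise. A sketch under that assumption (the real AIterator also updates its data index and may consult cached iterators):

public class SkipToSketch {
	// Sketch: return the first stored offset >= k, clamped to the last offset
	// (an assumption consistent with the assertions, not the exact AIterator code).
	static int skipTo(int[] offsets, int k) {
		int j = 0;
		while(j < offsets.length - 1 && offsets[j] < k)
			j++;
		return offsets[j];
	}

	public static void main(String[] args) {
		int[] offsets = {0, 510, 765};
		System.out.println(skipTo(offsets, 765)); // 765, as testSkipToContainedIndex expects
		System.out.println(skipTo(offsets, 511)); // 765, within the tolerated bounds
	}
}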
diff --git a/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestsDefaultConstructor.java b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestsDefaultConstructor.java
new file mode 100644
index 00000000000..ea9017df549
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestsDefaultConstructor.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.component.compress.offset;
+
+import static org.junit.Assert.fail;
+
+import java.util.ArrayList;
+import java.util.Collection;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
+import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+@RunWith(value = Parameterized.class)
+public class OffsetTestsDefaultConstructor {
+	protected static final Log LOG = LogFactory.getLog(OffsetTestsDefaultConstructor.class.getName());
+
+	private static final long sizeTolerance = 100;
+
+	public int[] data;
+	private AOffset o;
+
+	@Parameters
+	public static Collection<Object[]> data() {
+		ArrayList<Object[]> tests = new ArrayList<>();
+		// It is assumed that the input is in sorted order, all values are positive and there are no duplicates.
+
+		tests.add(new Object[] {new int[] {1, 2}});
+		tests.add(new Object[] {new int[] {2, 142}});
+		tests.add(new Object[] {new int[] {142, 421}});
+		tests.add(new Object[] {new int[] {1, 1023}});
+		tests.add(new Object[] {new int[] {1023, 1024}});
+		tests.add(new Object[] {new int[] {1023}});
+		tests.add(new Object[] {new int[] {0, 1, 2, 3, 4, 5}});
+		tests.add(new Object[] {new int[] {0}});
+		tests.add(new Object[] {new int[] {Character.MAX_VALUE, ((int) Character.MAX_VALUE) + 1}});
+		tests.add(new Object[] {new int[] {Character.MAX_VALUE, ((int) Character.MAX_VALUE) * 2}});
+		tests.add(new Object[] {new int[] {0, 256}});
+		tests.add(new Object[] {new int[] {0, 254}});
+		tests.add(new Object[] {new int[] {0, Character.MAX_VALUE}});
+		tests.add(new Object[] {new int[] {0, Character.MAX_VALUE, ((int) Character.MAX_VALUE) * 2}});
+		tests.add(new Object[] {new int[] {2, Character.MAX_VALUE + 2}});
+		tests.add(new Object[] {new int[] {0, ((int) Character.MAX_VALUE) + 1}});
+		tests.add(new Object[] {new int[] {0, ((int) Character.MAX_VALUE) - 1}});
+		tests.add(new Object[] {new int[] {0, 256 * 2}});
+		tests.add(new Object[] {new int[] {0, 255 * 2}});
+		tests.add(new Object[] {new int[] {0, 254 * 2}});
+		tests.add(new Object[] {new int[] {0, 510, 765}});
+		tests.add(new Object[] {new int[] {0, 120, 230}});
+		tests.add(new Object[] {new int[] {1000, 1120, 1230}});
+		tests.add(new Object[] {new int[] {0, 254 * 3}});
+		tests.add(new Object[] {new int[] {0, 255, 255 * 2, 255 * 3}});
+		tests.add(new Object[] {new int[] {0, 255 * 2, 255 * 3}});
+		tests.add(new Object[] {new int[] {0, 255 * 2, 255 * 3, 255 * 10}});
+		tests.add(new Object[] {new int[] {0, 255 * 3}});
+		tests.add(new Object[] {new int[] {0, 255 * 4}});
+		tests.add(new Object[] {new int[] {0, 256 * 3}});
+		tests.add(new Object[] {new int[] {255 * 3, 255 * 5}});
+		tests.add(new Object[] {new int[] {1000000, 1000000 + 255 * 5}});
+		tests.add(new Object[] {new int[] {100000000, 100000000 + 255 * 5}});
+		tests.add(new Object[] {new int[] {100000000, 100001275, 100001530}});
+		tests.add(new Object[] {new int[] {0, 1, 2, 3, 255 * 4, 1500}});
+		tests.add(new Object[] {new int[] {0, 1, 2, 3, 4, 5}});
+		tests.add(new Object[] {new int[] {2458248, 2458249, 2458253, 2458254, 2458256, 2458257, 2458258, 2458262,
+			2458264, 2458266, 2458267, 2458271, 2458272, 2458275, 2458276, 2458281}});
+
+		return tests;
+	}
+
+	public OffsetTestsDefaultConstructor(int[] data) {
+		this.data = data;
+		this.o = OffsetFactory.createOffset(data);
+	}
+
+	@Test
+	public void testConstruction() {
+		try {
+			OffsetTests.compare(o, data);
+		}
+		catch(Exception e) {
+			e.printStackTrace();
+			throw e;
+		}
+	}
+
+	@Test
+	public void testMemoryEstimate() {
+		final long est = OffsetFactory.estimateInMemorySize(data.length, data[data.length - 1]);
+		final long act = o.getInMemorySize();
+
+		if(!(act <= est + sizeTolerance))
+			fail("In-memory size " + act + " is not within tolerance of the estimate " + est);
+	}
+}
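As a short usage sketch of the estimate-before-construct pattern this class exercises, using the same factory calls as testMemoryEstimate above (the tests treat the estimate as an upper bound up to a small tolerance):

import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;

public class OffsetEstimateExample {
	public static void main(String[] args) {
		int[] idx = {0, 510, 765};
		// estimate from (count, largest offset) before materializing anything
		long est = OffsetFactory.estimateInMemorySize(idx.length, idx[idx.length - 1]);
		AOffset off = OffsetFactory.createOffset(idx);
		long act = off.getInMemorySize();
		System.out.println("estimate=" + est + " actual=" + act); // act <= est + tolerance
	}
}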