Skip to content

Commit

Permalink
[MINOR] JIT optimize LMM Pre-aggregate
Browse files Browse the repository at this point in the history
Because of abstract classes, the efficiency of the JIT compiler
is subpar for AMapToData instances. To improve this, I have added
individually overridden methods in some of the Map types.
This duplicates code, but improves performance by 30-50% according to the
profiler.
  • Loading branch information
Baunsgaard committed Oct 30, 2023
1 parent c21fa99 commit a826c10
Show file tree
Hide file tree
Showing 6 changed files with 167 additions and 76 deletions.
Expand Up @@ -129,8 +129,8 @@ public void set(int n, Integer v) {
*
* @param n index to set.
* @param v the value to set it to.
* @return v as encoded, note this value can be different than the one put in if the map is not able to represent
* the value
* @return v as encoded, note this value can be different than the one put in if the map is not able to represent the
* value
*/
public abstract int setAndGet(int n, int v);

Expand Down Expand Up @@ -235,16 +235,19 @@ protected void preAggregateDenseToRowBy8(double[] mV, double[] preAV, int cl, in
off += cl;
for(int rc = cl; rc < cl + h; rc++, off++)
preAV[getIndex(rc)] += mV[off];
for(int rc = cl + h; rc < cu; rc += 8, off += 8) {
preAV[getIndex(rc)] += mV[off];
preAV[getIndex(rc + 1)] += mV[off + 1];
preAV[getIndex(rc + 2)] += mV[off + 2];
preAV[getIndex(rc + 3)] += mV[off + 3];
preAV[getIndex(rc + 4)] += mV[off + 4];
preAV[getIndex(rc + 5)] += mV[off + 5];
preAV[getIndex(rc + 6)] += mV[off + 6];
preAV[getIndex(rc + 7)] += mV[off + 7];
}
for(int rc = cl + h; rc < cu; rc += 8, off += 8)
preAggregateDenseToRowVec8(mV, preAV, rc, off);
}

/**
 * Adds eight consecutive values of mV into preAV, each at the position this map returns for the matching index.
 *
 * For k in 0..7 the value mV[off + k] is added to preAV[getIndex(rc + k)]. The eight updates are written out one by
 * one instead of in a loop, and this method performs no range checks of its own.
 *
 * @param mV    The values to aggregate
 * @param preAV The output array that is accumulated into
 * @param rc    The first index to look up in this map
 * @param off   The first index to read in mV
 */
protected void preAggregateDenseToRowVec8(double[] mV, double[] preAV, int rc, int off){
preAV[getIndex(rc)] += mV[off];
preAV[getIndex(rc + 1)] += mV[off + 1];
preAV[getIndex(rc + 2)] += mV[off + 2];
preAV[getIndex(rc + 3)] += mV[off + 3];
preAV[getIndex(rc + 4)] += mV[off + 4];
preAV[getIndex(rc + 5)] += mV[off + 5];
preAV[getIndex(rc + 6)] += mV[off + 6];
preAV[getIndex(rc + 7)] += mV[off + 7];
}

/**
Expand Down Expand Up @@ -329,8 +332,7 @@ protected void preAggregateDenseMultiRowContiguousBy1(double[] mV, int nCol, int
* @param cu The column in m to end at (not inclusive)
* @param indexes The Offset Indexes to iterate through
*/
public final void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu,
AOffset indexes) {
public final void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, AOffset indexes) {
indexes.preAggregateDenseMap(m, preAV, rl, ru, cl, cu, getUnique(), this);
}

Expand Down Expand Up @@ -417,6 +419,8 @@ public final int[] getCounts() {
* @param nCol The number of columns
*/
public final void preAggregateDDC_DDC(AMapToData tm, IDictionary td, Dictionary ret, int nCol) {
if(td.getNumberOfValues(nCol) != tm.nUnique)
throw new DMLCompressionException("Invalid map and dict combination");
if(nCol == 1)
preAggregateDDC_DDCSingleCol(tm, td.getValues(), ret.getValues());
else
Expand All @@ -431,31 +435,55 @@ public final void preAggregateDDC_DDC(AMapToData tm, IDictionary td, Dictionary
* @param ret The output dictionary to aggregate into
*/
protected void preAggregateDDC_DDCSingleCol(AMapToData tm, double[] td, double[] v) {

final int sz = size();
for(int r = 0; r < sz; r++)
final int h = sz % 8;
for(int r = 0; r < h; r++)
v[getIndex(r)] += td[tm.getIndex(r)];
for(int r = h; r < sz; r += 8)
preAggregateDDC_DDCSingleCol_vec(tm, td, v, r);

}

/**
 * Processes eight consecutive rows of the single column DDC-DDC pre-aggregate.
 *
 * For each row i in r..r+7 the value td[tm.getIndex(i)] is added to v[getIndex(i)]. The eight updates are written
 * out one by one instead of in a loop, and this method performs no range checks of its own.
 *
 * @param tm The other side map, used to select the value to read from td
 * @param td The values to read from
 * @param v  The output array that is accumulated into, addressed through this map
 * @param r  The first of the eight rows to process
 */
protected void preAggregateDDC_DDCSingleCol_vec(AMapToData tm, double[] td, double[] v, int r) {
// Row numbers of the seven rows following r.
final int r2 = r + 1, r3 = r + 2, r4 = r + 3, r5 = r + 4, r6 = r + 5, r7 = r + 6, r8 = r + 7;
v[getIndex(r)] += td[tm.getIndex(r)];
v[getIndex(r2)] += td[tm.getIndex(r2)];
v[getIndex(r3)] += td[tm.getIndex(r3)];
v[getIndex(r4)] += td[tm.getIndex(r4)];
v[getIndex(r5)] += td[tm.getIndex(r5)];
v[getIndex(r6)] += td[tm.getIndex(r6)];
v[getIndex(r7)] += td[tm.getIndex(r7)];
v[getIndex(r8)] += td[tm.getIndex(r8)];
}

/**
* PreAggregate into dictionary with two sides of DDC guaranteed to have multiple column tuples.
*
* @param tm Map of other side
* @param tm Map of other side that indicates the indexes to take out and put into ret
* @param td Dictionary to take values from (other side dictionary)
* @param ret The output dictionary to aggregate into
* @param nCol The number of columns
* @param nCol The number of columns in td
*/
protected void preAggregateDDC_DDCMultiCol(AMapToData tm, IDictionary td, double[] v, int nCol) {
protected void preAggregateDDC_DDCMultiCol(final AMapToData tm, final IDictionary td, final double[] v,
final int nCol) {

final int sz = size();
final int h = sz % 8;
for(int r = 0; r < h; r++)
td.addToEntry(v, tm.getIndex(r), getIndex(r), nCol);
for(int r = h; r < sz; r += 8)
preAggregateDDC_DDCMultiCol_vec(tm, td, v, nCol, r);

for(int r = h; r < sz; r += 8) {
int r2 = r + 1, r3 = r + 2, r4 = r + 3, r5 = r + 4, r6 = r + 5, r7 = r + 6, r8 = r + 7;
td.addToEntryVectorized(v, tm.getIndex(r), tm.getIndex(r2), tm.getIndex(r3), tm.getIndex(r4),
tm.getIndex(r5), tm.getIndex(r6), tm.getIndex(r7), tm.getIndex(r8), getIndex(r), getIndex(r2),
getIndex(r3), getIndex(r4), getIndex(r5), getIndex(r6), getIndex(r7), getIndex(r8), nCol);
}
}

/**
 * Processes eight consecutive rows of the multi column DDC-DDC pre-aggregate with a single call to
 * td.addToEntryVectorized.
 *
 * The call receives v, then the eight indexes of tm for rows r..r+7, then the eight indexes of this map for the same
 * rows, and finally nCol.
 *
 * NOTE(review): presumably the tm indexes select the tuples read from td and the indexes of this map select the
 * tuples written in v - confirm against IDictionary.addToEntryVectorized.
 *
 * @param tm   The other side map
 * @param td   The dictionary the call is made on
 * @param v    The output array handed to the dictionary
 * @param nCol The number of columns, passed through unchanged
 * @param r    The first of the eight rows to process
 */
protected void preAggregateDDC_DDCMultiCol_vec(final AMapToData tm, final IDictionary td, final double[] v,
final int nCol, final int r) {
// Row numbers of the seven rows following r.
final int r2 = r + 1, r3 = r + 2, r4 = r + 3, r5 = r + 4, r6 = r + 5, r7 = r + 6, r8 = r + 7;
// Argument order matters: all eight tm indexes first, then all eight indexes of this map.
td.addToEntryVectorized(v, //
tm.getIndex(r), tm.getIndex(r2), tm.getIndex(r3), tm.getIndex(r4), tm.getIndex(r5), tm.getIndex(r6),
tm.getIndex(r7), tm.getIndex(r8), getIndex(r), //
getIndex(r2), getIndex(r3), getIndex(r4), getIndex(r5), getIndex(r6), getIndex(r7), getIndex(r8), nCol);
}

/**
Expand Down Expand Up @@ -577,8 +605,8 @@ private int preAggregateSDCZ_DDCMultiCol_vect(AMapToData tm, IDictionary td, dou
final int h = size % 8;
int i = 0;
while(i < size - h) {
int t1 = getIndex(i), t2 = getIndex(i + 1), t3 = getIndex(i + 2), t4 = getIndex(i + 3),
t5 = getIndex(i + 4), t6 = getIndex(i + 5), t7 = getIndex(i + 6), t8 = getIndex(i + 7);
int t1 = getIndex(i), t2 = getIndex(i + 1), t3 = getIndex(i + 2), t4 = getIndex(i + 3), t5 = getIndex(i + 4),
t6 = getIndex(i + 5), t7 = getIndex(i + 6), t8 = getIndex(i + 7);

int f1 = it.value(), f2 = it.next(), f3 = it.next(), f4 = it.next(), f5 = it.next(), f6 = it.next(),
f7 = it.next(), f8 = it.next();
Expand Down Expand Up @@ -607,8 +635,7 @@ public final void preAggregateSDCZ_SDCZ(AMapToData tm, IDictionary td, AOffset t
preAggregateSDCZ_SDCZMultiCol(tm, td, tof, of, ret.getValues(), nCol);
}

private final void preAggregateSDCZ_SDCZSingleCol(AMapToData tm, double[] td, AOffset tof, AOffset of,
double[] dv) {
private final void preAggregateSDCZ_SDCZSingleCol(AMapToData tm, double[] td, AOffset tof, AOffset of, double[] dv) {
final AOffsetIterator itThat = tof.getOffsetIterator();
final AOffsetIterator itThis = of.getOffsetIterator();
final int tSize = tm.size() - 1, size = size() - 1;
Expand Down Expand Up @@ -872,7 +899,7 @@ public void verify() {
}
}
}

@Override
public String toString() {
final int sz = size();
Expand Down
Expand Up @@ -48,6 +48,7 @@ public MapToByte(int unique, int size) {
/**
 * Creates a map around an already allocated byte array.
 *
 * The array is stored by reference (it is not copied) and verify() is called once the field is assigned.
 *
 * NOTE(review): verify() is invoked from inside the constructor; if a subclass overrides it, the override runs before
 * the subclass' own fields are initialized - confirm this is safe.
 *
 * @param unique The value forwarded to the super constructor
 * @param data   The backing array of the map
 */
protected MapToByte(int unique, byte[] data) {
super(unique);
_data = data;
verify();
}

protected MapToUByte toUByte() {
Expand Down Expand Up @@ -155,17 +156,21 @@ protected void preAggregateDenseToRowBy8(double[] mV, double[] preAV, int cl, in
final int h = (cu - cl) % 8;
off += cl;
for(int rc = cl; rc < cl + h; rc++, off++)
preAV[_data[rc] & 0xFF] += mV[off];
for(int rc = cl + h; rc < cu; rc += 8, off += 8) {
preAV[_data[rc] & 0xFF] += mV[off];
preAV[_data[rc + 1] & 0xFF] += mV[off + 1];
preAV[_data[rc + 2] & 0xFF] += mV[off + 2];
preAV[_data[rc + 3] & 0xFF] += mV[off + 3];
preAV[_data[rc + 4] & 0xFF] += mV[off + 4];
preAV[_data[rc + 5] & 0xFF] += mV[off + 5];
preAV[_data[rc + 6] & 0xFF] += mV[off + 6];
preAV[_data[rc + 7] & 0xFF] += mV[off + 7];
}
preAV[getIndex(rc)] += mV[off];
for(int rc = cl + h; rc < cu; rc += 8, off += 8)
preAggregateDenseToRowVec8(mV, preAV, rc, off);
}

/**
 * Adds eight consecutive values of mV into preAV, each at the position this map returns for the matching index.
 *
 * For k in 0..7 the value mV[off + k] is added to preAV[getIndex(rc + k)]. The eight updates are written out one by
 * one instead of in a loop, and this method performs no range checks of its own.
 *
 * NOTE(review): presumably this override exists so the JIT can bind getIndex to this class' implementation - confirm
 * before replacing it with the inherited method.
 *
 * @param mV    The values to aggregate
 * @param preAV The output array that is accumulated into
 * @param rc    The first index to look up in this map
 * @param off   The first index to read in mV
 */
@Override
protected void preAggregateDenseToRowVec8(double[] mV, double[] preAV, int rc, int off) {
preAV[getIndex(rc)] += mV[off];
preAV[getIndex(rc + 1)] += mV[off + 1];
preAV[getIndex(rc + 2)] += mV[off + 2];
preAV[getIndex(rc + 3)] += mV[off + 3];
preAV[getIndex(rc + 4)] += mV[off + 4];
preAV[getIndex(rc + 5)] += mV[off + 5];
preAV[getIndex(rc + 6)] += mV[off + 6];
preAV[getIndex(rc + 7)] += mV[off + 7];
}

@Override
Expand Down
Expand Up @@ -49,6 +49,7 @@ public MapToChar(int unique, int size) {
/**
 * Creates a map around an already allocated char array.
 *
 * The array is stored by reference (it is not copied) and verify() is called once the field is assigned.
 *
 * NOTE(review): verify() is invoked from inside the constructor; if a subclass overrides it, the override runs before
 * the subclass' own fields are initialized - confirm this is safe.
 *
 * @param unique The value forwarded to the super constructor
 * @param data   The backing array of the map
 */
public MapToChar(int unique, char[] data) {
super(unique);
_data = data;
verify();
}

@Override
Expand Down Expand Up @@ -113,8 +114,8 @@ public void write(DataOutput out) throws IOException {
out.writeInt(_data.length);
final int BS = 100;
if(_data.length > BS) {
final byte[] buff = new byte[BS*2];
for(int i = 0; i < _data.length; ) {
final byte[] buff = new byte[BS * 2];
for(int i = 0; i < _data.length;) {
if(i + BS <= _data.length) {
for(int o = 0; o < BS; o++) {
IOUtilFunctions.shortToBa(_data[i++], buff, o * 2);
Expand Down Expand Up @@ -152,17 +153,21 @@ protected void preAggregateDenseToRowBy8(double[] mV, double[] preAV, int cl, in
final int h = (cu - cl) % 8;
off += cl;
for(int rc = cl; rc < cl + h; rc++, off++)
preAV[_data[rc]] += mV[off];
for(int rc = cl + h; rc < cu; rc += 8, off += 8) {
preAV[_data[rc]] += mV[off];
preAV[_data[rc + 1]] += mV[off + 1];
preAV[_data[rc + 2]] += mV[off + 2];
preAV[_data[rc + 3]] += mV[off + 3];
preAV[_data[rc + 4]] += mV[off + 4];
preAV[_data[rc + 5]] += mV[off + 5];
preAV[_data[rc + 6]] += mV[off + 6];
preAV[_data[rc + 7]] += mV[off + 7];
}
preAV[getIndex(rc)] += mV[off];
for(int rc = cl + h; rc < cu; rc += 8, off += 8)
preAggregateDenseToRowVec8(mV, preAV, rc, off);
}

/**
 * Adds eight consecutive values of mV into preAV, each at the position this map returns for the matching index.
 *
 * For k in 0..7 the value mV[off + k] is added to preAV[getIndex(rc + k)]. The eight updates are written out one by
 * one instead of in a loop, and this method performs no range checks of its own.
 *
 * NOTE(review): presumably this override exists so the JIT can bind getIndex to this class' implementation - confirm
 * before replacing it with the inherited method.
 *
 * @param mV    The values to aggregate
 * @param preAV The output array that is accumulated into
 * @param rc    The first index to look up in this map
 * @param off   The first index to read in mV
 */
@Override
protected void preAggregateDenseToRowVec8(double[] mV, double[] preAV, int rc, int off){
preAV[getIndex(rc)] += mV[off];
preAV[getIndex(rc + 1)] += mV[off + 1];
preAV[getIndex(rc + 2)] += mV[off + 2];
preAV[getIndex(rc + 3)] += mV[off + 3];
preAV[getIndex(rc + 4)] += mV[off + 4];
preAV[getIndex(rc + 5)] += mV[off + 5];
preAV[getIndex(rc + 6)] += mV[off + 6];
preAV[getIndex(rc + 7)] += mV[off + 7];
}

@Override
Expand Down Expand Up @@ -304,4 +309,25 @@ public boolean equals(AMapToData e) {
e.getUnique() == getUnique() && //
Arrays.equals(((MapToChar) e)._data, _data);
}

/**
 * Routes one vectorized step of the single column DDC-DDC pre-aggregate.
 *
 * When the other side map is a MapToChar the call is forwarded to the MapToChar specific variant, in every other
 * case the super class implementation is used.
 *
 * @param tm The other side map
 * @param td The values to read from
 * @param v  The output array that is accumulated into
 * @param r  The first row of the step
 */
@Override
protected void preAggregateDDC_DDCSingleCol_vec(AMapToData tm, double[] td, double[] v, int r) {
	final boolean otherIsChar = tm instanceof MapToChar;
	if(!otherIsChar) {
		// No specialization available for this combination, use the generic path.
		super.preAggregateDDC_DDCSingleCol_vec(tm, td, v, r);
		return;
	}
	preAggregateDDC_DDCSingleCol_vecChar((MapToChar) tm, td, v, r);
}

/**
 * Processes eight consecutive rows of the single column DDC-DDC pre-aggregate when the other side map is a
 * MapToChar.
 *
 * For each row i in r..r+7 the value td[tm.getIndex(i)] is added to v[getIndex(i)]. The eight updates are written
 * out one by one instead of in a loop, and this method performs no range checks of its own.
 *
 * NOTE(review): presumably the concrete MapToChar parameter type is what lets the JIT bind tm.getIndex statically -
 * confirm before merging this with the generic variant.
 *
 * @param tm The other side map, used to select the value to read from td
 * @param td The values to read from
 * @param v  The output array that is accumulated into, addressed through this map
 * @param r  The first of the eight rows to process
 */
protected final void preAggregateDDC_DDCSingleCol_vecChar(MapToChar tm, double[] td, double[] v, int r) {
// Row numbers of the seven rows following r.
final int r2 = r + 1, r3 = r + 2, r4 = r + 3, r5 = r + 4, r6 = r + 5, r7 = r + 6, r8 = r + 7;
v[getIndex(r)] += td[tm.getIndex(r)];
v[getIndex(r2)] += td[tm.getIndex(r2)];
v[getIndex(r3)] += td[tm.getIndex(r3)];
v[getIndex(r4)] += td[tm.getIndex(r4)];
v[getIndex(r5)] += td[tm.getIndex(r5)];
v[getIndex(r6)] += td[tm.getIndex(r6)];
v[getIndex(r7)] += td[tm.getIndex(r7)];
v[getIndex(r8)] += td[tm.getIndex(r8)];
}

}
Expand Up @@ -53,6 +53,7 @@ public MapToCharPByte(int unique, char[] data_c, byte[] data_b) {
super(unique);
_data_c = data_c;
_data_b = data_b;
verify();
}

@Override
Expand Down Expand Up @@ -278,4 +279,26 @@ public boolean equals(AMapToData e) {
Arrays.equals(((MapToCharPByte) e)._data_b, _data_b) && //
Arrays.equals(((MapToCharPByte) e)._data_c, _data_c);
}

/**
 * Pre-aggregates the values of mV for the indexes cl (inclusive) to cu (exclusive) into preAV.
 *
 * The (cu - cl) % 8 leading indexes are handled one at a time, after that the rest is forwarded to
 * preAggregateDenseToRowVec8 in groups of eight. The value for index rc is read at mV[off + rc].
 *
 * @param mV    The values to aggregate
 * @param preAV The output array that is accumulated into
 * @param cl    The first index to process
 * @param cu    The index to stop at (not inclusive)
 * @param off   The offset added to the index when reading from mV
 */
@Override
protected void preAggregateDenseToRowBy8(double[] mV, double[] preAV, int cl, int cu, int off) {
	// End of the leading part that does not fill a complete group of eight.
	final int singlesEnd = cl + (cu - cl) % 8;
	int pos = off + cl;
	int rc = cl;
	while(rc < singlesEnd) {
		preAV[getIndex(rc)] += mV[pos];
		rc++;
		pos++;
	}
	// Everything left is a multiple of eight and goes through the unrolled helper.
	while(rc < cu) {
		preAggregateDenseToRowVec8(mV, preAV, rc, pos);
		rc += 8;
		pos += 8;
	}
}

/**
 * Adds eight consecutive values of mV into preAV, each at the position this map returns for the matching index.
 *
 * For k in 0..7 the value mV[off + k] is added to preAV[getIndex(rc + k)]. The eight updates are written out one by
 * one instead of in a loop, and this method performs no range checks of its own.
 *
 * NOTE(review): presumably this override exists so the JIT can bind getIndex to this class' implementation - confirm
 * before replacing it with the inherited method.
 *
 * @param mV    The values to aggregate
 * @param preAV The output array that is accumulated into
 * @param rc    The first index to look up in this map
 * @param off   The first index to read in mV
 */
@Override
protected void preAggregateDenseToRowVec8(double[] mV, double[] preAV, int rc, int off){
preAV[getIndex(rc)] += mV[off];
preAV[getIndex(rc + 1)] += mV[off + 1];
preAV[getIndex(rc + 2)] += mV[off + 2];
preAV[getIndex(rc + 3)] += mV[off + 3];
preAV[getIndex(rc + 4)] += mV[off + 4];
preAV[getIndex(rc + 5)] += mV[off + 5];
preAV[getIndex(rc + 6)] += mV[off + 6];
preAV[getIndex(rc + 7)] += mV[off + 7];
}
}
Expand Up @@ -48,6 +48,7 @@ public MapToInt(int unique, int size) {
/**
 * Creates a map around an already allocated int array.
 *
 * The array is stored by reference (it is not copied) and verify() is called once the field is assigned.
 *
 * NOTE(review): verify() is invoked from inside the constructor; if a subclass overrides it, the override runs before
 * the subclass' own fields are initialized - confirm this is safe.
 *
 * @param unique The value forwarded to the super constructor
 * @param data   The backing array of the map
 */
private MapToInt(int unique, int[] data) {
super(unique);
_data = data;
verify();
}

protected int[] getData() {
Expand Down Expand Up @@ -130,19 +131,24 @@ protected void preAggregateDenseToRowBy8(double[] mV, double[] preAV, int cl, in
final int h = (cu - cl) % 8;
off += cl;
for(int rc = cl; rc < cl + h; rc++, off++)
preAV[_data[rc]] += mV[off];
for(int rc = cl + h; rc < cu; rc += 8, off += 8) {
preAV[_data[rc]] += mV[off];
preAV[_data[rc + 1]] += mV[off + 1];
preAV[_data[rc + 2]] += mV[off + 2];
preAV[_data[rc + 3]] += mV[off + 3];
preAV[_data[rc + 4]] += mV[off + 4];
preAV[_data[rc + 5]] += mV[off + 5];
preAV[_data[rc + 6]] += mV[off + 6];
preAV[_data[rc + 7]] += mV[off + 7];
}
preAV[getIndex(rc)] += mV[off];
for(int rc = cl + h; rc < cu; rc += 8, off += 8)
preAggregateDenseToRowVec8(mV, preAV, rc, off);
}

/**
 * Adds eight consecutive values of mV into preAV, each at the position this map returns for the matching index.
 *
 * For k in 0..7 the value mV[off + k] is added to preAV[getIndex(rc + k)]. The eight updates are written out one by
 * one instead of in a loop, and this method performs no range checks of its own.
 *
 * NOTE(review): presumably this override exists so the JIT can bind getIndex to this class' implementation - confirm
 * before replacing it with the inherited method.
 *
 * @param mV    The values to aggregate
 * @param preAV The output array that is accumulated into
 * @param rc    The first index to look up in this map
 * @param off   The first index to read in mV
 */
@Override
protected void preAggregateDenseToRowVec8(double[] mV, double[] preAV, int rc, int off){
preAV[getIndex(rc)] += mV[off];
preAV[getIndex(rc + 1)] += mV[off + 1];
preAV[getIndex(rc + 2)] += mV[off + 2];
preAV[getIndex(rc + 3)] += mV[off + 3];
preAV[getIndex(rc + 4)] += mV[off + 4];
preAV[getIndex(rc + 5)] += mV[off + 5];
preAV[getIndex(rc + 6)] += mV[off + 6];
preAV[getIndex(rc + 7)] += mV[off + 7];
}


@Override
protected void preAggregateDenseMultiRowContiguousBy8(double[] mV, int nCol, int nVal, double[] preAV, int rl,
int ru, int cl, int cu) {
Expand Down
Expand Up @@ -95,17 +95,21 @@ protected void preAggregateDenseToRowBy8(double[] mV, double[] preAV, int cl, in
final int h = (cu - cl) % 8;
off += cl;
for(int rc = cl; rc < cl + h; rc++, off++)
preAV[_data[rc]] += mV[off];
for(int rc = cl + h; rc < cu; rc += 8, off += 8) {
preAV[_data[rc]] += mV[off];
preAV[_data[rc + 1]] += mV[off + 1];
preAV[_data[rc + 2]] += mV[off + 2];
preAV[_data[rc + 3]] += mV[off + 3];
preAV[_data[rc + 4]] += mV[off + 4];
preAV[_data[rc + 5]] += mV[off + 5];
preAV[_data[rc + 6]] += mV[off + 6];
preAV[_data[rc + 7]] += mV[off + 7];
}
preAV[getIndex(rc)] += mV[off];
for(int rc = cl + h; rc < cu; rc += 8, off += 8)
preAggregateDenseToRowVec8(mV, preAV, rc, off);
}

/**
 * Adds eight consecutive values of mV into preAV, each at the position this map returns for the matching index.
 *
 * For k in 0..7 the value mV[off + k] is added to preAV[getIndex(rc + k)]. The eight updates are written out one by
 * one instead of in a loop, and this method performs no range checks of its own.
 *
 * NOTE(review): presumably this override exists so the JIT can bind getIndex to this class' implementation - confirm
 * before replacing it with the inherited method.
 *
 * @param mV    The values to aggregate
 * @param preAV The output array that is accumulated into
 * @param rc    The first index to look up in this map
 * @param off   The first index to read in mV
 */
@Override
protected void preAggregateDenseToRowVec8(double[] mV, double[] preAV, int rc, int off) {
preAV[getIndex(rc)] += mV[off];
preAV[getIndex(rc + 1)] += mV[off + 1];
preAV[getIndex(rc + 2)] += mV[off + 2];
preAV[getIndex(rc + 3)] += mV[off + 3];
preAV[getIndex(rc + 4)] += mV[off + 4];
preAV[getIndex(rc + 5)] += mV[off + 5];
preAV[getIndex(rc + 6)] += mV[off + 6];
preAV[getIndex(rc + 7)] += mV[off + 7];
}

@Override
Expand All @@ -121,7 +125,7 @@ public int[] getCounts(int[] ret) {
}

@Override
public int getMaxPossible(){
public int getMaxPossible() {
return 128;
}

Expand Down

0 comments on commit a826c10

Please sign in to comment.