From a826c10a5149f139918395151ce6d573a97dd663 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Mon, 30 Oct 2023 13:24:32 +0100 Subject: [PATCH] [MINOR] JIT optimize LMM Pre-aggregate Because of abstract classes the efficiency of the JIT compiler is subpar in the AMapToData instance. To improve this I have added individual overridden methods in some of the Map types. This duplicates code, but improves performance by 30-50% according to the profiler. --- .../compress/colgroup/mapping/AMapToData.java | 85 ++++++++++++------- .../compress/colgroup/mapping/MapToByte.java | 27 +++--- .../compress/colgroup/mapping/MapToChar.java | 52 +++++++++--- .../colgroup/mapping/MapToCharPByte.java | 23 +++++ .../compress/colgroup/mapping/MapToInt.java | 28 +++--- .../compress/colgroup/mapping/MapToUByte.java | 28 +++--- 6 files changed, 167 insertions(+), 76 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java index b12461bf7c4..b66c7ddb877 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java @@ -129,8 +129,8 @@ public void set(int n, Integer v) { * * @param n index to set. * @param v the value to set it to.
- * @return v as encoded, note this value can be different that the one put in if the map is not able to represent - * the value + * @return v as encoded, note this value can be different that the one put in if the map is not able to represent the + * value */ public abstract int setAndGet(int n, int v); @@ -235,16 +235,19 @@ protected void preAggregateDenseToRowBy8(double[] mV, double[] preAV, int cl, in off += cl; for(int rc = cl; rc < cl + h; rc++, off++) preAV[getIndex(rc)] += mV[off]; - for(int rc = cl + h; rc < cu; rc += 8, off += 8) { - preAV[getIndex(rc)] += mV[off]; - preAV[getIndex(rc + 1)] += mV[off + 1]; - preAV[getIndex(rc + 2)] += mV[off + 2]; - preAV[getIndex(rc + 3)] += mV[off + 3]; - preAV[getIndex(rc + 4)] += mV[off + 4]; - preAV[getIndex(rc + 5)] += mV[off + 5]; - preAV[getIndex(rc + 6)] += mV[off + 6]; - preAV[getIndex(rc + 7)] += mV[off + 7]; - } + for(int rc = cl + h; rc < cu; rc += 8, off += 8) + preAggregateDenseToRowVec8(mV, preAV, rc, off); + } + + protected void preAggregateDenseToRowVec8(double[] mV, double[] preAV, int rc, int off){ + preAV[getIndex(rc)] += mV[off]; + preAV[getIndex(rc + 1)] += mV[off + 1]; + preAV[getIndex(rc + 2)] += mV[off + 2]; + preAV[getIndex(rc + 3)] += mV[off + 3]; + preAV[getIndex(rc + 4)] += mV[off + 4]; + preAV[getIndex(rc + 5)] += mV[off + 5]; + preAV[getIndex(rc + 6)] += mV[off + 6]; + preAV[getIndex(rc + 7)] += mV[off + 7]; } /** @@ -329,8 +332,7 @@ protected void preAggregateDenseMultiRowContiguousBy1(double[] mV, int nCol, int * @param cu The column in m to end at (not inclusive) * @param indexes The Offset Indexes to iterate through */ - public final void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, - AOffset indexes) { + public final void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, AOffset indexes) { indexes.preAggregateDenseMap(m, preAV, rl, ru, cl, cu, getUnique(), this); } @@ -417,6 +419,8 @@ public final int[] getCounts() { * 
@param nCol The number of columns */ public final void preAggregateDDC_DDC(AMapToData tm, IDictionary td, Dictionary ret, int nCol) { + if(td.getNumberOfValues(nCol) != tm.nUnique) + throw new DMLCompressionException("Invalid map and dict combination"); if(nCol == 1) preAggregateDDC_DDCSingleCol(tm, td.getValues(), ret.getValues()); else @@ -431,31 +435,55 @@ public final void preAggregateDDC_DDC(AMapToData tm, IDictionary td, Dictionary * @param ret The output dictionary to aggregate into */ protected void preAggregateDDC_DDCSingleCol(AMapToData tm, double[] td, double[] v) { + final int sz = size(); - for(int r = 0; r < sz; r++) + final int h = sz % 8; + for(int r = 0; r < h; r++) v[getIndex(r)] += td[tm.getIndex(r)]; + for(int r = h; r < sz; r += 8) + preAggregateDDC_DDCSingleCol_vec(tm, td, v, r); + + } + + protected void preAggregateDDC_DDCSingleCol_vec(AMapToData tm, double[] td, double[] v, int r) { + final int r2 = r + 1, r3 = r + 2, r4 = r + 3, r5 = r + 4, r6 = r + 5, r7 = r + 6, r8 = r + 7; + v[getIndex(r)] += td[tm.getIndex(r)]; + v[getIndex(r2)] += td[tm.getIndex(r2)]; + v[getIndex(r3)] += td[tm.getIndex(r3)]; + v[getIndex(r4)] += td[tm.getIndex(r4)]; + v[getIndex(r5)] += td[tm.getIndex(r5)]; + v[getIndex(r6)] += td[tm.getIndex(r6)]; + v[getIndex(r7)] += td[tm.getIndex(r7)]; + v[getIndex(r8)] += td[tm.getIndex(r8)]; } /** * PreAggregate into dictionary with two sides of DDC guaranteed to multiple column tuples. 
* - * @param tm Map of other side + * @param tm Map of other side that indicate the indexes to take out and put into ret * @param td Dictionary to take values from (other side dictionary) * @param ret The output dictionary to aggregate into - * @param nCol The number of columns + * @param nCol The number of columns in td */ - protected void preAggregateDDC_DDCMultiCol(AMapToData tm, IDictionary td, double[] v, int nCol) { + protected void preAggregateDDC_DDCMultiCol(final AMapToData tm, final IDictionary td, final double[] v, + final int nCol) { + final int sz = size(); final int h = sz % 8; for(int r = 0; r < h; r++) td.addToEntry(v, tm.getIndex(r), getIndex(r), nCol); + for(int r = h; r < sz; r += 8) + preAggregateDDC_DDCMultiCol_vec(tm, td, v, nCol, r); - for(int r = h; r < sz; r += 8) { - int r2 = r + 1, r3 = r + 2, r4 = r + 3, r5 = r + 4, r6 = r + 5, r7 = r + 6, r8 = r + 7; - td.addToEntryVectorized(v, tm.getIndex(r), tm.getIndex(r2), tm.getIndex(r3), tm.getIndex(r4), - tm.getIndex(r5), tm.getIndex(r6), tm.getIndex(r7), tm.getIndex(r8), getIndex(r), getIndex(r2), - getIndex(r3), getIndex(r4), getIndex(r5), getIndex(r6), getIndex(r7), getIndex(r8), nCol); - } + } + + protected void preAggregateDDC_DDCMultiCol_vec(final AMapToData tm, final IDictionary td, final double[] v, + final int nCol, final int r) { + final int r2 = r + 1, r3 = r + 2, r4 = r + 3, r5 = r + 4, r6 = r + 5, r7 = r + 6, r8 = r + 7; + td.addToEntryVectorized(v, // + tm.getIndex(r), tm.getIndex(r2), tm.getIndex(r3), tm.getIndex(r4), tm.getIndex(r5), tm.getIndex(r6), + tm.getIndex(r7), tm.getIndex(r8), getIndex(r), // + getIndex(r2), getIndex(r3), getIndex(r4), getIndex(r5), getIndex(r6), getIndex(r7), getIndex(r8), nCol); } /** @@ -577,8 +605,8 @@ private int preAggregateSDCZ_DDCMultiCol_vect(AMapToData tm, IDictionary td, dou final int h = size % 8; int i = 0; while(i < size - h) { - int t1 = getIndex(i), t2 = getIndex(i + 1), t3 = getIndex(i + 2), t4 = getIndex(i + 3), - t5 = getIndex(i + 4), 
t6 = getIndex(i + 5), t7 = getIndex(i + 6), t8 = getIndex(i + 7); + int t1 = getIndex(i), t2 = getIndex(i + 1), t3 = getIndex(i + 2), t4 = getIndex(i + 3), t5 = getIndex(i + 4), + t6 = getIndex(i + 5), t7 = getIndex(i + 6), t8 = getIndex(i + 7); int f1 = it.value(), f2 = it.next(), f3 = it.next(), f4 = it.next(), f5 = it.next(), f6 = it.next(), f7 = it.next(), f8 = it.next(); @@ -607,8 +635,7 @@ public final void preAggregateSDCZ_SDCZ(AMapToData tm, IDictionary td, AOffset t preAggregateSDCZ_SDCZMultiCol(tm, td, tof, of, ret.getValues(), nCol); } - private final void preAggregateSDCZ_SDCZSingleCol(AMapToData tm, double[] td, AOffset tof, AOffset of, - double[] dv) { + private final void preAggregateSDCZ_SDCZSingleCol(AMapToData tm, double[] td, AOffset tof, AOffset of, double[] dv) { final AOffsetIterator itThat = tof.getOffsetIterator(); final AOffsetIterator itThis = of.getOffsetIterator(); final int tSize = tm.size() - 1, size = size() - 1; @@ -872,7 +899,7 @@ public void verify() { } } } - + @Override public String toString() { final int sz = size(); diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java index 837468d3ebf..fcbc84ce984 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java @@ -48,6 +48,7 @@ public MapToByte(int unique, int size) { protected MapToByte(int unique, byte[] data) { super(unique); _data = data; + verify(); } protected MapToUByte toUByte() { @@ -155,17 +156,21 @@ protected void preAggregateDenseToRowBy8(double[] mV, double[] preAV, int cl, in final int h = (cu - cl) % 8; off += cl; for(int rc = cl; rc < cl + h; rc++, off++) - preAV[_data[rc] & 0xFF] += mV[off]; - for(int rc = cl + h; rc < cu; rc += 8, off += 8) { - preAV[_data[rc] & 0xFF] += mV[off]; - preAV[_data[rc + 1] & 0xFF] += mV[off + 1]; 
- preAV[_data[rc + 2] & 0xFF] += mV[off + 2]; - preAV[_data[rc + 3] & 0xFF] += mV[off + 3]; - preAV[_data[rc + 4] & 0xFF] += mV[off + 4]; - preAV[_data[rc + 5] & 0xFF] += mV[off + 5]; - preAV[_data[rc + 6] & 0xFF] += mV[off + 6]; - preAV[_data[rc + 7] & 0xFF] += mV[off + 7]; - } + preAV[getIndex(rc)] += mV[off]; + for(int rc = cl + h; rc < cu; rc += 8, off += 8) + preAggregateDenseToRowVec8(mV, preAV, rc, off); + } + + @Override + protected void preAggregateDenseToRowVec8(double[] mV, double[] preAV, int rc, int off) { + preAV[getIndex(rc)] += mV[off]; + preAV[getIndex(rc + 1)] += mV[off + 1]; + preAV[getIndex(rc + 2)] += mV[off + 2]; + preAV[getIndex(rc + 3)] += mV[off + 3]; + preAV[getIndex(rc + 4)] += mV[off + 4]; + preAV[getIndex(rc + 5)] += mV[off + 5]; + preAV[getIndex(rc + 6)] += mV[off + 6]; + preAV[getIndex(rc + 7)] += mV[off + 7]; } @Override diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java index bdab7891b82..1f46cc3886f 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java @@ -49,6 +49,7 @@ public MapToChar(int unique, int size) { public MapToChar(int unique, char[] data) { super(unique); _data = data; + verify(); } @Override @@ -113,8 +114,8 @@ public void write(DataOutput out) throws IOException { out.writeInt(_data.length); final int BS = 100; if(_data.length > BS) { - final byte[] buff = new byte[BS*2]; - for(int i = 0; i < _data.length; ) { + final byte[] buff = new byte[BS * 2]; + for(int i = 0; i < _data.length;) { if(i + BS <= _data.length) { for(int o = 0; o < BS; o++) { IOUtilFunctions.shortToBa(_data[i++], buff, o * 2); @@ -152,17 +153,21 @@ protected void preAggregateDenseToRowBy8(double[] mV, double[] preAV, int cl, in final int h = (cu - cl) % 8; off += cl; for(int rc = cl; rc < cl + h; rc++, 
off++) - preAV[_data[rc]] += mV[off]; - for(int rc = cl + h; rc < cu; rc += 8, off += 8) { - preAV[_data[rc]] += mV[off]; - preAV[_data[rc + 1]] += mV[off + 1]; - preAV[_data[rc + 2]] += mV[off + 2]; - preAV[_data[rc + 3]] += mV[off + 3]; - preAV[_data[rc + 4]] += mV[off + 4]; - preAV[_data[rc + 5]] += mV[off + 5]; - preAV[_data[rc + 6]] += mV[off + 6]; - preAV[_data[rc + 7]] += mV[off + 7]; - } + preAV[getIndex(rc)] += mV[off]; + for(int rc = cl + h; rc < cu; rc += 8, off += 8) + preAggregateDenseToRowVec8(mV, preAV, rc, off); + } + + @Override + protected void preAggregateDenseToRowVec8(double[] mV, double[] preAV, int rc, int off){ + preAV[getIndex(rc)] += mV[off]; + preAV[getIndex(rc + 1)] += mV[off + 1]; + preAV[getIndex(rc + 2)] += mV[off + 2]; + preAV[getIndex(rc + 3)] += mV[off + 3]; + preAV[getIndex(rc + 4)] += mV[off + 4]; + preAV[getIndex(rc + 5)] += mV[off + 5]; + preAV[getIndex(rc + 6)] += mV[off + 6]; + preAV[getIndex(rc + 7)] += mV[off + 7]; } @Override @@ -304,4 +309,25 @@ public boolean equals(AMapToData e) { e.getUnique() == getUnique() && // Arrays.equals(((MapToChar) e)._data, _data); } + + @Override + protected void preAggregateDDC_DDCSingleCol_vec(AMapToData tm, double[] td, double[] v, int r) { + if(tm instanceof MapToChar) + preAggregateDDC_DDCSingleCol_vecChar((MapToChar) tm, td, v, r); + else + super.preAggregateDDC_DDCSingleCol_vec(tm, td, v, r); + } + + protected final void preAggregateDDC_DDCSingleCol_vecChar(MapToChar tm, double[] td, double[] v, int r) { + final int r2 = r + 1, r3 = r + 2, r4 = r + 3, r5 = r + 4, r6 = r + 5, r7 = r + 6, r8 = r + 7; + v[getIndex(r)] += td[tm.getIndex(r)]; + v[getIndex(r2)] += td[tm.getIndex(r2)]; + v[getIndex(r3)] += td[tm.getIndex(r3)]; + v[getIndex(r4)] += td[tm.getIndex(r4)]; + v[getIndex(r5)] += td[tm.getIndex(r5)]; + v[getIndex(r6)] += td[tm.getIndex(r6)]; + v[getIndex(r7)] += td[tm.getIndex(r7)]; + v[getIndex(r8)] += td[tm.getIndex(r8)]; + } + } diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToCharPByte.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToCharPByte.java index cb7d6199cf2..99d53878844 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToCharPByte.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToCharPByte.java @@ -53,6 +53,7 @@ public MapToCharPByte(int unique, char[] data_c, byte[] data_b) { super(unique); _data_c = data_c; _data_b = data_b; + verify(); } @Override @@ -278,4 +279,26 @@ public boolean equals(AMapToData e) { Arrays.equals(((MapToCharPByte) e)._data_b, _data_b) && // Arrays.equals(((MapToCharPByte) e)._data_c, _data_c); } + + @Override + protected void preAggregateDenseToRowBy8(double[] mV, double[] preAV, int cl, int cu, int off) { + final int h = (cu - cl) % 8; + off += cl; + for(int rc = cl; rc < cl + h; rc++, off++) + preAV[getIndex(rc)] += mV[off]; + for(int rc = cl + h; rc < cu; rc += 8, off += 8) + preAggregateDenseToRowVec8(mV, preAV, rc, off); + } + + @Override + protected void preAggregateDenseToRowVec8(double[] mV, double[] preAV, int rc, int off){ + preAV[getIndex(rc)] += mV[off]; + preAV[getIndex(rc + 1)] += mV[off + 1]; + preAV[getIndex(rc + 2)] += mV[off + 2]; + preAV[getIndex(rc + 3)] += mV[off + 3]; + preAV[getIndex(rc + 4)] += mV[off + 4]; + preAV[getIndex(rc + 5)] += mV[off + 5]; + preAV[getIndex(rc + 6)] += mV[off + 6]; + preAV[getIndex(rc + 7)] += mV[off + 7]; + } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java index b3c509b78cf..20b2c77c7c8 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java @@ -48,6 +48,7 @@ public MapToInt(int unique, int size) { private MapToInt(int unique, int[] data) { 
super(unique); _data = data; + verify(); } protected int[] getData() { @@ -130,19 +131,24 @@ protected void preAggregateDenseToRowBy8(double[] mV, double[] preAV, int cl, in final int h = (cu - cl) % 8; off += cl; for(int rc = cl; rc < cl + h; rc++, off++) - preAV[_data[rc]] += mV[off]; - for(int rc = cl + h; rc < cu; rc += 8, off += 8) { - preAV[_data[rc]] += mV[off]; - preAV[_data[rc + 1]] += mV[off + 1]; - preAV[_data[rc + 2]] += mV[off + 2]; - preAV[_data[rc + 3]] += mV[off + 3]; - preAV[_data[rc + 4]] += mV[off + 4]; - preAV[_data[rc + 5]] += mV[off + 5]; - preAV[_data[rc + 6]] += mV[off + 6]; - preAV[_data[rc + 7]] += mV[off + 7]; - } + preAV[getIndex(rc)] += mV[off]; + for(int rc = cl + h; rc < cu; rc += 8, off += 8) + preAggregateDenseToRowVec8(mV, preAV, rc, off); } + @Override + protected void preAggregateDenseToRowVec8(double[] mV, double[] preAV, int rc, int off){ + preAV[getIndex(rc)] += mV[off]; + preAV[getIndex(rc + 1)] += mV[off + 1]; + preAV[getIndex(rc + 2)] += mV[off + 2]; + preAV[getIndex(rc + 3)] += mV[off + 3]; + preAV[getIndex(rc + 4)] += mV[off + 4]; + preAV[getIndex(rc + 5)] += mV[off + 5]; + preAV[getIndex(rc + 6)] += mV[off + 6]; + preAV[getIndex(rc + 7)] += mV[off + 7]; + } + + @Override protected void preAggregateDenseMultiRowContiguousBy8(double[] mV, int nCol, int nVal, double[] preAV, int rl, int ru, int cl, int cu) { diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToUByte.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToUByte.java index f94e95a9ed3..d545c362996 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToUByte.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToUByte.java @@ -95,17 +95,21 @@ protected void preAggregateDenseToRowBy8(double[] mV, double[] preAV, int cl, in final int h = (cu - cl) % 8; off += cl; for(int rc = cl; rc < cl + h; rc++, off++) - preAV[_data[rc]] += mV[off]; - for(int rc = cl + h; 
rc < cu; rc += 8, off += 8) { - preAV[_data[rc]] += mV[off]; - preAV[_data[rc + 1]] += mV[off + 1]; - preAV[_data[rc + 2]] += mV[off + 2]; - preAV[_data[rc + 3]] += mV[off + 3]; - preAV[_data[rc + 4]] += mV[off + 4]; - preAV[_data[rc + 5]] += mV[off + 5]; - preAV[_data[rc + 6]] += mV[off + 6]; - preAV[_data[rc + 7]] += mV[off + 7]; - } + preAV[getIndex(rc)] += mV[off]; + for(int rc = cl + h; rc < cu; rc += 8, off += 8) + preAggregateDenseToRowVec8(mV, preAV, rc, off); + } + + @Override + protected void preAggregateDenseToRowVec8(double[] mV, double[] preAV, int rc, int off) { + preAV[getIndex(rc)] += mV[off]; + preAV[getIndex(rc + 1)] += mV[off + 1]; + preAV[getIndex(rc + 2)] += mV[off + 2]; + preAV[getIndex(rc + 3)] += mV[off + 3]; + preAV[getIndex(rc + 4)] += mV[off + 4]; + preAV[getIndex(rc + 5)] += mV[off + 5]; + preAV[getIndex(rc + 6)] += mV[off + 6]; + preAV[getIndex(rc + 7)] += mV[off + 7]; } @Override @@ -121,7 +125,7 @@ public int[] getCounts(int[] ret) { } @Override - public int getMaxPossible(){ + public int getMaxPossible() { return 128; }