Skip to content

Commit

Permalink
[CVE-2012-2098] Integrate fallback sort into the rest, add some more …
Browse files Browse the repository at this point in the history
…tests and fix bug in bucket boundary calculation

git-svn-id: https://svn.apache.org/repos/asf/commons/proper/compress/trunk@1340786 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
bodewig committed May 20, 2012
1 parent fdd7459 commit 0600296
Show file tree
Hide file tree
Showing 2 changed files with 139 additions and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
* Compress" you'd get:</p>
*
* <pre>
* CompressCommons
* Commons Compress
* CompressCommons
* essCommons Compr
Expand All @@ -51,9 +52,9 @@
* ssCommons Compre
* </pre>
*
* <p>Which results in a new text "s romooCCmmpnse", in adition the
* <p>Which results in a new text "ss romooCCmmpnse", in adition the
* index of the first line that contained the original text is kept -
* in this case it is 0. The idea is that in a long English text all
* in this case it is 1. The idea is that in a long English text all
* permutations that start with "he" are likely suffixes of a "the" and
* thus they end in "t" leading to a larger block of "t"s that can
* better be compressed by the subsequent Move-to-Front, run-length
Expand Down Expand Up @@ -101,7 +102,8 @@ class BlockSort {
* been dropped after 0.9.0 and replaced by a fallback sorting
* algorithm.
*
* I've added the fallbackSort function of 1.0.6.
* I've added the fallbackSort function of 1.0.6 and tried to
* integrate it with the existing code without touching too much.
*/

/*
Expand Down Expand Up @@ -154,13 +156,16 @@ boolean blockSort(final BZip2CompressorOutputStream.Data data, final int last) {
this.workDone = 0;
this.blockRandomised = false;
this.firstAttempt = true;

if (last + 1 < 10000) {
fallbackSort(data, last);
} else {

mainSort(data, last);

if (this.firstAttempt && (this.workDone > this.workLimit)) {
randomiseBlock(data, last);
this.workLimit = this.workDone = 0;
this.firstAttempt = false;
mainSort(data, last);
fallbackSort(data, last);
}
}

final int[] fmap = data.fmap;
Expand All @@ -173,7 +178,27 @@ boolean blockSort(final BZip2CompressorOutputStream.Data data, final int last) {
}

// assert (data.origPtr != -1) : data.origPtr;
return blockRandomised;
return false;
}

/**
* Adapt fallbackSort to the expected interface of the rest of the
* code, in particular deal with the fact that block starts at
* offset 1 (in libbzip2 1.0.6 it starts at 0).
*/
final void fallbackSort(final BZip2CompressorOutputStream.Data data,
final int last) {
data.block[0] = data.block[last + 1];
fallbackSort(data.fmap, data.block, last + 1);
for (int i = 0; i < last + 1; i++) {
--data.fmap[i];
}
for (int i = 0; i < last + 1; i++) {
if (data.fmap[i] == -1) {
data.fmap[i] = last;
break;
}
}
}

/*---------------------------------------------*/
Expand Down Expand Up @@ -415,12 +440,13 @@ private int[] getEclass() {
}

/*
* The C code uses an array of ints to represents the bucket-start
* flags (bhtab). It also contains optimizations to skip over 32
* consecutively set or consecutively unset bits on word
* boundaries at once. For now I've chosen to use the simpler but
* potentially slower code using BitSet - also in the hope that
* using the BitSet#nextXXX methods may be fast enough.
* The C code uses an array of ints (each int holding 32 flags) to
* represents the bucket-start flags (bhtab). It also contains
* optimizations to skip over 32 consecutively set or
* consecutively unset bits on word boundaries at once. For now
* I've chosen to use the simpler but potentially slower code
* using BitSet - also in the hope that using the BitSet#nextXXX
* methods may be fast enough.
*/

/**
Expand All @@ -432,16 +458,22 @@ private int[] getEclass() {
* @param off offset of first byte to sort in block
*/
final void fallbackSort(int[] fmap, byte[] block, int nblock) {
int[] ftab = new int[257];
final int[] ftab = new int[257];
int H, i, j, k, l, r, cc, cc1;
int nNotDone;
int nBhtab;
final int[] eclass = getEclass();

for (i = 0; i < nblock; i++) {
eclass[i] = 0;
}
/*--
LBZ2: Initial 1-char radix sort to generate
initial fmap and initial BH bits.
--*/
for (i = 0; i < nblock; i++) ftab[block[i] & 0xff]++;
for (i = 0; i < nblock; i++) {
ftab[block[i] & 0xff]++;
}
for (i = 1; i < 257; i++) ftab[i] += ftab[i - 1];

for (i = 0; i < nblock; i++) {
Expand All @@ -467,8 +499,6 @@ final void fallbackSort(int[] fmap, byte[] block, int nblock) {
bhtab.clear(nblock + 2 * i + 1);
}

eclass = getEclass();

/*-- LBZ2: the log(N) loop --*/
H = 1;
while (true) {
Expand All @@ -491,10 +521,10 @@ final void fallbackSort(int[] fmap, byte[] block, int nblock) {

/*-- LBZ2: find the next non-singleton bucket --*/
k = r + 1;
k = bhtab.nextSetBit(k);
k = bhtab.nextClearBit(k);
l = k - 1;
if (l >= nblock) break;
k = bhtab.nextClearBit(k);
k = bhtab.nextSetBit(k + 1);
r = k - 1;
if (r >= nblock) break;

Expand Down Expand Up @@ -862,8 +892,8 @@ private void mainQSort3(final BZip2CompressorOutputStream.Data dataShadow,
private static final int SETMASK = (1 << 21);
private static final int CLEARMASK = (~SETMASK);

private void mainSort(final BZip2CompressorOutputStream.Data dataShadow,
final int lastShadow) {
final void mainSort(final BZip2CompressorOutputStream.Data dataShadow,
final int lastShadow) {
final int[] runningOrder = this.mainSort_runningOrder;
final int[] copy = this.mainSort_copy;
final boolean[] bigDone = this.mainSort_bigDone;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,17 +70,56 @@ public class BlockSortTest {
0, 1, 7, 6, 8, 2, 3, 5, 4
};

private static final byte[] FIXTURE2 = {
'C', 'o', 'm', 'm', 'o', 'n', 's', ' ', 'C', 'o', 'm', 'p', 'r', 'e', 's', 's',
};

private static final byte[] FIXTURE2_BWT = {
's', 's', ' ', 'r', 'o', 'm', 'o', 'o', 'C', 'C', 'm', 'm', 'p', 'n', 's', 'e',
};

@Test
public void testSortFixture() {
BZip2CompressorOutputStream.Data data = new BZip2CompressorOutputStream.Data(1);
System.arraycopy(FIXTURE, 0, data.block, 1, FIXTURE.length);
BlockSort s = new BlockSort(data);
assertFalse(s.blockSort(data, FIXTURE.length - 1));
assertEquals(FIXTURE[FIXTURE.length - 1], data.block[0]);
for (int i = 0; i < FIXTURE.length; i++) {
assertEquals(FIXTURE_BWT[i], data.block[data.fmap[i]]);
}
assertEquals(0, data.origPtr);
DS ds = setUpFixture();
assertFalse(ds.s.blockSort(ds.data, FIXTURE.length - 1));
assertFixtureSorted(ds.data);
assertEquals(0, ds.data.origPtr);
}

@Test
public void testSortFixtureMainSort() {
DS ds = setUpFixture();
ds.s.mainSort(ds.data, FIXTURE.length - 1);
assertFixtureSorted(ds.data);
}

@Test
public void testSortFixtureFallbackSort() {
DS ds = setUpFixture();
ds.s.fallbackSort(ds.data, FIXTURE.length - 1);
assertFixtureSorted(ds.data);
}

@Test
public void testSortFixture2() {
DS ds = setUpFixture2();
assertFalse(ds.s.blockSort(ds.data, FIXTURE2.length - 1));
assertFixture2Sorted(ds.data);
assertEquals(1, ds.data.origPtr);
}

@Test
public void testSortFixture2MainSort() {
DS ds = setUpFixture2();
ds.s.mainSort(ds.data, FIXTURE2.length - 1);
assertFixture2Sorted(ds.data);
}

@Test
public void testSortFixture2FallbackSort() {
DS ds = setUpFixture2();
ds.s.fallbackSort(ds.data, FIXTURE2.length - 1);
assertFixture2Sorted(ds.data);
}

@Test
Expand All @@ -91,4 +130,43 @@ public void testFallbackSort() {
s.fallbackSort(fmap, FIXTURE, FIXTURE.length);
assertArrayEquals(FIXTURE_SORTED, fmap);
}

private DS setUpFixture() {
return setUpFixture(FIXTURE);
}

private void assertFixtureSorted(BZip2CompressorOutputStream.Data data) {
assertFixtureSorted(data, FIXTURE, FIXTURE_BWT);
}

private DS setUpFixture2() {
return setUpFixture(FIXTURE2);
}

private void assertFixture2Sorted(BZip2CompressorOutputStream.Data data) {
assertFixtureSorted(data, FIXTURE2, FIXTURE2_BWT);
}

private DS setUpFixture(byte[] fixture) {
BZip2CompressorOutputStream.Data data = new BZip2CompressorOutputStream.Data(1);
System.arraycopy(fixture, 0, data.block, 1, fixture.length);
return new DS(data, new BlockSort(data));
}

private void assertFixtureSorted(BZip2CompressorOutputStream.Data data,
byte[] fixture, byte[] fixtureBwt) {
assertEquals(fixture[fixture.length - 1], data.block[0]);
for (int i = 0; i < fixture.length; i++) {
assertEquals(fixtureBwt[i], data.block[data.fmap[i]]);
}
}

private static class DS {
private final BZip2CompressorOutputStream.Data data;
private final BlockSort s;
DS(BZip2CompressorOutputStream.Data data, BlockSort s) {
this.data = data;
this.s = s;
}
}
}

0 comments on commit 0600296

Please sign in to comment.