Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion java/bench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ To fetch the source data:

```% ./fetch-data.sh```

> :warning: Script will fetch 4GB of data

To generate the derived data:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,8 +221,8 @@ public static BatchReader createReader(Path root,
long salesRecords) throws IOException {
switch (dataName) {
case "taxi":
return new RecursiveReader(new Path(root, "sources/" + dataName), "csv",
schema, conf, CompressionKind.ZLIB);
return new RecursiveReader(new Path(root, "sources/" + dataName), "parquet",
schema, conf, CompressionKind.NONE);
case "sales":
return new SalesGenerator(salesRecords);
case "github":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
import org.apache.orc.bench.core.convert.BatchReader;

import java.io.IOException;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.util.List;

Expand Down Expand Up @@ -191,16 +190,18 @@ public void convert(ColumnVector cv, int row, Object value) {

/**
 * Converts a decimal column that the Avro/parquet reader delivers as a
 * {@code double} into a {@link DecimalColumnVector} entry.
 *
 * The value is shifted by 10^scale and rounded to obtain the unscaled long,
 * then rebuilt as a HiveDecimal at the column's declared scale.
 */
private static class DecimalConverter implements AvroConverter {
  final int scale;
  // 10^scale, hoisted out of convert() so the pow() is done once per column.
  final double multiplier;

  DecimalConverter(int scale) {
    this.scale = scale;
    this.multiplier = Math.pow(10.0, this.scale);
  }

  public void convert(ColumnVector cv, int row, Object value) {
    if (value == null) {
      cv.noNulls = false;
      cv.isNull[row] = true;
    } else {
      DecimalColumnVector tc = (DecimalColumnVector) cv;
      // Math.round(...) yields the UNSCALED value (e.g. 12.34 -> 1234 for
      // scale 2); the scale must be reattached here, otherwise the stored
      // decimal is 10^scale times too large.
      tc.vector[row].set(
          HiveDecimal.create(BigInteger.valueOf(Math.round((double) value * multiplier)), scale));
    }
  }
}
Expand Down Expand Up @@ -294,11 +295,4 @@ static byte[] getBytesFromByteBuffer(ByteBuffer byteBuffer) {
byteBuffer.get(result);
return result;
}

/**
 * Builds a HiveDecimal from the two's-complement big-endian bytes remaining
 * in the buffer, interpreted at the given scale.
 */
static HiveDecimal getHiveDecimalFromByteBuffer(ByteBuffer byteBuffer,
                                                int scale) {
  BigInteger unscaled = new BigInteger(getBytesFromByteBuffer(byteBuffer));
  return HiveDecimal.create(unscaled, scale);
}
}
24 changes: 12 additions & 12 deletions java/bench/core/src/resources/taxi.schema
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
struct<
VendorID: bigint,
tpep_pickup_datetime: timestamp,
tpep_dropoff_datetime: timestamp,
passenger_count: bigint,
trip_distance: double,
RatecodeID: bigint,
store_and_fwd_flag: string,
PULocationID: bigint,
DOLocationID: bigint,
payment_type: bigint,
fare_amount: decimal(8,2),
extra: decimal(8,2),
mta_tax: decimal(8,2),
tip_amount: decimal(8,2),
tolls_amount: decimal(8,2),
improvement_surcharge: decimal(8,2),
total_amount: decimal(8,2),
congestion_surcharge: int,
airport_fee: int
>
4 changes: 2 additions & 2 deletions java/bench/fetch-data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Fetch the NYC yellow-taxi source data. The TLC now publishes parquet files
# directly, so download them as-is (no CSV + gzip re-compression step).
mkdir -p data/sources/taxi
(cd data/sources/taxi; wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-11.parquet )
(cd data/sources/taxi; wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-12.parquet )

# Fetch the GitHub archive events for the benchmark's two-week window.
mkdir -p data/sources/github
(cd data/sources/github; wget http://data.gharchive.org/2015-11-{01..15}-{0..23}.json.gz)
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ public void setup() throws IOException {
schema = TypeDescription.createDecimal()
.withScale(2)
.withPrecision(precision);
readCsvData(total_amount, root, "total_amount", conf);
readRawData(total_amount, root, "total_amount", conf);
batch = schema.createRowBatchV2();
}
}
Expand All @@ -180,16 +180,16 @@ public void write(OutputState state) throws Exception {
writer.close();
}

static void readCsvData(long[] data,
static void readRawData(long[] data,
Path root,
String column,
Configuration conf) throws IOException {
TypeDescription schema = Utilities.loadSchema("taxi.schema");
int row = 0;
int batchPosn = 0;
BatchReader reader =
new GenerateVariants.RecursiveReader(new Path(root, "sources/taxi"), "csv",
schema, conf, org.apache.orc.bench.core.CompressionKind.ZLIB);
new GenerateVariants.RecursiveReader(new Path(root, "sources/taxi"), "parquet",
schema, conf, org.apache.orc.bench.core.CompressionKind.NONE);
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 0;
TypeDescription columnSchema = schema.findSubtype(column);
Expand Down