Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion java/bench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ To fetch the source data:

```% ./fetch-data.sh```

> :warning: Script will fetch 4GB of data

To generate the derived data:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,8 +221,8 @@ public static BatchReader createReader(Path root,
long salesRecords) throws IOException {
switch (dataName) {
case "taxi":
return new RecursiveReader(new Path(root, "sources/" + dataName), "csv",
schema, conf, CompressionKind.ZLIB);
return new RecursiveReader(new Path(root, "sources/" + dataName), "parquet",
schema, conf, CompressionKind.NONE);
case "sales":
return new SalesGenerator(salesRecords);
case "github":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
import org.apache.orc.bench.core.convert.BatchReader;

import java.io.IOException;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.util.List;

Expand Down Expand Up @@ -191,16 +190,18 @@ public void convert(ColumnVector cv, int row, Object value) {

/**
 * Converts a decimal column that the Avro/parquet reader delivers as a
 * {@code double} into a {@link DecimalColumnVector} entry.
 *
 * The value is shifted by 10^scale and rounded to obtain the unscaled long,
 * then rebuilt as a HiveDecimal at the column's declared scale.
 */
private static class DecimalConverter implements AvroConverter {
  final int scale;
  // 10^scale, hoisted out of convert() so the pow() is done once per column.
  final double multiplier;

  DecimalConverter(int scale) {
    this.scale = scale;
    this.multiplier = Math.pow(10.0, this.scale);
  }

  public void convert(ColumnVector cv, int row, Object value) {
    if (value == null) {
      cv.noNulls = false;
      cv.isNull[row] = true;
    } else {
      DecimalColumnVector tc = (DecimalColumnVector) cv;
      // Math.round(...) yields the UNSCALED value (e.g. 12.34 -> 1234 for
      // scale 2); the scale must be reattached here, otherwise the stored
      // decimal is 10^scale times too large.
      tc.vector[row].set(
          HiveDecimal.create(BigInteger.valueOf(Math.round((double) value * multiplier)), scale));
    }
  }
}
Expand Down Expand Up @@ -294,11 +295,4 @@ static byte[] getBytesFromByteBuffer(ByteBuffer byteBuffer) {
byteBuffer.get(result);
return result;
}

/**
 * Builds a HiveDecimal from the two's-complement big-endian bytes remaining
 * in the buffer, interpreted at the given scale.
 */
static HiveDecimal getHiveDecimalFromByteBuffer(ByteBuffer byteBuffer,
                                                int scale) {
  BigInteger unscaled = new BigInteger(getBytesFromByteBuffer(byteBuffer));
  return HiveDecimal.create(unscaled, scale);
}
}
24 changes: 12 additions & 12 deletions java/bench/core/src/resources/taxi.schema
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
struct<
VendorID: bigint,
tpep_pickup_datetime: timestamp,
tpep_dropoff_datetime: timestamp,
passenger_count: bigint,
trip_distance: double,
RatecodeID: bigint,
store_and_fwd_flag: string,
PULocationID: bigint,
DOLocationID: bigint,
payment_type: bigint,
fare_amount: decimal(8,2),
extra: decimal(8,2),
mta_tax: decimal(8,2),
tip_amount: decimal(8,2),
tolls_amount: decimal(8,2),
improvement_surcharge: decimal(8,2),
total_amount: decimal(8,2),
congestion_surcharge: int,
airport_fee: int
>
4 changes: 2 additions & 2 deletions java/bench/fetch-data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Fetch the NYC yellow-taxi source data. The TLC now publishes parquet files
# directly, so download them as-is (no CSV + gzip re-compression step).
mkdir -p data/sources/taxi
(cd data/sources/taxi; wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-11.parquet )
(cd data/sources/taxi; wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-12.parquet )

# Fetch the GitHub archive events for the benchmark's two-week window.
mkdir -p data/sources/github
(cd data/sources/github; wget http://data.gharchive.org/2015-11-{01..15}-{0..23}.json.gz)
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ public void setup() throws IOException {
schema = TypeDescription.createDecimal()
.withScale(2)
.withPrecision(precision);
readCsvData(total_amount, root, "total_amount", conf);
readRawData(total_amount, root, "total_amount", conf);
batch = schema.createRowBatchV2();
}
}
Expand All @@ -180,16 +180,16 @@ public void write(OutputState state) throws Exception {
writer.close();
}

static void readCsvData(long[] data,
static void readRawData(long[] data,
Path root,
String column,
Configuration conf) throws IOException {
TypeDescription schema = Utilities.loadSchema("taxi.schema");
int row = 0;
int batchPosn = 0;
BatchReader reader =
new GenerateVariants.RecursiveReader(new Path(root, "sources/taxi"), "csv",
schema, conf, org.apache.orc.bench.core.CompressionKind.ZLIB);
new GenerateVariants.RecursiveReader(new Path(root, "sources/taxi"), "parquet",
schema, conf, org.apache.orc.bench.core.CompressionKind.NONE);
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 0;
TypeDescription columnSchema = schema.findSubtype(column);
Expand Down