Skip to content

Commit

Permalink
Add perf tests for table scans
Browse files Browse the repository at this point in the history
  • Loading branch information
Brian Hulette committed Jan 12, 2018
1 parent 99e58da commit a1edac2
Show file tree
Hide file tree
Showing 3 changed files with 150 additions and 0 deletions.
36 changes: 36 additions & 0 deletions js/generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import pyarrow as pa
import random
import numpy as np
import pandas as pd


cities = [u'Charlottesville', u'New York', u'San Francisco', u'Seattle', u'Terre Haute', u'Washington, DC']

def generate_batch(batch_len):
    """Build one RecordBatch of `batch_len` randomly generated rows.

    Columns, in order:
      * 'lat'         -- float32 samples from np.random.uniform(-90, 90)
      * 'lng'         -- float32 samples from np.random.uniform(-180, 180)
      * 'origin'      -- categorical picks from the module-level `cities`
      * 'destination' -- categorical picks from the module-level `cities`
    """
    def random_coordinates(bound):
        # Uniform samples in the symmetric range around zero, stored as float32.
        samples = np.random.uniform(-bound, bound, batch_len)
        return pa.Array.from_pandas(pd.Series(samples, dtype="float32"))

    def random_cities():
        # One random city per row, encoded against the full list of categories.
        picks = (random.choice(cities) for _ in range(batch_len))
        return pa.Array.from_pandas(pd.Categorical(picks, cities))

    column_names = ['lat', 'lng', 'origin', 'destination']
    columns = [
        random_coordinates(90),
        random_coordinates(180),
        random_cities(),
        random_cities(),
    ]
    return pa.RecordBatch.from_arrays(columns, column_names)

def write_record_batches(fd, batch_len, num_batches):
    """Write `num_batches` randomly generated record batches to `fd`.

    Args:
        fd: writable sink handed to `pa.ipc.RecordBatchStreamWriter`.
        batch_len: number of rows in each generated batch.
        num_batches: how many batches to write.
    """
    # A throwaway one-row batch is generated only to obtain the schema.
    writer = pa.ipc.RecordBatchStreamWriter(fd, generate_batch(1).schema)
    try:
        for _ in range(num_batches):
            writer.write_batch(generate_batch(batch_len))
    finally:
        # Close the writer even if generating or writing a batch raises.
        writer.close()

# Command-line entry point: write randomly generated record batches to a file.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='path of the output file to write')
    parser.add_argument('-n', '--num-batches', help='number of batches', type=int, default=10)
    parser.add_argument('-b', '--batch-size', help='size of each batch', type=int, default=100000)

    args = parser.parse_args()

    # The parenthesized single-argument form is valid both as a Python 2
    # print statement and as a Python 3 print() call.
    print("Writing {} {}-element batches to '{}'".format(args.num_batches, args.batch_size, args.filename))
    # The Arrow stream is binary data, so the file is opened in binary mode.
    with open(args.filename, 'wb') as fd:
        write_record_batches(fd, args.batch_size, args.num_batches)
78 changes: 78 additions & 0 deletions js/perf/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,21 @@ for (let { name, buffers} of config) {
suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite);
}

// Register the table-scan suites for every data set listed in ./table_config:
// one full-iteration suite plus table-level and column-level "count by" suites.
for (const { name, buffers, tests } of require('./table_config')) {
    const tableIterateSuite = new Benchmark.Suite(`Table Iterate ${name}`, { async: true });
    const tableCountBySuite = new Benchmark.Suite(`Table Count By ${name}`, { async: true });
    const vectorCountBySuite = new Benchmark.Suite(`Vector Count By ${name}`, { async: true });
    const table = Table.from(buffers);

    tableIterateSuite.add(createTableIterateTest(table));
    // The loop binding is declared with `const` so it does not become an implicit global.
    for (const { col, test, value } of tests) {
        // The same predicate is benchmarked through the table rows and through the raw column.
        tableCountBySuite.add(createTableCountByTest(table, col, test, value));
        vectorCountBySuite.add(createVectorCountByTest(table.columns[col], test, value));
    }

    suites.push(tableIterateSuite, tableCountBySuite, vectorCountBySuite);
}

console.log('Running apache-arrow performance tests...\n');

run();
Expand Down Expand Up @@ -109,3 +124,66 @@ function createGetByIndexTest(vector) {
}
};
}

/**
 * Build a benchmark definition that scans one column and counts the cells
 * satisfying a comparison against `value`.
 *
 * @param {Iterable} vector - column to scan; its `name`, `length` and `type`
 *     properties are used to label the benchmark.
 * @param {string} test - comparison to apply: 'gteq' (cell >= value) or
 *     'eq' (cell == value).
 * @param {*} value - right-hand operand of the comparison.
 * @returns {{async: boolean, name: string, fn: Function}} benchmark
 *     definition; `fn` performs the scan and returns the number of matches.
 * @throws {Error} if `test` is not one of the recognized comparisons.
 */
function createVectorCountByTest(vector, test, value) {
    let op;
    if (test === 'gteq') {
        op = function () {
            // Locals are declared explicitly so the scan does not write to implicit globals.
            let sum = 0;
            for (const cell of vector) {
                // The boolean result is coerced to 0 or 1 when added.
                sum += (cell >= value);
            }
            return sum;
        };
    } else if (test === 'eq') {
        op = function () {
            let sum = 0;
            for (const cell of vector) {
                // Loose equality is kept deliberately so cell/value coercion is unchanged.
                sum += (cell == value);
            }
            return sum;
        };
    } else {
        throw new Error(`Unrecognized test "${test}"`);
    }

    return {
        async: true,
        name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}, test: ${test}, value: ${value}`,
        fn: op
    };
}

/**
 * Build a benchmark definition that walks every row of `table` and does
 * nothing with the rows.
 *
 * @param {Iterable} table - table to iterate; its `length` labels the benchmark.
 * @returns {{async: boolean, name: string, fn: Function}} benchmark definition.
 */
function createTableIterateTest(table) {
    const name = `length: ${table.length}`;
    const fn = function () {
        for (const row of table) {
            // Intentionally empty: the loop body does no per-row work.
        }
    };
    return { async: true, name, fn };
}

/**
 * Build a benchmark definition that iterates the rows of `table` and counts
 * the rows whose `column` cell satisfies a comparison against `value`.
 *
 * @param {Iterable} table - table to scan; each row is read with `row.get(column)`.
 *     `table.length` and `table.columns[column]` label the benchmark.
 * @param {number} column - index passed to `row.get` and into `table.columns`.
 * @param {string} test - comparison to apply: 'gteq' (cell >= value) or
 *     'eq' (cell == value).
 * @param {*} value - right-hand operand of the comparison.
 * @returns {{async: boolean, name: string, fn: Function}} benchmark
 *     definition; `fn` performs the scan and returns the number of matches.
 * @throws {Error} if `test` is not one of the recognized comparisons.
 */
function createTableCountByTest(table, column, test, value) {
    let op;
    if (test === 'gteq') {
        op = function () {
            // Locals are declared explicitly so the scan does not write to implicit globals.
            let sum = 0;
            for (const row of table) {
                // The boolean result is coerced to 0 or 1 when added.
                sum += (row.get(column) >= value);
            }
            return sum;
        };
    } else if (test === 'eq') {
        op = function () {
            let sum = 0;
            for (const row of table) {
                // Loose equality is kept deliberately so cell/value coercion is unchanged.
                sum += (row.get(column) == value);
            }
            return sum;
        };
    } else {
        throw new Error(`Unrecognized test "${test}"`);
    }

    return {
        async: true,
        name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`,
        fn: op
    };
}
36 changes: 36 additions & 0 deletions js/perf/table_config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

const fs = require('fs');
const path = require('path');
const glob = require('glob');

// One config entry is produced per `.arrow` file found in ../test/data/tables/.
const config = [];
const filenames = glob.sync(path.resolve(__dirname, `../test/data/tables/`, `*.arrow`));

// Scans attached to every entry: `col` is a column index, `test` names the
// comparison and `value` is its operand.
// Declared with `const` so the array does not become an implicit global.
const tests = [
    {col: 0, test: 'gteq', value: 0 },
    {col: 1, test: 'gteq', value: 0 },
    {col: 2, test: 'eq', value: 'Seattle'},
];

for (const filename of filenames) {
    // The entry is named after the file's base name without its extension.
    const { name } = path.parse(filename);
    config.push({ name, buffers: [fs.readFileSync(filename)], tests });
}

module.exports = config;

0 comments on commit a1edac2

Please sign in to comment.