Skip to content

Commit

Permalink
Merge remote-tracking branch 'ccri/table-scan-perf' into js-cpp-refac…
Browse files Browse the repository at this point in the history
…tor-merge_with-table-scan-perf
  • Loading branch information
trxcllnt committed Jan 19, 2018
2 parents f3f3b86 + e20decd commit d2b18d5
Show file tree
Hide file tree
Showing 20 changed files with 1,513 additions and 379 deletions.
1 change: 1 addition & 0 deletions js/gulp/uglify-task.js
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ const reservePublicNames = ((ESKeywords) => function reservePublicNames(target,
`../${src}/table.js`,
`../${src}/vector.js`,
`../${src}/util/int.js`,
`../${src}/recordbatch.js`,
`../${src}/${mainExport}.js`,
];
return publicModulePaths.reduce((keywords, publicModulePath) => [
Expand Down
2 changes: 1 addition & 1 deletion js/gulp/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ const ESKeywords = [
// EventTarget
`addListener`, `removeListener`, `addEventListener`, `removeEventListener`,
// Arrow properties
`low`, `high`, `data`, `index`, `field`, `validity`, `columns`, `fieldNode`, `subarray`,
`low`, `high`, `data`, `index`, `field`, `columns`, 'numCols', 'numRows', `values`, `valueOffsets`, `nullBitmap`, `subarray`
];

function taskName(target, format) {
Expand Down
14 changes: 7 additions & 7 deletions js/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"clean": "gulp clean",
"debug": "gulp debug",
"perf": "node ./perf/index.js",
"create:perfdata": "python ./test/data/tables/generate.py ./test/data/tables/tracks.arrow",
"release": "./npm-release.sh",
"clean:all": "run-p clean clean:testdata",
"clean:testdata": "gulp clean:testdata",
Expand Down Expand Up @@ -51,16 +52,15 @@
],
"dependencies": {
"@types/text-encoding-utf-8": "1.0.1",
"command-line-args": "5.0.0",
"command-line-args": "5.0.1",
"command-line-usage": "4.1.0",
"flatbuffers": "trxcllnt/flatbuffers-esm",
"json-bignum": "0.0.3",
"text-encoding-utf-8": "^1.0.2",
"ts-node": "4.1.0",
"tslib": "1.8.1"
"tslib": "1.9.0"
},
"devDependencies": {
"@std/esm": "0.19.6",
"@std/esm": "0.19.7",
"@types/flatbuffers": "1.6.5",
"@types/glob": "5.0.34",
"@types/jest": "22.0.1",
Expand All @@ -80,11 +80,11 @@
"gulp-transform-js-ast": "1.0.2",
"gulp-typescript": "3.2.3",
"ix": "2.3.4",
"jest": "22.1.2",
"jest": "22.1.3",
"jest-environment-node-debug": "2.0.0",
"json": "9.0.6",
"lerna": "2.7.1",
"lint-staged": "6.0.0",
"lint-staged": "6.0.1",
"merge2": "1.2.1",
"mkdirp": "0.5.1",
"npm-run-all": "4.1.2",
Expand Down Expand Up @@ -130,7 +130,7 @@
"lcov"
],
"coveragePathIgnorePatterns": [
"format\\/(File|Message|Schema|Tensor)_generated\\.(js|ts)$",
"fb\\/(File|Message|Schema|Tensor)_generated\\.(js|ts)$",
"test\\/.*\\.(ts|tsx|js)$",
"/node_modules/"
],
Expand Down
161 changes: 131 additions & 30 deletions js/perf/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,29 +16,40 @@
// under the License.

// Use the ES2015 CommonJS target as the perf baseline (the other targets are kept commented out below)
// const { Table, readVectors } = require('../targets/es5/umd');
// const { Table, readVectors } = require('../targets/es5/cjs');
const { Table, readVectors } = require('../targets/es2015/umd');
// const { Table, readVectors } = require('../targets/es2015/cjs');
// const { col, Table, read: readBatches } = require('../targets/es5/umd');
// const { col, Table, read: readBatches } = require('../targets/es5/cjs');
// const { col, Table, read: readBatches } = require('../targets/es2015/umd');
const { col, Table, read: readBatches } = require('../targets/es2015/cjs');

const config = require('./config');
const Benchmark = require('benchmark');

const suites = [];

for (let { name, buffers} of config) {
const parseSuite = new Benchmark.Suite(`Parse ${name}`, { async: true });
const sliceSuite = new Benchmark.Suite(`Slice ${name} vectors`, { async: true });
const iterateSuite = new Benchmark.Suite(`Iterate ${name} vectors`, { async: true });
const getByIndexSuite = new Benchmark.Suite(`Get ${name} values by index`, { async: true });
parseSuite.add(createFromTableTest(name, buffers));
parseSuite.add(createReadVectorsTest(name, buffers));
for (const vector of Table.from(buffers).columns) {
sliceSuite.add(createSliceTest(vector));
iterateSuite.add(createIterateTest(vector));
getByIndexSuite.add(createGetByIndexTest(vector));
}
suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite);
// Register the parse benchmarks and the per-column vector benchmarks for
// every table described by ./table_config.
for (const { name, buffers } of require('./table_config')) {
    const parseTitle = `Parse "${name}"`;
    suites.push(createTestSuite(parseTitle, createFromTableTest(name, buffers)));
    suites.push(createTestSuite(parseTitle, createReadBatchesTest(name, buffers)));

    const table = Table.from(buffers);

    // [suite title, test factory] pairs; suites are appended grouped by
    // benchmark kind, in exactly this order.
    const perColumnBenchmarks = [
        [`Get "${name}" values by index`, createGetByIndexTest],
        [`Iterate "${name}" vectors`, createIterateTest],
        [`Slice toArray "${name}" vectors`, createSliceToArrayTest],
        [`Slice "${name}" vectors`, createSliceTest],
    ];

    for (const [title, createTest] of perColumnBenchmarks) {
        // One suite per column, labelled with the matching schema field name.
        const columnSuites = table.columns.map((vector, i) =>
            createTestSuite(title, createTest(vector, table.schema.fields[i].name)));
        suites.push(...columnSuites);
    }
}

// Register the DataFrame benchmarks (count-by, filter-scan count, and the
// hand-written direct count) for every table described by ./table_config.
for (const { name, buffers, countBys, counts } of require('./table_config')) {
    const table = Table.from(buffers);

    const countBySuites = countBys.map((column) =>
        createTestSuite(`DataFrame Count By "${name}"`, createDataFrameCountByTest(table, column)));
    suites.push(...countBySuites);

    // `col` is renamed locally so it does not shadow the `col` imported at the top of the file.
    const filterCountSuites = counts.map(({ col: column, test, value }) =>
        createTestSuite(`DataFrame Filter-Scan Count "${name}"`, createDataFrameFilterCountTest(table, column, test, value)));
    suites.push(...filterCountSuites);

    const directCountSuites = counts.map(({ col: column, test, value }) =>
        createTestSuite(`DataFrame Direct Count "${name}"`, createDataFrameDirectCountTest(table, column, test, value)));
    suites.push(...directCountSuites);
}

console.log('Running apache-arrow performance tests...\n');
Expand All @@ -52,7 +63,7 @@ function run() {
var str = x.toString();
var meanMsPerOp = Math.round(x.stats.mean * 100000)/100;
var sliceOf60FPS = Math.round((meanMsPerOp / (1000/60)) * 100000)/1000;
return `${str} (avg: ${meanMsPerOp}ms, or ${sliceOf60FPS}% of a frame @ 60FPS) ${x.suffix || ''}`;
return `${str}\n avg: ${meanMsPerOp}ms\n ${sliceOf60FPS}% of a frame @ 60FPS ${x.suffix || ''}`;
}).join('\n') + '\n');
if (suites.length > 0) {
setTimeout(run, 1000);
Expand All @@ -61,51 +72,141 @@ function run() {
.run({ async: true });
}

/**
 * Wraps a single benchmark test in its own asynchronous Benchmark.Suite.
 *
 * @param {string} name - Title given to the new suite.
 * @param {Object} test - Test descriptor handed to the suite's `add` method.
 * @returns {*} Whatever `add` returns for the newly created suite.
 */
function createTestSuite(name, test) {
    const suiteOptions = { async: true };
    const suite = new Benchmark.Suite(name, suiteOptions);
    return suite.add(test);
}

function createFromTableTest(name, buffers) {
let table;
return {
async: true,
name: `Table.from`,
name: `Table.from\n`,
fn() { table = Table.from(buffers); }
};
}

function createReadVectorsTest(name, buffers) {
let vectors;
function createReadBatchesTest(name, buffers) {
let recordBatch;
return {
async: true,
name: `readVectors`,
fn() { for (vectors of readVectors(buffers)) {} }
name: `readBatches\n`,
fn() { for (recordBatch of readBatches(buffers)) {} }
};
}

function createSliceTest(vector) {
function createSliceTest(vector, name) {
let xs;
return {
async: true,
name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}`,
name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`,
fn() { xs = vector.slice(); }
};
}

function createIterateTest(vector) {
/**
 * Builds a benchmark test that slices a vector and converts the slice with
 * `toArray()` on every run.
 *
 * @param {Object} vector - Column to benchmark; its `length` and `type` are
 *   read once for the label, `slice()` is called on each run.
 * @param {string} name - Column name shown in the test label.
 * @returns {{async: boolean, name: string, fn: Function}} Test descriptor.
 */
function createSliceToArrayTest(vector, name) {
    // Lives outside `fn` so every run stores its result somewhere.
    let materialized;
    const label = `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`;
    const benchmarkTest = {
        async: true,
        name: label,
        fn() { materialized = vector.slice().toArray(); }
    };
    return benchmarkTest;
}

function createIterateTest(vector, name) {
let value;
return {
async: true,
name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}`,
name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`,
fn() { for (value of vector) {} }
};
}

function createGetByIndexTest(vector) {
function createGetByIndexTest(vector, name) {
let value;
return {
async: true,
name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}`,
name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`,
fn() {
for (let i = -1, n = vector.length; ++i < n;) {
value = vector.get(i);
}
}
};
}

/**
 * Builds a benchmark test that counts the rows of one column matching a
 * comparison by walking `table.batches` by hand and calling `get(index)` on
 * the column's vector in each batch.
 *
 * @param {Object} table - Table to scan; `schema.fields`, `batches`,
 *   `columns` and `numRows` are read.
 * @param {string} column - Name of the column to scan.
 * @param {string} test - Comparison to apply: 'gteq' (>=) or 'eq' (===).
 * @param {*} value - Right-hand operand of the comparison.
 * @returns {{async: boolean, name: string, fn: Function}} Test descriptor.
 * @throws {Error} If `column` is not a field of the table's schema, or if
 *   `test` is not one of the supported comparisons.
 */
function createDataFrameDirectCountTest(table, column, test, value) {
    const colidx = table.schema.fields.findIndex((c) => c.name === column);
    if (colidx === -1) {
        throw new Error(`Unrecognized column "${column}"`);
    }

    // Declared outside the benchmarked function so each run's tally is stored
    // in the enclosing scope.
    let sum;
    // Declared locally: assigning an undeclared `op` would create a global
    // (and throws a ReferenceError in strict/module code).
    let op;

    // The scan loop is spelled out once per comparison on purpose, so the
    // comparison stays inline in the measured loop instead of going through
    // a shared predicate callback.
    if (test === 'gteq') {
        op = function () {
            sum = 0;
            const batches = table.batches;
            const numBatches = batches.length;
            for (let batchIndex = -1; ++batchIndex < numBatches;) {
                // load batches
                const { numRows, columns } = batches[batchIndex];
                const vector = columns[colidx];
                // visit every row of the batch
                for (let index = -1; ++index < numRows;) {
                    // the boolean result is added as 0 or 1
                    sum += (vector.get(index) >= value);
                }
            }
        };
    } else if (test === 'eq') {
        op = function () {
            sum = 0;
            const batches = table.batches;
            const numBatches = batches.length;
            for (let batchIndex = -1; ++batchIndex < numBatches;) {
                // load batches
                const { numRows, columns } = batches[batchIndex];
                const vector = columns[colidx];
                // visit every row of the batch
                for (let index = -1; ++index < numRows;) {
                    // the boolean result is added as 0 or 1
                    sum += (vector.get(index) === value);
                }
            }
        };
    } else {
        throw new Error(`Unrecognized test "${test}"`);
    }

    return {
        async: true,
        name: `name: '${column}', length: ${table.numRows}, type: ${table.columns[colidx].type}, test: ${test}, value: ${value}\n`,
        fn: op
    };
}

/**
 * Builds a benchmark test that calls `table.countBy(column)` on every run.
 *
 * @param {Object} table - Table to benchmark; `schema.fields`, `columns`
 *   and `numRows` are read for the label.
 * @param {string} column - Name of the column passed to `countBy`.
 * @returns {{async: boolean, name: string, fn: Function}} Test descriptor.
 * @throws {Error} If `column` is not a field of the table's schema.
 */
function createDataFrameCountByTest(table, column) {
    const colidx = table.schema.fields.findIndex((c) => c.name === column);
    // Fail with a readable message instead of a TypeError on `columns[-1].type`.
    if (colidx === -1) {
        throw new Error(`Unrecognized column "${column}"`);
    }

    return {
        async: true,
        name: `name: '${column}', length: ${table.numRows}, type: ${table.columns[colidx].type}\n`,
        fn() {
            table.countBy(column);
        }
    };
}

/**
 * Builds a benchmark test that counts the rows matching a predicate through
 * `table.filter(...)`. The filtered view is created once, up front; each
 * benchmark run only calls `count()` on it.
 *
 * @param {Object} table - Table to benchmark; `schema.fields`, `columns`,
 *   `numRows` and `filter` are used.
 * @param {string} column - Name of the column the predicate applies to.
 * @param {string} test - Predicate to build: 'gteq' or 'eq'.
 * @param {*} value - Operand passed to the predicate.
 * @returns {{async: boolean, name: string, fn: Function}} Test descriptor.
 * @throws {Error} If `column` is not a field of the table's schema, or if
 *   `test` is not one of the supported predicates.
 */
function createDataFrameFilterCountTest(table, column, test, value) {
    const colidx = table.schema.fields.findIndex((c) => c.name === column);
    // Checked before the filter is built so a bad column name fails fast
    // with a readable message instead of a TypeError on `columns[-1].type`.
    if (colidx === -1) {
        throw new Error(`Unrecognized column "${column}"`);
    }

    let df;
    if (test === 'gteq') {
        df = table.filter(col(column).gteq(value));
    } else if (test === 'eq') {
        df = table.filter(col(column).eq(value));
    } else {
        throw new Error(`Unrecognized test "${test}"`);
    }

    return {
        async: true,
        name: `name: '${column}', length: ${table.numRows}, type: ${table.columns[colidx].type}, test: ${test}, value: ${value}\n`,
        fn() {
            df.count();
        }
    };
}
48 changes: 48 additions & 0 deletions js/perf/table_config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

const fs = require('fs');
const path = require('path');
const glob = require('glob');

// Builds the list of benchmark table descriptors exported by this module:
// one entry per `*.arrow` file in ../test/data/tables that has count
// scenarios registered below.
const config = [];
const filenames = glob.sync(path.resolve(__dirname, `../test/data/tables/`, `*.arrow`));

// Columns to benchmark with `countBy`, keyed by table file name (no extension).
// Declared with `const` so it does not leak onto the global object.
const countBys = {
    "tracks": ['origin', 'destination']
};

// Filter/count scenarios, keyed by table file name (no extension).
// `col` is the column name, `test` the comparison and `value` its operand.
const counts = {
    "tracks": [
        {col: 'lat', test: 'gteq', value: 0 },
        {col: 'lng', test: 'gteq', value: 0 },
        {col: 'origin', test: 'eq', value: 'Seattle'},
    ]
};

for (const filename of filenames) {
    const { name } = path.parse(filename);
    // Own-property check: `name in counts` would also accept inherited keys,
    // e.g. a file called `constructor.arrow`.
    if (Object.prototype.hasOwnProperty.call(counts, name)) {
        config.push({
            name,
            buffers: [fs.readFileSync(filename)],
            // Fall back to an empty list so consumers can always iterate it.
            countBys: countBys[name] || [],
            counts: counts[name],
        });
    }
}

module.exports = config;

0 comments on commit d2b18d5

Please sign in to comment.