Skip to content

Commit

Permalink
Add concept of predicates for filtering dataframes
Browse files Browse the repository at this point in the history
  • Loading branch information
Brian Hulette committed Jan 12, 2018
1 parent 796f45d commit 4d9e8c0
Show file tree
Hide file tree
Showing 4 changed files with 192 additions and 13 deletions.
12 changes: 6 additions & 6 deletions js/perf/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@
// under the License.

// Use the ES5 UMD target as perf baseline
// const { DataFrame, Table, readVectors } = require('../targets/es5/umd');
// const { DataFrame, Table, readVectors } = require('../targets/es5/cjs');
// const { DataFrame, Table, readVectors } = require('../targets/es2015/umd');
const { DataFrame, Table, readVectors } = require('../targets/es2015/cjs');
// const { lit, col, DataFrame, Table, readVectors } = require('../targets/es5/umd');
// const { lit, col, DataFrame, Table, readVectors } = require('../targets/es5/cjs');
// const { lit, col, DataFrame, Table, readVectors } = require('../targets/es2015/umd');
const { lit, col, DataFrame, Table, readVectors } = require('../targets/es2015/cjs');

const config = require('./config');
const Benchmark = require('benchmark');
Expand Down Expand Up @@ -280,9 +280,9 @@ function createDataFrameScanCountTest(table, column, test, value) {
function createDataFrameFilterCountTest(table, column, test, value) {
let df = DataFrame.from(table);
if (test == 'gteq') {
df = df.filter((idx, cols)=>cols[column].get(idx) >= value);
df = df.filter(col(table.columns[column].name).gteq(value));
} else if (test == 'eq') {
df = df.filter((idx, cols)=>cols[column].get(idx) == value);
df = df.filter(col(table.columns[column].name).eq(value));
} else {
throw new Error(`Unrecognized test "${test}"`);
}
Expand Down
5 changes: 5 additions & 0 deletions js/src/Arrow.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ import {
} from './vector/numeric';

import { DataFrame } from './dataframe/dataframe';
import { lit, col } from './dataframe/predicate';

// closure compiler always erases static method names:
// https://github.com/google/closure-compiler/issues/1776
Expand Down Expand Up @@ -88,12 +89,16 @@ export {
};

export { DataFrame } from './dataframe/dataframe';
export { lit, col } from './dataframe/predicate';


/* These exports are needed for the closure umd targets */
try {
const Arrow = eval('exports');
if (typeof Arrow === 'object') {
// string indexers tell closure compiler not to rename these properties
Arrow['lit'] = lit;
Arrow['col'] = col;
Arrow['read'] = read;
Arrow['readAsync'] = readAsync;
Arrow['Table'] = Table;
Expand Down
17 changes: 10 additions & 7 deletions js/src/dataframe/dataframe.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@ import { Vector } from "../vector/vector";
import { StructVector } from "../vector/struct";
import { VirtualVector } from "../vector/virtual";

import { Predicate } from "./predicate"

export type NextFunc = (idx: number, cols: Vector[]) => void;
export type PredicateFunc = (idx: number, cols: Vector[]) => boolean;

export abstract class DataFrame {
constructor(readonly lengths: Uint32Array) {}
public abstract columns: Vector<any>[];
public abstract getBatch(batch: number): Vector[];
public abstract scan(next: NextFunc): void;
public filter(predicate: PredicateFunc): DataFrame {
public filter(predicate: Predicate): DataFrame {
return new FilteredDataFrame(this, predicate);
}

Expand Down Expand Up @@ -120,7 +121,7 @@ class ChunkedDataFrame extends DataFrame {

class FilteredDataFrame extends DataFrame {
public columns: Vector<any>[];
constructor (readonly parent: DataFrame, private predicate: PredicateFunc) {
constructor (readonly parent: DataFrame, private predicate: Predicate) {
super(parent.lengths);
}

Expand All @@ -138,10 +139,11 @@ class FilteredDataFrame extends DataFrame {

// load batches
const columns = this.parent.getBatch(batch);
const predicate = this.predicate.bind(columns);

// yield all indices
for (let idx = -1; ++idx < length;) {
if (this.predicate(idx, columns)) next(idx, columns);
if (predicate(idx, columns)) next(idx, columns);
}
}
}
Expand All @@ -159,19 +161,20 @@ class FilteredDataFrame extends DataFrame {

// load batches
const columns = this.parent.getBatch(batch);
const predicate = this.predicate.bind(columns);

// yield all indices
for (let idx = -1; ++idx < length;) {
if (this.predicate(idx, columns)) ++sum;
if (predicate(idx, columns)) ++sum;
}
}
return sum;
}

filter(predicate: PredicateFunc): DataFrame {
filter(predicate: Predicate): DataFrame {
return new FilteredDataFrame(
this.parent,
(idx, cols) => this.predicate(idx, cols) && predicate(idx, cols)
this.predicate.and(predicate)
);
}
}
171 changes: 171 additions & 0 deletions js/src/dataframe/predicate.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
import { Vector } from "../vector/vector";

/** Accessor signature: given an index and a set of column vectors, yields a value of type T or null. */
export type ValueFunc<T> = (idx: number, cols: Vector[]) => T|null;
/** Test signature: given an index and a set of column vectors, reports whether that index passes. */
export type PredicateFunc = (idx: number, cols: Vector[]) => boolean;

/**
 * Base class for comparison operands.
 *
 * Each comparison method accepts either another `Value` or a raw value; raw
 * values are wrapped in a `Literal` before the predicate is constructed.
 */
export abstract class Value<T> {
    /** Builds an `Equals` predicate comparing this operand with `other`. */
    eq(other: Value<T>|T): Predicate {
        return new Equals(this, other instanceof Value ? other : new Literal(other));
    }

    /** Builds an `LTeq` predicate with this operand on the left and `other` on the right. */
    lteq(other: Value<T>|T): Predicate {
        return new LTeq(this, other instanceof Value ? other : new Literal(other));
    }

    /** Builds a `GTeq` predicate with this operand on the left and `other` on the right. */
    gteq(other: Value<T>|T): Predicate {
        return new GTeq(this, other instanceof Value ? other : new Literal(other));
    }
}

/** A constant operand: holds a fixed value `v` that comparisons read directly. */
class Literal<T=any> extends Value<T> {
    public v: T;

    constructor(v: T) {
        super();
        this.v = v;
    }
}

/**
 * A column operand, identified by name.
 *
 * `bind` locates the vector whose `name` equals this column's name, remembers
 * its position in `colidx`, and returns that vector's `get` bound to it.
 */
class Col<T=any> extends Value<T> {
    vector: Vector<T>;
    colidx: number;

    constructor(public name: string) { super(); }

    /**
     * Resolves this column against `cols`.
     *
     * @param cols vectors to search; the first one whose `name` matches is used.
     * @returns the matching vector's `get` method, bound to that vector.
     * @throws Error when no vector in `cols` carries this column's name.
     */
    bind(cols: Vector[]) {
        // Test against undefined instead of truthiness: 0 is a valid cached
        // index and must not trigger a fresh search on every call.
        if (this.colidx === undefined) {
            // Assume column index doesn't change between calls to bind
            let found = -1;
            for (let idx = -1; ++idx < cols.length;) {
                if (cols[idx].name === this.name) {
                    found = idx;
                    break;
                }
            }
            // Cache only a successful lookup, so a later bind retries the
            // search and reports the same error instead of indexing cols[-1].
            if (found < 0) throw new Error(`Failed to bind Col "${this.name}"`);
            this.colidx = found;
        }
        this.vector = cols[this.colidx];
        return this.vector.get.bind(this.vector);
    }

    /** Renders the source text of the accessor expression for the cached column index. */
    emitString() { return `cols[${this.colidx}].get(idx)`; }
}

/**
 * Base class for filter expressions.
 *
 * A predicate is turned into an executable `PredicateFunc` by `bind`, and can
 * be combined with other predicates through `and` / `or`.
 */
export abstract class Predicate {
    /** Produces the executable test for the given column vectors. */
    abstract bind(cols: Vector[]): PredicateFunc;

    /** Returns a predicate that wraps this one and `expr` in an `And`. */
    and(expr: Predicate): Predicate {
        const conjunction = new And(this, expr);
        return conjunction;
    }

    /** Returns a predicate that wraps this one and `expr` in an `Or`. */
    or(expr: Predicate): Predicate {
        const disjunction = new Or(this, expr);
        return disjunction;
    }

    /** Lists the and-ed terms of this predicate; by default just the predicate itself. */
    ands(): Predicate[] {
        const terms: Predicate[] = [this];
        return terms;
    }
}

/**
 * Base class for binary comparisons between two operands, each of which is
 * either a `Literal` or a `Col`.
 *
 * `bind` inspects the operand kinds once and dispatches to the matching
 * `_bind*` hook, always preserving the written operand order: `left` stays on
 * the left-hand side of the operator and `right` on the right-hand side.
 */
abstract class ComparisonPredicate<T=any> extends Predicate {
    constructor(public readonly left: Value<T>, public readonly right: Value<T>) {
        super();
    }

    /**
     * Builds the executable test for `cols`.
     *
     * @param cols column vectors that any `Col` operand is resolved against.
     * @returns a function evaluating `left <op> right` for a given index.
     */
    bind(cols: Vector<any>[]): PredicateFunc {
        if (this.left instanceof Literal) {
            if (this.right instanceof Literal) {
                return this._bindLitLit(cols, this.left, this.right);
            } else { // right is a Col
                // Must not be routed through _bindColLit: that would swap the
                // operands and invert ordered comparisons such as <= and >=.
                return this._bindLitCol(cols, this.left, this.right as Col);
            }
        } else { // left is a Col
            if (this.right instanceof Literal) {
                return this._bindColLit(cols, this.left as Col, this.right);
            } else { // right is a Col
                return this._bindColCol(cols, this.left as Col, this.right as Col);
            }
        }
    }

    /** literal <op> literal */
    protected abstract _bindLitLit(cols: Vector<any>[], left: Literal, right: Literal): PredicateFunc;
    /** column <op> column */
    protected abstract _bindColCol(cols: Vector<any>[], left: Col, right: Col): PredicateFunc;
    /** column <op> literal (column is the left-hand operand) */
    protected abstract _bindColLit(cols: Vector<any>[], col: Col, lit: Literal): PredicateFunc;
    /** literal <op> column (literal is the left-hand operand) */
    protected abstract _bindLitCol(cols: Vector<any>[], lit: Literal, col: Col): PredicateFunc;
}

/** Base class for predicates that combine two other predicates. */
abstract class CombinationPredicate extends Predicate {
    constructor(public readonly left: Predicate, public readonly right: Predicate) {
        super();
    }
}

/** Passes when both sub-predicates pass; the right side is skipped if the left fails. */
class And extends CombinationPredicate {
    bind(cols: Vector[]) {
        const left = this.left.bind(cols);
        const right = this.right.bind(cols);
        return (idx: number, cols: Vector[]) => left(idx, cols) && right(idx, cols);
    }

    /** Flattens nested `And`s into a single list of terms. */
    ands(): Predicate[] { return this.left.ands().concat(this.right.ands()); }
}

/** Passes when either sub-predicate passes; the right side is skipped if the left passes. */
class Or extends CombinationPredicate {
    bind(cols: Vector[]) {
        const left = this.left.bind(cols);
        const right = this.right.bind(cols);
        return (idx: number, cols: Vector[]) => left(idx, cols) || right(idx, cols);
    }
}

/** `left == right`, using loose equality (values are compared after JavaScript coercion). */
class Equals extends ComparisonPredicate {
    protected _bindLitLit(_: Vector<any>[], left: Literal, right: Literal): PredicateFunc {
        // Both operands are constant, so the result is computed once at bind time.
        const result: boolean = left.v == right.v;
        return () => result;
    }

    protected _bindColCol(cols: Vector<any>[], left: Col, right: Col): PredicateFunc {
        const leftFunc = left.bind(cols);
        const rightFunc = right.bind(cols);
        return (idx: number, cols: Vector[]) => leftFunc(idx, cols) == rightFunc(idx, cols);
    }

    protected _bindColLit(cols: Vector<any>[], col: Col, lit: Literal): PredicateFunc {
        const colFunc = col.bind(cols);
        return (idx: number, cols: Vector[]) => colFunc(idx, cols) == lit.v;
    }

    protected _bindLitCol(cols: Vector<any>[], lit: Literal, col: Col): PredicateFunc {
        const colFunc = col.bind(cols);
        return (idx: number, cols: Vector[]) => lit.v == colFunc(idx, cols);
    }
}

/** `left <= right` */
class LTeq extends ComparisonPredicate {
    protected _bindLitLit(_: Vector<any>[], left: Literal, right: Literal): PredicateFunc {
        // Both operands are constant, so the result is computed once at bind time.
        const result: boolean = left.v <= right.v;
        return () => result;
    }

    protected _bindColCol(cols: Vector<any>[], left: Col, right: Col): PredicateFunc {
        const leftFunc = left.bind(cols);
        const rightFunc = right.bind(cols);
        return (idx: number, cols: Vector[]) => leftFunc(idx, cols) <= rightFunc(idx, cols);
    }

    protected _bindColLit(cols: Vector<any>[], col: Col, lit: Literal): PredicateFunc {
        const colFunc = col.bind(cols);
        return (idx: number, cols: Vector[]) => colFunc(idx, cols) <= lit.v;
    }

    protected _bindLitCol(cols: Vector<any>[], lit: Literal, col: Col): PredicateFunc {
        const colFunc = col.bind(cols);
        return (idx: number, cols: Vector[]) => lit.v <= colFunc(idx, cols);
    }
}

/** `left >= right` */
class GTeq extends ComparisonPredicate {
    protected _bindLitLit(_: Vector<any>[], left: Literal, right: Literal): PredicateFunc {
        // Both operands are constant, so the result is computed once at bind time.
        const result: boolean = left.v >= right.v;
        return () => result;
    }

    protected _bindColCol(cols: Vector<any>[], left: Col, right: Col): PredicateFunc {
        const leftFunc = left.bind(cols);
        const rightFunc = right.bind(cols);
        return (idx: number, cols: Vector[]) => leftFunc(idx, cols) >= rightFunc(idx, cols);
    }

    protected _bindColLit(cols: Vector<any>[], col: Col, lit: Literal): PredicateFunc {
        const colFunc = col.bind(cols);
        return (idx: number, cols: Vector[]) => colFunc(idx, cols) >= lit.v;
    }

    protected _bindLitCol(cols: Vector<any>[], lit: Literal, col: Col): PredicateFunc {
        const colFunc = col.bind(cols);
        return (idx: number, cols: Vector[]) => lit.v >= colFunc(idx, cols);
    }
}

/**
 * Wraps a constant in a `Literal` operand so it can be used in comparisons.
 *
 * Accepts any value, not only numbers: `Literal` is generic and simply stores
 * the value it is given.
 *
 * @param n the constant to wrap.
 * @returns a `Value` holding `n`.
 */
export function lit(n: unknown): Value<any> { return new Literal(n); }
/**
 * Creates a column operand that refers to the column named `n`.
 *
 * @param n name of the column to reference.
 * @returns a `Value` backed by a `Col` for that name.
 */
export function col(n: string): Value<any> {
    const column = new Col(n);
    return column;
}

0 comments on commit 4d9e8c0

Please sign in to comment.