Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Coerce expression #1137

Merged
merged 18 commits into from Jun 25, 2015
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 6 additions & 1 deletion blaze/compute/numpy.py
Expand Up @@ -12,7 +12,7 @@
from ..expr import std, var, count, nunique, Summary, IsIn
from ..expr import BinOp, UnaryOp, USub, Not, nelements, Repeat, Concat, Interp
from ..expr import UTCFromTimestamp, DateTimeTruncate
from ..expr import Transpose, TensorDot
from ..expr import Transpose, TensorDot, Coerce
from ..utils import keywords

from .core import base, compute
Expand Down Expand Up @@ -364,3 +364,8 @@ def join_ndarray(expr, lhs, rhs, **kwargs):
if isinstance(rhs, np.ndarray):
rhs = DataFrame(rhs)
return compute_up(expr, lhs, rhs, **kwargs)


@dispatch(Coerce, np.ndarray)
def compute_up(expr, data, **kwargs):
    # Map the target blaze schema onto the equivalent numpy dtype and cast.
    target_dtype = to_numpy_dtype(expr.schema)
    return data.astype(target_dtype)
13 changes: 9 additions & 4 deletions blaze/compute/sql.py
Expand Up @@ -42,15 +42,15 @@

from multipledispatch import MDNotImplementedError

from odo.backends.sql import metadata_of_engine
from odo.backends.sql import metadata_of_engine, dshape_to_alchemy

from ..dispatch import dispatch

from .core import compute_up, compute, base

from ..expr import Projection, Selection, Field, Broadcast, Expr, IsIn, Slice
from ..expr import (BinOp, UnaryOp, USub, Join, mean, var, std, Reduction,
count, FloorDiv, UnaryStringFunction, strlen, DateTime)
from ..expr import (BinOp, UnaryOp, Join, mean, var, std, Reduction, count,
FloorDiv, UnaryStringFunction, strlen, DateTime, Coerce)
from ..expr import nunique, Distinct, By, Sort, Head, Label, ReLabel, Merge
from ..expr import common_subexpression, Summary, Like, nelements

Expand Down Expand Up @@ -838,7 +838,7 @@ def compute_up(expr, data, **kwargs):
stop = start + 1
else:
raise TypeError('type %r not supported for slicing wih SQL backend'
% type(index).__name__)
% type(index).__name__)

warnings.warn('The order of the result set from a Slice expression '
'computed against the SQL backend is not deterministic.')
Expand All @@ -847,3 +847,8 @@ def compute_up(expr, data, **kwargs):
return select(data).offset(start)
else:
return select(data).offset(start).limit(stop - start)


@dispatch(Coerce, ColumnElement)
def compute_up(expr, data, **kwargs):
    # A coercion over a SQL column becomes a CAST; re-label so the result
    # keeps the expression's column name.
    sa_type = dshape_to_alchemy(expr.to)
    casted = sa.cast(data, sa_type)
    return casted.label(expr._name)
7 changes: 7 additions & 0 deletions blaze/compute/tests/test_numpy_compute.py
Expand Up @@ -467,3 +467,10 @@ def test_timedelta_arith():
sym = symbol('s', discover(dates))
assert (compute(sym + delta, dates) == dates + delta).all()
assert (compute(sym - delta, dates) == dates - delta).all()


def test_coerce():
    # Coercing an integer array to float64 should match numpy's own result.
    data = np.arange(1, 3)
    s = symbol('s', discover(data))
    result = compute(s.coerce('float64'), data)
    np.testing.assert_array_equal(result, np.arange(1.0, 3.0))
20 changes: 20 additions & 0 deletions blaze/compute/tests/test_sql_compute.py
Expand Up @@ -1682,3 +1682,23 @@ def test_insert_from_subselect(sql_with_float):
odo(sql_with_float, pd.DataFrame).iloc[2:].reset_index(drop=True),
pd.DataFrame([{'c': 1.0}, {'c': 2.0}]),
)


def test_coerce(sql):
    name = sql.name
    t = symbol(name, discover(sql))
    expr = t.B.coerce(to='float64')

    # The column name is capitalized, so it appears quoted as "B" in the SQL.
    expected = 'SELECT cast({t}."B" AS FLOAT(53)) AS "B" FROM {t}'.format(t=name)
    result = compute(expr, sql)
    assert normalize(str(result)) == normalize(expected)


def test_coerce_bool_and_sum(sql):
    name = sql.name
    t = symbol(name, discover(sql))
    # Coerce a boolean predicate to int32 and sum it; compare against the
    # same computation done in pandas.
    expr = (t.B > 1.0).coerce(to='int32').sum()
    expected = odo(compute(t.B, sql), pd.Series).gt(1).sum()
    result = compute(expr, sql).scalar()
    assert result == expected
24 changes: 22 additions & 2 deletions blaze/expr/expressions.py
Expand Up @@ -22,7 +22,7 @@
__all__ = ['Expr', 'ElemWise', 'Field', 'Symbol', 'discover', 'Projection',
'projection', 'Selection', 'selection', 'Label', 'label', 'Map',
'ReLabel', 'relabel', 'Apply', 'Slice', 'shape', 'ndim', 'label',
'symbol']
'symbol', 'Coerce', 'coerce']
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should the coerce function be available as a standalone function from the top-level package, or do you think it is most useful as a method?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I'll just leave it as a method for now and remove it from __all__



_attr_cache = dict()
Expand Down Expand Up @@ -705,6 +705,21 @@ def dshape(self):
return dshape(self._dshape)


class Coerce(Expr):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we add a docstring here with some examples?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep will do

# NOTE(review): indentation in this view was flattened by the page scrape;
# these are members of the Coerce(Expr) class declared above.
__slots__ = '_hash', '_child', 'to'

@property
def schema(self):
    # The coercion target dshape is the new element schema.
    return self.to

@property
def dshape(self):
    # Preserve the child's shape; only the measure (schema) changes.
    return DataShape(*(self._child.shape + (self.schema,)))

def __str__(self):
    # Render as a call expression, e.g. s.coerce(to='int64').
    return '%s.coerce(to=%r)' % (self._child, str(self.schema))


def apply(expr, func, dshape, splittable=False):
return Apply(expr, func, datashape.dshape(dshape), splittable)

Expand Down Expand Up @@ -757,13 +772,18 @@ def ndim(expr):
return len(shape(expr))


def coerce(expr, to):
    """Construct a ``Coerce`` node casting *expr* to the dshape *to*.

    *to* may be given either as a datashape object or as a string, in
    which case it is parsed with ``dshape``.
    """
    if isinstance(to, _strtypes):
        to = dshape(to)
    return Coerce(expr, to)


dshape_method_list.extend([
(lambda ds: True, set([apply])),
(iscollection, set([shape, ndim])),
(lambda ds: iscollection(ds) and isscalar(ds.measure), set([coerce]))
])

schema_method_list.extend([
(isscalar, set([label, relabel])),
(isscalar, set([label, relabel, coerce])),
(isrecord, set([relabel])),
])

Expand Down
14 changes: 13 additions & 1 deletion blaze/expr/tests/test_expr.py
Expand Up @@ -4,7 +4,7 @@

import pytest

from datashape import dshape, var, datetime_
from datashape import dshape, var, datetime_, float32

from blaze.expr import symbol, label, Field

Expand Down Expand Up @@ -136,3 +136,15 @@ def test_hash_to_different_values():
from blaze.expr.expressions import _attr_cache
assert (expr, '_and') in _attr_cache
assert (expr2, '_and') in _attr_cache


def test_coerce():
    # Check both the string rendering and the resulting dshape of a
    # coerce node (the dshape assertion was requested in review).
    s = symbol('s', var * float32)
    expr = s.coerce('int64')
    assert str(expr) == "s.coerce(to='int64')"
    # The shape is preserved; only the measure changes to the target type.
    assert expr.dshape == dshape('var * int64')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we check the dshape of the coerce node to make sure that it worked properly?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point



@pytest.mark.xfail(raises=AttributeError, reason='Should this be valid?')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the question here if we should be able to cast down from float64 to float32? I think the idea of casting records like this is pretty cool.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The question is: Is casting an entire set of rows on a subset of columns (possibly all the columns as well) useful? How often is it the case that you want to change the type of a set of columns? I honestly don't know. I know that I do this in one particular case when I'm shoving a DataFrame that contains strings into a bcolz ctable in Python3.4.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would be nice because there are some places that I have tz-aware datetimes in dataframes so the type is object and I would like to coerce all of the columns to datetime instead of object.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the case of tz-aware datetimes they are cast to object because pandas doesn't have a way to manage a contiguous array of tz-aware datetimes (yet).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh actually, I see what you mean ... they might be inferred as object, but you want whatever you're sending them to to see that they are datetimes

def test_coerce_record():
    # Coercing a record measure renders the full target record schema.
    rec = symbol('s', 'var * {a: int64, b: float64}')
    coerced = rec.coerce('{a: float64, b: float32}')
    expected = "s.coerce(to='{a: float64, b: float32}')"
    assert str(coerced) == expected