Skip to content

Commit

Permalink
Updates to support pandas 0.23
Browse files Browse the repository at this point in the history
- Updated requirements.txt and setup.py to allow pandas 0.23
- Updated .travis.yml to test pandas 0.22 and 0.23
  - The new entries still use numpy 1.13; numpy 1.14 is not tested yet
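- Corrected imports of is_categorical_dtype (and uses of is_object_dtype
  and is_numeric_dtype) from pandas.core.common to pandas.api.types
  - pandas.api.types is already available in pandas 0.19, the minimum
    supported version, so no try/except fallback is needed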
- Replaced pandas.DataFrame.from_items (deprecated in pandas 0.23) with
  from_dict(OrderedDict(...)) in tests/, importing OrderedDict so that
  the column order is preserved
  • Loading branch information
mbadger committed Jul 2, 2018
1 parent 9d2ec18 commit 9f050bb
Show file tree
Hide file tree
Showing 11 changed files with 47 additions and 35 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ env:
- PANDAS_VERSION=0.19 NUMPY_VERSION=1.12
- PANDAS_VERSION=0.20 NUMPY_VERSION=1.13
- PANDAS_VERSION=0.21 NUMPY_VERSION=1.13 NO_SLOW=false
- PANDAS_VERSION=0.22 NUMPY_VERSION=1.13 NO_SLOW=false
- PANDAS_VERSION=0.23 NUMPY_VERSION=1.13 NO_SLOW=false

cache:
directories:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ cvxopt
cvxpy <1.0
numexpr
numpy
pandas >=0.19,<0.22
pandas >=0.19,<0.24
scipy
scikit-learn >=0.19.0,<0.20
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def setup_package():
'cvxpy <1.0',
'numexpr',
'numpy',
'pandas >=0.19, <0.22',
'pandas >=0.19, <0.24',
'scipy',
'scikit-learn >=0.19.0, <0.20'],
extras_require={
Expand Down
3 changes: 2 additions & 1 deletion sksurv/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import numpy
import pandas

from pandas.core.common import is_categorical_dtype
from pandas.api.types import is_categorical_dtype

__all__ = ['categorical_to_numeric', 'encode_categorical', 'standardize']

Expand Down Expand Up @@ -198,4 +198,5 @@ def transform(column):
if isinstance(table, pandas.Series):
return pandas.Series(transform(table), name=table.name, index=table.index)
else:
# NOTE: the `reduce` argument is deprecated as of pandas 0.23 (superseded by `result_type`), so this call emits a deprecation warning there
return table.apply(transform, axis=0, reduce=False)
4 changes: 2 additions & 2 deletions sksurv/io/arffwrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def _write_header(data, fp, relation_name, index):
name = attribute_names[column]
fp.write("@attribute {0}\t".format(name))

if pandas.core.common.is_categorical_dtype(series) or pandas.core.common.is_object_dtype(series):
if pandas.api.types.is_categorical_dtype(series) or pandas.api.types.is_object_dtype(series):
_write_attribute_categorical(series, fp)
elif numpy.issubdtype(series.dtype, numpy.floating):
fp.write("real")
Expand Down Expand Up @@ -110,7 +110,7 @@ def _check_str_value(x):

def _write_attribute_categorical(series, fp):
"""Write categories of a categorical/nominal attribute"""
if pandas.core.common.is_categorical_dtype(series.dtype):
if pandas.api.types.is_categorical_dtype(series.dtype):
categories = series.cat.categories
string_values = _check_str_array(categories)
else:
Expand Down
4 changes: 2 additions & 2 deletions sksurv/kernels/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,15 +162,15 @@ def _prepare_by_column_dtype(self, X):

for i, dt in enumerate(X.dtypes):
col = X.iloc[:, i]
if pandas.core.common.is_categorical_dtype(dt):
if pandas.api.types.is_categorical_dtype(dt):
if col.cat.ordered:
numeric_ranges.append(col.cat.codes.max() - col.cat.codes.min())
numeric_columns.append(i)
else:
nominal_columns.append(i)

col = col.cat.codes
elif pandas.core.common.is_numeric_dtype(dt):
elif pandas.api.types.is_numeric_dtype(dt):
numeric_ranges.append(col.max() - col.min())
numeric_columns.append(i)
else:
Expand Down
2 changes: 1 addition & 1 deletion sksurv/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ def safe_concat(objs, *args, **kwargs):
categories = {}
for df in objs:
if isinstance(df, pandas.Series):
if pandas.core.common.is_categorical_dtype(df.dtype):
if pandas.api.types.is_categorical_dtype(df.dtype):
categories[df.name] = {"categories": df.cat.categories, "ordered": df.cat.ordered}
else:
dfc = df.select_dtypes(include=["category"])
Expand Down
11 changes: 7 additions & 4 deletions tests/test_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pandas
import numpy

from collections import OrderedDict
from sksurv import column

NUMERIC_DATA_FRAME = pandas.DataFrame(numpy.arange(50).reshape(10, 5))
Expand Down Expand Up @@ -93,10 +94,10 @@ def test_series_categorical():
input_series = pandas.Series(pandas.Categorical.from_codes([1, 1, 0, 2, 0, 1, 2, 1, 2, 0, 0, 1, 2, 2],
["small", "medium", "large"], ordered=False),
name="a_series")
expected_df = pandas.DataFrame.from_items(
expected_df = pandas.DataFrame.from_dict(OrderedDict(
[("a_series=medium", numpy.array([1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0], dtype=float)),
("a_series=large", numpy.array([0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1], dtype=float))
])
]))

actual_df = column.encode_categorical(input_series)

Expand Down Expand Up @@ -159,8 +160,10 @@ def test_duplicate_index(self):
c = rnd.randn(len(a))

index = numpy.ceil(numpy.arange(0, len(a) // 2, 0.5))
df = pandas.DataFrame.from_items([("a_category", pandas.Series(a, index=index)),
("a_number", pandas.Series(c, index=index, copy=True))])
df = pandas.DataFrame.from_dict(OrderedDict([
("a_category", pandas.Series(a, index=index)),
("a_number", pandas.Series(c, index=index, copy=True))
]))

actual_df = column.encode_categorical(df)

Expand Down
6 changes: 4 additions & 2 deletions tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import pandas
import pandas.util.testing as tm

from collections import OrderedDict

from sksurv.io import loadarff, writearff


Expand Down Expand Up @@ -40,7 +42,7 @@ def test_dataframe(self):
with StringIO(contents) as fp:
actual_df = loadarff(fp)

expected_df = pandas.DataFrame.from_items(
expected_df = pandas.DataFrame.from_dict(OrderedDict(
[("attr_nominal",
pandas.Series(pandas.Categorical.from_codes(
[1, 2, 0, -1, 2, 1],
Expand All @@ -50,7 +52,7 @@ def test_dataframe(self):
[2, 0, -1, 1, 0, 1],
['"hard liquor"', 'mate', '"red wine"'])))
]
)
))

tm.assert_frame_equal(expected_df, actual_df, check_exact=True)

Expand Down
4 changes: 3 additions & 1 deletion tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from numpy.testing import run_module_suite, TestCase
import pandas.util.testing as tm

from collections import OrderedDict

from sksurv.preprocessing import OneHotEncoder


Expand Down Expand Up @@ -40,7 +42,7 @@ def encoded_data(data):
else:
expected.append((nam, col))

expected_data = pd.DataFrame.from_items(expected)
expected_data = pd.DataFrame.from_dict(OrderedDict(expected))
return expected_data


Expand Down
42 changes: 22 additions & 20 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import pandas
import numpy

from collections import OrderedDict

from sksurv.util import safe_concat


Expand All @@ -13,9 +15,9 @@ def test_concat_numeric():
a = pandas.Series(rnd.randn(100), name="col_A")
b = pandas.Series(rnd.randn(100), name="col_B")

expected_df = pandas.DataFrame.from_items(
expected_df = pandas.DataFrame.from_dict(OrderedDict(
[(a.name, a), (b.name, b)]
)
))

actual_df = safe_concat((a, b), axis=1)

Expand All @@ -28,9 +30,9 @@ def test_concat_numeric_categorical():
b = pandas.Series(pandas.Categorical.from_codes(
rnd.binomial(4, 0.6, 100), ["C1", "C2", "C3", "C4", "C5"]), name="col_B")

expected_df = pandas.DataFrame.from_items(
expected_df = pandas.DataFrame.from_dict(OrderedDict(
[(a.name, a), (b.name, b)]
)
))

actual_df = safe_concat((a, b), axis=1)

Expand All @@ -39,22 +41,22 @@ def test_concat_numeric_categorical():
@staticmethod
def test_concat_categorical():
rnd = numpy.random.RandomState(14)
a = pandas.DataFrame.from_items([
a = pandas.DataFrame.from_dict(OrderedDict([
("col_A", pandas.Series(pandas.Categorical.from_codes(
rnd.binomial(2, 0.6, 100), ["C1", "C2", "C3"]), name="col_A")),
("col_B", rnd.randn(100))])
b = pandas.DataFrame.from_items([
("col_B", rnd.randn(100))]))
b = pandas.DataFrame.from_dict(OrderedDict([
("col_A", pandas.Series(pandas.Categorical.from_codes(
rnd.binomial(2, 0.2, 100), ["C1", "C2", "C3"]), name="col_A")),
("col_B", rnd.randn(100))])
("col_B", rnd.randn(100))]))

expected_series = pandas.DataFrame.from_items([
expected_series = pandas.DataFrame.from_dict(OrderedDict([
("col_A", pandas.Series(pandas.Categorical.from_codes(
numpy.concatenate((a.col_A.cat.codes.values, b.col_A.cat.codes.values)),
["C1", "C2", "C3"]
))),
("col_B", numpy.concatenate((a.col_B.values, b.col_B.values)))
])
]))
expected_series.index = pandas.Index(a.index.tolist() + b.index.tolist())

actual_series = safe_concat((a, b), axis=0)
Expand All @@ -63,24 +65,24 @@ def test_concat_categorical():

def test_concat_categorical_mismatch(self):
rnd = numpy.random.RandomState(14)
a = pandas.DataFrame.from_items([
a = pandas.DataFrame.from_dict(OrderedDict([
("col_A", pandas.Series(pandas.Categorical.from_codes(
rnd.binomial(2, 0.6, 100), ["C1", "C2", "C3"]), name="col_A")),
("col_B", rnd.randn(100))])
b = pandas.DataFrame.from_items([
("col_B", rnd.randn(100))]))
b = pandas.DataFrame.from_dict(OrderedDict([
("col_A", pandas.Series(pandas.Categorical.from_codes(
rnd.binomial(3, 0.6, 100), ["C1", "C2", "C3", "C4"]), name="col_A")),
("col_B", rnd.randn(100))])
("col_B", rnd.randn(100))]))

self.assertRaisesRegex(ValueError, "categories for column col_A do not match",
safe_concat, (a, b), axis=0)

@staticmethod
def test_concat_dataframe_numeric_categorical():
rnd = numpy.random.RandomState(14)
numeric_df = pandas.DataFrame.from_items(
numeric_df = pandas.DataFrame.from_dict(OrderedDict(
[("col_A", rnd.randn(100)), ("col_B", rnd.randn(100))]
)
))

cat_series = pandas.Series(pandas.Categorical.from_codes(
rnd.binomial(4, 0.6, 100), ["C1", "C2", "C3", "C4", "C5"]), name="col_C")
Expand All @@ -94,18 +96,18 @@ def test_concat_dataframe_numeric_categorical():

def test_concat_duplicate_columns(self):
rnd = numpy.random.RandomState(14)
numeric_df = pandas.DataFrame.from_items([
numeric_df = pandas.DataFrame.from_dict(OrderedDict([
("col_N", rnd.randn(100)), ("col_B", rnd.randn(100)),
("col_A", pandas.Series(pandas.Categorical.from_codes(
rnd.binomial(4, 0.2, 100), ["C1", "C2", "C3", "C4", "C5"]), name="col_A")),
])
]))

cat_df = pandas.DataFrame.from_items([
cat_df = pandas.DataFrame.from_dict(OrderedDict([
("col_A", pandas.Series(pandas.Categorical.from_codes(
rnd.binomial(4, 0.6, 100), ["C1", "C2", "C3", "C4", "C5"]), name="col_A")),
("col_C", pandas.Series(pandas.Categorical.from_codes(
rnd.binomial(1, 0.6, 100), ["Yes", "No"]), name="col_C")),
])
]))

self.assertRaisesRegex(ValueError, "duplicate columns col_A",
safe_concat, (numeric_df, cat_df), axis=1)
Expand Down

0 comments on commit 9f050bb

Please sign in to comment.