Permalink
Please
sign in to comment.
Showing
with
143 additions
and 6 deletions.
- +1 −0 .travis/install_orange.sh
- +78 −0 Orange/data/pandas_compat.py
- +62 −0 Orange/data/tests/test_pandas.py
- +1 −1 appveyor.yml
- +1 −5 requirements-opt.txt
| @@ -0,0 +1,78 @@ | |||
| """Pandas DataFrame↔Table conversion helpers""" | |||
| import numpy as np | |||
| import pandas as pd | |||
| from pandas.api.types import ( | |||
| is_categorical_dtype, is_object_dtype, | |||
| is_datetime64_any_dtype, is_numeric_dtype, | |||
| ) | |||
|
|
|||
| from Orange.data import ( | |||
| Table, Domain, DiscreteVariable, StringVariable, TimeVariable, | |||
| ContinuousVariable, | |||
| ) | |||
|
|
|||
| __all__ = ['table_from_frame'] | |||
|
|
|||
|
|
|||
def table_from_frame(df, *, force_nominal=False):
    """
    Convert pandas.DataFrame to Orange.data.Table

    Each column is mapped to a variable by the first rule that matches:

    1. categorical columns, and object (string) columns that are either
       forced nominal or have few distinct values relative to their
       length, become DiscreteVariable attributes;
    2. datetime columns, and object columns pandas can parse as
       datetimes, become TimeVariable attributes;
    3. remaining numeric columns become ContinuousVariable attributes;
    4. everything else becomes a StringVariable meta.

    A non-trivial index (anything but a monotonic integer index) is moved
    into the data as an ordinary column before conversion.

    Parameters
    ----------
    df : pandas.DataFrame
    force_nominal : boolean
        If True, interpret ALL string columns as nominal (DiscreteVariable).

    Returns
    -------
    Table
    """

    def _is_discrete(s):
        # `and` binds tighter than `or`: categorical columns always qualify,
        # object columns only when forced or when distinct values are few.
        return (is_categorical_dtype(s) or
                is_object_dtype(s) and (force_nominal or
                                        s.nunique() < s.size**.666))

    def _as_datetime(s):
        """Return `s` converted to datetimes, or None if not datetime-like."""
        if is_datetime64_any_dtype(s):
            return pd.to_datetime(s)
        if not is_object_dtype(s):
            return None
        try:
            # Parse once and hand the result back, so the caller does not
            # have to repeat the (potentially slow) conversion.
            return pd.to_datetime(s)
        except Exception:  # pylint: disable=broad-except
            # Deliberate best effort: any parsing failure simply means
            # "this column is not a datetime column".
            return None

    # If df index is not a simple RangeIndex (or similar), put it into data.
    # `inferred_type == 'integer'` is used in place of the deprecated
    # `Index.is_integer()`.
    index = df.index
    is_simple_index = (index.inferred_type == 'integer' and
                       (index.is_monotonic_increasing or
                        index.is_monotonic_decreasing))
    if not is_simple_index:
        df = df.reset_index()

    attrs, metas = [], []
    X, M = [], []

    # Iter over columns
    for name, s in df.items():
        name = str(name)

        if _is_discrete(s):
            discrete = s.astype('category').cat
            attrs.append(DiscreteVariable(
                name, discrete.categories.astype(str).tolist()))
            # pandas encodes missing categorical values with code -1
            X.append(discrete.codes.replace(-1, np.nan).values)
            continue

        as_time = _as_datetime(s)
        if as_time is not None:
            tvar = TimeVariable(name)
            attrs.append(tvar)
            # Round-trip through the string form so that TimeVariable
            # performs the conversion to its own numeric representation.
            X.append(as_time.astype('str').replace('NaT', np.nan)
                     .map(tvar.parse).values)
        elif is_numeric_dtype(s):
            attrs.append(ContinuousVariable(name))
            X.append(s.values)
        else:
            metas.append(StringVariable(name))
            M.append(s.values.astype(object))

    return Table.from_numpy(Domain(attrs, None, metas),
                            np.column_stack(X) if X else np.empty((df.shape[0], 0)),
                            None,
                            np.column_stack(M) if M else None)
| @@ -0,0 +1,62 @@ | |||
| import unittest | |||
| import numpy as np | |||
| from Orange.data import ContinuousVariable, DiscreteVariable, TimeVariable | |||
|
|
|||
| try: | |||
| import pandas as pd | |||
| except ImportError: | |||
| pd = None | |||
|
|
|||
@unittest.skipIf(pd is None, "Missing package 'pandas'")
class TestPandasCompat(unittest.TestCase):
    """Tests for converting a pandas DataFrame into an Orange Table."""

    def test_table_from_frame(self):
        """Check values, metas and variable types for three conversions:
        default, with strings forced nominal, and with a non-trivial index.
        """
        from Orange.data.pandas_compat import table_from_frame

        nan = np.nan
        recent = pd.Timestamp('2017-12-19')
        ancient = pd.Timestamp('1724-12-20')
        t_recent = recent.timestamp()
        t_ancient = ancient.timestamp()
        # The string column as it appears when kept as a meta attribute.
        string_metas = [['a'],
                        ['b'],
                        ['c'],
                        [nan]]

        def check_attributes(table, expected_names, expected_types):
            # Compare names and concrete classes of the domain's attributes.
            attributes = table.domain.attributes
            names = [var.name for var in attributes]
            types = [type(var) for var in attributes]
            self.assertEqual(names, expected_names)
            self.assertEqual(types, expected_types)

        df = pd.DataFrame([['a', 1, recent],
                           ['b', 0, ancient],
                           ['c', 0, ancient],
                           [nan, nan, nan]])

        # Default conversion: the string column ends up among metas.
        table = table_from_frame(df)
        np.testing.assert_equal(table.X,
                                [[1, t_recent],
                                 [0, t_ancient],
                                 [0, t_ancient],
                                 [nan, nan]])
        np.testing.assert_equal(table.metas.tolist(), string_metas)
        check_attributes(table,
                         ['1', '2'],
                         [ContinuousVariable, TimeVariable])

        # Force strings nominal
        table = table_from_frame(df, force_nominal=True)
        np.testing.assert_equal(table.X,
                                [[0, 1, t_recent],
                                 [1, 0, t_ancient],
                                 [2, 0, t_ancient],
                                 [nan, nan, nan]])
        np.testing.assert_equal(table.metas.tolist(), [[], [], [], []])
        check_attributes(table,
                         ['0', '1', '2'],
                         [DiscreteVariable, ContinuousVariable, TimeVariable])

        # Include index
        df.index = list('abaa')
        table = table_from_frame(df)
        np.testing.assert_equal(table.X,
                                [[0, 1, t_recent],
                                 [1, 0, t_ancient],
                                 [0, 0, t_ancient],
                                 [0, nan, nan]])
        np.testing.assert_equal(table.metas.tolist(), string_metas)
        check_attributes(table,
                         ['index', '1', '2'],
                         [DiscreteVariable, ContinuousVariable, TimeVariable])
| @@ -1,5 +1 @@ | |||
| # This is required for, and only used by, Parallel Coordinates widget. | |||
| # Once that is ported to whatever, this can be removed, along with | |||
| # Orange/widgets/utils/plot/* | |||
| # Optional because it's hard to install everywhere. | |||
| qt-graph-helpers>=0.1.3 | |||
| pandas | |||
0 comments on commit
e602be2