Merge pull request #26 from axelfahy/feat/convert-categorical
feat(cast_to_category_pd): add function to cast type to category
axelfahy committed Aug 19, 2019
2 parents a6600f3 + 8e55417 commit 66b0e94
Showing 5 changed files with 94 additions and 10 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -48,6 +48,7 @@ make all

* 0.1.9
* ADD: Option ``loc`` in ``plot_series`` function.
* ADD: Function ``cast_to_category_pd`` to cast columns to category ``dtype`` automatically.
* 0.1.8
* ADD: Option ``with_missing_datetimes`` in ``plot_series`` function.
* ADD: Mypy for type verification.
7 changes: 4 additions & 3 deletions bff/__init__.py
@@ -4,15 +4,16 @@
from ._version import get_versions

from .fancy import (
-    concat_with_categories, get_peaks, idict, mem_usage_pd, parse_date,
-    plot_history, plot_predictions, plot_series, plot_true_vs_pred,
-    read_sql_by_chunks, sliding_window, value_2_list
+    cast_to_category_pd, concat_with_categories, get_peaks, idict,
+    mem_usage_pd, parse_date, plot_history, plot_predictions, plot_series,
+    plot_true_vs_pred, read_sql_by_chunks, sliding_window, value_2_list
)

from .config import FancyConfig

# Public object of the module.
__all__ = [
'cast_to_category_pd',
'concat_with_categories',
'get_peaks',
'idict',
63 changes: 59 additions & 4 deletions bff/fancy.py
@@ -26,6 +26,61 @@
LOGGER = logging.getLogger(name='bff')


def cast_to_category_pd(df: pd.DataFrame, deep: bool = True) -> pd.DataFrame:
"""
Automatically convert columns that are worth storing as ``category`` dtype.

To be cast, a column must not be numerical and must have fewer than 50%
unique values.

Parameters
----------
df: pd.DataFrame
DataFrame with the columns to cast.
deep: bool, default True
Whether or not to perform a deep copy of the original DataFrame.

Returns
-------
pd.DataFrame
Optimized copy of the input DataFrame.

Examples
--------
>>> import pandas as pd
>>> columns = ['name', 'age', 'country']
>>> df = pd.DataFrame([['John', 24, 'China'],
... ['Mary', 20, 'China'],
... ['Jane', 25, 'Switzerland'],
... ['Greg', 23, 'China'],
... ['James', 28, 'China']],
... columns=columns)
>>> df
    name  age      country
0   John   24        China
1   Mary   20        China
2   Jane   25  Switzerland
3   Greg   23        China
4  James   28        China
>>> df.dtypes
name       object
age         int64
country    object
dtype: object
>>> df_optimized = cast_to_category_pd(df)
>>> df_optimized.dtypes
name         object
age           int64
country    category
dtype: object
"""
return (df.copy(deep=deep)
.astype({col: 'category' for col in df.columns
if (df[col].dtype == 'object'
and df[col].nunique() / df[col].shape[0] < 0.5)
}
)
)
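The point of the cast is memory: an ``object`` column stores every string separately, while ``category`` stores each distinct value once plus small integer codes. A minimal usage sketch (not part of this commit) to check the saving with plain pandas:

# Usage sketch, assuming bff is installed; measures the memory saved by
# the cast using pandas' own df.memory_usage(deep=True).
import pandas as pd

from bff import cast_to_category_pd

df = pd.DataFrame({'country': ['China'] * 990 + ['Switzerland'] * 10,
                   'age': range(1000)})
df_optimized = cast_to_category_pd(df)

# 'country' has 2 unique values out of 1000 (< 50%), so it is cast;
# 'age' is numerical and stays int64.
print(df.memory_usage(deep=True)['country'])            # object column, large
print(df_optimized.memory_usage(deep=True)['country'])  # category column, much smaller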


def concat_with_categories(df_left: pd.DataFrame, df_right: pd.DataFrame,
**kwargs) -> pd.DataFrame:
"""
@@ -65,10 +65,10 @@ def concat_with_categories(df_left: pd.DataFrame, df_right: pd.DataFrame,
... 'country': 'category'}
>>> columns = list(column_types.keys())
>>> df_left = pd.DataFrame([['John', 'red', 'China'],
- ['Jane', 'blue', 'Switzerland']],
+ ...                         ['Jane', 'blue', 'Switzerland']],
... columns=columns).astype(column_types)
>>> df_right = pd.DataFrame([['Mary', 'yellow', 'France'],
- ['Fred', 'blue', 'Italy']],
+ ...                         ['Fred', 'blue', 'Italy']],
... columns=columns).astype(column_types)
>>> df_left
name color country
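The two changed pairs above are doctest fixes: every continuation line of a multi-line statement in a doctest must carry the ``... `` prefix, otherwise the doctest parser treats it as expected output and the example fails. The corrected form reads:

>>> df_left = pd.DataFrame([['John', 'red', 'China'],
...                         ['Jane', 'blue', 'Switzerland']],
...                        columns=columns).astype(column_types)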
@@ -137,8 +137,8 @@ def get_peaks(s: pd.Series, distance_scale: float = 0.04):
"""
Get the peaks of a time series having datetime as index.
- Only the peaks having an heights higher than 0.75 quantile are returned
- and a distance between two peaks at least `df.shape[0]*distance_scale`.
+ Only the peaks having a height higher than the 0.75 quantile are returned,
+ with a distance between two peaks of at least ``df.shape[0]*distance_scale``.
Return the dates and the corresponding values of the peaks.
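The body of ``get_peaks`` is not shown in this diff. A hypothetical sketch of the behaviour the docstring describes, built on ``scipy.signal.find_peaks`` (whose ``height`` and ``distance`` parameters map directly onto the two conditions), might look like:

# Hypothetical sketch, not the committed implementation.
import pandas as pd
from scipy.signal import find_peaks

def get_peaks_sketch(s: pd.Series, distance_scale: float = 0.04):
    """Return the dates and values of the peaks of a datetime-indexed series."""
    peaks, _ = find_peaks(
        s.values,
        height=s.quantile(0.75),                      # keep peaks above the 0.75 quantile
        distance=max(1, s.shape[0] * distance_scale)  # minimal spacing between peaks
    )
    return s.index[peaks], s.values[peaks]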
3 changes: 2 additions & 1 deletion setup.cfg
@@ -11,7 +11,8 @@ description-file = README.md

[flake8]
max-line-length = 100
- ignore = F841 # Local variable name is assigned to but never used.
+ ignore = F841, # Local variable name is assigned to but never used.
+          W503 # Line break occurred before a binary operator
exclude =
.git,
venv*,
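The new ``W503`` entry above is needed because ``cast_to_category_pd`` breaks its multi-line condition before the ``and`` operator, a layout flake8's W503 flags even though PEP 8 now favours it. A self-contained illustration of the exempted style:

import pandas as pd

df = pd.DataFrame({'col': ['a', 'a', 'a', 'b', 'a']})

# The layout W503 would otherwise reject: the 'and' starts the second line.
keep = (df['col'].dtype == 'object'
        and df['col'].nunique() / df['col'].shape[0] < 0.5)
print(keep)  # True: object dtype, 2 unique values out of 5 (40% < 50%)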
30 changes: 28 additions & 2 deletions tests/test_fancy.py
@@ -9,17 +9,43 @@
import numpy as np
from numpy.testing import assert_array_equal
import pandas as pd
from pandas.api.types import CategoricalDtype
import pandas.util.testing as tm

- from bff.fancy import (concat_with_categories, get_peaks, idict, mem_usage_pd,
-                        parse_date, value_2_list, sliding_window)
+ from bff.fancy import (cast_to_category_pd, concat_with_categories, get_peaks, idict,
+                        mem_usage_pd, parse_date, value_2_list, sliding_window)


class TestFancy(unittest.TestCase):
"""
Unittest of Fancy module.
"""

def test_cast_to_category_pd(self):
"""
Test of the `cast_to_category_pd` function.
"""
columns = ['name', 'age', 'country']
df = pd.DataFrame([['John', 24, 'China'],
['Mary', 20, 'China'],
['Jane', 25, 'Switzerland'],
['Greg', 23, 'China'],
['James', 28, 'China']],
columns=columns)
original_types = {'name': np.dtype('O'), 'age': np.dtype('int64'),
'country': np.dtype('O')}
self.assertDictEqual(df.dtypes.to_dict(), original_types)

df_optimized = cast_to_category_pd(df)

tm.assert_frame_equal(df, df_optimized, check_dtype=False, check_categorical=False)

country_type = CategoricalDtype(categories=['China', 'Switzerland'], ordered=False)
optimized_types = {'name': np.dtype('O'), 'age': np.dtype('int64'),
'country': country_type}
self.assertDictEqual(df_optimized.dtypes.to_dict(), optimized_types)

def test_concat_with_categories(self):
"""
Test of the `concat_with_categories` function.
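The committed test exercises only the positive case. A hypothetical extra check (not in this commit) for the documented 50%-uniqueness threshold, written as another method of the ``TestFancy`` class shown above:

# Hypothetical additional test, not part of this commit: a column whose
# values are all unique (ratio 1.0, not < 0.5) must NOT become category.
def test_cast_to_category_pd_all_unique(self):
    df = pd.DataFrame({'id': ['a', 'b', 'c', 'd', 'e']})
    df_optimized = cast_to_category_pd(df)
    self.assertEqual(df_optimized['id'].dtype, np.dtype('O'))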
