Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(cast_to_category_pd): add function to cast type to category #26

Merged
merged 1 commit into from
Aug 19, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ make all

* 0.1.9
* ADD: Option ``loc`` in ``plot_series`` function.
* ADD: Function ``cast_to_category_pd`` to cast columns to category ``dtype`` automatically.
* 0.1.8
* ADD: Option ``with_missing_datetimes`` in ``plot_series`` function.
* ADD: Mypy for type verification.
Expand Down
7 changes: 4 additions & 3 deletions bff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@
from ._version import get_versions

from .fancy import (
concat_with_categories, get_peaks, idict, mem_usage_pd, parse_date,
plot_history, plot_predictions, plot_series, plot_true_vs_pred,
read_sql_by_chunks, sliding_window, value_2_list
cast_to_category_pd, concat_with_categories, get_peaks, idict,
mem_usage_pd, parse_date, plot_history, plot_predictions, plot_series,
plot_true_vs_pred, read_sql_by_chunks, sliding_window, value_2_list
)

from .config import FancyConfig

# Public object of the module.
__all__ = [
'cast_to_category_pd',
'concat_with_categories',
'get_peaks',
'idict',
Expand Down
63 changes: 59 additions & 4 deletions bff/fancy.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,61 @@
LOGGER = logging.getLogger(name='bff')


def cast_to_category_pd(df: pd.DataFrame, deep: bool = True) -> pd.DataFrame:
    """
    Automatically cast columns that are worth storing as ``category`` dtype.

    To be cast, a column must have the ``object`` dtype and strictly less
    than 50% of unique values (number of unique values / number of rows).

    A DataFrame without any row is returned as a plain copy, since the
    ratio of unique values is undefined in that case.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame with the columns to cast.
    deep: bool, default True
        Whether or not to perform a deep copy of the original DataFrame.

    Returns
    -------
    pd.DataFrame
        Optimized copy of the input DataFrame.

    Examples
    --------
    >>> import pandas as pd
    >>> columns = ['name', 'age', 'country']
    >>> df = pd.DataFrame([['John', 24, 'China'],
    ...                    ['Mary', 20, 'China'],
    ...                    ['Jane', 25, 'Switzerland'],
    ...                    ['Greg', 23, 'China'],
    ...                    ['James', 28, 'China']],
    ...                   columns=columns)
    >>> df
        name  age      country
    0   John   24        China
    1   Mary   20        China
    2   Jane   25  Switzerland
    3   Greg   23        China
    4  James   28        China
    >>> df.dtypes
    name       object
    age         int64
    country    object
    dtype: object
    >>> df_optimized = cast_to_category_pd(df)
    >>> df_optimized.dtypes
    name         object
    age           int64
    country    category
    dtype: object
    """
    n_rows = df.shape[0]

    # Without rows the unique ratio would be 0 / 0; nothing is worth casting.
    if n_rows == 0:
        return df.copy(deep=deep)

    # Only ``object`` columns with a low ratio of unique values are cast.
    to_cast = {col: 'category' for col in df.columns
               if (df[col].dtype == 'object'
                   and df[col].nunique() / n_rows < 0.5)}

    return df.copy(deep=deep).astype(to_cast)


def concat_with_categories(df_left: pd.DataFrame, df_right: pd.DataFrame,
**kwargs) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -65,10 +120,10 @@ def concat_with_categories(df_left: pd.DataFrame, df_right: pd.DataFrame,
... 'country': 'category'}
>>> columns = list(column_types.keys())
>>> df_left = pd.DataFrame([['John', 'red', 'China'],
['Jane', 'blue', 'Switzerland']],
... ['Jane', 'blue', 'Switzerland']],
... columns=columns).astype(column_types)
>>> df_right = pd.DataFrame([['Mary', 'yellow', 'France'],
['Fred', 'blue', 'Italy']],
... ['Fred', 'blue', 'Italy']],
... columns=columns).astype(column_types)
>>> df_left
name color country
Expand Down Expand Up @@ -137,8 +192,8 @@ def get_peaks(s: pd.Series, distance_scale: float = 0.04):
"""
Get the peaks of a time series having datetime as index.

Only the peaks having an heights higher than 0.75 quantile are returned
and a distance between two peaks at least `df.shape[0]*distance_scale`.
Only the peaks having a height higher than the 0.75 quantile and separated
by a distance of at least ``s.shape[0]*distance_scale`` are returned.

Return the dates and the corresponding value of the peaks.

Expand Down
3 changes: 2 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ description-file = README.md

[flake8]
max-line-length = 100
ignore = F841 # Local variable name is assigned to but never used.
ignore = F841, # Local variable name is assigned to but never used.
W503 # Line break occurred before a binary operator
exclude =
.git,
venv*,
Expand Down
30 changes: 28 additions & 2 deletions tests/test_fancy.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,43 @@
import numpy as np
from numpy.testing import assert_array_equal
import pandas as pd
from pandas.api.types import CategoricalDtype
import pandas.util.testing as tm

from bff.fancy import (concat_with_categories, get_peaks, idict, mem_usage_pd,
parse_date, value_2_list, sliding_window)
from bff.fancy import (cast_to_category_pd, concat_with_categories, get_peaks, idict,
mem_usage_pd, parse_date, value_2_list, sliding_window)


class TestFancy(unittest.TestCase):
"""
Unittest of Fancy module.
"""

def test_cast_to_category_pd(self):
    """
    Test of the `cast_to_category_pd` function.

    The `country` column (2 unique values over 5 rows) must be cast to
    category, while `name` (all unique) and `age` (numerical) keep
    their original dtype.
    """
    columns = ['name', 'age', 'country']
    df = pd.DataFrame([['John', 24, 'China'],
                       ['Mary', 20, 'China'],
                       ['Jane', 25, 'Switzerland'],
                       ['Greg', 23, 'China'],
                       ['James', 28, 'China']],
                      columns=columns)
    original_types = {'name': np.dtype('O'), 'age': np.dtype('int64'),
                      'country': np.dtype('O')}
    self.assertDictEqual(df.dtypes.to_dict(), original_types)

    df_optimized = cast_to_category_pd(df)

    # Values must be untouched, only the dtypes are allowed to differ.
    tm.assert_frame_equal(df, df_optimized, check_dtype=False, check_categorical=False)

    country_type = CategoricalDtype(categories=['China', 'Switzerland'], ordered=False)
    optimized_types = {'name': np.dtype('O'), 'age': np.dtype('int64'),
                       'country': country_type}
    self.assertDictEqual(df_optimized.dtypes.to_dict(), optimized_types)

def test_concat_with_categories(self):
"""
Test of the `concat_with_categories` function.
Expand Down