feat(cast_to_category_pd): add function to cast type to category
If it is more memory efficient, the function casts the type of a column to the
category dtype.

Closes #22.
Axel Fahy committed Aug 19, 2019
1 parent a6600f3 commit c68253c
Showing 4 changed files with 93 additions and 10 deletions.
7 changes: 4 additions & 3 deletions bff/__init__.py
@@ -4,15 +4,16 @@
from ._version import get_versions

from .fancy import (
concat_with_categories, get_peaks, idict, mem_usage_pd, parse_date,
plot_history, plot_predictions, plot_series, plot_true_vs_pred,
read_sql_by_chunks, sliding_window, value_2_list
cast_to_category_pd, concat_with_categories, get_peaks, idict,
mem_usage_pd, parse_date, plot_history, plot_predictions, plot_series,
plot_true_vs_pred, read_sql_by_chunks, sliding_window, value_2_list
)

from .config import FancyConfig

# Public object of the module.
__all__ = [
'cast_to_category_pd',
'concat_with_categories',
'get_peaks',
'idict',
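Since cast_to_category_pd is both defined in bff/fancy.py and re-exported from bff/__init__.py, either import path works; a minimal usage sketch:

from bff import cast_to_category_pd            # via the package-level export added above
# or: from bff.fancy import cast_to_category_pd  # directly from the defining module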
63 changes: 59 additions & 4 deletions bff/fancy.py
@@ -26,6 +26,61 @@
LOGGER = logging.getLogger(name='bff')


def cast_to_category_pd(df: pd.DataFrame, deep: bool = True) -> pd.DataFrame:
"""
Automatically converts columns that are worth storing as the ``category`` dtype.
To be cast, a column must not be numerical and must have fewer than 50%
unique values.

Parameters
----------
df: pd.DataFrame
DataFrame with the columns to cast.
deep: bool, default True
Whether or not to perform a deep copy of the original DataFrame.

Returns
-------
pd.DataFrame
Optimized copy of the input DataFrame.

Examples
--------
>>> import pandas as pd
>>> columns = ['name', 'age', 'country']
>>> df = pd.DataFrame([['John', 24, 'China'],
... ['Mary', 20, 'China'],
... ['Jane', 25, 'Switzerland'],
... ['Greg', 23, 'China'],
... ['James', 28, 'China']],
... columns=columns)
>>> df
    name  age      country
0   John   24        China
1   Mary   20        China
2   Jane   25  Switzerland
3   Greg   23        China
4  James   28        China
>>> df.dtypes
name object
age int64
country object
dtype: object
>>> df_optimized = cast_to_category_pd(df)
>>> df_optimized.dtypes
name object
age int64
country category
dtype: object
"""
return (df.copy(deep=deep)
.astype({col: 'category' for col in df.columns
if (df[col].dtype == 'object'
and df[col].nunique() / df[col].shape[0] < 0.5)
}
)
)
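A quick way to check that the cast actually saves memory is to compare pandas' memory usage before and after the conversion; the snippet below is only an illustration (the exact byte counts depend on the platform):

import pandas as pd
from bff.fancy import cast_to_category_pd

df = pd.DataFrame({'name': ['John', 'Mary', 'Jane', 'Greg', 'James'],
                   'age': [24, 20, 25, 23, 28],
                   'country': ['China', 'China', 'Switzerland', 'China', 'China']})
before = df.memory_usage(deep=True).sum()                      # 'country' stored as object
after = cast_to_category_pd(df).memory_usage(deep=True).sum()  # 'country' now category
print(f'{before} -> {after} bytes')  # 'name' stays object: all of its values are unique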


def concat_with_categories(df_left: pd.DataFrame, df_right: pd.DataFrame,
**kwargs) -> pd.DataFrame:
"""
@@ -65,10 +120,10 @@ def concat_with_categories(df_left: pd.DataFrame, df_right: pd.DataFrame,
... 'country': 'category'}
>>> columns = list(column_types.keys())
>>> df_left = pd.DataFrame([['John', 'red', 'China'],
['Jane', 'blue', 'Switzerland']],
... ['Jane', 'blue', 'Switzerland']],
... columns=columns).astype(column_types)
>>> df_right = pd.DataFrame([['Mary', 'yellow', 'France'],
['Fred', 'blue', 'Italy']],
... ['Fred', 'blue', 'Italy']],
... columns=columns).astype(column_types)
>>> df_left
name color country
@@ -137,8 +192,8 @@ def get_peaks(s: pd.Series, distance_scale: float = 0.04):
"""
Get the peaks of a time series having datetime as index.
Only the peaks having an heights higher than 0.75 quantile are returned
and a distance between two peaks at least `df.shape[0]*distance_scale`.
Only the peaks having a height higher than 0.75 quantile are returned
and a distance between two peaks at least ``df.shape[0]*distance_scale``.
Return the dates and the corresponding value of the peaks.
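The body of get_peaks is not touched by this commit, so the following is only a sketch of the behaviour its docstring describes, assuming an implementation based on scipy.signal.find_peaks (an assumption, not confirmed by this diff):

import pandas as pd
from scipy.signal import find_peaks

def get_peaks_sketch(s: pd.Series, distance_scale: float = 0.04):
    # Hypothetical helper: keep peaks above the 0.75 quantile, spaced by
    # at least s.shape[0] * distance_scale samples.
    peaks, _ = find_peaks(s.values,
                          height=s.quantile(0.75),
                          distance=max(1, int(s.shape[0] * distance_scale)))
    return s.index[peaks], s.values[peaks]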
3 changes: 2 additions & 1 deletion setup.cfg
@@ -11,7 +11,8 @@ description-file = README.md

[flake8]
max-line-length = 100
ignore = F841 # Local variable name is assigned to but never used.
ignore = F841, # Local variable name is assigned to but never used.
W503 # Line break occurred before a binary operator
exclude =
.git,
venv*,
30 changes: 28 additions & 2 deletions tests/test_fancy.py
@@ -9,17 +9,43 @@
import numpy as np
from numpy.testing import assert_array_equal
import pandas as pd
from pandas.api.types import CategoricalDtype
import pandas.util.testing as tm

from bff.fancy import (concat_with_categories, get_peaks, idict, mem_usage_pd,
parse_date, value_2_list, sliding_window)
from bff.fancy import (cast_to_category_pd, concat_with_categories, get_peaks, idict,
mem_usage_pd, parse_date, value_2_list, sliding_window)


class TestFancy(unittest.TestCase):
"""
Unittest of Fancy module.
"""

def test_cast_to_category_pd(self):
"""
Test of the `cast_to_category_pd` function.
"""
columns = ['name', 'age', 'country']
df = pd.DataFrame([['John', 24, 'China'],
['Mary', 20, 'China'],
['Jane', 25, 'Switzerland'],
['Greg', 23, 'China'],
['James', 28, 'China']],
columns=columns)
original_types = {'name': np.dtype('O'), 'age': np.dtype('int64'),
'country': np.dtype('O')}
self.assertDictEqual(df.dtypes.to_dict(), original_types)

df_optimized = cast_to_category_pd(df)

tm.assert_frame_equal(df, df_optimized, check_dtype=False, check_categorical=False)

country_type = CategoricalDtype(categories=['China', 'Switzerland'], ordered=False)
optimized_types = {'name': np.dtype('O'), 'age': np.dtype('int64'),
'country': country_type}
print(df_optimized.dtypes.to_dict())
self.assertDictEqual(df_optimized.dtypes.to_dict(), optimized_types)

def test_concat_with_categories(self):
"""
Test of the `concat_with_categories` function.
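To exercise the new test on its own, the standard unittest runner can be used from the repository root (command assumed from the test layout shown above):

python -m unittest tests.test_fancy.TestFancy.test_cast_to_category_pd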
