def cast_to_category_pd(df: pd.DataFrame, deep: bool = True) -> pd.DataFrame:
    """
    Automatically convert the columns that are worth storing as ``category`` dtype.

    To be cast, a column must have the ``object`` dtype (e.g. strings) and
    strictly less than 50% of unique values. Columns of an empty DataFrame
    are never cast.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame with the columns to cast.
    deep: bool, default True
        Whether or not to perform a deep copy of the original DataFrame.

    Returns
    -------
    pd.DataFrame
        Optimized copy of the input DataFrame.

    Examples
    --------
    >>> import pandas as pd
    >>> columns = ['name', 'age', 'country']
    >>> df = pd.DataFrame([['John', 24, 'China'],
    ...                    ['Mary', 20, 'China'],
    ...                    ['Jane', 25, 'Switzerland'],
    ...                    ['Greg', 23, 'China'],
    ...                    ['James', 28, 'China']],
    ...                   columns=columns)
    >>> df
        name  age      country
    0   John   24        China
    1   Mary   20        China
    2   Jane   25  Switzerland
    3   Greg   23        China
    4  James   28        China
    >>> df.dtypes
    name       object
    age         int64
    country    object
    dtype: object
    >>> df_optimized = cast_to_category_pd(df)
    >>> df_optimized.dtypes
    name         object
    age           int64
    country    category
    dtype: object
    """
    n_rows = df.shape[0]
    # Comparing against `0.5 * n_rows` rather than dividing by `n_rows`
    # gives the same result for non-empty frames and avoids a
    # ZeroDivisionError when the DataFrame has no rows.
    to_cast = {col: 'category' for col in df.columns
               if (df[col].dtype == 'object'
                   and df[col].nunique() < 0.5 * n_rows)}
    return df.copy(deep=deep).astype(to_cast)
+ W503 # Line break occurred before a binary operator exclude = .git, venv*, diff --git a/tests/test_fancy.py b/tests/test_fancy.py index a364d40..8226171 100644 --- a/tests/test_fancy.py +++ b/tests/test_fancy.py @@ -9,10 +9,11 @@ import numpy as np from numpy.testing import assert_array_equal import pandas as pd +from pandas.api.types import CategoricalDtype import pandas.util.testing as tm -from bff.fancy import (concat_with_categories, get_peaks, idict, mem_usage_pd, - parse_date, value_2_list, sliding_window) +from bff.fancy import (cast_to_category_pd, concat_with_categories, get_peaks, idict, + mem_usage_pd, parse_date, value_2_list, sliding_window) class TestFancy(unittest.TestCase): @@ -20,6 +21,31 @@ class TestFancy(unittest.TestCase): Unittest of Fancy module. """ + def test_cast_to_category_pd(self): + """ + Test of the `cast_to_category_pd` function. + """ + columns = ['name', 'age', 'country'] + df = pd.DataFrame([['John', 24, 'China'], + ['Mary', 20, 'China'], + ['Jane', 25, 'Switzerland'], + ['Greg', 23, 'China'], + ['James', 28, 'China']], + columns=columns) + original_types = {'name': np.dtype('O'), 'age': np.dtype('int64'), + 'country': np.dtype('O')} + self.assertDictEqual(df.dtypes.to_dict(), original_types) + + df_optimized = cast_to_category_pd(df) + + tm.assert_frame_equal(df, df_optimized, check_dtype=False, check_categorical=False) + + country_type = CategoricalDtype(categories=['China', 'Switzerland'], ordered=False) + optimized_types = {'name': np.dtype('O'), 'age': np.dtype('int64'), + 'country': country_type} + print(df_optimized.dtypes.to_dict()) + self.assertDictEqual(df_optimized.dtypes.to_dict(), optimized_types) + def test_concat_with_categories(self): """ Test of the `concat_with_categories` function.