feat(cast_to_category_pd): add function to cast type to category
If it is more memory efficient, the function casts the type of a column to the
category dtype.

Closes #22.
Axel Fahy committed Aug 19, 2019
1 parent a6600f3 commit c68253c
Showing 4 changed files with 93 additions and 10 deletions.
7 changes: 4 additions & 3 deletions bff/__init__.py
@@ -4,15 +4,16 @@
from ._version import get_versions

from .fancy import (
concat_with_categories, get_peaks, idict, mem_usage_pd, parse_date,
plot_history, plot_predictions, plot_series, plot_true_vs_pred,
read_sql_by_chunks, sliding_window, value_2_list
cast_to_category_pd, concat_with_categories, get_peaks, idict,
mem_usage_pd, parse_date, plot_history, plot_predictions, plot_series,
plot_true_vs_pred, read_sql_by_chunks, sliding_window, value_2_list
)

from .config import FancyConfig

# Public object of the module.
__all__ = [
'cast_to_category_pd',
'concat_with_categories',
'get_peaks',
'idict',
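Since cast_to_category_pd is both defined in bff/fancy.py and re-exported from bff/__init__.py, either import path works; a minimal usage sketch:

from bff import cast_to_category_pd            # via the package-level export added above
# or: from bff.fancy import cast_to_category_pd  # directly from the defining module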
63 changes: 59 additions & 4 deletions bff/fancy.py
@@ -26,6 +26,61 @@
LOGGER = logging.getLogger(name='bff')


def cast_to_category_pd(df: pd.DataFrame, deep: bool = True) -> pd.DataFrame:
"""
Automatically converts columns that are worth storing as the ``category`` dtype.
To be cast, a column must not be numerical and must have fewer than 50%
unique values.

Parameters
----------
df: pd.DataFrame
DataFrame with the columns to cast.
deep: bool, default True
Whether or not to perform a deep copy of the original DataFrame.

Returns
-------
pd.DataFrame
Optimized copy of the input DataFrame.

Examples
--------
>>> import pandas as pd
>>> columns = ['name', 'age', 'country']
>>> df = pd.DataFrame([['John', 24, 'China'],
... ['Mary', 20, 'China'],
... ['Jane', 25, 'Switzerland'],
... ['Greg', 23, 'China'],
... ['James', 28, 'China']],
... columns=columns)
>>> df
    name  age      country
0   John   24        China
1   Mary   20        China
2   Jane   25  Switzerland
3   Greg   23        China
4  James   28        China
>>> df.dtypes
name object
age int64
country object
dtype: object
>>> df_optimized = cast_to_category_pd(df)
>>> df_optimized.dtypes
name object
age int64
country category
dtype: object
"""
return (df.copy(deep=deep)
.astype({col: 'category' for col in df.columns
if (df[col].dtype == 'object'
and df[col].nunique() / df[col].shape[0] < 0.5)
}
)
)
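A quick way to check that the cast actually saves memory is to compare pandas' memory usage before and after the conversion; the snippet below is only an illustration (the exact byte counts depend on the platform):

import pandas as pd
from bff.fancy import cast_to_category_pd

df = pd.DataFrame({'name': ['John', 'Mary', 'Jane', 'Greg', 'James'],
                   'age': [24, 20, 25, 23, 28],
                   'country': ['China', 'China', 'Switzerland', 'China', 'China']})
before = df.memory_usage(deep=True).sum()                      # 'country' stored as object
after = cast_to_category_pd(df).memory_usage(deep=True).sum()  # 'country' now category
print(f'{before} -> {after} bytes')  # 'name' stays object: all of its values are unique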


def concat_with_categories(df_left: pd.DataFrame, df_right: pd.DataFrame,
**kwargs) -> pd.DataFrame:
"""
@@ -65,10 +120,10 @@ def concat_with_categories(df_left: pd.DataFrame, df_right: pd.DataFrame,
... 'country': 'category'}
>>> columns = list(column_types.keys())
>>> df_left = pd.DataFrame([['John', 'red', 'China'],
['Jane', 'blue', 'Switzerland']],
... ['Jane', 'blue', 'Switzerland']],
... columns=columns).astype(column_types)
>>> df_right = pd.DataFrame([['Mary', 'yellow', 'France'],
['Fred', 'blue', 'Italy']],
... ['Fred', 'blue', 'Italy']],
... columns=columns).astype(column_types)
>>> df_left
name color country
@@ -137,8 +192,8 @@ def get_peaks(s: pd.Series, distance_scale: float = 0.04):
"""
Get the peaks of a time series having datetime as index.
Only the peaks having an heights higher than 0.75 quantile are returned
and a distance between two peaks at least `df.shape[0]*distance_scale`.
Only the peaks having a height higher than 0.75 quantile are returned
and a distance between two peaks at least ``df.shape[0]*distance_scale``.
Return the dates and the corresponding value of the peaks.
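The body of get_peaks is not touched by this commit, so the following is only a sketch of the behaviour its docstring describes, assuming an implementation based on scipy.signal.find_peaks (an assumption, not confirmed by this diff):

import pandas as pd
from scipy.signal import find_peaks

def get_peaks_sketch(s: pd.Series, distance_scale: float = 0.04):
    # Hypothetical helper: keep peaks above the 0.75 quantile, spaced by
    # at least s.shape[0] * distance_scale samples.
    peaks, _ = find_peaks(s.values,
                          height=s.quantile(0.75),
                          distance=max(1, int(s.shape[0] * distance_scale)))
    return s.index[peaks], s.values[peaks]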
3 changes: 2 additions & 1 deletion setup.cfg
@@ -11,7 +11,8 @@ description-file = README.md

[flake8]
max-line-length = 100
ignore = F841 # Local variable name is assigned to but never used.
ignore = F841, # Local variable name is assigned to but never used.
W503 # Line break occurred before a binary operator
exclude =
.git,
venv*,
30 changes: 28 additions & 2 deletions tests/test_fancy.py
@@ -9,17 +9,43 @@
import numpy as np
from numpy.testing import assert_array_equal
import pandas as pd
from pandas.api.types import CategoricalDtype
import pandas.util.testing as tm

from bff.fancy import (concat_with_categories, get_peaks, idict, mem_usage_pd,
parse_date, value_2_list, sliding_window)
from bff.fancy import (cast_to_category_pd, concat_with_categories, get_peaks, idict,
mem_usage_pd, parse_date, value_2_list, sliding_window)


class TestFancy(unittest.TestCase):
"""
Unittest of Fancy module.
"""

def test_cast_to_category_pd(self):
"""
Test of the `cast_to_category_pd` function.
"""
columns = ['name', 'age', 'country']
df = pd.DataFrame([['John', 24, 'China'],
['Mary', 20, 'China'],
['Jane', 25, 'Switzerland'],
['Greg', 23, 'China'],
['James', 28, 'China']],
columns=columns)
original_types = {'name': np.dtype('O'), 'age': np.dtype('int64'),
'country': np.dtype('O')}
self.assertDictEqual(df.dtypes.to_dict(), original_types)

df_optimized = cast_to_category_pd(df)

tm.assert_frame_equal(df, df_optimized, check_dtype=False, check_categorical=False)

country_type = CategoricalDtype(categories=['China', 'Switzerland'], ordered=False)
optimized_types = {'name': np.dtype('O'), 'age': np.dtype('int64'),
'country': country_type}
print(df_optimized.dtypes.to_dict())
self.assertDictEqual(df_optimized.dtypes.to_dict(), optimized_types)

def test_concat_with_categories(self):
"""
Test of the `concat_with_categories` function.
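To exercise the new test on its own, the standard unittest runner can be used from the repository root (command assumed from the test layout shown above):

python -m unittest tests.test_fancy.TestFancy.test_cast_to_category_pd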
