In [2]:
import pandas as pd
from rich import print as rprint
# Categorical Series of size labels, created through the 'category' dtype alias.
cseries = pd.Series(data=['s', 'm', 'l'], dtype='category')
rprint(cseries)


In [3]:
import pyarrow as pa
# Arrow dictionary dtype: int64 indices that point into utf8 values.
dict_type = pd.ArrowDtype(
    pa.dictionary(index_type=pa.int64(), value_type=pa.utf8())
)
cseries2 = pd.Series(data=['m', 'l', 'xs', 's', 'xl'], dtype=dict_type)
rprint(cseries2)

PyArrow's native categorical type is the dictionary type (`pa.dictionary`).

In [4]:
# Write a one-column frame (categorical column named 'size') to a Feather file.
(
    pd.Series(['sm', 'm', 'l'], dtype='category')
    .rename('size')
    .to_frame()
    .to_feather('./cat.ft')
)

A Feather file with a categorical column is read back as a dictionary type when the pyarrow dtype backend is used.

In [5]:
# Load the file with Arrow-backed dtypes and show the dtype of the 'size' column.
pd.read_feather('./cat.ft', dtype_backend='pyarrow')['size'].dtype

dictionary<values=string, indices=int8, ordered=0>[pyarrow]

Returning to our initial categorical series: its categories have no natural ordering...

In [6]:
# Show whether the categorical dtype is flagged as ordered.
rprint(cseries.dtype.ordered)

Below, we limit the categories to 's', 'm', and 'l', but the data has values not in those categories. Converting the data to a category type replaces those extra values with NaN.

In [7]:
# Ordered categorical dtype restricted to three sizes: s < m < l.
size_type = pd.CategoricalDtype(categories=['s', 'm', 'l'], ordered=True)

# Raw string data; 'xs' and 'xl' are not among the categories declared above.
s2 = pd.Series(data=['m', 'l', 'xs', 's', 'xl'], dtype='string[pyarrow]')

# Cast the strings to the restricted categorical dtype and show the result.
s3 = s2.astype(size_type)
rprint(s3)

If we have ordered categories, we can make comparisons on them:

In [8]:
# Element-wise "greater than 's'" using the category order of s3.
rprint(s3.gt('s'))

`reorder_categories` expects exactly the existing categories in a new order. If you leave some out, or include values that are not categories yet (as below), pandas raises a ValueError.

In [10]:
cseries_reorder = pd.Series(['s','m','l'], dtype='category')

# reorder_categories() only accepts a rearrangement of the existing categories.
# 'xs' and 'xl' are not categories of this Series yet, so the call raises
# ValueError. The error is the point of this cell, so catch and display it
# rather than letting it abort a "Restart & Run All" before the later cells run.
try:
    cseries_reorder.cat.reorder_categories(['xs','s','m','l','xl'], ordered=True)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: items in new_categories are not the same as in old categories

In [14]:
# List the categories currently attached to the Series.
rprint(cseries_reorder.dtype.categories)

In [15]:
# Add the two missing sizes as categories first, then apply the full ordered sequence.
(
    cseries_reorder.cat.add_categories(['xs', 'xl'])
    .cat.reorder_categories(['xs', 's', 'm', 'l', 'xl'], ordered=True)
)

0    s
1    m
2    l
dtype: category
Categories (5, object): ['xs' < 's' < 'm' < 'l' < 'xl']