In [1]:
import numpy as np
import pandas as pd

- **Categorical variable** -- Variable which takes on a limited, usually fixed, number of possible values

- `Categorical` -- Data type corresponding to categorical variables
  - May have an order
  - Numerical operations may not be performed

Why use `Categorical`?
- Save memory over using a string variable
- Enforce logical order over string variables that is different from lexicographical order
- Use other libraries that deal with categorical data differently than string data

# Object creation

## `Series` creation

In [2]:
# Create Categorical Series
s = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [3]:
# Create DataFrame with non-categorical column
df = pd.DataFrame({'A': ['a', 'b', 'c', 'a']})
df

Unnamed: 0,A
0,a
1,b
2,c
3,a


In [4]:
# Create Categorical Series from existing non-Categorical Series
df['B'] = df['A'].astype('category')
df

Unnamed: 0,A,B
0,a,a
1,b,b
2,c,c
3,a,a


In [5]:
# Show that DataFrame has a non-categorical and Categorical column
df.dtypes

A      object
B    category
dtype: object

In [6]:
# Create DataFrame with column of values
df = pd.DataFrame({'value': np.random.randint(0, 100, 20)})
df

Unnamed: 0,value
0,20
1,50
2,7
3,57
4,37
5,4
6,58
7,31
8,48
9,49


In [7]:
# Create categories for the values
categories = ['{0} - {1}'.format(i, i + 9) for i in range(0, 100, 10)]
categories

['0 - 9',
 '10 - 19',
 '20 - 29',
 '30 - 39',
 '40 - 49',
 '50 - 59',
 '60 - 69',
 '70 - 79',
 '80 - 89',
 '90 - 99']

In [8]:
# Create Categorical column by binning values with cut()
df['group'] = pd.cut(df['value'], range(0, 105, 10), right=False, labels=categories)
df

Unnamed: 0,value,group
0,20,20 - 29
1,50,50 - 59
2,7,0 - 9
3,57,50 - 59
4,37,30 - 39
5,4,0 - 9
6,58,50 - 59
7,31,30 - 39
8,48,40 - 49
9,49,40 - 49


In [9]:
# Create raw Categorical data
raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], categories=['b', 'c', 'd'], ordered=False)
raw_cat

[NaN, 'b', 'c', NaN]
Categories (3, object): ['b', 'c', 'd']

In [10]:
# Create Categorical Series using Categorical data
s = pd.Series(raw_cat)
s

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): ['b', 'c', 'd']

## `DataFrame` creation

In [11]:
# Create Categorical DataFrame
df = pd.DataFrame(
    {
        'A': list('abca'),
        'B': list('bccd')
    },
    dtype='category'
)

df.dtypes

A    category
B    category
dtype: object

In [12]:
# Check categories for column 'A' of Categorical DataFrame
df['A']

0    a
1    b
2    c
3    a
Name: A, dtype: category
Categories (3, object): ['a', 'b', 'c']

In [13]:
# Check categories for column 'B' of Categorical DataFrame
df['B']

0    b
1    c
2    c
3    d
Name: B, dtype: category
Categories (3, object): ['b', 'c', 'd']

In [14]:
# Create non-categorical DataFrame
df = pd.DataFrame({
    'A': list('abca'),
    'B': list('bccd')
})

df.dtypes

A    object
B    object
dtype: object

In [15]:
# Convert non-categorical DataFrame to Categorical DataFrame
df_cat = df.astype('category')
df_cat.dtypes

A    category
B    category
dtype: object

## Controlling behavior
Default behavior when creating `Series` or `DataFrame` with `dtype='category'`:
- Categories are inferred from data
- Categories are unordered

Control behavior using `CategoricalDtype`

In [16]:
# Create ordered Categorical Series
from pandas.api.types import CategoricalDtype
s = pd.Series(['a', 'b', 'c', 'a'], dtype=CategoricalDtype(['b', 'c', 'd'], ordered=True))
s

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): ['b' < 'c' < 'd']

In [17]:
# Create ordered Categorical DataFrame where categories are consistent across columns
df = pd.DataFrame({
    'A': list('abca'),
    'B': list('bccd')
})

df_cat = df.astype(CategoricalDtype(list('abcd'), ordered=True))
df_cat.dtypes

A    category
B    category
dtype: object

In [18]:
# Show categorical data in column 'A'
df_cat['A']

0    a
1    b
2    c
3    a
Name: A, dtype: category
Categories (4, object): ['a' < 'b' < 'c' < 'd']

In [19]:
# Show categorical data in column 'B'
df_cat['B']

0    b
1    c
2    c
3    d
Name: B, dtype: category
Categories (4, object): ['a' < 'b' < 'c' < 'd']

In [20]:
# Create Categorial Series from codes
splitter = np.random.choice([0, 1], 5, p=[0.5, 0.5])
s = pd.Series(pd.Categorical.from_codes(splitter, categories=['train', 'test']))
s

0     test
1    train
2    train
3    train
4     test
dtype: category
Categories (2, object): ['train', 'test']

## Regaining original data

In [21]:
# Create string Series
s = pd.Series(['a', 'b', 'c', 'a'])
s

0    a
1    b
2    c
3    a
dtype: object

In [22]:
# Convert string Series to Categorical Series
s2 = s.astype('category')
s2

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [23]:
# Convert Categorical Series to string Series
s2.astype(str)

0    a
1    b
2    c
3    a
dtype: object

In [24]:
# Convert Categorical Series to string ndarray
np.asarray(s2)

array(['a', 'b', 'c', 'a'], dtype=object)

# `CategoricalDtype`
A categorical type is fully defined by:
- `categories` -- A sequence of unique, non-NaN values
- `ordered` -- A boolean indicating whether the categories are ordered or not

In [25]:
# Create unordered Categorical object
CategoricalDtype(['a', 'b', 'c'])

CategoricalDtype(categories=['a', 'b', 'c'], ordered=False, categories_dtype=object)

In [26]:
# Create ordered Categorical object
CategoricalDtype(['a', 'b', 'c'], ordered=True)

CategoricalDtype(categories=['a', 'b', 'c'], ordered=True, categories_dtype=object)

In [27]:
# Create empty, unordered Categorical object
CategoricalDtype()

CategoricalDtype(categories=None, ordered=False, categories_dtype=None)

## Equality semantics

In [28]:
# Create unordered Category object
c1 = CategoricalDtype(['a', 'b', 'c'], ordered=False)
c1

CategoricalDtype(categories=['a', 'b', 'c'], ordered=False, categories_dtype=object)

In [29]:
# Check equality with another unordered Category object
c1 == CategoricalDtype(['a', 'b', 'c'], ordered=False)

True

In [30]:
# Check equality with another unordered Category object
c1 == CategoricalDtype(['b', 'a', 'c'], ordered=False)

True

In [31]:
# Check equality with an ordered Category object
c1 == CategoricalDtype(['a', 'b', 'c'], ordered=True)

False

# Description

In [32]:
# Create Categorical data
cat = pd.Categorical(['a', 'c', 'c', np.nan], categories=['b', 'a', 'c'])
cat

['a', 'c', 'c', NaN]
Categories (3, object): ['b', 'a', 'c']

In [33]:
# Create DataFrame with Categorical data
df = pd.DataFrame({
    'cat': cat,
    's': ['a', 'c', 'c', np.nan]
})

df

Unnamed: 0,cat,s
0,a,a
1,c,c
2,c,c
3,,


In [34]:
# Describe DataFrame with Categorical data
df.describe()


Unnamed: 0,cat,s
count,3,3
unique,2,2
top,c,c
freq,2,2


In [35]:
# Describe Categorical column in DataFrame
df['cat'].describe()

count     3
unique    2
top       c
freq      2
Name: cat, dtype: object

# Working with categories

In [36]:
# Create Categorical Series
s = pd.Series(['b', 'a', 'c', 'a'], dtype='category')
s

0    b
1    a
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [37]:
# Check categorical information of Categorical Series
s.cat.categories, s.cat.ordered

(Index(['a', 'b', 'c'], dtype='object'), False)

In [38]:
# Note that the result of s.cat.categories and the result of s.unique() may not be the same
s.unique()

['b', 'a', 'c']
Categories (3, object): ['a', 'b', 'c']

## Renaming categories

In [39]:
# Create Categorical Series
s = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [40]:
# Rename categories in Categorical Series with a list
new_categories = ['Group %s' % g for g in s.cat.categories]
s.cat.rename_categories(new_categories)

0    Group a
1    Group b
2    Group c
3    Group a
dtype: category
Categories (3, object): ['Group a', 'Group b', 'Group c']

In [41]:
# Rename categories in Categorical Series with a dict
s.cat.rename_categories({'a': 'x', 'b': 'y', 'c': 'z'})


0    x
1    y
2    z
3    x
dtype: category
Categories (3, object): ['x', 'y', 'z']

In [42]:
# Attempt to rename categories to have non-unique names
try:
    s.cat.rename_categories(['x', 'x', 'x'])
except ValueError as e:
    print("ValueError:", e)

ValueError: Categorical categories must be unique


In [43]:
# Attempt to rename categories to have NaN values
try:
    s.cat.rename_categories(['x', 'y', np.nan])
except ValueError as e:
    print("ValueError:", e)

ValueError: Categorical categories cannot be null


## Appending new categories

In [44]:
# Add new categories to Categorical Series
s = s.cat.add_categories(['d'])
s

0    a
1    b
2    c
3    a
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

## Removing categories

In [45]:
# Remove categories from Categorical Series
s = s.cat.remove_categories(['d'])
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

## Removing unused categories

In [46]:
# Create Categorical Series with unusued categories
s = pd.Series(pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c', 'd']))
s

0    a
1    b
2    a
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [47]:
# Remove unused categories from Categorical Series
s.cat.remove_unused_categories()

0    a
1    b
2    a
dtype: category
Categories (2, object): ['a', 'b']

## Setting categories

In [48]:
# Create Categorical Series
s = pd.Series(['one', 'two', 'four', '-'], dtype='category')
s

0     one
1     two
2    four
3       -
dtype: category
Categories (4, object): ['-', 'four', 'one', 'two']

In [49]:
# Add and remove categories at the same time by setting categories
s = s.cat.set_categories(['one', 'two', 'three', 'four'])
s

0     one
1     two
2    four
3     NaN
dtype: category
Categories (4, object): ['one', 'two', 'three', 'four']

# Sorting and order

In [50]:
# Create an unordered Categorical Series
s = pd.Series(pd.Categorical(['a', 'b', 'c', 'a'], ordered=False))
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [51]:
# Sort unordered Categorical Series
s = s.sort_values()
s

0    a
3    a
1    b
2    c
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [52]:
# Attempt to get min, max of unordered Categorical Series
try:
    s.min(), s.max()
except TypeError as e:
    print("TypeError:", e)

TypeError: Categorical is not ordered for operation min
you can use .as_ordered() to change the Categorical to an ordered one



In [53]:
# Create an ordered Categorical Series
s = pd.Series(pd.Categorical(['a', 'b', 'c', 'a'], ordered=True))
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a' < 'b' < 'c']

In [54]:
# Sort ordered Categorical Series
s = s.sort_values()
s

0    a
3    a
1    b
2    c
dtype: category
Categories (3, object): ['a' < 'b' < 'c']

In [55]:
# Get min, max of ordered Categorical Series
s.min(), s.max()

('a', 'c')

In [56]:
# Change unordered Categorical Series to ordered
s.cat.as_ordered()

0    a
3    a
1    b
2    c
dtype: category
Categories (3, object): ['a' < 'b' < 'c']

In [57]:
# Change ordered Categorical Series to unordered
s.cat.as_unordered()

0    a
3    a
1    b
2    c
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [58]:
# Create ordered Categorical Series with numeric categories
s = pd.Series([1, 2, 3, 1], dtype='category')
s = s.cat.set_categories([2, 3, 1], ordered=True)
s

0    1
1    2
2    3
3    1
dtype: category
Categories (3, int64): [2 < 3 < 1]

In [59]:
# Categorical values are sorted by category ordered as specified in Category object
# Not sorted by numeric or lexicographical order
s.sort_values()

1    2
2    3
0    1
3    1
dtype: category
Categories (3, int64): [2 < 3 < 1]

In [60]:
# Get min, max of ordered Categorical Series
s.min(), s.max()

(2, 1)

## Reordering

In [61]:
# Create ordered Categorical Series with numeric categories
s = pd.Series([1, 2, 3, 1], dtype='category')
s = s.cat.set_categories([2, 3, 1], ordered=True)
s

0    1
1    2
2    3
3    1
dtype: category
Categories (3, int64): [2 < 3 < 1]

In [62]:
# Sort ordered Categorical Series
s.sort_values()

1    2
2    3
0    1
3    1
dtype: category
Categories (3, int64): [2 < 3 < 1]

In [63]:
# Reorder categories in Categorical Series
s = s.cat.reorder_categories([3, 2, 1], ordered=True)
s

0    1
1    2
2    3
3    1
dtype: category
Categories (3, int64): [3 < 2 < 1]

In [64]:
# Sort reordered Categorical Series
s.sort_values()

2    3
1    2
0    1
3    1
dtype: category
Categories (3, int64): [3 < 2 < 1]

## Multi column sorting

In [65]:
# Create DataFrame with Categorical data
dfs = pd.DataFrame({
    'A': pd.Categorical(
        list('bbeebbaa'),
        categories=['e', 'a', 'b'],
        ordered=True
    ),
    'B': [1, 2, 1, 2, 2, 1, 2, 1]
})

dfs

Unnamed: 0,A,B
0,b,1
1,b,2
2,e,1
3,e,2
4,b,2
5,b,1
6,a,2
7,a,1


In [66]:
# Sort columns of DataFrame
# Categorical column will be sorted by category order
dfs.sort_values(by=['A', 'B'])

Unnamed: 0,A,B
2,e,1
3,e,2
7,a,1
6,a,2
0,b,1
5,b,1
1,b,2
4,b,2


In [67]:
# Reorder categories in Categorical column of DataFrame
dfs['A'] = dfs['A'].cat.reorder_categories(['a', 'b', 'e'])
dfs

Unnamed: 0,A,B
0,b,1
1,b,2
2,e,1
3,e,2
4,b,2
5,b,1
6,a,2
7,a,1


In [68]:
# Sort columns of DataFrame
# Categorical column will be sorted by reordered category order
dfs.sort_values(by=['A', 'B'])

Unnamed: 0,A,B
7,a,1
6,a,2
0,b,1
5,b,1
1,b,2
4,b,2
2,e,1
3,e,2


# Comparisons
Categorical data can sometimes be compared with other data:
- Equality to list objects of the same length as the categorical data
- Any comparison between categorical data and categorical `Series` that are both ordered and have the same category valuse
- Any comparison between categorical data and a scalar

In [69]:
# Create ordered Categorical Series
cat = pd.Series([1, 2, 3]).astype(CategoricalDtype(categories=[3, 2, 1], ordered=True))
cat

0    1
1    2
2    3
dtype: category
Categories (3, int64): [3 < 2 < 1]

In [70]:
# Create ordered Categorical Series
cat_base = pd.Series([2, 2, 2]).astype(CategoricalDtype(categories=[3, 2, 1], ordered=True))
cat_base

0    2
1    2
2    2
dtype: category
Categories (3, int64): [3 < 2 < 1]

In [71]:
# Create ordered Categorical Series
cat_base2 = pd.Series([2, 2, 2]).astype(CategoricalDtype(ordered=True))
cat_base2

0    2
1    2
2    2
dtype: category
Categories (1, int64): [2]

In [72]:
# Compare elements of two Categorical Series with the same categories and ordering
cat > cat_base

0     True
1    False
2    False
dtype: bool

In [73]:
# Compare elements of Categorical Series with scalar
cat > 2

0     True
1    False
2    False
dtype: bool

In [74]:
# Compare equality of elements between two Categorical Series with the same categories and ordering
cat == cat_base

0    False
1     True
2    False
dtype: bool

In [75]:
# Compare equality of elements in Categorical Series with elements in list of same length
cat == np.array([1, 2, 3])

0    True
1    True
2    True
dtype: bool

In [76]:
# Compare equality of elements in Categorical Series with scalar
cat == 2

0    False
1     True
2    False
dtype: bool

In [77]:
# Attempt to compare elements of two Categorical Series with different categories
try:
    cat > cat_base2
except TypeError as e:
    print("TypeError:", e)

TypeError: Categoricals can only be compared if 'categories' are the same.


In [78]:
# Compare elements of two Categorical Series with different categories by converting both to an array
# Then the elements are compared based on underlying type order, not category order
np.asarray(cat) > np.asarray(cat_base2)

array([False, False,  True])

# Operations

In [79]:
# Create Categorical Series
s = pd.Series(pd.Categorical(['a', 'b', 'c', 'c'], categories=['c', 'a', 'b', 'd']))
s

0    a
1    b
2    c
3    c
dtype: category
Categories (4, object): ['c', 'a', 'b', 'd']

In [80]:
# Count occurrences of each category in Categorical Series
s.value_counts()

c    2
a    1
b    1
d    0
Name: count, dtype: int64

In [81]:
# Create DataFrame with Categorical index
df = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6]
    ],
    columns=pd.MultiIndex.from_arrays([
        ['A', 'B', 'B'],
        pd.Categorical(['One', 'One', 'Two'], categories=['One', 'Two', 'Three'], ordered=True)
    ])
).T

df

Unnamed: 0,Unnamed: 1,0,1
A,One,1,4
B,One,2,5
B,Two,3,6


In [82]:
# Sum along categorical columns, including unobserved categories
df.groupby(level=1, observed=False).sum()

Unnamed: 0,0,1
One,3,9
Two,3,6
Three,0,0


In [83]:
# Create DataFrame with categorical data
df = pd.DataFrame({
    'cats': pd.Categorical(['a', 'b', 'b', 'b', 'c', 'c', 'c'], categories=['a', 'b', 'c', 'd']),
    'values': [1, 2, 2, 2, 3, 4, 5]
})

df

Unnamed: 0,cats,values
0,a,1
1,b,2
2,b,2
3,b,2
4,c,3
5,c,4
6,c,5


In [84]:
# Average values by category, including unobserved categories
df.groupby('cats', observed=False).mean()

Unnamed: 0_level_0,values
cats,Unnamed: 1_level_1
a,1.0
b,2.0
c,4.0
d,


In [85]:
# Create DataFrame with categorical data
df2 = pd.DataFrame({
    'cats': pd.Categorical(['a', 'a', 'b', 'b'], categories=['a', 'b', 'c']),
    'B': ['c', 'd', 'c', 'd'],
    'values': [1, 2, 3, 4]
})

df2

Unnamed: 0,cats,B,values
0,a,c,1
1,a,d,2
2,b,c,3
3,b,d,4


In [86]:
# Average values category and value of ['B'], including unobserved categories
df2.groupby(['cats', 'B'], observed=False).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,values
cats,B,Unnamed: 2_level_1
a,c,1.0
a,d,2.0
b,c,3.0
b,d,4.0
c,c,
c,d,


In [87]:
# Create DataFrame with categorical data
df = pd.DataFrame({
    'A': pd.Categorical(['a', 'a', 'b', 'b'], categories=['a', 'b', 'c']),
    'B': ['c', 'd', 'c', 'd'],
    'values': [1, 2, 3, 4]
})

df

Unnamed: 0,A,B,values
0,a,c,1
1,a,d,2
2,b,c,3
3,b,d,4


In [88]:
# Pivot DataFrame, including unobserved categories
pd.pivot_table(df, values='values', index=['A', 'B'], observed=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,values
A,B,Unnamed: 2_level_1
a,c,1.0
a,d,2.0
b,c,3.0
b,d,4.0


# Data munging

## Getting

In [89]:
# Create DataFrame with categorical data
idx = pd.Index(['h', 'i', 'j', 'k', 'l', 'm', 'n'])
cats = pd.Series(['a', 'b', 'b', 'b', 'c', 'c', 'c'], dtype='category', index=idx)
values = [1, 2, 2, 2, 3, 4, 5]

df = pd.DataFrame({'cats': cats, 'values': values}, index=idx)
df

Unnamed: 0,cats,values
h,a,1
i,b,2
j,b,2
k,b,2
l,c,3
m,c,4
n,c,5


In [90]:
# Get DataFrame rows 3 through 4, all columns
df.iloc[2:4, :]

Unnamed: 0,cats,values
j,b,2
k,b,2


In [91]:
# Show that resulting DataFrame has Categorical data
df.iloc[2:4, :].dtypes

cats      category
values       int64
dtype: object

In [92]:
# Get DataFrame rows 'h' through 'j', column 'cats'
df.loc['h':'j', 'cats']

h    a
i    b
j    b
Name: cats, dtype: category
Categories (3, object): ['a', 'b', 'c']

In [93]:
# Show that resulting Series has Categorical data
df.loc['h':'j', 'cats'].dtypes

CategoricalDtype(categories=['a', 'b', 'c'], ordered=False, categories_dtype=object)

In [94]:
# Get rows of DataFrame where value of 'cats' is 'b'
df[df['cats'] == 'b']

Unnamed: 0,cats,values
i,b,2
j,b,2
k,b,2


In [95]:
# Show that resulting DataFrame has Categorical data
df[df['cats'] == 'b'].dtypes

cats      category
values       int64
dtype: object

In [96]:
# Get row 'h' of DataFrame
# Notice that resulting Series does *not* have Categorical data
df.loc['h', :]

cats      a
values    1
Name: h, dtype: object

In [97]:
# Get element at row 0, column 0 of DataFrame
# Notice that resulting scalar is not Categorical data
df.iat[0, 0]

'a'

In [98]:
# Get element at row 'h', column 'cats' of DataFrame in a way that returns Categorical data
df.loc[['h'], 'cats']

h    a
Name: cats, dtype: category
Categories (3, object): ['a', 'b', 'c']

## String and datetime accessors

In [99]:
# Create categorical Series from string Series
str_s = pd.Series(list('aabb'))
str_cat = str_s.astype('category')
str_cat

0    a
1    a
2    b
3    b
dtype: category
Categories (2, object): ['a', 'b']

In [100]:
# Perform string function on categorical data
str_cat.str.upper()

0    A
1    A
2    B
3    B
dtype: object

In [101]:
# Create categorical Series from datetime Series
date_s = pd.Series(pd.date_range('11/28/2024', periods=5))
date_cat = date_s.astype('category')
date_cat

0   2024-11-28
1   2024-11-29
2   2024-11-30
3   2024-12-01
4   2024-12-02
dtype: category
Categories (5, datetime64[ns]): [2024-11-28, 2024-11-29, 2024-11-30, 2024-12-01, 2024-12-02]

In [102]:
# Perform datetime function on categorical data
date_cat.dt.day_name()

0    Thursday
1      Friday
2    Saturday
3      Sunday
4      Monday
dtype: object

## Setting

In [103]:
# Create DataFrame with categorical data
idx = pd.Index(['h', 'i', 'j', 'k', 'l', 'm', 'n'])
cats = pd.Categorical(['a', 'a', 'a', 'a', 'a', 'a', 'a'], categories=['a', 'b'])
values = [1, 1, 1, 1, 1, 1, 1]

df = pd.DataFrame({'cats': cats, 'values': values}, index=idx)
df

Unnamed: 0,cats,values
h,a,1
i,a,1
j,a,1
k,a,1
l,a,1
m,a,1
n,a,1


In [104]:
# Modify rows 3 through 4 of DataFrame,
# setting the values of 'cats' to a category that is in the list of categories but unusued
df.iloc[2:4, :] = [['b', 2], ['b', 2]]
df

Unnamed: 0,cats,values
h,a,1
i,a,1
j,b,2
k,b,2
l,a,1
m,a,1
n,a,1


In [105]:
# Attempt to modify rows 3 through 4 of DataFrame,
# setting the values of 'cats' to a category that is not in the list of categories
try:
    df.iloc[2:4, :] = [['c', 2], ['c', 2]]
except TypeError as e:
    print("TypeError:", e)

TypeError: Cannot setitem on a Categorical with a new category, set the categories first


In [106]:
# Change values of 'cats' in rows 'j' through 'k' using categorical data with the same overall categories as 'cats'
df.loc['j':'k', 'cats'] = pd.Categorical(['a', 'a'], categories=['a', 'b'])
df

Unnamed: 0,cats,values
h,a,1
i,a,1
j,a,2
k,a,2
l,a,1
m,a,1
n,a,1


In [107]:
# Attempt to change values of 'cats' in rows 'j' through 'k' using categorical data with different categories to 'cats'
try:
    df.loc['j':'k', 'cats'] = pd.Categorical(['b', 'b'], categories=['a', 'b', 'c'])
except TypeError as e:
    print("TypeError:", e)

TypeError: Cannot set a Categorical with another, without identical categories


## Merging / concatenation

In [108]:
# Create categorical Series
s1 = pd.Series(['a', 'b'], dtype='category')
s1

0    a
1    b
dtype: category
Categories (2, object): ['a', 'b']

In [109]:
# Create categorical Series
s2 = pd.Series(['a', 'b', 'a'], dtype='category')
s2

0    a
1    b
2    a
dtype: category
Categories (2, object): ['a', 'b']

In [110]:
# Concatenate two categorical Series with the same categories
# Result is categorical Series with same categories
pd.concat([s1, s2])

0    a
1    b
0    a
1    b
2    a
dtype: category
Categories (2, object): ['a', 'b']

In [111]:
# Create categorical Series with different categories
s3 = pd.Series(['b', 'c'], dtype='category')
s3

0    b
1    c
dtype: category
Categories (2, object): ['b', 'c']

In [112]:
# Concatenate two categorical Series with different categories
# Result is object Series
pd.concat([s1, s3])

0    a
1    b
0    b
1    c
dtype: object

In [113]:
# Concatenate two categorical Series with different categories
# Result is object Series, which can be converted to categorical Series
pd.concat([s1, s3]).astype('category')

0    a
1    b
0    b
1    c
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [114]:
# Use union_categoricals to automatically generate a new Categorical
# from the result of concatenating two categorical Series with different categories
# Note that this does not create a new categorical *Series*!
from pandas.api.types import union_categoricals
union_categoricals([s1, s3])

['a', 'b', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']

In [115]:
# Create categorical Series with integer categories
int_cats = pd.Series([1, 2], dtype='category')
int_cats

0    1
1    2
dtype: category
Categories (2, int64): [1, 2]

In [116]:
# Create categorical Series with float categories
float_cats = pd.Series([3.0, 4.0], dtype='category')
float_cats

0    3.0
1    4.0
dtype: category
Categories (2, float64): [3.0, 4.0]

In [117]:
# Concatenate two categorical Series with integer and float categories
# Result is float Series
pd.concat([int_cats, float_cats])

0    1.0
1    2.0
0    3.0
1    4.0
dtype: float64

## Unioning

In [118]:
# Combine Categoricals with different categories using union_categoricals
a = pd.Categorical(['b', 'c'])
b = pd.Categorical(['a', 'b'])
union_categoricals([a, b])

['b', 'c', 'a', 'b']
Categories (3, object): ['b', 'c', 'a']

In [119]:
# Combine Categoricals with different categories and sort new categories
union_categoricals([a, b], sort_categories=True)

['b', 'c', 'a', 'b']
Categories (3, object): ['a', 'b', 'c']

In [120]:
# Combine two ordered Categoricals with the same categories
a = pd.Categorical(['a', 'b'], ordered=True)
b = pd.Categorical(['a', 'b', 'a'], ordered=True)
union_categoricals([a, b])

['a', 'b', 'a', 'b', 'a']
Categories (2, object): ['a' < 'b']

In [121]:
# Attempt to combine two ordered Categoricals with different categories
a = pd.Categorical(['a', 'b'], ordered=True)
b = pd.Categorical(['b', 'c'], ordered=True)
try:
    union_categoricals([a, b])
except TypeError as e:
    print("TypeError:", e)

TypeError: to union ordered Categoricals, all categories must be the same


In [122]:
# Combine two ordered Categoricals with different categories by ignoring orderings
union_categoricals([a, b], ignore_order=True)

['a', 'b', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']

In [123]:
# Create Categorical data where 'b' is coded to 0
c1 = pd.Categorical(['b', 'c'])
c1, c1.codes

(['b', 'c']
 Categories (2, object): ['b', 'c'],
 array([0, 1], dtype=int8))

In [124]:
# Create Categorical data where 'b' is coded to 1
c2 = pd.Categorical(['a', 'b'])
c2, c2.codes

(['a', 'b']
 Categories (2, object): ['a', 'b'],
 array([0, 1], dtype=int8))

In [125]:
# Combine two Categoricals where 'b' is coded to different integers
# Note that 'b' is coded to 0 in the result
c3 = union_categoricals([c1, c2])
c3, c3.codes

(['b', 'c', 'a', 'b']
 Categories (3, object): ['b', 'c', 'a'],
 array([0, 1, 2, 0], dtype=int8))

# Getting data in/out

# Missing data

In [126]:
# Create categorical Series with missing values
# Note that the missing value occurs in the *values* of the categorical data, not in the list of categories!
# np.NaN cannot be a category!
s = pd.Series(['a', 'b', np.nan, 'a'], dtype='category')
s

0      a
1      b
2    NaN
3      a
dtype: category
Categories (2, object): ['a', 'b']

In [127]:
# Missing values do not have a category, and are coded as -1
s.cat.categories, s.cat.codes

(Index(['a', 'b'], dtype='object'),
 0    0
 1    1
 2   -1
 3    0
 dtype: int8)

# Differences to R's `factor`

# Gotchas

## Memory usage
- `Categorical` memory usage is proportional to **number of categories + data length**
- `object` memory usage is proportional to **data length**

In [None]:
# Create object Series with 2000 strings total, 2 unique strings
s = pd.Series(['foo', 'bar'] * 1000)

# Check memory usage of object Series
s.memory_usage()

16132

In [None]:
# Convert object Series to categorical Series with 2000 values total, 2 categories
s_cat = s.astype('category')

# Check categories
print(s_cat.cat.categories)

# Check memory usage of categorical Series
# This is actually much less, because the data can be encoded as integers using category codes
# Only two copies of the original strings need to be stored!
s_cat.memory_usage()

Index(['bar', 'foo'], dtype='object')


2256

In [131]:
# Create object Series with 2000 strings total, 2000 unique strings
s = pd.Series([str(i) for i in range(2000)])

# Check memory usage of object Series
s.memory_usage()

16132

In [None]:
# Convert object Series to categorical Series with 2000 values total, 2000 categories
s_cat = s.astype('category')

# Check memory usage of categorical Series
# This is actually much more, because each string need to be stored + its integer encoding!
s_cat.memory_usage()

86220

## `Categorical` is not a `numpy` array

## dtype in apply

## `Categorical` index

## Side effects