# Python - Categorical Data with Pandas

In [1]:
import numpy as np
import pandas as pd

In [2]:
# A small Series of color labels that deliberately contains repeats.
color_labels = ['green', 'yellow', 'black', 'blue', 'green', 'red', 'yellow']
colors = pd.Series(color_labels)
print(colors)
# Distinct labels, in order of first appearance.
colors.unique()

0     green
1    yellow
2     black
3      blue
4     green
5       red
6    yellow
dtype: object


array(['green', 'yellow', 'black', 'blue', 'red'], dtype=object)

## Representing Categories with Numeric Codes

In [3]:
# Lookup table: position in this Series is the numeric code of the color
# (black = 0, blue = 1, green = 2, red = 3, yellow = 4).
colors = pd.Series(['black', 'blue', 'green', 'red', 'yellow'])
# The same run of nine codes, repeated twice.
code_run = [0, 0, 4, 3, 2, 1, 1, 0, 4]
values = pd.Series(code_run + code_run)
# Translate every code back to its label by positional lookup.
colors.take(values)

0     black
0     black
4    yellow
3       red
2     green
1      blue
1      blue
0     black
4    yellow
0     black
0     black
4    yellow
3       red
2     green
1      blue
1      blue
0     black
4    yellow
dtype: object

## Categorical Data Type 

In [4]:
# Load the sales dataset used throughout the rest of the notebook and preview it.
sales_csv_path = "test_data/datasets/sales.csv"
sales_data = pd.read_csv(sales_csv_path)
sales_data.head(10)

Unnamed: 0,Region,Country,Item Type,Sales Channel,Order Priority,Order Date,Order ID,Ship Date,Units Sold,Unit Price,Unit Cost,Total Revenue,Total Cost,Total Profit
0,Australia and Oceania,Tuvalu,Baby Food,Offline,H,5/28/2010,669165933,6/27/2010,9925,255.28,159.42,2533654.0,1582243.5,951410.5
1,Central America and the Caribbean,Grenada,Cereal,Online,C,8/22/2012,963881480,9/15/2012,2804,205.7,117.11,576782.8,328376.44,248406.36
2,Europe,Russia,Office Supplies,Offline,L,5/2/2014,341417157,5/8/2014,1779,651.21,524.96,1158502.59,933903.84,224598.75
3,Sub-Saharan Africa,Sao Tome and Principe,Fruits,Online,C,6/20/2014,514321792,7/5/2014,8102,9.33,6.92,75591.66,56065.84,19525.82
4,Sub-Saharan Africa,Rwanda,Office Supplies,Offline,L,2/1/2013,115456712,2/6/2013,5062,651.21,524.96,3296425.02,2657347.52,639077.5
5,Australia and Oceania,Solomon Islands,Baby Food,Online,C,2/4/2015,547995746,2/21/2015,2974,255.28,159.42,759202.72,474115.08,285087.64
6,Sub-Saharan Africa,Angola,Household,Offline,M,4/23/2011,135425221,4/27/2011,4187,668.27,502.54,2798046.49,2104134.98,693911.51
7,Sub-Saharan Africa,Burkina Faso,Vegetables,Online,H,7/17/2012,871543967,7/27/2012,8082,154.06,90.93,1245112.92,734896.26,510216.66
8,Sub-Saharan Africa,Republic of the Congo,Personal Care,Offline,M,7/14/2015,770463311,8/25/2015,6070,81.73,56.67,496101.1,343986.9,152114.2
9,Sub-Saharan Africa,Senegal,Cereal,Online,H,4/18/2014,616607081,5/30/2014,6593,205.7,117.11,1356180.1,772106.23,584073.87


In [5]:
# Column-by-column summary: dtype, non-null count and the frame's memory footprint.
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Region          100 non-null    object 
 1   Country         100 non-null    object 
 2   Item Type       100 non-null    object 
 3   Sales Channel   100 non-null    object 
 4   Order Priority  100 non-null    object 
 5   Order Date      100 non-null    object 
 6   Order ID        100 non-null    int64  
 7   Ship Date       100 non-null    object 
 8   Units Sold      100 non-null    int64  
 9   Unit Price      100 non-null    float64
 10  Unit Cost       100 non-null    float64
 11  Total Revenue   100 non-null    float64
 12  Total Cost      100 non-null    float64
 13  Total Profit    100 non-null    float64
dtypes: float64(5), int64(2), object(7)
memory usage: 11.1+ KB


In [6]:
# Summary statistics; by default only the numeric columns are summarized.
sales_data.describe()

Unnamed: 0,Order ID,Units Sold,Unit Price,Unit Cost,Total Revenue,Total Cost,Total Profit
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,555020400.0,5128.71,276.7613,191.048,1373488.0,931805.7,441682.0
std,260615300.0,2794.484562,235.592241,188.208181,1460029.0,1083938.0,438537.9
min,114606600.0,124.0,9.33,6.92,4870.26,3612.24,1258.02
25%,338922500.0,2836.25,81.73,35.84,268721.2,168868.0,121443.6
50%,557708600.0,5382.5,179.88,107.275,752314.4,363566.4,290768.0
75%,790755100.0,7369.0,437.2,263.33,2212045.0,1613870.0,635828.8
max,994022200.0,9925.0,668.27,524.96,5997055.0,4509794.0,1719922.0


In [7]:
# Bytes consumed by the index and by each column. This is the shallow count:
# object columns are charged 8 bytes per row for the reference, not for the strings.
sales_data.memory_usage()

Index             128
Region            800
Country           800
Item Type         800
Sales Channel     800
Order Priority    800
Order Date        800
Order ID          800
Ship Date         800
Units Sold        800
Unit Price        800
Unit Cost         800
Total Revenue     800
Total Cost        800
Total Profit      800
dtype: int64

In [8]:
# Cast the repetitive text columns to the category dtype in a separate frame,
# leaving sales_data itself untouched for now.
categorical_columns = ["Region", "Country", "Item Type", "Sales Channel", "Order Priority"]
sales_data_cats = sales_data[categorical_columns].astype("category")
sales_data_cats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Region          100 non-null    category
 1   Country         100 non-null    category
 2   Item Type       100 non-null    category
 3   Sales Channel   100 non-null    category
 4   Order Priority  100 non-null    category
dtypes: category(5)
memory usage: 4.3 KB


In [9]:
# For categorical columns describe() reports the count, the number of unique
# values, the most frequent value and how often it occurs.
sales_data_cats.describe()

Unnamed: 0,Region,Country,Item Type,Sales Channel,Order Priority
count,100,100,100,100,100
unique,7,76,12,2,4
top,Sub-Saharan Africa,The Gambia,Clothes,Offline,H
freq,36,4,13,50,30


In [10]:
# The array backing a category-dtype Series is a pandas Categorical.
type(sales_data_cats.Region.values)

pandas.core.arrays.categorical.Categorical

In [11]:
# Look inside the Categorical: one integer code per row, plus the lookup
# table of categories those codes index into.
region_categorical = sales_data_cats.Region.values
print(region_categorical.codes)
print(region_categorical.categories)

[1 2 3 6 6 1 6 6 6 6 0 6 0 2 0 3 0 6 0 1 3 3 2 1 3 3 1 6 3 6 3 6 1 0 6 2 4
 6 0 3 6 4 6 3 0 6 3 3 3 6 3 6 4 6 6 6 1 3 3 6 1 3 6 4 2 6 6 2 3 6 0 4 6 6
 4 5 1 0 3 1 3 4 4 6 6 5 6 6 4 3 6 1 4 3 2 6 0 6 5 6]
Index(['Asia', 'Australia and Oceania', 'Central America and the Caribbean',
       'Europe', 'Middle East and North Africa', 'North America',
       'Sub-Saharan Africa'],
      dtype='object')


### Give order to Order Priority

In [12]:
# The categories are unordered at this point: they are listed alphabetically
# and there is no '<' relation between them.
sales_data_cats['Order Priority']

0     H
1     C
2     L
3     C
4     L
     ..
95    M
96    L
97    C
98    M
99    L
Name: Order Priority, Length: 100, dtype: category
Categories (4, object): ['C', 'H', 'L', 'M']

In [13]:
# Rank the priorities from lowest to highest and mark the categorical as
# ordered, so comparisons and sorting follow L < M < H < C.
priority_ranking = ['L', 'M', 'H', 'C']
order_priority = sales_data_cats['Order Priority']
sales_data_cats['Order Priority'] = order_priority.cat.reorder_categories(priority_ranking, ordered=True)
sales_data_cats['Order Priority']

0     H
1     C
2     L
3     C
4     L
     ..
95    M
96    L
97    C
98    M
99    L
Name: Order Priority, Length: 100, dtype: category
Categories (4, object): ['L' < 'M' < 'H' < 'C']

## Creating Categorical Directly

In [14]:
# Build a Categorical straight from the raw labels; pandas derives the
# categories from the distinct values and assigns each row a code.
country_labels = ['Canada', 'UK', 'India', 'India', 'Canada', 'China', 'US']
countries = pd.Categorical(country_labels)
print(countries.codes)
print(countries)

[0 3 2 2 0 1 4]
['Canada', 'UK', 'India', 'India', 'Canada', 'China', 'US']
Categories (5, object): ['Canada', 'China', 'India', 'UK', 'US']


In [15]:
# A Categorical can also be assembled from integer codes plus the list of
# categories those codes refer to (code i means categories[i]).
categories = ['Canada', 'UK', 'India', 'China', 'US']
codes = [0, 1, 2, 3, 3, 1, 4, 4]
countries = pd.Categorical.from_codes(codes=codes, categories=categories)
print(countries.codes)
print(countries)

[0 1 2 3 3 1 4 4]
['Canada', 'UK', 'India', 'China', 'China', 'UK', 'US', 'US']
Categories (5, object): ['Canada', 'UK', 'India', 'China', 'US']


## Categorical Methods

In [21]:
# Write the categorical versions of the columns back into the full sales
# frame, one column at a time, then confirm the new dtypes.
for column_name, categorical_column in sales_data_cats.items():
    sales_data[column_name] = categorical_column
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Region          100 non-null    category
 1   Country         100 non-null    category
 2   Item Type       100 non-null    category
 3   Sales Channel   100 non-null    category
 4   Order Priority  100 non-null    category
 5   Order Date      100 non-null    object  
 6   Order ID        100 non-null    int64   
 7   Ship Date       100 non-null    object  
 8   Units Sold      100 non-null    int64   
 9   Unit Price      100 non-null    float64 
 10  Unit Cost       100 non-null    float64 
 11  Total Revenue   100 non-null    float64 
 12  Total Cost      100 non-null    float64 
 13  Total Profit    100 non-null    float64 
dtypes: category(5), float64(5), int64(2), object(2)
memory usage: 11.3+ KB


In [22]:
# Per-column bytes after the conversion, for comparison with the figures
# reported while these columns were still object dtype.
sales_data.memory_usage()

Index              128
Region             456
Country           2812
Item Type          496
Sales Channel      224
Order Priority     304
Order Date         800
Order ID           800
Ship Date          800
Units Sold         800
Unit Price         800
Unit Cost          800
Total Revenue      800
Total Cost         800
Total Profit       800
dtype: int64

In [27]:
# The .cat accessor exposes the same codes and categories at Series level.
region_accessor = sales_data.Region.cat
print(region_accessor.codes)
print(region_accessor.categories)

0     1
1     2
2     3
3     6
4     6
     ..
95    6
96    0
97    6
98    5
99    6
Length: 100, dtype: int8
Index(['Asia', 'Australia and Oceania', 'Central America and the Caribbean',
       'Europe', 'Middle East and North Africa', 'North America',
       'Sub-Saharan Africa'],
      dtype='object')


In [28]:
# IPython introspection: show the type and docstring of the .cat accessor.
sales_data.Region.cat?

Type:        CategoricalAccessor
String form: <pandas.core.arrays.categorical.CategoricalAccessor object at 0x7f8f6dd286d0>
File:        ~/miniconda3/lib/python3.8/site-packages/pandas/core/arrays/categorical.py
Docstring:  
Accessor object for categorical properties of the Series values.

Be aware that assigning to `categories` is a inplace operation, while all
methods return new categorical data per default (but can be called with
`inplace=True`).

Parameters
----------
data : Series or CategoricalIndex

Examples
--------
>>> s = pd.Series(list("abbccc")).astype("category")
>>> s
0    a
1    b
2    b
3    c
4    c
5    c
dtype: category
Categories (3, object): ['a', 'b', 'c']

>>> s.cat.categories
Index(['a', 'b', 'c'], dtype='object')

>>> s.cat.rename_categories(list("cba"))
0    c
1    b
2    b
3    a
4    a
5    a
dtype: category
Categories (3, object): ['c', 'b', 'a']

>>> s.cat.reorder_categories(list("cba"))
0    a
1    b
2    b
3   

## Categorical Encodings

### Pandas

In [39]:
# The column to be encoded: an ordered categorical with L < M < H < C.
sales_data['Order Priority']

0     H
1     C
2     L
3     C
4     L
     ..
95    M
96    L
97    C
98    M
99    L
Name: Order Priority, Length: 100, dtype: category
Categories (4, object): ['L' < 'M' < 'H' < 'C']

In [38]:
# One indicator column per category, in category order (L, M, H, C).
# dtype=int pins the 0/1 output: newer pandas versions default to booleans,
# which would render as True/False instead.
pd.get_dummies(sales_data['Order Priority'], dtype=int)

Unnamed: 0,L,M,H,C
0,0,0,1,0
1,0,0,0,1
2,1,0,0,0
3,0,0,0,1
4,1,0,0,0
...,...,...,...,...
95,0,1,0,0
96,1,0,0,0
97,0,0,0,1
98,0,1,0,0


### scikit-learn

In [49]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, LabelBinarizer

# OrdinalEncoder maps each distinct value of a feature column to a number.
# With default settings the codes follow the sorted values (C=0, H=1, L=2, M=3),
# not the L < M < H < C order of the categorical; pass
# categories=[['L', 'M', 'H', 'C']] to the constructor if that order is wanted.
ordinal_encoder = OrdinalEncoder()

# Use the encoder created in this cell (one_hot_encoder does not exist yet on a
# fresh kernel). fit_transform returns a 2-D (n_rows, 1) array, so flatten it
# to one value per row before storing it as a column.
sales_data['ordinal_encoded'] = ordinal_encoder.fit_transform(sales_data[['Order Priority']]).ravel()
sales_data[['Order Priority', 'ordinal_encoded']].head(10)

Unnamed: 0,Order Priority,ordinal_encoded
0,H,1.0
1,C,0.0
2,L,2.0
3,C,0.0
4,L,2.0
5,C,0.0
6,M,3.0
7,H,1.0
8,M,3.0
9,H,1.0


In [51]:
# One-hot encode the priority: one indicator column per distinct value.
one_hot_encoder = OneHotEncoder()

# The encoder takes a 2-D input (hence the double brackets); the result is
# converted with .toarray() before it is placed in a DataFrame.
encoded = one_hot_encoder.fit_transform(sales_data[['Order Priority']])
# categories_ holds one array per input feature. Select the single feature's
# array so the frame gets flat column labels rather than a MultiIndex.
pd.DataFrame(encoded.toarray(), columns=one_hot_encoder.categories_[0]).head(10)

Unnamed: 0,C,H,L,M
0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0
5,1.0,0.0,0.0,0.0
6,0.0,0.0,0.0,1.0
7,0.0,1.0,0.0,0.0
8,0.0,0.0,0.0,1.0
9,0.0,1.0,0.0,0.0


In [53]:
# LabelEncoder is meant for 1-D label arrays, so the column is passed as a
# Series. A one-column DataFrame is 2-D and makes scikit-learn warn that a
# column vector was passed where a 1-D array was expected.
label_encoder = LabelEncoder()

sales_data['label_encoded'] = label_encoder.fit_transform(sales_data['Order Priority'])
sales_data[['Order Priority', 'label_encoded']].head(10)

Unnamed: 0,Order Priority,label_encoded
0,H,1
1,C,0
2,L,2
3,C,0
4,L,2
5,C,0
6,M,3
7,H,1
8,M,3
9,H,1


In [63]:
# LabelBinarizer produces one 0/1 indicator column per class of a 1-D label array.
label_binarizer = LabelBinarizer()

order_priority = sales_data['Order Priority']
encoded = label_binarizer.fit_transform(order_priority)
# Label the indicator columns with the classes the binarizer learned.
binarized_frame = pd.DataFrame(encoded, columns=label_binarizer.classes_)
binarized_frame.head(10)

Unnamed: 0,C,H,L,M
0,0,1,0,0
1,1,0,0,0
2,0,0,1,0
3,1,0,0,0
4,0,0,1,0
5,1,0,0,0
6,0,0,0,1
7,0,1,0,0
8,0,0,0,1
9,0,1,0,0


### Category Encoders

In [65]:
!pip install -Uqq category_encoders

In [70]:
import category_encoders as ce
# help(ce)

## Memory Usage

In [71]:
# Half a million rows drawn from only five distinct labels.
palette = ['black', 'blue', 'green', 'red', 'yellow']
colors = pd.Series(palette * 100000)

In [72]:
# Bytes used by the object-dtype Series. This is the shallow count: an 8-byte
# reference per row plus the index, not the string payloads themselves.
colors.memory_usage()

4000128

In [73]:
# The categorical version stores a small integer code per row plus a single
# copy of the five labels, which is why it is so much smaller.
colors_cat = colors.astype('category')
colors_cat.memory_usage()

500340