In [19]:
# Categorical data can be classified as ordinal or nominal data.
# Ordinal data has a natural order, so its values can be sorted.
# For example, the shirt sizes XL, L, M can be ordered such that XL > L > M.

# Nominal data does not imply any order, e.g. the color of a t-shirt:
# we would not necessarily say that red is greater than green or vice versa.

# MAPPING ORDINAL VALUES
import pandas as pd

# Toy dataset with one row per shirt: a nominal feature (color), an
# ordinal feature (size), a numeric feature (price) and a string class label.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'class_label'],
)
df

Unnamed: 0,color,size,price,class_label
0,green,M,10.1,class1
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [20]:
# Ordinal encoding for shirt sizes: a larger integer means a larger size,
# so the order M < L < XL survives the encoding.
size_mapping = dict(XL=3, L=2, M=1)

# Series.map looks each value up in the dict; a value that is not a key
# comes out as NaN. NOTE: the column is overwritten, so running this cell
# a second time would map the integers again and produce NaN everywhere.
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,class_label
0,green,1,10.1,class1
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [21]:
# To get back the original string representation of size, invert the
# mapping (encoded integer -> size label) and apply it to the column.
# The inversion is lossless only because every size maps to a distinct integer.
inv_mapping = {int(v): k for k, v in size_mapping.items()}

# (A bare `inv_mapping` expression used to sit here; only the last
# expression of a cell is displayed, so it had no effect and was removed.)
df['size'] = df['size'].map(inv_mapping)
df

Unnamed: 0,color,size,price,class_label
0,green,M,10.1,class1
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [22]:
# Re-apply the ordinal encoding so that `size` holds integers again
# (the previous cell mapped the column back to its string labels).
df['size'] = df['size'].map(size_mapping)

In [23]:
# Encoding class labels

import numpy as np
# Class labels are nominal, so the integers carry no order: simply number
# the distinct labels 0, 1, 2, ... in the sorted order np.unique returns.
class_mapping = dict(
    (label, idx)
    for idx, label in enumerate(np.unique(df['class_label']))
)
df['class_label'] = df['class_label'].map(class_mapping)
df

Unnamed: 0,color,size,price,class_label
0,green,1,10.1,0
1,red,2,13.5,0
2,blue,3,15.3,1


In [24]:
# You can reverse the encoding as follows:
inv_encoding = {int(v): k for k, v in class_mapping.items()}
df['class_label'] = df['class_label'].map(inv_encoding)
df

Unnamed: 0,color,size,price,class_label
0,green,1,10.1,class1
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [26]:
# Alternatively, scikit-learn's LabelEncoder offers a convenient way to
# encode class labels as integers.
from sklearn.preprocessing import LabelEncoder

class_le = LabelEncoder()
# Fit the encoder on the label column and encode it in one step.
y = class_le.fit_transform(df.loc[:, 'class_label'].values)
y

array([0, 0, 1])

In [28]:
# inverse_transform converts the integer class labels back to their
# original string representation.
class_le.inverse_transform(y)

array(['class1', 'class1', 'class2'], dtype=object)