In [44]:
# Two types of categorical data: ordinal (can be sorted) and nominal (cannot be sorted)

In [45]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [46]:
# Build a small example frame with one nominal feature (color), one ordinal
# feature (size), one numeric feature (price) and a class label.
# The column names are handed to the constructor together with the rows.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)

df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [47]:
# Map the ordinal feature ('size') to integers that preserve its order.
size_mapping = {'XL': 3, 'L': 2, 'M': 1}

# Look every value up with a fallback to itself, which makes this cell safe to
# re-run: values that are already encoded (or otherwise not a key of
# size_mapping) are left unchanged. A plain dict-based .map(size_mapping)
# would turn them into NaN on the second run.
df['size'] = df['size'].map(lambda size: size_mapping.get(size, size))

df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [48]:
# Reverse lookup table: encoded integer -> original size label.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
inv_size_mapping

{3: 'XL', 2: 'L', 1: 'M'}

In [49]:
# Preview the decoded size labels; the result is only displayed and is not
# assigned back, so df keeps its integer-encoded 'size' column.
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

In [50]:
#df['size'] = df['size'].map(inv_size_mapping)
#df
# The one-hot encoding code below needs 'size' to be integers, so the code in this cell is commented out.

In [51]:
# Build a lookup table from class label to integer code.
print(df['classlabel'])
unique_labels = np.unique(df['classlabel'])
print(unique_labels)
# enumerate() yields (position, label) pairs, so each distinct label is paired
# with its position in the array of unique labels.
class_mapping = dict((label, idx) for idx, label in enumerate(unique_labels))
print(class_mapping)

0    class2
1    class1
2    class2
Name: classlabel, dtype: object
['class1' 'class2']
{'class1': 0, 'class2': 1}


In [52]:
# Replace every class label with its integer code from class_mapping.
# Re-running this cell once the column already holds integers would produce
# NaN, because those integers are not keys of class_mapping.
df['classlabel'] = df['classlabel'].map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [53]:
# Invert the lookup (integer code -> class label) and use it to restore the
# original string labels in the 'classlabel' column.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [54]:
# Using sklearn's LabelEncoder to convert the class labels to integers.
# fit_transform() learns the set of labels and encodes the column in a single
# call; the separate fit() plus a transform() whose result was thrown away
# are no longer needed.
class_le = LabelEncoder()
df['classlabel'] = class_le.fit_transform(df['classlabel'])
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [55]:
# Inverse transform: use the encoder fitted above (class_le) to turn the
# integer codes in 'classlabel' back into the original labels, and store the
# result in the same column.
df['classlabel'] = class_le.inverse_transform(df['classlabel'])
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [56]:
# Naive approach: label-encode the nominal 'color' feature.
feature_cols = ['color', 'size', 'price']
print(df[feature_cols])
# .values keeps only the data (no column names) as a 2D NumPy array.
X = df[feature_cols].values
print(X)
color_le = LabelEncoder()
# Fit the encoder on the first column of X and overwrite that column with the
# resulting integer codes.
encoded_colors = color_le.fit_transform(X[:, 0])
X[:, 0] = encoded_colors
print(X)
# Colors have no natural order, so integer codes impose an artificial ranking.
# This encoding is therefore not appropriate for a nominal feature.

   color  size  price
0  green     1   10.1
1    red     2   13.5
2   blue     3   15.3
[['green' 1 10.1]
 ['red' 2 13.5]
 ['blue' 3 15.3]]
[[1 1 10.1]
 [2 2 13.5]
 [0 3 15.3]]


In [57]:
# One-hot encoding
# Create one binary feature per color and set it to 0 or 1 for each sample.
# Get the values in the data frame.
X = df[['color','size','price']].values
# One-hot encoder
color_ohe = OneHotEncoder()
# The first column of X; slicing a single column gives a 1D array.
color_column = X[:, 0]
print(color_column)
# The encoder needs 2D input, so reshape the 1D array into a column vector;
# passing the 1D array directly raises an error.
color_2d = color_column.reshape(-1, 1)
print(color_2d)
# Fit and transform once, then reuse the result for both printouts instead of
# refitting the encoder on identical data.
color_onehot = color_ohe.fit_transform(color_2d)
# The raw result of the one-hot encoder is a sparse matrix.
print(color_onehot)
# .toarray() turns the sparse matrix into a dense 2D array.
print(color_onehot.toarray())

['green' 'red' 'blue']
[['green']
 ['red']
 ['blue']]
  (0, 1)	1.0
  (1, 2)	1.0
  (2, 0)	1.0
[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


In [58]:
# Alternative: transform only the 'color' column and keep the others as-is.

X = df[['color','size','price']].values
c_transf = ColumnTransformer([
    ('onehot', OneHotEncoder(), [0]),       # one-hot encode column 0 (color)
    ('nothing', 'passthrough', [1, 2])      # leave size and price untouched
])
# Fit and transform once, then reuse the result for both the printout and the
# float conversion instead of refitting on identical data.
X_encoded = c_transf.fit_transform(X)
print(X_encoded)
# astype(): copy of the array, cast to the specified type.
X_encoded.astype(float)

[[0.0 1.0 0.0 1 10.1]
 [0.0 0.0 1.0 2 13.5]
 [1.0 0.0 0.0 3 15.3]]


array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [62]:
# Alternative by using pandas
# dtype=int keeps the dummy columns as 0/1 integers on every pandas version;
# since pandas 2.0 the default dtype for dummy columns is bool (True/False).
pd.get_dummies(df[['color','size','price']], dtype=int)
# Note that the one-hot features are listed at the end, even though 'color' is the first feature in the original dataframe.

Unnamed: 0,size,price,color_blue,color_green,color_red
0,1,10.1,0,1,0
1,2,13.5,0,0,1
2,3,15.3,1,0,0


In [64]:
# One-hot encoding may cause problems when subsequent operations involve matrix inversion,
# because the one-hot columns of a feature always sum to 1 and are therefore linearly dependent.
# To avoid this, we remove one one-hot feature.
# By removing only one such feature, we do not lose any information:
# a row with all remaining one-hot columns equal to 0 must belong to the dropped category.
# The following code drops the first one-hot feature.
# dtype=int keeps the dummy columns as 0/1 integers; pandas >= 2.0 defaults to bool.
pd.get_dummies(df[['color','size','price']], drop_first=True, dtype=int)

Unnamed: 0,size,price,color_green,color_red
0,1,10.1,1,0
1,2,13.5,0,1
2,3,15.3,0,0


In [66]:
# Same idea with scikit-learn: drop='first' makes the encoder omit the column
# of the first category.
color_ohe = OneHotEncoder(categories='auto', drop='first')
transformers = [
    ('onehot', color_ohe, [0]),
    ('nothing', 'passthrough', [1, 2]),
]
c_transf = ColumnTransformer(transformers)
c_transf.fit_transform(X).astype(float)

array([[ 1. ,  0. ,  1. , 10.1],
       [ 0. ,  1. ,  2. , 13.5],
       [ 0. ,  0. ,  3. , 15.3]])

In [None]:
# If there are too many unique values for a nominal feature, one-hot encoding will result in too many one-hot features.
# In this case, binary encoding or frequency encoding can be used.

In [68]:
# Encoding ordinal features
# Rebuild the original dataframe (string-valued 'size'), passing the column
# names straight to the constructor.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)

In [72]:
# Threshold encoding of the ordinal 'size' feature: each new column answers
# "is the size larger than this level?" with 1 (yes) or 0 (no).
is_above_m = df['size'].isin(['L', 'XL'])
is_above_l = df['size'].eq('XL')
# These assignments create two new columns named 'x>M' and 'x>L'.
df['x>M'] = is_above_m.astype('int64')
df['x>L'] = is_above_l.astype('int64')
df

Unnamed: 0,color,size,price,classlabel,x>M,x>L
0,green,M,10.1,class2,0,0
1,red,L,13.5,class1,1,0
2,blue,XL,15.3,class2,1,1
