In [None]:
# A notebook for exploring the contrasts created by each category_encoders option

In [None]:
import numpy as np
import pandas as pd              
import category_encoders as ce   
from sklearn.preprocessing import LabelEncoder

# Show floats with two decimals so the encoded tables stay legible.
pd.set_option('display.float_format', '{:.2f}'.format)

# Toy data: one categorical feature and a small integer outcome.
df = pd.DataFrame(
    {
        'color': ["a", "c", "a", "a", "b", "b"],
        'outcome': [1, 2, 0, 0, 0, 1],
    }
)

# X and y are both kept as single-column DataFrames (not Series).
X = df.drop(columns='outcome')
y = df.drop(columns='color')

In [None]:
# Run once if category_encoders is not installed; `%pip` installs into this kernel's environment.
# %pip install category-encoders

# Classic Encoders

## sklearn.preprocessing.LabelEncoder()

##### *Label Encoder:*

Sklearn provides a very efficient tool for encoding the levels of categorical features into numeric values. LabelEncoder encodes labels with values between 0 and n_classes-1, where n_classes is the number of distinct labels. If a label repeats, it is assigned the same value as before.

In [9]:
# Display the single-column feature frame before any encoding is applied.
X

Unnamed: 0,color
0,a
1,c
2,a
3,a
4,b
5,b


In [22]:
print(X)

le = LabelEncoder()
# Flatten the one-column frame to 1-D first; a warning is thrown without np.ravel.
encoded = le.fit_transform(
    np.ravel(X)
)

print("\n The result of transforming X with LabelEncoder:")
# Encoded values first, then the container type, one per line.
print(encoded, type(encoded), sep="\n")

  color
0     a
1     c
2     a
3     a
4     b
5     b

 The result of transforming X with LabelEncoder:
[0 2 0 0 1 1]
<class 'numpy.ndarray'>


In [23]:
# Labels learned during fit; a label's position in this array is its integer code.
le.classes_

array(['a', 'b', 'c'], dtype=object)

### If you want to set your own encodings

In [14]:
# Hand-picked integer code for each color label.
encoding_dict = {
    'a': 1,
    'b': 2,
    'c': 3,
}

In [15]:
# Inspect the hand-written label-to-integer mapping.
encoding_dict

{'a': 1, 'b': 2, 'c': 3}

In [16]:
def encode_column(x, encoding_dict=encoding_dict):
    """Look up the code assigned to a single label.

    Parameters
    ----------
    x : hashable
        The label to encode.
    encoding_dict : mapping, optional
        Label-to-code lookup. Defaults to the notebook-level
        ``encoding_dict`` as it was bound when this function was defined.

    Returns
    -------
    The value stored under ``x`` in ``encoding_dict``.

    Raises
    ------
    KeyError
        If ``x`` is not a key of a plain-dict ``encoding_dict``.
    """
    code = encoding_dict[x]
    return code

In [17]:
# Encode every color with the custom mapping. encode_column already takes a
# single value, so it is passed to apply directly instead of being wrapped
# in a lambda that only forwards its argument.
df['encoded color'] = df['color'].apply(encode_column)

In [18]:
# Show each custom code next to the original label it was derived from.
df[['encoded color', 'color']]

Unnamed: 0,encoded color,color
0,1,a
1,3,c
2,1,a
3,1,a
4,2,b
5,2,b


## One-Hot 

In [19]:
# One-hot encode only the `color` column; the transformed frame returned by
# fit_transform is the cell's displayed output.
ce_one_hot = ce.OneHotEncoder(cols=['color'])
ce_one_hot.fit_transform(X, y)

Unnamed: 0,color_1,color_2,color_3
0,1,0,0
1,0,1,0
2,1,0,0
3,1,0,0
4,0,0,1
5,0,0,1


In [20]:
# Re-display X: it still holds only the original `color` column after the one-hot step.
X

Unnamed: 0,color
0,a
1,c
2,a
3,a
4,b
5,b
