In [1]:
# Numerical, tabular and plotting libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')  # apply seaborn's 'whitegrid' style globally
# Pipeline is used to chain the preprocessing steps in the cells that follow.
from sklearn.pipeline import Pipeline

A categorical encoder replaces variable labels with a calculated or arbitrary number. We will study:

- One Hot Encoder
- Ordinal Encoder
- Rare Label Encoder

In [2]:
# one hot encoder

from feature_engine.encoding import OneHotEncoder

In [3]:
# Load the penguins dataset and keep only the three columns used in this demo.
df = (
    sns.load_dataset('penguins')
    .filter(items=['species', 'island', 'sex'])
)
df.head()

Unnamed: 0,species,island,sex
0,Adelie,Torgersen,Male
1,Adelie,Torgersen,Female
2,Adelie,Torgersen,Female
3,Adelie,Torgersen,
4,Adelie,Torgersen,Female


In [4]:
from feature_engine.imputation import DropMissingData

# Drop incomplete rows first, then expand each categorical column into one
# binary column per label.
pipeline = Pipeline(steps=[
    ('drop_na', DropMissingData()),
    ('ohe', OneHotEncoder(variables=['species', 'island', 'sex'])),
])

# Store the encoded result under its own name instead of overwriting `df`:
# the raw frame stays available and this cell can be re-run without first
# reloading the data (the encoder needs the original columns to exist).
df_ohe = pipeline.fit_transform(df)
df_ohe

Unnamed: 0,species_Adelie,species_Chinstrap,species_Gentoo,island_Torgersen,island_Biscoe,island_Dream,sex_Male,sex_Female
0,1,0,0,1,0,0,1,0
1,1,0,0,1,0,0,0,1
2,1,0,0,1,0,0,0,1
4,1,0,0,1,0,0,0,1
5,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...
338,0,0,1,0,1,0,0,1
340,0,0,1,0,1,0,0,1
341,0,0,1,0,1,0,1,0
342,0,0,1,0,1,0,0,1


In [5]:
# Reload the raw penguins columns for the next encoder example.
df = (
    sns.load_dataset('penguins')
    .filter(items=['species', 'island', 'sex'])
)
df.head()

Unnamed: 0,species,island,sex
0,Adelie,Torgersen,Male
1,Adelie,Torgersen,Female
2,Adelie,Torgersen,Female
3,Adelie,Torgersen,
4,Adelie,Torgersen,Female


In [6]:
# Same steps as the previous pipeline, but `drop_last=True` makes the encoder
# omit the last dummy column of each variable.
pipeline = Pipeline(steps=[
    ('drop_na', DropMissingData()),
    ('ohe', OneHotEncoder(variables=['species', 'island', 'sex'], drop_last=True)),
])

# Store the encoded result under its own name instead of overwriting `df`, so
# the raw frame stays available and the cell can be re-run as-is.
df_ohe_drop_last = pipeline.fit_transform(df)
df_ohe_drop_last

Unnamed: 0,species_Adelie,species_Chinstrap,island_Torgersen,island_Biscoe,sex_Male
0,1,0,1,0,1
1,1,0,1,0,0
2,1,0,1,0,0
4,1,0,1,0,0
5,1,0,1,0,1
...,...,...,...,...,...
338,0,0,0,1,0
340,0,0,0,1,0
341,0,0,0,1,1
342,0,0,0,1,0


In [7]:
# ordinal encoder

from feature_engine.encoding import OrdinalEncoder

In [8]:
# Reload the raw penguins columns for the ordinal-encoder example.
df = (
    sns.load_dataset('penguins')
    .filter(items=['species', 'island', 'sex'])
)
df.head()

Unnamed: 0,species,island,sex
0,Adelie,Torgersen,Male
1,Adelie,Torgersen,Female
2,Adelie,Torgersen,Female
3,Adelie,Torgersen,
4,Adelie,Torgersen,Female


In [9]:
from feature_engine.imputation import DropMissingData

# Drop incomplete rows, then replace every label with an arbitrary integer code.
pipeline = Pipeline(steps=[
    ('drop_na', DropMissingData()),
    ('ordinal_encoder', OrdinalEncoder(encoding_method='arbitrary')),
])

# NOTE: `df` is overwritten with the encoded frame.
df = pipeline.fit_transform(df)
df

Unnamed: 0,species,island,sex
0,0,0,0
1,0,0,1
2,0,0,1
4,0,0,1
5,0,0,0
...,...,...,...
338,2,1,1
340,2,1,1
341,2,1,0
342,2,1,1


In [10]:
# Print the absolute frequency of every value, one column at a time.
for col, series in df.items():
    print(f"{col} \n{series.value_counts()} \n\n")

species 
0    146
2    119
1     68
Name: species, dtype: int64 


island 
1    163
2    123
0     47
Name: island, dtype: int64 


sex 
0    168
1    165
Name: sex, dtype: int64 




In [11]:
# Inspect the label -> integer mapping learned by the fitted ordinal encoder step.
pipeline.named_steps['ordinal_encoder'].encoder_dict_

{'species': {'Adelie': 0, 'Chinstrap': 1, 'Gentoo': 2},
 'island': {'Torgersen': 0, 'Biscoe': 1, 'Dream': 2},
 'sex': {'Male': 0, 'Female': 1}}

In [12]:
from feature_engine.encoding import RareLabelEncoder

In [13]:
# Load two columns of the titanic dataset and cast them to object dtype
# (presumably so the encoders treat the counts as categorical labels).
df = (
    sns.load_dataset('titanic')
    .filter(items=['parch', 'sibsp'])
    .astype('object')
)
print(df.shape)
df.head()

(891, 2)


Unnamed: 0,parch,sibsp
0,0,1
1,0,1
2,0,0
3,0,1
4,0,0


In [14]:
# Number of missing values in each column.
df.isna().sum()

parch    0
sibsp    0
dtype: int64

In [15]:
# Print the relative frequency of every label, one column at a time.
for col, series in df.items():
    print(f"{col} \n{series.value_counts(normalize=True)} \n\n")

parch 
0    0.760943
1    0.132435
2    0.089787
5    0.005612
3    0.005612
4    0.004489
6    0.001122
Name: parch, dtype: float64 


sibsp 
0    0.682379
1    0.234568
2    0.031425
4    0.020202
3    0.017957
8    0.007856
5    0.005612
Name: sibsp, dtype: float64 




In [16]:
from feature_engine.imputation import DropMissingData

# One RareLabelEncoder per column because each column uses its own `tol`
# frequency threshold.
pipeline = Pipeline(steps=[
    ('drop_na', DropMissingData()),
    ('rle_parch', RareLabelEncoder(variables=['parch'], tol=0.1, n_categories=2)),
    ('rle_sibsp', RareLabelEncoder(variables=['sibsp'], tol=0.08, n_categories=2)),
])

# NOTE: `df` is overwritten with the transformed frame.
df = pipeline.fit_transform(df)
df.head()

Unnamed: 0,parch,sibsp
0,0,1
1,0,1
2,0,0
3,0,1
4,0,0


In [17]:
# Print the relative frequency of every label after the transformation.
for col, series in df.items():
    print(f"{col} \n{series.value_counts(normalize=True)} \n\n")

parch 
0       0.760943
1       0.132435
Rare    0.106622
Name: parch, dtype: float64 


sibsp 
0       0.682379
1       0.234568
Rare    0.083053
Name: sibsp, dtype: float64 




In [18]:
# Reload the raw titanic columns (as object dtype) and print the relative
# frequency of every label per column.
df = (
    sns.load_dataset('titanic')
    .filter(items=['parch', 'sibsp'])
    .astype('object')
)
for col, series in df.items():
    print(f"{col} \n{series.value_counts(normalize=True)} \n\n")

parch 
0    0.760943
1    0.132435
2    0.089787
5    0.005612
3    0.005612
4    0.004489
6    0.001122
Name: parch, dtype: float64 


sibsp 
0    0.682379
1    0.234568
2    0.031425
4    0.020202
3    0.017957
8    0.007856
5    0.005612
Name: sibsp, dtype: float64 




In [19]:
# Rare-label grouping per column, followed by an arbitrary integer encoding of
# the resulting labels.
pipeline = Pipeline(steps=[
    ('drop_na', DropMissingData()),
    ('rle_parch', RareLabelEncoder(variables=['parch'], tol=0.1, n_categories=2)),
    ('rle_sibsp', RareLabelEncoder(variables=['sibsp'], tol=0.08, n_categories=2)),
    ('ordinal_encoder', OrdinalEncoder(variables=['parch', 'sibsp'],
                                       encoding_method='arbitrary')),
])

# NOTE: `df` is overwritten with the encoded frame.
df = pipeline.fit_transform(df)

# Print the relative frequency of every encoded value, one column at a time.
for col, series in df.items():
    print(f"{col} \n{series.value_counts(normalize=True)} \n\n")

parch 
0    0.760943
1    0.132435
2    0.106622
Name: parch, dtype: float64 


sibsp 
1    0.682379
0    0.234568
2    0.083053
Name: sibsp, dtype: float64 


