#### ***This notebook compares three techniques for encoding categorical variables:***

1. LabelEncoder and OneHotEncoder

2. DictVectorizer

3. Pandas get_dummies


In [18]:
import numpy as np
import pandas as pd

# NumPy display: never truncate arrays, allow wide rows, and show floats with
# three decimals (the leading space in the format reserves room for a sign).
np.set_printoptions(
    threshold=np.inf,
    linewidth=200,
    formatter={'float': '{: 0.3f}'.format},
)

# pandas display: wide console output and up to 100 characters per cell.
pd.set_option("display.width", 200)
pd.set_option("display.max_colwidth", 100)

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Default seed for NumPy's global RNG. The sampling cell below re-seeds before
# each of its draws, so this value only matters for code that uses the global
# RNG without setting its own seed.
np.random.seed(123)


In [4]:
# Draw three reproducible samples. The global NumPy RNG is re-seeded before
# each draw so every array can be regenerated independently of the others.
np.random.seed(1)
random_values1 = np.random.choice([1, 2, 3], size=20)  # 20 draws from {1, 2, 3}

np.random.seed(2)
random_values2 = np.random.choice([1, 2], size=20)  # 20 draws from {1, 2}

np.random.seed(3)
random_values3 = np.random.randn(20)  # 20 standard-normal floats

# Show the three samples, separated by blank lines.
print('Random values 1: ', random_values1, end='\n\n')
print('Random values 2: ', random_values2, end='\n\n')
print('Random values 3: ', random_values3)


Random values 1:  [2 1 1 2 2 1 1 2 1 2 1 3 2 3 1 3 2 3 1 1]

Random values 2:  [1 2 2 1 1 2 1 2 1 2 1 2 2 2 2 2 2 2 1 1]

Random values 3:  [ 1.789  0.437  0.096 -1.863 -0.277 -0.355 -0.083 -0.627 -0.044 -0.477 -1.314  0.885  0.881  1.710  0.050 -0.405 -0.545 -1.546  0.982 -1.101]


In [5]:
# Assemble the two integer samples into a frame whose columns are all stored
# with the categorical dtype.
df = pd.DataFrame(
    data=dict(Var1=random_values1, Var2=random_values2),
    dtype='category',
)

df

Unnamed: 0,Var1,Var2
0,2,1
1,1,2
2,1,2
3,2,1
4,2,1
5,1,2
6,1,1
7,2,2
8,1,1
9,2,2


#### ***Instantiate LabelEncoder object***
---

In [6]:
# A single LabelEncoder is re-fitted on each column in turn, so after the
# apply() call `categ_le` only remembers the classes of the last column.
categ_le = LabelEncoder()

# Replace every column's values with integer codes starting at 0.
df_labelEncoder = df.apply(categ_le.fit_transform)
df_labelEncoder

Unnamed: 0,Var1,Var2
0,1,0
1,0,1
2,0,1
3,1,0
4,1,0
5,0,1
6,0,0
7,1,1
8,0,0
9,1,1


#### ***Instantiate OneHotEncoder object***
---

#### Input: `df_labelEncoder` (integer label codes)

In [7]:
# Fit the encoder on the label-encoded frame and encode it in one step.
# drop='first' leaves out the first category of every column, and
# sparse_output=False returns a dense NumPy array.
categ_ohe = OneHotEncoder(sparse_output=False, drop='first')
df_onehotencoder = categ_ohe.fit_transform(df_labelEncoder)

# Names of the generated indicator columns.
feature_names = categ_ohe.get_feature_names_out()

# Feature names, a blank line, then the encoded matrix.
print(feature_names, df_onehotencoder, sep='\n\n')

['Var1_1' 'Var1_2' 'Var2_1']

[[ 1.000  0.000  0.000]
 [ 0.000  0.000  1.000]
 [ 0.000  0.000  1.000]
 [ 1.000  0.000  0.000]
 [ 1.000  0.000  0.000]
 [ 0.000  0.000  1.000]
 [ 0.000  0.000  0.000]
 [ 1.000  0.000  1.000]
 [ 0.000  0.000  0.000]
 [ 1.000  0.000  1.000]
 [ 0.000  0.000  0.000]
 [ 0.000  1.000  1.000]
 [ 1.000  0.000  1.000]
 [ 0.000  1.000  1.000]
 [ 0.000  0.000  1.000]
 [ 0.000  1.000  1.000]
 [ 1.000  0.000  1.000]
 [ 0.000  1.000  1.000]
 [ 0.000  0.000  0.000]
 [ 0.000  0.000  0.000]]


#### Input: `df` (original categorical values)

In [8]:
# Same encoding as before, but fitted directly on the categorical frame, so
# the feature names carry the original category values instead of label codes.
categ_ohe = OneHotEncoder(sparse_output=False, drop='first')
df_onehotencoder = categ_ohe.fit_transform(df)

# Names of the generated indicator columns.
feature_names = categ_ohe.get_feature_names_out()

# Feature names, a blank line, then the encoded matrix.
print(feature_names, df_onehotencoder, sep='\n\n')

['Var1_2' 'Var1_3' 'Var2_2']

[[ 1.000  0.000  0.000]
 [ 0.000  0.000  1.000]
 [ 0.000  0.000  1.000]
 [ 1.000  0.000  0.000]
 [ 1.000  0.000  0.000]
 [ 0.000  0.000  1.000]
 [ 0.000  0.000  0.000]
 [ 1.000  0.000  1.000]
 [ 0.000  0.000  0.000]
 [ 1.000  0.000  1.000]
 [ 0.000  0.000  0.000]
 [ 0.000  1.000  1.000]
 [ 1.000  0.000  1.000]
 [ 0.000  1.000  1.000]
 [ 0.000  0.000  1.000]
 [ 0.000  1.000  1.000]
 [ 1.000  0.000  1.000]
 [ 0.000  1.000  1.000]
 [ 0.000  0.000  0.000]
 [ 0.000  0.000  0.000]]


#### ***Instantiate DictVectorizer object***
---

In [9]:
# Cast every column to str so the records built next carry string values.
# DictVectorizer one-hot encodes string-valued features (producing names such
# as 'Var1=1'), whereas numeric values would be kept as plain numeric features.
df = df.astype(str)

In [10]:
# One {column: value} dict per row, the record layout passed to DictVectorizer.
df_dict = df.to_dict('records')

df_dict

[{'Var1': '2', 'Var2': '1'},
 {'Var1': '1', 'Var2': '2'},
 {'Var1': '1', 'Var2': '2'},
 {'Var1': '2', 'Var2': '1'},
 {'Var1': '2', 'Var2': '1'},
 {'Var1': '1', 'Var2': '2'},
 {'Var1': '1', 'Var2': '1'},
 {'Var1': '2', 'Var2': '2'},
 {'Var1': '1', 'Var2': '1'},
 {'Var1': '2', 'Var2': '2'},
 {'Var1': '1', 'Var2': '1'},
 {'Var1': '3', 'Var2': '2'},
 {'Var1': '2', 'Var2': '2'},
 {'Var1': '3', 'Var2': '2'},
 {'Var1': '1', 'Var2': '2'},
 {'Var1': '3', 'Var2': '2'},
 {'Var1': '2', 'Var2': '2'},
 {'Var1': '3', 'Var2': '2'},
 {'Var1': '1', 'Var2': '1'},
 {'Var1': '1', 'Var2': '1'}]

In [11]:
# Build a dense-output vectorizer and learn the feature vocabulary.
# fit() and transform() are kept as separate calls so the learned attributes
# printed below are exactly those produced by fit().
categ_dicVec = DictVectorizer(sparse=False)
categ_dicVec.fit(df_dict)

# Encode the records with the fitted vocabulary.
df_dicVec = categ_dicVec.transform(df_dict)

# Feature -> column index mapping, the ordered feature names, then a blank line.
print(categ_dicVec.vocabulary_, categ_dicVec.feature_names_, sep='\n', end='\n\n')

df_dicVec

{'Var1=1': 0, 'Var1=2': 1, 'Var1=3': 2, 'Var2=1': 3, 'Var2=2': 4}
['Var1=1', 'Var1=2', 'Var1=3', 'Var2=1', 'Var2=2']



array([[ 0.000,  1.000,  0.000,  1.000,  0.000],
       [ 1.000,  0.000,  0.000,  0.000,  1.000],
       [ 1.000,  0.000,  0.000,  0.000,  1.000],
       [ 0.000,  1.000,  0.000,  1.000,  0.000],
       [ 0.000,  1.000,  0.000,  1.000,  0.000],
       [ 1.000,  0.000,  0.000,  0.000,  1.000],
       [ 1.000,  0.000,  0.000,  1.000,  0.000],
       [ 0.000,  1.000,  0.000,  0.000,  1.000],
       [ 1.000,  0.000,  0.000,  1.000,  0.000],
       [ 0.000,  1.000,  0.000,  0.000,  1.000],
       [ 1.000,  0.000,  0.000,  1.000,  0.000],
       [ 0.000,  0.000,  1.000,  0.000,  1.000],
       [ 0.000,  1.000,  0.000,  0.000,  1.000],
       [ 0.000,  0.000,  1.000,  0.000,  1.000],
       [ 1.000,  0.000,  0.000,  0.000,  1.000],
       [ 0.000,  0.000,  1.000,  0.000,  1.000],
       [ 0.000,  1.000,  0.000,  0.000,  1.000],
       [ 0.000,  0.000,  1.000,  0.000,  1.000],
       [ 1.000,  0.000,  0.000,  1.000,  0.000],
       [ 1.000,  0.000,  0.000,  1.000,  0.000]])

#### ***Pandas get_dummies function***
---

In [12]:
# Integer indicator columns built directly by pandas; drop_first leaves out the
# first level of every column, and names are joined as <column>_<level>.
X = pd.get_dummies(
    data=df,
    prefix_sep='_',
    dtype=int,
    drop_first=True,
)

X

Unnamed: 0,Var1_2,Var1_3,Var2_2
0,1,0,0
1,0,0,1
2,0,0,1
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,0
7,1,0,1
8,0,0,0
9,1,0,1


### ***Using --- Pipeline, OneHotEncoder, ColumnTransformer ---***
---

In [13]:
# Assuming random_values1, random_values2, and random_values3 are already defined.
# Only the two discrete columns are cast to category. Var3 keeps its native
# float dtype instead of being turned into a category and coerced back to
# numbers, which could silently replace conversion failures with NaN.
df = pd.DataFrame({
    'Var1': random_values1,
    'Var2': random_values2,
    'Var3': random_values3,
}).astype({'Var1': 'category', 'Var2': 'category'})

# Expected: Var1/Var2 -> category, Var3 -> float64
df.dtypes

Var1    category
Var2    category
Var3     float64
dtype: object

In [14]:
# Split the column labels by dtype so each group can be routed to its own
# preprocessing branch.
numeric_features = df.select_dtypes(include='number').columns
categorical_features = df.select_dtypes(include='category').columns

print('Numeric features: ', numeric_features)
print('Categorical features: ', categorical_features)

Numeric features:  Index(['Var3'], dtype='object')
Categorical features:  Index(['Var1', 'Var2'], dtype='object')


In [15]:
# Numeric branch: standardise the values.
# If the data had missing values, an imputation step such as
# ("imputer", SimpleImputer(strategy="median")) could be placed first
# (SimpleImputer would have to be imported from sklearn.impute).
numeric_transformer = Pipeline([
    ("scaler", StandardScaler()),
])

# Categorical branch: dense integer one-hot encoding without the first level.
# An optional first step for missing values would be
# ("imputer", SimpleImputer(strategy="constant", fill_value="missing")).
categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(drop='first', sparse_output=False, dtype=np.int64)),
])

# Route each column group to its branch; any column not listed is passed
# through unchanged.
col_transformer = ColumnTransformer(
    [
        ("numeric", numeric_transformer, numeric_features),
        ("categorical", categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

# Ask the transformer to return pandas DataFrames rather than NumPy arrays.
col_transformer = col_transformer.set_output(transform='pandas')



In [16]:
# Render estimators as HTML diagrams, then display the (still unfitted)
# column transformer. `set_config` is imported with the other scikit-learn
# names in the import cell at the top of the notebook.
set_config(display='diagram')
col_transformer

In [17]:
# Fit both branches on df and show the transformed frame. It is a pandas
# DataFrame because of set_output(transform='pandas'), and its columns are
# prefixed with the transformer name, e.g. numeric__Var3, categorical__Var1_2.
col_transformer.fit_transform(df)

Unnamed: 0,numeric__Var3,categorical__Var1_2,categorical__Var1_3,categorical__Var2_2
0,1.939045,1,0,0
1,0.543751,0,0,1
2,0.192882,0,0,1
3,-1.829694,1,0,0
4,-0.192943,1,0,0
5,-0.272784,0,0,1
6,0.007919,0,0,0
7,-0.553719,1,0,1
8,0.048086,0,0,0
9,-0.399153,1,0,1
