In [26]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler
)

In [27]:
# Toy frame: col_a (numeric) and col_b (categorical) each contain missing
# values, while col_c is fully populated. col_b is cast to pandas' category
# dtype in the same expression that builds the frame.
df = pd.DataFrame(
    {
        'col_a': [1, 2, np.nan, 1, 2, 1.5, 1, np.nan],
        'col_b': ['cat', 'dog', np.nan, 'dog', 'dog', np.nan, 'dog', 'cat'],
        'col_c': [1, 3, 3, 3, 7, 3, 1, 3],
    }
).astype({'col_b': 'category'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   col_a   6 non-null      float64 
 1   col_b   6 non-null      category
 2   col_c   8 non-null      int64   
dtypes: category(1), float64(1), int64(1)
memory usage: 392.0 bytes


In [28]:
# Render the whole toy frame (only 8 rows) so the missing entries in col_a and
# col_b can be seen next to their row positions.
df

Unnamed: 0,col_a,col_b,col_c
0,1.0,cat,1
1,2.0,dog,3
2,,,3
3,1.0,dog,3
4,2.0,dog,7
5,1.5,,3
6,1.0,dog,1
7,,cat,3


In [29]:
# Positional hold-out split: the first five rows become the training set and
# the remaining rows the test set. Each part gets a fresh 0-based index.
df_test = df.iloc[5:].reset_index(drop=True)
df_train = df.iloc[:5].reset_index(drop=True)

In [30]:
# Group the column labels by dtype, using the training split as the reference,
# so each group can be sent to its own preprocessing branch.
cat_cols = df_train.select_dtypes(include='category').columns
num_cols = df_train.select_dtypes(include='number').columns

In [33]:
# Numeric branch: fill missing values with the column mean learned at fit time.
# No scaling step is active in this pipeline.
num_proc = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode while dropping the first level of each feature.
# NOTE(review): with handle_unknown='ignore' together with drop='first', a
# category unseen during fit is encoded as all zeros, which is the same encoding
# as the dropped first level -- confirm that this ambiguity is acceptable.
cat_proc = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first')),
])

# Apply each branch to its own column group; categorical outputs come first in
# the transformed result, followed by the numeric ones.
processor = ColumnTransformer([
    ('cat', cat_proc, cat_cols),
    ('num', num_proc, num_cols),
])

In [34]:
# Fit the preprocessing on the training rows only, then reuse the fitted
# processor unchanged on the test rows so both share the same imputation
# values and one-hot layout.
encode_train = processor.fit_transform(df_train)
encode_test = processor.transform(df_test)

# The output column names are identical for both splits; look them up once.
feature_names = processor.get_feature_names_out()
df_train_tf = pd.DataFrame(encode_train, columns=feature_names)
df_test_tf = pd.DataFrame(encode_test, columns=feature_names)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None".
df_train_tf.info()
display(df_train_tf)
df_test_tf.info()
display(df_test_tf)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   cat__col_b_dog  5 non-null      float64
 1   num__col_a      5 non-null      float64
 2   num__col_c      5 non-null      float64
dtypes: float64(3)
memory usage: 252.0 bytes


None

Unnamed: 0,cat__col_b_dog,num__col_a,num__col_c
0,0.0,1.0,1.0
1,1.0,2.0,3.0
2,1.0,1.5,3.0
3,1.0,1.0,3.0
4,1.0,2.0,7.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   cat__col_b_dog  3 non-null      float64
 1   num__col_a      3 non-null      float64
 2   num__col_c      3 non-null      float64
dtypes: float64(3)
memory usage: 204.0 bytes


None

Unnamed: 0,cat__col_b_dog,num__col_a,num__col_c
0,1.0,1.5,3.0
1,1.0,1.0,1.0
2,0.0,1.5,3.0
