In [1]:
import pandas as pd

In [2]:
from sklearn.impute import SimpleImputer

In [3]:
from sklearn.compose import ColumnTransformer, make_column_selector

In [4]:
# Load the sample dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv("sample_dataset.csv")

In [5]:
# Partition the column labels by dtype: every column that is not `object`
# goes to `numerical`; the `object` columns go to `categorical`.
# NOTE(review): the recorded output of `numerical` lists 'target' — confirm
# that column is really meant to be imputed together with the features.
numerical = df.select_dtypes(exclude="object").columns
categorical = df.select_dtypes(include="object").columns


In [7]:
# Same `object`-dtype selection as `categorical`, but taken through `.values`,
# so the recorded output is a NumPy array of labels rather than an Index.
test = df.select_dtypes(include='object').columns.values
test

array(['area error'], dtype=object)

In [7]:
# Show the column labels held in `numerical` (the non-object columns).
numerical

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'smoothness error',
       'compactness error', 'concavity error', 'concave points error',
       'symmetry error', 'fractal dimension error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension', 'target'],
      dtype='object')

In [8]:
# Show the column labels held in `categorical` (the object-dtype columns).
categorical

Index(['area error'], dtype='object')

# Using ColumnTransformer

In [9]:
# Impute each column group with its own strategy: the column mean for the
# labels in `numerical`, the most frequent value for those in `categorical`.
cleaner = ColumnTransformer(
    transformers=[
        ("numerical_transformer", SimpleImputer(strategy="mean"), numerical),
        ("categorical_transformer", SimpleImputer(strategy="most_frequent"), categorical),
    ]
)

In [10]:
# Fit both imputers on `df` and return the transformed values.
# The recorded output is a single object-dtype array: the numerical columns
# come first and the categorical column last, following the transformer order.
cleaner.fit_transform(df)

array([[14.059547717842323, 10.38, 122.8, ..., 0.1189, 0.0, 'A'],
       [20.57, 17.77, 132.9, ..., 0.08436317021276594, 0.0, 'A'],
       [19.69, 21.25, 130.0, ..., 0.08757999999999999, 0.0, 'A'],
       ...,
       [16.6, 28.08, 108.3, ..., 0.0782, 0.0, 'A'],
       [20.6, 29.33, 140.1, ..., 0.124, 0.0, 'A'],
       [7.76, 19.311829268292684, 47.92, ..., 0.07039, 1.0, 'A']],
      dtype=object)

In [11]:
# Columns can also be addressed by position: [0, 1, 2] selects the first three
# columns of `df`, while the categorical group is still selected by label.
cleaner = ColumnTransformer(
    transformers=[
        ("numerical_transformer", SimpleImputer(strategy="mean"), [0, 1, 2]),
        ("categorical_transformer", SimpleImputer(strategy="most_frequent"), categorical),
    ]
)

# Only the columns named in a transformer show up in the result
# (the recorded output has four columns).
cleaner.fit_transform(df)

array([[14.059547717842323, 10.38, 122.8, 'A'],
       [20.57, 17.77, 132.9, 'A'],
       [19.69, 21.25, 130.0, 'A'],
       ...,
       [16.6, 28.08, 108.3, 'A'],
       [20.6, 29.33, 140.1, 'A'],
       [7.76, 19.311829268292684, 47.92, 'A']], dtype=object)

In [13]:
# Same transformers as the positional example, with `remainder="drop"` spelled
# out: columns not claimed by any transformer are discarded. The recorded
# output is identical to the preceding cell's.
cleaner = ColumnTransformer(
    transformers=[
        ("numerical_transformer", SimpleImputer(strategy="mean"), [0, 1, 2]),
        ("categorical_transformer", SimpleImputer(strategy="most_frequent"), categorical),
    ],
    remainder="drop",
)

cleaner.fit_transform(df)

array([[14.059547717842323, 10.38, 122.8, 'A'],
       [20.57, 17.77, 132.9, 'A'],
       [19.69, 21.25, 130.0, 'A'],
       ...,
       [16.6, 28.08, 108.3, 'A'],
       [20.6, 29.33, 140.1, 'A'],
       [7.76, 19.311829268292684, 47.92, 'A']], dtype=object)

# make_column_selector

In [19]:
# Let ColumnTransformer choose the columns by dtype via make_column_selector
# instead of passing precomputed label lists: non-object columns get mean
# imputation, object columns get most-frequent imputation.
cleaner = ColumnTransformer(
    transformers=[
        (
            "numerical_transformer",
            SimpleImputer(strategy="mean"),
            make_column_selector(dtype_exclude="object"),
        ),
        (
            "categorical_transformer",
            SimpleImputer(strategy="most_frequent"),
            make_column_selector(dtype_include="object"),
        ),
    ],
    remainder="drop",
)

cleaner.fit_transform(df)

array([[14.059547717842323, 10.38, 122.8, ..., 0.1189, 0.0, 'A'],
       [20.57, 17.77, 132.9, ..., 0.08436317021276594, 0.0, 'A'],
       [19.69, 21.25, 130.0, ..., 0.08757999999999999, 0.0, 'A'],
       ...,
       [16.6, 28.08, 108.3, ..., 0.0782, 0.0, 'A'],
       [20.6, 29.33, 140.1, ..., 0.124, 0.0, 'A'],
       [7.76, 19.311829268292684, 47.92, ..., 0.07039, 1.0, 'A']],
      dtype=object)