In [1]:
import pandas as pd

In [2]:
from sklearn.impute import SimpleImputer

In [3]:
from sklearn.compose import ColumnTransformer, make_column_selector

In [14]:
# Load the sample dataset from a CSV located relative to the working directory.
df = pd.read_csv(filepath_or_buffer="sample_dataset.csv")

In [17]:
# Preview the first five rows, transposed so that each column of the frame is
# shown as a row (easier to scan when there are many columns).
df.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
mean radius,,20.57,19.69,11.42,20.29
mean texture,10.38,17.77,21.25,20.38,14.34
mean perimeter,122.8,132.9,130.0,77.58,
mean area,1001.0,1326.0,1203.0,386.1,
mean smoothness,0.1184,,0.1096,0.1425,
mean compactness,0.2776,,0.1599,0.2839,0.1328
mean concavity,0.3001,0.0869,,0.2414,0.198
mean concave points,0.1471,0.07017,,,
mean symmetry,0.2419,,,0.2597,0.1809
mean fractal dimension,0.07871,0.05667,0.05999,0.09744,


In [20]:
# Split the column labels by dtype: every column whose dtype is not `object`
# is treated as numerical, and every `object` column as categorical.
# NOTE(review): the dtype filter also picks up the `target` column (it appears
# in the displayed `numerical` Index), so it is imputed together with the
# features downstream — confirm that this is intended.
numerical, categorical = (
    df.select_dtypes(exclude=["object"]).columns,
    df.select_dtypes(include=["object"]).columns,
)


In [21]:
# Display the labels of the columns selected as numerical (non-object dtype).
numerical

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'smoothness error',
       'compactness error', 'concavity error', 'concave points error',
       'symmetry error', 'fractal dimension error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension', 'target'],
      dtype='object')

In [22]:
# Display the labels of the columns selected as categorical (object dtype).
categorical

Index(['area error'], dtype='object')

# Using ColumnTransformer

In [23]:
# Impute each column group with its own strategy:
#   * numerical columns   -> replaced by the column mean
#   * categorical columns -> replaced by the most frequent value
# `remainder` is not passed, so the ColumnTransformer default applies.
numeric_step = ("numerical_transformer", SimpleImputer(strategy="mean"), numerical)
categorical_step = (
    "categorical_transformer",
    SimpleImputer(strategy="most_frequent"),
    categorical,
)
cleaner = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [24]:
# Fit the imputers on `df` and return the imputed data in one call. The result
# is a single array holding both the numeric and the string columns, which is
# why it is reported with dtype=object in the output.
cleaner.fit_transform(X=df)

array([[14.059547717842323, 10.38, 122.8, ..., 0.1189, 0.0, 'A'],
       [20.57, 17.77, 132.9, ..., 0.08436317021276594, 0.0, 'A'],
       [19.69, 21.25, 130.0, ..., 0.08758, 0.0, 'A'],
       ...,
       [16.6, 28.08, 108.3, ..., 0.0782, 0.0, 'A'],
       [20.6, 29.33, 140.1, ..., 0.124, 0.0, 'A'],
       [7.76, 19.311829268292684, 47.92, ..., 0.07039, 1.0, 'A']],
      shape=(569, 31), dtype=object)

In [25]:
# Columns can also be given by integer position: mean-impute the columns at
# positions 0, 1 and 2, and mode-impute the categorical columns by label.
leading_positions = [0, 1, 2]
cleaner = ColumnTransformer(
    transformers=[
        ("numerical_transformer", SimpleImputer(strategy="mean"), leading_positions),
        ("categorical_transformer", SimpleImputer(strategy="most_frequent"), categorical),
    ]
)

# Columns that no step selects are absent from the result, so only the three
# positional columns plus the categorical one come back.
cleaner.fit_transform(X=df)

array([[14.059547717842323, 10.38, 122.8, 'A'],
       [20.57, 17.77, 132.9, 'A'],
       [19.69, 21.25, 130.0, 'A'],
       ...,
       [16.6, 28.08, 108.3, 'A'],
       [20.6, 29.33, 140.1, 'A'],
       [7.76, 19.311829268292684, 47.92, 'A']],
      shape=(569, 4), dtype=object)

In [26]:
# Same column selection as a positional/label mix, with the handling of the
# unselected columns spelled out: remainder="drop" discards every column that
# is not listed in a step.
leading_positions = [0, 1, 2]
numeric_step = ("numerical_transformer", SimpleImputer(strategy="mean"), leading_positions)
categorical_step = (
    "categorical_transformer",
    SimpleImputer(strategy="most_frequent"),
    categorical,
)
cleaner = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder="drop",
)

# Only the three positional columns and the categorical column are returned.
cleaner.fit_transform(X=df)

array([[14.059547717842323, 10.38, 122.8, 'A'],
       [20.57, 17.77, 132.9, 'A'],
       [19.69, 21.25, 130.0, 'A'],
       ...,
       [16.6, 28.08, 108.3, 'A'],
       [20.6, 29.33, 140.1, 'A'],
       [7.76, 19.311829268292684, 47.92, 'A']],
      shape=(569, 4), dtype=object)

# make_column_selector

In [27]:
# Instead of precomputed label lists, hand ColumnTransformer two selectors
# that choose columns by dtype from the data it is given.
# NOTE(review): excluding only `object` means the `target` column is selected
# by the numerical step as well — confirm that imputing it here is intended.
non_object_columns = make_column_selector(dtype_exclude="object")
object_columns = make_column_selector(dtype_include="object")

cleaner = ColumnTransformer(
    transformers=[
        ("numerical_transformer", SimpleImputer(strategy="mean"), non_object_columns),
        ("categorical_transformer", SimpleImputer(strategy="most_frequent"), object_columns),
    ],
    remainder="drop",
)

# Fit on `df` and return the imputed array (numeric columns first, then the
# object columns, following the order of the steps).
cleaner.fit_transform(X=df)

array([[14.059547717842323, 10.38, 122.8, ..., 0.1189, 0.0, 'A'],
       [20.57, 17.77, 132.9, ..., 0.08436317021276594, 0.0, 'A'],
       [19.69, 21.25, 130.0, ..., 0.08758, 0.0, 'A'],
       ...,
       [16.6, 28.08, 108.3, ..., 0.0782, 0.0, 'A'],
       [20.6, 29.33, 140.1, ..., 0.124, 0.0, 'A'],
       [7.76, 19.311829268292684, 47.92, ..., 0.07039, 1.0, 'A']],
      shape=(569, 31), dtype=object)