In [1]:
import pandas as pd

In [3]:
from sklearn.pipeline import Pipeline, make_pipeline

In [4]:
from sklearn.compose import ColumnTransformer, make_column_selector

In [5]:
from sklearn.impute import SimpleImputer

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
from sklearn.decomposition import PCA

In [9]:
from sklearn.feature_selection import SelectKBest, f_classif

In [10]:
df = pd.read_csv("sample_dataset.csv")

In [12]:
X = df.iloc[:,0:-1]
y = df.iloc[:,-1]

# Transformation of numerical and categorical columns

**Numerical variables:**
* Blank filling with median value
* Standardization

**Categorical variables**
* Blank filling with most frequent value
* One-hot encoding

In [15]:
transformer = ColumnTransformer([
    ('numerical', make_pipeline(SimpleImputer(strategy='median'), StandardScaler()), make_column_selector(dtype_exclude="object")),
    ('categorical', make_pipeline(SimpleImputer(strategy="most_frequent"), OneHotEncoder(sparse=False)), make_column_selector(dtype_include="object"))
])

# PCA

In [16]:
pca = PCA(n_components = 10)

# Feature selection

In [17]:
selector = SelectKBest(f_classif, k = 5)

# Pipeline

In [18]:
pipeline = Pipeline([
    ('transformation', transformer),
    ('pca', pca),
    ('feature_selection', selector)
])

In [19]:
pipeline.fit_transform(X,y)

array([[ 8.52256693,  2.64397025, -1.57565929, -3.52420932, -2.60947471],
       [ 2.7962396 , -3.89824876,  0.10428628, -1.61475964, -0.15809814],
       [ 4.56985298, -1.18416339, -0.23153196, -0.95077724,  0.12671358],
       ...,
       [ 1.05053101, -2.22225349,  1.11959769,  2.06978343,  1.97774725],
       [10.21620874,  0.39525183, -2.47260956,  1.09079088, -0.72399179],
       [-5.32259519, -0.24553613,  1.22274009,  1.40540407,  0.49477953]])

In [20]:
X1 = transformer.fit_transform(X)

In [21]:
X2 = pca.fit_transform(X1)

In [22]:
X3 = selector.fit_transform(X2,y)

In [23]:
X3

array([[ 8.52256712,  2.64396851, -1.57561261, -3.52446951, -2.60845348],
       [ 2.79623957, -3.8982489 ,  0.1042695 , -1.61475386, -0.15768526],
       [ 4.56985322, -1.18416374, -0.23150433, -0.95100719,  0.12842975],
       ...,
       [ 1.05053102, -2.22225157,  1.11957651,  2.06979884,  1.97775598],
       [10.21620891,  0.39525334, -2.47257087,  1.09064114, -0.72455767],
       [-5.32259524, -0.24553994,  1.22278056,  1.40553737,  0.49408014]])

Change PCA components to 15 and feature selection variables to 3

In [24]:
pipeline.set_params(pca__n_components = 15, feature_selection__k = 3)

Pipeline(steps=[('transformation',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x1CC94280>),
                                                 ('categorical',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   On

In [25]:
pipeline.fit_transform(X,y)

array([[ 8.52256695,  2.6439705 , -1.57565811],
       [ 2.79623958, -3.89824778,  0.10426844],
       [ 4.56985297, -1.18416163, -0.23154453],
       ...,
       [ 1.05053098, -2.22225238,  1.11958246],
       [10.2162088 ,  0.39525014, -2.47257541],
       [-5.32259514, -0.24553979,  1.22278872]])

Change the numerical cleaning strategy to 'mean'

In [28]:
pipeline.set_params(transformation__numerical__simpleimputer__strategy = 'mean')

Pipeline(steps=[('transformation',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x1CC94280>),
                                                 ('categorical',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(spars

In [29]:
pipeline.fit_transform(X,y)

array([[ 8.53475992e+00,  2.61353433e+00, -1.49046227e+00],
       [ 2.73326665e+00, -3.71106369e+00, -1.19848923e-03],
       [ 4.63649132e+00, -1.19050135e+00, -2.87697660e-01],
       ...,
       [ 1.08873882e+00, -2.15395818e+00,  1.16642212e+00],
       [ 1.01791411e+01,  5.24430864e-01, -2.43315954e+00],
       [-5.31069498e+00, -2.64129075e-01,  1.39741301e+00]])