# Exercise 1

* Load **sample_dataset.csv** and select only the features: mean radius, area error, mean perimeter
* Apply the following transformations using ColumnTransformer and Pipeline:
    * Numerical features:
        * Cleaning using the mean value
        * Transformation using the Yeo-Johnson transformation
    * Categorical features:
        * Cleaning using the most frequent value
        * One-hot encoding with dense output

In [23]:
import pandas as pd

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder


In [24]:
# Read the CSV, then keep just the three columns the exercise asks for.
df = pd.read_csv("sample_dataset.csv")[['mean radius', 'area error', 'mean perimeter']]

In [25]:
# Numerical branch: fill missing values with the column mean, then apply a
# Yeo-Johnson power transform (spelled out, although it is the default method).
numerical_pipeline = Pipeline([
    ('cleaner', SimpleImputer(strategy = 'mean')),
    ('power', PowerTransformer(method = 'yeo-johnson'))
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old spelling; use whichever name this installation accepts
# so the cell runs on both old and current releases.
dense_output_kw = 'sparse_output' if 'sparse_output' in OneHotEncoder().get_params() else 'sparse'

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode into a dense (non-sparse) array.
categorical_pipeline = Pipeline([
    ('cleaner', SimpleImputer(strategy = 'most_frequent')),
    ('encoder', OneHotEncoder(**{dense_output_kw: False}))
])

# Route columns by dtype: every non-`object` column goes through the numerical
# branch, every `object` column through the categorical branch. The transformer
# and step names ('numerical', 'categorical', 'cleaner', ...) form the
# `set_params` paths, so they are kept unchanged.
transformer = ColumnTransformer([
    ('numerical', numerical_pipeline, make_column_selector(dtype_exclude="object")),
    ('categorical', categorical_pipeline, make_column_selector(dtype_include="object"))
])

In [26]:
# Fit both branches on df and transform it in a single pass, then display the
# resulting feature matrix.
exercise1_features = transformer.fit_transform(df)
exercise1_features

array([[ 0.14898925,  1.31305907,  1.        ,  0.        ,  0.        ],
       [ 1.77124867,  1.61013246,  1.        ,  0.        ,  0.        ],
       [ 1.59585762,  1.52800056,  1.        ,  0.        ,  0.        ],
       ...,
       [ 0.88394456,  0.82431697,  1.        ,  0.        ,  0.        ],
       [ 1.77704664,  1.80409019,  1.        ,  0.        ,  0.        ],
       [-2.82849658, -2.89045254,  1.        ,  0.        ,  0.        ]])

# Exercise 2

* Modify the transformations of the previous exercise according to these settings and using set_params:
    * Numerical features: change the cleaning value to the median value
    * Categorical features: change the cleaning value to the constant value 'N'

In [27]:
# Reconfigure the existing transformer in place: median imputation for the
# numerical branch, the constant 'N' for the categorical branch.
# Keys follow the <transformer>__<step>__<parameter> naming scheme.
imputer_overrides = {
    'numerical__cleaner__strategy': 'median',
    'categorical__cleaner__strategy': 'constant',
    'categorical__cleaner__fill_value': 'N',
}
transformer.set_params(**imputer_overrides)

ColumnTransformer(transformers=[('numerical',
                                 Pipeline(steps=[('cleaner',
                                                  SimpleImputer(strategy='median')),
                                                 ('power',
                                                  PowerTransformer())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x1CA1C238>),
                                ('categorical',
                                 Pipeline(steps=[('cleaner',
                                                  SimpleImputer(fill_value='N',
                                                                strategy='constant')),
                                                 ('encoder',
                                                  OneHotEncoder(sparse=False))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x1CA1C898>)])

In [28]:
# Refit with the updated imputation settings and display the new feature matrix.
exercise2_features = transformer.fit_transform(df)
exercise2_features

array([[-0.05595844,  1.33240916,  1.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.77695743,  1.61842447,  1.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.61194827,  1.53959676,  1.        ,  0.        ,  0.        ,
         0.        ],
       ...,
       [ 0.93017209,  0.85656353,  1.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.78239188,  1.80382232,  1.        ,  0.        ,  0.        ,
         0.        ],
       [-2.91245376, -2.95836499,  1.        ,  0.        ,  0.        ,
         0.        ]])