# Exercise 1

* Load **sample_dataset.csv**
* Replace the missing values in the categorical variables with "N"
* Replace the missing values in the numerical variables with the column mean

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector

In [2]:
# Load the exercise dataset; the relative path is resolved against the
# current working directory.
df = pd.read_csv("sample_dataset.csv")

In [3]:
# Route columns by dtype: every non-object column is mean-imputed, while
# object columns get the constant "N" in place of missing entries.
# Columns are emitted in transformer order (numerical first, then categorical).
cleaner = ColumnTransformer(
    transformers=[
        (
            "numerical",
            SimpleImputer(strategy="mean"),
            make_column_selector(dtype_exclude="object"),
        ),
        (
            "categorical",
            SimpleImputer(strategy="constant", fill_value="N"),
            make_column_selector(dtype_include="object"),
        ),
    ]
)

In [4]:
# Fit the imputers on the full frame and preview the first 20 transformed rows.
cleaner.fit_transform(df)[:20]

array([[14.059547717842323, 10.38, 122.8, 1001.0, 0.1184, 0.2776, 0.3001,
        0.1471, 0.2419, 0.07871, 1.095, 1.2096921658986175, 8.589,
        0.006399, 0.025096302631578946, 0.03124424538258575, 0.01587,
        0.03003, 0.006193, 25.38, 17.33, 107.32284782608694, 2019.0,
        0.1622, 0.6656, 0.7119, 0.2654, 0.4601, 0.1189, 0.0, 'A'],
       [20.57, 17.77, 132.9, 1326.0, 0.09715625, 0.10453064583333332,
        0.0869, 0.07017, 0.18140467091295118, 0.05667, 0.5435, 0.7339,
        3.398, 0.005225, 0.01308, 0.03124424538258575, 0.0134,
        0.020561043121149897, 0.003532, 24.99, 23.41, 158.8, 1956.0,
        0.1238, 0.1866, 0.2416, 0.186, 0.275, 0.08436317021276594, 0.0,
        'A'],
       [19.69, 21.25, 130.0, 1203.0, 0.1096, 0.1599, 0.09406254601366744,
        0.04911533769633508, 0.18140467091295118, 0.05999, 0.7456,
        0.7869, 4.585, 0.00615, 0.04006, 0.03832, 0.02058,
        0.020561043121149897, 0.004571, 23.57, 25.53, 107.32284782608694,
        1709.0, 0.14

# Exercise 2

* Load **sample_dataset.csv**
* Replace the missing values in the float variables using KNN imputation with 10 neighbors and distance-based weights
* Replace the missing values in the categorical variables with the most frequent value

In [5]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer, make_column_selector

In [6]:
# Read the dataset from the CSV file again for this exercise; the relative
# path is resolved against the current working directory.
df = pd.read_csv("sample_dataset.csv")

In [9]:
# float64 columns are KNN-imputed (10 neighbours, distance-weighted); every
# column that is NOT float64 is filled with its most frequent value.
# Columns are emitted in transformer order (float block first, then the rest).
cleaner = ColumnTransformer(
    transformers=[
        (
            "float_variables",
            KNNImputer(n_neighbors=10, weights="distance"),
            make_column_selector(dtype_include="float64"),
        ),
        (
            "categorical",
            SimpleImputer(strategy="most_frequent"),
            make_column_selector(dtype_exclude="float64"),
        ),
    ]
)

In [10]:
# Fit the imputers on the full frame and preview the first 20 transformed rows.
cleaner.fit_transform(df)[:20]

array([[15.637884267617215, 10.38, 122.8, 1001.0, 0.1184, 0.2776, 0.3001,
        0.1471, 0.2419, 0.07871, 1.095, 1.0448467128221952, 8.589,
        0.006399, 0.02197527611663749, 0.030693187915457644, 0.01587,
        0.03003, 0.006193, 25.38, 17.33, 123.36315600216004, 2019.0,
        0.1622, 0.6656, 0.7119, 0.2654, 0.4601, 0.1189, 'A', 0],
       [20.57, 17.77, 132.9, 1326.0, 0.10305396267747076,
        0.12811649985916518, 0.0869, 0.07017, 0.18111787742572352,
        0.05667, 0.5435, 0.7339, 3.398, 0.005225, 0.01308,
        0.03408077400986087, 0.0134, 0.017132636584381317, 0.003532,
        24.99, 23.41, 158.8, 1956.0, 0.1238, 0.1866, 0.2416, 0.186,
        0.275, 0.08216467870588637, 'A', 0],
       [19.69, 21.25, 130.0, 1203.0, 0.1096, 0.1599, 0.09508436537408958,
        0.06366090049192921, 0.18107639304126572, 0.05999, 0.7456,
        0.7869, 4.585, 0.00615, 0.04006, 0.03832, 0.02058,
        0.0202236810391123, 0.004571, 23.57, 25.53, 129.6530215057103,
        1709.0, 0.

In [11]:
# Inspect per-column dtypes: the make_column_selector calls above pick
# columns by dtype, so this shows which imputer each column is routed to.
df.dtypes

mean radius                float64
mean texture               float64
mean perimeter             float64
mean area                  float64
mean smoothness            float64
mean compactness           float64
mean concavity             float64
mean concave points        float64
mean symmetry              float64
mean fractal dimension     float64
radius error               float64
texture error              float64
perimeter error            float64
area error                  object
smoothness error           float64
compactness error          float64
concavity error            float64
concave points error       float64
symmetry error             float64
fractal dimension error    float64
worst radius               float64
worst texture              float64
worst perimeter            float64
worst area                 float64
worst smoothness           float64
worst compactness          float64
worst concavity            float64
worst concave points       float64
worst symmetry      