In [10]:
import warnings
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder

In [11]:
tips = sns.load_dataset('tips')
X = tips.loc[:, ('total_bill', 'tip')]
y = tips.loc[:, ('day')]

**Esercizio 4.5.1**

In [12]:
# Esercizio E22.1
pipe = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('kmeans', KMeans(n_init='auto'))
])
y_pred = pipe.fit_predict(X)
print(f'ARI con n_clusters=8: {round(adjusted_rand_score(y, y_pred), 5)}')

pipe.set_params(kmeans__n_clusters=5)
y_pred = pipe.fit_predict(X)
print(f'ARI con n_clusters=5: {round(adjusted_rand_score(y, y_pred), 5)}')

ARI con n_clusters=8: 0.00157
ARI con n_clusters=5: 0.00397


**Esercizio 4.5.2**

In [17]:
titanic = sns.load_dataset('titanic')

numerical_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', MinMaxScaler())
])

categorical_transformer = Pipeline([
    ('scaler', OrdinalEncoder()),
    ('imputer', SimpleImputer())
])

ct = ColumnTransformer(
    [('num_transf', numerical_transformer, ['age', 'fare']),
    ('cat_transf', categorical_transformer, ['survived', 'pclass', 'sex', 'embarked'])],
    remainder='drop'
)

complex_pipe = Pipeline([
    ('transformer', ct),
    ('kmeans', KMeans(n_init='auto'))
])

labels = complex_pipe.fit_predict(titanic)

array([1, 4, 5, 3, 1, 1, 3, 1, 5, 7, 5, 3, 1, 1, 5, 2, 1, 6, 5, 7, 6, 6,
       7, 3, 5, 5, 0, 3, 7, 1, 0, 4, 7, 6, 0, 3, 7, 1, 5, 7, 5, 2, 0, 7,
       7, 1, 1, 7, 0, 5, 1, 1, 4, 2, 0, 3, 2, 0, 2, 1, 0, 3, 3, 1, 0, 7,
       2, 1, 5, 1, 6, 5, 6, 0, 1, 1, 1, 1, 6, 5, 1, 1, 7, 3, 2, 5, 1, 1,
       3, 1, 1, 1, 3, 1, 1, 1, 0, 4, 2, 6, 5, 1, 3, 1, 1, 1, 5, 1, 1, 7,
       3, 7, 1, 5, 7, 1, 1, 6, 0, 5, 6, 1, 0, 2, 3, 7, 1, 1, 7, 1, 0, 1,
       5, 2, 6, 0, 3, 3, 1, 0, 7, 5, 5, 1, 6, 6, 1, 5, 6, 6, 6, 3, 1, 1,
       1, 0, 7, 1, 1, 1, 1, 2, 1, 1, 1, 1, 3, 5, 3, 1, 3, 1, 5, 1, 0, 1,
       1, 4, 6, 1, 5, 0, 1, 6, 5, 3, 7, 3, 1, 1, 2, 6, 5, 6, 4, 4, 1, 1,
       7, 2, 1, 1, 1, 0, 1, 5, 1, 7, 7, 4, 1, 2, 1, 6, 1, 4, 5, 6, 4, 6,
       1, 6, 1, 1, 3, 1, 6, 1, 6, 5, 3, 1, 6, 5, 6, 5, 6, 2, 6, 6, 7, 7,
       6, 1, 0, 3, 5, 2, 3, 6, 1, 5, 3, 1, 5, 7, 4, 3, 4, 2, 1, 1, 3, 3,
       5, 6, 1, 1, 3, 3, 3, 1, 2, 0, 7, 3, 5, 6, 1, 5, 1, 1, 1, 1, 3, 0,
       1, 1, 6, 7, 3, 4, 0, 5, 1, 0, 0, 3, 3, 4, 7,

**Esercizio 4.5.3**

In [15]:
param_grid_pipe = {
    'kmeans__n_clusters': list(range(3, 9)),
    'kmeans__algorithm': ['lloyd', 'elkan']
}
search = GridSearchCV(
    pipe,
    param_grid_pipe,
    scoring='adjusted_rand_score',
    n_jobs=-1,
    cv=10).fit(X.values)



**Esercizio 4.5.4**

In [16]:
fs = VarianceThreshold()

numerical_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', MinMaxScaler()),
    ('selector', VarianceThreshold())
])

categorical_transformer = Pipeline([
    ('scaler', OrdinalEncoder()),
    ('imputer', SimpleImputer())
])

ct = ColumnTransformer(
    [('num_transf', numerical_transformer, ['age', 'fare']),
    ('cat_transf', categorical_transformer, ['survived', 'pclass', 'sex', 'embarked'])],
    remainder='drop'
)

very_complex_pipe = Pipeline([
    ('transformer', ct),
    ('kmeans', KMeans())
])

param_grid_very_complex_pipe = {
    'transformer__num_transf__selector__threshold': [0.0, 0.05],
    'kmeans__n_clusters': list(range(3, 9)),
}

very_complex_search = GridSearchCV(
    very_complex_pipe,
    param_grid_very_complex_pipe,
    scoring='adjusted_rand_score',
    n_jobs=-1,
    cv=10).fit(titanic)

60 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Angelo\.virtualenvs\pcs-exercises-v7kX8Rwt\lib\site-packages\sklearn\model_selection\_validation.py", line 684, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "c:\Users\Angelo\.virtualenvs\pcs-exercises-v7kX8Rwt\lib\site-packages\sklearn\pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\Users\Angelo\.virtualenvs\pcs-exercises-v7kX8Rwt\lib\site-packages\sklearn\pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\Angelo\.virtualenvs\pcs-exercises-v7kX8Rwt\lib