In [1]:
import warnings
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder

# Silence every warning for the rest of the session through the standard
# filter API instead of overwriting ``warnings.warn``. The stdlib function
# stays intact for any other code that relies on it, and warnings raised from
# C extensions (which never look up the Python-level ``warnings.warn``
# attribute) are suppressed as well.
warnings.filterwarnings("ignore")

In [2]:
# Load the seaborn "tips" dataset; the `total_bill` and `tip` columns are the
# clustering features and the `day` column provides the reference labels.
tips = sns.load_dataset('tips')
X = tips[['total_bill', 'tip']]
y = tips['day']

In [3]:
# Exercise E22.1
# Rescale the features with MinMaxScaler and cluster them with K-means.
# A fixed random_state makes the centroid initialisation, and therefore the
# printed scores, repeatable from one run to the next.
pipe = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('kmeans', KMeans(random_state=42))
])
y_pred = pipe.fit_predict(X)
print(adjusted_rand_score(y, y_pred))

# Same pipeline, refitted with five clusters instead of the KMeans default.
pipe.set_params(kmeans__n_clusters=5)
y_pred = pipe.fit_predict(X)
# The adjusted Rand index is symmetric; the (reference, prediction) order is
# used here only for consistency with the call above.
print(adjusted_rand_score(y, y_pred))

0.0012905855758142501
0.009089589270701748


In [4]:
# Exercise E22.2
df = pd.read_csv('../../data/titanic.csv')

# Numeric columns: fill missing values (SimpleImputer default strategy), then
# rescale with MinMaxScaler.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', MinMaxScaler())
])

# Categorical columns: ordinal-encode, then impute the codes.
# The imputer uses the most frequent code: the default 'mean' strategy would
# fill gaps with a fractional value that corresponds to no category.
# The first step is named 'scaler' although it holds the encoder; the name is
# kept so that existing parameter paths keep working.
categorical_transformer = Pipeline([
    ('scaler', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Route each column group through its own sub-pipeline; every column that is
# not listed is dropped.
ct = ColumnTransformer(
    [('num_transf', numerical_transformer, ['Age', 'Fare']),
    ('cat_transf', categorical_transformer, ['Survived', 'Pclass', 'Sex', 'Embarked'])],
    remainder='drop'
)

# Preprocessing followed by K-means; random_state pins the centroid
# initialisation so the labels are repeatable across runs.
complex_pipe = Pipeline([
    ('transformer', ct),
    ('kmeans', KMeans(random_state=42))
])

labels = complex_pipe.fit_predict(df)

In [5]:
# Exercise E22.3
# Grid over the number of clusters and the K-means algorithm variant.
param_grid_pipe = {
    'kmeans__n_clusters': list(range(3, 9)),
    'kmeans__algorithm': ['lloyd', 'elkan']
}
search = GridSearchCV(
    pipe,
    param_grid_pipe,
    scoring='adjusted_rand_score',
    n_jobs=-1,
    cv=10)
# 'adjusted_rand_score' is a supervised metric: the scorer compares the
# predicted cluster ids with reference labels, so `y` has to be passed to
# fit. Without it no candidate can be scored and the search result is
# meaningless.
search.fit(X.values, y)

In [6]:
# Exercise E22.4
fs = VarianceThreshold()

# Numeric columns: impute, rescale with MinMaxScaler, then drop low-variance
# columns with the selector defined above.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', MinMaxScaler()),
    ('selector', fs)
])

# Categorical columns: ordinal-encode, then impute the codes.
# The imputer uses the most frequent code: the default 'mean' strategy would
# fill gaps with a fractional value that corresponds to no category.
# The first step is named 'scaler' although it holds the encoder; the name is
# kept so that existing parameter paths keep working.
categorical_transformer = Pipeline([
    ('scaler', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Route each column group through its own sub-pipeline; every column that is
# not listed is dropped.
ct = ColumnTransformer(
    [('num_transf', numerical_transformer, ['Age', 'Fare']),
    ('cat_transf', categorical_transformer, ['Survived', 'Pclass', 'Sex', 'Embarked'])],
    remainder='drop'
)

# random_state pins the K-means initialisation so the search is repeatable.
very_complex_pipe = Pipeline([
    ('transformer', ct),
    ('kmeans', KMeans(random_state=42))
])

# NOTE(review): the numeric columns are min-max scaled before selection, so
# their variances cannot exceed 0.25. Check on the real data that a threshold
# of 0.05 still leaves at least one numeric column, otherwise the selector
# cannot be fitted for that candidate.
param_grid_very_complex_pipe = {
    'transformer__num_transf__selector__threshold': [0.0, 0.05],
    'kmeans__n_clusters': list(range(3, 9)),
}

very_complex_search = GridSearchCV(
    very_complex_pipe,
    param_grid_very_complex_pipe,
    scoring='adjusted_rand_score',
    n_jobs=-1,
    cv=10)
# 'adjusted_rand_score' is a supervised metric, so reference labels must be
# passed to fit; without them no candidate can be scored.
# NOTE(review): 'Survived' is assumed to be the intended reference labelling.
# It is also one of the clustering features, which makes the score
# optimistic - confirm the target, or remove it from the feature list.
very_complex_search.fit(df, df['Survived'])