# Exercise: Clustering

In this exercise you learn how clustering works by looking at some datasets with different clustering methods. 

**Note:** The code in this notebook might look quite scaring --- but no worries: you can simply run the code, look at the outputs and plots and in some cases use the sliders above the plots to vary the parameters. There is no need to understand every detail of code to solve the exercises.

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score

from ipywidgets import interact
import ipywidgets as widgets

import seaborn as sns

  import pandas.util.testing as tm


## Compare different clustering methods

In [2]:
import os
print(os.environ['DATASET_PATH'])
dataset_path = os.environ['DATASET_PATH']
#dataset_path = './data'

KeyError: 'DATASET_PATH'

In [None]:
df = pd.read_csv(dataset_path+'/cluster_data.csv')

def fit_and_plot_clustering(df, clusterer, print_number=False):
    df = df.copy()
    df['clusterid'] = clusterer.fit_predict(df[['x', 'y']])
    n_clusters = df['clusterid'].max() + 1
    n_outliers = np.sum(df['clusterid'] == -1)
    if print_number:
        print(f'number of clusters: {n_clusters}\nnumber of outliers: {n_outliers}')
    cmap = plt.get_cmap('Set1', n_clusters+1)
    ax = df.plot(kind='scatter', x='x', y='y', c='clusterid', cmap=cmap, s=80, colorbar=False)
    ax.grid()
    
style = {'description_width': '150px'}
layout = widgets.Layout(width='400px')

### DBSCAN

In [None]:
def plot_dbscan(eps = 0.3, min_samples = 10):
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    fit_and_plot_clustering(df, clusterer=dbscan, print_number=True)

i = interact(plot_dbscan 
            , eps=widgets.SelectionSlider(options=np.arange(0.1,1.1,0.1), description='Epsilon', layout=layout, style=style, disabled=False,continuous_update=False,orientation='horizontal',readout=True)
            , min_samples=widgets.SelectionSlider(options=range(1, 21), description='Minimum of samples', layout=layout, style=style, disabled=False,continuous_update=False,orientation='horizontal',readout=True)
        );

**Exercise:**
- What are the best parameters to cluster this dataset?

### KMeans

In [None]:
def plot_kmeans(k = 3):
    kmeans = KMeans(n_clusters=k)
    fit_and_plot_clustering(df, clusterer=kmeans)

i = interact(plot_kmeans 
            , k=widgets.SelectionSlider(options=range(2,20), description='Number of clusters (k)', layout=layout, style=style, disabled=False,continuous_update=False,orientation='horizontal',readout=True)
        );

#### Best `k` with silhouette value

In [None]:
sil_values = []
for k in range(2,20):
    kmeans = KMeans(n_clusters=k)
    clust = kmeans.fit_predict(df[['x', 'y']])
    sil_values.append(silhouette_score(df[['x', 'y']], clust))

fig, ax = plt.subplots()
ax.plot(range(2,20),sil_values)
ax.set_xlabel('k')
ax.set_xticks(range(2,20, 2))
ax.set_ylabel('Silhouette value');

**Exercise:**
- What is the best choice for `k`?
- Why is k-means not a good method for this dataset?

### Hierarchical clustering

In [None]:
def plot_hclust(n_clusters = 3, linkage = 'ward'):
    hclust = AgglomerativeClustering(n_clusters=n_clusters, linkage = linkage )
    fit_and_plot_clustering(df, clusterer=hclust)

style = {'description_width': '150px'}
layout = widgets.Layout(width='400px')
i = interact(plot_hclust 
            , n_clusters=widgets.SelectionSlider(options=range(2,10),description='Number of clusters', layout=layout, style=style, disabled=False,continuous_update=False,orientation='horizontal',readout=True)
            , linkage=widgets.Dropdown(options=['ward', 'complete', 'average', 'single'],description='Number of clusters', layout=layout, style=style, disabled=False,continuous_update=False,orientation='horizontal',readout=True)
        );

**Exercise:**
- What are the best parameters to cluster this dataset?

# The Effect of Scaling

In [None]:
df_scale = pd.read_csv(dataset_path+'/scale_cluster_data.csv')
scaler = StandardScaler()
scaler.fit(df_scale[['x', 'y']])
scaled_data = scaler.transform(df_scale[['x', 'y']])
df_scale['scaled_x'] = scaled_data[:,0]
df_scale['scaled_y'] = scaled_data[:,1]

In [None]:
def plot_cluster_scaling(scaler, eps = 0.3, min_samples = 10):
    scaled_values = scaler.fit_transform(df_scale[['x','y']])
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    
    clust = dbscan.fit_predict(scaled_values)
    n_clusters = np.max(clust) + 1
    n_outliers = np.sum(clust == -1)
    print(f'number of clusters: {n_clusters}\nnumber of outliers: {n_outliers}')
    
    plot_df = pd.DataFrame(scaled_values, columns=['x','y'])
    cmap = plt.get_cmap('Set1', n_clusters+1)
    ax = plot_df.plot(kind='scatter', x='x', y='y', c=clust, cmap=cmap, s=80, colorbar=False, figsize=(8,8))
    ax.grid()
    
    
i = interact(plot_cluster_scaling 
            , scaler=widgets.RadioButtons(options=[('None', FunctionTransformer(validate=False)), ('Z-Transform', StandardScaler()), ('Min-Max', MinMaxScaler())]
                                          ,description='Scaler', layout=layout, style=style
                                          ,disabled=False,continuous_update=False,orientation='horizontal',readout=True)
            , eps=widgets.SelectionSlider(options=np.arange(0.1,1.1,0.1), value=0.4, description='Epsilon', layout=layout, style=style, disabled=False,continuous_update=False,orientation='horizontal',readout=True)
            , min_samples=widgets.SelectionSlider(options=range(1, 21), value=10, description='Minimum of samples', layout=layout, style=style, disabled=False,continuous_update=False,orientation='horizontal',readout=True)
        );



**Exercise:**
- How does the scaling influence the clustering?
- How does the scaling affect the choice of the parameters?

# Semantical Evaluation: Customer Segmentation

Now we have a look at a "real" dataset and see how we can evaluate the different method and parameter choices.

Let's load the [customer dataset](https://www.kaggle.com/shwetabh123/mall-customers) ...

In [None]:
df = pd.read_csv(dataset_path+'/Mall_Customers.csv')
df.head()

... and prepare it for our purpose.

In [None]:
X = df[['Age','Annual Income (k$)','Spending Score (1-100)']].copy()
X.columns = ['Age', 'Income', 'Score']
X['Gender'] = df['Gender'] == 'Male'
X.astype(float).head()

## K-Means

In [None]:
cols = ['Age', 'Income', 'Score', 'Gender']
def make_kmeans(attributs=cols, scaler=MinMaxScaler(), pca=False, components=2, k=3, show='features'):
    attributs = list(attributs)
    scaled_values = scaler.fit_transform(X[attributs])
    
    components = min(components, len(attributs))
    if pca:
        values = PCA().fit_transform(scaled_values)[:, :components]
    else:
        values = scaled_values
    
    cluster = KMeans(n_clusters=k).fit_predict(values)
    
    if show == 'features':
        df_plot = X.astype(float).copy()
    elif show == 'values':
        df_plot = pd.DataFrame(values)
    
    df_plot['cluster'] = cluster
        
    sns.pairplot(df_plot, hue="cluster", diag_kind = 'hist', diag_kws={'alpha':0.5}
                 , vars=[c for c in df_plot.columns if c != 'cluster'] 
                )


style = {'description_width': '150px'}
layout = widgets.Layout(width='400px')
i = interact(make_kmeans 
            , attributs=widgets.SelectMultiple(options=cols, value=cols, rows=len(cols)
                                          ,description='Features', layout=layout, style=style
                                          ,disabled=False,continuous_update=False,orientation='horizontal',readout=True)
            , scaler=widgets.RadioButtons(options=[ ('Z-Transform', StandardScaler()), ('Min-Max', MinMaxScaler()), ('None', FunctionTransformer(validate=False))]
                                          ,description='Scaler', layout=layout, style=style
                                          ,disabled=False,continuous_update=False,orientation='horizontal',readout=True)
            , pca=widgets.RadioButtons(options=[('without', False), ('with', True)]
                                                  ,description='PCA'
                                                  , layout=layout, style=style
                                                  ,disabled=False,continuous_update=False,orientation='horizontal',readout=True)
            , components=widgets.SelectionSlider(options=range(1, 5), value=4
                                                ,description='PCA components'
                                                , layout=layout, style=style
                                                ,disabled=False,continuous_update=False,orientation='horizontal',readout=True)
            , k=widgets.SelectionSlider(options=range(2, 8)
                                                ,description='k'
                                                , layout=layout, style=style
                                                ,disabled=False,continuous_update=False,orientation='horizontal',readout=True)
            , show=widgets.RadioButtons(options=[('Features', 'features'), ('Values', 'values')]
                                                  ,description='Show'
                                                  , layout=layout, style=style
                                                  ,disabled=False,continuous_update=False,orientation='horizontal',readout=True)

            )

**Exercise:**
- Can we achieve better result with PCA?
- What happens if we remove features from the clustering?
- What is the best value for `k`?
- How would you name the identified clusters?

## DBSCAN

In [None]:
cols = ['Age', 'Income', 'Score', 'Gender']
def make_dbscan(attributs=cols, scaler=MinMaxScaler(), pca=False, components=2, eps=.5, min_samples=5, show='features'):
    attributs = list(attributs)
    scaled_values = scaler.fit_transform(X[attributs])
    
    components = min(components, len(attributs))
    if pca:
        values = PCA().fit_transform(scaled_values)[:, :components]
    else:
        values = scaled_values
    
    cluster = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(values)
    #cluster = [cm.Dark2.colors[c] for c in cluster]
    #print('Silhouette score:', silhouette_score(scaled_values, cluster))
    print('Found', 1+np.max(cluster), 'clusters with', np.sum(cluster == -1), 'outliers')
    
    if show == 'features':
        df_plot = X.astype(float).copy()
    elif show == 'values':
        df_plot = pd.DataFrame(values)
    
    df_plot['cluster'] = cluster
        
    sns.pairplot(df_plot, hue="cluster", diag_kind = 'hist', diag_kws={'alpha':0.5}
                 , vars=[c for c in df_plot.columns if c != 'cluster'] 
                )


style = {'description_width': '150px'}
layout = widgets.Layout(width='400px')
i = interact(make_dbscan 
            , attributs=widgets.SelectMultiple(options=cols, value=cols, rows=len(cols)
                                          ,description='Features', layout=layout, style=style
                                          ,disabled=False,continuous_update=False,orientation='horizontal',readout=True)
            , scaler=widgets.RadioButtons(options=[('Z-Transform', StandardScaler()), ('Min-Max', MinMaxScaler()), ('None', FunctionTransformer(validate=False))]
                                          ,description='Scaler', layout=layout, style=style
                                          ,disabled=False,continuous_update=False,orientation='horizontal',readout=True)
            , pca=widgets.RadioButtons(options=[('without', False), ('with', True)]
                                                  ,description='PCA'
                                                  , layout=layout, style=style
                                                  ,disabled=False,continuous_update=False,orientation='horizontal',readout=True)
            , components=widgets.SelectionSlider(options=range(1, 5), value=4
                                                ,description='PCA components'
                                                , layout=layout, style=style
                                                ,disabled=False,continuous_update=False,orientation='horizontal',readout=True)
            , eps=widgets.SelectionSlider(options=[.1, .25, .5, .6, .7, .8, .9, 1., 1.25, 1.5, 2.], value = .5
                                                ,description='Epsilon'
                                                , layout=layout, style=style
                                                ,disabled=False,continuous_update=False,orientation='horizontal',readout=True)
            , min_samples=widgets.SelectionSlider(options=range(1, 20), value=5
                                                ,description='Min. samples'
                                                , layout=layout, style=style
                                                ,disabled=False,continuous_update=False,orientation='horizontal',readout=True)
            , show=widgets.RadioButtons(options=[('Features', 'features'), ('Values', 'values')]
                                                  ,description='Show'
                                                  , layout=layout, style=style
                                                  ,disabled=False,continuous_update=False,orientation='horizontal',readout=True)

            )

**Exercise:**
- Does the clusters found by DBSCAN differ from the clusters found by k-means?
- How does the parameters `epsilon` and `min_samples` influence the clusters? What are good values?