## 10.2 Segmentation data

In [None]:
# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
import pandas as pd
# Widen pandas' display limits: up to 500 rows and 500 columns, 70 characters wide.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 70)

In [None]:
import pandas as pd
# Load the segmentation data from a shortened URL (requires network access).
# NOTE(review): unlike the local-file load further down, no index_col is
# passed here -- confirm both loads produce the same columns and index.
seg_df = pd.read_csv('http://bit.ly/PMR-ch5')
seg_df.head()

In [None]:
# google.colab is provided by the Colab runtime; this cell is an alternative
# way to obtain the data file by uploading it interactively.
from google.colab import files

f = files.upload()

In [None]:
import pandas as pd

# Load the segmentation data from a local CSV, using the file's first column
# as the row index.
seg_df = pd.read_csv('segment_dataframe_Python_intro_Ch5.csv',
                     index_col=0)
seg_df.head()

In [None]:
# Encode gender as a boolean indicator column on the full data frame.
seg_df['is_female'] = seg_df['gender'].eq('female')
# Clustering input: every column except the known segment label and the raw
# gender string (now represented by is_female).
seg_sub = seg_df.drop(columns=['Segment', 'gender'])
seg_sub.head()

In [None]:
from sklearn import preprocessing

# Standardize every column of seg_sub. preprocessing.scale returns a bare
# array, so restore the column names AND the row index: reusing seg_sub's
# index keeps seg_sc row-aligned with seg_sub and seg_df even when the index
# read from the CSV is not simply 0..n-1.
seg_sc = pd.DataFrame(preprocessing.scale(seg_sub),
                      columns=seg_sub.columns,
                      index=seg_sub.index)
seg_sc.head()

### 10.3 Check function

In [None]:
# Summarize every variable within each known segment (pivot_table's default
# aggregation is the mean).
pd.pivot_table(seg_sub, index=seg_df.Segment)

In [None]:
import numpy as np

# Output not shown
# Per-segment mean and standard deviation of every variable. The aggregations
# are named by string: pandas already substitutes its own mean/std for
# np.mean/np.std and (from pandas 2.1) emits a FutureWarning when the NumPy
# callables are passed, so the strings keep the current behaviour explicitly.
pd.pivot_table(seg_sub, index=seg_df.Segment,
               aggfunc=['mean', 'std']).unstack()

In [None]:
# 95th percentile of each variable within each known segment, using a custom
# aggregation function.
pd.pivot_table(seg_sub, index=seg_df.Segment,
               aggfunc=lambda x: np.percentile(x, 95))

In [None]:
def check_clusters(data, labels):
  """Summarize each variable of ``data`` within each group in ``labels``.

  Args:
    data: DataFrame of observations.
    labels: group/cluster label for each row of ``data``.

  Returns:
    The DataFrame produced by ``pd.pivot_table`` with ``labels`` as the row
    index and pivot_table's default aggregation.
  """
  summary = pd.pivot_table(data=data, index=labels)
  return summary

# Try the helper on the known segment labels.
check_clusters(seg_sub, seg_df.Segment)

### 10.3.2 Hierarchical clustering and distances

In [None]:
# Vector of element-wise differences between two example points
np.array([1, 2, 3]) - np.array([2, 3, 2])

In [None]:
# Sum of the squared differences between the two points
np.sum((np.array([1, 2, 3]) - np.array([2, 3, 2]))**2)

In [None]:
# Square root of the sum of squared differences, i.e. the Euclidean distance
np.sqrt(np.sum((np.array([1, 2, 3]) - np.array([2, 3, 2]))**2))

In [None]:
from scipy.spatial import distance

# Same two points via scipy: pdist computes pairwise distances between rows
# (Euclidean by default).
distance.pdist([np.array([1, 2, 3]), np.array([2, 3, 2])])

In [None]:
# Pairwise distances among the first three scaled observations.
distance.pdist(seg_sc.iloc[:3])

In [None]:
# The same distances rearranged into a square, symmetric matrix.
distance.squareform(distance.pdist(seg_sc.iloc[:3]))

In [None]:
import matplotlib.pyplot as plt
# 'seaborn-white' was renamed to 'seaborn-v0_8-white' in matplotlib 3.6 and the
# old name was later removed, so use whichever name this installation offers.
_white_style = ('seaborn-v0_8-white'
                if 'seaborn-v0_8-white' in plt.style.available
                else 'seaborn-white')
plt.style.use(_white_style)

# Side-by-side histograms of all pairwise distances, computed from the
# unscaled data (left) and from the scaled data (right).
plt.figure(figsize=(10,5))
plt.subplot(1,2,1)
plt.hist(distance.pdist(seg_sub))
plt.title('Pairwise distances from unscaled data')
plt.xlabel('Distance')
plt.ylabel('Count')
plt.subplot(1,2,2)
plt.hist(distance.pdist(seg_sc))
plt.xlabel('Distance')
plt.ylabel('Count')
plt.title('Pairwise distances from scaled data')

In [None]:
from scipy.cluster import hierarchy

# Build the hierarchical clustering of the scaled data using Ward linkage.
linkages = hierarchy.linkage(seg_sc, method='ward')

In [None]:
# Draw the complete dendrogram for the scaled-data linkage.
hierarchy.dendrogram(linkages)
plt.show()

In [None]:
# Condensed view of the same tree: truncate_mode='lastp' with p=20 collapses
# the lower levels so the top of the hierarchy is readable.
hierarchy.dendrogram(linkages, orientation='top',
                     truncate_mode='lastp', p=20)
plt.show()

In [None]:
# Zoom in on the two ends of the full dendrogram: the left panel shows
# x in [0, 200], the right panel x in [2800, 3000]. Leaf labels are drawn
# unrotated (leaf_rotation=0).
plt.subplot(1,2,1)
hierarchy.dendrogram(linkages, leaf_rotation=0)
plt.xlim((0,200))
plt.subplot(1,2,2)
hierarchy.dendrogram(linkages, leaf_rotation=0)
plt.xlim((2800, 3000))
plt.show()

In [None]:
# Similar
# Dendrogram leaf ids are row positions in the clustered matrix, so select by
# position (.iloc) rather than by index label (.loc); the two only coincide
# when seg_sub's index is exactly 0..n-1.
seg_sub.iloc[[17, 51]]

In [None]:
# Dissimilar
# NOTE(review): the following cell (rows 17 and 163) is also labelled
# "Dissimilar"; confirm against the dendrogram whether this pair was meant as
# a second "Similar" example.
# Dendrogram leaf ids are row positions in the clustered matrix, so select by
# position (.iloc) rather than by index label (.loc).
seg_sub.iloc[[163, 88]]

In [None]:
# Dissimilar
# Dendrogram leaf ids are row positions in the clustered matrix, so select by
# position (.iloc) rather than by index label (.loc); the two only coincide
# when seg_sub's index is exactly 0..n-1.
seg_sub.iloc[[17, 163]]

In [None]:
# Cophenetic correlation between the dendrogram and the original pairwise
# distances; when the distances are supplied, cophenet returns it as the first
# element of its result.
hierarchy.cophenet(linkages, distance.pdist(seg_sc))[0]

In [None]:
# Not shown
# Same dendrogram with the branch-colouring threshold set to a linkage
# distance of 9.
hierarchy.dendrogram(linkages, color_threshold=9)
plt.show()

In [None]:
# Cut the tree into at most 4 flat clusters (criterion='maxclust'), then list
# (cluster id, number of members) pairs.
labels = hierarchy.fcluster(linkages, t=4, criterion='maxclust')
list(zip(*np.unique(labels, return_counts=True)))

In [None]:
# Summarize the unscaled variables within each hierarchical cluster.
check_clusters(seg_sub, labels)

In [None]:
# Repeat the Ward clustering on the unscaled data and draw its dendrogram.
linkages_unscaled = hierarchy.linkage(seg_sub, method='ward')
hierarchy.dendrogram(linkages_unscaled)
plt.show()

In [None]:
# Cut the unscaled tree into at most 3 flat clusters and summarize them.
labels_unscaled = hierarchy.fcluster(linkages_unscaled, t=3,
                                     criterion='maxclust')
check_clusters(seg_sub, labels_unscaled)

In [None]:
def cluster_plot_raw(x, y, labels):
  """Scatter-plot ``y`` against ``x`` with one series per cluster label.

  Draws on the current pyplot axes: one ``plt.scatter`` call (and therefore
  one legend entry) per distinct value in ``labels``, then labels the axes
  with the ``name`` attributes of ``x`` and ``y``.

  Args:
    x: pandas Series plotted on the horizontal axis.
    y: pandas Series plotted on the vertical axis.
    labels: cluster label for each observation, in the same row order as
      ``x`` and ``y`` (array, list, or Series).
  """
  # Compare on a plain array so the mask is positional. A labels Series whose
  # index differs from x/y would otherwise fail pandas' boolean-index
  # alignment when used to select from x and y.
  label_values = np.asarray(labels)
  for l in np.unique(label_values):
    idx = label_values == l
    plt.scatter(x[idx],
                y[idx],
                label=l)
  plt.legend()
  plt.xlabel(x.name)
  plt.ylabel(y.name)

In [None]:
# Age vs. income, grouped by the unscaled hierarchical clusters.
cluster_plot_raw(seg_sub.age, seg_sub.income, labels_unscaled)

### 10.3.3 Mean-based clustering: *k*-means

In [None]:
def check_clusters(data, labels):
  """Print cluster sizes and return a per-cluster summary of ``data``.

  Args:
    data: DataFrame of observations.
    labels: cluster label for each row of ``data``.

  Returns:
    The DataFrame produced by ``pd.pivot_table`` with ``labels`` as the row
    index and pivot_table's default aggregation.
  """
  cluster_ids, sizes = np.unique(labels, return_counts=True)
  # .tolist() converts to native Python scalars so the printed pairs read
  # "(1, 75)" instead of "(np.int64(1), np.int64(75))" under NumPy >= 2.
  print(list(zip(cluster_ids.tolist(), sizes.tolist())))

  return pd.pivot_table(data,
                        index=labels)

In [None]:
import numpy as np
from sklearn import cluster

# Seed NumPy's global RNG; k_means is called without random_state, so its
# initialization draws from this global state.
np.random.seed(536)
# n_init is pinned to 10, the historical default. Newer scikit-learn releases
# changed the default to 'auto', which would otherwise silently change the
# number of restarts (and potentially the clusters found).
centroids, labels, inertia = cluster.k_means(seg_sc, n_clusters=4,
                                             n_init=10)
check_clusters(seg_sub, labels)

In [None]:
# 4-cluster k-means on the unscaled data.
# NOTE(review): no random_state is passed, so the result depends on the global
# NumPy RNG state left by whatever ran before this cell.
# n_init is pinned to 10, the historical default; newer scikit-learn releases
# changed the default to 'auto'.
centroids, k_labels_unscaled4, inertia = cluster.k_means(seg_sub,
                                                         n_clusters=4,
                                                         n_init=10)
check_clusters(seg_sub, k_labels_unscaled4)

In [None]:
# 3-cluster k-means on the unscaled data.
# NOTE(review): no random_state is passed, so the result depends on the global
# NumPy RNG state left by whatever ran before this cell.
# n_init is pinned to 10, the historical default; newer scikit-learn releases
# changed the default to 'auto'.
centroids, k_labels_unscaled3, inertia = cluster.k_means(seg_sub,
                                                         n_clusters=3,
                                                         n_init=10)
check_clusters(seg_sub, k_labels_unscaled3)

In [None]:
import matplotlib.pyplot as plt
# Box plot of income grouped by the 4 unscaled k-means cluster labels.
seg_sub.boxplot(column='income', by=k_labels_unscaled4)
plt.xlabel('Cluster')
plt.ylabel('Income')
plt.suptitle('') # Remove cluster id subtitle

In [None]:
from sklearn import decomposition
from matplotlib import cm

def cluster_plot(data_df, labels):
  """Plot the clusters on the first two principal components of the data.

  The data is standardized with ``preprocessing.scale``, projected with a
  full-SVD PCA, and drawn on the current pyplot axes with one scatter series
  (and legend entry) per distinct label. The title reports the share of
  variance explained by the first two components.

  Args:
    data_df: observations to project (passed to ``preprocessing.scale``).
    labels: cluster label for each row of ``data_df``.
  """
  pca = decomposition.PCA(random_state=132, svd_solver='full')
  standardized = preprocessing.scale(data_df)
  projected = pca.fit_transform(standardized)
  for cluster_id in np.unique(labels):
    # Row positions of the observations belonging to this cluster.
    members = np.where(labels == cluster_id)[0]
    first_pc = projected[members, 0]
    second_pc = projected[members, 1]
    plt.scatter(first_pc, second_pc, label=cluster_id)
  plt.legend()
  explained_pct = round(100*pca.explained_variance_ratio_[:2].sum())
  plt.title('First two components explain {}% of the variance'
            .format(explained_pct))
  plt.xlabel('First principal component')
  plt.ylabel('Second principal component')

# Principal-component view of the 4 unscaled k-means clusters.
cluster_plot(seg_sub, k_labels_unscaled4)

### 10.3.5 Model-based clustering: Gaussian Mixture Models

In [None]:
from sklearn import mixture

# Fit a 4-component Gaussian mixture (full covariance, fixed random_state) to
# the unscaled data, assign each observation to a component, and show the
# fitted model's BIC.
gmm4 = mixture.GaussianMixture(n_components=4,
                              covariance_type='full',
                              random_state=323).fit(seg_sub)
gmm4_labels = gmm4.predict(seg_sub)
gmm4.bic(seg_sub)

In [None]:
# Summarize the unscaled variables within each of the 4 mixture components.
check_clusters(seg_sub, gmm4_labels)

In [None]:
# Fit full-covariance mixtures with 1 to 13 components on the unscaled data
# and plot each model's BIC against its number of components.
gmm_n_test = [mixture.GaussianMixture(n_components=n,
                                      covariance_type='full',
                                      random_state=323)
                       .fit(seg_sub) for n in range(1,14)]
plt.plot(range(1, 14), [g.bic(seg_sub) for g in gmm_n_test])
# Label the axes so the figure can be read on its own.
plt.xlabel('Number of components')
plt.ylabel('BIC')
plt.show()

In [None]:
# For every covariance structure, fit mixtures with 1 to 13 components on the
# unscaled data.
gmm_n_v_test = {v: [mixture.GaussianMixture(n_components=n,
                                        covariance_type=v,
                                        random_state=323)
                       .fit(seg_sub) for n in range(1,14)]
                for v in ['full', 'tied', 'diag', 'spherical']}
# BIC of each fitted model, keyed by covariance structure.
gmm_n_v_test_bic = {v: [g.bic(seg_sub) for g in m]
                    for v, m in gmm_n_v_test.items()}
# Index the BIC table by the number of components. With the default 0-based
# index the x-axis would read one lower than the actual component count.
pd.DataFrame(gmm_n_v_test_bic, index=range(1, 14)).plot()

In [None]:
# For every covariance structure, fit mixtures with 1 to 13 components on the
# scaled data.
gmm_n_v_test = {v: [mixture.GaussianMixture(n_components=n,
                                        covariance_type=v,
                                        random_state=323)
                       .fit(seg_sc) for n in range(1,14)]
                for v in ['full', 'tied', 'diag', 'spherical']}
# BIC of each fitted model, keyed by covariance structure.
gmm_n_v_test_bic = {v: [g.bic(seg_sc) for g in m]
                    for v, m in gmm_n_v_test.items()}
# Index the BIC table by the number of components. With the default 0-based
# index the x-axis would read one lower than the actual component count.
pd.DataFrame(gmm_n_v_test_bic, index=range(1, 14)).plot()

In [None]:
# Fit a 5-component Gaussian mixture with diagonal covariance to the unscaled
# data and assign each observation to a component.
gmm5 = mixture.GaussianMixture(n_components=5,
                               covariance_type='diag',
                               random_state=323).fit(seg_sub)
gmm5_labels = gmm5.predict(seg_sub)

In [None]:
# Summarize the unscaled variables within each of the 5 mixture components.
check_clusters(seg_sub, gmm5_labels)

In [None]:
# Age vs. income, grouped by the 5 mixture components.
cluster_plot_raw(seg_sub.age, seg_sub.income, gmm5_labels)

In [None]:
n_obs = seg_sub.shape[0]
# Plot subscribe against is_female, grouped by the 5 mixture components. Each
# value gets uniform random jitter in [0, 0.3) so points that share the same
# coordinates do not sit exactly on top of each other.
# NOTE(review): the jitter comes from NumPy's global RNG with no seed set in
# this cell, so exact point positions depend on what ran earlier.
cluster_plot_raw(seg_sub.subscribe + .3*np.random.rand(n_obs),
                 seg_sub.is_female + .3*np.random.rand(n_obs),
                 gmm5_labels)