# Preliminaries

## Libraries

In [None]:
import time
import itertools
from itertools import cycle

# Data Handling
import pandas as pd
import numpy as np
from scipy import linalg

from sklearn.preprocessing import scale, normalize, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

# Dimensionality Reduction
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Clustering
from sklearn.cluster import KMeans, MiniBatchKMeans, SpectralClustering, DBSCAN, OPTICS, cluster_optics_dbscan, AgglomerativeClustering, Birch
from scipy.cluster import hierarchy
import hdbscan
from kemlglearn.cluster.consensus import SimpleConsensusClustering
import openensembles as oe
# Not scalable with n_samples
from sklearn.cluster import MeanShift, AffinityPropagation
from sklearn.cluster import estimate_bandwidth
from sklearn.mixture import GaussianMixture

# Evaluation
from pyclustertend import hopkins, vat, ivat
from yellowbrick.cluster import KElbowVisualizer
from kneed import KneeLocator
import gapstat_rs
from gap_statistic import OptimalK
from sklearn import metrics
from sklearn.metrics import silhouette_score
from amltlearn.metrics.cluster import calinski_harabasz_score, davies_bouldin_score

# Visualization
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib import animation
import matplotlib.gridspec as gridspec
import mpl_toolkits.mplot3d.axes3d as p3
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from pylab import *
import plotly
import plotly.express as px
import plotly.graph_objs as go

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline 
# Use seaborn's large-font 'poster' context for all subsequent figures.
sns.set_context('poster')
# Remap matplotlib's single-letter colour codes to the seaborn palette.
sns.set_color_codes()
# Shared scatter styling consumed by plot_clusters (translucent, borderless markers).
plot_kwds = {'alpha' : 0.25, 's' : 80, 'linewidths':0}

## Functions

In [None]:
def plot_corr(df):
    """Draw a lower-triangle heatmap of the pairwise correlations of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated with ``DataFrame.corr()``.

    Notes
    -----
    The upper triangle (diagonal included) is masked because the matrix is
    symmetric. The colour scale is capped at ``vmax=.3`` and centred on 0.
    """
    corr = df.corr()
    # ``np.bool`` was removed in NumPy 1.24; the builtin ``bool`` is the
    # supported spelling of the same dtype.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    f, ax = plt.subplots(figsize=(11, 9))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# The "Gap Statistic" module (OptimalK) allows using any clustering algorithm.
# This function takes X (data), k (requested number of clusters) and func
# (a clustering estimator instance). It returns a tuple of the centroid
# locations and the labels assigned to X.

def special_clustering_func(X, k, func):
    """Fit the estimator ``func`` on ``X`` and return its centers and labels.

    User-defined clustering functions for OptimalK *must* take ``X`` and
    ``k``; any further arguments (here ``func``) can be passed with
    ``clusterer_kwargs`` when initializing OptimalK.

    Parameters
    ----------
    X : array-like
        Data to cluster.
    k : int
        Requested number of clusters. It is applied to ``func`` when the
        estimator exposes an ``n_clusters`` parameter; estimators without
        one (e.g. MeanShift) are fitted unchanged.
    func : estimator instance
        Object providing ``fit``, ``predict`` and ``cluster_centers_``.
        It is fitted in place.

    Returns
    -------
    tuple
        ``(cluster_centers, labels)`` for ``X``.
    """
    m = func

    # Without this, k was ignored and every candidate k produced the same
    # clustering, which makes a gap-statistic sweep over k meaningless.
    get_params = getattr(m, "get_params", None)
    if callable(get_params) and "n_clusters" in get_params():
        m.set_params(n_clusters=int(k))

    m.fit(X)

    # Return the location of each cluster center and the label of each point.
    return m.cluster_centers_, m.predict(X)

In [None]:
def prepare_pca(n_components, data, labels):
    """Project ``data`` with PCA and return the result as a labelled frame.

    Parameters
    ----------
    n_components : int, float or None
        Forwarded unchanged to ``PCA(n_components=...)``.
    data : array-like
        Input matrix passed to ``fit_transform``.
    labels : array-like
        Stored in the ``labels`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One column per component plus ``labels``. The first three
        components are named ``x``, ``y``, ``z``; any further ones are
        named ``dim_<index>``.
    """
    matrix = PCA(n_components=n_components, svd_solver='full').fit_transform(data)

    df_matrix = pd.DataFrame(matrix)
    # Name the columns from the actual output width: indexing a fixed
    # three-name list by range(n_components) raised IndexError beyond three
    # components and TypeError for a float or None n_components.
    axis_names = ['x', 'y', 'z']
    df_matrix.columns = [
        axis_names[i] if i < len(axis_names) else 'dim_{}'.format(i)
        for i in range(df_matrix.shape[1])
    ]
    df_matrix['labels'] = labels

    return df_matrix

In [None]:
def prepare_tsne(n_components, data, labels):
    """Reduce ``data`` with PCA, embed it with t-SNE, return a labelled frame.

    Parameters
    ----------
    n_components : int
        Dimension of the t-SNE embedding.
    data : array-like
        Input matrix; first reduced with a PCA that keeps 95% of the variance.
    labels : array-like
        Stored in the ``labels`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One column per embedding dimension plus ``labels``. The first three
        dimensions are named ``x``, ``y``, ``z``; any further ones are
        named ``dim_<index>``.
    """
    pca = PCA(.95, svd_solver='full')
    # Use the PCA built above. The original called the notebook-global
    # ``pca_n``, which raises NameError if that cell has not run and
    # otherwise silently applies a different PCA configuration.
    X_pca = pca.fit_transform(data)

    tsne = TSNE(n_components=n_components, verbose=0, perplexity=40, n_iter=300)
    matrix = tsne.fit_transform(X_pca)

    df_matrix = pd.DataFrame(matrix)
    # Name columns from the actual output width so this cannot index past
    # the three predefined axis names.
    axis_names = ['x', 'y', 'z']
    df_matrix.columns = [
        axis_names[i] if i < len(axis_names) else 'dim_{}'.format(i)
        for i in range(df_matrix.shape[1])
    ]
    df_matrix['labels'] = labels

    return df_matrix

# Preprocess Data

## Load Data

In [None]:
# Load the estimates table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
df = pd.read_pickle(r'2018_5yr_cendatagov_ESTIMATES_v3.pkl')

In [None]:
# (rows, columns) of the table as loaded
df.shape

## Null Values

In [None]:
# Drop, in place, every column that contains at least one null value
df.dropna(axis=1, how='any', inplace=True)

In [None]:
# (rows, columns) after removing the columns that had nulls
df.shape

## Preprocessing Steps

##### Sources

https://scikit-learn.org/stable/modules/preprocessing.html

https://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html#sphx-glr-auto-examples-preprocessing-plot-scaling-importance-py

Scaling features is a requirement for the optimal performance of many machine learning algorithms.

Use StandardScaler to help standardize the dataset’s features onto unit scale (mean = 0 and variance = 1).

Use the MinMaxScaler for feature scaling when we do not assume that the shape of all the features follows a normal distribution.

In [None]:
# Feature matrix: every column except the identifier columns
X_raw = df.loc[:, ~df.columns.isin(["GEOID", "GEO_ID", "NAME"])].to_numpy()

In [None]:
# Standardize each feature column to zero mean and unit variance.
# This X is replaced further down, once the data has been subsampled.
X = StandardScaler().fit_transform(X_raw)

### Subset Data

In [None]:
# Work on a random subset of at most 10000 rows.
# - n is capped at len(df): DataFrame.sample raises ValueError when asked
#   for more rows than exist (sampling without replacement).
# - random_state is fixed so re-running the notebook selects the same rows.
df = df.sample(n=min(10000, len(df)), axis=0, random_state=42).reset_index(drop=True)

In [None]:
# (rows, columns) of the subsample
df.shape

In [None]:
# Feature matrix of the subsample: every column except the identifiers
X_raw = df.loc[:, ~df.columns.isin(["GEOID", "GEO_ID", "NAME"])].to_numpy()

In [None]:
# Standardize the subsampled features (zero mean, unit variance per column);
# this is the X used by the rest of the notebook.
X = StandardScaler().fit_transform(X_raw)

# Exploratory Data Analysis

## Correlation Matrix

##### Sources

https://www.geeksforgeeks.org/multidimensional-data-analysis-in-python/

In [None]:
# Correlation heatmap of the feature columns (identifier columns excluded),
# with each coefficient annotated in its cell
feature_corr = df.loc[:, ~df.columns.isin(["GEOID", "GEO_ID", "NAME"])].corr()
sns.heatmap(feature_corr, annot=True)

# Render the figure
plt.show()

In [None]:
#plot_corr(df)

### Trisurface Plot for Correlation Matrix

In [None]:
# Correlation matrix of the feature columns, kept in its own variable so the
# working dataframe `df` is NOT overwritten (the original rebound `df` to the
# correlation matrix, which broke any later or repeated use of `df`).
corr_values = (
    df.loc[:, ~df.columns.isin(["GEOID", "GEO_ID", "NAME"])]
    .corr()
    .to_numpy(copy=True)
)

# Zero the diagonal so the self-correlations of 1.0 do not dominate the surface
np.fill_diagonal(corr_values, 0)

# (row index, column index, correlation) triples in row-major order,
# built without a Python-level double loop
row_idx, col_idx = np.indices(corr_values.shape)
plot_df = pd.DataFrame({
    0: row_idx.ravel(),
    1: col_idx.ravel(),
    2: corr_values.ravel(),
})

fig = plt.figure()
# Ask the figure for 3D axes: constructing Axes3D(fig) directly no longer
# attaches the axes to the figure in recent Matplotlib releases.
ax = fig.add_subplot(111, projection='3d')

# plotting 3D trisurface plot
ax.plot_trisurf(plot_df[0], plot_df[1], plot_df[2],
                cmap=cm.jet, linewidth=0.2)

plt.show()

## Cluster Tendency

##### Sources

https://pypi.org/project/pyclustertend/

https://pyclustertend.readthedocs.io/en/latest/

https://www.kaggle.com/lachhebo/hopkins-test

https://medium.com/@stevenzook_98922/regarding-the-hopkins-test-beware-of-the-implementation-you-use-as-the-value-returned-may-be-2f6db7849987

https://www.datanovia.com/en/lessons/assessing-clustering-tendency/

https://stats.stackexchange.com/questions/332651/validating-cluster-tendency-using-hopkins-statistic

Before applying any clustering method on the data, it’s important to evaluate whether the data sets contains meaningful clusters (i.e.: non-random structures) or not. This process is defined as the assessing of clustering tendency or the feasibility of the clustering analysis.

In [None]:
# Scale the raw feature matrix with sklearn's `scale` for the cluster-tendency tests
X_scale = scale(X_raw)

### Hopkins Test

The Hopkins statistic is used to assess the clustering tendency of a data set by measuring the probability that a given data set is generated by a uniform data distribution. In other words, it tests the spatial randomness of the data.

Three different results are possible: 1) H close to 0.5: the data set reveals no clustering structure; 2) H close to 0: significant evidence that the data might be cluster-able; 3) H close to 1: the test is indecisive (the data are neither clustered nor random). Note that implementations differ in how they define the statistic (see the sources above), so check which convention the library in use follows.

In [None]:
#hopkins(X_scale, X_scale.shape[0])

### VAT

VAT (visual assessment of tendency) is an algorithm which creates a visualisation of a specific dataset, which can be useful to obtain an insight on the number of clusters and cluster hierarchy. 

In [None]:
#vat(X_scale)

In [None]:
# The ivat algorithm is an improved version of the vat algorithm which produces more precise images at a heavier computing cost
#ivat(X_scale)

# Dimensionality Reduction

## PCA

##### Sources

https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60

https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html#Choosing-the-number-of-components

PCA works by using orthogonal transformations to convert correlated features into a set of values of linearly uncorrelated features. What is left are features that contain the largest possible variance. One of the most important applications of PCA is for speeding up machine learning algorithms. 

Although PCA might be successful in reducing the dimensionality of the data, it does not seem to visualize clusters very intuitively. This happens often with high dimensional data because it is typically clustered around the same point and PCA extracts that information.

### Inspect the Coordinates of the Right-Singular Vectors

In [None]:
# Report the dimensions of the standardized data matrix
print("X:", X.shape)

# s is the smaller of the two dimensions of X
s = min(X.shape)
print("s = min({}, {}) == {}".format(*X.shape, s))

In [None]:
# Reduced singular value decomposition of X (full_matrices=False).
# The shapes of the three factors are printed below.
U, Sigma, VT = np.linalg.svd(X, full_matrices=False)

print("U:", U.shape)
print("Sigma:", Sigma.shape)
print("VT:", VT.shape)

In [None]:
# Inspect the coordinates of the top two (k_approx, below) right-singular vectors
m, d = X.shape
k_approx = 2
assert k_approx <= s

# Plot the components of the first k_approx singular vectors, one panel each
fig, axs = plt.subplots(1, k_approx, sharex=True, sharey=True,
                        figsize=(10*k_approx, 10))
# plt.subplots returns a bare Axes (not an array) when k_approx == 1
axs = np.atleast_1d(axs)
for k in range(k_approx):
    # Row k of VT has one coordinate per feature, i.e. d entries. Using
    # min(m, d) for the x-axis only matched when there were at least as
    # many samples as features.
    axs[k].scatter(np.arange(d), VT[k, :])

### 2D Scatter Plot

In [None]:
# Two-component PCA of the standardized features
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Share of the total variance carried by each component, and by both together
explained_ratio = pca.explained_variance_ratio_
print('Explained variation per principal component: {} \n'.format(explained_ratio))
print('Cumulative explained variation of the principal components: {}'.format(explained_ratio.sum()))

In [None]:
# Scatter the data projected onto the first two principal components
df_pca = pd.DataFrame(X_pca, columns=['component_1', 'component_2'])

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(df_pca["component_1"], df_pca["component_2"])

# Equal scaling on both axes so distances are not visually distorted
ax.axis('square')

## t-SNE
### (with Prior Dimensionality Reduction)

##### Sources

https://towardsdatascience.com/visualising-high-dimensional-datasets-using-pca-and-t-sne-in-python-8ef87e7915b

t-Distributed Stochastic Neighbor Embedding (t-SNE) is another technique for dimensionality reduction and is particularly well suited for the visualization of high-dimensional datasets. Contrary to PCA, it is not a mathematical technique but a probabilistic one.

t-SNE minimizes the divergence between two distributions: a distribution that measures pairwise similarities of the input objects and a distribution that measures pairwise similarities of the corresponding low-dimensional points in the embedding

Since t-SNE scales quadratically in the number of objects N, its applicability is limited to data sets with only a few thousand input objects; beyond that, learning becomes too slow to be practical (and the memory requirements become too large)

It is highly recommended to use another dimensionality reduction method (e.g. PCA for dense data or TruncatedSVD for sparse data) before using t-SNE to reduce the number of dimensions to a reasonable amount (e.g. 50) if the number of features is very high.

In [None]:
# Alternative (disabled): pass a float so scikit-learn keeps the minimum number
# of principal components that retains 95% of the variance.
#pca = PCA(.95, svd_solver='full') 

# Reduce to a fixed 50 components before running t-SNE
pca_n = PCA(n_components=50, svd_solver='full')
X_pca_n = pca_n.fit_transform(X)

# Fraction of the total variance retained by the kept components
print('Cumulative explained variation for the principal components: {}'.format(np.sum(pca_n.explained_variance_ratio_)))

In [None]:
time_start = time.time()

# t-SNE is stochastic. Fix the seed so the embedding, and therefore every
# clustering result computed from it below, is reproducible across runs.
tsne = TSNE(n_components=2, verbose=0, perplexity=40, n_iter=300, random_state=42)
X_pca_tsne = tsne.fit_transform(X_pca_n)

print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

# Wrap the 2-D embedding in a frame for plotting
df_pca_tsne = pd.DataFrame(data=X_pca_tsne, columns=['component_1', 'component_2'])

### 2D Scatter Plot

#### Version 1

In [None]:
# Seaborn scatter of the 2-D t-SNE embedding.
# NOTE(review): no `hue` variable is mapped, so `palette` and `legend`
# presumably have no visible effect here; confirm and drop them.
plt.figure(figsize=(10,10))
sns.scatterplot(
    x="component_1", y="component_2",
    palette=sns.color_palette("hls", 10),
    data=df_pca_tsne,
    legend="full",
    alpha=0.3,
)

#### Version 2

In [None]:
# Plain matplotlib scatter of the 2-D t-SNE embedding
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(df_pca_tsne["component_1"], df_pca_tsne["component_2"])

# Equal scaling on both axes
ax.axis('square')

# Clustering

### Choose the Appropriate Number of Components

By setting n_components=2, we are compacting all of the features into two dimensions. This value is convenient for visualization on a two-dimensional plot.

However, using only two components means that not all of the explained variance of the input data will be captured. Explained variance measures the discrepancy between the transformed data and the actual input data.

For the model, it will be important to conduct parameter tuning because it is a powerful method to maximize performance from clustering.

In [None]:
# Disabled experiment, kept as a string literal so it does not run: sweep the
# number of PCA components and record the k-means silhouette score for each.
# NOTE(review): the snippet reads `n_clusters`, which is only assigned in
# later cells; define it first if this is re-enabled.
'''
kmeans_kwargs = {
    "init": "random",
    "n_clusters": n_clusters,
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}

# Empty list to hold evaluation metrics
silhouette_scores = []
for n in range(2, 11):
    # This set the number of components for pca,
    # but leaves other steps unchanged
    pca_n = PCA(n_components=n)
    X_pca_n = pca_n.fit_transform(X)

    kmeans = KMeans(**kmeans_kwargs)
    kmeans.fit(X_pca_n)
    
    silhouette_coef = silhouette_score(
        X_pca_n,
        kmeans.labels_,
    )

    # Add metric to the appropriate list
    silhouette_scores.append(silhouette_coef)
'''

In [None]:
# Disabled companion plot (string literal, does not run): silhouette score as
# a function of the number of PCA components from the sweep above.
'''
plt.style.use("fivethirtyeight")
plt.figure(figsize=(6, 6))
plt.plot(
    range(2, 11),
    silhouette_scores,
    c="#008fd5")

plt.xlabel("n_components")
plt.title("Clustering Performance as a Function of n_components")
plt.tight_layout()
plt.show()
'''

In [None]:
# Alternative (disabled): cluster on a PCA projection with a chosen number of
# components instead of the t-SNE embedding.
#components = 2

# Rerun PCA or t-SNE with the optimal number of components selected
#pca = PCA(n_components=components)
#X_cluster = pca.fit_transform(X)

# All clustering below operates on the 2-D t-SNE embedding
X_cluster = X_pca_tsne

## K-Means

##### Sources

https://blog.floydhub.com/introduction-to-k-means-clustering-in-python-with-scikit-learn/

https://realpython.com/k-means-clustering-python/

##### Characteristics

*Parameters:* number of clusters

*Scalability:* Very large n_samples

*Usecase:* General-purpose, even cluster size, flat geometry, not too many clusters

*Geometry (metric used):* Distances between points

### Choose the Appropriate Number of Clusters

#### Elbow Method

In [None]:
# Settings shared by every k-means fit in the model-selection sweeps
kmeans_kwargs = dict(
    init="random",
    n_init=10,
    max_iter=300,
    random_state=42,
)

# Sum of squared errors (inertia) for each candidate k in 2..19
sse = []
for k in range(2, 20):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(X_cluster)
    sse.append(kmeans.inertia_)

In [None]:
# SSE against the number of clusters; the "elbow" marks diminishing returns
plt.style.use("fivethirtyeight")
candidate_ks = range(2, 20)
plt.plot(candidate_ks, sse)
plt.xticks(candidate_ks)
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

In [None]:
# Locate the elbow of the convex, decreasing SSE curve programmatically
kl = KneeLocator(
    range(2, 20), sse, curve="convex", direction="decreasing"
)

# NOTE(review): presumably `kl.elbow` is None when no knee is detected; confirm
# before using n_clusters downstream.
n_clusters = kl.elbow
n_clusters

#### Silhouette Score

The silhouette coefficient is a measure of cluster cohesion and separation (i.e., maximum class spread/variance). It quantifies how well a data point fits into its assigned cluster based on two factors: How close the data point is to other points in the cluster; How far away the data point is from points in other clusters. The higher the number the better.

In [None]:
# A list holds the silhouette coefficients for each k
silhouette_coefficients = []

# Notice you start at 2 clusters for silhouette coefficient
for k in range(2, 20):
    # Override n_clusters inside the kwargs instead of passing it next to
    # **kmeans_kwargs: a later cell rebinds kmeans_kwargs with an
    # "n_clusters" entry, and re-running this cell afterwards would raise
    # "got multiple values for keyword argument 'n_clusters'".
    kmeans = KMeans(**{**kmeans_kwargs, "n_clusters": k})
    kmeans.fit(X_cluster)
    score = silhouette_score(X_cluster, kmeans.labels_)
    silhouette_coefficients.append(score)

In [None]:
# Silhouette coefficient against the number of clusters (higher is better)
plt.style.use("fivethirtyeight")
candidate_ks = range(2, 20)
plt.plot(candidate_ks, silhouette_coefficients)
plt.xticks(candidate_ks)
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()

#### Calinski-Harabasz Score and Davies-Bouldin Score

Calinski-Harabasz Index: interclass-intraclass distance ratio; the higher the number the better

Davies-Bouldin Criteria: maximum interclass-intraclass distance ratio; the lower the number the better

In [None]:
# Calinski-Harabasz and Davies-Bouldin scores of k-means for k in 2..nclusters
nclusters = 20
lscores = []
for nc in range(2, nclusters + 1):
    km = KMeans(n_clusters=nc, n_init=10, random_state=0)
    labels = km.fit_predict(X_cluster)
    ch_value = calinski_harabasz_score(X_cluster, labels)
    db_value = davies_bouldin_score(X_cluster, labels)
    lscores.append((ch_value, db_value))

# Split the (CH, DB) pairs into one series per index
ch_series, db_series = zip(*lscores)
cluster_counts = range(2, nclusters + 1)

# Left panel: Calinski-Harabasz; middle panel: Davies-Bouldin
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(131)
ax.plot(cluster_counts, list(ch_series))
ax = fig.add_subplot(132)
ax.plot(cluster_counts, list(db_series))

plt.show()

#### The Gap Statistic

##### Sources

https://towardsdatascience.com/clustering-evaluation-strategies-98a4006fcfc

https://github.com/milesgranger/gap_statistic

https://github.com/milesgranger/gap_statistic/blob/master/Example.ipynb

https://anaconda.org/milesgranger/gap-statistic/notebook

A powerful statistical method to find the optimal number of clusters. Assess the number of clusters comparing a clustering with the expected distribution of data given the null hypothesis (no clusters) 

Computes the different clusterings of the data increasing the number of clusters and compares to clusters of data generated from a uniform distribution 

The within-class scatter matrix Sw is computed for both and compared. The correct number of clusters is where the widest gap appears between the Sw of the data and that of the uniform data.

In [None]:
# Create the gap-statistic "optimalK" object, using its 'rust' parallel backend
optimalK = OptimalK(parallel_backend='rust')
optimalK

In [None]:
# Call "optimalK" with the candidate cluster counts 1..19; the returned value
# is stored in n_clusters (overwriting the elbow result) and printed
n_clusters = optimalK(X_cluster, cluster_array=np.arange(1, 20))
print('Optimal clusters: ', n_clusters)

In [None]:
# optimalK.gap_df holds the gap value for each cluster count that was passed;
# show its first rows
optimalK.gap_df.head()

In [None]:
# Gap value per cluster count, with the selected count highlighted in red
gap_df = optimalK.gap_df
selected = gap_df[gap_df.n_clusters == n_clusters]

plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(selected.n_clusters, selected.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()

#### Cross-Validation

The “right” number of clusters in a data set can also be determined by cross-validation. 

First, divide the given data set into m parts. Next, use m-1 parts to build a clustering model, and use the remaining part to test the quality of the clustering. For example, for each point in the test set, we can find the closest centroid. 

Consequently, we can use the sum of squared distances between all points in the test set and the closest centroids to measure how well the clustering model fits the test set. 

For any integer k > 0, we repeat this process m times to derive clusterings of k clusters by using each part in turn as the test set. 

The average of the quality measure is taken as the overall quality measure. We can then compare the overall quality measure with respect to different values of k, and find the number of clusters that best fits the data. 

### K-Means with Optimal K

init: Use "k-means++" instead of "random" to ensure centroids are initialized with some distance between them. In most cases, this will be an improvement over "random".

n_clusters: The optimal number of clusters that was found in the previous step

n_init: Increase the number of initializations to ensure a stable solution is found

max_iter: Increase the number of iterations per initialization to ensure that k-means will converge

In [None]:
# Select the number of clusters for the final model. This hard-coded value
# overrides whatever the elbow / gap-statistic cells stored in n_clusters.
n_clusters = 4

In [None]:
# Final k-means model: k-means++ seeding, more restarts and more iterations
# than the sweep configuration
kmeans_kwargs = dict(
    init="k-means++",
    n_clusters=n_clusters,
    n_init=50,
    max_iter=500,
    random_state=42,
)
kmeans = KMeans(**kmeans_kwargs)

# Fit on the clustering features
kmeans.fit(X_cluster)

In [None]:
# Number of iterations the fitted model reports (n_iter_)
kmeans.n_iter_

In [None]:
# Final centroid coordinates, in the space of X_cluster
kmeans.cluster_centers_

### Internal Criteria

##### Sources

https://www.cs.upc.edu/~bejar/URL/material/04-Validation.pdf

https://nbviewer.jupyter.org/github/bejar/AMLTNotebooks/blob/master/Notebooks/10ClusterValidation.ipynb

These indices do not require ground truth labels. They measure properties expected in a good clustering: compact groups and well-separated groups 

The indices are based on the model of the groups. We can use indices based on the attributes’ values measuring the properties of a good clustering. The indices are based on statistical properties of the attributes of the model: value distribution and distances distribution.

Recent studies (Arbelaitz et al., 2013) have exhaustively tested internal indices, and some perform significantly better than others. The study concludes that Silhouette, Davies-Bouldin and Calinski-Harabasz perform well in a wide range of situations.

#### SSE

Some of the indices correspond directly to the objective function being optimized

In [None]:
# SSE (inertia) of the fitted final model
kmeans.inertia_

#### Silhouette Index

Silhouette coefficient values range between -1 and 1. Larger numbers indicate that samples are closer to their clusters than they are to other clusters.

A score of 1 denotes the best meaning that the data point o is very compact within the cluster to which it belongs and far away from the other clusters. Values near 0 denote overlapping clusters.

The worst value is -1. When the silhouette coefficient value is negative, this means that, in expectation, o is closer to the objects in another cluster than to the objects in the same cluster as o. In many cases, this is a bad situation and should be avoided.

In the scikit-learn implementation of the silhouette coefficient, the average silhouette coefficient of all the samples is summarized into one score. 

In [None]:
# Mean silhouette coefficient of the final k-means labelling, to 2 decimals.
# The builtin round() works whether the score is a NumPy scalar or a plain
# Python float; the `.round` method exists only on NumPy scalars.
kmeans_silhouette = round(silhouette_score(X_cluster, kmeans.labels_), 2)

kmeans_silhouette

#### Calinski-Harabasz Index      

In [None]:
# Score the final k-means model. The global `labels` still holds the labelling
# from the last iteration of the earlier k-sweep (a different model), so use
# the fitted model's own labels.
calinski_harabasz_score(X_cluster, kmeans.labels_)

#### Davies-Bouldin Criteria

In [None]:
# Score the final k-means model. The global `labels` still holds the labelling
# from the last iteration of the earlier k-sweep (a different model), so use
# the fitted model's own labels.
davies_bouldin_score(X_cluster, kmeans.labels_)

## Other Clustering Algorithms

##### Sources

https://towardsdatascience.com/an-introduction-to-clustering-algorithms-in-python-123438574097

https://towardsdatascience.com/the-5-clustering-algorithms-data-scientists-need-to-know-a36d136ef68

https://machinelearningmastery.com/clustering-algorithms-with-python/

https://github.com/bejar/AMLTNotebooks/blob/master/Code/Validation/ValidationAuthors.py

https://hdbscan.readthedocs.io/en/latest/comparing_clustering_algorithms.html#getting-set-up

https://scikit-learn.org/stable/modules/clustering.html#clustering

### Mini-Batch KMeans

Mini-Batch K-Means is a modified version of k-means that makes updates to the cluster centroids using mini-batches of samples rather than the entire dataset, which can make it faster for large datasets, and perhaps more robust to statistical noise.

In [None]:
nclusters = 10

for nc in range(2, nclusters + 1):
    # define the model
    model = MiniBatchKMeans(n_clusters=nc)
    # fit the model
    model.fit(X_cluster)
    # assign a cluster to each example
    yhat = model.predict(X_cluster)
    # retrieve unique clusters (explicit np.* instead of pylab star-import names)
    clusters = np.unique(yhat)
    # create scatter plot for samples from each cluster
    for cluster in clusters:
        # get row indexes for samples with this cluster
        row_ix = np.where(yhat == cluster)
        # Plot the embedding the model was fitted on. The original plotted
        # the first two columns of X (standardized raw features), which are
        # unrelated to the coordinates that were clustered.
        plt.scatter(X_cluster[row_ix, 0], X_cluster[row_ix, 1])
    # label each figure so the nine plots can be told apart
    plt.title('MiniBatchKMeans: %d clusters' % nc)
    # show the plot
    plt.show()

### Spectral Clustering

##### Characteristics

*Parameters:* number of clusters

*Scalability:* Medium n_samples, small n_clusters

*Usecase:* Few clusters, even cluster size, non-flat geometry

*Geometry (metric used):* Graph distance (e.g., nearest-neighbor graph)

In [None]:
# Spectral clustering for 2..nclusters clusters, scored with three internal indices
nclusters = 10
lscores = []

for nc in range(2, nclusters + 1):
    spec = SpectralClustering(n_clusters=nc, affinity='nearest_neighbors',
                              n_neighbors=15, random_state=0)
    labels = spec.fit_predict(X_cluster)
    sil_value = silhouette_score(X_cluster, labels)
    ch_value = calinski_harabasz_score(X_cluster, labels)
    db_value = davies_bouldin_score(X_cluster, labels)
    lscores.append((sil_value, ch_value, db_value))

# One series per index, in the order silhouette, Calinski-Harabasz, Davies-Bouldin
cluster_counts = range(2, nclusters + 1)
fig = plt.figure(figsize=(15, 5))
for position, series in zip((131, 132, 133), zip(*lscores)):
    ax = fig.add_subplot(position)
    ax.plot(cluster_counts, list(series))

plt.show()

###  DBSCAN

##### Characteristics

*Parameters:* neighborhood size

*Scalability:* Very large n_samples, medium clusters

*Usecase:* Non-flat geometry, uneven cluster sizes

*Geometry (metric used):* Distances between nearest points

In [None]:
# Compute DBSCAN
db = DBSCAN(eps=0.2, min_samples=10)

# Fit the algorithm to the clustering features
db.fit(X_cluster)

# Boolean mask that is True for the samples DBSCAN reports as core samples
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

In [None]:
# Cluster count excludes the noise label -1; noise points are counted separately
n_clusters_ = len(set(labels) - {-1})
n_noise_ = int((labels == -1).sum())

In [None]:
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# Score on X_cluster, the data DBSCAN was fitted on (the original scored
# against X, a different feature space). silhouette_score needs at least two
# distinct labels, so skip it when DBSCAN produced fewer.
if len(set(labels)) > 1:
    print("Silhouette Coefficient: %0.3f"
          % silhouette_score(X_cluster, labels))
else:
    print("Silhouette Coefficient: undefined (fewer than 2 distinct labels)")

In [None]:
# Plot the DBSCAN result: one colour per label, black for noise (-1);
# core samples use large markers, non-core samples small ones.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]

for k, col in zip(unique_labels, colors):
    if k == -1:
        # Noise is drawn in black instead of its Spectral colour.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    # Core samples first (size 14), then the remaining members (size 6)
    for is_core, marker_size in ((True, 14), (False, 6)):
        xy = X_cluster[class_member_mask & (core_samples_mask == is_core)]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                 markeredgecolor='k', markersize=marker_size)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()


###  HDBSCAN

In [None]:
def plot_clusters(data, algorithm, args, kwds):
    """Cluster ``data`` with ``algorithm`` and scatter-plot the labelling.

    Parameters
    ----------
    data : numpy.ndarray
        Points to cluster; the first two columns are plotted.
    algorithm : class
        Clustering class instantiated as ``algorithm(*args, **kwds)``; the
        instance must provide ``fit_predict``.
    args : tuple
        Positional arguments for ``algorithm``.
    kwds : dict
        Keyword arguments for ``algorithm``.

    Notes
    -----
    Points with a negative label are drawn in black. Marker styling comes
    from the module-level ``plot_kwds``. The fit time is written onto the
    plot.
    """
    start_time = time.time()
    labels = algorithm(*args, **kwds).fit_predict(data)
    end_time = time.time()
    palette = sns.color_palette('deep', np.unique(labels).max() + 1)
    colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]
    plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds)
    frame = plt.gca()
    frame.axes.get_xaxis().set_visible(False)
    frame.axes.get_yaxis().set_visible(False)
    plt.title('Clusters found by {}'.format(str(algorithm.__name__)), fontsize=24)
    # Anchor the note in axes coordinates (top-left corner) so it is placed
    # consistently whatever the data range; the original used the fixed data
    # coordinates (-0.5, 0.7).
    plt.text(0.02, 0.98, 'Clustering took {:.2f} s'.format(end_time - start_time),
             fontsize=14, transform=frame.transAxes, verticalalignment='top')

In [None]:
# Run HDBSCAN on the embedding with min_cluster_size=1000 and min_samples=1, and plot the result
plot_clusters(X_cluster, hdbscan.HDBSCAN, (), {'min_cluster_size':1000, 'min_samples':1})

### OPTICS

##### Characteristics

*Parameters:* minimum cluster membership

*Scalability:* Very large n_samples, large n_clusters

*Usecase:* Non-flat geometry, uneven cluster sizes, variable cluster density

*Geometry (metric used):* Distances between points

OPTICS clustering (where OPTICS is short for Ordering Points To Identify the Clustering Structure) is a modified version of DBSCAN described above.

In [None]:
clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)

# Run the fit
clust.fit(X_cluster)

# DBSCAN-style labellings extracted from the same OPTICS ordering at two eps cuts
labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_, eps=0.5)
labels_200 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_, eps=2)

# Everything below is indexed against X_cluster, the data OPTICS was fitted
# on. The original mixed in X (the standardized raw features) in three
# places, which plotted coordinates unrelated to the clustering.
space = np.arange(len(X_cluster))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]

plt.figure(figsize=(10, 7))
G = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(G[0, :])
ax2 = plt.subplot(G[1, 0])
ax3 = plt.subplot(G[1, 1])
ax4 = plt.subplot(G[1, 2])

# Reachability plot (only the first five clusters are coloured; noise is black)
colors = ['g.', 'r.', 'b.', 'y.', 'c.']
for klass, color in zip(range(0, 5), colors):
    Xk = space[labels == klass]
    Rk = reachability[labels == klass]
    ax1.plot(Xk, Rk, color, alpha=0.3)
ax1.plot(space[labels == -1], reachability[labels == -1], 'k.', alpha=0.3)
# Horizontal guides at the two eps cuts used above
ax1.plot(space, np.full_like(space, 2., dtype=float), 'k-', alpha=0.5)
ax1.plot(space, np.full_like(space, 0.5, dtype=float), 'k-.', alpha=0.5)
ax1.set_ylabel('Reachability (epsilon distance)')
ax1.set_title('Reachability Plot')

# OPTICS
colors = ['g.', 'r.', 'b.', 'y.', 'c.']
for klass, color in zip(range(0, 5), colors):
    Xk = X_cluster[clust.labels_ == klass]
    ax2.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
# Fixed: both coordinates of the noise points now come from X_cluster
ax2.plot(X_cluster[clust.labels_ == -1, 0], X_cluster[clust.labels_ == -1, 1], 'k+', alpha=0.1)
ax2.set_title('Automatic Clustering\nOPTICS')

# DBSCAN at 0.5
colors = ['g', 'greenyellow', 'olive', 'r', 'b', 'c']
for klass, color in zip(range(0, 6), colors):
    Xk = X_cluster[labels_050 == klass]
    ax3.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3, marker='.')
ax3.plot(X_cluster[labels_050 == -1, 0], X_cluster[labels_050 == -1, 1], 'k+', alpha=0.1)
ax3.set_title('Clustering at 0.5 epsilon cut\nDBSCAN')

# DBSCAN at 2.
colors = ['g.', 'm.', 'y.', 'c.']
for klass, color in zip(range(0, 4), colors):
    # Fixed: cluster members are taken from X_cluster, not X
    Xk = X_cluster[labels_200 == klass]
    ax4.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
ax4.plot(X_cluster[labels_200 == -1, 0], X_cluster[labels_200 == -1, 1], 'k+', alpha=0.1)
ax4.set_title('Clustering at 2.0 epsilon cut\nDBSCAN')

plt.tight_layout()
plt.show()

### Agglomerative / Ward Hierarchical Clustering

##### Characteristics

*Parameters:* number of clusters or distance threshold, linkage type, distance

*Scalability:* Large n_samples and n_clusters

*Usecase:* Many clusters, possibly connectivity constraints, non Euclidean distances

*Geometry (metric used):* Any pairwise distance

In [None]:
# Single-linkage hierarchy over the embedding, drawn as a dendrogram.
# NOTE(review): the section heading mentions Ward, but the linkage used here
# is 'single'; confirm which one is intended.
Z = hierarchy.linkage(X_cluster, 'single')

plt.figure()
dn = hierarchy.dendrogram(Z)
plt.title('Hierarchical Clustering Dendrogram')

In [None]:
# Silhouette coefficient for every (linkage, cluster count) combination
nclusters = 10

linkage_options = ('ward', 'average', 'complete', 'single')
for linkage, nc in itertools.product(linkage_options, range(2, nclusters + 1)):
    clustering = AgglomerativeClustering(linkage=linkage, n_clusters=nc)
    clustering.fit(X_cluster)
    coefficient = silhouette_score(X_cluster, clustering.labels_)
    print('Linkage: %s' % linkage)
    print('Number of Clusters: %d' % nc)
    print("Silhouette Coefficient: %0.3f" % coefficient)
    print("\n")

### Birch

##### Characteristics

*Parameters:* branching factor, threshold, optional global clusterer

*Scalability:* Large n_clusters and n_samples. Birch does not scale very well to high dimensional data. As a rule of thumb if n_features is greater than twenty, it is generally better to use MiniBatchKMeans.

*Usecase:* Large dataset, outlier removal, data reduction

*Geometry (metric used):* Euclidean distance between points

In [None]:
# Birch for 2..nclusters clusters, with one scatter figure per setting
nclusters = 10
for nc in range(2, nclusters + 1):
    # build and fit the model on the embedding
    model = Birch(threshold=0.01, n_clusters=nc)
    model.fit(X_cluster)
    # cluster assignment of every sample
    yhat = model.predict(X_cluster)
    # the distinct cluster ids that were assigned
    clusters = np.unique(yhat)
    # one scatter call per cluster so each gets its own colour
    for cluster in clusters:
        # row indexes of the samples in this cluster
        row_ix = np.where(yhat == cluster)
        plt.scatter(X_cluster[row_ix, 0], X_cluster[row_ix, 1])
    # render the figure for this setting
    plt.show()

### NOT SCALABLE WITH N_SAMPLES

### Affinity propagation

##### Characteristics

*Parameters:* damping, sample preference

*Scalability:* Not scalable with n_samples

*Usecase:* Many clusters, uneven cluster size, non-flat geometry

*Geometry (metric used):* Graph distance (e.g. nearest-neighbor graph)

In [None]:
# Plot helper: endless colour sequence
from itertools import cycle

# Compute Affinity Propagation
af = AffinityPropagation(preference=-50).fit(X_cluster)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)

print('Estimated number of clusters: %d' % n_clusters_)
# Score on X_cluster, the data the model was fitted on; the original scored
# the labels against X, a different feature space.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X_cluster, labels, metric='sqeuclidean'))

# Plot result
plt.close('all')
plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    class_members = labels == k
    # The exemplar of cluster k is itself one of the samples
    cluster_center = X_cluster[cluster_centers_indices[k]]
    plt.plot(X_cluster[class_members, 0], X_cluster[class_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    # Connect every member to its exemplar
    for x in X_cluster[class_members]:
        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

### Mean-Shift

##### Characteristics

*Parameters:* bandwidth

*Scalability:* Not scalable with n_samples

*Usecase:* Many clusters, uneven cluster size, non-flat geometry

*Geometry (metric used):* Distances between points

In [None]:
# Estimate a bandwidth from the data, then run Mean-Shift with it.
bandwidth = estimate_bandwidth(X_cluster, quantile=0.2, n_samples=10000)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X_cluster)
labels, cluster_centers = ms.labels_, ms.cluster_centers_

# The number of clusters is the number of distinct labels that were assigned
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)

In [None]:
# Draw every Mean-Shift cluster in its own colour, with its centre on top.
plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    cluster_center = cluster_centers[k]
    my_members = labels == k
    # Member samples as small dots (first two columns only)
    member_points = X_cluster[my_members]
    plt.plot(member_points[:, 0], member_points[:, 1], col + '.')
    # Cluster centre as a large outlined marker
    plt.plot(cluster_center[0], cluster_center[1], 'o',
             markerfacecolor=col, markeredgecolor='k', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

### Gaussian Mixture

##### Sources

https://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_selection.html#sphx-glr-auto-examples-mixture-plot-gmm-selection-py

##### Characteristics

*Parameters:* many

*Scalability:* Not scalable

*Usecase:* Flat geometry, good for density estimation

*Geometry (metric used):* Euclidean distance between points

In [None]:
# Fit a diagonal-covariance Gaussian mixture for each component count and
# record (silhouette, Calinski-Harabasz, Davies-Bouldin) for the hard labels.
nclusters = 10
lscores = []

for nc in range(2, nclusters + 1):
    gmm = GaussianMixture(n_components=nc, covariance_type='diag')
    gmm.fit(X_cluster)
    labels = gmm.predict(X_cluster)
    scores = (
        silhouette_score(X_cluster, labels),
        calinski_harabasz_score(X_cluster, labels),
        davies_bouldin_score(X_cluster, labels),
    )
    lscores.append(scores)

# One panel per metric, left to right in the order they were recorded
fig = plt.figure(figsize=(15, 5))
for position, metric_values in zip((131, 132, 133), zip(*lscores)):
    ax = fig.add_subplot(position)
    ax.plot(range(2, nclusters + 1), list(metric_values))

plt.show()

In [None]:
# Model selection for Gaussian mixtures: fit every (covariance type, component
# count) pair on X_cluster and keep the fit with the lowest BIC.
lowest_bic = np.inf  # np.infty was removed in NumPy 2.0; np.inf works everywhere
bic = []
n_components_range = range(1, 10)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cv_type)
        gmm.fit(X_cluster)
        bic.append(gmm.bic(X_cluster))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm

# bic holds one consecutive run of len(n_components_range) scores per
# covariance type, in the order of cv_types.
bic = np.array(bic)
color_iter = itertools.cycle(['navy', 'turquoise', 'cornflowerblue',
                              'darkorange'])
clf = best_gmm
bars = []

# Plot the BIC scores: one group of bars per component count, one bar per
# covariance type (each type is shifted by 0.2 on the x axis).
plt.figure(figsize=(8, 6))
spl = plt.subplot(2, 1, 1)
for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
    xpos = np.array(n_components_range) + .2 * (i - 2)
    bars.append(plt.bar(xpos, bic[i * len(n_components_range):
                                  (i + 1) * len(n_components_range)],
                        width=.2, color=color))
plt.xticks(n_components_range)
plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
plt.title('BIC score per model')
# Put a '*' at the bar with the lowest BIC: the column comes from the position
# inside a run, the horizontal shift from which run (covariance type) it is in.
xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 +\
    .2 * np.floor(bic.argmin() / len(n_components_range))
plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
spl.set_xlabel('Number of components')
spl.legend([b[0] for b in bars], cv_types)

## Consensus Clustering

##### Sources

https://www.cs.upc.edu/~bejar/URL/material/06-Consensus.pdf

https://nbviewer.jupyter.org/github/bejar/URLNotebooks/blob/master/Notebooks/12ConsensusClustering.ipynb

https://learning.oreilly.com/library/view/hands-on-ensemble-learning/9781789612851/549666a5-7fe2-4ea9-867b-d5f8c640d28f.xhtml

https://github.com/PacktPublishing/Hands-On-Ensemble-Learning-with-Python/tree/master/Chapter08

### Simple Consensus Clustering

n_clusters = Number of clusters

n_clusters_base = Number of clusters to use for the base classifier

n_components = Number of components of the consensus

ncb_rand = Whether the number of clusters of each component is chosen randomly in the interval [2..n_clusters]

In [None]:
# Compare a single K-Means run with a simple consensus of several runs,
# both asked for the same number of clusters.
nc = 4
km = KMeans(n_clusters=nc)

cons = SimpleConsensusClustering(n_clusters=nc, n_clusters_base=4, n_components=10, ncb_rand=False)

lkm = km.fit_predict(X_cluster)
cons.fit(X_cluster)
lcons = cons.labels_

# Score each method on its OWN labels (not on a `labels` variable left over
# from an earlier cell).
print('K-M SS =', silhouette_score(X_cluster, lkm))
print('SCC SS =', silhouette_score(X_cluster, lcons))

### OpenEnsembles - Majority Vote

In [None]:
# To cluster on the original features instead, take the names from df's
# columns (excluding the identifier columns GEOID, GEO_ID and NAME).

# One generic name per column of the clustering matrix, so this also works
# when X_cluster has a number of components other than two.
feature_names = [f'component_{i}' for i in range(1, np.shape(X_cluster)[1] + 1)]

# Wrap the matrix in an OpenEnsembles data object and start an empty ensemble
cluster_data = oe.data(pd.DataFrame(X_cluster), feature_names)

ensemble = oe.cluster(cluster_data)

In [None]:
# --- SECTION 3 ---
# For every (K, ensemble size) pair: build a fresh ensemble of K-Means runs,
# combine them by majority vote and print the silhouette of the consensus.
for K, ensemble_size in itertools.product([2, 3, 4, 5, 6, 7], [3, 4, 5]):
    ensemble = oe.cluster(cluster_data)
    for member in range(ensemble_size):
        # Each base clustering needs its own name inside the ensemble
        name = f'kmeans_{ensemble_size}_{member}'
        ensemble.cluster('parent', 'kmeans', name, K)

    preds = ensemble.finish_majority_vote(threshold=0.5)
    print(f'K: {K}, size {ensemble_size}:', end=' ')
    consensus_labels = preds.labels['majority_vote']
    print('%.2f' % silhouette_score(X_cluster, consensus_labels))

### OpenEnsembles - Graph Closure

In [None]:
# To cluster on the original features instead, take the names from df's
# columns (excluding the identifier columns GEOID, GEO_ID and NAME).

# One generic name per column of the clustering matrix, so this also works
# when X_cluster has a number of components other than two.
feature_names = [f'component_{i}' for i in range(1, np.shape(X_cluster)[1] + 1)]

# Wrap the matrix in an OpenEnsembles data object and start an empty ensemble
cluster_data = oe.data(pd.DataFrame(X_cluster), feature_names)

ensemble = oe.cluster(cluster_data)

In [None]:
# For every (K, ensemble size) pair: build a fresh ensemble of K-Means runs,
# combine them by graph closure and print the silhouette of the consensus.
for K in [2, 3, 4, 5, 6, 7]:
    for ensemble_size in [3, 4, 5]:
        ensemble = oe.cluster(cluster_data)
        for i in range(ensemble_size):
            name = f'kmeans_{ensemble_size}_{i}'
            ensemble.cluster('parent', 'kmeans', name, K)

        preds = ensemble.finish_graph_closure(threshold=0.5)
        print(f'K: {K}, size {ensemble_size}:', end=' ')
        # The consensus labels are stored under the finishing method's own
        # name; 'majority_vote' only exists after finish_majority_vote().
        print('%.2f' % silhouette_score(X_cluster, preds.labels['graph_closure']))

### OpenEnsembles - Co-occurrence Linkage

In [None]:
# To cluster on the original features instead, take the names from df's
# columns (excluding the identifier columns GEOID, GEO_ID and NAME).

# One generic name per column of the clustering matrix, so this also works
# when X_cluster has a number of components other than two.
feature_names = [f'component_{i}' for i in range(1, np.shape(X_cluster)[1] + 1)]

# Wrap the matrix in an OpenEnsembles data object and start an empty ensemble
cluster_data = oe.data(pd.DataFrame(X_cluster), feature_names)

ensemble = oe.cluster(cluster_data)

In [None]:
# Create the ensembles and calculate the silhouette score.
# For every (K, ensemble size) pair: build a fresh ensemble of K-Means runs
# and combine them by co-occurrence linkage.
for K in [2, 3, 4, 5, 6, 7]:
    for ensemble_size in [3, 4, 5]:
        ensemble = oe.cluster(cluster_data)
        for i in range(ensemble_size):
            name = f'kmeans_{ensemble_size}_{i}'
            ensemble.cluster('parent', 'kmeans', name, K)

        preds = ensemble.finish_co_occ_linkage(threshold=0.5)
        print(f'K: {K}, size {ensemble_size}:', end=' ')
        # The consensus labels are stored under the finishing method's own
        # name; 'majority_vote' only exists after finish_majority_vote().
        print('%.2f' % silhouette_score(X_cluster, preds.labels['co_occ_linkage']))

# Evaluation of Chosen Model

##### Sources

https://towardsdatascience.com/cluster-analysis-create-visualize-and-interpret-customer-segments-474e55d00ebb

https://github.com/MaartenGr/CustomerSegmentation/blob/master/Customer%20Segmentation.ipynb

In [None]:
# Choose the algorithm that produced the best results.
# `kmeans` is presumably a fitted estimator created earlier in the notebook
# (TODO confirm); the cells below read its `labels_` attribute, and the 2D
# plot also reads `cluster_centers_`.
chosen_model = kmeans

## Visualizations

### 2D

In [None]:
# The dataset utilized for clustering may have used more than 2 components.
# So, take the original processed matrix "X" and project it onto 2 dimensions.
# Then, attach the cluster labels to the final outputted dataframe.
# NOTE(review): prepare_pca / prepare_tsne are defined elsewhere in the notebook;
# the plotting code below expects their result to have `x`, `y` and `labels` columns.

df_pca = prepare_pca(2, X, chosen_model.labels_)
df_tsne = prepare_tsne(2, X, chosen_model.labels_)

#### Version 1

In [None]:
def plot_2d_v1(df, model):
    """Scatter-plot a 2-D projection with one colour per cluster label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``x``, ``y`` and ``labels``.
    model : object
        Clustering estimator. If it exposes ``cluster_centers_``, the first
        two coordinates of every centre are overlaid as large red markers;
        estimators without that attribute are plotted without centres.
    """
    unique_labels = df.labels.unique()
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))

    for color, label in zip(colors, unique_labels):
        tempdf = df[df.labels == label]
        # Use `color=` rather than `c=`: a single RGBA row passed as `c` is
        # ambiguous and is read as four scalar values to colour-map when the
        # cluster happens to contain exactly four points.
        plt.scatter(tempdf.x, tempdf.y, color=color)

    centers = getattr(model, 'cluster_centers_', None)
    if centers is not None:
        # NOTE(review): the centres are in the space the model was fitted on;
        # they line up with the points only if `df` uses the same coordinates.
        plt.scatter(centers[:, 0], centers[:, 1], c='r', s=500, alpha=0.7)
    plt.grid(True)
    plt.show()

In [None]:
# Draw the 2-D t-SNE projection coloured by the chosen model's cluster labels
plot_2d_v1(df_tsne, chosen_model)

### 3D

In [None]:
# The dataset utilized for clustering may have more than 3 components.
# So, take the original processed matrix "X" and project it onto 3 dimensions.
# Then, attach the cluster labels to the final outputted dataframe.
# Note: this rebinds df_pca / df_tsne, replacing the 2-D versions built above.

df_pca = prepare_pca(3, X, chosen_model.labels_)
df_tsne = prepare_tsne(3, X, chosen_model.labels_)

#### Version 1

In [None]:
def plot_3d_v1(df, name='labels'):
    """Show an interactive Plotly 3-D scatter of ``df``.

    ``df`` must provide the columns ``x``, ``y`` and ``z``; the points are
    coloured by the column called ``name`` and drawn half-transparent with
    small markers.
    """
    (
        px.scatter_3d(df, x='x', y='y', z='z', color=name, opacity=0.5)
        .update_traces(marker={'size': 3})
        .show()
    )

In [None]:
# Interactive 3-D view of the t-SNE projection, coloured by the `labels` column
plot_3d_v1(df_tsne)

#### Version 2

In [None]:
def plot_3d_v2(df):
    """Draw a static matplotlib 3-D scatter coloured by cluster label.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Either a DataFrame with the columns ``x``, ``y``, ``z`` and ``labels``
        (the layout the other 3-D plotting helpers in this notebook read), or
        a 2-D array whose first four columns are x, y, z and the label.
    """
    if isinstance(df, pd.DataFrame):
        # Positional slicing such as df[:, 0] is not valid on a DataFrame,
        # so address the columns by name.
        xs, ys, zs, point_labels = df['x'], df['y'], df['z'], df['labels']
    else:
        data = np.asarray(df)
        xs, ys, zs, point_labels = data[:, 0], data[:, 1], data[:, 2], data[:, 3]

    fig = plt.figure(figsize=(10, 10))

    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(xs, ys, zs, depthshade=False, c=point_labels, s=100)

    plt.show()

In [None]:
# Static matplotlib 3-D view of the t-SNE projection
plot_3d_v2(df_tsne)

### 3D Animation

In [None]:
def plot_animation(df, name):
    """Show a 3-D scatter of ``df`` that rotates through 360 azimuth steps.

    ``df`` must provide the columns ``x``, ``y``, ``z`` and ``labels``.
    ``name`` is only referenced by the disabled GIF export below.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(df['x'], df['y'], df['z'],
               c=df["labels"], s=6, depthshade=True, cmap='Paired')
    # Fixed viewing window on the z and x axes
    ax.set_zlim(-15, 25)
    ax.set_xlim(-20, 20)
    plt.tight_layout()

    def rotate(azimuth):
        # Keep the elevation at 200 and turn the camera to the given azimuth
        ax.view_init(200, azimuth)

    frame_count = 360
    # Keep a reference so the animation object stays alive until plt.show()
    ani = animation.FuncAnimation(fig, rotate, frames=frame_count,
                                  blit=False, interval=50)
    # To export a GIF: ani.save('{}.gif'.format(name), writer='imagemagick')
    plt.show()

In [None]:
# Rotating 3-D view of the t-SNE projection; the second argument is a name for the plot
plot_animation(df_tsne, "chosen_model")

## Interpreting Clusters

Now that the clusters have been created, it would be nice to determine what makes each one unique. This will help with the understanding of the different observations. 

### Variance Within Variables and Between Clusters

One way to see the differences between the clusters is to take the average value of each variable within each cluster and visualize it.

The problem with this approach is that we simply have too many variables. Not all of them are likely to be important when creating the clusters. Instead, select the most important columns based on the following approach.

Group datapoints by cluster and take the average. Then, calculate the standard deviation between those values for each variable. Variables with a higher standard deviation indicate that there are large differences between clusters and the variable might be important.

In [None]:
# Go back to the original dataframe that has all the features,
# leaving out the identifier columns
feature_columns = df.columns[~df.columns.isin(["GEOID", "GEO_ID", "NAME"])]
df_temp = df[feature_columns].copy()

# Rescale every variable to the [0, 1] range so they can share one plot axis
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_temp), columns=df_temp.columns)

In [None]:
# Save the labels of the chosen model in both the raw and the scaled frame.
# The column is appended last; the Random Forest cell further down relies on
# that position when it splits df_temp with iloc.
df_temp["labels"] = chosen_model.labels_
df_scaled["labels"] = chosen_model.labels_

In [None]:
# Calculate variables with largest differences (by standard deviation).
# The higher the standard deviation of a variable's per-cluster averages,
# the more likely that the variable is important when creating the clusters.
# Rows labelled -1 are left out before averaging.
df_mean = df_scaled.loc[df_scaled.labels != -1, :].groupby('labels').mean().reset_index()

# Population standard deviation (ddof=0) of the cluster means, computed for
# all variables at once instead of growing a DataFrame one row at a time.
cluster_std = df_mean.drop(columns='labels').std(ddof=0)

results = pd.DataFrame({'Variable': list(cluster_std.index),
                        'Std': cluster_std.to_numpy()})

In [None]:
# How many of the highest-ranked variables to keep
num_cols = 7

# Names of the top variables (largest Std first), followed by the label column
top_variables = results.sort_values('Std', ascending=False).head(num_cols)
selected_columns = [*top_variables.Variable.values, "labels"]

In [None]:
# Long format: one row per (observation, variable) pair, keyed by cluster label
tidy = df_scaled[selected_columns].melt(id_vars='labels')

# Grouped bars: one group per cluster, one bar per selected variable
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(data=tidy, x='labels', y='value', hue='variable',
            palette='Set3', ax=ax)
ax.legend(loc='upper right')

#### Heatmap of Important Features

##### Sources

https://www.bigendiandata.com/2017-04-18-Jupyter_Customer360/

In [None]:
# Filter the scaled dataframe for the labels and important features
# (selected_columns lists the chosen variables followed by "labels")
df_scaled_sub = df_scaled[selected_columns]

In [None]:
# Split the dataframe into cluster groups.
# Then, compute the mean for all columns in every group.
# The result is indexed by cluster label, with the groups sorted by label.
df_grouped = df_scaled_sub.groupby(["labels"], sort=True).mean()

In [None]:
# Distinct cluster labels in ascending order (used as the heatmap's y axis)
labels = sorted(df_scaled_sub.labels.unique())

In [None]:
# Heatmap of the per-cluster means: one row per cluster label,
# one column per selected variable
data = [
    go.Heatmap(
        x=df_grouped.columns.tolist(),
        y=labels,
        z=df_grouped.values.tolist(),
        colorscale='Viridis',
    )
]

plotly.offline.iplot(data, filename='pandas-heatmap')

#### Density Plots of Important Features

##### Sources

https://radiant-rstats.github.io/docs/multivariate/kclus.html

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.density.html

In [None]:
# One KDE figure per selected feature, with one curve per cluster.
# The cluster labels are read from the data itself, so this works for any set
# of labels (not only ones that start at 0) and does not depend on a `labels`
# variable left over from another cell.
cluster_labels = sorted(df_scaled_sub["labels"].unique())

# The last entry of selected_columns is the label column itself, so skip it
for feature in selected_columns[:-1]:
    # One column per cluster, named after the label. The Series keep their
    # original row index, so the frame is padded with NaN where a row belongs
    # to another cluster.
    df_density = pd.DataFrame({
        str(label): df_scaled_sub.loc[df_scaled_sub["labels"] == label, feature]
        for label in cluster_labels
    })

    print("Density Plot for Feature: " + feature)
    df_density.plot.kde()

### Random Forest Feature Selection

Apply a Random Forest model with the cluster labels as the target variable. This method can help determine the features which are important in the generation of clusters.

This method requires a bit more work since it's important to check the accuracy of the Random Forest model to accurately extract important features. Note that this step has been skipped below.

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Target: the last column of df_temp, which is the "labels" column added above
y = df_temp.iloc[:,-1]
# Features: every other column (unscaled values).
# NOTE(review): this rebinds the global `X` that the projection cells pass to
# prepare_pca / prepare_tsne; re-running those cells afterwards would use this frame.
X = df_temp.iloc[:,:-1]

In [None]:
# Fit a forest that predicts the cluster label from the features
clf = RandomForestClassifier(n_estimators=100).fit(X, y)

# Pair every importance with its feature name in a Series (importances stay
# floats, names stay the index) instead of mixing both in one NumPy array,
# then keep the names of the num_cols most important features.
feature_importance = pd.Series(clf.feature_importances_, index=X.columns)
selected_columns_rf = (feature_importance
                       .sort_values(ascending=False)
                       .head(num_cols)
                       .index
                       .tolist())

In [None]:
# Long format of the forest-selected variables, keyed by cluster label
rf_plot_columns = selected_columns_rf + ['labels']
tidy = df_scaled[rf_plot_columns].melt(id_vars='labels')

# Grouped bars: one group per cluster, one bar per selected variable
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(data=tidy, x='labels', y='value', hue='variable',
            palette='Set3', ax=ax)
ax.legend(loc='upper right')