## Install the Kneed Package 

In [None]:
# if using google colab: 
!pip install kneed
# if using Jupyter notebook, open terminal (see demonstration video) and then:
pip install kneed

## Import required packages

In [None]:
import matplotlib.pyplot as plt  #needed to create data visualizations
import pandas as pd #needed to import data and manipulate dataframe
import numpy as np #needed to manipulate variables
from kneed import KneeLocator #needed to find knee in knee plot
import sklearn.cluster #needed to fit clustering models
from sklearn.cluster import KMeans #needed to use Kmeans clustering
from sklearn.metrics import silhouette_score #needed to generate metrics from kmeans
from sklearn.preprocessing import StandardScaler #needed to normalize features

## Convert variables to Z-scores (Z-transformation/Normalization Procedure)

In [None]:
# Columns that should drive the clustering analysis; replace with your own variable names.
# NOTE: assumes `df` already holds your dataset (it must be loaded before this cell runs).
clustering_columns = ["Col_1", "Col_2", "Col_3"]
features = df[clustering_columns]

In [None]:
# Create a scaler *instance*. The parentheses are required: without them `scaler` is the
# StandardScaler class itself and fit_transform() fails with a missing-argument TypeError.
scaler = StandardScaler()
# Learn each column's mean and standard deviation, then convert the selected variables to z-scores.
scaled_features = scaler.fit_transform(features)

## Set the parameters of your k-means clustering analysis

In [None]:
# Collect the k-means settings in one place so they are easy to read and adjust.
kmeans_settings = dict(
    n_clusters=3,      # k: number of clusters to form (change to whatever you need)
    init="random",     # pick the starting centroids at random
    n_init=10,         # number of random initializations to try; leave this at 10
    max_iter=300,      # upper limit on centroid-recalculation iterations per run
    random_state=42,   # any fixed number works; fixing it makes results reproducible
)
kmeans = KMeans(**kmeans_settings)

## Fit model and examine error

In [None]:
kmeans.fit(scaled_features)  # fit the k-means model to the normalized variables
kmeans.inertia_  # display the model's error: sum of squared distances of samples to their closest centroid

## Run For Loop of different K values (e.g., 1-10 clusters)

In [None]:
# Parameter dictionary for the elbow search: the same settings as the model above,
# except n_clusters, which the loop supplies for each candidate k.
kmeans_elbow = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}

sse = []  # error (sum of squared errors) of the fitted model for each candidate k

for k in range(1, 11):  # range(1, 11) yields k = 1 through 10
    # Use a separate name for the trial models so the fitted `kmeans` model from the
    # earlier cell is not overwritten by the last (k = 10) model of this loop.
    elbow_model = KMeans(n_clusters=k, **kmeans_elbow)
    elbow_model.fit(scaled_features)  # fit this k to the normalized variables
    sse.append(elbow_model.inertia_)  # record this model's error

## Plot the Elbow Method Plot (Number of Clusters by Error)

In [None]:
# Elbow plot: candidate number of clusters on the x-axis, model error (SSE) on the y-axis.
k_values = range(1, 11)  # the candidate cluster counts, 1 through 10
plt.plot(k_values, sse)
plt.xticks(k_values)  # one tick mark per candidate k
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

## Locate the Knee in your Plot

In [None]:
# The SSE curve falls as k grows and is convex, so tell KneeLocator to look for the
# elbow of a decreasing, convex curve.
candidate_ks = range(1, 11)
kl = KneeLocator(candidate_ks, sse, curve="convex", direction="decreasing")

kl.elbow  # show the elbow value (i.e., the suggested number of clusters)

## Create a Normalized Centroid Table

In [None]:
# Centroids of whichever fitted model `kmeans` currently refers to (one row per cluster,
# one column per variable, in normalized units).
centroids = kmeans.cluster_centers_
centroid_table = pd.DataFrame(
    centroids,  # table cells are the centroid values
    columns=["Col_1", "Col_2", "Col_3"],  # names of the variables included in the analysis
    # Build the row labels from the actual number of clusters so the index always
    # matches the model (a hard-coded list breaks whenever k changes).
    index=[f"Cluster_{i}" for i in range(len(centroids))],
)
centroid_table  # display table

## Create a De-Normalized Centroid Table

In [None]:
# Undo the z-score transformation so the centroids are expressed in the original units.
unscaled = scaler.inverse_transform(centroids)
unscaled_centroid_table = pd.DataFrame(
    unscaled,  # table cells are the de-normalized centroid values
    columns=["Col_1", "Col_2", "Col_3"],  # names of the variables included in the analysis
    # Row labels follow the actual number of clusters so they always match the model.
    index=[f"Cluster_{i}" for i in range(len(unscaled))],
)
unscaled_table = unscaled_centroid_table  # alias: the shorter name this cell used previously

unscaled_centroid_table  # display table

## Plot Normalized Centroids in Line Chart

In [None]:
# Line chart of the average (normalized) value of each variable per cluster.
# The names in `y` must match the centroid table's column names exactly.
centroid_table.plot(kind="line", y=["Col_1", "Col_2", "Col_3"])  # select which variables to show
plt.show()

## Plot De-Normalized Centroids in Bar Chart

In [None]:
# Bar chart of the de-normalized centroids (original units) for the chosen variables.
# The names in `y` must match the centroid table's column names exactly.
unscaled_centroid_table.plot(kind="bar", y=["Col_1", "Col_2"])  # select which variables to plot
plt.title("Descriptive Title")
plt.ylabel("Descriptive Y-axis Label")
plt.show()