In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas
import plotly.figure_factory as ff

In [2]:
# hierarchical clustering

## starts with every data point as its own cluster, then repeatedly merges the two most similar clusters until a declared stopping point is reached

## 3 linkage methods:
##      ward -- the default. merges the pair of clusters that gives the smallest increase in within-cluster variance; tends to produce clusters of relatively equal size
##      average. merges the two clusters with the smallest average distance between all of their points
##      complete. merges the two clusters with the smallest maximum distance between their points


In [3]:
# dendrograms

## tree-shaped graph that keeps the data points along one axis and joins points/clusters as they are merged; the height of each join is the distance at which that merge happened

In [5]:
# Load the iris measurements used by the rest of the notebook.
# (The path is relative to the notebook's working directory.)
file_path = "Resources/new_iris_data.csv"

df_iris = pd.read_csv(file_path)

# Preview the first rows to confirm the columns were read as expected.
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [7]:
# PCA is driven by variance, so the features are standardized first to keep
# any single measurement from dominating because of its scale.
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(df_iris)

In [8]:
# Initialize the PCA model (nothing is fitted yet; this only configures it).

# n_components=2 keeps two principal components, reducing our 4 features to 2
pca = PCA(n_components=2)

In [9]:
# Fit the PCA model on the scaled iris data and project the data onto the
# two principal components in one step.
iris_pca = pca.fit_transform(iris_scaled)

In [10]:
# Wrap the PCA output in a DataFrame with one named column per component.
pc_columns = ['principal_component_1', 'principal_component_2']
df_iris_pca = pd.DataFrame(iris_pca, columns=pc_columns)

# Preview the transformed data.
df_iris_pca.head()

Unnamed: 0,principal_component_1,principal_component_2
0,-2.264542,0.505704
1,-2.086426,-0.655405
2,-2.36795,-0.318477
3,-2.304197,-0.575368
4,-2.388777,0.674767


In [12]:
# Create the dendrogram from the two principal-component columns only.
# Selecting the columns explicitly keeps this cell safe to re-run after the
# 'class' label column is added to df_iris_pca later in the notebook;
# otherwise the cluster labels would be treated as a third feature.
# Note: plotly's create_dendrogram uses complete linkage unless a custom
# linkagefun is supplied.
pca_features = df_iris_pca[['principal_component_1', 'principal_component_2']]
fig = ff.create_dendrogram(pca_features, color_threshold=0)
fig.update_layout(width=800, height=500)
fig.show()

In [13]:
# cutting the dendrogram at a height (distance) of about 5 will give us 3 clusters

In [14]:
# Fit agglomerative clustering with the 3 clusters read off the dendrogram.
#
# linkage="complete" is set explicitly so the model cuts the same tree the
# dendrogram shows: plotly's create_dendrogram defaults to complete linkage,
# whereas AgglomerativeClustering defaults to "ward" and would build a
# different hierarchy. NOTE(review): if ward clustering is what is wanted,
# switch the dendrogram's linkagefun to ward instead and re-read the cutoff.
#
# Only the two principal-component columns are used as features, so this cell
# can be re-run after the 'class' column has been added to df_iris_pca without
# feeding the previous labels back into the model.
agg = AgglomerativeClustering(n_clusters=3, linkage="complete")
model = agg.fit(df_iris_pca[['principal_component_1', 'principal_component_2']])

In [15]:
# Attach each sample's cluster label from the fitted model as a 'class' column.
# The name df_iris_pca is kept because the plotting cell below reads it.
df_iris_pca = df_iris_pca.assign(**{'class': model.labels_})

# Preview the labelled data.
df_iris_pca.head()

Unnamed: 0,principal_component_1,principal_component_2,class
0,-2.264542,0.505704,0
1,-2.086426,-0.655405,0
2,-2.36795,-0.318477,0
3,-2.304197,-0.575368,0
4,-2.388777,0.674767,0


In [16]:
# Scatter the samples in principal-component space, grouped by cluster label.
df_iris_pca.hvplot.scatter(
    x='principal_component_1',
    y='principal_component_2',
    hover_cols=['class'],
    by='class',
)

In [None]:
# k means versus hierarchical clustering

## k means needs the number of clusters chosen ahead of time, so you need to have an idea of how many clusters you're looking for; because it is randomly initialized, results can also vary between runs
## k means works best for spherical-looking data with similar-density points, closely grouped

## hierarchical clustering and dendrograms can let us see how many clusters we might want
## however, hierarchical clustering can be slow on larger datasets