In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas
import plotly.figure_factory as ff
from sklearn.cluster import KMeans

In [7]:
# Read the iris measurements from CSV and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine until it points at a project-relative location.
file_path = (
    "/Users/annettedblackburn/Desktop/Data_Analytics_Bootcamp/"
    "Module 18 - Unsupervised Machine Learning and Cryptocurrencies/"
    "new_iris_data.csv"
)
df_iris = pd.read_csv(file_path)
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [8]:
# Standardize every feature with StandardScaler so that no single measurement
# dominates the principal components because of its scale.
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(df_iris)

# Project the scaled features onto their first two principal components.
pca = PCA(n_components=2)
iris_pca = pca.fit_transform(iris_scaled)

# Wrap the component scores in a DataFrame for clustering and plotting.
pc_columns = ["principle component 1", "principle component 2"]
df_iris_pca = pd.DataFrame(iris_pca, columns=pc_columns)
df_iris_pca.head()

Unnamed: 0,principle component 1,principle component 2
0,-2.264542,0.505704
1,-2.086426,-0.655405
2,-2.36795,-0.318477
3,-2.304197,-0.575368
4,-2.388777,0.674767


In [9]:
# Fetch the explained variance ratio: the fraction of the total variance
# captured by each of the two fitted principal components.
pca.explained_variance_ratio_

array([0.72770452, 0.23030523])

In [10]:
# Elbow method: record the K-means inertia for each candidate cluster count.
k = list(range(1, 11))
inertia = []

for i in k:
    # fit() returns the fitted estimator, so construction and fitting chain.
    # A fixed random_state keeps the curve reproducible between runs.
    km = KMeans(n_clusters=i, random_state=0).fit(df_iris_pca)
    inertia.append(km.inertia_)

# Plot inertia against k; the bend in the curve suggests a cluster count.
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [11]:
# Cluster the PCA scores with K-means using three clusters.
#
# Select the component columns explicitly: this cell adds a "class" column to
# df_iris_pca below, so fitting on the whole frame would feed the previous
# labels back in as a feature whenever the cell is re-run.
pca_features = df_iris_pca[["principle component 1", "principle component 2"]]

# Initialize the K-means model (fixed random_state for reproducible labels)
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(pca_features)

# Predict clusters
predictions = model.predict(pca_features)

# Add the predicted class column
df_iris_pca["class"] = model.labels_
df_iris_pca.head()


Unnamed: 0,principle component 1,principle component 2,class
0,-2.264542,0.505704,0
1,-2.086426,-0.655405,0
2,-2.36795,-0.318477,0
3,-2.304197,-0.575368,0
4,-2.388777,0.674767,0


In [12]:
# Scatter the two principal components, one colour per K-means cluster.
# The title lets this figure be told apart from the other scatter plots in
# the notebook, which otherwise share identical axes and legend.
df_iris_pca.hvplot.scatter(
    x="principle component 1",
    y="principle component 2",
    hover_cols=["class"],
    by="class",
    title="K-means clusters (k=3)",
)

In [13]:
# Create the dendrogram from the two principal components only.
# df_iris_pca also carries a "class" column of cluster labels by this point;
# passing the whole frame would treat those labels as a third feature and
# distort the pairwise distances the dendrogram is built from.
pca_features = df_iris_pca[["principle component 1", "principle component 2"]]
fig = ff.create_dendrogram(pca_features, color_threshold=0)
fig.update_layout(width=800, height=500)
fig.show()

In [15]:
# Agglomerative clustering with 3 clusters.
# Fit on the principal-component columns only: df_iris_pca also holds a
# "class" column of earlier cluster labels, and including it would leak those
# labels into this model as an extra feature.
agg = AgglomerativeClustering(n_clusters=3)
model = agg.fit(
    df_iris_pca[["principle component 1", "principle component 2"]]
)

In [16]:
# Store the fitted model's cluster labels in the "class" column of
# df_iris_pca (overwriting any labels already there) and preview the result.
df_iris_pca["class"] = model.labels_
df_iris_pca.head()

Unnamed: 0,principle component 1,principle component 2,class
0,-2.264542,0.505704,0
1,-2.086426,-0.655405,0
2,-2.36795,-0.318477,0
3,-2.304197,-0.575368,0
4,-2.388777,0.674767,0


In [17]:
# Scatter the two principal components, one colour per agglomerative cluster.
# The title lets this figure be told apart from the other scatter plots in
# the notebook, which otherwise share identical axes and legend.
df_iris_pca.hvplot.scatter(
    x="principle component 1",
    y="principle component 2",
    hover_cols=["class"],
    by="class",
    title="Agglomerative clusters (n_clusters=3)",
)

In [18]:
# Try a different cluster count: agglomerative clustering with 7 clusters.
# Fit on the principal-component columns only: df_iris_pca also holds a
# "class" column of earlier cluster labels, and including it would leak those
# labels into this model as an extra feature.
agg = AgglomerativeClustering(n_clusters=7)
model = agg.fit(
    df_iris_pca[["principle component 1", "principle component 2"]]
)

In [19]:
# Store the fitted model's cluster labels in the "class" column of
# df_iris_pca (overwriting any labels already there) and preview the result.
df_iris_pca["class"] = model.labels_
df_iris_pca.head()

Unnamed: 0,principle component 1,principle component 2,class
0,-2.264542,0.505704,6
1,-2.086426,-0.655405,4
2,-2.36795,-0.318477,4
3,-2.304197,-0.575368,4
4,-2.388777,0.674767,6


In [20]:
# Scatter the two principal components, one colour per agglomerative cluster.
# The title lets this figure be told apart from the other scatter plots in
# the notebook, which otherwise share identical axes and legend.
df_iris_pca.hvplot.scatter(
    x="principle component 1",
    y="principle component 2",
    hover_cols=["class"],
    by="class",
    title="Agglomerative clusters (n_clusters=7)",
)