In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas

from matplotlib import pyplot as plt

In [2]:
# Loading data
file_path = "Resources/new_iris_data.csv"
df_iris = pd.read_csv(file_path)

df_iris.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [3]:
# Standardize data with StandardScaler
iris_scaled=StandardScaler().fit_transform(df_iris)
print(iris_scaled[0:5])

[[-0.90068117 -1.3412724   1.03205722 -1.31297673]
 [-1.14301691 -1.3412724  -0.1249576  -1.31297673]
 [-1.38535265 -1.39813811  0.33784833 -1.31297673]
 [-1.50652052 -1.2844067   0.10644536 -1.31297673]
 [-1.02184904 -1.3412724   1.26346019 -1.31297673]]


In [4]:
iris_scaled[0:5]

array([[-0.90068117, -1.3412724 ,  1.03205722, -1.31297673],
       [-1.14301691, -1.3412724 , -0.1249576 , -1.31297673],
       [-1.38535265, -1.39813811,  0.33784833, -1.31297673],
       [-1.50652052, -1.2844067 ,  0.10644536, -1.31297673],
       [-1.02184904, -1.3412724 ,  1.26346019, -1.31297673]])

In [5]:
# Initialize PCA model
pca = PCA(n_components=2)  #reducing the features from 4 to 2

In [6]:
# After creating the PCA model, we apply dimensionality reduction on the scaled dataset:
# Get two principal components for the iris data.
iris_pca = pca.fit_transform(iris_scaled)

In [7]:
# Transform PCA data to a DataFrame

df_iris_pca=pd.DataFrame(
    data=iris_pca, columns=["principal component 1", "principal component 2"])
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2
0,-2.264542,0.505704
1,-2.086426,-0.655405
2,-2.36795,-0.318477
3,-2.304197,-0.575368
4,-2.388777,0.674767


In [8]:
# Fetch the explained variance

pca.explained_variance_ratio_

array([0.72770452, 0.23030523])

he first principal component contains 72.77% of the variance and the second contains 23.03%. Together, they contain 95.80% of the information.



In [10]:
# Find the best value for K
ssq = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_iris_pca)
    ssq.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": ssq}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

# # Plot
# plt.plot(list(sse.keys()), list(sse.values()))
# plt.xlabel('k')
# plt.ylabel('SSE')
# plt.title('Elbow Method')
# plt.show()

Use the principal components data with the K-means algorithm with a K value of 3. We could consider 2, but the direction shifts more after 3:

In [13]:
# Initialize the K-means model
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(df_iris_pca)

# Predict clusters
predictions = model.predict(df_iris_pca)

# Add the predicted class columns
df_iris_pca["class"] = model.labels_
df_iris_pca.head(10)

Unnamed: 0,principal component 1,principal component 2,class
0,-2.264542,0.505704,0
1,-2.086426,-0.655405,0
2,-2.36795,-0.318477,0
3,-2.304197,-0.575368,0
4,-2.388777,0.674767,0
5,-2.070537,1.518549,0
6,-2.445711,0.074563,0
7,-2.233842,0.247614,0
8,-2.341958,-1.095146,0
9,-2.188676,-0.448629,0


In [14]:
# Plotting the clusters with 2 features
df_iris_pca.hvplot.scatter(
    x="principal component 1",
    y="principal component 2",
    hover_cols=["class"],
    by="class",
)

In [15]:
# Initialize the K-means model
model = KMeans(n_clusters=5, random_state=0)

# Fit the model
model.fit(df_iris_pca)

# Predict clusters
predictions = model.predict(df_iris_pca)

# Add the predicted class columns
df_iris_pca["class"] = model.labels_
df_iris_pca.head(10)

Unnamed: 0,principal component 1,principal component 2,class
0,-2.264542,0.505704,3
1,-2.086426,-0.655405,0
2,-2.36795,-0.318477,0
3,-2.304197,-0.575368,0
4,-2.388777,0.674767,3
5,-2.070537,1.518549,3
6,-2.445711,0.074563,0
7,-2.233842,0.247614,0
8,-2.341958,-1.095146,0
9,-2.188676,-0.448629,0


In [16]:
# Plotting the clusters with 2 features
df_iris_pca.hvplot.scatter(
    x="principal component 1",
    y="principal component 2",
    hover_cols=["class"],
    by="class",
)