[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/allegheny-college-cmpsc-105-spring-2025/site/blob/main/code/data_clustering_activity.ipynb)

# Data Clustering Activity

Scatter plots visualize relationships between two variables, but we can do more with the visualization. What if two variables are not necessarily correlated, but they form clusters?

![](https://miro.medium.com/v2/resize:fit:1400/1*-vVYAADrOKB0_Y0T381_Gw.png)

This teaching content has been adapted from the K-Means Clustering video (click the image to open the video):

[![K-Means Clustering](https://embed.filekitcdn.com/e/bLsrpyq139NahA7eLBZSXF/e5WBFjHKjAELum3VjebTr4)](https://www.youtube.com/embed/9Mmj8NMCqEQ?si=RtqknpxEIIcVLJ20&t=84 "K-Means")


![](https://miro.medium.com/v2/resize:fit:720/1*YYiQed4kj_EZ2qfg_imDWA.png)

![](https://mlforanalytics.com/wp-content/uploads/2018/04/euclidean-distance.jpg)

In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy

In [15]:
# TODO: paste the *raw* url of iris.csv between the empty quotes below; the file lives in
# this repo (https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/datasets/data/iris.csv)

# The first row of the file is skipped and our own column names are supplied instead.
df = pd.read_csv(
    '',
    names=[
        'sepal length (cm)',
        'sepal width (cm)',
        'petal length (cm)',
        'petal width (cm)',
        'species',
    ],
    skiprows=1,
    sep=',',
)

In [None]:
# Display the first few rows of the DataFrame.
# A cell only renders its LAST expression automatically, so the preview has to be
# shown explicitly; display() is available by default inside Jupyter/Colab kernels.
display(df.head())

# Display "all" (but middle rows may be skipped for large datasets)
df

In [None]:
# Check the number of rows and columns, i.e. shape


In [None]:
# Shorten the measurement column names so they are convenient to type (e.g. df.sl);
# the species column keeps its name.
short_names = {
    'sepal length (cm)': 'sl',
    'sepal width (cm)': 'sw',
    'petal length (cm)': 'pl',
    'petal width (cm)': 'pw',
}
df = df.rename(columns=short_names)

df

In [None]:
# Scatter sepal width against sepal length to see whether the two are related

fig, ax = plt.subplots()
ax.scatter(df['sl'], df['sw'])
ax.set_title('Relationship of sepal dimensions')
ax.set_xlabel('length (cm)')
ax.set_ylabel('width (cm)')
plt.show()


In [72]:
# One iteration of K-means, done by hand.
# Prior knowledge: the dataset contains three types of irises, so three clusters are used.

# Initial (sepal length, sepal width) guesses for where the cluster centers might be
c1, c2, c3 = (np.array(guess) for guess in ([5, 3.5], [5.5, 2.5], [7, 3]))

In [None]:
# Overlay the three seeded centroids (large red markers) on the sepal scatter plot

seeds = np.array([c1, c2, c3])  # one row per centroid: (length, width)

fig, ax = plt.subplots()
ax.set_title('Relationship of sepal dimensions - Seeded Centroids')
ax.scatter(df['sl'], df['sw'], label='data')
ax.scatter(seeds[:, 0], seeds[:, 1], marker='o', s=200, color='r', label='centroid guess')
ax.set_xlabel('length (cm)')
ax.set_ylabel('width (cm)')
ax.legend()
plt.show()

In [74]:
# Distance of EVERY data point to EVERY centroid, starting with the first centroid.
# Euclidean distance: square root of (difference in length)^2 + (difference in width)^2

df['dc1'] = ((df['sl'] - c1[0]).pow(2) + (df['sw'] - c1[1]).pow(2)).pow(0.5)

In [75]:
# Same Euclidean distance, now measured to the second and third centroids
df['dc2'] = ((df['sl'] - c2[0]).pow(2) + (df['sw'] - c2[1]).pow(2)).pow(0.5)
df['dc3'] = ((df['sl'] - c3[0]).pow(2) + (df['sw'] - c3[1]).pow(2)).pow(0.5)

In [76]:
# For every point (row), keep the smallest of its three centroid distances
df['smallest'] = df[['dc1', 'dc2', 'dc3']].min(axis=1)

In [None]:
# Check that .loc was used correctly: all rows, only the three distance columns

distance_columns = ['dc1', 'dc2', 'dc3']
df.loc[:, distance_columns]

In [None]:
# Smallest centroid distance for each data point
df['smallest']

In [79]:
# use numpy's argmin to also get the POSITION (0, 1, or 2) of the column that
# contained the smallest value: 0 -> dc1, 1 -> dc2, 2 -> dc3
#
# A DataFrame has no argmin method of its own, so the three distance columns are
# converted to a plain NumPy array first and argmin is called on that array directly.

df['cmin'] = df.loc[:, ['dc1', 'dc2', 'dc3']].to_numpy().argmin(axis=1)

In [None]:
# Cluster index (0, 1, or 2) assigned to each data point
df['cmin']

In [None]:
# use the cmin column to visualize which data got assigned to which cluster

plt.figure()
plt.title('Relationship of sepal dimensions - Cluster Assignment')

# Each layer carries its own label, so the legend stays correct even if layers
# are later added, removed, or reordered.
plt.scatter(df.sl[df.cmin == 0], df.sw[df.cmin == 0], label='cluster 1') # layer 1 for first cluster
plt.scatter(df.sl[df.cmin == 1], df.sw[df.cmin == 1], label='cluster 2') # layer 2 for second cluster
plt.scatter(df.sl[df.cmin == 2], df.sw[df.cmin == 2], label='cluster 3') # layer 3 for third cluster
plt.scatter([c1[0], c2[0], c3[0]], [c1[1], c2[1], c3[1]], marker='o', s=200, label='centroid guess') # layer 4 for the centroids

plt.xlabel('length (cm)')
plt.ylabel('width (cm)')

plt.legend()
plt.show()

In [None]:
# Check whether the centroid coordinates need to be adjusted:
# each centroid should move to the mean (length, width) of the points assigned to it.

in_c1 = df['cmin'] == 0
in_c2 = df['cmin'] == 1
in_c3 = df['cmin'] == 2

c1new = np.array([df['sl'][in_c1].mean(), df['sw'][in_c1].mean()])
c2new = np.array([df['sl'][in_c2].mean(), df['sw'][in_c2].mean()])
c3new = np.array([df['sl'][in_c3].mean(), df['sw'][in_c3].mean()])

# seeded centroid next to its recomputed position
for seeded, updated in ((c1, c1new), (c2, c2new), (c3, c3new)):
    print(seeded, updated)

In [None]:
# Report how far each seeded centroid is from the mean of its cluster (x and y separately)
for message, seeded, updated in (
    ("c1 is off by: ", c1, c1new),
    ("c2 is off by: ", c2, c2new),
    ("c3 is off by: ", c3, c3new),
):
    print(message, seeded - updated)

In [None]:
# if the centroids moved more than a very very very small amount, then we need to
# rerun the algorithm using the new centroids.
# this process repeats until the centroids move only a tiny amount, 0.0001, for example
# in both the x and y position

In [92]:
# use scipy to do the same thing
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.vq.kmeans.html

# kmeans returns a (codebook, distortion) tuple, not just the centroids:
#   codebook   - 2-D array with one row per centroid and one column per feature
#   distortion - mean distance between the observations and their nearest centroid
# Unpacking it makes `centroids` the array itself, so it can be indexed as
# centroids[row, column].
# The starting centroids are chosen at random; a fixed seed makes the result repeatable.
centroids, distortion = scipy.cluster.vq.kmeans(
    df.loc[:, ['sl', 'sw']].to_numpy(), k_or_guess=3, seed=0
)


In [None]:
# Inspect the k-means result computed from the two sepal measurements (sl, sw)
centroids

In [96]:
# let's see if the cluster coordinates change if four dimensions are used
# i.e. use sl, sw, pl, and pw to get centroids

# kmeans returns a (codebook, distortion) tuple; unpack it so `centroids4` is the
# 2-D codebook array (one row per centroid, one column per feature: sl, sw, pl, pw).
# The starting centroids are chosen at random; a fixed seed makes the result repeatable.
centroids4, distortion4 = scipy.cluster.vq.kmeans(
    df.loc[:, ['sl', 'sw', 'pl', 'pw']].to_numpy(), k_or_guess=3, seed=0
)

In [None]:
# Inspect the k-means result computed from all four measurements (sl, sw, pl, pw)
centroids4

In [None]:
# GOAL: visualize the difference in 2D
# This means figuring out which data belong to each centroid in centroids
# Then figure out which data belong to each centroid in centroids4
# Then plotting

# scipy's kmeans returns a (codebook, distortion) tuple, and a tuple cannot be
# indexed with [row, column]. Take the codebook (one row per centroid) whether or
# not `centroids` was already unpacked in the cell that computed it.
codebook = np.asarray(centroids[0] if isinstance(centroids, tuple) else centroids)

# Euclidean distance of every data point to each of the three centroids
df_centroids = pd.DataFrame()
df_centroids['dc1'] = ((df.sl - codebook[0,0]) ** 2 + (df.sw - codebook[0,1]) ** 2) ** 0.5
df_centroids['dc2'] = ((df.sl - codebook[1,0]) ** 2 + (df.sw - codebook[1,1]) ** 2) ** 0.5
df_centroids['dc3'] = ((df.sl - codebook[2,0]) ** 2 + (df.sw - codebook[2,1]) ** 2) ** 0.5

# position (0, 1, or 2) of the closest centroid for every data point;
# a DataFrame has no argmin method, so argmin is taken on the underlying array
df_centroids['cmin'] = df_centroids[['dc1', 'dc2', 'dc3']].to_numpy().argmin(axis=1)
df_centroids

In [None]:
# TODO: copy and modify the code above to figure out which data belong to which cluster when the centroids4 are used
# Hint: centroids4 was computed from four columns (sl, sw, pl, pw), so the distance
# to each centroid needs a squared-difference term for every one of those columns.

df_centroids4 = pd.DataFrame()


In [None]:
# Side-by-side comparison: clusters found with 2 variables (left) vs 4 variables (right)

# scipy's kmeans returns a (codebook, distortion) tuple, and a tuple cannot be
# indexed with [row, column]. Take the codebook (one row per centroid) whether or
# not `centroids` was already unpacked in the cell that computed it.
codebook = np.asarray(centroids[0] if isinstance(centroids, tuple) else centroids)

plt.figure(figsize=[12,4])

plt.subplot(1,2,1)
plt.title('K-Means results with 2 variables')
# each layer carries its own label so the legend cannot fall out of step with the layers
plt.scatter(df.sl[df_centroids.cmin == 0], df.sw[df_centroids.cmin == 0], label='cluster 1') # layer 1 for first cluster
plt.scatter(df.sl[df_centroids.cmin == 1], df.sw[df_centroids.cmin == 1], label='cluster 2') # layer 2 for second cluster
plt.scatter(df.sl[df_centroids.cmin == 2], df.sw[df_centroids.cmin == 2], label='cluster 3') # layer 3 for third cluster
plt.scatter(codebook[:, 0], codebook[:, 1], marker='o', s=200, label='centroids') # layer 4 for the centroids
plt.xlabel('length (cm)')
plt.ylabel('width (cm)')
plt.legend()

plt.subplot(1,2,2)
plt.title('K-Means results with 4 variables')
# TODO: layer 1 for first cluster
# TODO: layer 2 for second cluster
# TODO: layer 3 for third cluster
# TODO: layer 4 for centroids4
# TODO: xlabel
# TODO: ylabel
# TODO: legend

plt.show()

In [None]:
# plot the original classes

plt.figure()
plt.title('Relationship of sepal dimensions - Real Classes')

# The species column holds integer codes 0, 1, 2. The names below follow the
# class order of scikit-learn's iris dataset; they are used only as legend
# labels so the colors can be read without guessing.
species_names = ['setosa', 'versicolor', 'virginica']
for code, name in enumerate(species_names):
    is_species = df.species == code
    plt.scatter(df.sl[is_species], df.sw[is_species], label=name) # one layer per species

plt.xlabel('length (cm)')
plt.ylabel('width (cm)')

plt.legend()
plt.show()

In [None]:
# TODO: how might you figure out which method was better, based on the figures above?
# TODO: Post your idea to discord

## Discord Post

TODO: Don't forget to ask questions in Discord. One Discord post per week is required.

## Submit this completed activity to your activities repository