In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the input CSV into a DataFrame, then preview the first rows
# followed by the (rows, columns) shape.
data = pd.read_csv("../input/xclara/xclara.csv")
print(data.head(), data.shape, sep="\n")

In [None]:
# Convert the DataFrame to a plain NumPy array and draw every sample as a
# small black dot (column 0 on the x-axis, column 1 on the y-axis).
X = data.values
plt.scatter(x=X[:, 0], y=X[:, 1], s=1, c='black')
plt.show()

In [None]:
# choose the number of clusters and the initial cluster centers randomly
k = 3

# Use a dedicated, seeded generator so that Restart & Run All reproduces the
# same starting centers (and therefore the same clustering).
rng = np.random.RandomState(42)

# Start from k distinct data points instead of random integers in
# [0, max(X) - 20): sampled points always lie inside the data, so every
# center owns at least one sample on the first assignment step and no
# cluster starts out empty.
C = X[rng.choice(len(X), size=k, replace=False)].astype(float)
print(f"C:{C}")

In [None]:
# Overlay the initial cluster centers (large green stars) on top of the
# raw samples (small black dots).
plt.scatter(x=X[:, 0], y=X[:, 1], s=1, c='black')
plt.scatter(x=C[:, 0], y=C[:, 1], s=200, c='g', marker='*')
plt.show()

In [None]:
# Euclidean distance between points (a, b)
def euclidean_distance(a, b, axis=None):
    """Return the Euclidean (L2) norm of ``a - b``.

    Parameters
    ----------
    a, b : numpy.ndarray
        Arrays whose shapes are compatible for subtraction (broadcasting
        is applied by ``a - b``).
    axis : int or None, optional
        Axis along which to take the norm. With the default ``None`` the
        norm of the whole difference is returned as a single scalar
        (vector 2-norm for 1-D input, Frobenius norm for 2-D input), which
        is the original behaviour. Pass e.g. ``axis=1`` to get one
        distance per row.

    Returns
    -------
    float or numpy.ndarray
        A scalar when ``axis`` is ``None``; otherwise an array with
        ``axis`` reduced away.
    """
    return np.linalg.norm(a - b, axis=axis)

In [None]:
# k-means: alternate between (1) assigning every point to its nearest
# centroid and (2) moving every centroid to the mean of its assigned points,
# until the centroids stop moving or the iteration budget is exhausted.
max_iter = 300  # max iterations that kmeans is allowed to run
tol = 0.0001    # stop once the total centroid shift drops below this value

# Work on a float array so that means can be stored without truncation.
C = np.asarray(C, dtype=float)

# cluster class of each data point (integer labels in [0, k))
clusters = np.zeros(len(X), dtype=int)

# Named n_iter rather than iter so the builtin iter() is not shadowed for
# the rest of the notebook.
n_iter = 0
while True:

    # Distance of every point from every centroid in one broadcasted step:
    # (n, 1, d) - (1, k, d) -> (n, k, d), reduced over the feature axis.
    dists = np.linalg.norm(X[:, np.newaxis, :] - C[np.newaxis, :, :], axis=2)
    clusters = np.argmin(dists, axis=1)

    # Start from the current centroids so that a cluster with no members
    # keeps its position instead of becoming NaN (mean of an empty slice),
    # which would also make the convergence test below never succeed.
    C_new = C.copy()
    for c in range(k):
        members = X[clusters == c]
        if len(members) > 0:
            C_new[c] = members.mean(axis=0)

    n_iter += 1
    # Norm of the (k, d) difference matrix = overall centroid movement.
    if euclidean_distance(C, C_new) < tol or n_iter == max_iter:
        # C is intentionally left as-is: `clusters` was computed from it.
        break
    C = C_new

In [None]:
# The final cluster centers are held in C; print them.
print(f"Clusters = {C}")

# Draw the samples (small black dots) together with the final cluster
# centers (large blue stars).
plt.scatter(x=X[:, 0], y=X[:, 1], s=1, c='black')
plt.scatter(x=C[:, 0], y=C[:, 1], s=200, c='blue', marker='*')
plt.show()

In [None]:
# plot the clusters in different colors with the cluster centers
colors = ['r', 'g', 'b', 'y', 'c', 'm']
fig, ax = plt.subplots()

# plot the data points, one scatter call per cluster; the palette is cycled
# so that choosing k > len(colors) reuses colors instead of raising IndexError
for i in range(k):
    samples_ix = np.where(clusters == i)[0]
    ax.scatter(X[samples_ix, 0], X[samples_ix, 1], s=7, c=colors[i % len(colors)])

# plot the cluster centers
ax.scatter(C[:, 0], C[:, 1], marker='*', s=200, c='#050505')
plt.show()