## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from numpy.random import normal
from numpy.random import uniform
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot_2samples
import plotly.express as px

## Data

In [6]:
# One row per point (Id, X, Y). Naming the columns directly means no
# transpose is needed after construction.
data = pd.DataFrame(np.array([[1, 13, 30],
                              [2, 15, 20],
                              [3, 17, 31],
                              [4, 20, 50],
                              [5, 22, 58],
                              [6, 24, 51]]),
                    columns=['Id', 'X', 'Y'])
display(data)

Unnamed: 0,Id,X,Y
0,1,13,30
1,2,15,20
2,3,17,31
3,4,20,50
4,5,22,58
5,6,24,51


## Find Centroids

In [44]:
# Given initial clusters: C1 = {P1, P2}, C2 = {P3, P4}, C3 = {P5, P6}.
# The centroid k_i of a cluster is the coordinate-wise mean of its points.

# C1 = {P1, P2}
k1x, k1y = sum((13, 15)) / 2, sum((30, 20)) / 2

# C2 = {P3, P4}
k2x, k2y = sum((17, 20)) / 2, sum((31, 50)) / 2

# C3 = {P5, P6}
k3x, k3y = sum((22, 24)) / 2, sum((58, 51)) / 2

## Find dist. between centroids and each point

In [31]:
def dist(kix, kiy, points=None):
    """Return the Euclidean distance from the centroid (kix, kiy) to every point.

    Parameters
    ----------
    kix, kiy : float
        X and Y coordinates of the centroid.
    points : pandas.DataFrame, optional
        Frame with 'X' and 'Y' columns, one row per point. When omitted, the
        notebook-level ``data`` frame is used.

    Returns
    -------
    list of float
        One distance per row of ``points``, in row order (empty list for an
        empty frame).
    """
    if points is None:
        # Looked up at call time so the function follows whatever ``data``
        # currently holds in the notebook.
        points = data

    # Iterate over every row instead of a hard-coded count of six points.
    xs = points['X'].to_numpy()
    ys = points['Y'].to_numpy()
    return [math.sqrt((x - kix)**2 + (y - kiy)**2) for x, y in zip(xs, ys)]


# Distances from each initial centroid to all six points.
K1 = dist(kix=k1x, kiy=k1y)
K2 = dist(kix=k2x, kiy=k2y)
K3 = dist(kix=k3x, kiy=k3y)

# Side-by-side table (one column per centroid, one row per point) so the
# nearest centroid for each point can be read off directly.
K = pd.DataFrame({'K1': K1, 'K2': K2, 'K3': K3})
display(K)

Unnamed: 0,K1,K2,K3
0,5.09902,11.85327,26.462237
1,5.09902,20.796634,35.415392
2,6.708204,9.617692,24.253866
3,25.70992,9.617692,5.408327
4,33.955854,17.846568,3.640055
5,27.856777,11.85327,3.640055


## Determine new clusters by assigning each point to the cluster it is closest to.

C1 = {P1, P2, P3} <br>
C2 = {} <br>
C3 = {P4, P5, P6}

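In this case cluster C2 is empty: P3 is closer to the C1 centroid and P4 is closer to the C3 centroid than either is to the C2 centroid.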

## Find the SSE for C1 and C3

We must first find the new centroids for C1 and C3

In [32]:
# Updated centroid of C1 = {P1, P2, P3}: coordinate-wise mean of its points.
c1x, c1y = sum((13, 15, 17)) / 3, sum((30, 20, 31)) / 3

# Updated centroid of C3 = {P4, P5, P6}.
c3x, c3y = sum((20, 22, 24)) / 3, sum((50, 58, 51)) / 3

In [43]:
# Distances from each updated centroid to all six points; only the members
# of the matching cluster are kept below.
C1temp, C3temp = dist(c1x, c1y), dist(c3x, c3y)

# C1 = {P1, P2, P3} -> rows 0-2; C3 = {P4, P5, P6} -> rows 3-5.
C1 = C1temp[:3]
C3 = C3temp[3:6]
display(C1)
display(C3)

# SSE of a cluster = sum of squared distances from its members to its centroid.
# Accumulated left to right so the floating-point result matches a plain
# a**2 + b**2 + c**2 expression.
SSEc1 = 0.0
for member_dist in C1:
    SSEc1 += member_dist**2
display(SSEc1)

SSEc3 = 0.0
for member_dist in C3:
    SSEc3 += member_dist**2
display(SSEc3)

[3.605551275463989, 7.0, 4.47213595499958]

[3.605551275463989, 5.0, 2.8284271247461903]

82.0

46.0

The SSE for C1 (82) is larger than the SSE for C3 (46), so we take one point from C1 and move it to C2 so that C2 is no longer empty; this also reduces the SSE of C1. We choose P2 because it is the point farthest from C1's centroid (distance 7, contributing 7² = 49 of the 82) and therefore contributes the most to C1's SSE. The new clusters are as follows:

C1 = {P1, P3} <br>
C2 = {P2} <br>
C3 = {P4, P5, P6}

Notice that the number of clusters is still 3 and there is no longer an empty cluster.