In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

Problem 1: KMEANS


In [None]:
# Load the k-means exercise data from the CSV stored next to the notebook.
data = pd.read_csv("./kMeansData.csv")
# Show (rows, columns) as the cell output to sanity-check the load.
data.shape


Normalization

In [None]:
# Preprocessing: min-max scale every column, then wrap the resulting array
# back into a DataFrame whose columns are named x1 and x2.
scalar = MinMaxScaler()
data = pd.DataFrame(scalar.fit_transform(data), columns=['x1', 'x2'])

In [None]:
# Scatter the normalized points before running the k-means algorithm.
plt.plot(data['x1'], data['x2'], 'o')
plt.show()

Step-by-step demonstration


In [None]:
# Step 1: draw 3 random starting centroids inside the bounding box of the data.
random.seed(123)
x1_low, x1_high = min(data.x1), max(data.x1)
x2_low, x2_high = min(data.x2), max(data.x2)
# All x1 coordinates are drawn first, then all x2 coordinates, so the random
# numbers are consumed column by column.
x1_draws = [random.uniform(x1_low, x1_high) for _ in range(3)]
x2_draws = [random.uniform(x2_low, x2_high) for _ in range(3)]
# 3 x 2 matrix: one row per centroid, one column per dimension (x1, x2).
centroids = np.c_[x1_draws, x2_draws]
centroids

In [None]:
from scipy.spatial import distance_matrix
# Step 2: distance from every point to every centroid.
dist = distance_matrix(data, centroids)
# dist has one row per training point and one column per centroid
# (150 x 3 according to the original notes; the row count depends on the CSV).

# Example: row 3, column 0 holds the distance between the 4th training point
# and the 1st centroid.
dist.shape

In [None]:
# Label each training point with the index of its closest centroid, i.e. the
# column that holds the smallest distance in that point's row.
labels = np.argmin(dist, axis=1)
labels

In [None]:
# Update step: remember the current centroids, then move each of the 3
# centroids to the mean of the points currently assigned to it.
old_centroids = centroids
centroids = np.array([
    data.iloc[np.flatnonzero(labels == k), 0:2].mean()
    for k in range(3)
])

In [None]:
# Convergence measure: the accumulated distance that the 3 centroids moved
# during the update step.
tolerance = sum(
    np.linalg.norm(old_centroids[k] - centroids[k]) for k in range(3)
)

tolerance

In [None]:
# based on the above step:
def calculate_cluster(X, centroids):
    """Assign every row of X to its nearest centroid.

    Parameters
    ----------
    X : array-like, one row per point.
    centroids : array-like, one row per centroid.

    Returns
    -------
    numpy.ndarray
        For each row of X, the row index (in `centroids`) of the centroid
        with the smallest distance to it.
    """
    return np.argmin(distance_matrix(X, centroids), axis=1)

In [None]:
def calculate_centroid(X, labels, centroids):
    """Move each centroid to the mean of its assigned points.

    Parameters
    ----------
    X : array-like (DataFrame or ndarray), one row per point.
    labels : array-like of int, the centroid index assigned to each row of X.
    centroids : array-like, the current centroids, one row per centroid.

    Returns
    -------
    (new_centroids, tolerance)
        new_centroids is a float array with the same shape as `centroids`;
        tolerance is the summed Euclidean distance each centroid moved.

    Notes
    -----
    The points are read from the `X` argument (not from a notebook global),
    and the number of clusters is taken from `centroids`. A centroid with no
    assigned points keeps its previous position: the mean of an empty
    selection is NaN, which would make the tolerance NaN and prevent any
    `tolerance < threshold` stopping test from ever succeeding.
    """
    points = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    old_centroids = np.asarray(centroids, dtype=float)

    # Start from a copy so the caller's array is never modified in place.
    new_centroids = old_centroids.copy()
    for k in range(len(old_centroids)):
        members = points[labels == k]
        if len(members) > 0:
            new_centroids[k] = members.mean(axis=0)

    # Row-wise norm = how far each centroid moved; the sum is the tolerance.
    tolerance = np.linalg.norm(new_centroids - old_centroids, axis=1).sum()
    return new_centroids, tolerance

In [None]:
# Train: start from fresh random centroids and alternate the assignment and
# update steps until the centroids (almost) stop moving.
MAX_EPOCHS = 1000
STOP_THRESHOLD = 0.0001  # early-stop point chosen for higher precision

random.seed(456)
x1_low, x1_high = min(data.x1), max(data.x1)
x2_low, x2_high = min(data.x2), max(data.x2)
# x1 coordinates are drawn first, then x2, matching the column-by-column
# order in which the random numbers are consumed.
centroids = np.c_[
    [random.uniform(x1_low, x1_high) for _ in range(3)],
    [random.uniform(x2_low, x2_high) for _ in range(3)]
]

for epoch in range(MAX_EPOCHS):
    labels = calculate_cluster(data, centroids)
    centroids, tolerance = calculate_centroid(data, labels, centroids)
    if tolerance < STOP_THRESHOLD:
        print(epoch, tolerance)
        break
else:
    # Reached only when the loop finished without hitting `break`: make the
    # missing convergence visible instead of ending silently.
    print("k-means did not converge within", MAX_EPOCHS,
          "epochs; last tolerance:", tolerance)

In [None]:
# Inspect the trained centroids and the cluster label assigned to each point.
centroids, labels

In [None]:
# Plot the result: one translucent scatter per cluster, centroids in red.
for cluster_id in range(3):
    member_rows = np.where(labels == cluster_id)[0]
    plt.plot(data.iloc[member_rows, 0], data.iloc[member_rows, 1], 'o', alpha=0.4)

plt.plot(centroids[:, 0], centroids[:, 1], 'ro')
plt.show()

Confirmation: cross-check against scikit-learn's KMeans

In [None]:
from sklearn.cluster import KMeans
# Cross-check with scikit-learn's implementation.
# random_state pins the random initialisation so a fresh run of the notebook
# reproduces the same centers; n_init is set explicitly so the number of
# restarts does not depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=123)
kmeans.fit(data)
print(kmeans.cluster_centers_)

In [None]:
# Plot scikit-learn's result: centroids in red first, then one translucent
# scatter per cluster label.
plt.plot(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], 'ro')
for cluster_id in range(3):
    members = data[kmeans.labels_ == cluster_id]
    plt.plot(members['x1'], members['x2'], 'o', alpha=0.4)
plt.show()

Problem 2: RBF Classification


In [None]:
rbf = pd.read_csv('./rbfClassification.csv')
# Normalization: min-max scale only the two feature columns, then append the
# class column as a third column of the new frame.
scalar = MinMaxScaler()
features_scaled = scalar.fit_transform(rbf[['x1', 'x2']])
rbfnorm = pd.DataFrame(
    np.column_stack([features_scaled, rbf['cls']]),
    columns=['x1', 'x2', 'cls'],
)

In [None]:
# Find 2 cluster centers in the normalized feature space; they are used below
# as the RBF centers. random_state and n_init are fixed so that the centers,
# and every number derived from them, are reproducible across runs and
# scikit-learn versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=123)
kmeans.fit(rbfnorm[['x1', 'x2']])

2-1: Cluster centers

In [None]:
# Report the coordinates of the cluster centers found by KMeans.
# Note: this rebinds the name `centroids`, which earlier held the Problem 1 result.
centroids = kmeans.cluster_centers_
centroids

2-2: RBF features and classifier

In [None]:
from scipy.spatial import distance_matrix
# r[i, j] is the distance from sample i to cluster center j.
r = distance_matrix(rbfnorm[['x1', 'x2']], centroids)
r.shape

In [None]:
# Hyper-parameter: ngamma is the (negative) coefficient applied to the distances.
ngamma = -0.5
# One feature per cluster center: exp(ngamma * distance).
# NOTE(review): r holds plain (unsquared) distances; a Gaussian RBF would use
# r**2 here — confirm which kernel the assignment intends.
phi = np.exp(ngamma*r)

In [None]:
# Build the design matrix: a leading column of ones (bias term) followed by
# the RBF features. The features are recomputed from r here so that
# re-running this cell does not keep prepending bias columns, and the row
# count comes from the data instead of a hard-coded 20.
rbf_features = np.exp(ngamma * r)
phi = np.c_[np.ones(len(rbf_features)), rbf_features]
phi

In [None]:
# Fit the weights: multiply the pseudo-inverse of the design matrix by the
# class labels.
targets = rbfnorm['cls'].to_numpy()
w = np.linalg.pinv(phi) @ targets
w

In [None]:
# Model output for every sample, kept as a column vector (one row per sample).
# These are the scores that get thresholded at 0.5 to obtain class labels.
pred_prob = np.dot(phi, w[:, np.newaxis])
pred_prob

In [None]:
# Apply the threshold: 1 when the score is at least 0.5, otherwise 0.
pred = [1 if score >= 0.5 else 0 for score in pred_prob]
pred

In [None]:
# Accuracy: fraction of samples whose thresholded prediction equals the label.
# Taking the mean avoids hard-coding the number of samples (previously 20).
np.mean(np.asarray(pred) == rbfnorm['cls'].to_numpy())

In [None]:
# Baseline for comparison: use the raw k-means cluster assignment (no kernel
# features) as the prediction.
# Cluster indices are arbitrary, so cluster 0 may correspond to class 1;
# score both possible labelings and report the better one.
# Assumes cls is coded 0/1 (as the 0.5 threshold used for the RBF model
# suggests) — confirm against the CSV.
cluster_ids = kmeans.predict(rbfnorm[['x1', 'x2']])
agreement = np.mean(cluster_ids == rbfnorm['cls'].to_numpy())
max(agreement, 1 - agreement)