### Benjamin Crom (Panther# 002-36-7349)
__Mini-Project 3: K-means Clustering and Breast Cancer__<br>
__CS 6980: Introduction to Data Science__<br>
__26 February 2018__

![title](assignment.png)

In [1]:
import collections

import matplotlib.pyplot
import numpy

![title](fig_a1.png)

In [2]:
def create_group_vector(matrix_X, center_matrix_mu):
    group_list = []
    for this_feature_vector in matrix_X:
        nearest_center_vector_index, min_distance = None, None
        for i, this_center_vector in enumerate(center_matrix_mu):
            this_distance = numpy.linalg.norm(this_feature_vector - this_center_vector)
            if not min_distance or this_distance < min_distance:
                min_distance = this_distance
                nearest_center_vector_index = i

        if nearest_center_vector_index is not None:
            group_list.append(nearest_center_vector_index)
        else:
            raise ValueError('No nearest center vector value')

    return numpy.array(group_list)


def generate_center_matrix_from_group_vector(matrix_X, num_clusters_k, group_vector):
    new_center_vector_list = []
    for i in range(num_clusters_k):
        group_feature_matrix = matrix_X[group_vector == i]
        group_centroid = group_feature_matrix.mean(axis=0)
        new_center_vector_list.append(group_centroid)

    center_matrix_mu = numpy.matrix(numpy.array(new_center_vector_list))
    return center_matrix_mu


def perform_k_means_clustering(matrix_X, num_clusters_k, center_matrix_mu, tolerance, max_iteration_limit):
    group_vector = create_group_vector(matrix_X, center_matrix_mu)

    old_center_matrix_mu = None
    change = tolerance + 1
    i = 0
    while change > tolerance and i < max_iteration_limit:
        old_center_matrix_mu = center_matrix_mu
        center_matrix_mu = generate_center_matrix_from_group_vector(matrix_X, num_clusters_k, group_vector)
        change = numpy.linalg.norm(old_center_matrix_mu - center_matrix_mu)
        group_vector = create_group_vector(matrix_X, center_matrix_mu)            
        i += 1

    return group_vector

![title](fig_b.png)

In [3]:
NUM_CLUSTERS = 2
TOLERANCE = 0.01
MAX_ITER = 1000

csv_array = numpy.genfromtxt('breast_data.csv', delimiter=',')
breast_data_matrix = numpy.matrix(csv_array)

group_vector = numpy.random.randint(NUM_CLUSTERS, size=len(breast_data_matrix))  # Random partition into k clusters
center_matrix_mu = generate_center_matrix_from_group_vector(breast_data_matrix, NUM_CLUSTERS, group_vector)
k_means_result = perform_k_means_clustering(breast_data_matrix, NUM_CLUSTERS, center_matrix_mu, TOLERANCE, MAX_ITER)
k_means_result

array([1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,

![title](fig_c.png)

In [4]:
csv_array = numpy.genfromtxt('breast_truth.csv', delimiter=',')
breast_truth_array = numpy.array(csv_array)
accuracy = round(
    100 * collections.Counter(k_means_result == breast_truth_array)[True] / len(breast_truth_array),
    2
)
print(f'Accuracy: {accuracy}%')

Accuracy: 85.41%


![title](fig_d.png)

In [17]:
for i in range(20):
    random_rows = numpy.random.randint(len(breast_data_matrix), size=NUM_CLUSTERS)  # Pick random feature vectors
    random_center_matrix = breast_data_matrix[random_rows,:]
    k_means_result = perform_k_means_clustering(breast_data_matrix, NUM_CLUSTERS, center_matrix_mu, TOLERANCE, MAX_ITER)
    accuracy = round(
        100 * collections.Counter(k_means_result == breast_truth_array)[True] / len(breast_truth_array),
        2
    )

    print(f'Choosing entries {random_rows[0]} and {random_rows[1]} as starting centers '
          f'yields an accuracy of {accuracy}%')
    
print('')
print('The results do not change no matter which feature vectors I use as my starting centers.')
print('This suggests that the dataset is convex and the only local optimum is also a global optimum.')

Choosing entries 218 and 561 as starting centers yields an accuracy of 85.41%
Choosing entries 148 and 373 as starting centers yields an accuracy of 85.41%
Choosing entries 356 and 342 as starting centers yields an accuracy of 85.41%
Choosing entries 508 and 296 as starting centers yields an accuracy of 85.41%
Choosing entries 338 and 255 as starting centers yields an accuracy of 85.41%
Choosing entries 489 and 271 as starting centers yields an accuracy of 85.41%
Choosing entries 21 and 248 as starting centers yields an accuracy of 85.41%
Choosing entries 185 and 552 as starting centers yields an accuracy of 85.41%
Choosing entries 470 and 510 as starting centers yields an accuracy of 85.41%
Choosing entries 459 and 234 as starting centers yields an accuracy of 85.41%
Choosing entries 163 and 393 as starting centers yields an accuracy of 85.41%
Choosing entries 15 and 102 as starting centers yields an accuracy of 85.41%
Choosing entries 498 and 58 as starting centers yields an accuracy

![title](fig_e.png)

![title](fig_f1.png)

In [11]:
generate_center_matrix_from_group_vector(breast_data_matrix, NUM_CLUSTERS, breast_truth_array)
k_means_result = perform_k_means_clustering(breast_data_matrix, NUM_CLUSTERS, center_matrix_mu, TOLERANCE, MAX_ITER)
accuracy = round(
    100 * collections.Counter(k_means_result == breast_truth_array)[True] / len(breast_truth_array),
    2
)
print(f'Accuracy: {accuracy}%')

Accuracy: 85.41%


![title](fig_g.png)