### Benjamin Crom (Panther# 002-36-7349)
__Mini-Project 3: K-means Clustering and Breast Cancer__<br>
__CS 6980: Introduction to Data Science__<br>
__26 February 2018__

![title](assignment.png)

In [1]:
import collections
import numpy
import scipy.io

![title](fig_a1.png)

In [2]:
def create_group_vector(matrix_X, center_matrix_mu):
    """Assign points to current centers"""
    group_list = []
    for this_feature_vector in matrix_X:
        nearest_center_vector_index = None
        min_distance = None
        for i, this_center_vector in enumerate(center_matrix_mu):
            this_distance = numpy.linalg.norm(
                this_feature_vector - this_center_vector
            )

            if not min_distance or this_distance < min_distance:
                min_distance = this_distance
                nearest_center_vector_index = i

        if nearest_center_vector_index is not None:
            group_list.append(nearest_center_vector_index)
        else:
            raise ValueError('No nearest center vector value')

    group_vector = numpy.array(group_list)
    return group_vector


def generate_center_matrix(matrix_X, num_clusters_k, group_vector):
    """Recalculate centers"""
    new_center_vector_list = []
    for i in range(num_clusters_k):
        group_feature_matrix = matrix_X[group_vector == i]
        group_centroid = group_feature_matrix.mean(axis=0)
        new_center_vector_list.append(group_centroid)

    center_vector_array = numpy.array(new_center_vector_list)
    center_matrix_mu = numpy.matrix(center_vector_array)
    return center_matrix_mu


def perform_k_means_clustering(matrix_X, num_clusters_k, center_matrix_mu,
                               tolerance, max_iteration_limit):
    group_vector = create_group_vector(matrix_X, center_matrix_mu)
    change = tolerance + 1
    old_center_matrix_mu = None
    i = 0
    while change > tolerance and i < max_iteration_limit:
        old_center_matrix_mu = center_matrix_mu
        center_matrix_mu = generate_center_matrix(matrix_X,
                                                  num_clusters_k,
                                                  group_vector)

        change = numpy.linalg.norm(old_center_matrix_mu - center_matrix_mu)
        group_vector = create_group_vector(matrix_X, center_matrix_mu)
        i += 1

    return group_vector

![title](fig_b.png)

In [3]:
NUM_CLUSTERS = 2
TOLERANCE = 0.01
MAX_ITER = 1000

csv_array = numpy.genfromtxt('breast_data.csv', delimiter=',')
breast_data_matrix = numpy.matrix(csv_array)

# Random partition into k clusters
group_vector = numpy.random.randint(NUM_CLUSTERS, size=len(breast_data_matrix))
center_matrix_mu = generate_center_matrix(breast_data_matrix,
                                          NUM_CLUSTERS,
                                          group_vector)

k_means_result = perform_k_means_clustering(breast_data_matrix,
                                            NUM_CLUSTERS,
                                            center_matrix_mu,
                                            TOLERANCE, MAX_ITER)

k_means_result

array([1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,

![title](fig_c.png)

In [4]:
def get_accuracy(k_means_result, breast_truth_array):
    comparison_vector = k_means_result == breast_truth_array
    comparison_counter = collections.Counter(comparison_vector)
    accuracy = comparison_counter[True] / len(breast_truth_array)
    if accuracy < 50.0:
        accuracy = 100 - accuracy  # Take complement if accuracy below 50%

    rounded_accuracy = round(accuracy, 2)
    return rounded_accuracy


csv_array = numpy.genfromtxt('breast_truth.csv', delimiter=',')
breast_truth_array = numpy.array(csv_array)

accuracy = get_accuracy(k_means_result, breast_truth_array)
print(f'Accuracy: {accuracy}%')

Accuracy: 99.15%


![title](fig_d.png)

In [5]:
for i in range(20):
    # Pick random feature vectors
    random_entries = numpy.random.randint(
        len(breast_data_matrix),
        size=NUM_CLUSTERS
    )

    random_center_matrix = breast_data_matrix[random_entries, :]
    k_means_result = perform_k_means_clustering(breast_data_matrix,
                                                NUM_CLUSTERS,
                                                center_matrix_mu,
                                                TOLERANCE,
                                                MAX_ITER)

    accuracy = get_accuracy(k_means_result, breast_truth_array)
    print(f'Choosing entries {random_entries[0]} and {random_entries[1]} '
          f'as starting centers yields an accuracy of {accuracy}%')

print('\nThe results do not change no matter which feature vectors I use as my '
      'starting centers.\nThis suggests that the only local optimum is also a '
      'global optimum.')

Choosing entries 68 and 22 as starting centers yields an accuracy of 99.15%
Choosing entries 396 and 373 as starting centers yields an accuracy of 99.15%
Choosing entries 350 and 320 as starting centers yields an accuracy of 99.15%
Choosing entries 210 and 299 as starting centers yields an accuracy of 99.15%
Choosing entries 149 and 262 as starting centers yields an accuracy of 99.15%
Choosing entries 394 and 331 as starting centers yields an accuracy of 99.15%
Choosing entries 154 and 537 as starting centers yields an accuracy of 99.15%
Choosing entries 452 and 195 as starting centers yields an accuracy of 99.15%
Choosing entries 254 and 567 as starting centers yields an accuracy of 99.15%
Choosing entries 38 and 141 as starting centers yields an accuracy of 99.15%
Choosing entries 329 and 155 as starting centers yields an accuracy of 99.15%
Choosing entries 7 and 89 as starting centers yields an accuracy of 99.15%
Choosing entries 498 and 165 as starting centers yields an accuracy of

![title](fig_e.png)

In [6]:
data = scipy.io.loadmat('mu_init.mat')
mu_init_array = data['mu_init'].transpose()
center_matrix_mu = numpy.matrix(mu_init_array)
k_means_result = perform_k_means_clustering(breast_data_matrix,
                                            NUM_CLUSTERS,
                                            center_matrix_mu,
                                            TOLERANCE, MAX_ITER)

accuracy = get_accuracy(k_means_result, breast_truth_array)
print(f'Accuracy: {accuracy}%')

Accuracy: 99.85%


![title](fig_f1.png)

In [None]:
center_matrix_mu = generate_center_matrix(breast_data_matrix,
                                          NUM_CLUSTERS,
                                          breast_truth_array)

k_means_result = perform_k_means_clustering(breast_data_matrix,
                                            NUM_CLUSTERS,
                                            center_matrix_mu,
                                            TOLERANCE, MAX_ITER)

accuracy = get_accuracy(k_means_result, breast_truth_array)
print(f'Accuracy: {accuracy}%')

Accuracy: 99.15%


![title](fig_g.png)

In [None]:
import cs6980_mini_project_2

# ======================== LOAD DATA ========================
X = numpy.genfromtxt('breast_data.csv', delimiter=',')
Y = numpy.genfromtxt('breast_truth.csv', delimiter=',')

# ======================= SPLIT DATA ========================
is_training = numpy.random.randint(0, 5, len(X)) > 0
is_test = numpy.invert(is_training)
Y_train = Y[is_training]
X_train = X[is_training]
Y_test = Y[is_test]
X_test = X[is_test]

# ==================== GRADIENT ASCENT ======================
eta = 0.0000001
tol = 0.0017
beta = numpy.array([0] * 30)
beta, l_beta = cs6980_mini_project_2.gradient_ascent(X_train,
                                                     Y_train,
                                                     beta,
                                                     eta,
                                                     tol)

# ========================== TEST ===========================
Y_hat = cs6980_mini_project_2.classify_logReg(X_test, beta)
accuracy = get_accuracy(Y_hat, Y_test)
print('Supervised Approach: Logistic regression yields an accuracy of '
      f'{accuracy}%\n\n'
      'Unsupervised Approach: I do not know of any unsupervised clustering\n'
      'algorithm which would achieve a higher accuracy.')

  for i in range(len(X))
