### Benjamin Crom (Panther# 002-36-7349)
__Mini-Project 3: K-means Clustering and Breast Cancer__<br>
__CS 6980: Introduction to Data Science__<br>
__26 February 2018__

![title](assignment.png)

In [1]:
import collections
import numpy
import scipy.io

![title](fig_a1.png)

In [2]:
def create_group_vector(matrix_X, center_matrix_mu):
    """Assign points to current centers

    Each row of ``matrix_X`` is compared against every row of ``center_matrix_mu``
    using ``numpy.linalg.norm`` of their difference, and the index of the closest
    center is recorded. Ties keep the lowest center index.

    Args:
        matrix_X: Iterable of feature vectors (rows), e.g. a numpy.matrix or 2-D array.
        center_matrix_mu: Iterable of center vectors (rows) of matching width.

    Returns:
        numpy.ndarray holding, for each row of ``matrix_X``, the index of its nearest center.

    Raises:
        ValueError: If ``matrix_X`` has rows but no center could be selected
            (for example ``center_matrix_mu`` is empty).
    """
    group_list = []
    for this_feature_vector in matrix_X:
        nearest_center_vector_index = None
        min_distance = None
        for i, this_center_vector in enumerate(center_matrix_mu):
            this_distance = numpy.linalg.norm(this_feature_vector - this_center_vector)
            # Test for None explicitly: a genuine distance of 0.0 is falsy, and treating it
            # as "unset" would let a later, farther center overwrite an exact match.
            if min_distance is None or this_distance < min_distance:
                min_distance = this_distance
                nearest_center_vector_index = i

        if nearest_center_vector_index is None:
            raise ValueError('No nearest center vector value')
        group_list.append(nearest_center_vector_index)

    return numpy.array(group_list)


def generate_center_matrix(matrix_X, num_clusters_k, group_vector):
    """Recalculate centers

    For every cluster index in ``range(num_clusters_k)`` the rows of ``matrix_X`` whose
    entry in ``group_vector`` equals that index are averaged column-wise.

    Args:
        matrix_X: numpy.matrix or 2-D numpy array with one feature vector per row.
        num_clusters_k: Number of clusters; centers are produced for indices 0..k-1.
        group_vector: Per-row cluster index (any array-like accepted by numpy.asarray).

    Returns:
        numpy.matrix with one centroid per row (k rows, one column per feature).
        A cluster with no members produces a row of NaN (numpy also emits a
        RuntimeWarning for the empty mean).
    """
    group_vector = numpy.asarray(group_vector)

    new_center_vector_list = []
    for i in range(num_clusters_k):
        group_feature_matrix = matrix_X[group_vector == i]
        group_centroid = group_feature_matrix.mean(axis=0)
        # Flatten to 1-D so the stacked result is explicitly (k, features). Stacking the
        # 1 x d numpy.matrix means directly yields a (k, 1, d) array, which numpy.matrix
        # only accepts by squeezing length-1 axes - that turns k x 1 into 1 x k when
        # there is a single feature column.
        new_center_vector_list.append(numpy.asarray(group_centroid).ravel())

    center_vector_array = numpy.array(new_center_vector_list)
    center_matrix_mu = numpy.matrix(center_vector_array)
    return center_matrix_mu


def perform_k_means_clustering(matrix_X, num_clusters_k, center_matrix_mu, tolerance, max_iteration_limit):
    """Alternate center updates and point assignments until the centers settle.

    Starting from ``center_matrix_mu``, points are assigned to their nearest center,
    the centers are recomputed from those assignments, and the points are reassigned.
    The loop ends once the norm of the change in centers is no longer above
    ``tolerance`` or ``max_iteration_limit`` passes have been made.

    Returns:
        The final assignment vector produced by ``create_group_vector``.
    """
    assignments = create_group_vector(matrix_X, center_matrix_mu)

    # Start above the tolerance so the loop condition holds on entry.
    center_shift = tolerance + 1
    passes_done = 0
    while center_shift > tolerance and passes_done < max_iteration_limit:
        previous_centers = center_matrix_mu
        center_matrix_mu = generate_center_matrix(matrix_X, num_clusters_k, assignments)
        center_shift = numpy.linalg.norm(previous_centers - center_matrix_mu)
        assignments = create_group_vector(matrix_X, center_matrix_mu)
        passes_done += 1

    return assignments

![title](fig_b.png)

In [3]:
NUM_CLUSTERS = 2
TOLERANCE = 0.01
MAX_ITER = 1000
RANDOM_SEED = 6980  # Arbitrary fixed value so the random draws repeat on Restart & Run All

# Seed numpy's global generator before any random draw in the notebook.
numpy.random.seed(RANDOM_SEED)

# Load the feature data; each CSV row becomes one row of the matrix.
csv_array = numpy.genfromtxt('breast_data.csv', delimiter=',')
breast_data_matrix = numpy.matrix(csv_array)

group_vector = numpy.random.randint(NUM_CLUSTERS, size=len(breast_data_matrix))  # Random partition into k clusters
# Initial centers are the centroids of the random partition.
center_matrix_mu = generate_center_matrix(breast_data_matrix, NUM_CLUSTERS, group_vector)

k_means_result = perform_k_means_clustering(breast_data_matrix, NUM_CLUSTERS, center_matrix_mu, TOLERANCE, MAX_ITER)
k_means_result

array([0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,

![title](fig_c.png)

In [4]:
def get_accuracy(k_means_result, breast_truth_array):
    """Return the percentage of entries that match the truth, rounded to 2 decimals.

    The element-wise comparison of the two arrays is tallied and the share of
    ``True`` results is expressed as a percentage of ``len(breast_truth_array)``.
    A score below 50% is reported as its complement (100 - score) - presumably
    because cluster numbering may be swapped relative to the truth labels.
    """
    match_tally = collections.Counter(k_means_result == breast_truth_array)
    percent_matched = round(100 * match_tally[True] / len(breast_truth_array), 2)

    # Report the complement when fewer than half of the entries agree.
    return percent_matched if percent_matched >= 50.0 else 100 - percent_matched


# Load the ground-truth labels. Note this rebinds `csv_array`, which earlier held the feature data.
csv_array = numpy.genfromtxt('breast_truth.csv', delimiter=',')
breast_truth_array = numpy.array(csv_array)

# Score the clustering held in `k_means_result` (set by an earlier cell) against the truth labels.
accuracy = get_accuracy(k_means_result, breast_truth_array)
print(f'Accuracy: {accuracy}%')

Accuracy: 85.41%


![title](fig_d.png)

In [5]:
# Run k-means from 20 different random starts and record the accuracy of each run.
trial_accuracies = []
for i in range(20):
    # Pick random feature vectors; sampling without replacement keeps the starting centers distinct
    random_entries = numpy.random.choice(len(breast_data_matrix), size=NUM_CLUSTERS, replace=False)
    random_center_matrix = breast_data_matrix[random_entries, :]
    # Start from the freshly drawn centers (not the `center_matrix_mu` left over from an earlier cell).
    k_means_result = perform_k_means_clustering(breast_data_matrix, NUM_CLUSTERS, random_center_matrix, TOLERANCE,
                                                MAX_ITER)

    accuracy = get_accuracy(k_means_result, breast_truth_array)
    trial_accuracies.append(accuracy)
    print(f'Choosing entries {random_entries[0]} and {random_entries[1]} as starting centers '
          f'yields an accuracy of {accuracy}%')

# State the conclusion that the observed accuracies actually support.
print('')
if len(set(trial_accuracies)) == 1:
    print('The results do not change no matter which feature vectors I use as my starting centers.')
    print('This suggests that the only local optimum is also a global optimum.')
else:
    print(f'The results range from {min(trial_accuracies)}% to {max(trial_accuracies)}% '
          f'depending on which feature vectors I use as my starting centers.')
    print('This suggests that k-means can converge to different local optima on this data.')

Choosing entries 63 and 215 as starting centers yields an accuracy of 85.41%
Choosing entries 123 and 161 as starting centers yields an accuracy of 85.41%
Choosing entries 38 and 473 as starting centers yields an accuracy of 85.41%
Choosing entries 114 and 138 as starting centers yields an accuracy of 85.41%
Choosing entries 231 and 192 as starting centers yields an accuracy of 85.41%
Choosing entries 40 and 78 as starting centers yields an accuracy of 85.41%
Choosing entries 563 and 250 as starting centers yields an accuracy of 85.41%
Choosing entries 392 and 497 as starting centers yields an accuracy of 85.41%
Choosing entries 556 and 560 as starting centers yields an accuracy of 85.41%
Choosing entries 498 and 475 as starting centers yields an accuracy of 85.41%
Choosing entries 136 and 216 as starting centers yields an accuracy of 85.41%
Choosing entries 378 and 300 as starting centers yields an accuracy of 85.41%
Choosing entries 152 and 459 as starting centers yields an accuracy 

![title](fig_e.png)

In [6]:
# Load the provided initial centers from the MATLAB file and read its 'mu_init' variable.
data = scipy.io.loadmat('mu_init.mat')
# Transposed before use - presumably the file stores one center per column; TODO confirm.
mu_init_array = data['mu_init'].transpose()
center_matrix_mu = numpy.matrix(mu_init_array)
# Run k-means from these centers and score the result against the truth labels.
k_means_result = perform_k_means_clustering(breast_data_matrix, NUM_CLUSTERS, center_matrix_mu, TOLERANCE, MAX_ITER)
accuracy = get_accuracy(k_means_result, breast_truth_array)
print(f'Accuracy: {accuracy}%')

Accuracy: 85.41%


![title](fig_f1.png)

In [7]:
# Use the ground-truth labels as the group assignment, so the starting centers are the
# centroids of the true classes. Assumes the labels take the values 0..NUM_CLUSTERS-1,
# since generate_center_matrix selects rows with `group_vector == i` - TODO confirm.
center_matrix_mu = generate_center_matrix(breast_data_matrix, NUM_CLUSTERS, breast_truth_array)
k_means_result = perform_k_means_clustering(breast_data_matrix, NUM_CLUSTERS, center_matrix_mu, TOLERANCE, MAX_ITER)
accuracy = get_accuracy(k_means_result, breast_truth_array)
print(f'Accuracy: {accuracy}%')

Accuracy: 85.41%


![title](fig_g.png)

In [8]:
import cs6980_mini_project_2

# ======================== LOAD DATA ========================
# Features and labels are re-read from the same CSV files as plain numpy arrays.
X = numpy.genfromtxt('breast_data.csv', delimiter=',')
Y = numpy.genfromtxt('breast_truth.csv', delimiter=',')

# ======================= SPLIT DATA ========================
# Each row draws an integer from 0..4; rows that draw 0 become test rows, the rest training
# rows (about 80% training in expectation). The draw is random, so the split differs per run
# unless numpy's generator has been seeded beforehand.
is_training = numpy.random.randint(0, 5, len(X)) > 0
is_test = numpy.invert(is_training)
Y_train = Y[is_training]
X_train = X[is_training]
Y_test = Y[is_test]
X_test = X[is_test]

# ==================== GRADIENT ASCENT ======================
# Step size and stopping tolerance passed to gradient_ascent (defined in
# cs6980_mini_project_2, not visible here).
eta = 0.0000001
tol = 0.0017
# Initial coefficients: 30 integer zeros. Presumably 30 is the number of feature
# columns in X - TODO confirm against X.shape[1].
beta = numpy.array([0] * 30)
beta, l_beta = cs6980_mini_project_2.gradient_ascent(X_train, Y_train, beta, eta, tol)

# ========================== TEST ===========================
Y_hat = cs6980_mini_project_2.classify_logReg(X_test, beta)
# NOTE(review): get_accuracy reports the complement of any score below 50%. For a
# supervised classifier that would hide a worse-than-chance result - confirm this is intended.
accuracy = get_accuracy(Y_hat, Y_test)
print(f'Supervised Approach: Logistic regression yields an accuracy of {accuracy}%')
print('')
print('Unsupervised Approach: I do not know of any unsupervised clustering algorithm '
      'which would achieve a higher accuracy.')

Supervised Approach: Logistic regression yields an accuracy of 92.37%

Unsupervised Approach: I do not know of any unsupervised clustering algorithm which would achieve a higher accuracy.
