### Name: Bram Otten

### Student ID: 10992456

### Group: F

Please fill in your name, student ID and group above, and also edit the filename according to the specified format.

In [None]:
import math
import matplotlib.pylab as plt
import numpy as np

from sklearn import datasets
from random import uniform

# Load the Iris dataset that ships with scikit-learn and keep its feature
# matrix (`iris.data`) under the name the rest of the notebook uses.
iris = datasets.load_iris()
X_matrix = iris.data

# K-means 


For this assignment the setup will be a little different from the previous weeks. Instead of incrementally writing functions to build the algorithm, it will be up to you to design the code of the algorithm from scratch. The first algorithm we will be covering is *K-means*. The pseudocode from Alpaydin for this algorithm (figure 7.3) is:

* Initialize $m_i$, $i$ = $1$, ... , $k$, for example, to $k$ random $x^t$
* Repeat
    * For all $x^t \in X$
        * $b_i^t \leftarrow \left\{\begin{array}{ll} 1 & \text{if } \Vert x^t - m_i \Vert = \min_j \Vert x^t - m_j \Vert \\
            0 & \text{otherwise} \\ \end{array}\right.$
    * For all $m_i$, $i$ = $1$, ... , $k$
        * $m_i \leftarrow \sum_t b_i^t x^t / \sum_t b_i^t$
* Until $m_i$ converge

This video by [Andrew on the topic](https://www.youtube.com/watch?v=6u19018FeHg&index=78&list=PLZ9qNFMHZ-A4rycgrgOYma6zxF4BZGGPW) might also be helpful to understand what the algorithm does. Again the notation is a little different, but the ideas are exactly the same.

## Implementing the algorithm [10 pts]

For your code, make **functions** for each of the following components of the k-means algorithm:

* Randomly initialize means from the data [1pt]
* Compute the distance between 2 points [1pt]
* Compute the matrix $b$ containing the assignments of points to clusters, based on the current means [2pts]
* Compute the matrix $m$ containing the computed mean vectors, based on the current assignment of clusters [2pts]
* Plot the means (as x's) and their assigned points (as dots), with a different color for each cluster (here you may assume the points will all be 2-dimensional, to allow them to be plotted) [2pts]
* Determine if the algorithm has converged based on the sets of current and new means [1pt]
* Combine all these functions in a general k-means function [1pt]

## Showing the results [1 pt]

To show your code works, run the algorithm on the Iris dataset using only the last 2 variables of the data. Use a value of 3 for $k$ and plot the means with their assignments. Start by running the algorithm step by step and plotting for each step. Start with the random means and then show at least 2 more steps, to show the means moving as the algorithm iterates. Finally, show the resulting plot where the means have converged and the algorithm has stopped.


In [None]:
def initialize_means(data, k):
    """Draw ``k`` random starting means inside the bounding box of ``data``.

    For every column of ``data`` the smallest and largest value are
    determined; each coordinate of each mean is then sampled uniformly
    between those two bounds.

    Args:
        data: 2-D numpy array with one point per row.
        k: number of means to generate.

    Returns:
        numpy array holding ``k`` rows, one random mean per row.
    """
    columns = data.transpose()
    lower_bounds = [np.amin(column) for column in columns]
    upper_bounds = [np.amax(column) for column in columns]

    # Outer loop over means, inner loop over axes: one uniform draw per
    # coordinate.
    sampled = [
        [uniform(low, high)
         for low, high in zip(lower_bounds, upper_bounds)]
        for _ in range(k)
    ]
    return np.array(sampled)


def dist_two_points(A, B):
    """Return the Euclidean distance between the points ``A`` and ``B``.

    Both arguments are indexable sequences of coordinates; the number of
    axes is taken from ``A``.
    """
    squared_sum = 0
    for axis, a_coordinate in enumerate(A):
        difference = a_coordinate - B[axis]
        squared_sum += difference ** 2
    return squared_sum ** 0.5


def compute_b_matrix(data, means, k):
    """Assign every data point to the index of its nearest mean.

    Args:
        data: iterable of points.
        means: indexable collection holding (at least) ``k`` mean vectors.
        k: number of means to consider.

    Returns:
        numpy array with one cluster index per point. On equal distances
        the lowest index wins; a point for which no mean compares closer
        than infinity keeps the placeholder index -1.
    """
    def nearest_mean(point):
        winner, winner_dist = -1, float('inf')
        for cluster in range(k):
            candidate = dist_two_points(point, means[cluster])
            if candidate < winner_dist:
                winner, winner_dist = cluster, candidate
        return winner

    return np.array([nearest_mean(point) for point in data])


def compute_m_matrix(data, b_matrix, k, old_m_matrix=None):
    """Compute the mean vector of every cluster from the current assignment.

    A cluster without any assigned point has no mean. Instead of parking
    all such clusters on the same point (which makes them permanent
    duplicates), each empty cluster is re-seeded on a *different* data
    point: the one currently farthest from the mean of its own cluster.

    Args:
        data: 2-D array-like with one point per row.
        b_matrix: 1-D array-like with the cluster index (0 .. k-1) of
            every row of ``data``.
        k: number of clusters.
        old_m_matrix: accepted for call compatibility only; it is not
            used, because the previous mean of an empty cluster is what
            left it empty in the first place.

    Returns:
        numpy float array with ``k`` rows, one mean per cluster.

    Raises:
        ValueError: if ``data`` is empty or not 2-D, or if ``b_matrix``
            does not hold one valid cluster index per point.
    """
    points = np.asarray(data, dtype=float)
    labels = np.asarray(b_matrix, dtype=int)
    if points.ndim != 2 or len(points) == 0:
        raise ValueError("data must be a non-empty 2-D array of points")
    if labels.shape != (len(points),):
        raise ValueError("b_matrix must hold one cluster index per point")
    if labels.min() < 0 or labels.max() >= k:
        raise ValueError("b_matrix contains an index outside 0..k-1")

    means = np.zeros((k, points.shape[1]))
    empty_clusters = []
    for cluster in range(k):
        members = points[labels == cluster]
        if len(members) > 0:
            means[cluster] = members.mean(axis=0)
        else:
            empty_clusters.append(cluster)

    if empty_clusters:
        # Every label refers to a non-empty cluster, so means[labels] only
        # reads rows that were filled in above.
        residuals = np.linalg.norm(points - means[labels], axis=1)
        for cluster in empty_clusters:
            farthest = int(np.argmax(residuals))
            means[cluster] = points[farthest]
            # Mark the point as used so the next empty cluster gets
            # another one.
            residuals[farthest] = -1.0

    return means


def determine_convergence(old, new):
    """Tell whether the means stopped changing between two iterations.

    Convergence here means the two arrays have the same shape and exactly
    equal elements.
    """
    means_unchanged = np.array_equal(old, new)
    return means_unchanged

In [None]:
def printy_compute_k_means(data, k):
    """Print a step-by-step trace of the individual k-means components.

    Initialises the means, shows one example distance, then runs the
    assignment and mean-update steps twice and reports whether the second
    update left the means unchanged. Nothing is returned.
    """
    def rule():
        print("---")

    means = initialize_means(data, k)
    print("Means:")
    print(means)
    rule()

    first_point, first_mean = data[0], means[0]
    print("Dist", first_point, first_mean, "=",
          dist_two_points(first_point, first_mean))
    rule()

    print("Data and associated b:")
    print(data[0:5])
    assignments = compute_b_matrix(data, means, k)
    print(assignments[0:5])
    rule()

    print("New means (or m_matrix):")
    means = compute_m_matrix(data, assignments, k, means)
    print(means)
    rule()

    print("Can do that again:")
    assignments = compute_b_matrix(data, means, k)
    next_means = compute_m_matrix(data, assignments, k, means)
    print(next_means)
    rule()

    print("Was I already done?",
          determine_convergence(means, next_means))


def plot_k_means(data, b_matrix, m_matrix, k):
    """Scatter-plot 2-D points coloured by cluster, with means as black x's.

    Args:
        data: 2-D numpy array; only the first two columns are plotted.
        b_matrix: cluster index for every row of ``data``.
        m_matrix: numpy array of means; columns 0 and 1 are plotted.
        k: number of clusters (one scatter series per cluster).
    """
    # One (x-values, y-values) pair of lists per cluster.
    grouped = [([], []) for _ in range(k)]
    for row, cluster in enumerate(b_matrix):
        xs, ys = grouped[cluster]
        xs.append(data[row, 0])
        ys.append(data[row, 1])

    plt.figure()
    for cluster, (xs, ys) in enumerate(grouped):
        plt.scatter(xs, ys, label='Class %d' % cluster, s=8)
    plt.scatter(m_matrix[:, 0], m_matrix[:, 1],
                marker='x', color='black')
    plt.legend(loc='upper left')
    plt.show()


def compute_k_means(data, k, plotting=True, max_iterations=300):
    """Run k-means on ``data`` until the means stop changing.

    Each iteration assigns every point to its nearest mean and then
    recomputes the means from that assignment. The loop ends as soon as an
    update leaves the means unchanged, or after ``max_iterations`` updates
    as a safeguard against a run that never settles.

    Args:
        data: 2-D numpy array with one point per row.
        k: number of clusters.
        plotting: when true, plot the means and their assigned points for
            every iteration, including the final converged state.
        max_iterations: upper bound on the number of mean updates.

    Returns:
        numpy array with the ``k`` means of the last iteration.
    """
    m_matrix = initialize_means(data, k)
    for _ in range(max_iterations):
        b_matrix = compute_b_matrix(data, m_matrix, k)
        if plotting:
            # plot_k_means shows the figure itself and returns nothing, so
            # no further plt.show() call is needed here.
            plot_k_means(data, b_matrix, m_matrix, k)
        new_m_matrix = compute_m_matrix(data, b_matrix, k, m_matrix)
        if determine_convergence(m_matrix, new_m_matrix):
            break
        m_matrix = new_m_matrix
    return m_matrix


# Work on the last two feature columns only (column slice 2:4), copied into
# a fresh array, and cluster them into three groups.
iris_data = np.array(X_matrix[:, 2:4])
iris_k = 3
# Optional step-by-step debug trace of the individual functions; disabled.
# printy_compute_k_means(iris_data, iris_k)
# print("***")
print("Last two iris columns %d means clustered:" % iris_k)
# `plotting` is left at its default (True), so plots are drawn while the
# algorithm iterates.
iris_k_means = compute_k_means(iris_data, iris_k)

## Elbow Method [4 pts]

For this dataset we have the benefit of already knowing the number of clusters. However, there are even some things we can do if the number of clusters is not known or set beforehand. One possible approach is the *elbow method*. Watch the video from Andrew on the topic below or find another resource describing it.

[Choosing the number of clusters?](https://www.youtube.com/watch?v=izCbbMbRWHw&list=PLZ9qNFMHZ-A4rycgrgOYma6zxF4BZGGPW&index=81)

Now extend your implementation to include the following:

* Create a sensible cost function using the data and the current values of $b$ and $m$, that steps of the algorithm will minimize. [1 pt]
* Create a function that repeatedly computes the converged cost for a specific value of $k$ and averages the results. [1 pt]
* Running your code many times will most likely result in an error in your `compute_means` function at some point. Find out what is causing the error and create a new version of `compute_means` that solves this problem in some sensible way. [1 pts]
* Compute the average cost for k values $1$ to $10$ and combine them in a plot. Briefly discuss if this plot corresponds with your expectations. [1 pt]


In [None]:
def clustering_cost(data, k, m, b):
    """Return the mean Euclidean distance of each point to its assigned mean.

    Args:
        data: sequence of points.
        k: number of clusters (not used in the computation).
        m: means, indexable by cluster index.
        b: cluster index for every point in ``data``.
    """
    n_points = len(data)
    accumulated = 0.0
    for index, point in enumerate(data):
        assigned_mean = m[b[index]]
        accumulated += dist_two_points(point, assigned_mean)
    return accumulated / n_points


def clustering_repeater(data, k, repeats=3):
    """Return the converged clustering cost for ``k``, averaged over runs.

    K-means starts from random means, so a single run can end in a poor
    local optimum; running it ``repeats`` times and averaging the cost
    smooths that out.

    Args:
        data: 2-D numpy array with one point per row.
        k: number of clusters.
        repeats: number of independent k-means runs to average over.

    Returns:
        The mean of the ``repeats`` converged costs.

    Raises:
        ValueError: if ``repeats`` is smaller than 1.
    """
    if repeats < 1:
        raise ValueError("repeats must be at least 1")

    total_cost = 0.0
    for _ in range(repeats):
        means = compute_k_means(data, k, False)
        assignments = compute_b_matrix(data, means, k)
        total_cost += clustering_cost(data, k, means, assignments)
    # Average over the number of runs (not over k), so costs for different
    # k stay comparable.
    return total_cost / repeats


def average_costs_plots(data, max_k=10):
    """Plot the repeated-run clustering cost for k = 1 .. ``max_k``.

    The cost of every k comes from ``clustering_repeater``; the values are
    drawn as a single line with k on the x-axis.
    """
    k_values = range(1, max_k + 1)
    costs = [clustering_repeater(data, k) for k in k_values]

    plt.figure()
    plt.title('Normalized Euclidean cost (y-axis) vs k (x-axis)')
    plt.plot(range(1, max_k + 1), costs)
    plt.show()


# Draw the cost-versus-k curve for the two-column Iris data, using the
# default range of k (1 up to max_k=10).
average_costs_plots(iris_data)

# Written discussion of the plot, emitted as cell output.
print("These are (usually) the results I would have expected.")
print("Costs decrease as k increases, because a higher k means")
print("more means and more means means it's 'harder' for")
print("datapoints to be far from their mean.")
print()
print("But sometimes a higher k is associated with a higher cost.")
print("I'll conveniently ascribe that to randomness.")

## kNN [5 pts]

The final algorithm of this week is kNN, the quintessential non-parametric classification algorithm. To implement this you may use any resource on the algorithm you prefer, be it the description in the slides, in *Alpaydin*, a video on the topic or some other content you found. The division of functions is entirely up to you as well; you will only be scored on the following components being present:

* A functioning implementation of kNN [2 pts]
* A version of kNN that weights the contribution of k datapoints based on distance to the point being evaluated [1 pt]
* A sufficient documentation of your functions and the choices you made in your design [1 pt]
* A structured comparison of the 2 algorithms using different values of K on the Iris dataset [1 pt]

In [None]:
def validation_split(data, ratio=0.5):
    """Shuffle the rows of ``data`` and split them into two parts.

    The input is left untouched; a copy is shuffled in place.

    Args:
        data: 2-D array with one sample per row.
        ratio: fraction of the rows that goes into the first part.

    Returns:
        Tuple ``(first, second)``: the first ``int(len(data) * ratio)``
        shuffled rows and the remaining rows.
    """
    shuffled = np.array(data, copy=True)
    np.random.shuffle(shuffled)
    cut = int(ratio * len(data))
    first_part = shuffled[:cut, :]
    second_part = shuffled[cut:, :]
    return (first_part, second_part)


def get_k_nearest_list(ref, data, k):
    """Return the row indices of the ``k`` points in ``data`` nearest to ``ref``.

    Rows are ranked by Euclidean distance to ``ref`` (ties keep their
    original order). The single closest row is skipped, matching the
    ``[1:k + 1]`` slice this function has always used.
    NOTE(review): presumably ``ref`` is itself a row of ``data`` and would
    otherwise be reported as its own nearest neighbour - confirm with the
    intended caller.

    Args:
        ref: coordinates of the reference point.
        data: array-like with one point per row.
        k: number of neighbours to return.

    Returns:
        List of at most ``k`` integer row indices, nearest first. Empty
        when ``data`` is empty or ``k`` is not positive.
    """
    points = np.asarray(data, dtype=float)
    if points.size == 0 or k <= 0:
        return []
    points = points.reshape(len(points), -1)

    distances = np.linalg.norm(points - np.asarray(ref, dtype=float), axis=1)
    # A stable sort keeps equally distant rows in their original order.
    ranking = np.argsort(distances, kind='stable')
    return ranking[1:k + 1].tolist()


def knn_label(train_data, train_labels,
              val_data, k, weighted=False):
    """Classify every validation point with the k-nearest-neighbours rule.

    For each point in ``val_data`` the ``k`` training points with the
    smallest Euclidean distance are looked up and vote for their label.

    * Unweighted: every neighbour counts as one vote.
    * Weighted: a neighbour's vote is the inverse of its distance, so
      close neighbours count more than distant ones. Only classes that
      actually occur among the neighbours can receive votes.

    Ties between classes go to the lowest label; ties between equally
    distant training points go to the one that comes first in
    ``train_data``.

    Args:
        train_data: array-like with one training point per row.
        train_labels: non-negative integer label for every training point.
        val_data: array-like with one point to classify per row.
        k: number of neighbours; values above the training-set size are
            reduced to that size.
        weighted: use inverse-distance weighted votes instead of counts.

    Returns:
        List with one integer label per validation point.

    Raises:
        ValueError: if the training set is empty, the number of labels
            differs from the number of training points, or ``k < 1``.
    """
    train = np.asarray(train_data, dtype=float)
    labels = np.asarray(train_labels, dtype=int)
    queries = np.asarray(val_data, dtype=float)

    if len(train) == 0:
        raise ValueError("train_data must contain at least one point")
    if labels.shape != (len(train),):
        raise ValueError("train_labels must hold one label per training point")
    if k < 1:
        raise ValueError("k must be at least 1")

    train = train.reshape(len(train), -1)
    k = min(k, len(train))
    n_classes = int(labels.max()) + 1

    label_list = []
    for ref in queries:
        distances = np.linalg.norm(train - ref, axis=1)
        closest = np.argsort(distances, kind='stable')[:k]

        if weighted:
            # The floor on the distance avoids dividing by zero when a
            # validation point coincides with a training point.
            weights = 1.0 / np.maximum(distances[closest], 1e-12)
        else:
            weights = None
        votes = np.bincount(labels[closest], weights=weights,
                            minlength=n_classes)
        label_list.append(int(np.argmax(votes)))
    return label_list


# Compare k-means with plain and distance-weighted kNN on a random 50/50
# split of the two-column Iris data. The same number `k` is used both as
# the number of clusters and as the number of neighbours.
(iris_data_train, iris_data_val) = validation_split(iris_data)
for k in [2, 3, 5, 10]:
    k_means = compute_k_means(iris_data_train, k, False)
    k_means_labels = compute_b_matrix(iris_data_val, k_means, k)
    k_means_cost = clustering_cost(iris_data_val, k,
                                   k_means, k_means_labels)

    # The assignment does not say where the class labels for kNN should
    # come from, so the cluster index that k-means gives every *training*
    # point is used as that point's label. The labels must be computed
    # for the training points themselves, because knn_label pairs them
    # row by row with iris_data_train.
    k_means_train_labels = compute_b_matrix(iris_data_train, k_means, k)

    k_nn_labels = knn_label(iris_data_train, k_means_train_labels,
                            iris_data_val, k)
    k_nn_labels_w = knn_label(iris_data_train, k_means_train_labels,
                              iris_data_val, k, True)
    # Score the kNN labels with the same cost as k-means: the mean
    # distance of each validation point to the mean of its predicted
    # cluster.
    k_nn_cost = clustering_cost(iris_data_val, k,
                                k_means, k_nn_labels)
    k_nn_cost_w = clustering_cost(iris_data_val, k,
                                  k_means, k_nn_labels_w)

    print("The %d-means algorithm returned" % k)
    print(k_means)
    print("For a cost of", k_means_cost)
    print()
    print("The %d-NN algorithm achieved a cost of" % k, k_nn_cost)
    print("Or, weighted:", k_nn_cost_w)
    print()