In [1]:
# import numpy here

# Some utility functions that we will use later (don't worry about these)

In [2]:
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

In [3]:
def plot_scatter(x1, x2, labels):
    """Show a 10x6 scatter plot of ``x1`` against ``x2``.

    x1     -- values placed on the horizontal axis
    x2     -- values placed on the vertical axis
    labels -- one colour value per point, mapped through the gist_rainbow colormap
    """
    _, axes = plt.subplots(figsize=(10, 6))
    axes.scatter(x=x1, y=x2, c=labels, cmap=plt.cm.gist_rainbow)
    plt.show()

In [4]:
def plot_centroids(x1, x2, centroids1, centroids2):
    """Show the data points and the centroids together in one 6x6 scatter plot.

    x1, x2                 -- coordinates of the data points
    centroids1, centroids2 -- coordinates of the centroids

    Data points get the colour value 0 and centroids the colour value 100, so
    the two groups land on opposite ends of the bwr colormap.
    """
    point_colours = np.zeros(x1.shape)
    centroid_colours = np.full(centroids1.shape, 100.0)

    # Append the centroids after the data points so a single scatter call draws both.
    all_x1 = np.concatenate((x1, centroids1))
    all_x2 = np.concatenate((x2, centroids2))
    all_colours = np.concatenate((point_colours, centroid_colours))

    _, axes = plt.subplots(figsize=(6, 6))
    axes.scatter(x=all_x1, y=all_x2, c=all_colours, cmap=plt.cm.bwr)
    plt.show()

In [5]:
def plot_kmeans(x1, x2, centroids1, centroids2, labels):
    """Show two side-by-side scatter plots summarising a clustering.

    Left panel:  the data points (``x1`` vs ``x2``) coloured by ``labels``
                 through the gist_rainbow colormap.
    Right panel: the data points (colour value 0) together with the centroids
                 (colour value 1), drawn with the bwr colormap.
    """
    is_point = np.zeros(x1.shape)
    is_centroid = np.ones(centroids1.shape)

    # Data points first, centroids appended at the end, for the right-hand panel.
    combined_x1 = np.concatenate((x1, centroids1))
    combined_x2 = np.concatenate((x2, centroids2))
    combined_colours = np.concatenate((is_point, is_centroid))

    _, (left, right) = plt.subplots(1, 2, figsize=(20, 6))
    left.scatter(x=x1, y=x2, c=labels, cmap=plt.cm.gist_rainbow)
    right.scatter(x=combined_x1, y=combined_x2, c=combined_colours, cmap=plt.cm.bwr)
    plt.show()

In [6]:
def display_iris(X, y):
    """Print the samples in ``X`` as a plain-text table, one row per sample.

    X -- 2D array; the first four columns of every row are printed
    y -- per-row class codes: 0 prints "Setosa", 1 prints "Versicolour",
         anything else prints "Virginica"

    The table is a header line, a dashed ruler, then one line per row of ``X``.
    """
    no_samples, _ = X.shape
    headings = ["Sepal Length", "Sepal Width", "Petal Length", "Petal Width", "Flower Type"]
    label_width = 30
    cell_width = max(len(heading) for heading in headings)
    divider = " | "

    header = " " * label_width
    header += "".join(divider + heading.ljust(cell_width) for heading in headings)
    header += " |\n"

    # Ruler spans the label column, every cell, and every 3-character divider.
    ruler_length = label_width + len(headings) * cell_width + (len(headings) + 1) * 3
    pieces = [header, "-" * ruler_length + "\n"]

    for row in range(no_samples):
        row_label = "Flower number {0} at index {1}".format(row + 1, row)
        cells = [str(X[row, col]) for col in range(4)]

        code = y[row]
        if code == 0:
            species = "Setosa"
        elif code == 1:
            species = "Versicolour"
        else:
            species = "Virginica"
        cells.append(species)

        line = row_label.ljust(label_width)
        line += "".join(divider + cell.ljust(cell_width) for cell in cells)
        pieces.append(line + " |\n")

    print("".join(pieces))

# What is a matrix / a 2D array?

You can think of a matrix as a grid of numbers (or an ordered collection of vectors). <br />
In Python, a matrix can be represented as a list of lists of numbers (a 2D array). <br /><br />


In [7]:
# Play with python and numpy matrices here

# Let's play with some actual data

The dataset we are going to look at is the Iris dataset: <br />

* Every row represents a different flower.
* Every column represents a different **feature** for that flower

The features (columns) represent the Sepal Length, Sepal Width, Petal Length and Petal Width for each flower. <br />
<br />
Each flower is one of three different types: Setosa, Versicolour, and Virginica. <br />

In [8]:
# Load the Iris dataset that ships with scikit-learn.
iris = load_iris()
# X: the measurements (one row per flower); y: the flower-type code for each row.
X, y = iris.data, iris.target

In [9]:
# Print every row of X as a text table, with the flower type looked up from y.
display_iris(X, y)

                               | Sepal Length | Sepal Width  | Petal Length | Petal Width  | Flower Type  |
------------------------------------------------------------------------------------------------------------
Flower number 1 at index 0     | 5.1          | 3.5          | 1.4          | 0.2          | Setosa       |
Flower number 2 at index 1     | 4.9          | 3.0          | 1.4          | 0.2          | Setosa       |
Flower number 3 at index 2     | 4.7          | 3.2          | 1.3          | 0.2          | Setosa       |
Flower number 4 at index 3     | 4.6          | 3.1          | 1.5          | 0.2          | Setosa       |
Flower number 5 at index 4     | 5.0          | 3.6          | 1.4          | 0.2          | Setosa       |
Flower number 6 at index 5     | 5.4          | 3.9          | 1.7          | 0.4          | Setosa       |
Flower number 7 at index 6     | 4.6          | 3.4          | 1.4          | 0.3          | Setosa       |
Flower number 8 at index 7 

In [10]:
# Try getting only the petal features (columns) here

In [11]:
# Let's plot them (using plot_scatter())

## What if we don't know what the type for every flower is?
## Let's try _clustering_ the data
Clustering is a method of grouping data points in different groups, called **clusters**. <br />
Data points in the same cluster are "similar" to one another.

## Algorithm: K-means clustering
1. Find a measure of "similarity" for points (rows) in the data (e.g. distance between each row) 
2. Choose how many clusters you want to divide the data into, e.g. _k_
3. Select _k_ random points from the data; we'll call these the cluster _centroids_
4. Repeat the following steps many times: <br />
    a. For each point (row) in the data, assign/label it with the _centroid_ it is closest to &rarr; Each point will be labeled to the _centroid_ it is closest to <br />
    b. For each _centroid_: take all points assigned to it in step **a** and find the mean of these points &rarr; This new point will be the new _centroid_ <br />
5. At the end, the points assigned to each _centroid_ will belong to a different cluster.

### 1. Finding a measure of similarity between two data points

In [12]:
# Write function to measure "similarity" here

### 2. Choosing how many clusters you want to divide the data into

In [13]:
# Choose the number of clusters

### 3. Select _k_ random points from the data

In [14]:
# Write the function to initialize the centroids here

### 4. Repeating the following:
#### a. Label each point to a centroid

In [15]:
# Write the function to label each point in the data here

#### b. Find new centroids from mean of labeled points

In [16]:
# Write the function to find new centroids by finding the mean of the labeled points

### Putting it all together

In [17]:
# Write the function to cluster a matrix of data by using the ones you just wrote

def k_means(matrix, n):
    """Exercise placeholder for the complete k-means clustering routine.

    Not implemented yet: the body is ``pass``, so calling it returns ``None``.

    matrix -- the data to cluster
    n      -- NOTE(review): presumably the number of clusters (the _k_ in the
              notebook's algorithm description) -- confirm when implementing
    """
    pass