# Intro to numpy

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# A 4x2 array: each row is one (x, y) point.
small_matrix = np.array([
    [1, 2],
    [4, 5],
    [7, 8],
    [100, 0],
])
print(small_matrix)
print(small_matrix.shape)
# Plot the first column against the second column.
plt.scatter(small_matrix[:, 0], small_matrix[:, 1])

In [None]:
# One color value per row of small_matrix, reshaped into a single column.
col = np.array([-7, 1, 1, 0]).reshape(-1, 1)
print(col.shape)
plt.scatter(small_matrix[:, 0], small_matrix[:, 1], c=col)

In [None]:
# 100 draws each: x has mean 0 and stddev 1, y has mean 5 and stddev 16.
x = np.random.normal(loc=0, scale=1, size=100)
y = np.random.normal(loc=5, scale=16, size=100)
plt.scatter(x, y)

In [None]:
def make_blob(center, radius, num_samples=100):
    """
    Sample a 2-D blob of normally distributed points.

    Args:
        center: length 2 list giving the mean of the x and y coords
        radius: standard deviation used for both coords
        num_samples: integer number of points to draw, default=100
    Returns:
        np.array with shape (num_samples, 2); column 0 holds the x samples
        and column 1 holds the y samples
    """
    # All x values are drawn before any y value.
    xs = np.random.normal(center[0], radius, num_samples)
    ys = np.random.normal(center[1], radius, num_samples)
    return np.c_[xs, ys]

# or return np.random.normal(center, radius, [num_samples, 2])

# Tight blob around (5, 10); the default num_samples gives 100 rows.
c = make_blob(center=[5, 10], radius=0.1)
assert c.shape == (100, 2)
plt.scatter(c[:, 0], c[:, 1])

# Aside - omit

In [None]:
class Blob(object):
    """A blob described by a center (indexable pair) and a radius."""

    def __init__(self, center, radius):
        # Only positions 0 (x) and 1 (y) of center are read.
        self.x_center, self.y_center = center[0], center[1]
        self.radius = radius
        
class BlobFromCoords(object):
    """A blob described by separate x and y center coordinates and a radius."""

    def __init__(self, x_center, y_center, radius):
        self.x_center, self.y_center, self.radius = x_center, y_center, radius

In [None]:
# Build the same blob two ways: from separate coordinates and from a center pair.
b = BlobFromCoords(1, 4, 5)
c = Blob([1, 4], 5)
# Single-argument print(...) gives the same output under Python 2 and Python 3.
print("{} {}".format(b.x_center, c.x_center))

def better_make_blob(blob, num_samples=100):
    """
    Sample normally distributed points around a blob object.

    blob: object with attributes x_center, y_center, radius
    num_samples: integer specifying number of samples, default=100
    Returns an np.array of shape (num_samples, 2): x samples in column 0 and
    y samples in column 1, both drawn with standard deviation blob.radius.
    """
    spread = blob.radius
    # The x column is drawn first, then the y column.
    x_col, y_col = [
        np.random.normal(mean, spread, num_samples)
        for mean in (blob.x_center, blob.y_center)
    ]
    return np.c_[x_col, y_col]

# Show the first five sampled rows for each blob object.
# print(...) with one argument behaves the same under Python 2 and Python 3.
print(better_make_blob(b)[:5])
print(better_make_blob(c)[:5])

# More numpy operations

In [None]:
# More stacking: np.r_ joins the two arrays along the first axis (rows).
x = np.array([[1, 22], [3, 4]])
y = np.array([[4, 4], [6, 6]])
z = np.r_[x, y]  # two 2x2 arrays -> one 4x2 array
print(z.shape)
print(z)

In [None]:
# exercise: replace the placeholder comment below with an expression so that
# both asserts pass
t = #z with a column of 5's appended

assert t.shape == (4, 3)
assert (t[:,2] == 5).all()

In [None]:
def make_labeled_blobs(centers, radii):
    """
    Build one blob per (center, radius) pair and tag each blob's rows with a label.

    Args:
        centers: list of centers (list of pairs of numbers)
        radii: list of radii (list of numbers)
    Returns:
        np.array of all the blobs, with a "label" column appended as the
        last column; blobs are stacked in input order and labels count up
        from 0. Returns None if no (center, radius) pair is given.
    """
    label = 0
    arr_to_return = None
    # zip pairs centers with radii and stops at the shorter of the two lists
    for c, r in zip(centers, radii):
        this_cluster = make_blob(c, r)
        # exercise
        label_col = # column of correct size with value label everywhere
        # attach the label column to the right of the sampled points
        this_cluster_labeled = np.c_[this_cluster, label_col]
        if arr_to_return is None:
            # first blob: nothing to stack onto yet
            arr_to_return = this_cluster_labeled
        else:
            # later blobs are stacked below the rows collected so far
            arr_to_return = np.r_[arr_to_return, this_cluster_labeled]
        label += 1
    return arr_to_return

# Four blobs: one center and one radius per blob.
labeled_clusters = make_labeled_blobs(
    centers=[[0, 4], [7, 7], [15, 0], [-2, -1]],
    radii=[0.5, 1, 2, 1],
)

print(labeled_clusters.shape)

# Color each point by the value in the third column.
plt.scatter(
    labeled_clusters[:, 0],
    labeled_clusters[:, 1],
    c=labeled_clusters[:, 2],
)


        
    

# k-means clustering

Given a subset X of Euclidean space\* and a number of clusters k, returns a partition of X into k clusters.

    choose k centroids from X (at random); these are cluster centers

    until convergence:

        assign every point in X to its closest centroid
    
            (now we have a partition)
        
        recalculate the centroid of each cluster
    

\* can be made more general

trying to minimize distances within clusters


In [None]:
from sklearn.cluster import KMeans
def example_kmeans(num_clusters=2):
    """
    Run k-means on the first two columns of labeled_clusters and plot the result.

    Args:
        num_clusters: number of clusters to fit, default=2
    Returns:
        the fitted KMeans object
    """
    points = labeled_clusters[:, :2]
    model = KMeans(n_clusters=num_clusters, random_state=0)
    model.fit(points)
    # Color each point by the cluster k-means assigned it to.
    plt.scatter(points[:, 0], points[:, 1], c=model.labels_)
    return model

# Two clusters.
example_kmeans(num_clusters=2)

In [None]:
# Exercise: see how the answer changes with k -- insert some new cells here
# and call example_kmeans with other values.
example_kmeans(num_clusters=3)

In [None]:
# Keep the four-cluster model around under its own name.
kmeans_1 = example_kmeans(num_clusters=4)

# How does k-means handle a linear transformation?

In [None]:
# 2x2 matrix defining the linear transformation
transf = np.array([[0.2, 0.01], [1.1, -0.9]])
# exercise
transf_labeled_clusters = # labeled_clusters transformed by transf
print(transf_labeled_clusters.shape)
# colors come from the third column of the untransformed labeled_clusters
plt.scatter(transf_labeled_clusters[:,0], transf_labeled_clusters[:,1], c=labeled_clusters[:,2])

In [None]:
# Cluster the transformed points and color them by the k-means assignment.
kmeans_2 = KMeans(n_clusters=4, random_state=0)
kmeans_2.fit(transf_labeled_clusters)
plt.scatter(
    transf_labeled_clusters[:, 0],
    transf_labeled_clusters[:, 1],
    c=kmeans_2.labels_,
)

In [None]:
# exercise: visualize the clustering of the transformed data in the original coordinates
plt.scatter( ... )

# DBSCAN

Finds core samples of high density and grows clusters from them; labels points in sparse regions as noise. Requires "epsilon" (the distance threshold for two points to be considered neighbors) and "number of neighbors" (the number of points a neighborhood must contain for a point to be considered core).

In [None]:
from sklearn.cluster import DBSCAN
# Fit DBSCAN on the first two columns only and color points by its labels.
dbs_1 = DBSCAN(eps=1, min_samples=2).fit(labeled_clusters[:, :2])
plt.scatter(labeled_clusters[:, 0], labeled_clusters[:, 1], c=dbs_1.labels_)

# Function-call form of print runs under both Python 2 and Python 3.
print("DBSCAN found {} core points.".format(len(dbs_1.core_sample_indices_)))


In [None]:
# Fit DBSCAN on the transformed points (eps left at its default) and color
# the transformed points by the resulting labels.
dbs_2 = DBSCAN(min_samples=2).fit(transf_labeled_clusters)
plt.scatter(
    transf_labeled_clusters[:, 0],
    transf_labeled_clusters[:, 1],
    c=dbs_2.labels_,
)

# Function-call form of print runs under both Python 2 and Python 3.
print("DBSCAN found {} core points.".format(len(dbs_2.core_sample_indices_)))


In [None]:
# Original coordinates, colored by the DBSCAN labels in dbs_2.
plt.scatter(
    labeled_clusters[:, 0],
    labeled_clusters[:, 1],
    c=dbs_2.labels_,
)

# Spectral clustering

### Idea
Again specify the number of clusters and similarity matrix.

Map data via normalized Laplacian to $k$-dimensional Euclidean space.

Perform (e.g.) $k$-means there.

### Detail

Let $A$ be similarity matrix ($n \times n$), except zeroes on diagonal.

Let $D$ be diagonal matrix of $A$'s row sums.

Set $L := D^{-1/2} A D^{-1/2}$

Form $X := n \times k$ matrix of top eigenvectors of $L$.

Renormalize $X$ to have norm 1 rows.

Perform (e.g.) $k$-means, view as clustering of rows; use same clustering of rows of $A$.

### Aside
Can define a kernel on graphs using similar ideas.

In [None]:
from sklearn.cluster import SpectralClustering
# Fit on the two coordinate columns only: the third column of labeled_clusters
# is passed as the color (label) column elsewhere in this notebook, and the
# k-means and DBSCAN cells likewise fit on labeled_clusters[:, :2].
spec_1 = SpectralClustering(n_clusters=4).fit(labeled_clusters[:, :2])
plt.scatter(labeled_clusters[:, 0], labeled_clusters[:, 1], c=spec_1.labels_)

In [None]:
# Cluster the transformed points, then show that clustering on the original
# coordinates.
spec_2 = SpectralClustering(n_clusters=4)
spec_2.fit(transf_labeled_clusters)
plt.scatter(
    labeled_clusters[:, 0],
    labeled_clusters[:, 1],
    c=spec_2.labels_,
)

# Another fun example

In [None]:
# 500 values drawn uniformly from [0, 1000); used below as angles.
points = np.random.uniform(low=0, high=1000, size=500)

def f(arr, radius):
    """
    Map angles to points on a circle of the given radius centered at the origin.

    Args:
        arr: array of angles in radians
        radius: radius of the circle
    Returns:
        np.array with one row per angle: (radius * cos(t), radius * sin(t));
        an empty input gives an array of shape (0, 2)
    """
    # np.cos and np.sin already work elementwise on arrays, so wrapping
    # lambdas in np.vectorize (a Python-level loop that also raises on empty
    # input) is unnecessary.
    angles = np.asarray(arr)
    return np.c_[radius * np.cos(angles), radius * np.sin(angles)]

# Points on the circle of radius 1.
circ_1 = f(points, radius=1.0)
plt.scatter(circ_1[:, 0], circ_1[:, 1])

In [None]:
# A second circle of radius 3 from a fresh set of 500 random angles.
circ_2 = f(np.random.uniform(low=0, high=1000, size=500), radius=3.0)
# Stack both circles into one array: circ_1 rows first, then circ_2 rows.
two_circ = np.r_[circ_1, circ_2]
plt.scatter(two_circ[:, 0], two_circ[:, 1])

In [None]:
# k-means with two clusters on the two circles.
km_circ = KMeans(n_clusters=2, random_state=0)
km_circ.fit(two_circ)
# Labels are scaled by 90 before being used as color values.
plt.scatter(
    two_circ[:, 0],
    two_circ[:, 1],
    c=km_circ.labels_ * 90,
    cmap='viridis',
)

In [None]:
# DBSCAN with default parameters on the two circles.
dbs_circ = DBSCAN()
dbs_circ.fit(two_circ)
plt.scatter(
    two_circ[:, 0],
    two_circ[:, 1],
    c=dbs_circ.labels_ * 90,
    cmap='viridis',
)
# To see the raw cluster assignments, inspect dbs_circ.labels_.

In [None]:
# Spectral clustering with two clusters on the two circles.
spec_circ = SpectralClustering(n_clusters=2)
spec_circ.fit(two_circ)
plt.scatter(
    two_circ[:, 0],
    two_circ[:, 1],
    c=spec_circ.labels_ * 90,
    cmap='viridis',
)
# try a perturbation