In [3]:
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# from torchvision import datasets, transforms
from torch.utils.data import random_split
from torch.autograd import Variable
# import torchvision
import os
import random

In [4]:
import pandas as pd # pandas is a data manipulation library
import numpy as np #provides numerical arrays and functions to manipulate the arrays efficiently
import matplotlib.pyplot as plt # data visualization library
import sklearn
from sklearn.datasets import fetch_olivetti_faces

# Exercise : Convolution Neural Network

In [5]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Exercise: k-means Initialization

In [6]:
# !pip3 show scikit-learn

In [7]:
# !pip3 show numpy

If your versions don't match, the following commands (or their anaconda version) could help to get the newest stable release. If you need help with this, please ask the TAs during instruction hours.

In [8]:
# !pip3 install scikit-learn --upgrade
# !pip3 install numpy --upgrade

The functions generating the datasets are given here:

In [9]:
def generateMoons(epsilon, n):
    moons, labels = sklearn.datasets.make_moons(n_samples=n, noise=epsilon, random_state=7)
    return "moons", moons, labels, 2
def generateBlobs(epsilon, n):
    blobs, labels = sklearn.datasets.make_blobs(n_samples=n,centers=3, cluster_std=[epsilon + 1, epsilon + 2.5, epsilon + 0.5])
    return "blobs", blobs, labels, 3

Implement the centroid initialization here. Right now, it returns a random initialization. 

In [10]:
def init_centroids_greedy_pp(D,r,l=10):
    '''
        :param r: (int) number of centroids (clusters)
        :param D: (np-array) the data matrix
        :param l: (int) number of centroid candidates in each step
        :return: (np-array) 'X' the selected centroids from the dataset
    '''   
    
#     X1 = pd.DataFrame(D)
#     centroids = X1 .sample(l)#.values
    
    np.random.seed(42)
    centroids = [D[0]]

    for _ in range(1, r):
        dist_sq = np.array([min([np.inner(c-x,c-x) for c in centroids]) for x in D])
        probs = dist_sq/dist_sq.sum()
        cumulative_probs = probs.cumsum()
        t = np.random.rand()
        
        for j, p in enumerate(cumulative_probs):
            if t < p:
                i = j
                break
        
        centroids.append(D[i])

    return np.array(centroids)

In [11]:
import scipy
def spectral_clustering(W,r, X_init):
    '''
        :param W: (np-array) nxn similarity/weighted adjacency matrix
        :param r: (int) number of centroids (clusters)
        :param X_init: (function) the centroid initialization function 
        :return: (np-array) 'Y' the computed cluster assignment matrix
    '''

    
    L = np.diag(np.array(W.sum(0))[0]) - W
    Lambda, V = scipy.sparse.linalg.eigsh(L, k=r+1, which="SM")
    A = V[:,1:]
    initial_points = X_init(A,r)
    X, Y = kmeans(A, r, initial_points)
    return Y

This is the $k$-means implementation from the lecture accompanying notebook.

In [12]:
def RSS(D,X,Y):
    return np.sum((D- Y@X.T)**2)

In [13]:
def getY(labels):
    '''
        Compute the cluster assignment matrix Y from the categorically encoded labels
    '''
    Y = np.eye(max(labels)+1)[labels]
    return Y
def update_centroid(D,Y):
    cluster_sizes = np.diag(Y.T@Y).copy()
    cluster_sizes[cluster_sizes==0]=1
    return D.T@Y/cluster_sizes
def update_assignment(D,X):
    dist = np.sum((np.expand_dims(D,2) - X)**2,1)
    labels = np.argmin(dist,1)
    return getY(labels)
def kmeans(D,r, X_init, epsilon=0.00001, t_max=10000):
    X = X_init.copy()
    Y = update_assignment(D,X)
    rss_old = RSS(D,X,Y) +2*epsilon
    t=0
    #Looping as long as difference of objective function values is larger than epsilon
    while rss_old - RSS(D,X,Y) > epsilon and t < t_max-1:
        rss_old = RSS(D,X,Y)
        X = update_centroid(D,Y)
        Y = update_assignment(D,X)
        t+=1
    print(t,"iterations")
    return X,Y

We generate a dataset.

In [14]:
n=500
dataID, D, labels, r = generateMoons(0.05,n)

Run kmeans and spectral clustering based on the initialization technique.

In [16]:
# for r in [15,25,30,35]:
X_init = init_centroids_greedy_pp(D,r)
X,Y = kmeans(D,r, X_init)
#     print(f'NMI for r={r}',normalized_mutual_info_score(Y.T[0],Y.T[1]))

ValueError: operands could not be broadcast together with shapes (500,2,1) (15,2) 

In [None]:
from sklearn.metrics.cluster import normalized_mutual_info_score
normalized_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1])

Plot the clustering. The initial centroids are marked in red, and the final centroids are marked in blue. You can use this visualization to see if your initialization makes sense. It doesn't work for spectral clustering.

In [None]:
fig = plt.figure()
ax = plt.axes()
ax.axis('equal')
ax.scatter(D[:, 0], D[:, 1], c=np.argmax(Y,axis=1), s=10)
ax.scatter(X_init.T[:, 0], X_init.T[:, 1], c='red', s=50, marker = 'D')
ax.scatter(X.T[:, 0], X.T[:, 1], c='blue', s=50, marker = 'D')

We generate the moons dataset and compute spectral clustering with the implemented initialization technique.

In [None]:
dataID, D, labels, r = generateMoons(0.05,n)

In [None]:
from sklearn.neighbors import radius_neighbors_graph, kneighbors_graph
# Implement here the computation of W as knn graph
W = radius_neighbors_graph(D,0.5,include_self=False)
Y = spectral_clustering(W,r,init_centroids_greedy_pp)

plt.scatter(D[:, 0], D[:, 1], c=np.argmax(Y,axis=1), s=10)
plt.title('%s'  % ( dataID) )
plt.show()