# Demo: Single round of hierarchical clustering

In [1]:
%reset -f

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib notebook



In [3]:
import re

In [4]:
from scipy.sparse import coo_matrix

In [5]:
from sklearn.neighbors import (
    NearestNeighbors,
    KNeighborsClassifier)
from sklearn.cluster import (
    KMeans,
    MiniBatchKMeans,
    AgglomerativeClustering,
    SpectralClustering)

In [6]:
# Seed NumPy's global random number generator so repeated runs draw the same numbers.
# NOTE(review): the scikit-learn estimators in this notebook are created without
# `random_state`, so they presumably rely on this global seed -- confirm.
np.random.seed(1)

In [7]:
# Load the table; the path is relative to the notebook's working directory.
filename = "../data/CCS/subsetShots_5pct_reduced.csv"
df = pd.read_csv(filename)
# Keep only the first 600 rows. The explicit copy detaches the truncated frame
# from the full one, so adding a column below cannot raise pandas'
# SettingWithCopyWarning.
df = df.iloc[:600, :].copy()  # use this line for quick testing
# Store each row's index label as a regular column.
df["original_index"] = df.index
# print() with a single argument behaves the same on Python 2 and Python 3.
print("{} rows, {} columns".format(*df.shape))
n_obs = df.shape[0]

600 rows, 54 columns


In [8]:
# Positions of the columns whose name starts with "wavelength"; these are the features.
feature_column_indices = [
    position
    for position, column_name in enumerate(df.columns)
    if column_name.startswith("wavelength")
]
df_features = df.iloc[:, feature_column_indices]
# print() with a single argument behaves the same on Python 2 and Python 3.
print("feature-only data: {} rows, {} columns".format(*df_features.shape))

feature-only data: 600 rows, 50 columns


In [9]:
# Feature values of the first row, kept as a sample spectrum.
example_spectrum = df.iloc[0, feature_column_indices]
# Numeric wavelength parsed from each feature column name (the text after "wavelength_").
wavelengths = [
    float(re.match("wavelength_(.*)", column_name).group(1))
    for column_name in df.columns[feature_column_indices]
]

In [10]:
#plt.stem(wavelengths, example_spectrum)

In [11]:
# Neighbourhood size used below for both the k-nearest-neighbour similarity
# matrix and the k-nearest-neighbour classifier.
n_neighbors = 20

In [12]:
#nn_obj = NearestNeighbors(n_neighbors = n_neighbors, algorithm = 'ball_tree', metric='euclidean').fit(df_features)
#distances, indices = nn_obj.kneighbors(df_features)

In [13]:
@np.vectorize
def get_num_representatives(n):
    """Return how many representatives to use for ``n`` observations.

    Below the threshold (500) the answer is ``n`` itself. From the threshold
    upward it is ``offset + scale * n ** (1/3)``, where ``offset`` and
    ``scale`` are chosen so that the curve's value and slope at the threshold
    match those of the identity. The result is truncated to an ``int``.

    ``np.vectorize`` applies the function element-wise, so ``n`` may be a
    scalar or an array.
    """
    threshold = 500.0
    exponent = 1/3.

    # Cube-root curve and its derivative, both evaluated at the threshold.
    curve_at_threshold = threshold ** exponent
    slope_at_threshold = exponent * threshold ** (-2/3.)

    if n < threshold:
        return int(n)

    offset = threshold - curve_at_threshold / slope_at_threshold
    scale = 1.0 / slope_at_threshold
    return int(offset + scale * n ** exponent)

In [14]:
# Choose the number of k-means representatives from the dataset size. The
# np.vectorize-wrapped helper returns a NumPy value, so convert it to a plain
# int before passing it on as n_clusters.
num_representatives = int(get_num_representatives(n_obs))
# NOTE: doesn't support custom metric?
# NOTE(review): `precompute_distances` is, as far as I know, not accepted by
# recent scikit-learn releases -- confirm against the installed version.
km_obj = KMeans(
    n_clusters = num_representatives,
    precompute_distances = False,
    copy_x = True,
    n_init = 1,
).fit(df_features)
# One row per representative (cluster center).
centers = km_obj.cluster_centers_

In [15]:
# Report the shape of the cluster-center matrix.
# print() with a single argument behaves the same on Python 2 and Python 3.
print("representatives (cluster centers): {} rows, {} cols".format(*centers.shape))

representatives (cluster centers): 593 rows, 50 cols


In [16]:
def nn_result_to_sparse_similarity(distances, indices, sigma):
    """Convert a k-nearest-neighbour query result into a sparse similarity matrix.

    Parameters
    ----------
    distances : array-like, shape (n, k)
        Distance from point ``i`` to its ``j``-th neighbour.
    indices : array-like of int, shape (n, k)
        Index of that neighbour among the same ``n`` points.
    sigma : float
        Scale of the exponential kernel.

    Returns
    -------
    scipy.sparse.coo_matrix, shape (n, n)
        Entry ``(i, indices[i, j])`` holds ``exp(-distances[i, j] / sigma)``.
        The matrix is not symmetrized: ``(i, j)`` may be stored without
        ``(j, i)``.
    """
    # Coerce the inputs so plain nested lists work, and so integer distances
    # are never subject to integer division by an integer sigma.
    distances = np.asarray(distances, dtype=float)
    indices = np.asarray(indices)
    n_points, n_per_point = indices.shape

    similarities = np.exp(-distances / sigma)
    # TODO: does it matter if we swap (rows, cols)? -- need to examine this construction more closely...
    # Row i is the query point, repeated once for each of its neighbours.
    rows = np.repeat(np.arange(n_points), n_per_point)
    cols = indices.ravel()
    values = similarities.ravel()
    return coo_matrix((values, (rows, cols)), shape=(n_points, n_points))

def get_sparse_similarity(data, n_neighbors, metric = 'euclidean', sigma = 1.0):
    """Build the k-nearest-neighbour similarity matrix of ``data``.

    A ball-tree neighbour index is fitted on ``data`` and then queried with
    ``data`` itself; the resulting distances and neighbour indices are passed,
    together with ``sigma``, to ``nn_result_to_sparse_similarity``.
    """
    knn_index = NearestNeighbors(
        algorithm = 'ball_tree',
        metric = metric,
        n_neighbors = n_neighbors,
    )
    knn_index = knn_index.fit(data)
    neighbor_distances, neighbor_indices = knn_index.kneighbors(data)
    return nn_result_to_sparse_similarity(
        neighbor_distances, neighbor_indices, sigma = sigma)

In [17]:
def get_subset_sparse_square(sparse_matrix, indices):
    """Keep only the entries whose row AND column index both appear in ``indices``.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Converted to COO format if it is not already in it.
    indices : iterable of int
        Row/column indices to keep.

    Returns
    -------
    scipy.sparse.coo_matrix
        Same shape as ``sparse_matrix``. Surviving entries keep their original
        coordinates; nothing is renumbered.
    """
    coo = sparse_matrix.tocoo()
    # A set gives constant-time membership tests instead of scanning a sequence.
    wanted = set(indices)
    # Iterate over the matrix's own stored entries (the previous loop bound
    # referred to an undefined name).
    keep = np.array(
        [i in wanted and j in wanted for i, j in zip(coo.row, coo.col)],
        dtype=bool)
    # Passing the shape explicitly keeps the result well defined even when no
    # entry survives, in which case scipy cannot infer the dimensions.
    return coo_matrix(
        (coo.data[keep], (coo.row[keep], coo.col[keep])),
        shape=coo.shape)

In [18]:
# Sparse k-nearest-neighbour similarity matrix over the k-means representatives.
sim_sparse = get_sparse_similarity(centers, n_neighbors=n_neighbors)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(sim_sparse.shape)

(593, 593)


In [19]:
# Split the representatives into two groups with spectral clustering, using
# sim_sparse as the precomputed affinity matrix.
# NOTE(review): sim_sparse is built from a k-nearest-neighbour query and is not
# symmetrized anywhere in this notebook; check how the installed scikit-learn
# treats a non-symmetric precomputed affinity.
spc_obj = SpectralClustering(n_clusters = 2, affinity = 'precomputed')
representative_partition = spc_obj.fit_predict(sim_sparse)



In [20]:
# Report how many representatives received each of the two labels.
# count_nonzero counts the matches directly instead of building a filtered
# array just to take its length; print() with a single argument behaves the
# same on Python 2 and Python 3.
print("partitioned representatives: #0: {}, #1: {}".format(
    np.count_nonzero(representative_partition == 0),
    np.count_nonzero(representative_partition == 1)))

partitioned representatives: #0: 331, #1: 262


In [21]:
# k-nearest-neighbour classifier trained on the representatives and their
# partition labels.
nn_classifier = KNeighborsClassifier(
    algorithm = 'ball_tree',
    metric = 'euclidean',
    n_neighbors = n_neighbors,
)
nn_classifier = nn_classifier.fit(centers, representative_partition)

In [22]:
# Label every observation with the classifier that was fitted on the
# representatives and their partition labels.
full_partition = nn_classifier.predict(df_features)

In [23]:
# Split the full table by predicted partition label and report the sizes.
df_part0 = df[full_partition == 0]
df_part1 = df[full_partition == 1]
# print() with a single argument behaves the same on Python 2 and Python 3.
print("partitioned full dataset: #0: {}, #1: {}".format(len(df_part0), len(df_part1)))

partitioned full dataset: #0: 327, #1: 273


# Run hierarchical clustering using `hier_clust.py`

In [24]:
import hier_clust
# Run the project's hierarchical clustering on (at most) the first 600 feature rows.
# NOTE(review): presumably `tree` is the cluster hierarchy and `assign` the
# per-row cluster assignment -- confirm against hier_clust.
hc = hier_clust.HierClust()
tree, assign = hc.fit(df_features[:600])

In [25]:
# Number of assignments, then number of distinct assigned values.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(len(assign))
print(len(np.unique(assign)))

600
600


# Recovering tree structure

In [26]:
from tree_util import reconstruct_tree

In [27]:
# Load the annotated CSV; its "cluster_id" column is used below to rebuild the tree.
annot_df = pd.read_csv("../data/CCS/subsetShots_5pct_reduced_with_cluster_id.csv")

In [28]:
# Rebuild a tree from the stored cluster ids.
# NOTE: this rebinds `tree`, replacing the value returned by hc.fit earlier.
tree = reconstruct_tree(annot_df["cluster_id"])

In [29]:
# Summaries of the tree after prune(2), with every node's data replaced by its
# length: a reduction over the leaves, a reduction over all nodes, and the
# text rendering of the pruned tree.
# NOTE(review): prune / map_data / reduce_* come from tree_util, which is not
# shown here -- see that module for their exact semantics.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(tree.prune(2).map_data(lambda x: len(x)).reduce_leaves(lambda x, y: x + y))
print(tree.prune(2).map_data(lambda x: len(x)).reduce_all(lambda x, y, z: x + y + z))
print(tree.prune(2).map_data(lambda x: len(x)).str_display())

11146
33438
Tree(data = 11146, children = [
  Tree(data = 917, children = [
    Tree(data = 776, children = [])
    Tree(data = 141, children = [])
  ])
  Tree(data = 10229, children = [
    Tree(data = 4597, children = [])
    Tree(data = 5632, children = [])
  ])
])
