# Getting colors for plotting and evaluating clustering



In [None]:
# Baked-in within python modules
import ast
from collections import defaultdict

# Alphabetical order for nonstandard python modules is conventional
# We're doing "import superlongname as abbrev" for our laziness - this way we don't have to type out the whole thing each time.

# Python plotting library
import matplotlib as mpl
import matplotlib.pyplot as plt

# Numerical python library (pronounced "num-pie")
import numpy as np

# Dataframes in Python
import pandas as pd

# T-test of independent samples
from scipy.stats import ttest_ind

# Statistical plotting library we'll use
import seaborn as sns
sns.set(style='whitegrid')

# Matrix decomposition
from sklearn.decomposition import PCA, FastICA

# Manifold learning
from sklearn.manifold import MDS, TSNE

# Clustering
from sklearn.cluster import KMeans, MiniBatchKMeans

# Plotting dendrograms
from scipy.cluster import hierarchy

# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline

In [None]:
# --- Load macaulay2016 data --- #
macaulay2016_expression = pd.read_csv('../data/macaulay2016/gene_expression_s.csv', index_col=0)


# Set maximum columns to display as 50 because the dataframe has 49 columns
pd.options.display.max_columns = 50

macaulay2016_metadata = pd.read_csv('../data/macaulay2016/sample_info_qc.csv', index_col=0)
# Add column for gfp: dark green for 'HIGH' cells, pale green for everything else
macaulay2016_metadata['gfp_color'] = ['#31a354' if c == 'HIGH' else '#e5f5e0' for c in macaulay2016_metadata['condition']]

# The cluster colors are read from the CSV as text, so they must be parsed into
# Python objects before matplotlib can use them. `ast.literal_eval` only accepts
# Python literals, so -- unlike the builtin `eval` -- it cannot execute code that
# happens to be sitting in the data file.
# NOTE(review): assumes every entry is a plain literal (e.g. an RGB tuple) -- confirm
macaulay2016_metadata['cluster_color'] = macaulay2016_metadata['cluster_color'].map(ast.literal_eval)

# --- Filter macaulay2016 data --- #
# Keep only the rows whose ID starts with 'ENS'
ensembl_genes = [x for x in macaulay2016_expression.index if x.startswith('ENS')]
# IDs of the metadata rows whose "Pass QC" flag is set
cells_pass_qc = macaulay2016_metadata.index[macaulay2016_metadata["Pass QC"]]

macaulay2016_expression_filtered = macaulay2016_expression.loc[ensembl_genes, cells_pass_qc]

# Recalculate TPM: rescale every column so that it sums to one million again
macaulay2016_expression_filtered = 1e6 * macaulay2016_expression_filtered / macaulay2016_expression_filtered.sum()

# Transpose so it's machine learning format (rows and columns swap places)
macaulay2016_expression_filtered = macaulay2016_expression_filtered.T

# Take only "expressed genes" with expression greater than 1 in at least 3 cells
mask = (macaulay2016_expression_filtered > 1).sum() >= 3
macaulay2016_expression_filtered = macaulay2016_expression_filtered.loc[:, mask]
print('macaulay2016_expression_filtered.shape', macaulay2016_expression_filtered.shape)

# Add 1 and log10
macaulay2016_expression_log10 = np.log10(macaulay2016_expression_filtered + 1)

# Macaulay2016 plotting colors, in the same row order as the expression matrix
macaulay2016_gfp_colors = macaulay2016_metadata.loc[macaulay2016_expression_log10.index, 'gfp_color']

# Get cluster colors from the paper, again in the row order of the expression matrix
macaulay2016_cluster_colors_from_paper = macaulay2016_metadata.loc[macaulay2016_expression_log10.index, 'cluster_color']
macaulay2016_clusters_from_paper = macaulay2016_metadata.loc[macaulay2016_expression_log10.index, 'cluster']
# Lookup table: cluster name -> color
macaulay2016_cluster_to_color_from_paper = dict(zip(macaulay2016_clusters_from_paper, macaulay2016_cluster_colors_from_paper))

## Clarification of hierarchical clustering goals

Use hierarchical clustering on either the PCA- or ICA-reduced data to assign clusters to the Macaulay data, then re-plot the PCA (or ICA) projection with each cell colored by its assigned cluster. **Are you able to recover the original clusters?** Use as many code cells as you need.

To clarify, the full steps for evaluating your hierarchical clustering on the Macaulay2016 dataset are:

1. Perform dimensionality reduction
2. Cluster the reduced data
3. Cut the dendrogram from the clustered data
4. Get the cluster colors and assignments
5. Re-plot the data with the sample colors
6. See how your clusters match with the Macaulay dataset

## How to get any number of colors for your data

In [None]:
# K-means starts from randomly chosen centroids, so without a fixed seed the
# cluster numbering (and every color assignment and score derived from it)
# changes on each run. Pin the seed and the number of restarts so that
# "Restart Kernel -> Run All" always reproduces the same clusters.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=0)
kmeans.fit(macaulay2016_expression_log10)

In [None]:
# Reduce the expression matrix to its first two principal components.
# With the default svd_solver='auto', scikit-learn may pick the randomized
# solver for a matrix of this size, so fix the seed to get the same
# coordinates on every run.
macaulay2016_smusher = PCA(n_components=2, random_state=0)
macaulay2016_smushed = pd.DataFrame(
    macaulay2016_smusher.fit_transform(macaulay2016_expression_log10),
    index=macaulay2016_expression_log10.index)

In [None]:
# Push the fitted k-means cluster centers through the already-fitted PCA so
# that they can be drawn on the same two axes as the cells
centroids_in_pca_space = macaulay2016_smusher.transform(kmeans.cluster_centers_)
macaulay2016_kmeans_centroids = pd.DataFrame(centroids_in_pca_space)
macaulay2016_kmeans_centroids

In [None]:
fig, ax = plt.subplots()

# Every cell, drawn in a single color
ax.scatter(
    macaulay2016_smushed[0],
    macaulay2016_smushed[1],
    color="Teal",
    edgecolor='white',
    linewidth=1,
)
# The k-means centroids, drawn on top as big black crosses
ax.scatter(
    macaulay2016_kmeans_centroids[0],
    macaulay2016_kmeans_centroids[1],
    marker='x',
    color='k',
    s=100,
    linewidth=3,
)

In [None]:
# Ask the fitted model which cluster each row of the expression matrix falls in
kmeans.predict(X=macaulay2016_expression_log10)

In [None]:
# Interactive picker for the "qualitative" ColorBrewer palettes
sns.choose_colorbrewer_palette(data_type='qualitative')

In [None]:
# Build a 20-color palette from the "husl" color system and show its swatches
n_husl_colors = 20
husl_palette = sns.color_palette('husl', n_colors=n_husl_colors)
sns.palplot(husl_palette)

In [None]:
# One color per k-means cluster. Take the count from the fitted model instead
# of hard-coding it, so the palette cannot fall out of sync with `n_clusters`
# when the number of clusters is changed.
# NOTE: 'Set1' only has 9 distinct colors; beyond that seaborn cycles them,
# so switch to a palette such as 'husl' for larger cluster counts.
kmeans_palette = sns.color_palette('Set1', n_colors=kmeans.n_clusters)
sns.palplot(kmeans_palette)

In [None]:
# Predicted cluster number for every row, indexed like the expression matrix
predicted_cluster_ids = kmeans.predict(macaulay2016_expression_log10)
labels = pd.Series(predicted_cluster_ids, index=macaulay2016_expression_log10.index)

# Translate each cluster number into its palette color
colors = [kmeans_palette[cluster_id] for cluster_id in labels]

# Both should have the same length: one label and one color per row
for per_cell_values in (labels, colors):
    print(len(per_cell_values))

In [None]:
fig, ax = plt.subplots()

# Every cell, colored by its k-means cluster
ax.scatter(
    macaulay2016_smushed[0],
    macaulay2016_smushed[1],
    color=colors,
    edgecolor='grey',
    linewidth=1,
)
# The k-means centroids, drawn on top as big black crosses
ax.scatter(
    macaulay2016_kmeans_centroids[0],
    macaulay2016_kmeans_centroids[1],
    marker='x',
    color='k',
    s=100,
    linewidth=3,
)

### Exercise 1

Change the number of clusters to 20 and use the `"husl"` palette for coloring

## Evaluating clustering

How do we evaluate the clusters that we found versus the clusters from the paper?

In [None]:
# Get the unique names of the original Macaulay2016 clusters, sorted
# alphabetically so that they're in the order we want.
# Use `macaulay2016_clusters_from_paper` rather than the full metadata table:
# it holds exactly the cells of the filtered expression matrix, in the same
# order, so the integers below line up one-to-one with the k-means labels
# and every cluster name is guaranteed to have a color in the lookup table.
cluster_names = np.sort(macaulay2016_clusters_from_paper.unique())

# Map the cluster name to an integer number (its position in the sorted names)
cluster_name_to_integer = {name: i for i, name in enumerate(cluster_names)}

paper_cluster_integers = macaulay2016_clusters_from_paper.map(cluster_name_to_integer)
paper_cluster_integers.head()

In [None]:
# Paper colors listed in the same order as `cluster_names`
macaulay2016_palette = [
    macaulay2016_cluster_to_color_from_paper[name] for name in cluster_names
]

In [None]:
from sklearn.metrics import confusion_matrix

# `confusion_matrix` compares its two arguments position by position, so both
# must describe the same cells in the same order. `labels` only covers the
# cells in the filtered expression matrix, while `paper_cluster_integers` may
# cover more cells -- select and order it by the index of `labels` first.
paper_cluster_integers_aligned = paper_cluster_integers.loc[labels.index]

# Rows: clusters from the paper; columns: k-means cluster numbers
confusion = pd.DataFrame(
    confusion_matrix(paper_cluster_integers_aligned, labels),
    index=cluster_names)
confusion.index.name = 'Macaulay2016 Labels'
confusion.columns.name = 'K-Means Predicted'

# Clustering of rows/columns is switched off: clustermap is used here only
# because it can draw the color bars next to the heatmap
confusiongrid = sns.clustermap(confusion, annot=True, fmt='d', figsize=(4, 4),
               col_cluster=False, row_cluster=False,
               row_colors=macaulay2016_palette, col_colors=kmeans_palette)

# rotate the ylabels to be horizontal instead of vertical
plt.setp(confusiongrid.ax_heatmap.get_yticklabels(), rotation=0);

### Evaluating clustering: Rand score

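The [Rand index](https://en.wikipedia.org/wiki/Rand_index) is a numeric value indicating how similar two clusterings of the same samples are: it is the fraction of sample pairs on which the two clusterings agree (both put the pair in the same cluster, or both put the pair in different clusters). The *adjusted* Rand score used below corrects this value for chance, so that random labelings score close to 0 and identical clusterings score exactly 1, regardless of how the clusters are numbered.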

In [None]:
from sklearn.metrics.cluster import adjusted_rand_score

# Two labelings that are exactly the same
adjusted_rand_score(labels_true=[0, 0, 1, 1], labels_pred=[0, 0, 1, 1])


In [None]:
# Same grouping of the samples, but with the two cluster numbers swapped
adjusted_rand_score(labels_true=[0, 0, 1, 1], labels_pred=[1, 1, 0, 0])


### Exercise 2

Try your own labels and values to see the Rand score. You can use as many samples or classes as you want.

In [None]:
# adjusted_rand_score([XXXX], [XXXX])

### Exercise 3

Get the Rand score of your clustering

In [None]:
# YOUR CODE HERE