 ## Using different clustering methods from the pytometry package
 In this example we will evaluate several clustering algorithms on AnnData objects using the pytometry package.

In [57]:
from __future__ import division, print_function

import sys
MODULE_FULL_PATH = '/home/thomas/PycharmProjects/pytometry'
sys.path.insert(1, MODULE_FULL_PATH)

import csv
from tkinter import Tk, filedialog, messagebox
import anndata
import FlowCytometryTools as fct
from matplotlib.pylab import *
from natsort import natsorted
from sklearn.datasets import make_blobs
import os.path
from pathlib import Path
import pytometry.clustering.cluster as cls
import operator
import pandas as pd

Choose a maximum number of centers for your clustering. The clustering will be calculated for every number of centers from 2 up to this maximum.

In [58]:
# Largest number of cluster centers to evaluate. It is passed to cls.Cluster and
# is the inclusive upper bound of the range(2, max_center + 1) loops further down.
max_center = 4

Let's create some test data with 8 features and 3 centers. A Cluster object will also be initialized.

In [59]:
elements = range(4)

# Every pass rebuilds the blobs, the AnnData wrapper and the Cluster object under
# the same names, so only the objects from the final pass remain available to the
# cells that follow.
for data in elements:
    # Seed the generator with the loop index: without a fixed random_state,
    # make_blobs draws new points on every run and the scores in this notebook
    # cannot be reproduced after a kernel restart.
    alldata, reference_Label = make_blobs(750, n_features=8, centers=3, random_state=data)
    adata = anndata.AnnData(X=alldata)
    cluster = cls.Cluster(adata, max_center)

Now you can cluster your data with one of the four clustering methods: k-means, fuzzy c-means, the Leiden algorithm, or a SOM (self-organizing map). Let's start with the Leiden algorithm, using a resolution of 1. Please note that the Leiden algorithm doesn't need an a priori number of centers.

In [60]:
    # Leiden clustering at resolution 1. The call is the last expression of the
    # cell, so whatever it returns is displayed as the cell output
    # (presumably the runtime in seconds -- confirm against the Cluster class).
    cluster.leidend_clustering(resolution=1)

0.10418820381164551

The clustering results will be stored in the .obs section, as well as in the attribute "cluster.community_cluster".

In [61]:
    # The Leiden result can be read from two places: the 'leiden' column of the
    # AnnData .obs table and the Cluster object's community_cluster attribute.
    # Distinct names are used so the first lookup is not overwritten immediately.
    leiden_obs = cluster.adata.obs['leiden']
    leiden = cluster.community_cluster

Now the silhouette score and the gap score will be calculated. To calculate the gap score you will need the inertia first.

In [62]:
    # Silhouette score of the Leiden partition.
    cluster.silhouette_score(methode='leiden')

    # The gap score takes the inertia of the partition as input, together with
    # the number of distinct Leiden labels.
    inertia = cluster.compute_inertia(label=cluster.community_cluster)
    gap = cluster.gap_score(
        methode='leiden',
        inertia=inertia,
        n_center=len(np.unique(cluster.community_cluster)),
    )

  s = [np.sum((np.array(clusters)[i] - center[i, :]) ** 2) for i in range(0, len(center))]


Now we calculate the k-means clustering for the specified range of centers. The k-means package also contains a method to calculate the optimal number of centers.

In [63]:
    # Run k-means, let the Cluster object work out the optimal number of centers,
    # and keep that number as k. Later cells use str(k) as a dictionary key to
    # pick the labels of the optimal run.
    cluster.k_means_clustering()
    cluster.calc_optimal_k()
    k = cluster.optimal_k

Calculate the silhouette and the gap score for the specified range of centers.

In [64]:
    cluster.silhouette_score()

    # Gap score for every evaluated number of centers, keyed by str(n_center).
    # NOTE(review): the inertia is read from `k_medoids_inertia` although the method
    # passed to gap_score is 'kmeans' -- confirm this is the intended attribute.
    gaps = dict()
    for n_center in range(2, max_center + 1):
        gaps[str(n_center)] = cluster.gap_score('kmeans', cluster.k_medoids_inertia[str(n_center)], n_center)

    # Store the labels of the optimal k-means run as a categorical .obs column.
    # The categories are derived from the k-means labels themselves. Taking them
    # from the Leiden labels turns every k-means label that has no Leiden
    # counterpart into NaN, because pd.Categorical drops values that are not
    # listed in `categories`.
    kmeans_labels = cluster.k_means_clust[str(k)].astype('U')
    observation = pd.Categorical(kmeans_labels, categories=natsorted(np.unique(kmeans_labels)))
    cluster.adata.obs['kmeans'] = observation

Now we calculate the fuzzy c-means clustering for the specified range of centers. The integer representation is stored in the .obs section; the float (fuzzy) representation is stored in the .uns section. We also calculate the silhouette and the gap score for the specified range of centers.

In [65]:
    cluster.fuzzy_clustering()

    # Store the labels for k centers as a categorical .obs column.
    # The categories are derived from the fuzzy labels themselves. Taking them from
    # the Leiden labels turns every label that is missing there (or that has a
    # different dtype) into NaN, because pd.Categorical drops values that are not
    # listed in `categories`.
    fuzzy_labels = cluster.fuzzy_membership[str(k)]
    observation = pd.Categorical(fuzzy_labels, categories=np.unique(fuzzy_labels))
    cluster.adata.obs['fuzzy_int'] = observation

    cluster.silhouette_score(methode='fuzzy')

    # Gap score for every evaluated number of centers, keyed by str(i).
    gaps = dict()
    for i in range(2, max_center + 1):
        inertia = cluster.compute_inertia(label=cluster.fuzzy_membership[str(i)])
        gap = cluster.gap_score(methode='fuzzy', inertia=inertia, n_center=i)
        gaps[str(i)] = gap

    # Keep the fuzzy_cluster entry for the optimal number of centers in .uns.
    cluster.adata.uns['fuzzy_float'] = cluster.fuzzy_cluster[str(cluster.optimal_k)]

  s = [np.sum((np.array(clusters)[i] - center[i, :]) ** 2) for i in range(0, len(center))]


Lastly, we calculate the SOM clustering. Again, we calculate the silhouette and the gap score for the specified range of centers.

In [66]:
    cluster.som_clustering()
    cluster.silhouette_score(methode='SOM')

    # Start from an empty dict so this cell does not depend on a `gaps` variable
    # left over from an earlier cell.
    gaps = dict()
    for i in range(2, max_center + 1):
        inertia = cluster.compute_inertia(label=cluster.SOM_label[str(i)])
        # The number of centers is counted from the distinct labels in SOM_label[str(i)].
        gap = cluster.gap_score(methode='SOM', inertia=inertia, n_center=len(np.unique(cluster.SOM_label[str(i)])))
        gaps[str(i)] = gap

    # Store the labels for k centers as a categorical .obs column.
    # The categories must be the distinct SOM labels. np.unique(k) on the scalar k
    # yields the single category [k], which turns every label other than k into NaN.
    som_labels = cluster.SOM_label[str(k)]
    observation = pd.Categorical(som_labels, categories=np.unique(som_labels))
    cluster.adata.obs['som'] = observation

  s = [np.sum((np.array(clusters)[i] - center[i, :]) ** 2) for i in range(0, len(center))]


For all four clustering methods there is also the option to return the runtime.

In [67]:
# Run the Leiden clustering again and print the value it returns, which the
# surrounding text describes as the runtime.
time = cluster.leidend_clustering(resolution=1)
print(time)

0.06450653076171875


In [68]:
# Run the k-means clustering again and print the value it returns, which the
# surrounding text describes as the runtime.
time = cluster.k_means_clustering()
print(time)

0.13340258598327637


In [69]:
# Run the fuzzy clustering again and print the value it returns, which the
# surrounding text describes as the runtime.
time = cluster.fuzzy_clustering()
print(time)

0.014683961868286133


For the SOM clustering you can return the cluster index, the calculated weights, and the runtime.

In [70]:
# som_clustering() returns three values, unpacked here as the cluster index, the
# weights and the runtime; only the runtime is printed.
index, weights, time = cluster.som_clustering()
print(time)

0.08607745170593262
