In [2]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics.cluster import silhouette_score
from sklearn.decomposition import PCA
import h5py
import numpy as np
import pandas as pd

  from ._conv import register_converters as _register_converters


### Read from h5

In [3]:
# Location of the HDF5 input, relative to the notebook's working directory.
input_file = "../simulated/hgmm100/6_groups/hgmm100_sim_loc1_zheng17.h5"
# Open read-only; the handle stays open until it is closed explicitly in a later cell.
h5f = h5py.File(name=input_file, mode="r")

In [4]:
# Read both datasets fully into memory as NumPy arrays.
# `Dataset.value` was deprecated and later removed from h5py; indexing the
# dataset with an empty tuple (`[()]`) is the supported equivalent.
matrix = h5f['matrix'][()]
barcodes = h5f['cell_attrs']['cell_names'][()]

In [7]:
# Echo the file handle; its repr shows the file name and the open mode.
h5f

<HDF5 file "hgmm100_sim_loc1_zheng17.h5" (mode r)>

In [6]:
# Preview only the first rows instead of rendering the whole matrix.
pd.DataFrame(matrix).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,56.214752,0.0,0.0,0.952792,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,13.339094,0.0,0.0,0.0,0.0,0.0,0.0,1.905585,0.0
1,21.86636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.981776,...,0.0,40.750942,0.0,0.0,0.0,0.0,0.0,0.0,1.987851,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,58.710495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,26.62723,0.0,0.0,2.576829,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,15.031501,0.858943,0.0,0.0,0.0,0.429471,0.0,4.294714,0.0
4,32.795376,0.0,0.273295,1.366474,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,14.21133,0.0,0.0,0.819884,0.0,0.0,0.0,0.0,0.0
5,16.346373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,41.410809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,91.63932,0.0,0.0,1.949773,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,25.347046,0.0,0.0,0.0,0.0,0.0,0.0,3.412102,0.0
7,31.363279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,18.210936,0.0,0.0,0.0,0.0,0.0,0.0,3.035156,0.0
8,49.711399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,19.88456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,26.581583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,18.607109,0.0,0.0,0.0,14.61987,1.329079,0.0,0.0,0.0


In [67]:
# Close the HDF5 handle; `matrix` and `barcodes` were already read into memory.
h5f.close()

### Read from csv

In [20]:
# CSV input path. Note: this rebinds `input_file`, which previously held the
# HDF5 path, so cells that read the HDF5 file must not be re-run after this one.
input_file = "../simulated/hgmm100/6_groups/analysis/block_zifa/hgmm100_sim_loc1.csv"

In [21]:
# The file has no header row, so pandas assigns integer column labels (0, 1, ...).
df = pd.read_csv(filepath_or_buffer=input_file, header=None)

In [22]:
 # Preview only the first rows instead of rendering the whole frame.
 df.head(10)

Unnamed: 0,0
0,b'Cell1'
1,b'Cell2'
2,b'Cell3'
3,b'Cell4'
4,b'Cell5'
5,b'Cell6'
6,b'Cell7'
7,b'Cell8'
8,b'Cell9'
9,b'Cell10'


In [14]:
# Column 0 is taken as the per-row labels; all remaining columns form the matrix.
barcodes = df.loc[:, 0].values
# `columns=` already identifies the axis, so no separate `axis` argument is needed.
# NOTE: `df` is rebound without column 0, so this cell cannot be re-run without
# reloading `df` first.
df = df.drop(columns=[0])
matrix = df.values

In [26]:
# Project `matrix` onto its first two principal components.
pca = PCA(n_components=2)
pca_Zhat = pca.fit_transform(X=matrix)

In [35]:
# Agglomerative clustering estimator; only the cluster count is set here,
# every other hyper-parameter keeps its library default.
agg = AgglomerativeClustering(n_clusters=4)

In [37]:
# Inspect the estimator's current hyper-parameters as a dict.
agg.get_params()

{'affinity': 'euclidean',
 'compute_full_tree': 'auto',
 'connectivity': None,
 'linkage': 'ward',
 'memory': None,
 'n_clusters': 4,
 'pooling_func': <function numpy.core.fromnumeric.mean>}

In [46]:
# Change the cluster count on the existing estimator. `set_params` returns the
# estimator itself, which is why its repr is echoed as the cell output.
agg.set_params(n_clusters=7)

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward', memory=None, n_clusters=7,
            pooling_func=<function mean at 0x2b8e53f66620>)

Agglomerative
- Linkage
- Affinity
- n_clusters

K-means

### Silhouette with k-means

In [13]:
# Settings for the k-means / silhouette sweep.
# k_min and k_max are later passed to range(), so k_max itself is excluded.
k_min, k_max = 2, 15
# Cluster on the 2-D PCA projection (`pca_Zhat` must already be defined).
data = pca_Zhat
# Distance metric handed to silhouette_score.
distance = "euclidean"

In [47]:
# Candidate cluster counts: k_min, k_min + 1, ..., k_max - 1.
# NOTE(review): range() excludes k_max, so k = k_max is never tried — confirm
# this is intended.
k_range = range(k_min, k_max)

In [48]:
# Fit one k-means model per candidate k and keep the label vector of each fit,
# in the same order as `k_range`.
# A fixed `random_state` makes the centroid initialisation — and therefore the
# labels and the silhouette scores derived from them — reproducible across runs.
predicted_labels = [
    KMeans(n_clusters=k, random_state=0).fit_predict(data) for k in k_range
]

In [58]:
# Labels from the first fit, i.e. the one for the first value in `k_range`.
predicted_labels[0]

array([0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0], dtype=int32)

In [51]:
# One silhouette score per clustering, in the same order as `predicted_labels`.
silhouette_scores = [
    silhouette_score(data, labels, metric=distance)
    for labels in predicted_labels
]

In [59]:
# Sanity check of the container type (a plain Python list).
type(silhouette_scores)

list

In [61]:
# Show the scores; entry i belongs to the i-th value in `k_range`.
silhouette_scores

[0.57496303,
 0.80883324,
 0.70629627,
 0.7482883,
 0.75858897,
 0.6254273,
 0.5299004,
 0.48155132,
 0.44082218,
 0.400135,
 0.44779128,
 0.40647224,
 0.41726068]

In [62]:
# Position of the highest silhouette score. This is an index into
# `silhouette_scores` / `predicted_labels`, not the value of k itself.
max_index = np.asarray(silhouette_scores).argmax()

In [63]:
# Labels of the clustering with the highest silhouette score.
predicted_labels[max_index]

array([2, 0, 0, 2, 1, 0, 1, 2, 2, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 2, 0, 1,
       0, 1, 1, 1, 0, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2,
       2, 2, 0, 0, 2, 0, 1, 1, 1, 2, 2, 0], dtype=int32)