In [1]:
import os
import sys
import scipy.io as sio
sys.path.insert(0,os.path.abspath('..'))
import time
import numpy as np
import SIMLR
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
from sklearn.metrics.cluster import adjusted_rand_score as ari
from scipy.sparse import csr_matrix
import h5py
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.manifold import TSNE
from sklearn.decomposition import FactorAnalysis

  from ._conv import register_converters as _register_converters


### Splatter Data

In [17]:

h5_file_path =  '../simulated/9_groups_09.03.2018/melanomaS2_sim_de0.1_loc1_zheng17.h5'

In [18]:
h5f = h5py.File(h5_file_path, 'r')
data = h5f['matrix'][:]


In [19]:
list(h5f.keys())

['cell_attrs', 'gene_attrs', 'matrix']

In [10]:
h5f.close()

In [20]:
cell_attrs = h5f['cell_attrs']
cell_groups = cell_attrs['cell_groups'][:]
h5f.close()

In [21]:
vfunc1 = np.vectorize(lambda t: t.decode('UTF-8'))
cell_groups = vfunc1(cell_groups)
vfunc2 = np.vectorize(lambda t: int(t[-1:]))
cell_groups = vfunc2(cell_groups)

In [22]:
cell_groups.shape

(2216,)

In [23]:
cell_groups

array([6, 4, 8, ..., 3, 3, 6])

In [24]:
size = data.shape[0]*data.shape[1] 

In [25]:
np.count_nonzero(data) / size * 100

7.815839350180505

In [26]:
data.shape

(2216, 1000)

In [27]:
#data = np.log10(1+data) 
data = np.log1p(data)
#data = np.log2(1+data)
##take log transform of gene counts. This is very important since it makes the data more gaussian
X = data
c = 10

In [28]:
X.shape

(2216, 1000)

### PCA

In [29]:
from sklearn.decomposition import PCA

In [30]:
### if the number of genes are more than 500, we recommend to perform pca first!
print('Start to Run PCA on the RNA-seq data!\n')
start_main = time.time()
if X.shape[1]>500:
    sk_pca = PCA(n_components=500).fit_transform(X)
else:
    X = X.todense()
print('Successfully Run PCA! PCA took %f seconds in total\n' % (time.time() - start_main))

Start to Run PCA on the RNA-seq data!

Successfully Run PCA! PCA took 7.542176 seconds in total



In [31]:
### if the number of genes are more than 500, we recommend to perform pca first!
print('Start to Run PCA on the RNA-seq data!\n')
start_main = time.time()
if X.shape[1]>500:
    fast_pca = SIMLR.helper.fast_pca(X,500)
else:
    X = X.todense()
print('Successfully Run PCA! PCA took %f seconds in total\n' % (time.time() - start_main))

Start to Run PCA on the RNA-seq data!

Successfully Run PCA! PCA took 8.135637 seconds in total



In [32]:
sk_pca

array([[-3.7640412e+00, -3.4654870e+00, -2.9067771e+00, ...,
         5.1799607e-02, -4.6983708e-02,  8.8149905e-02],
       [ 4.3785858e+00, -3.1525836e+00, -1.7819156e-01, ...,
        -1.5027785e-01,  9.3986645e-02,  4.6035293e-02],
       [ 5.0825983e-01,  4.9246616e+00, -3.6044424e+00, ...,
        -3.6140449e-02, -5.2986294e-03,  4.0971372e-02],
       ...,
       [-1.2836295e+00,  7.9501122e-01,  1.7527990e+00, ...,
         5.9174456e-02, -7.9074554e-02,  4.0365127e-03],
       [-1.1536378e+00,  1.6899648e+00,  1.9099833e+00, ...,
        -2.9531773e-02, -2.3121027e-02, -1.3635076e-02],
       [-2.5607970e+00, -1.9192547e+00, -9.6278483e-01, ...,
         2.8449696e-02, -2.3110675e-02, -4.8065048e-02]], dtype=float32)

In [33]:
fast_pca

array([[ 0.20133498, -0.19013257, -0.16473354, ..., -0.0015512 ,
        -0.00338604, -0.0204238 ],
       [-0.30505601, -0.22529048, -0.01315562, ...,  0.06333356,
        -0.00553513,  0.00643446],
       [-0.01430464,  0.14216705, -0.10747989, ...,  0.01558909,
        -0.00771909, -0.00889238],
       ...,
       [ 0.08536696,  0.05423081,  0.12350567, ..., -0.01423562,
         0.01252234,  0.00373931],
       [ 0.06952028,  0.10445906,  0.12194862, ...,  0.00127415,
        -0.01240584, -0.01496465],
       [ 0.14518692, -0.1116128 , -0.05783497, ..., -0.00977505,
        -0.0348038 , -0.01886068]])

### SIMLR

In [34]:
print('Start to Run SIMLR!\n')
start_main = time.time()
simlr = SIMLR.SIMLR_LARGE(num_of_rank=c, num_of_neighbor=100, max_iter=15); ###This is how we initialize an object for SIMLR. the first input is number of rank (clusters) and the second input is number of neighbors. The third one is an binary indicator whether to use memory-saving mode. you can turn it on when the number of cells are extremely large to save some memory but with the cost of efficiency.
S, F,val, ind = simlr.fit(X)
print('Successfully Run SIMLR! SIMLR took %f seconds in total\n' % (time.time() - start_main))

Start to Run SIMLR!

SIMLR highly recommends you to perform PCA first on the data

Please use the in-line function fast_pca on your input

#######OS PROCESS ID#####
8805
Successfully Run SIMLR! SIMLR took 11.905928 seconds in total



In [35]:
F.shape

(2216, 10)

In [36]:
y_pred = simlr.fast_minibatch_kmeans(F,c)

In [37]:
from sklearn.cluster import KMeans

In [38]:
y_pred = KMeans(c).fit_predict(F)

In [39]:
k_range = range(5,15)

In [43]:
xx = KMeans(c).fit_predict(F)

In [44]:
from sklearn.metrics.cluster import silhouette_score as ss

In [45]:
type((1,2,3,4,5))

tuple

In [48]:
len(cell_groups)

2216

In [57]:
iterable = (x*x for x in range(5))
np.fromiter(iterable, np.float)

array([ 0.,  1.,  4.,  9., 16.])

In [49]:
F.shape

(2216, 10)

In [101]:
KMeans(2).fit_predict(F)

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [100]:
np.fromiter(kg,dtype=int)

ValueError: setting an array element with a sequence.

In [73]:
ss

<function sklearn.metrics.cluster.unsupervised.silhouette_score>

In [94]:
ss(X=F, labels=KMeans(3).fit_predict(F))

0.4121366120005745

In [105]:
gg = (ss(X=F, labels=KMeans(c).fit_predict(F)) for c in [8,9])

In [106]:
np.fromiter(gg, dtype=float)

array([0.86931424, 0.97121241])

In [83]:
res = np.fromiter((ss(X=F, labels=KMeans(c).fit_predict(F)) for c in [5,6]), dtype=float, count=2)

In [104]:
np.fromiter((KMeans(c).fit_predict(F) for c in [5,6]), dtype='int32', count=2)

ValueError: setting an array element with a sequence.

In [84]:
res

array([0.60937375, 0.68967539])

In [None]:
y_pred

In [None]:
type(y_pred)

In [None]:
print('NMI value is %f \n' % nmi(y_pred.flatten(),cell_groups))
print('ARI value is %f \n' % ari(y_pred.flatten(),cell_groups))

NMI value is 0.947123 

ARI value is 0.971228 