In [373]:
import numpy as np
import sys
import pandas as pd
import scanpy as sc
import math
from GenSA import *
import scipy
from scipy.optimize import dual_annealing
from annoy import AnnoyIndex
import random
from igraph import *
from sklearn.preprocessing import normalize

### Required Functions

In [374]:
def annPartition(data):
    f = data.shape[0]
    print("datashpae",data.shape)
    print("Building graphs with", data.shape[1], "nodes...")
    t = AnnoyIndex(f, 'angular')
    for i in range(data.shape[1]):
        v = data[:,i]
        t.add_item(i, v)
    
    t.build(30)
    
    get_nn = lambda x: t.get_nns_by_item(x, 6)

    indices = list(map(get_nn, np.arange(data.shape[1])));
    
    indices = np.array(indices)
#     print(indices)
    
    fin = []
    for i in range(indices.shape[0]):
        for j in indices[i]:
            fin.append((i,j))
    fin = np.array(fin)
    g = Graph(fin)
    G = g.simplify(multiple=False, loops = False)
    print("Louvain Partition...")
    partition = G.community_leiden(objective_function = "modularity");
#     print(partition.membership)
    
    dataMatrix = np.c_[np.arange(data.shape[1]),np.array(partition.membership)]
    print("Done...")

    return dataMatrix

### Script Begins here

In [375]:
adata = sc.read_10x_mtx('hg19/', var_names='gene_symbols', cache=True)   
# print(adata)

In [376]:
ob = adata.X
ob = scipy.sparse.csr_matrix.toarray(ob)
print(ob.shape)


(2700, 32738)


In [377]:
# (adata)
nsamples=500
method = "sps"
optm_parameters=True
pinit=0.195
pfin = 0.9
K=500

In [378]:
# if(nsamples>adata.shape[1]):
#     return adata

# if(method not in ["random", "sps"]):
#     print("Method not found")
#     exit()

no_samples = ob.shape[1]
init = no_samples if no_samples < 20000 else min(20000,round(no_samples/3))
print(init)

# random sample of ids from sample = 0 to no_samples - 1 of size init
sample_ids = np.random.choice(list(range(0, no_samples,1)), init) 
print(sample_ids.shape)
print(sample_ids)

10913
(10913,)
[12858 20362 21012 ... 14977 15212 31759]


In [388]:
# if(method=="sps"):
"""
if(!any(reducedDimNames(object)=="CComponents"))
    data = Log2Normalize(normcounts(object)[SingleCellExperiment::rowData(object)$HVG, sample_ids],return.sparse = FALSE)
else
    data = as.matrix(normcounts(object)[, sample_ids])
"""

data = normalize(ob)
data = np.take(ob, sample_ids, axis = 1)
print(data.shape)

# return numpy array
partition = annPartition(data)

(2700, 10913)
datashpae (2700, 10913)
Building graphs with 10913 nodes...
Louvain Partition...
Done...


In [423]:
def optimized_param(partition, nsamples = 500):
    if(nsamples> partition.shape[0]):
        nsamples = partiton.shape[0]
    
    global_min = 0
    tol = 1e-3
    max_time = 20
    lower = np.array([0.05, 0.9, 500])
    upper = np.array([0.1, 0.95, 4000])
    
    params = None
    
    def pin_find(params, max_c = nsamples):
        output = np.array(partition)
        #print(output.shape)
        pinit = params[0]
        pfin = params[1]
        K = params[2]
        
        unique_elements, counts_elements = np.unique(output[:,1], return_counts=True)
        cluster_freq = np.asarray((counts_elements), dtype = int)
        #print(cluster_freq)
        prop = np.round((pinit - np.exp(-cluster_freq/K) * (pinit - pfin) )* cluster_freq)
        #print(cluster_freq.shape, prop.shape)
        cluster_freq = np.vstack((cluster_freq,prop)).T
        #print(cluster_freq.shape)
                
        subsamples_lovain = np.empty((0))
        
        for i in range(len(prop)):
            subsamples_lovain = np.concatenate((subsamples_lovain, np.random.choice( output[output[:,1]==i,0], size = int(prop[i]), replace = False)), axis = None)

        #print((subsamples_lovain))
        return np.abs(max_c - subsamples_lovain.shape[0]) 

        # FIX ME: numpy.reshape() of some kind to be done here
        # prop = reshape2::melt(prop)$value
    
    #out = gensa(func = pin_find, x0 = params, bounds = list(zip(lower, upper)))
    out = dual_annealing(func = pin_find, x0 = params, bounds = list(zip(lower, upper)))
    
    print(out)    
    # returning best set of parameters
    return out.x

        
    

In [424]:


# ---- How to convert partition-----
# data = pd.Series([1, 1, 1, 2, 3, 3, 3, 3, 4, 4, 5])
# data.value_counts()

if(optm_parameters==True):
    param = optimized_param(partition, nsamples)
    pinit = param[0]
    pfin = param[1]
    K = param[2]
    print("Optimized parameters:\n", param,"\n")

# print(partition.shape)
# unique_elements, counts_elements = np.unique(partition[:,1], return_counts=True)
# cluster_freq = np.asarray((counts_elements), dtype = int)
# #print(cluster_freq)
# prop = np.round((pinit - np.exp(-cluster_freq/K) * (pinit - pfin) )* cluster_freq)
# #print(cluster_freq.shape, prop.shape)
# cluster_freq = np.vstack((cluster_freq,prop)).T
# #print(cluster_freq.shape)

# subsamples = np.empty((0))
        
# for i in range(len(prop)):
#     subsamples = np.concatenate((subsamples, np.random.choice( partition[partition[:,1]==i,0], size = int(prop[i]), replace = False)), axis = None)


# print(len(subsamples), "Samples extracted.\n")

"""

#old seed

# frequeny table of partition[:][1]
cluster_freq = np.array(np.unique(partition[:][1], return_counts=True).T)
prop = np.round((pinit - np.exp(-cluster_freq/K) * (pinit - pfin) )* cluster_freq)
cluster_freq = np.vstack(cluster_freq, prop).T

# FIX ME: numpy.reshape() of some kind to be done here
# prop = reshape2::melt(prop)$value

subsample = []

for i in range(len(prop)):
    subsample = subsample + np.random.choice(partition[np.nonzero(partition[:][1]==i)], size = prop[i], replace = False)

subsample = np.asarray(subsample)

"""

#     .Random.seed = oldseed
#     SummarizedExperiment::colData(object)$Sampling = rep(FALSE, ncol(object))
#     SummarizedExperiment::colData(object)$Sampling[sample_ids[subsamples]] =  TRUE




     fun: 891
 message: ['Maximum number of iteration reached']
    nfev: 6073
    nhev: 0
     nit: 1000
    njev: 0
  status: 0
 success: True
       x: array([5.01309616e-02, 9.00275704e-01, 5.00115805e+02])
Optimized parameters:
 [5.01309616e-02 9.00275704e-01 5.00115805e+02] 



'\n\n#old seed\n\n# frequeny table of partition[:][1]\ncluster_freq = np.array(np.unique(partition[:][1], return_counts=True).T)\nprop = np.round((pinit - np.exp(-cluster_freq/K) * (pinit - pfin) )* cluster_freq)\ncluster_freq = np.vstack(cluster_freq, prop).T\n\n# FIX ME: numpy.reshape() of some kind to be done here\n# prop = reshape2::melt(prop)$value\n\nsubsample = []\n\nfor i in range(len(prop)):\n    subsample = subsample + np.random.choice(partition[np.nonzero(partition[:][1]==i)], size = prop[i], replace = False)\n\nsubsample = np.asarray(subsample)\n\n'

In [452]:
unique_elements, counts_elements = np.unique(partition[:,1], return_counts=True)
cluster_freq = np.asarray((counts_elements), dtype = int)
print(cluster_freq.shape)
prop = np.round((pinit - np.exp(-cluster_freq/K) * (pinit - pfin) )* cluster_freq)

# #print(cluster_freq.shape, prop.shape)
cluster_freq = np.vstack((cluster_freq,prop)).T
# #print(cluster_freq.shape)

subsamples = np.empty((0))
        
for i in range(len(prop)):
    subsamples = np.concatenate((subsamples, np.random.choice(partition[partition[:,1]==i,0], size = int(prop[i]), replace = False)), axis = None)

subsamples = np.asarray(subsamples, dtype = int)

print(len(subsamples), "Samples extracted.\n")


(12,)
1391 Samples extracted.

10901


In [453]:
type(subsamples)

numpy.ndarray

a
b
