In [2]:
import pickle
import pandas as pd
import numpy as np
from scipy import sparse

from tools import remove_rare_genes, rnaseqTools

# Macosko 2015: All cell types

In [3]:
# Input files for the Macosko 2015 dataset: raw count table and cluster labels
clusterfile = 'datasets/retina/macosko_all/retina_clusteridentities.txt'
countfile = 'datasets/retina/macosko_all/GSE63472_P14Retina_merged_digital_expression.txt'

# The cluster table is read without a header row, so its columns are addressed by position
cluster_assignments = pd.read_csv(clusterfile, delimiter='\t', header=None)

# Load the tab-separated count table together with its gene and cell labels
counts, genes, cells = rnaseqTools.sparseload(countfile, sep='\t')

# Unify gene-symbol casing: first character upper-case, remaining characters lower-case
genes = np.array([symbol.capitalize() for symbol in genes])

......................... done


In [4]:
# Column 0 of the cluster table holds the cell name, column 1 its cluster number
cells_clustered = cluster_assignments[0].values
cell2cluster = dict(zip(cells_clustered, cluster_assignments[1].values))

# Discard every cell that has no cluster assignment
cells_clustered_idx = np.isin(cells, cells_clustered)
counts = counts[cells_clustered_idx, :]
cells = cells[cells_clustered_idx]

# Discard genes that are detected in fewer than 5 cells
counts, genes = remove_rare_genes(counts, genes, 5)

# Look up the cluster of each remaining cell and subtract one
# (presumably the file numbers clusters from 1, which makes the ids 0-based)
clusters = np.array([cell2cluster[cell_name] for cell_name in cells]) - 1

#setup class names; keys are the numeric cluster ids after the `- 1` shift
# The two ranges are disjoint from the single-cluster entries in `rest`
# (23 = Rods, 33 = Mueller glia), so the merged mapping does not depend on
# the order in which the dictionaries are combined.
amacrines = {i : 'Amacrine cells' for i in range(2,22+1)}
bipolars = {i : 'Bipolar cells' for i in range(25,32+1)}
rest = {0 : 'Horizontal cells',
        1 : 'Ganglion cells',
        23: 'Rods',
        24: 'Cones',
        33: 'Mueller glia',
        34: 'Astrocytes',
        35: 'Fibroblasts',
        36: 'Vascular endothelium',
        37: 'Pericytes',
        38: 'Microglia'}
classmapping = {**bipolars, **amacrines, **rest}

# every cluster id 0..38 must have exactly one class
assert sorted(classmapping) == list(range(39))
class_names = np.array([classmapping[i] for i in range(39)])

#setup unique cluster names
# Classes that span several clusters get the cluster id appended to stay unique.
# Building the array from a list lets numpy pick the string width, so a long
# label can never be silently truncated by a fixed-width placeholder.
multi_cluster_classes = ('Amacrine cells', 'Bipolar cells')
cluster_names = np.array([
    '%s (%u)' % (classmapping[i], i) if classmapping[i] in multi_cluster_classes
    else classmapping[i]
    for i in range(39)])

# The replicate label is the part of each cell name before the first underscore;
# `replicates` holds, per cell, the index of its label in the sorted `replicate_names`
replicate_names, replicates = np.unique(
    [cell_name.partition('_')[0] for cell_name in cells],
    return_inverse=True)

# The hard-coded lookup tables below are only valid for exactly this replicate ordering
assert np.all(replicate_names == np.array(['p1','r1','r2','r3','r4','r5','r6']))

# One entry per replicate, in the order of `replicate_names`
# (see suppl. text of macosko paper)
macosco_replicate_ids = np.array([7, 1, 2, 3, 4, 5, 6])
macosco_batch_ids = np.array([1, 0, 0, 0, 1, 1, 1])

# Expand the per-replicate batch id to one batch id per cell
batches = macosco_batch_ids[replicates]

Of 23743 total genes, returning 19285 genes that are detected in 5 or more cells.
Output shape: (44808, 19285)


In [5]:
# Collect the preprocessed Macosko data and its provenance in a single dictionary
dataset = {
    'counts': counts,
    'genes': genes,
    'clusters': clusters,
    'cluster_names': cluster_names,
    'class_names': class_names,
    'batches': batches,
    'replicates': replicates,
    'replicate_names': replicate_names,
    'replicate_ids': macosco_replicate_ids,
    'batch_ids_per_replicate': macosco_batch_ids,
    'clusterfile': clusterfile,
    'countfile': countfile,
    'label': 'Macosko2015',
}

In [6]:
# Show the type of every dataset entry. A plain loop is used instead of a list
# comprehension so that no throw-away list of None values is built and displayed.
for key, value in dataset.items():
    print(key, type(value))

counts <class 'scipy.sparse.csc.csc_matrix'>
genes <class 'numpy.ndarray'>
clusters <class 'numpy.ndarray'>
cluster_names <class 'numpy.ndarray'>
class_names <class 'numpy.ndarray'>
batches <class 'numpy.ndarray'>
replicates <class 'numpy.ndarray'>
replicate_names <class 'numpy.ndarray'>
replicate_ids <class 'numpy.ndarray'>
batch_ids_per_replicate <class 'numpy.ndarray'>
clusterfile <class 'str'>
countfile <class 'str'>
label <class 'str'>


[None, None, None, None, None, None, None, None, None, None, None, None, None]

In [7]:
# Quick visual check of the assembled dictionary (plain-text representation)
print(str(dataset))

{'counts': <44808x19285 sparse matrix of type '<class 'numpy.float64'>'
	with 32796924 stored elements in Compressed Sparse Column format>, 'genes': array(['Kitl', 'Tmtc3', 'Cep290', ..., 'Gm22701', 'Gm21464', 'Bc021614'],
      dtype='<U14'), 'clusters': array([ 1,  1,  1, ...,  1, 23, 23]), 'cluster_names': array(['Horizontal cells', 'Ganglion cells', 'Amacrine cells (2)',
       'Amacrine cells (3)', 'Amacrine cells (4)', 'Amacrine cells (5)',
       'Amacrine cells (6)', 'Amacrine cells (7)', 'Amacrine cells (8)',
       'Amacrine cells (9)', 'Amacrine cells (10)', 'Amacrine cells (11)',
       'Amacrine cells (12)', 'Amacrine cells (13)',
       'Amacrine cells (14)', 'Amacrine cells (15)',
       'Amacrine cells (16)', 'Amacrine cells (17)',
       'Amacrine cells (18)', 'Amacrine cells (19)',
       'Amacrine cells (20)', 'Amacrine cells (21)',
       'Amacrine cells (22)', 'Rods', 'Cones', 'Bipolar cells (25)',
       'Bipolar cells (26)', 'Bipolar cells (27)', 'Bipolar cells (

In [8]:
# Write the dataset next to the raw count file, using pickle protocol 4
pickle_path = countfile + '_preprocessed.pickle'
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(dataset, pickle_file, protocol=4)

# Shekhar 2016: Bipolar cells

In [9]:
# Input files for the Shekhar 2016 dataset: gzipped UMI count table and cluster labels
clusterfile = 'datasets/retina/shekhar_bipolar/clust_retinal_bipolar.txt'
countfile = 'datasets/retina/shekhar_bipolar/GSE81904_BipolarUMICounts_Cell2016.txt.gz'

# The cluster table's first row is used as the header (columns are addressed by name)
cluster_assignments = pd.read_csv(clusterfile, delimiter='\t')

# Load the tab-separated count table together with its gene and cell labels
counts, genes, cells = rnaseqTools.sparseload(countfile, sep='\t')

# Unify gene-symbol casing: first character upper-case, remaining characters lower-case
genes = np.array([symbol.capitalize() for symbol in genes])

# Map each annotated cell name to its cluster label
cells_clustered = cluster_assignments['NAME']
cell2cluster = dict(zip(cells_clustered, cluster_assignments['CLUSTER']))

# Discard every cell that has no cluster assignment
cells_clustered_idx = np.isin(cells, cells_clustered)
counts = counts[cells_clustered_idx, :]
cells = cells[cells_clustered_idx]

# Discard genes that are detected in fewer than 5 cells
counts, genes = remove_rare_genes(counts, genes, 5)

# Cluster label of each remaining cell, in the row order of `counts`
clusters = np.array([cell2cluster[cell_name] for cell_name in cells])

# Turn the string labels into integer codes; cluster_names[k] is the label of code k
cluster_names, clusters = np.unique(clusters, return_inverse=True)

# Replace the verbose labels from the cluster file by short names
short_names = {
    'BC5A (Cone Bipolar cell 5A)': 'BC5A',
    'BC7 (Cone Bipolar cell 7)': 'BC7',
    'BC8/9 (mixture of BC8 and BC9)': 'BC8/9',
    'AC (Amacrine cell)': 'Amacrine cells',
    'Cone Photoreceptors': 'Cones',
    'Rod Photoreceptors': 'Rods',
    'MG (Mueller Glia)': 'Mueller Glia',
    'RBC (Rod Bipolar cell)': 'RBC',
}
for verbose_name, short_name in short_names.items():
    cluster_names[cluster_names == verbose_name] = short_name

......................... done
Of 23831 total genes, returning 18396 genes that are detected in 5 or more cells.
Output shape: (27499, 18396)


In [10]:
# The replicate label is the part of each cell name before the first underscore;
# `replicates` holds, per cell, the index of its label in the sorted `replicate_names`
replicate_names, replicates = np.unique(
    [cell_name.partition('_')[0] for cell_name in cells],
    return_inverse=True)

# The hard-coded lookup tables below are only valid for exactly this replicate ordering
assert np.all(replicate_names == np.array(['Bipolar1','Bipolar2','Bipolar3','Bipolar4','Bipolar5','Bipolar6']))

# One entry per replicate, in the order of `replicate_names`
shekhar_replicate_ids = np.array([1, 2, 3, 4, 1, 2])
shekhar_batch_ids = np.array([1, 1, 1, 1, 2, 2])

# Expand the per-replicate batch id to one batch id per cell
batches = shekhar_batch_ids[replicates]

In [11]:
# Collect the preprocessed Shekhar data and its provenance in a single dictionary
dataset = {
    'counts': counts,
    'genes': genes,
    'clusters': clusters,
    'cluster_names': cluster_names,
    'batches': batches,
    'replicates': replicates,
    'replicate_names': replicate_names,
    'replicate_ids': shekhar_replicate_ids,
    'batch_ids_per_replicate': shekhar_batch_ids,
    'countfile': countfile,
    'clusterfile': clusterfile,
    'label': 'Shekhar 2016, clustered cells only',
}

In [12]:
# Show the type of every dataset entry. A plain loop is used instead of a list
# comprehension so that no throw-away list of None values is built and displayed.
for key, value in dataset.items():
    print(key, type(value))

counts <class 'scipy.sparse.csc.csc_matrix'>
genes <class 'numpy.ndarray'>
clusters <class 'numpy.ndarray'>
cluster_names <class 'numpy.ndarray'>
batches <class 'numpy.ndarray'>
replicates <class 'numpy.ndarray'>
replicate_names <class 'numpy.ndarray'>
replicate_ids <class 'numpy.ndarray'>
batch_ids_per_replicate <class 'numpy.ndarray'>
countfile <class 'str'>
clusterfile <class 'str'>
label <class 'str'>


[None, None, None, None, None, None, None, None, None, None, None, None]

In [13]:
# Quick visual check of the assembled dictionary (plain-text representation)
print(str(dataset))

{'counts': <27499x18396 sparse matrix of type '<class 'numpy.float64'>'
	with 24258648 stored elements in Compressed Sparse Column format>, 'genes': array(['0610005c13rik', '0610007p14rik', '0610009b22rik', ..., 'N-r5s29',
       'N-r5s40', 'N-r5s8'], dtype='<U14'), 'clusters': array([10, 15, 11, ..., 12,  3,  7]), 'cluster_names': array(['Amacrine cells', 'BC1A', 'BC1B', 'BC2', 'BC3A', 'BC3B', 'BC4',
       'BC5A', 'BC5B', 'BC5C', 'BC5D', 'BC6', 'BC7', 'BC8/9', 'Cones',
       'Doublets/Contaminants', 'Mueller Glia', 'RBC', 'Rods'],
      dtype='<U30'), 'batches': array([1, 1, 1, ..., 2, 2, 2]), 'replicates': array([0, 0, 0, ..., 5, 5, 5]), 'replicate_names': array(['Bipolar1', 'Bipolar2', 'Bipolar3', 'Bipolar4', 'Bipolar5',
       'Bipolar6'], dtype='<U8'), 'replicate_ids': array([1, 2, 3, 4, 1, 2]), 'batch_ids_per_replicate': array([1, 1, 1, 1, 2, 2]), 'countfile': 'datasets/retina/shekhar_bipolar/GSE81904_BipolarUMICounts_Cell2016.txt.gz', 'clusterfile': 'datasets/retina/shekhar_bi

In [14]:
# Write the dataset next to the raw count file, using pickle protocol 4
pickle_path = countfile + '_preprocessed.pickle'
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(dataset, pickle_file, protocol=4)

# Tran 2019: Ganglion cells

In [3]:
# Input files for the Tran 2019 dataset
# raw count matrix (this is the matrix that ends up in the saved dataset)
countfile = 'datasets/retina/tran_ganglion/GSE133382_AtlasRGCs_CountMatrix.csv'
# normalized count matrix, read only to obtain its gene list
countfile_normalized = 'datasets/retina/tran_ganglion/RGC_Atlas.csv'
# per-cell annotation table (cluster and batch labels)
clusterfile = 'datasets/retina/tran_ganglion/RGC_Atlas_coordinates.txt'

In [4]:
# The first two rows of the tab-separated annotation file form a two-level column
# header, so columns are addressed by (level 0, level 1) tuples
annotations = pd.read_csv(clusterfile, delimiter='\t', header=[0, 1])

In [5]:
# Load the comma-separated raw count matrix together with its gene and cell labels
counts, genes, cells = rnaseqTools.sparseload(countfile, sep=',')

............................ done


In [6]:
# Load the normalized count matrix from the Broad Institute download. It is read
# only for its gene list (`genes_norm`), which gives the gene selection of Tran 2019.
counts_norm, genes_norm, cells_norm = rnaseqTools.sparseload(countfile_normalized, sep=',')

................... done


In [7]:
# Cell names, cluster labels and batch labels from the annotation table
cells_in_clustering = list(annotations[('NAME','TYPE')])
cell2cluster = dict(zip(cells_in_clustering, annotations[('Cluster','group')]))
cell2batch = dict(zip(cells_in_clustering, annotations[('BatchID','group')]))

# Look the labels up per cell, in the row order of the count matrix and only for
# the cells that pass the `np.isin(cells, cells_in_clustering)` filter that is
# later applied to `counts`. Taking the labels in annotation-file order would
# silently mislabel cells whenever the two files list cells in a different order
# or the annotation table contains cells missing from the count matrix.
cells_kept = cells[np.isin(cells, cells_in_clustering)]

# Integer codes per cell; cluster_names[k] / batch_names[k] is the label of code k
cluster_names, clusters = np.unique([cell2cluster[cell_name] for cell_name in cells_kept],
                                    return_inverse=True)
batch_names, batches = np.unique([cell2batch[cell_name] for cell_name in cells_kept],
                                 return_inverse=True)

In [9]:
# Boolean masks over the raw count matrix:
# genes that also appear in the normalized matrix ...
gene_isin_normdata = np.isin(genes, genes_norm)
# ... and cells that appear in the annotation table
cell_isin_clustering = np.isin(cells, cells_in_clustering)

In [10]:
# Keep annotated cells (rows) first, then the selected genes (columns)
counts = counts[cell_isin_clustering, :][:, gene_isin_normdata]

In [13]:
# Apply the same gene mask to the gene labels that was applied to the columns of
# `counts`, then discard genes that are detected in fewer than 5 cells
genes_selected = genes[gene_isin_normdata]
counts, genes = remove_rare_genes(counts, genes_selected, 5)

Of 18221 total genes, returning 18154 genes that are detected in 5 or more cells.
Output shape: (35699, 18154)


In [14]:
%%time
# Collect the preprocessed Tran data and its provenance in a single dictionary
dataset = {
    'counts': counts,
    'genes': genes,
    'clusters': clusters,
    'cluster_names': cluster_names,
    'batches': batches,
    'batch_names': batch_names,
    'countfile': countfile,
    'clusterfile': clusterfile,
    'label': 'Tran2019 - clustered cells and selected genes only',
}

CPU times: user 9 µs, sys: 1e+03 ns, total: 10 µs
Wall time: 16.7 µs


In [15]:
# Show the type of every dataset entry. A plain loop is used instead of a list
# comprehension so that no throw-away list of None values is built and displayed.
for key, value in dataset.items():
    print(key, type(value))

counts <class 'scipy.sparse.csc.csc_matrix'>
genes <class 'numpy.ndarray'>
clusters <class 'numpy.ndarray'>
cluster_names <class 'numpy.ndarray'>
batches <class 'numpy.ndarray'>
batch_names <class 'numpy.ndarray'>
countfile <class 'str'>
clusterfile <class 'str'>
label <class 'str'>


[None, None, None, None, None, None, None, None, None]

In [16]:
# Quick visual check of the assembled dictionary (plain-text representation)
print(str(dataset))

{'counts': <35699x18154 sparse matrix of type '<class 'numpy.float64'>'
	with 132584766 stored elements in Compressed Sparse Column format>, 'genes': array(['Xkr4', 'Mrpl15', 'Lypla1', ..., 'U2af1l4', 'Umad1', 'Zfand4'],
      dtype='<U14'), 'clusters': array([42, 33, 33, ..., 43, 32, 39]), 'cluster_names': array(['10_Novel', '11_Novel', '12_ooDS_NT', '13_Novel', '14_ooDS_Cck',
       '15_Novel', '16_ooDS_DV', '17_Tbr1_S1', '18_Novel', '19_Novel',
       '1_W3D1.1', '20_Novel', '21_Tbr1_S2', '22_M5', '23_W3D2',
       '24_Novel', '25_Novel', '26_Novel', '27_Novel', '28_FmidiOFF',
       '29_Novel', '2_W3D1.2', '30_Novel', '31_M2', '32_F_Novel', '33_M1',
       '34_Novel', '35_Novel', '36_Novel', '37_Novel', '38_FmidiON',
       '39_Novel', '3_FminiON', '40_M1dup', '41_AlphaONT', '42_AlphaOFFS',
       '43_AlphaONS', '44_Novel', '45_AlphaOFFT', '4_FminiOFF', '5_J-RGC',
       '6_W3B', '7_Novel', '8_Novel', '9_Tbr1_Novel'], dtype='<U12'), 'batches': array([0, 0, 0, ..., 2, 2, 2]), 'batch

In [17]:
# Write the dataset next to the raw count file, using pickle protocol 4
pickle_path = countfile + '_preprocessed.pickle'
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(dataset, pickle_file, protocol=4)

In [3]:
# Load the watermark IPython extension, which provides the `watermark` magic
%load_ext watermark

In [4]:
# Record timestamp, interpreter and machine information for this run.
# The explicit `%` prefix avoids depending on IPython automagic, which only
# resolves bare magic names in single-line cells and when no variable shadows them.
%watermark

Last updated: 2021-05-21T18:37:17.053295+02:00

Python implementation: CPython
Python version       : 3.8.0
IPython version      : 7.21.0

Compiler    : GCC 8.3.0
OS          : Linux
Release     : 3.10.0-957.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 40
Architecture: 64bit



In [7]:
# Record the versions of the imported packages.
# The explicit `%` prefix avoids depending on IPython automagic, which only
# resolves bare magic names in single-line cells and when no variable shadows them.
%watermark --iversions

pandas: 1.2.0
numpy : 1.20.1
scipy : 1.6.0

