In [1]:
# Widen the notebook container so wide outputs are easier to read.
# display/HTML come from the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

#import dmitrys loading code
import sys; sys.path.append('../../libs/rna-seq-tsne/')
import rnaseqTools

import pickle
import os.path
import pandas as pd
import numpy as np
import warnings
import anndata
from scipy.io import mmread
from scipy import sparse

from tools import remove_rare_genes

  from pandas.core.index import RangeIndex


# PBMC data

In [2]:
def load_PMBC_dataset(folder,genename_file='genes.tsv'):
    """Load a Matrix Market count matrix and its gene names from ``folder``.

    Parameters
    ----------
    folder : str
        Location of ``matrix.mtx`` and the gene-name file. If it is an existing
        directory the trailing path separator is optional; otherwise it is
        used as a plain string prefix.
    genename_file : str
        Name of the tab-separated, header-less gene file; its second column
        is returned as the gene names.

    Returns
    -------
    counts : scipy sparse matrix
        Transpose of the matrix stored in ``matrix.mtx``.
    genes : numpy.ndarray of str
        One-dimensional array taken from the second column of the gene file.
    """
    def _locate(filename):
        # Join properly when `folder` is a real directory (with or without a
        # trailing separator); fall back to plain prefixing otherwise so that
        # existing prefix-style callers keep working.
        if os.path.isdir(folder):
            return os.path.join(folder, filename)
        return folder + filename

    counts = mmread(_locate('matrix.mtx'))
    counts = sparse.csr_matrix(counts).T
    gene_table = pd.read_csv(_locate(genename_file), header=None, sep='\t')
    # The column slice is already 1-D. Not squeezing keeps shape (1,) for a
    # single-gene file instead of collapsing it to a 0-d array.
    genes = gene_table.values[:,1].astype(str)
    return counts,genes

In [3]:
%%time
# Load the 33k PBMC counts, drop rarely detected genes, and pickle the result.
folder = 'datasets/33k_pbmc/'
label = '33k PBMC (v1)'
counts,genes=load_PMBC_dataset(folder)
counts,genes=remove_rare_genes(counts,genes,minimum_detected_cells_per_gene=5)

# Store the filtered counts in CSR format
counts = sparse.csr_matrix(counts)

dataset = dict(counts=counts,genes=genes,label=label,folder=folder)
print(dataset)
# The context manager guarantees the pickle file is flushed and closed.
with open(folder+'preprocessed.pickle', "wb") as pickle_file:
    pickle.dump(dataset, pickle_file, protocol=4)

Of 20678 total genes, returning 16809 genes that are detected in 5 or more cells.
Output shape: (33148, 16809)
{'counts': <33148x16809 sparse matrix of type '<class 'numpy.float64'>'
	with 25374746 stored elements in Compressed Sparse Row format>, 'genes': array(['AL627309.1', 'AP006222.2', 'RP11-206L10.2', ..., 'KIR2DL2',
       'PNRC2', 'SRSF10'], dtype='<U19'), 'label': '33k PBMC (v1)', 'folder': 'datasets/33k_pbmc/'}
CPU times: user 45.2 s, sys: 847 ms, total: 46.1 s
Wall time: 46.3 s


# Negative control data
### 10X v2 - Svensson 2017 (technical control)

In [4]:
%%time
# Split the Svensson 2017 Chromium control data by sample and pickle each part.
folder = 'datasets/10x/'
file = folder + 'svensson_chromium_control.h5ad'

# read_h5ad is the explicit reader for .h5ad files
raw_input = anndata.read_h5ad(file)

for i,sample_id in enumerate([20311, 20312]):
    # The query compares obs['sample'] against the id as a quoted string
    sample_input = raw_input[raw_input.obs.query('sample == "%u"' % (sample_id)).index]
    label = '2k Chromium control Svensson 2017 (10x v2) - sample %u' % (i+1)

    counts_pd = sample_input.to_df()
    genes = np.array(counts_pd.columns)
    counts = counts_pd.to_numpy()
    counts,genes=remove_rare_genes(counts,genes,minimum_detected_cells_per_gene=5)

    counts = sparse.csr_matrix(counts)

    dataset = dict(counts=counts,genes=genes,label=label,folder=folder,protocol='10X')
    print(dataset)
    # Close each output file before moving on to the next sample.
    with open(folder+'sample%u_preprocessed.pickle' % (i+1), "wb") as pickle_file:
        pickle.dump(dataset, pickle_file, protocol=4)

Of 20647 total genes, returning 13025 genes that are detected in 5 or more cells.
Output shape: (2000, 13025)
{'counts': <2000x13025 sparse matrix of type '<class 'numpy.float32'>'
	with 1239772 stored elements in Compressed Sparse Row format>, 'genes': array(['ENSG00000000003', 'ENSG00000000419', 'ENSG00000000457', ...,
       'LRG_759', 'LRG_788', 'LRG_92'], dtype=object), 'label': '2k Chromium control Svensson 2017 (10x v2) - sample 1', 'folder': 'datasets/10x/', 'protocol': '10X'}
Of 21411 total genes, returning 13239 genes that are detected in 5 or more cells.
Output shape: (2000, 13239)
{'counts': <2000x13239 sparse matrix of type '<class 'numpy.float32'>'
	with 1135300 stored elements in Compressed Sparse Row format>, 'genes': array(['ENSG00000000003', 'ENSG00000000419', 'ENSG00000000971', ...,
       'LRG_81', 'LRG_89', 'LRG_92'], dtype=object), 'label': '2k Chromium control Svensson 2017 (10x v2) - sample 2', 'folder': 'datasets/10x/', 'protocol': '10X'}
CPU times: user 1.63 s

# inDrop

### Klein 2015 (one technical and various biological controls)

In [5]:
# Klein 2015 inDrop inputs: three parallel lists with one entry per dataset.
paths = [
    'datasets/indrop/GSM1599501_K562_pure_RNA.csv',  # count table to read
]
ns = [
    '1k',  # size tag that goes into the dataset label
]
labelsshort = [
    'K562_pure_RNA',  # short sample name that goes into the dataset label
]

In [6]:
def pickle_klein_data(path,labelshort,n):
    """Preprocess one Klein 2015 inDrop count table and pickle it next to the CSV.

    Parameters
    ----------
    path : str
        CSV file whose first column holds the row labels (used as gene names).
        The pickle is written to ``'<path>_preprocessed.pickle'``.
    labelshort : str
        Short sample name inserted into the dataset label.
    n : str
        Size tag (e.g. ``'1k'``) inserted into the dataset label.

    Returns
    -------
    None
        The dataset dict (counts, genes, folder, label, protocol) is printed
        and written to disk.
    """
    label = '%s %s Klein 2015 (inDrop)' % (n,labelshort)
    df = pd.read_csv(path,index_col=0)
    # NOTE(review): genes is kept as a pandas Index here, whereas other cells
    # in this notebook store plain numpy arrays. Confirm downstream loaders
    # accept both.
    genes = df.index
    # Transpose so that CSV rows (genes) become matrix columns
    counts = df.values.T
    counts,genes = remove_rare_genes(counts,genes,minimum_detected_cells_per_gene=5)
    counts = sparse.csr_matrix(counts)
    dataset = dict(counts=counts,genes=genes,folder=path,label=label,protocol='inDrop')
    picklefile = '%s_preprocessed.pickle' % (path)
    print(dataset)
    # The context manager closes the file handle even if pickling fails.
    with open(picklefile, "wb") as pickle_file:
        pickle.dump(dataset, pickle_file, protocol=4)

In [7]:
%%time
## Preprocess and pickle every Klein dataset listed in the parallel config lists
for p, n, l in zip(paths, ns, labelsshort):
    # keyword arguments make the (path, label, size-tag) mapping explicit
    pickle_klein_data(path=p, labelshort=l, n=n)

Of 25266 total genes, returning 25025 genes that are detected in 5 or more cells.
Output shape: (953, 25025)
{'counts': <953x25025 sparse matrix of type '<class 'numpy.longlong'>'
	with 9261429 stored elements in Compressed Sparse Row format>, 'genes': Index(['A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2M-AS1', 'A2ML1', 'A2MP1',
       'A3GALT2', 'A4GALT', 'A4GNT',
       ...
       'ZWILCH', 'ZWINT', 'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B', 'ZYX',
       'ZZEF1', 'ZZZ3'],
      dtype='object', length=25025), 'folder': 'datasets/indrop/GSM1599501_K562_pure_RNA.csv', 'label': '1k K562_pure_RNA Klein 2015 (inDrop)', 'protocol': 'inDrop'}
CPU times: user 2.98 s, sys: 200 ms, total: 3.18 s
Wall time: 3.38 s


# MicrowellSeq

### Han 2018 (biological controls)

In [8]:
%%time
## Preprocess the Han 2018 embryonic stem cell file(s)
files_raw = ['datasets/microwellseq/GSM2906413_EmbryonicStemCell_dge.txt.gz']
separators = [' ']
# Cells are kept only if their total count is strictly above this threshold
minimum_depth = 500

### iterate over files and make pickles for each
for i,(file_raw,sep) in enumerate(zip(files_raw,separators)):

    # The cell line name is the second '_'-separated token of the file name
    file = file_raw.split('/')[-1]
    cellline = file.split('_')[1]

    counts, genes, cells = rnaseqTools.sparseload(file_raw, sep=sep)

    counts = counts.toarray()
    print(file,counts.shape)

    #remove low depth cells
    depths = np.array(np.sum(counts,axis=1)).flatten()
    # Build the boolean mask once and reuse it for cells, counts and the report
    deep_enough = depths>minimum_depth
    cells = cells[deep_enough]
    counts = counts[deep_enough,:]
    print('Of',len(depths),'cells, returning',np.count_nonzero(deep_enough),'cells that have a depth larger than', minimum_depth)
    print('New shape:', counts.shape)

    counts,genes = remove_rare_genes(counts,genes,minimum_detected_cells_per_gene=5)
    counts = sparse.csr_matrix(counts)

    dataset = dict(counts=counts,genes=genes,cells=cells,folder=file_raw,
                   label='Han 2018 %s control (MicrowellSeq)' % (cellline),
                   protocol='microwellSeq')

    print(dataset)
    picklefile = '%s_preprocessed.pickle' % (file_raw)
    # The context manager closes the file handle even if pickling fails.
    with open(picklefile, "wb") as pickle_file:
        pickle.dump(dataset, pickle_file, protocol=4)

..................... done
GSM2906413_EmbryonicStemCell_dge.txt.gz (9994, 20682)
Of 9994 cells, returning 9994 cells that have a depth larger than 500
New shape: (9994, 20682)
Of 20677 total genes, returning 15069 genes that are detected in 5 or more cells.
Output shape: (9994, 15069)
{'counts': <9994x15069 sparse matrix of type '<class 'numpy.float64'>'
	with 6762689 stored elements in Compressed Sparse Row format>, 'genes': array(['0610007P14Rik', '0610009B22Rik', '0610009L18Rik', ..., 'n-R5s151',
       'n-R5s2', 'n-R5s88'], dtype='<U14'), 'cells': array(['EmbryonicStemCells_1.CTCGCAATCAACAACCTA',
       'EmbryonicStemCells_1.CCGCTAATTCCAGTTGCC',
       'EmbryonicStemCells_1.CTCGCACACAAGACACCC', ...,
       'EmbryonicStemCells_1.CGAGTACCAGACAGATGG',
       'EmbryonicStemCells_1.CGGCAGCTGAAAGGCTGC',
       'EmbryonicStemCells_1.CTGTGTGAGATCGTCCCG'], dtype=object), 'folder': 'datasets/microwellseq/GSM2906413_EmbryonicStemCell_dge.txt.gz', 'label': 'Han 2018 EmbryonicStemCell control (