## Subsample 10x molecule info file

In [5]:
%matplotlib inline

import matplotlib.pylab as plt
import numpy as np
import tables
import itertools
import operator
from typing import Set, List, Dict
import logging
from collections import Counter
from collections import defaultdict

# Use a 16pt font for the tick labels on both axes of every figure.
plt.rc(('xtick', 'ytick'), labelsize=16)

In [6]:
# Molecule-info HDF5 file that is read and subsampled.
input_mol_h5_path = '/home/jupyter/data/10x/pbmc4k_molecule_info.h5'

# Probability with which each individual read is retained
# (passed as the success probability of the binomial draw).
subsample_rate = 0.125

# Destination file; the rate is embedded in the file name.
output_mol_h5_path = (
    f'/home/jupyter/data/10x/pbmc4k_ss_rate_{subsample_rate}_molecule_info.h5'
)

In [7]:
# Obtain the logger through the logging registry; the logging documentation
# states that Logger objects should not be instantiated directly.
_logger = logging.getLogger(__name__)

_logger.warning('Loading molecule info HDF5 file...')
# NOTE: the input file is intentionally left open; the cell that writes the
# output file still reads node metadata and arrays from it.
input_mol_h5_tab = tables.open_file(input_mol_h5_path)
reads_array = input_mol_h5_tab.root.reads.read()

_logger.warning(f'Subsampling (rate={subsample_rate})...')
# Each molecule keeps Binomial(reads, subsample_rate) of its reads.
# np.random.binomial broadcasts over `n`, so whole slices are drawn in one call
# instead of one Python-level call per record. The work is split into ten
# contiguous slices purely so that progress can be reported; integer
# arithmetic on the boundaries keeps this well-defined for any record count
# (including fewer than ten records, where some slices are simply empty).
# NOTE: no seed is set in this cell, so the draw relies on the current state of
# NumPy's global random generator.
ss_reads_array = np.zeros_like(reads_array)
num_raw_records = len(reads_array)
progress_bounds = [num_raw_records * step // 10 for step in range(11)]
for step in range(10):
    start, stop = progress_bounds[step], progress_bounds[step + 1]
    # Assigning into the pre-allocated array casts the draws back to the
    # dtype of the original reads array.
    ss_reads_array[start:stop] = np.random.binomial(reads_array[start:stop],
                                                    subsample_rate)
    _logger.warning(f'{10 * (step + 1)}% complete...')

# boolean mask of molecules that retained at least one read
nnz = ss_reads_array > 0

# Drop molecules that lost all of their reads from every per-molecule array.
ss_reads_array = ss_reads_array[nnz]
ss_barcode_array = input_mol_h5_tab.root.barcode.read()[nnz]
ss_gene_array = input_mol_h5_tab.root.gene.read()[nnz]
ss_umi_array = input_mol_h5_tab.root.umi.read()[nnz]
# gem_group and genome are currently not carried over:
# ss_gem_group_array = input_mol_h5_tab.root.gem_group.read()[nnz]
# ss_genome_array = input_mol_h5_tab.root.genome.read()[nnz]

Loading molecule info HDF5 file...
Subsampling (rate=0.125)...
10% complete...
20% complete...
30% complete...
40% complete...
50% complete...
60% complete...
70% complete...
80% complete...
90% complete...
100% complete...


In [9]:
def copy_earray_to_h5(where, name, instance, out_h5):
    """Create an EArray in ``out_h5`` mirroring ``instance``.

    Args:
        where: Parent location handed to ``out_h5.create_earray``.
        name: Name of the array to create.
        instance: Source node; its ``atom``, ``title`` and ``chunkshape``
            are reused and its full contents (``instance.read()``) become
            the data of the new array.
        out_h5: Open output file object providing ``create_earray``.
    """
    array_spec = {
        'atom': instance.atom,
        'title': instance.title,
        'chunkshape': instance.chunkshape,
        'obj': instance.read(),
    }
    out_h5.create_earray(where, name, **array_spec)

def copy_carray_to_h5(where, name, instance, out_h5):
    """Create a CArray in ``out_h5`` mirroring ``instance``.

    Args:
        where: Parent location handed to ``out_h5.create_carray``.
        name: Name of the array to create.
        instance: Source node; its ``atom``, ``title`` and ``chunkshape``
            are reused and its full contents (``instance.read()``) become
            the data of the new array.
        out_h5: Open output file object providing ``create_carray``.
    """
    array_spec = {
        'atom': instance.atom,
        'title': instance.title,
        'chunkshape': instance.chunkshape,
        'obj': instance.read(),
    }
    out_h5.create_carray(where, name, **array_spec)

_logger.warning('Saving molecule info HDF5 file...')
with tables.open_file(output_mol_h5_path, mode='w', title='10X',
                      filters=input_mol_h5_tab.filters) as output_mol_h5_tab:
    # Arrays that are copied unchanged from the input file.
    for _node_name in ('gene_names', 'gene_ids', 'genome_ids'):
        copy_carray_to_h5('/', _node_name,
                          getattr(input_mol_h5_tab.root, _node_name),
                          output_mol_h5_tab)

    # Per-molecule arrays: each output node reuses the atom, title and
    # chunkshape of the input node with the same name, but stores only the
    # rows of molecules that kept at least one read after subsampling.
    # The write order matches the order of this table.
    _subsampled_arrays = (
        ('reads', ss_reads_array),
        ('barcode', ss_barcode_array),
        ('gene', ss_gene_array),
        ('umi', ss_umi_array),
        # Currently not written:
        # ('gem_group', ss_gem_group_array),
        # ('genome', ss_genome_array),
    )
    for _node_name, _ss_values in _subsampled_arrays:
        _template = getattr(input_mol_h5_tab.root, _node_name)
        output_mol_h5_tab.create_earray(
            '/', _node_name,
            atom=_template.atom,
            title=_template.title,
            chunkshape=_template.chunkshape,
            obj=_ss_values)

Saving molecule info HDF5 file...
