## Subsample 10x molecule info file

In [2]:
%matplotlib inline

import os
import sys
notebook_path = os.path.abspath('')
sources_path = os.path.abspath(os.path.join(notebook_path, '..', 'sources'))
sys.path.insert(0, sources_path)

import matplotlib.pylab as plt
import numpy as np
import tables
import itertools
import operator
from typing import Set, List, Dict
import logging
from collections import Counter
from collections import defaultdict
from commons import *

# Use the same, larger tick-label font size on both axes of every figure.
for _tick_group in ('xtick', 'ytick'):
    plt.rc(_tick_group, labelsize=16)

In [3]:
# Per-read keep probability: the subsampling cell passes this as `p` to
# np.random.binomial, with each molecule's read count as `n`.
subsample_rate = 0.0625

# Molecule-info HDF5 file to subsample.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
input_mol_h5_path = '/home/jupyter/data/10x/pbmc4k_molecule_info.h5'
# File-layout selector; the save cell branches on the values 'v2' and 'v3'.
cr_version = 'v2'
# MoleculeInfo is not defined in this notebook; presumably it is provided by
# `from commons import *` (TODO confirm). Later cells read its `reads_array`,
# `barcode_array` and `gene_array` attributes.
input_mol_info = MoleculeInfo(input_mol_h5_path, cr_version)



In [4]:
# Obtain the logger from the logging registry; Logger objects are not meant to
# be instantiated directly.
_logger = logging.getLogger(__name__)

_logger.warning(f'Subsampling (rate={subsample_rate})...')
raw_reads_array = input_mol_info.reads_array
num_raw_records = len(raw_reads_array)
# Same shape and dtype as the input so the output file keeps the original
# storage type for read counts.
ss_reads_array = np.zeros_like(raw_reads_array)

# Draw the binomial subsample in ten vectorised chunks instead of one Python
# iteration per molecule. np.random.binomial accepts an array of trial counts,
# so each chunk is a single call; chunking keeps the "N% complete" progress
# messages and bounds the size of the temporary result array.
# Building the chunk edges with linspace also works when there are fewer than
# ten records (the old `idx % int(num_raw_records / 10)` divided by zero there).
num_progress_steps = 10
step_bounds = np.linspace(0, num_raw_records, num_progress_steps + 1).astype(int)
for step, (start, stop) in enumerate(zip(step_bounds[:-1], step_bounds[1:]), start=1):
    if stop > start:
        ss_reads_array[start:stop] = np.random.binomial(
            raw_reads_array[start:stop], subsample_rate)
    _logger.warning(f'{10 * step}% complete...')

# indices of molecules with non-zero reads
nnz = ss_reads_array > 0

# Drop molecules that lost all their reads, keeping the per-molecule arrays
# aligned with each other.
ss_reads_array = ss_reads_array[nnz]
ss_barcode_array = input_mol_info.barcode_array[nnz]
ss_gene_array = input_mol_info.gene_array[nnz]

Subsampling (rate=0.0625)...
10% complete...
20% complete...
30% complete...
40% complete...
50% complete...
60% complete...
70% complete...
80% complete...
90% complete...
100% complete...


In [10]:
# Destination of the subsampled molecule-info file; the subsampling rate is
# embedded in the file name so outputs for different rates do not collide.
output_mol_h5_path = f'/home/jupyter/data/10x/pbmc4k_ss_rate_{subsample_rate}_molecule_info.h5'

In [11]:
# Open the original molecule-info file with PyTables; the save cell copies
# datasets and the compression filters from this handle.
# NOTE(review): the handle is never closed in the cells visible here; call
# input_mol_h5_tab.close() once nothing else needs it.
input_mol_h5_tab = tables.open_file(input_mol_h5_path)

In [13]:
def copy_earray_to_h5(where, name, instance, out_h5):
    """Copy a node's full contents into a new EArray of ``out_h5``.

    Args:
        where: parent location of the new node in ``out_h5``.
        name: name of the new node.
        instance: source node; its ``read()`` result becomes the new data.
        out_h5: destination file object providing ``create_earray``.
    """
    contents = instance.read()
    out_h5.create_earray(where, name, obj=contents)

def copy_carray_to_h5(where, name, instance, out_h5):
    """Copy a node's full contents into a new CArray of ``out_h5``.

    Args:
        where: parent location of the new node in ``out_h5``.
        name: name of the new node.
        instance: source node; its ``read()`` result becomes the new data.
        out_h5: destination file object providing ``create_carray``.
    """
    contents = instance.read()
    out_h5.create_carray(where, name, obj=contents)

def copy_array_to_h5(where, name, instance, out_h5):
    """Copy a node's full contents into a new Array of ``out_h5``.

    Args:
        where: parent location of the new node in ``out_h5``.
        name: name of the new node.
        instance: source node; its ``read()`` result becomes the new data.
        out_h5: destination file object providing ``create_array``.
    """
    contents = instance.read()
    out_h5.create_array(where, name, obj=contents)

_logger.warning('Saving molecule info HDF5 file...')

# Fail before the output file is created: an unrecognised version used to fall
# through both branches and silently leave an empty HDF5 file behind.
if cr_version not in ('v2', 'v3'):
    raise ValueError(f"Unsupported cr_version {cr_version!r}; expected 'v2' or 'v3'")

in_root = input_mol_h5_tab.root

# The output reuses the input file's filters.
with tables.open_file(output_mol_h5_path, mode='w', title='10X',
                      filters=input_mol_h5_tab.filters) as output_mol_h5_tab:
    if cr_version == 'v2':

        # Gene annotations are copied verbatim.
        copy_carray_to_h5('/', 'gene_names', in_root.gene_names, output_mol_h5_tab)
        copy_carray_to_h5('/', 'gene_ids', in_root.gene_ids, output_mol_h5_tab)

        # Per-molecule arrays, already restricted to molecules that kept
        # at least one read.
        output_mol_h5_tab.create_earray('/', 'reads', obj=ss_reads_array)
        output_mol_h5_tab.create_earray('/', 'barcode', obj=ss_barcode_array)
        output_mol_h5_tab.create_earray('/', 'gene', obj=ss_gene_array)

    elif cr_version == 'v3':

        # 'metrics' is created as an empty group; nothing is copied into it.
        for group_name in ('barcode_info', 'features', 'metrics'):
            output_mol_h5_tab.create_group('/', group_name)

        # Feature annotations are copied verbatim.
        feature_node_names = (
            '_all_tag_keys', 'feature_type', 'genome', 'id',
            'name', 'pattern', 'read', 'sequence')
        for node_name in feature_node_names:
            copy_array_to_h5('/features', node_name,
                             getattr(in_root.features, node_name), output_mol_h5_tab)

        copy_carray_to_h5('/barcode_info', 'genomes',
                          in_root.barcode_info.genomes, output_mol_h5_tab)
        copy_earray_to_h5('/barcode_info', 'pass_filter',
                          in_root.barcode_info.pass_filter, output_mol_h5_tab)

        copy_array_to_h5('/', 'library_info',
                         in_root.library_info, output_mol_h5_tab)

        # The barcode list is copied unfiltered.
        output_mol_h5_tab.create_carray(
            '/', 'barcodes',
            obj=in_root.barcodes.read())

        # Subsampled read counts (zero-read molecules already removed).
        output_mol_h5_tab.create_earray(
            '/', 'count',
            obj=ss_reads_array)

        # Remaining per-molecule columns, filtered with the same mask so that
        # they stay aligned with 'count'.
        per_molecule_columns = (
            'barcode_idx', 'feature_idx', 'gem_group', 'library_idx', 'umi')
        for column_name in per_molecule_columns:
            output_mol_h5_tab.create_earray(
                '/', column_name,
                obj=getattr(in_root, column_name).read()[nnz])


Saving molecule info HDF5 file...
