## Generate cellranger-like output from droplet-counts output

Generate cellranger-like output (`genes.tsv`, `barcodes.tsv`, and `matrix.mtx`) using the same gene and barcode filters used by droplet-counts.

In [7]:
%matplotlib inline

import os
import numpy as np
import tables
from scipy.io import mmwrite
import scipy.sparse as sp

from commons import *
from sc_fingerprint import SingleCellFingerprint, SingleCellFingerprintDataStore

# Use the same 16pt tick-label size on both axes for any figure drawn later.
for axis_name in ('xtick', 'ytick'):
    plt.rc(axis_name, labelsize=16)

In [3]:
# Identifiers that every path below is derived from.
dataset_name = 'pbmc4k'
run_id = 'default_gene_filters'
root = '/home/jupyter/data/10x/'

# input paths
sc_fingerprint_path = os.path.join(root, f"{dataset_name}_sc_fingerprint.pkl")
mol_info_h5 = os.path.join(root, f"{dataset_name}_molecule_info.h5")
gene_indices_npy = os.path.join(root, f"{dataset_name}__{run_id}", "gene_indices.npy")
cell_barcodes_npy = os.path.join(root, f"{dataset_name}__{run_id}", "cell_barcodes.npy")

# output paths
output_root = os.path.join(root, f"{dataset_name}__{run_id}__mtx_output")

# Create the output directory (and any missing parents). An already-existing
# directory is accepted, so re-running the cell is safe; any other failure
# (e.g. a permission error) is raised here instead of being silently swallowed
# and resurfacing later as a confusing error when the output files are written.
os.makedirs(output_root, exist_ok=True)

In [10]:
# Load the stored fingerprint from `sc_fingerprint_path` and immediately apply
# `filter_genes()` with its default arguments; only the filtered fingerprint is kept.
sc_fingerprint = SingleCellFingerprint.load(sc_fingerprint_path).filter_genes()

# Build the data-store over the filtered fingerprint, with `top_k_genes` spanning
# the full range (0, num_genes) of the genes that survived filtering.
sc_fingerprint_datastore = SingleCellFingerprintDataStore(
    sc_fingerprint,
    top_k_genes=(0, sc_fingerprint.num_genes),
)

Number of genes failed the maximum Good-Turing criterion: 15350
Number of genes failed the minimum expression criterion: 19742
Number of genes failed both criteria: 15203
Number of retained genes: 13805


In [14]:
# load gene names and gene IDs from molecule info H5 file
# The HDF5 file is opened read-only inside a context manager so the handle is
# closed as soon as both reads finish (or if either read fails). The original
# left the file open for the lifetime of the kernel. `.read()` returns the node
# contents as in-memory arrays, so they remain usable after the file is closed.
with tables.open_file(mol_info_h5, mode='r') as mol_info_h5_tab:
    gene_names_array = mol_info_h5_tab.root.gene_names.read()
    gene_ids_array = mol_info_h5_tab.root.gene_ids.read()

# Count matrix exposed by the data-store (written out as matrix.mtx further down).
droplet_count_array = sc_fingerprint_datastore.obs_expr_matrix

# For every internal gene index 0 .. n_genes - 1, look up the corresponding index
# in the original gene list; these indices select rows of the gene name / ID arrays.
gene_indices_array = np.asarray([
    sc_fingerprint_datastore.internal_gene_index_to_original_gene_index_map.get(internal_index)
    for internal_index in range(sc_fingerprint_datastore.n_genes)
])

# Encoded barcodes of the included cells, in the fingerprint's own order.
cell_barcodes_array = np.asarray(sc_fingerprint.barcode_list)

# Write genes.tsv: one "<gene_id><TAB><gene_name>" line per entry of
# gene_indices_array, in that order. IDs and names are stored as bytes and are
# decoded as ASCII before being written.
with open(os.path.join(output_root, 'genes.tsv'), 'w') as genes_file:
    genes_file.writelines(
        f"{gene_ids_array[idx].decode('ascii')}\t{gene_names_array[idx].decode('ascii')}\n"
        for idx in gene_indices_array
    )
        
# Write barcodes.tsv: one line per included barcode. Each encoded barcode is turned
# back into text with `decode(..., 16)` (16 is presumably the barcode length in
# bases — confirm against `commons.decode`) and gets the "-1" GEM suffix appended.
with open(os.path.join(output_root, 'barcodes.tsv'), 'w') as barcodes_file:
    for barcode_code in cell_barcodes_array:
        barcodes_file.write(f"{decode(barcode_code, 16) + '-1'}\n")
        
# generate matrix.mtx
# `np.int` was only an alias for the builtin `int`; it was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where it raises AttributeError. Using the builtin
# directly yields the same dtype and works on every NumPy version.
# The count matrix is transposed before writing; presumably this puts genes on
# rows and barcodes on columns so the matrix lines up with genes.tsv and
# barcodes.tsv — confirm against the layout of `obs_expr_matrix`.
coo_droplet_counts = sp.coo_matrix(droplet_count_array.astype(int).T)
mmwrite(os.path.join(output_root, "matrix.mtx"), coo_droplet_counts)