## Generate CellRanger-like output from droplet-counts output

In [2]:
%matplotlib inline

import os
import sys
# Put ``../sources`` (relative to the current working directory) at the front of
# ``sys.path`` so the project-local modules imported below (``commons``,
# ``fingerprint``) can be resolved. This must run before those imports.
notebook_path = os.path.abspath('')  # abspath('') resolves to the current working directory
sources_path = os.path.abspath(os.path.join(notebook_path, '..', 'sources'))
sys.path.insert(0, sources_path)

import os  # NOTE(review): duplicate of the import at the top of this cell; harmless
import numpy as np
import tables
from scipy.io import mmwrite
import scipy.sparse as sp

# NOTE(review): ``plt`` (used just below) and ``decode`` (used by later cells) are
# never imported explicitly in this notebook -- presumably both are provided by
# this star import; confirm and consider importing them by name.
from commons import *

from fingerprint import SingleCellFingerprintBase, SingleCellFingerprintDTM

# Larger tick labels for any figures produced in this notebook.
plt.rc('xtick', labelsize=16)
plt.rc('ytick', labelsize=16)

In [5]:
# --- run configuration ---
# Identifiers and root directories from which every input/output path below is built.
dataset_name = 'pbmc4k_ss_rate_0.0625'
run_id = 'all_genes__11__final_3_weakreg'
mol_info_h5_root_root = '/home/jupyter/data/10x/'
dtm_out_root = '/home/jupyter/data/10x/out'
# Suffix appended to every decoded barcode when barcodes.tsv is written.
gem_group_suffix = "-1"

# Alternative configuration previously kept here (identical except for the suffix):
# gem_group_suffix = "-3"

# input paths
mol_info_h5 = os.path.join(mol_info_h5_root_root, f"{dataset_name}_molecule_info.h5")
input_e_hi_map_npy = os.path.join(dtm_out_root, f"{dataset_name}__{run_id}", "e_hi_map__full.npy")
gene_indices_npy = os.path.join(dtm_out_root, f"{dataset_name}__{run_id}", "gene_indices.npy")
cell_barcodes_npy = os.path.join(dtm_out_root, f"{dataset_name}__{run_id}", "cell_barcodes.npy")

# output paths
output_root = os.path.join(dtm_out_root, f"{dataset_name}__{run_id}__mtx_output")
# Create the output directory, including missing parents. An already-existing
# directory is accepted, but genuine failures (e.g. permission denied) are raised
# here instead of being silently swallowed and resurfacing later as a confusing
# error when the first output file is opened.
os.makedirs(output_root, exist_ok=True)

In [33]:
# Export the e_hi counts as genes.tsv / barcodes.tsv / matrix.mtx under output_root.

# load gene names and gene IDs; the context manager closes the HDF5 handle as soon
# as both arrays have been read into memory (the original left the file open)
with tables.open_file(mol_info_h5, mode='r') as mol_info_h5_tab:
    gene_names_array = mol_info_h5_tab.root.gene_names.read()
    gene_ids_array = mol_info_h5_tab.root.gene_ids.read()

# load e_hi count, included genes, and included barcodes
e_hi_map_array = np.load(input_e_hi_map_npy)
gene_indices_array = np.load(gene_indices_npy)
cell_barcodes_array = np.load(cell_barcodes_npy)

# generate genes.tsv: one "<gene_id>\t<gene_name>" line per included gene index
with open(os.path.join(output_root, 'genes.tsv'), 'w') as f:
    for gene_index in gene_indices_array:
        gene_name = gene_names_array[gene_index].decode('ascii')
        gene_id = gene_ids_array[gene_index].decode('ascii')
        f.write(f"{gene_id}\t{gene_name}\n")

# generate barcodes.tsv: decoded barcode followed by the gem-group suffix
with open(os.path.join(output_root, 'barcodes.tsv'), 'w') as f:
    for encoded_barcode in cell_barcodes_array:
        decoded_barcode_with_gem = decode(encoded_barcode, 16) + gem_group_suffix
        f.write(f"{decoded_barcode_with_gem}\n")

# generate matrix.mtx
# `np.int` was only an alias of the builtin `int`; it was deprecated in NumPy 1.20
# and removed in 1.24, so the builtin is used directly (same resulting dtype).
# NOTE(review): if e_hi_map_array holds non-integer values, astype(int) truncates
# toward zero -- confirm that truncation rather than rounding is intended.
# NOTE(review): the transpose presumably turns a (barcodes x genes) array into the
# (genes x barcodes) layout of a CellRanger matrix.mtx -- confirm the input layout.
coo_e_hi_map_counts = sp.coo_matrix(e_hi_map_array.astype(int).T)
mmwrite(os.path.join(output_root, "matrix.mtx"), coo_e_hi_map_counts)

In [6]:
# (optional) save raw counts on the same genes and barcodes
sc_fingerprint_path = '/home/jupyter/data/10x/pbmc4k_ss_rate_0.0625_sc_fingerprint.pkl'
output_root = os.path.join(dtm_out_root, f"{dataset_name}__raw__mtx_output")
try:
    os.mkdir(output_root)
except:
    pass

In [17]:
# Export the raw counts of the loaded fingerprint as genes.tsv / barcodes.tsv /
# matrix.mtx under output_root.
sc_fingerprint_base = SingleCellFingerprintBase.load(sc_fingerprint_path)
gene_indices_array = np.asarray(sc_fingerprint_base.gene_idx_list)

# read gene names and gene IDs, closing the HDF5 handle once both arrays are in
# memory (the original left the file open)
with tables.open_file(mol_info_h5, mode='r') as mol_info_h5_tab:
    gene_names_array = mol_info_h5_tab.root.gene_names.read()
    gene_ids_array = mol_info_h5_tab.root.gene_ids.read()

# dense copy of the fingerprint's count matrix
raw_counts = np.asarray(sc_fingerprint_base.sparse_count_matrix_csr.todense())

# generate genes.tsv: one "<gene_id>\t<gene_name>" line per fingerprint gene index
with open(os.path.join(output_root, 'genes.tsv'), 'w') as f:
    for gene_index in gene_indices_array:
        gene_name = gene_names_array[gene_index].decode('ascii')
        gene_id = gene_ids_array[gene_index].decode('ascii')
        f.write(f"{gene_id}\t{gene_name}\n")

# generate barcodes.tsv: decoded barcode followed by the gem-group suffix
with open(os.path.join(output_root, 'barcodes.tsv'), 'w') as f:
    for encoded_barcode in sc_fingerprint_base.barcode_list:
        decoded_barcode_with_gem = decode(encoded_barcode, 16) + gem_group_suffix
        f.write(f"{decoded_barcode_with_gem}\n")

# generate matrix.mtx
# `np.int` (an alias of the builtin `int`) was deprecated in NumPy 1.20 and removed
# in 1.24; the builtin gives the same dtype and works on every NumPy version.
# NOTE(review): the transpose presumably yields a (genes x barcodes) matrix as in
# CellRanger output -- confirm the orientation of sparse_count_matrix_csr.
coo_raw_counts = sp.coo_matrix(raw_counts.astype(int).T)
mmwrite(os.path.join(output_root, "matrix.mtx"), coo_raw_counts)

Calculating and caching "SingleCellFingerprintBase.sparse_count_matrix_csr"...


## Generate CellRanger output counts from a given SingleCellFingerprint

In [9]:
# Load a fingerprint and export its raw counts as genes.tsv / barcodes.tsv /
# matrix.mtx. This cell rebinds sc_fingerprint_path, dataset_name, run_id,
# gem_group_suffix and output_root.
sc_fingerprint_path = '/home/jupyter/data/10x/pbmc4k_alevin_sc_fingerprint.pkl'
sc_fingerprint_base = SingleCellFingerprintBase.load(sc_fingerprint_path)
# dense copy of the fingerprint's count matrix
raw_counts = np.asarray(sc_fingerprint_base.sparse_count_matrix_csr.todense())

# output paths
dataset_name = 'pbmc4k'
run_id = 'alevin__raw'
gem_group_suffix = "-1"
output_root = os.path.join('/home/jupyter/data/10x', f"{dataset_name}__{run_id}__mtx_output")
# Create the directory (and missing parents); an existing directory is accepted,
# while real failures such as permission errors are raised instead of swallowed.
os.makedirs(output_root, exist_ok=True)

# generate genes.tsv
with open(os.path.join(output_root, 'genes.tsv'), 'w') as f:
    for gene_name in sc_fingerprint_base.gene_names_list:
        # TODO: the gene name is written in both columns as a placeholder for the
        # gene ID ("fix later" in the original).
        gene_id = gene_name
        f.write(f"{gene_id}\t{gene_name}\n")

# generate barcodes.tsv: decoded barcode followed by the gem-group suffix
with open(os.path.join(output_root, 'barcodes.tsv'), 'w') as f:
    for encoded_barcode in sc_fingerprint_base.barcode_list:
        decoded_barcode_with_gem = decode(encoded_barcode, 16) + gem_group_suffix
        f.write(f"{decoded_barcode_with_gem}\n")

# generate matrix.mtx
# `np.int` (an alias of the builtin `int`) was deprecated in NumPy 1.20 and removed
# in 1.24; the builtin gives the same dtype and works on every NumPy version.
coo_raw_counts = sp.coo_matrix(raw_counts.astype(int).T)
mmwrite(os.path.join(output_root, "matrix.mtx"), coo_raw_counts)

Calculating and caching "SingleCellFingerprintBase.sparse_count_matrix_csr"...
