Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
285 changes: 285 additions & 0 deletions scripts/scratch_ingest/render_expression_arrays.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
#! /usr/bin/env python3

"""render_expression_arrays.py
extract gene-level expression data from dense/sparse matrix files and process in the
context of a cluster file's cells.
this mimics the `expression` data array in visualization API responses

EXAMPLES

dense matrix:
python3 render_expression_arrays.py --matrix-file ../../tests/data/dense_expression_matrix.txt \
--cluster-file ../../tests/data/cluster_example.txt \
--cluster-name 'Dense Example'

dense matrix with precision override:
python3 render_expression_arrays.py --matrix-file ../../tests/data/dense_expression_matrix.txt \
--cluster-file ../../tests/data/cluster_example.txt \
--cluster-name 'Dense Example' --precision 1

sparse matrix:
python3 render_expression_arrays.py --matrix-file ../../tests/data/mtx/sorted_matrix_header.mtx \
--genes-file ../../tests/data/mtx/sampled_genes.tsv \
--barcodes-file ../../tests/data/mtx/barcodes.tsv \
--cluster-file ../../tests/data/mtx/cluster_mtx_barcodes.tsv \
--cluster-name 'Sparse Example'
"""

import json
import os
import re
import argparse
import uuid
import gzip
import time
Comment on lines +28 to +34
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 for only using standard library, at least early on!


# regex to split on commas and tabs (delimiters used by dense matrix / cluster files)
COMMA_OR_TAB = r"[,\t]"

# regex to split on comma or any whitespace (\s already covers tab and space)
ALL_DELIM = r"[\s,\t]"

# Gzip magic number: first two bytes of any gzip-compressed file (RFC 1952)
GZIP_MAGIC_NUM = b'\x1f\x8b'

# default number of digits non-zero expression values are rounded to;
# reassigned at runtime when the --precision CLI argument is supplied
precision = 3
Comment on lines +45 to +46
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like that this is somewhat abstracted. Per chat, a "contextual precision" approach might yield 10-100x smaller aggregate expression data sizes, as well as faster JSON.parse times for Image Pipeline and interactive end-user clients.

Abstracting this a bit earlier as you've done here makes that potential optimization that much easier.


def is_gz_file(filepath):
    """
    Check whether a file is gzip-compressed by comparing its first two bytes
    against the gzip 'magic number'
    Args:
        filepath (String): path to file to check

    Returns:
        (Bool): True if the file is gzipped, False otherwise
    """
    with open(filepath, 'rb') as handle:
        leading_bytes = handle.read(2)
    return leading_bytes == GZIP_MAGIC_NUM

def open_file(filepath):
    """
    Open a file with the correct reader, transparently handling gzip
    Args:
        filepath (String): path to file to open

    Returns:
        (TextIOWrapper): file handle opened for reading in text mode
    """
    # gzip.open in 'rt' mode yields decoded text, matching plain open()
    reader = gzip.open(filepath, 'rt') if is_gz_file(filepath) else open(filepath, 'r')
    return reader

def make_data_dir(name):
    """
    Create an output directory, appending a UUID so repeated runs can be
    compared side-by-side without manually moving previous results
    Args:
        name (String): base name of directory

    Returns:
        (String): name of created directory with UUID appended to the end
    """
    dirname = f"{name}-{uuid.uuid4()}"
    print(f"creating data directory at {dirname}")
    os.mkdir(dirname)
    return dirname

def get_cluster_cells(file_path):
    """
    Collect cell names from a cluster file (1st column, starting at line 3)
    Args:
        file_path (String): absolute path to cluster_file

    Returns:
        (List): cluster cell names
    """
    with open_file(file_path) as cluster_file:
        # skip the NAME and TYPE header rows
        for _ in range(2):
            cluster_file.readline()
        return [re.split(COMMA_OR_TAB, row)[0] for row in cluster_file]

def load_entities_as_list(file, column = None):
    """
    Read an entire 10X feature/barcode file into a list for parsing sparse data
    Args:
        file (TextIOWrapper): open file object
        column (Integer): index of the column to extract from each line;
            None (default) returns whole lines with trailing whitespace stripped

    Returns:
        (List): entities from file
    """
    print(f"reading entities from {file.name}")
    # BUG FIX: `if not column` treated column=0 as "no column" and silently
    # returned whole lines; test for None explicitly so column 0 works
    if column is None:
        return [line.rstrip() for line in file]
    # split on comma or any whitespace and keep the requested column
    return [re.split(r"[\s,\t]", line.strip())[column] for line in file]

def process_sparse_data(
    matrix_file_path, genes, barcodes, cluster_cells, cluster_name, data_dir
):
    """
    Main handler to read sparse mtx data and process entries at the gene level
    Writes one JSON array per gene in the genes file, emitting empty expression
    arrays for any gene that has no entries in the matrix
    Args:
        matrix_file_path (String): path to matrix file
        genes (List): gene names
        barcodes (List): cell names
        cluster_cells (List): cell names from cluster file
        cluster_name (String): name of cluster object
        data_dir (String): output data directory
    """
    observed_cells = []
    exp_vals = []
    current_gene = 0
    # cache a copy of an empty expression array to write out for missing genes
    empty_exp = [0] * len(cluster_cells)

    def flush_current_gene():
        # write out the values accumulated for the gene currently being read
        gene_name = genes[current_gene - 1]
        print(f"preparing data for {gene_name} with {len(exp_vals)} entries... ", end = '')
        filtered_expression = filter_expression_for_cluster(
            cluster_cells, observed_cells, exp_vals
        )
        write_gene_scores(cluster_name, gene_name, filtered_expression, data_dir)

    def write_missing_genes(start, stop):
        # emit empty expression arrays for gene indices in [start, stop)
        for idx in range(start, stop):
            missing_gene = genes[idx - 1]
            print(f"####### {missing_gene} has no expression... ", end = '')
            write_gene_scores(cluster_name, missing_gene, empty_exp, data_dir)

    with open_file(matrix_file_path) as matrix_file:
        # skip the MatrixMarket banner, comment line, and dimensions line
        for _ in range(3):
            matrix_file.readline()
        print("processing sparse data")
        for line in matrix_file:
            gene_idx, barcode_idx, exp_val = extract_sparse_line(line)
            if gene_idx != current_gene:
                if current_gene != 0:
                    # flush the previous gene before starting the next one
                    flush_current_gene()
                # BUG FIX: also covers genes skipped before the first entry,
                # not only gaps between observed genes
                write_missing_genes(current_gene + 1, gene_idx)
                current_gene = gene_idx
                observed_cells = []
                exp_vals = []
            observed_cells.append(barcodes[barcode_idx - 1])
            exp_vals.append(exp_val)
    if current_gene != 0:
        # BUG FIX: the final gene's data was previously never written out
        flush_current_gene()
    # BUG FIX: trailing genes with no expression also get empty arrays
    write_missing_genes(current_gene + 1, len(genes) + 1)

def extract_sparse_line(line, ndigits=None):
    """
    Process a single line from a sparse matrix and extract values
    Args:
        line (String): single line from matrix file, e.g. "1 15 2.345678"
        ndigits (Integer): digits of rounding precision for the expression
            value; defaults to the module-level `precision` setting

    Returns:
        (List): [gene index (int), barcode index (int), rounded expression (float)]
    """
    if ndigits is None:
        # resolve at call time so a CLI --precision override is respected
        ndigits = precision
    gene_idx, barcode_idx, raw_exp = line.rstrip().split(' ')
    return [int(gene_idx), int(barcode_idx), round(float(raw_exp), ndigits)]


def process_dense_data(matrix_file_path, cluster_cells, cluster_name, data_dir):
    """
    Main handler to read dense matrix data and process entries at the gene level
    Args:
        matrix_file_path (String): path to dense matrix file
        cluster_cells (List): cell names from cluster file
        cluster_name (String): name of cluster object
        data_dir (String): output data directory
    """
    with open_file(matrix_file_path) as matrix_file:
        # header row: first token is the gene column label, the rest are cell names
        header_vals = re.split(COMMA_OR_TAB, matrix_file.readline().rstrip())
        cells = header_vals[1:]
        for row in matrix_file:
            gene_name, *raw_vals = re.split(COMMA_OR_TAB, row.rstrip())
            # round non-zero values; keep zeroes as int 0 for compact JSON output
            exp_vals = []
            for val in raw_vals:
                score = float(val)
                exp_vals.append(round(score, precision) if score != 0.0 else 0)
            filtered_expression = filter_expression_for_cluster(
                cluster_cells, cells, exp_vals
            )
            write_gene_scores(cluster_name, gene_name, filtered_expression, data_dir)

def filter_expression_for_cluster(cluster_cells, exp_cells, exp_scores):
    """
    Yield expression scores, filtered & ordered by a List of cluster cells
    Any cluster cell absent from the expression data is assigned a value of 0

    Args:
        cluster_cells (List): cluster cell names
        exp_cells (List): expression cell names
        exp_scores (List): expression values, in the same order as exp_cells

    Returns:
        (Generator): expression values, ordered by cluster_cells
    """
    observed_exp = {cell: score for cell, score in zip(exp_cells, exp_scores)}
    for cell in cluster_cells:
        yield observed_exp.get(cell, 0)


def write_gene_scores(cluster_name, gene_name, exp_values, data_dir):
    """
    Write a gzipped JSON array of expression values
    Filename uses {cluster_name}--{gene_name}.json.gz format

    Args:
        cluster_name (String): Name of cluster
        gene_name (String): Name of gene
        exp_values (List): expression values
        data_dir (String): output data directory to write in

    Returns:
        (File): JSON file as a List of expression values
    """
    start_write = time.time()
    print(f"writing {gene_name} data... ", end = '')
    outpath = f"{data_dir}/{cluster_name}--{gene_name}.json.gz"
    # compact separators keep the JSON payload as small as possible
    with gzip.open(outpath, "wt") as json_file:
        json.dump(list(exp_values), json_file, separators=(',', ':'))
    elapsed = time.time() - start_write
    print(f"{round(elapsed, 2)}s")

# parse cli arguments and run extraction; guarded so importing this module
# (e.g. from tests) does not trigger argument parsing or file I/O
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cluster-file', help='path to cluster file', required=True)
    parser.add_argument('--cluster-name', help='name of cluster object', required=True)
    parser.add_argument('--matrix-file', help='path to matrix file', required=True)
    parser.add_argument('--precision', type=int,
                        help='number of digits of precision for non-zero data')
    parser.add_argument('--genes-file', help='path to genes file (None for dense matrix files)')
    parser.add_argument('--barcodes-file', help='path to barcodes file (None for dense matrix files)')
    args = parser.parse_args()

    # main execution, set cluster name
    start_time = time.time()
    cluster_file_path = args.cluster_file
    expression_file_path = args.matrix_file
    # replace all non-word characters so the name is safe to use in file paths
    sanitized_cluster_name = re.sub(r'\W', '_', args.cluster_name)
    data_dir = make_data_dir(sanitized_cluster_name)
    cluster_cells = get_cluster_cells(cluster_file_path)
    if args.precision is not None:
        precision = args.precision
        print(f"using {precision} digits of precision for non-zero data")
    if args.genes_file is not None and args.barcodes_file is not None:
        print(f"reading {expression_file_path} as sparse matrix")
        # BUG FIX: close genes/barcodes file handles (previously leaked)
        with open_file(args.genes_file) as genes_file:
            genes = load_entities_as_list(genes_file, 1)
        with open_file(args.barcodes_file) as barcodes_file:
            barcodes = load_entities_as_list(barcodes_file)
        process_sparse_data(
            expression_file_path, genes, barcodes, cluster_cells, sanitized_cluster_name,
            data_dir
        )
    else:
        print(f"reading {expression_file_path} as dense matrix")
        process_dense_data(
            expression_file_path, cluster_cells, sanitized_cluster_name, data_dir
        )
    end_time = time.time()
    # BUG FIX: divide before rounding so the reported minutes are accurate
    # (previously rounded seconds to 3 places, then divided by 60)
    time_in_min = round((end_time - start_time) / 60, 3)
    print(f"completed, total runtime in minutes: {time_in_min}")
27 changes: 27 additions & 0 deletions tests/data/mtx/cluster_mtx_barcodes.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
NAME X Y
TYPE numeric numeric
AACGTTGGTTAAAGTG-1 34.472 32.211
AGTAGTCAGAGCTATA-1 15.975 10.043
ATCTGCCCATACTCTT-1 -11.688 -53.645
ATGCGATCAAGTTGTC-1 30.04 31.138
ATTTCTGTCCTTTCGG-1 23.862 33.092
CAGTAACGTAAACACA-1 -39.07 -14.64
CCAATCCCATGAAGTA-1 40.039 27.206
CGTAGGCCAGCGAACA-1 28.755 27.187
CTAACTTGTTCCATGA-1 -48.601 -13.512
CTCCTAGGTCTCATCC-1 14.653 27.832
CTCGGAGTCGTAGGAG-1 20.603 32.071
CTGAAACAGGGAAACA-1 -10.333 -51.733
GACTACAGTAACGCGA-1 -52.966 -12.484
GCATACAGTACCGTTA-1 38.513 26.969
GCGAGAACAAGAGGCT-1 12.838 13.047
GCTTCCACAGCTGTAT-1 33.092 28.755
GCTTGAAAGAGCTGCA-1 10.043 38.513
GGACGTCGTTAAAGAC-1 32.211 40.039
GTACTCCCACTGTTAG-1 27.187 14.653
GTCACAAAGTAAGTAC-1 -53.645 -39.07
TAGCCGGAGAGACGAA-1 -13.512 -48.601
TCAACGACACAGCCCA-1 31.138 14.653
TCGGGACGTCTCAACA-1 27.153 24.512
TCTCTAACATGCTGGC-1 1.234 31.123
TGAGCCGGTGATAAGT-1 -15.352 -1.421
7 changes: 7 additions & 0 deletions tests/data/mtx/sampled_genes.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
ENSG00000131591 C1orf159
ENSG00000272106 RP11-345P4.9
ENSG00000187730 GABRD
ENSG00000041988 THAP3
ENSG00000007923 DNAJC11
ENSG00000198754 OXCT2
ENSG00000103942 HOMER2
16 changes: 16 additions & 0 deletions tests/data/mtx/sorted_matrix_header.mtx
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
%%MatrixMarket matrix coordinate real general
%
7 25 14
1 1 1.234124
1 3 4.124124
2 4 3.123421
2 6 6.234521
3 15 5.235252
3 23 3.312412
4 12 4.215522
4 22 6.234123
5 10 7.134613
5 11 9.123512
6 12 1.643221
6 24 6.235125
7 10 2.123616