From 78bfa66394d61821be3d0eaca2e6db937e4aa2f3 Mon Sep 17 00:00:00 2001 From: bistline Date: Tue, 30 Aug 2022 17:10:57 -0400 Subject: [PATCH 1/6] adding script to generate data for image pipeline --- .../render_expression_arrays.py | 283 ++++++++++++++++++ tests/data/mtx/cluster_mtx_barcodes.tsv | 27 ++ tests/data/mtx/sampled_genes.tsv | 7 + tests/data/mtx/sorted_matrix_header.mtx | 16 + 4 files changed, 333 insertions(+) create mode 100755 scripts/scratch_ingest/render_expression_arrays.py create mode 100644 tests/data/mtx/cluster_mtx_barcodes.tsv create mode 100644 tests/data/mtx/sampled_genes.tsv create mode 100644 tests/data/mtx/sorted_matrix_header.mtx diff --git a/scripts/scratch_ingest/render_expression_arrays.py b/scripts/scratch_ingest/render_expression_arrays.py new file mode 100755 index 00000000..d314624c --- /dev/null +++ b/scripts/scratch_ingest/render_expression_arrays.py @@ -0,0 +1,283 @@ +#! /usr/bin/env python3 + +"""render_expression_arrays.py +extract gene-level expression data from dense/sparse matrix files and process in the +context of a cluster file's cells. +this mimics the `expression` data array in visualization API responses + +EXAMPLES + +dense matrix: +python3 render_expression_arrays.py --matrix-file ../../tests/data/dense_expression_matrix.txt \ + --cluster-file ../../tests/data/cluster_example.txt \ + --cluster-name 'Dense Example' +sparse matrix: +python3 render_expression_arrays.py --matrix-file ../../tests/data/mtx/sorted_matrix_header.mtx \ + --genes-file ../../tests/data/mtx/sampled_genes.tsv \ + --barcodes-file ../../tests/data/mtx/barcodes.tsv \ + --cluster-file ../../tests/data/mtx/cluster_mtx_barcodes.tsv \ + --cluster-name 'Sparse Example' +""" + +import json +import os +import re +import argparse +import uuid +import gzip +import time + +# regex to split on commas and tabs +COMMA_OR_TAB = r"[,\t]" + +# regex to split on comma/tab/space +ALL_DELIM = r"[\s,\t]" + +# Gzip magic number +GZIP_MAGIC_NUM = b'\x1f\x8b' + +# default level of precision +precision = 3 + +def is_gz_file(filepath): + """ + Determine if a file is gzipped by reading the first two bytes and comparing to the 'magic number' + Args: + filepath (String): path to file to check + + Returns: + (Bool): T/F if file is gzipped + """ + with open(filepath, 'rb') as test_f: + return test_f.read(2) == GZIP_MAGIC_NUM + +def open_file(filepath): + """Open a file with the correct reader + Args: + filepath (String): path to file to open + + Returns: + (TextIOWrapper): opened file + """ + if is_gz_file(filepath): + return gzip.open(filepath, 'rt') + else: + return open(filepath, 'r') + +def make_data_dir(name): + """ + Make a directory to put output files in + Args: + name (String): name of directory + + Returns: + (String): name of created diretory with UUID appended to the end + """ + dirname = str(f"{name}-{uuid.uuid4()}") + print(f"creating data directory at {dirname}") + os.mkdir(dirname) + return dirname + +def get_cluster_cells(file_path): + """ + Return a list of cell names from a cluster file (1st column, starting line 3) + Args: + file_path (String): absolute path to cluster_file + + Returns: + (List): cluster cell names + """ + cells = [] + with open_file(file_path) as cluster_file: + cluster_file.readline() + cluster_file.readline() + for row in cluster_file: + cell = re.split(COMMA_OR_TAB, row)[0] + cells.append(cell) + return cells + +def load_entities_as_list(file, column = None): + """ + Read an entire 10X feature/barcode file into a list for parsing sparse data + Args: + file (TextIOWrapper): open file object + + Returns: + (List): entities from file + """ + print(f"reading entities from {file.name}") + if not column: + return list(map(str.rstrip, file.readlines())) + else: + entities = [] + for line in file: + entry = re.split(ALL_DELIM, line.strip())[column] + entities.append(entry) + return entities + + +def process_sparse_data( + matrix_file_path, genes, barcodes, cluster_cells, cluster_name, data_dir + ): + """ + Main handler to read sparse mtx data and process entries at the gene level + Args: + matrix_file_path (String): path to matrix file + genes (List): gene names + barcodes (List): cell names + cluster_cells (List): cell names from cluster file + cluster_name (String): name of cluster object + data_dir (String): output data directory + """ + observed_cells = [] + exp_vals = [] + current_gene = 0 + # cache a copy of an empty expression array to write out for missing genes + empty_exp = [0] * len(cluster_cells) + with open_file(matrix_file_path) as matrix_file: + matrix_file.readline() + matrix_file.readline() + matrix_file.readline() + print("processing sparse data") + for line in matrix_file: + gene_idx, barcode_idx, exp_val = extract_sparse_line(line) + # special case for first line of data + if current_gene == 0: + current_gene = gene_idx + if gene_idx == current_gene: + gene_name = genes[gene_idx - 1] + observed_cells.append(barcodes[barcode_idx - 1]) + exp_vals.append(exp_val) + else: + print(f"preparing data for {gene_name} with {len(exp_vals)} entries... ", end = '') + filtered_expression = filter_expression_for_cluster( + cluster_cells, observed_cells, exp_vals + ) + write_gene_scores(cluster_name, gene_name, filtered_expression, data_dir) + offset = gene_idx - current_gene + if offset > 1: + # we're are missing a gene here, so write empty expression arrays + missing_range = range(current_gene + 1, gene_idx) + for idx in missing_range: + missing_gene = genes[idx - 1] + print(f"####### {missing_gene} has no expression... ", end = '') + write_gene_scores(cluster_name, missing_gene, empty_exp, data_dir) + gene_name = genes[gene_idx - 1] + current_gene = gene_idx + observed_cells = [barcodes[barcode_idx - 1]] + exp_vals = [exp_val] + +def extract_sparse_line(line): + """ + Process a single line from a sparse matrix and extract values as integers + Args: + line (String): single line from matrix file + + Returns: + (List): values as integers + """ + gene_idx, barcode_idx, raw_exp = line.rstrip().split(' ') + return [int(gene_idx), int(barcode_idx), round(float(raw_exp), precision)] + + +def process_dense_data(matrix_file_path, cluster_cells, cluster_name, data_dir): + """ + Main handler to read dense matrix data and process entries at the gene level + Args: + matrix_file_path (String): path to dense matrix file + cluster_cells (List): cell names from cluster file + cluster_name (String): name of cluster object + data_dir (String): output data directory + """ + with open_file(matrix_file_path) as matrix_file: + print(f"reading cells from {matrix_file.name}") + header = matrix_file.readline().rstrip() + values = re.split(COMMA_OR_TAB, header) + cells = values[1:] + for line in matrix_file: + clean_line = line.rstrip() + raw_vals = re.split(COMMA_OR_TAB, clean_line) + gene_name = raw_vals.pop(0) + exp_vals = [round(float(val), precision) for val in raw_vals] + filtered_expression = filter_expression_for_cluster( + cluster_cells, cells, exp_vals + ) + write_gene_scores(cluster_name, gene_name, filtered_expression, data_dir) + +def filter_expression_for_cluster(cluster_cells, exp_cells, exp_scores): + """ + Assemble a List of expression scores, filtered & ordered from a List of cluster cells + Will substitute 0 as a value for any cell not seen in the expression file + + Args: + cluster_cells (List): cluster cell names + exp_cells (List): expression cell names + exp_scores (List): expression values, in the same order as exp_cells + + Returns: + (List): Expression values, ordered by cluster_cells + """ + filtered_scores = [] + observed_exp = dict(zip(exp_cells, exp_scores)) + return (observed_exp.get(cell, 0) for cell in cluster_cells) + + +def write_gene_scores(cluster_name, gene_name, exp_values, data_dir): + """ + Write a JSON array of expression values + Filename uses {cluster_name}--{gene_name}.json format + + Args: + cluster_name (String): Name of cluster + gene_name (String): Name of gene + exp_values (List): expression values + data_dir (String): output data directory to write in + + Returns: + (File): JSON file as a List of expression values + """ + start_write = time.time() + print(f"writing {gene_name} data... ", end = '') + with gzip.open(f"{data_dir}/{cluster_name}--{gene_name}.json.gz", "wt") as file: + json.dump(list(exp_values), file, separators=(',', ':')) + end_write = time.time() + time_in_sec = float(end_write - start_write) + print(f"{round(time_in_sec, 2)}s") + +# parse cli arguments +parser = argparse.ArgumentParser() +parser.add_argument('--cluster-file', help='path to cluster file', required=True) +parser.add_argument('--cluster-name', help='name of cluster object', required=True) +parser.add_argument('--matrix-file', help='path to matrix file', required=True) +parser.add_argument('--precision', help='number of digits of precision for non-zero data') +parser.add_argument('--genes-file', help='path to genes file (None for dense matrix files)') +parser.add_argument('--barcodes-file', help='path to barcodes file (None for dense matrix files)') +args = parser.parse_args() + +# main execution, set cluster name +start_time = time.time() +cluster_file_path = args.cluster_file +expression_file_path = args.matrix_file +sanitized_cluster_name = re.sub(r'\W', '_', args.cluster_name) +data_dir = make_data_dir(sanitized_cluster_name) +cluster_cells = get_cluster_cells(cluster_file_path) +if args.precision is not None: + precision = int(args.precision) +if args.genes_file is not None and args.barcodes_file is not None: + print(f"reading {expression_file_path} as sparse matrix") + genes_file = open_file(args.genes_file) + genes = load_entities_as_list(genes_file, 1) + barcodes_file = open_file(args.barcodes_file) + barcodes = load_entities_as_list(barcodes_file) + process_sparse_data( + expression_file_path, genes, barcodes, cluster_cells, sanitized_cluster_name, + data_dir + ) +else: + print(f"reading {expression_file_path} as dense matrix") + process_dense_data( + expression_file_path, cluster_cells, sanitized_cluster_name, data_dir + ) +end_time = time.time() +time_in_min = round(float(end_time - start_time), 3) / 60 +print(f"completed, total runtime in minutes: {time_in_min}") diff --git a/tests/data/mtx/cluster_mtx_barcodes.tsv b/tests/data/mtx/cluster_mtx_barcodes.tsv new file mode 100644 index 00000000..8b35608b --- /dev/null +++ b/tests/data/mtx/cluster_mtx_barcodes.tsv @@ -0,0 +1,27 @@ +NAME X Y +TYPE numeric numeric +AACGTTGGTTAAAGTG-1 34.472 32.211 +AGTAGTCAGAGCTATA-1 15.975 10.043 +ATCTGCCCATACTCTT-1 -11.688 -53.645 +ATGCGATCAAGTTGTC-1 30.04 31.138 +ATTTCTGTCCTTTCGG-1 23.862 33.092 +CAGTAACGTAAACACA-1 -39.07 -14.64 +CCAATCCCATGAAGTA-1 40.039 27.206 +CGTAGGCCAGCGAACA-1 28.755 27.187 +CTAACTTGTTCCATGA-1 -48.601 -13.512 +CTCCTAGGTCTCATCC-1 14.653 27.832 +CTCGGAGTCGTAGGAG-1 20.603 32.071 +CTGAAACAGGGAAACA-1 -10.333 -51.733 +GACTACAGTAACGCGA-1 -52.966 -12.484 +GCATACAGTACCGTTA-1 38.513 26.969 +GCGAGAACAAGAGGCT-1 12.838 13.047 +GCTTCCACAGCTGTAT-1 33.092 28.755 +GCTTGAAAGAGCTGCA-1 10.043 38.513 +GGACGTCGTTAAAGAC-1 32.211 40.039 +GTACTCCCACTGTTAG-1 27.187 14.653 +GTCACAAAGTAAGTAC-1 -53.645 -39.07 +TAGCCGGAGAGACGAA-1 -13.512 -48.601 +TCAACGACACAGCCCA-1 31.138 14.653 +TCGGGACGTCTCAACA-1 27.153 24.512 +TCTCTAACATGCTGGC-1 1.234 31.123 +TGAGCCGGTGATAAGT-1 -15.352 -1.421 diff --git a/tests/data/mtx/sampled_genes.tsv b/tests/data/mtx/sampled_genes.tsv new file mode 100644 index 00000000..6f95a67e --- /dev/null +++ b/tests/data/mtx/sampled_genes.tsv @@ -0,0 +1,7 @@ +ENSG00000131591 C1orf159 +ENSG00000272106 RP11-345P4.9 +ENSG00000187730 GABRD +ENSG00000041988 THAP3 +ENSG00000007923 DNAJC11 +ENSG00000198754 OXCT2 +ENSG00000103942 HOMER2 diff --git a/tests/data/mtx/sorted_matrix_header.mtx b/tests/data/mtx/sorted_matrix_header.mtx new file mode 100644 index 00000000..0e9026e7 --- /dev/null +++ b/tests/data/mtx/sorted_matrix_header.mtx @@ -0,0 +1,16 @@ +%%MatrixMarket matrix coordinate real general +% +7 25 14 +1 1 1.234124 +1 3 4.124124 +2 4 3.123421 +2 6 6.234521 +3 15 5.235252 +3 23 3.312412 +4 12 4.215522 +4 22 6.234123 +5 10 7.134613 +5 11 9.123512 +6 12 1.643221 +6 24 6.235125 +7 10 2.123616 From 5f2e5a613dd2ccb29e0f752cad0fd5f4a5e88547 Mon Sep 17 00:00:00 2001 From: bistline Date: Tue, 30 Aug 2022 17:18:40 -0400 Subject: [PATCH 2/6] updating examples --- scripts/scratch_ingest/render_expression_arrays.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/scratch_ingest/render_expression_arrays.py b/scripts/scratch_ingest/render_expression_arrays.py index d314624c..5cecbd67 100755 --- a/scripts/scratch_ingest/render_expression_arrays.py +++ b/scripts/scratch_ingest/render_expression_arrays.py @@ -11,6 +11,12 @@ python3 render_expression_arrays.py --matrix-file ../../tests/data/dense_expression_matrix.txt \ --cluster-file ../../tests/data/cluster_example.txt \ --cluster-name 'Dense Example' + +dense matrix with precision override: +python3 render_expression_arrays.py --matrix-file ../../tests/data/dense_expression_matrix.txt \ + --cluster-file ../../tests/data/cluster_example.txt \ + --cluster-name 'Dense Example' --precision 1 + sparse matrix: python3 render_expression_arrays.py --matrix-file ../../tests/data/mtx/sorted_matrix_header.mtx \ --genes-file ../../tests/data/mtx/sampled_genes.tsv \ From 021c7739858e231ab327b697c83de8b8c410a1ee Mon Sep 17 00:00:00 2001 From: bistline Date: Tue, 30 Aug 2022 17:24:32 -0400 Subject: [PATCH 3/6] taking out confusing log message --- scripts/scratch_ingest/render_expression_arrays.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/scratch_ingest/render_expression_arrays.py b/scripts/scratch_ingest/render_expression_arrays.py index 5cecbd67..b6b3140a 100755 --- a/scripts/scratch_ingest/render_expression_arrays.py +++ b/scripts/scratch_ingest/render_expression_arrays.py @@ -196,7 +196,6 @@ def process_dense_data(matrix_file_path, cluster_cells, cluster_name, data_dir): data_dir (String): output data directory """ with open_file(matrix_file_path) as matrix_file: - print(f"reading cells from {matrix_file.name}") header = matrix_file.readline().rstrip() values = re.split(COMMA_OR_TAB, header) cells = values[1:] From 684067dd0ded3f4af9b5bfe8f763295c168e3967 Mon Sep 17 00:00:00 2001 From: bistline Date: Tue, 30 Aug 2022 17:26:55 -0400 Subject: [PATCH 4/6] adding log message for precision --- scripts/scratch_ingest/render_expression_arrays.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/scratch_ingest/render_expression_arrays.py b/scripts/scratch_ingest/render_expression_arrays.py index b6b3140a..ba1403b5 100755 --- a/scripts/scratch_ingest/render_expression_arrays.py +++ b/scripts/scratch_ingest/render_expression_arrays.py @@ -268,6 +268,7 @@ def write_gene_scores(cluster_name, gene_name, exp_values, data_dir): cluster_cells = get_cluster_cells(cluster_file_path) if args.precision is not None: precision = int(args.precision) +print(f"using {precision} digits of precision for non-zero data") if args.genes_file is not None and args.barcodes_file is not None: print(f"reading {expression_file_path} as sparse matrix") genes_file = open_file(args.genes_file) From 979510759ec0ac05250a0779ceb7a2f1845e5e75 Mon Sep 17 00:00:00 2001 From: bistline Date: Tue, 30 Aug 2022 17:35:14 -0400 Subject: [PATCH 5/6] override 0 for 0.0 in dense parse --- scripts/scratch_ingest/render_expression_arrays.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/scratch_ingest/render_expression_arrays.py b/scripts/scratch_ingest/render_expression_arrays.py index ba1403b5..42107614 100755 --- a/scripts/scratch_ingest/render_expression_arrays.py +++ b/scripts/scratch_ingest/render_expression_arrays.py @@ -203,7 +203,7 @@ def process_dense_data(matrix_file_path, cluster_cells, cluster_name, data_dir): clean_line = line.rstrip() raw_vals = re.split(COMMA_OR_TAB, clean_line) gene_name = raw_vals.pop(0) - exp_vals = [round(float(val), precision) for val in raw_vals] + exp_vals = [round(float(val), precision) if float(val) != 0.0 else 0 for val in raw_vals] filtered_expression = filter_expression_for_cluster( cluster_cells, cells, exp_vals ) @@ -221,8 +221,7 @@ def filter_expression_for_cluster(cluster_cells, exp_cells, exp_scores): Returns: (List): Expression values, ordered by cluster_cells - """ - filtered_scores = [] + """ observed_exp = dict(zip(exp_cells, exp_scores)) return (observed_exp.get(cell, 0) for cell in cluster_cells) From cbef530f67ce980db8bb4fd03f00552a45d49f44 Mon Sep 17 00:00:00 2001 From: bistline Date: Wed, 31 Aug 2022 12:25:49 -0400 Subject: [PATCH 6/6] addressing PR comments --- .../scratch_ingest/render_expression_arrays.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/scripts/scratch_ingest/render_expression_arrays.py b/scripts/scratch_ingest/render_expression_arrays.py index 42107614..2c36880e 100755 --- a/scripts/scratch_ingest/render_expression_arrays.py +++ b/scripts/scratch_ingest/render_expression_arrays.py @@ -58,7 +58,8 @@ def is_gz_file(filepath): return test_f.read(2) == GZIP_MAGIC_NUM def open_file(filepath): - """Open a file with the correct reader + """ + Open a file with the correct reader Args: filepath (String): path to file to open @@ -72,7 +73,8 @@ def open_file(filepath): def make_data_dir(name): """ - Make a directory to put output files in + Make a directory to put output files in. Appends a UUID to the end of the directory to allow for + quick iteration and side-by-side comparison of outputs without manually moving directories Args: name (String): name of directory @@ -101,7 +103,7 @@ def get_cluster_cells(file_path): cell = re.split(COMMA_OR_TAB, row)[0] cells.append(cell) return cells - + def load_entities_as_list(file, column = None): """ Read an entire 10X feature/barcode file into a list for parsing sparse data @@ -115,12 +117,7 @@ def load_entities_as_list(file, column = None): if not column: return list(map(str.rstrip, file.readlines())) else: - entities = [] - for line in file: - entry = re.split(ALL_DELIM, line.strip())[column] - entities.append(entry) - return entities - + return list(re.split(ALL_DELIM, line.strip())[column] for line in file) def process_sparse_data( matrix_file_path, genes, barcodes, cluster_cells, cluster_name, data_dir