Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
285 changes: 285 additions & 0 deletions scripts/scratch_ingest/render_expression_arrays.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
#! /usr/bin/env python3

"""render_expression_arrays.py
extract gene-level expression data from dense/sparse matrix files and process in the
context of a cluster file's cells.
this mimics the `expression` data array in visualization API responses

EXAMPLES

dense matrix:
python3 render_expression_arrays.py --matrix-file ../../tests/data/dense_expression_matrix.txt \
--cluster-file ../../tests/data/cluster_example.txt \
--cluster-name 'Dense Example'

dense matrix with precision override:
python3 render_expression_arrays.py --matrix-file ../../tests/data/dense_expression_matrix.txt \
--cluster-file ../../tests/data/cluster_example.txt \
--cluster-name 'Dense Example' --precision 1

sparse matrix:
python3 render_expression_arrays.py --matrix-file ../../tests/data/mtx/sorted_matrix_header.mtx \
--genes-file ../../tests/data/mtx/sampled_genes.tsv \
--barcodes-file ../../tests/data/mtx/barcodes.tsv \
--cluster-file ../../tests/data/mtx/cluster_mtx_barcodes.tsv \
--cluster-name 'Sparse Example'
"""

import json
import os
import re
import argparse
import uuid
import gzip
import time
Comment on lines +28 to +34
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 for only using standard library, at least early on!


# regex to split on commas and tabs (delimiters used by dense matrix / cluster files)
COMMA_OR_TAB = r"[,\t]"

# regex to split on comma or any whitespace (\s already covers tab and space)
ALL_DELIM = r"[\s,\t]"

# Gzip magic number: first two bytes of any gzip-compressed file (RFC 1952)
GZIP_MAGIC_NUM = b'\x1f\x8b'

# default number of digits non-zero expression values are rounded to;
# reassigned at runtime when the --precision CLI argument is supplied
precision = 3
Comment on lines +45 to +46
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like that this is somewhat abstracted. Per chat, a "contextual precision" approach might yield 10-100x smaller aggregate expression data sizes, as well as faster JSON.parse times for Image Pipeline and interactive end-user clients.

Abstracting this a bit earlier as you've done here makes that potential optimization that much easier.


def is_gz_file(filepath):
    """
    Check whether a file is gzip-compressed by comparing its first two bytes
    against the gzip 'magic number'
    Args:
        filepath (String): path to file to check

    Returns:
        (Bool): True if the file is gzipped, False otherwise
    """
    with open(filepath, 'rb') as handle:
        leading_bytes = handle.read(2)
    return leading_bytes == GZIP_MAGIC_NUM

def open_file(filepath):
    """
    Open a file with the correct reader, transparently handling gzip
    Args:
        filepath (String): path to file to open

    Returns:
        (TextIOWrapper): file handle opened for reading in text mode
    """
    # gzip.open in 'rt' mode yields decoded text, matching plain open()
    reader = gzip.open(filepath, 'rt') if is_gz_file(filepath) else open(filepath, 'r')
    return reader

def make_data_dir(name):
    """
    Create an output directory, appending a UUID so repeated runs can be
    compared side-by-side without manually moving previous results
    Args:
        name (String): base name of directory

    Returns:
        (String): name of created directory with UUID appended to the end
    """
    dirname = f"{name}-{uuid.uuid4()}"
    print(f"creating data directory at {dirname}")
    os.mkdir(dirname)
    return dirname

def get_cluster_cells(file_path):
    """
    Collect cell names from a cluster file (1st column, starting at line 3)
    Args:
        file_path (String): absolute path to cluster_file

    Returns:
        (List): cluster cell names
    """
    with open_file(file_path) as cluster_file:
        # skip the NAME and TYPE header rows
        for _ in range(2):
            cluster_file.readline()
        return [re.split(COMMA_OR_TAB, row)[0] for row in cluster_file]

def load_entities_as_list(file, column = None):
    """
    Read an entire 10X feature/barcode file into a list for parsing sparse data
    Args:
        file (TextIOWrapper): open file object
        column (Integer): index of the column to extract from each line;
            None (default) returns whole lines with trailing whitespace stripped

    Returns:
        (List): entities from file
    """
    print(f"reading entities from {file.name}")
    # BUG FIX: `if not column` treated column=0 as "no column" and silently
    # returned whole lines; test for None explicitly so column 0 works
    if column is None:
        return [line.rstrip() for line in file]
    # split on comma or any whitespace and keep the requested column
    return [re.split(r"[\s,\t]", line.strip())[column] for line in file]

def process_sparse_data(
    matrix_file_path, genes, barcodes, cluster_cells, cluster_name, data_dir
):
    """
    Main handler to read sparse mtx data and process entries at the gene level
    Writes one JSON array per gene in the genes file, emitting empty expression
    arrays for any gene that has no entries in the matrix
    Args:
        matrix_file_path (String): path to matrix file
        genes (List): gene names
        barcodes (List): cell names
        cluster_cells (List): cell names from cluster file
        cluster_name (String): name of cluster object
        data_dir (String): output data directory
    """
    observed_cells = []
    exp_vals = []
    current_gene = 0
    # cache a copy of an empty expression array to write out for missing genes
    empty_exp = [0] * len(cluster_cells)

    def flush_current_gene():
        # write out the values accumulated for the gene currently being read
        gene_name = genes[current_gene - 1]
        print(f"preparing data for {gene_name} with {len(exp_vals)} entries... ", end = '')
        filtered_expression = filter_expression_for_cluster(
            cluster_cells, observed_cells, exp_vals
        )
        write_gene_scores(cluster_name, gene_name, filtered_expression, data_dir)

    def write_missing_genes(start, stop):
        # emit empty expression arrays for gene indices in [start, stop)
        for idx in range(start, stop):
            missing_gene = genes[idx - 1]
            print(f"####### {missing_gene} has no expression... ", end = '')
            write_gene_scores(cluster_name, missing_gene, empty_exp, data_dir)

    with open_file(matrix_file_path) as matrix_file:
        # skip the MatrixMarket banner, comment line, and dimensions line
        for _ in range(3):
            matrix_file.readline()
        print("processing sparse data")
        for line in matrix_file:
            gene_idx, barcode_idx, exp_val = extract_sparse_line(line)
            if gene_idx != current_gene:
                if current_gene != 0:
                    # flush the previous gene before starting the next one
                    flush_current_gene()
                # BUG FIX: also covers genes skipped before the first entry,
                # not only gaps between observed genes
                write_missing_genes(current_gene + 1, gene_idx)
                current_gene = gene_idx
                observed_cells = []
                exp_vals = []
            observed_cells.append(barcodes[barcode_idx - 1])
            exp_vals.append(exp_val)
    if current_gene != 0:
        # BUG FIX: the final gene's data was previously never written out
        flush_current_gene()
    # BUG FIX: trailing genes with no expression also get empty arrays
    write_missing_genes(current_gene + 1, len(genes) + 1)

def extract_sparse_line(line, ndigits=None):
    """
    Process a single line from a sparse matrix and extract values
    Args:
        line (String): single line from matrix file, e.g. "1 15 2.345678"
        ndigits (Integer): digits of rounding precision for the expression
            value; defaults to the module-level `precision` setting

    Returns:
        (List): [gene index (int), barcode index (int), rounded expression (float)]
    """
    if ndigits is None:
        # resolve at call time so a CLI --precision override is respected
        ndigits = precision
    gene_idx, barcode_idx, raw_exp = line.rstrip().split(' ')
    return [int(gene_idx), int(barcode_idx), round(float(raw_exp), ndigits)]


def process_dense_data(matrix_file_path, cluster_cells, cluster_name, data_dir):
    """
    Main handler to read dense matrix data and process entries at the gene level
    Args:
        matrix_file_path (String): path to dense matrix file
        cluster_cells (List): cell names from cluster file
        cluster_name (String): name of cluster object
        data_dir (String): output data directory
    """
    with open_file(matrix_file_path) as matrix_file:
        # header row: first token is the gene column label, the rest are cell names
        header_vals = re.split(COMMA_OR_TAB, matrix_file.readline().rstrip())
        cells = header_vals[1:]
        for row in matrix_file:
            gene_name, *raw_vals = re.split(COMMA_OR_TAB, row.rstrip())
            # round non-zero values; keep zeroes as int 0 for compact JSON output
            exp_vals = []
            for val in raw_vals:
                score = float(val)
                exp_vals.append(round(score, precision) if score != 0.0 else 0)
            filtered_expression = filter_expression_for_cluster(
                cluster_cells, cells, exp_vals
            )
            write_gene_scores(cluster_name, gene_name, filtered_expression, data_dir)

def filter_expression_for_cluster(cluster_cells, exp_cells, exp_scores):
    """
    Yield expression scores, filtered & ordered by a List of cluster cells
    Any cluster cell absent from the expression data is assigned a value of 0

    Args:
        cluster_cells (List): cluster cell names
        exp_cells (List): expression cell names
        exp_scores (List): expression values, in the same order as exp_cells

    Returns:
        (Generator): expression values, ordered by cluster_cells
    """
    observed_exp = {cell: score for cell, score in zip(exp_cells, exp_scores)}
    for cell in cluster_cells:
        yield observed_exp.get(cell, 0)


def write_gene_scores(cluster_name, gene_name, exp_values, data_dir):
    """
    Write a gzipped JSON array of expression values
    Filename uses {cluster_name}--{gene_name}.json.gz format

    Args:
        cluster_name (String): Name of cluster
        gene_name (String): Name of gene
        exp_values (List): expression values
        data_dir (String): output data directory to write in

    Returns:
        (File): JSON file as a List of expression values
    """
    start_write = time.time()
    print(f"writing {gene_name} data... ", end = '')
    outpath = f"{data_dir}/{cluster_name}--{gene_name}.json.gz"
    # compact separators keep the JSON payload as small as possible
    with gzip.open(outpath, "wt") as json_file:
        json.dump(list(exp_values), json_file, separators=(',', ':'))
    elapsed = time.time() - start_write
    print(f"{round(elapsed, 2)}s")

# parse cli arguments and run extraction; guarded so importing this module
# (e.g. from tests) does not trigger argument parsing or file I/O
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cluster-file', help='path to cluster file', required=True)
    parser.add_argument('--cluster-name', help='name of cluster object', required=True)
    parser.add_argument('--matrix-file', help='path to matrix file', required=True)
    parser.add_argument('--precision', type=int,
                        help='number of digits of precision for non-zero data')
    parser.add_argument('--genes-file', help='path to genes file (None for dense matrix files)')
    parser.add_argument('--barcodes-file', help='path to barcodes file (None for dense matrix files)')
    args = parser.parse_args()

    # main execution, set cluster name
    start_time = time.time()
    cluster_file_path = args.cluster_file
    expression_file_path = args.matrix_file
    # replace all non-word characters so the name is safe to use in file paths
    sanitized_cluster_name = re.sub(r'\W', '_', args.cluster_name)
    data_dir = make_data_dir(sanitized_cluster_name)
    cluster_cells = get_cluster_cells(cluster_file_path)
    if args.precision is not None:
        precision = args.precision
        print(f"using {precision} digits of precision for non-zero data")
    if args.genes_file is not None and args.barcodes_file is not None:
        print(f"reading {expression_file_path} as sparse matrix")
        # BUG FIX: close genes/barcodes file handles (previously leaked)
        with open_file(args.genes_file) as genes_file:
            genes = load_entities_as_list(genes_file, 1)
        with open_file(args.barcodes_file) as barcodes_file:
            barcodes = load_entities_as_list(barcodes_file)
        process_sparse_data(
            expression_file_path, genes, barcodes, cluster_cells, sanitized_cluster_name,
            data_dir
        )
    else:
        print(f"reading {expression_file_path} as dense matrix")
        process_dense_data(
            expression_file_path, cluster_cells, sanitized_cluster_name, data_dir
        )
    end_time = time.time()
    # BUG FIX: divide before rounding so the reported minutes are accurate
    # (previously rounded seconds to 3 places, then divided by 60)
    time_in_min = round((end_time - start_time) / 60, 3)
    print(f"completed, total runtime in minutes: {time_in_min}")
27 changes: 27 additions & 0 deletions tests/data/mtx/cluster_mtx_barcodes.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
NAME X Y
TYPE numeric numeric
AACGTTGGTTAAAGTG-1 34.472 32.211
AGTAGTCAGAGCTATA-1 15.975 10.043
ATCTGCCCATACTCTT-1 -11.688 -53.645
ATGCGATCAAGTTGTC-1 30.04 31.138
ATTTCTGTCCTTTCGG-1 23.862 33.092
CAGTAACGTAAACACA-1 -39.07 -14.64
CCAATCCCATGAAGTA-1 40.039 27.206
CGTAGGCCAGCGAACA-1 28.755 27.187
CTAACTTGTTCCATGA-1 -48.601 -13.512
CTCCTAGGTCTCATCC-1 14.653 27.832
CTCGGAGTCGTAGGAG-1 20.603 32.071
CTGAAACAGGGAAACA-1 -10.333 -51.733
GACTACAGTAACGCGA-1 -52.966 -12.484
GCATACAGTACCGTTA-1 38.513 26.969
GCGAGAACAAGAGGCT-1 12.838 13.047
GCTTCCACAGCTGTAT-1 33.092 28.755
GCTTGAAAGAGCTGCA-1 10.043 38.513
GGACGTCGTTAAAGAC-1 32.211 40.039
GTACTCCCACTGTTAG-1 27.187 14.653
GTCACAAAGTAAGTAC-1 -53.645 -39.07
TAGCCGGAGAGACGAA-1 -13.512 -48.601
TCAACGACACAGCCCA-1 31.138 14.653
TCGGGACGTCTCAACA-1 27.153 24.512
TCTCTAACATGCTGGC-1 1.234 31.123
TGAGCCGGTGATAAGT-1 -15.352 -1.421
7 changes: 7 additions & 0 deletions tests/data/mtx/sampled_genes.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
ENSG00000131591 C1orf159
ENSG00000272106 RP11-345P4.9
ENSG00000187730 GABRD
ENSG00000041988 THAP3
ENSG00000007923 DNAJC11
ENSG00000198754 OXCT2
ENSG00000103942 HOMER2
16 changes: 16 additions & 0 deletions tests/data/mtx/sorted_matrix_header.mtx
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
%%MatrixMarket matrix coordinate real general
%
7 25 14
1 1 1.234124
1 3 4.124124
2 4 3.123421
2 6 6.234521
3 15 5.235252
3 23 3.312412
4 12 4.215522
4 22 6.234123
5 10 7.134613
5 11 9.123512
6 12 1.643221
6 24 6.235125
7 10 2.123616