broadinstitute · jlchang · Aug 1, 2022 · Jul 14, 2022 · Jul 18, 2022 · Jul 18, 2022
diff --git a/ingest/cli_parser.py b/ingest/cli_parser.py
@@ -331,6 +331,20 @@ def create_parser():
         "--gene-file", help="Path to .genes.tsv file"
     )
 
+    # h5ad subparsers
+    parser_h5ad = subparsers.add_parser(
+        "ingest_h5ad", help="Indicates that h5ad file is being ingested"
+    )
+
+    parser_h5ad.add_argument(
+        "--ingest-h5ad",
+        required=True,
+        action="store_true",
+        help="Indicates that ingest of h5ad file should be invoked",
+    )
+
+    parser_h5ad.add_argument("--h5ad-file", required=True, help="Path to h5ad file")
+
     return parser
 
 

diff --git a/ingest/de.py b/ingest/de.py
@@ -51,8 +51,6 @@ def __init__(
         self.kwargs = kwargs
         self.accession = self.kwargs["study_accession"]
         self.annot_scope = self.kwargs["annotation_scope"]
-        # only used in output filename, replacing non-alphanumeric with underscores
-        self.cluster_name = re.sub(r'\W', '_', self.kwargs["name"])
         self.method = self.kwargs["method"]
 
         if matrix_file_type == "mtx":
@@ -183,6 +181,11 @@ def subset_adata(adata, de_cells):
     def execute_de(self):
         print(f'dev_info: Starting DE for {self.accession}')
         try:
+            # only used in output filename, replacing non-alphanumeric with underscores
+            # except '+' replaced with 'pos'
+            self.cluster_name = DifferentialExpression.sanitize_strings(
+                self.kwargs["name"]
+            )
             if self.matrix_file_type == "mtx":
                 DifferentialExpression.de_logger.info("preparing DE on sparse matrix")
                 self.run_scanpy_de(
@@ -389,13 +392,12 @@ def run_scanpy_de(
         DifferentialExpression.de_logger.info("Gathering DE annotation labels")
         groups = np.unique(adata.obs[annotation]).tolist()
         for group in groups:
-            clean_group = re.sub(r'\W', '_', group)
-            clean_annotation = re.sub(r'\W', '_', annotation)
+            clean_group = DifferentialExpression.sanitize_strings(group)
+            clean_annotation = DifferentialExpression.sanitize_strings(annotation)
             DifferentialExpression.de_logger.info(f"Writing DE output for {group}")
             rank = sc.get.rank_genes_groups_df(adata, key=rank_key, group=group)
             if DifferentialExpression.delimiter_in_gene_name(rank):
                 DifferentialExpression.extract_gene_id_for_out_file(rank)
-
             out_file = f'{cluster_name}--{clean_annotation}--{clean_group}--{annot_scope}--{method}.tsv'
             # Round numbers to 4 significant digits while respecting fixed point
             # and scientific notation (note: trailing zeros are removed)
@@ -408,10 +410,23 @@ def run_scanpy_de(
 
         DifferentialExpression.de_logger.info("DE processing complete")
 
+    @staticmethod
+    def sanitize_strings(input_string):
+        """
+        Replace each '+' with 'pos', then each remaining non-word character with
+        '_', so "CD16+ monocyte" and "CD16- monocyte" sanitize to distinct strings
+        """
+        plus_converted_string = input_string.replace('+', 'pos')
+        return re.sub(r'\W', '_', plus_converted_string)
+
     @staticmethod
     def string_for_output_match(arguments):
-        cleaned_cluster_name = re.sub(r'\W', '_', arguments["cluster_name"])
-        cleaned_annotation_name = re.sub(r'\W', '_', arguments["annotation_name"])
+        """
+        Build the "<cluster>--<annotation>*.tsv" pattern from sanitized names
+        """
+        sanitize = DifferentialExpression.sanitize_strings
+        cleaned_cluster_name = sanitize(arguments["cluster_name"])
+        cleaned_annotation_name = sanitize(arguments["annotation_name"])
         files_to_match = f"{cleaned_cluster_name}--{cleaned_annotation_name}*.tsv"
         return files_to_match
 

diff --git a/ingest/h5ad.py b/ingest/h5ad.py
@@ -0,0 +1,37 @@
+try:
+    from ingest_files import IngestFiles
+    from monitor import log_exception
+except ImportError:
+    # Used when importing as external package, e.g. imports in single_cell_portal code
+    from .ingest_files import IngestFiles
+    from .monitor import log_exception
+
+
+class H5adIngestor(IngestFiles):
+    """Ingestor for h5ad files; currently only checks the file can be opened."""
+
+    ALLOWED_FILE_TYPES = ['application/x-hdf5']
+
+    def __init__(self, file_path, study_file_id, study_id, **kwargs):
+        IngestFiles.__init__(
+            self, file_path, allowed_file_types=self.ALLOWED_FILE_TYPES
+        )
+
+    def obtain_adata(self):
+        """Open the h5ad file, store the result in self.adata and return it."""
+        self.adata = self.open_file(self.file_path)[0]
+        print(self.adata)
+        IngestFiles.dev_logger.info(str(self.adata))
+        return self.adata
+
+    def validate(self):
+        """
+        Basic validation: return True if the file can be opened by scanpy,
+        False if opening it raises ValueError
+        """
+        try:
+            self.obtain_adata()
+            return True
+        except ValueError:
+            return False
+
diff --git a/ingest/ingest_files.py b/ingest/ingest_files.py
@@ -12,6 +12,7 @@
 from dataclasses import dataclass
 from typing import Dict, Generator, List, Tuple, Union  # noqa: F401
 import warnings
+import scanpy as sc
 
 
 import pandas as pd  # NOqa: F821
@@ -21,9 +22,9 @@
 # import google.cloud.logging
 
 try:
-    from monitor import setup_logger
+    from monitor import setup_logger, log_exception
 except ImportError:
-    from .monitor import setup_logger
+    from .monitor import setup_logger, log_exception
 
 
 @dataclass
@@ -75,13 +76,16 @@ class IngestFiles:
     # General logger for class
     # Logger provides more details
     dev_logger = setup_logger(__name__, "log.txt", format="support_configs")
+    user_logger = setup_logger(__name__ + ".user_logger", "user_log.txt")
     # Filter out warnings about using end user credentials when running ingest_pipeline as dev
     warnings.filterwarnings(
         "ignore", "Your application has authenticated using end user credentials"
     )
 
     def __init__(self, file_path, allowed_file_types):
         self.file_path = file_path
+        # define filetype for h5ad file extension
+        mimetypes.add_type('application/x-hdf5', '.h5ad')
         # File is remote (in GCS bucket) when running via PAPI,
         # and typically local when developing
         self.is_remote_file = IngestFiles.is_remote_file(file_path)
@@ -195,6 +199,7 @@ def open_file(self, file_path, open_as=None, start_point: int = 0, **kwargs):
             "text/plain": self.open_txt,
             "text/tab-separated-values": self.open_tsv,
             "dataframe": self.open_pandas,
+            "application/x-hdf5": self.open_h5ad,
         }
 
         if start_point != 0:
@@ -214,6 +219,11 @@ def open_file(self, file_path, open_as=None, start_point: int = 0, **kwargs):
                         file_connections.get(file_type)(open_file, file_type, **kwargs),
                         open_file,
                     )
+                elif file_type == "application/x-hdf5":
+                    return (
+                        file_connections.get(file_type)(file_path, **kwargs),
+                        open_file,
+                    )
                 else:
                     return (
                         file_connections.get(file_type)(open_file, **kwargs),
@@ -227,9 +237,12 @@ def open_file(self, file_path, open_as=None, start_point: int = 0, **kwargs):
                     open_file,
                 )
         else:
-            raise ValueError(
-                f"Unsupported file format. Allowed file types are: {' '.join(self.allowed_file_types)}"
+            msg = (
+                f"Unsupported file format. Allowed file MIME types are: "
+                f"{' '.join(self.allowed_file_types)}"
             )
+            log_exception(IngestFiles.dev_logger, IngestFiles.user_logger, msg)
+            raise ValueError(msg)
 
     # Inherited function
     def extract(self):
@@ -298,6 +311,15 @@ def open_pandas(self, file_path, file_type, **kwargs):
         else:
             raise ValueError("File must be tab or comma delimited")
 
+    def open_h5ad(self, file_path, **kwargs):
+        """Opens file as AnnData object; raises ValueError if it cannot be read"""
+        try:
+            return sc.read_h5ad(file_path, backed='r')
+        except OSError as e:
+            msg = f"Scanpy cannot read file, \"{file_path}\"."
+            log_exception(IngestFiles.dev_logger, IngestFiles.user_logger, msg)
+            raise ValueError(msg) from e
+
     def open_csv(self, opened_file_object, **kwargs):
         """Opens csv file"""
         csv.register_dialect(

diff --git a/ingest/ingest_pipeline.py b/ingest/ingest_pipeline.py
@@ -26,8 +26,8 @@
 # Ingest dense file
 python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_expression --taxon-name 'Homo sapiens' --taxon-common-name human --ncbi-taxid 9606 --matrix-file ../tests/data/dense_matrix_19_genes_1000_cells.txt --matrix-file-type dense
 
-# Ingest loom file
-python ingest_pipeline.py  --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_expression --matrix-file ../tests/data/test_loom.loom  --matrix-file-type loom --taxon-name 'Homo Sapiens' --taxon-common-name human
+# Ingest h5ad file
+python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_h5ad --ingest-h5ad --h5ad-file ../tests/data/h5ad/test.h5ad
 
 # Subsample cluster and metadata file
 python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_subsample --cluster-file ../tests/data/test_1k_cluster_Data.csv --name custer1 --cell-metadata-file ../tests/data/test_1k_metadata_Data.csv --subsample
@@ -82,6 +82,7 @@
     from clusters import Clusters
     from expression_files.mtx import MTXIngestor
     from expression_files.dense_ingestor import DenseIngestor
+    from h5ad import H5adIngestor
     from monitor import setup_logger, log_exception
     from de import DifferentialExpression
 
@@ -102,6 +103,7 @@
     from .clusters import Clusters
     from .expression_files.dense_ingestor import DenseIngestor
     from .expression_files.mtx import MTXIngestor
+    from .h5ad import H5adIngestor
     from .cli_parser import create_parser, validate_arguments
     from .de import DifferentialExpression
 
@@ -125,6 +127,7 @@ def __init__(
         matrix_file_type: str = None,
         cell_metadata_file: str = None,
         cluster_file: str = None,
+        h5ad_file: str = None,
         subsample=False,
         ingest_cell_metadata=False,
         ingest_cluster=False,
@@ -144,6 +147,7 @@ def __init__(
         else:
             self.db = None
         self.cluster_file = cluster_file
+        self.h5ad_file = h5ad_file
         self.kwargs = kwargs
         self.cell_metadata_file = cell_metadata_file
         self.props = {}
@@ -474,6 +478,20 @@ def subsample(self):
                 return 1
         return 0
 
+    @custom_metric(config.get_metric_properties)
+    def ingest_h5ad(self):
+        """Validate the h5ad file; return 0 on success, 1 on failure."""
+        # H5adIngestor's signature is (file_path, study_file_id, study_id)
+        self.h5ad = H5adIngestor(
+            self.h5ad_file, self.study_file_id, self.study_id, **self.kwargs
+        )
+        if self.h5ad.validate():
+            self.report_validation("success")
+            return 0
+        # scanpy unable to open h5ad file
+        self.report_validation("failure")
+        return 1
+
     def calculate_de(self):
         """ Run differential expression analysis """
         try:
@@ -523,6 +541,11 @@ def run_ingest(ingest, arguments, parsed_args):
             config.set_parent_event_name("ingest-pipeline:subsample:ingest")
             status_subsample = ingest.subsample()
             status.append(status_subsample)
+    elif "ingest_h5ad" in arguments:
+        if arguments["ingest_h5ad"]:
+            config.set_parent_event_name("ingest-pipeline:h5ad:ingest")
+            status_h5ad = ingest.ingest_h5ad()
+            status.append(status_h5ad)
     elif "differential_expression" in arguments:
         config.set_parent_event_name("ingest-pipeline:differential-expression")
         status_de = ingest.calculate_de()

diff --git a/tests/data/h5ad/bad.h5 b/tests/data/h5ad/bad.h5
@@ -0,0 +1 @@
+non-empty
diff --git a/tests/data/h5ad/bad.h5ad b/tests/data/h5ad/bad.h5ad
diff --git a/tests/data/h5ad/test.h5ad b/tests/data/h5ad/test.h5ad
diff --git a/tests/test_de.py b/tests/test_de.py
@@ -35,10 +35,10 @@ def find_expected_files(labels, cluster_name, annotation, scope, method):
     """ Check that files were created for all expected annotation labels
     """
     found = []
-    sanitized_cluster_name = re.sub(r'\W', '_', cluster_name)
-    sanitized_annotation = re.sub(r'\W', '_', annotation)
+    sanitized_cluster_name = DifferentialExpression.sanitize_strings(cluster_name)
+    sanitized_annotation = DifferentialExpression.sanitize_strings(annotation)
     for label in labels:
-        sanitized_label = re.sub(r'\W', '_', label)
+        sanitized_label = DifferentialExpression.sanitize_strings(label)
         expected_file = f"{sanitized_cluster_name}--{sanitized_annotation}--{sanitized_label}--{scope}--{method}.tsv"
         assert os.path.exists(expected_file)
         found.append(expected_file)
@@ -185,16 +185,26 @@ def test_delimiter_in_gene_name(self):
     def test_filename_sanitation(self):
         """ Bugfix (SCP-4459) so sanitization does not collapse adjacent non-alphanumeric characters to
             single underscores, see also SCP-4455 for manual fix
+
+            Bugfix (SCP-4533) convert '+' to 'pos' so labels differing in only +/-
+            do not clobber and cause display of incorrect results for one of the labels.
         """
+        test_string = "foo++)"
+        plus_converted_result = DifferentialExpression.sanitize_strings(test_string)
+        self.assertEqual(
+            plus_converted_result,
+            "foopospos_",
+            "unexpected result from sanitation sanitize_strings function",
+        )
+
         arguments = {
-            "cluster_name": "UMAP, pre-QC all cells (complexity greater than or equal to 1000)",
+            "cluster_name": "UMAP+, pre-QC all cells (complexity greater than or equal to 1000)",
             "annotation_name": "cell..type",
         }
         files_to_match = DifferentialExpression.string_for_output_match(arguments)
-        print(files_to_match)
         self.assertEqual(
             files_to_match,
-            "UMAP__pre_QC_all_cells__complexity_greater_than_or_equal_to_1000_--cell__type*.tsv",
+            "UMAPpos__pre_QC_all_cells__complexity_greater_than_or_equal_to_1000_--cell__type*.tsv",
             "unexpected result from sanitation function",
         )
 
@@ -472,9 +482,7 @@ def test_de_process_sanitize(self):
             f"expected five annotation labels for {test_annotation}",
         )
 
-        expected_file = (
-            "UMAP__pre_QC--misc__cellaneous--cholinergic__neuron_--study--wilcoxon.tsv"
-        )
+        expected_file = "UMAP__pre_QC--miscposposcellaneous--cholinergic__neuron_--study--wilcoxon.tsv"
 
         # confirm expected results filename was generated in found result files
         self.assertIn(