broadinstitute · bistline · Aug 21, 2024 · Aug 20, 2024 · Aug 21, 2024 · Aug 21, 2024
diff --git a/ingest/anndata_.py b/ingest/anndata_.py
@@ -15,19 +15,19 @@ def _field_template(self, field, precision):
 
 
 try:
-    from ingest_files import IngestFiles
+    from ingest_files import DataArray, IngestFiles
     from expression_files.expression_files import GeneExpression
-    from monitor import log_exception
+    from monitor import log_exception, bypass_mongo_writes
     from validation.validate_metadata import list_duplicates
 except ImportError:
     # Used when importing as external package, e.g. imports in single_cell_portal code
-    from .ingest_files import IngestFiles
+    from .ingest_files import DataArray, IngestFiles
     from .expression_files.expression_files import GeneExpression
-    from .monitor import log_exception
+    from .monitor import log_exception, bypass_mongo_writes
     from .validation.validate_metadata import list_duplicates
 
 
-class AnnDataIngestor(GeneExpression, IngestFiles):
+class AnnDataIngestor(GeneExpression, IngestFiles, DataArray):
     ALLOWED_FILE_TYPES = ['application/x-hdf5']
 
     def __init__(self, file_path, study_file_id, study_id, **kwargs):
@@ -57,6 +57,36 @@ def basic_validation(self):
         except ValueError:
             return False
 
+    def create_cell_data_arrays(self):
+        """Build the DataArray documents that hold the cell names of the raw data.
+
+        Returns a list of every document yielded by GeneExpression.create_data_arrays.
+        """
+        # filename denoting a raw 'fragment', used so ingest and downstream queries succeed
+        fragment_name = "h5ad_frag.matrix.raw.mtx.gz"
+        cell_names = list(self.obtain_adata().obs_names)
+        return list(
+            GeneExpression.create_data_arrays(
+                name=f"{fragment_name} Cells",
+                array_type="cells",
+                values=cell_names,
+                linear_data_type="Study",
+                linear_data_id=self.study_file_id,
+                cluster_name=fragment_name,
+                study_file_id=self.study_file_id,
+                study_id=self.study_id
+            )
+        )
+
+    def ingest_raw_cells(self):
+        """Load raw count cell name DataArray documents, or only log them when Mongo writes are bypassed"""
+        cell_arrays = self.create_cell_data_arrays()
+        if bypass_mongo_writes():
+            summary = f"Extracted {len(cell_arrays)} DataArray for {self.study_file_id}:{cell_arrays[0]['name']}"
+            IngestFiles.dev_logger.info(summary)
+        else:
+            self.load(cell_arrays, DataArray.COLLECTION_NAME)
+
     @staticmethod
     def generate_cluster_header(adata, clustering_name):
         """

diff --git a/ingest/expression_files/expression_files.py b/ingest/expression_files/expression_files.py
@@ -101,6 +101,16 @@ def is_raw_count_file(study_id, study_file_id, client):
         QUERY = {"_id": study_file_id, "study_id": study_id}
 
         study_file_doc = list(client[COLLECTION_NAME].find(QUERY)).pop()
+        # special handling of non-reference AnnData files to always return false
+        # this will allow normal extraction of expression data as raw count cells are already ingested during
+        # the "raw_counts" extract phase
+        if (
+          study_file_doc.get("file_type") == "AnnData" and 
+          "ann_data_file_info" in study_file_doc.keys() and 
+          not study_file_doc["ann_data_file_info"].get("reference_file")
+        ):
+            return False
+
         # Name of embedded document that holds 'is_raw_count_files is named expression_file_info.
         # If study files does not have document expression_file_info
         # field, "is_raw_count_files", will not exist.:

diff --git a/ingest/ingest_pipeline.py b/ingest/ingest_pipeline.py
@@ -54,6 +54,9 @@
 # Ingest AnnData - happy path processed expression data only extraction
 python ingest_pipeline.py  --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad  --extract "['processed_expression']"
 
+# Ingest AnnData - happy path raw count cell name only extraction
+python ingest_pipeline.py  --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad  --extract "['raw_counts']"
+
 # Ingest AnnData - happy path cluster and metadata extraction
 python ingest_pipeline.py  --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad  --extract "['cluster', 'metadata']" --obsm-keys "['X_umap','X_tsne']"
 
@@ -537,6 +540,9 @@ def extract_from_anndata(self):
                 "extract"
             ):
                 self.anndata.generate_processed_matrix(self.anndata.adata)
+
+            if self.kwargs.get('extract') and "raw_counts" in self.kwargs.get('extract'):
+                self.anndata.ingest_raw_cells()
             self.report_validation("success")
             return 0
         # scanpy unable to open AnnData file