Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 35 additions & 5 deletions ingest/anndata_.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,19 @@ def _field_template(self, field, precision):


try:
from ingest_files import IngestFiles
from ingest_files import DataArray, IngestFiles
from expression_files.expression_files import GeneExpression
from monitor import log_exception
from monitor import log_exception, bypass_mongo_writes
from validation.validate_metadata import list_duplicates
except ImportError:
# Used when importing as external package, e.g. imports in single_cell_portal code
from .ingest_files import IngestFiles
from .ingest_files import DataArray, IngestFiles
from .expression_files.expression_files import GeneExpression
from .monitor import log_exception
from .monitor import log_exception, bypass_mongo_writes
from .validation.validate_metadata import list_duplicates


class AnnDataIngestor(GeneExpression, IngestFiles):
class AnnDataIngestor(GeneExpression, IngestFiles, DataArray):
ALLOWED_FILE_TYPES = ['application/x-hdf5']

def __init__(self, file_path, study_file_id, study_id, **kwargs):
Expand Down Expand Up @@ -57,6 +57,36 @@ def basic_validation(self):
except ValueError:
return False

def create_cell_data_arrays(self):
    """Build the cell name DataArray documents for raw data.

    Cell names are taken from ``obs_names`` of the object returned by
    ``self.obtain_adata()`` and handed to ``GeneExpression.create_data_arrays``.

    :return: list holding every item produced by ``GeneExpression.create_data_arrays``
    """
    cell_names = list(self.obtain_adata().obs_names)
    # the filename denotes a raw 'fragment' to allow successful ingest and downstream queries
    fragment_name = "h5ad_frag.matrix.raw.mtx.gz"
    array_kwargs = {
        "name": f"{fragment_name} Cells",
        "array_type": "cells",
        "values": cell_names,
        "linear_data_type": "Study",
        "linear_data_id": self.study_file_id,
        "cluster_name": fragment_name,
        "study_file_id": self.study_file_id,
        "study_id": self.study_id,
    }
    # materialize whatever create_data_arrays produces into a plain list
    return list(GeneExpression.create_data_arrays(**array_kwargs))

def ingest_raw_cells(self):
    """Insert raw count cells into MongoDB.

    Builds the cell name DataArray documents with ``create_cell_data_arrays``
    and loads them into ``DataArray.COLLECTION_NAME``. When
    ``bypass_mongo_writes()`` is truthy nothing is loaded; a summary of what
    was extracted is written to the dev logger instead.
    """
    arrays = self.create_cell_data_arrays()
    if not bypass_mongo_writes():
        self.load(arrays, DataArray.COLLECTION_NAME)
        return

    dev_msg = f"Extracted {len(arrays)} DataArray for {self.study_file_id}"
    # Only name the first array when one exists: indexing an empty list
    # would raise IndexError and turn a log-only code path into a crash.
    if arrays:
        dev_msg += f":{arrays[0]['name']}"
    IngestFiles.dev_logger.info(dev_msg)

@staticmethod
def generate_cluster_header(adata, clustering_name):
"""
Expand Down
10 changes: 10 additions & 0 deletions ingest/expression_files/expression_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,16 @@ def is_raw_count_file(study_id, study_file_id, client):
QUERY = {"_id": study_file_id, "study_id": study_id}

study_file_doc = list(client[COLLECTION_NAME].find(QUERY)).pop()
# special handling of non-reference AnnData files to always return false
# this will allow normal extraction of expression data as raw count cells are already ingested during
# the "raw_counts" extract phase
if (
study_file_doc.get("file_type") == "AnnData" and
"ann_data_file_info" in study_file_doc.keys() and
not study_file_doc["ann_data_file_info"].get("reference_file")
):
return False

# The embedded document that holds "is_raw_count_files" is named expression_file_info.
# If the study file does not have an expression_file_info document,
# the field "is_raw_count_files" will not exist.
Expand Down
6 changes: 6 additions & 0 deletions ingest/ingest_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@
# Ingest AnnData - happy path processed expression data only extraction
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['processed_expression']"

# Ingest AnnData - happy path raw count cell name only extraction
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['raw_counts']"

# Ingest AnnData - happy path cluster and metadata extraction
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['cluster', 'metadata']" --obsm-keys "['X_umap','X_tsne']"

Expand Down Expand Up @@ -537,6 +540,9 @@ def extract_from_anndata(self):
"extract"
):
self.anndata.generate_processed_matrix(self.anndata.adata)

if self.kwargs.get('extract') and "raw_counts" in self.kwargs.get('extract'):
self.anndata.ingest_raw_cells()
self.report_validation("success")
return 0
# scanpy unable to open AnnData file
Expand Down
Loading