Skip to content
14 changes: 14 additions & 0 deletions ingest/cli_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,20 @@ def create_parser():
"--gene-file", help="Path to .genes.tsv file"
)

# h5ad subparsers
parser_h5ad = subparsers.add_parser(
"ingest_h5ad", help="Indicates that h5ad file is being ingested"
)

parser_h5ad.add_argument(
"--ingest-h5ad",
required=True,
action="store_true",
help="Indicates that ingest of h5ad file should be invoked",
)

parser_h5ad.add_argument("--h5ad-file", required=True, help="Path to h5ad file")

return parser


Expand Down
29 changes: 22 additions & 7 deletions ingest/de.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,6 @@ def __init__(
self.kwargs = kwargs
self.accession = self.kwargs["study_accession"]
self.annot_scope = self.kwargs["annotation_scope"]
# only used in output filename, replacing non-alphanumeric with underscores
self.cluster_name = re.sub(r'\W', '_', self.kwargs["name"])
self.method = self.kwargs["method"]

if matrix_file_type == "mtx":
Expand Down Expand Up @@ -183,6 +181,11 @@ def subset_adata(adata, de_cells):
def execute_de(self):
print(f'dev_info: Starting DE for {self.accession}')
try:
# only used in output filename, replacing non-alphanumeric with underscores
# except '+' replaced with 'pos'
self.cluster_name = DifferentialExpression.sanitize_strings(
self.kwargs["name"]
)
if self.matrix_file_type == "mtx":
DifferentialExpression.de_logger.info("preparing DE on sparse matrix")
self.run_scanpy_de(
Expand Down Expand Up @@ -389,13 +392,12 @@ def run_scanpy_de(
DifferentialExpression.de_logger.info("Gathering DE annotation labels")
groups = np.unique(adata.obs[annotation]).tolist()
for group in groups:
clean_group = re.sub(r'\W', '_', group)
clean_annotation = re.sub(r'\W', '_', annotation)
clean_group = DifferentialExpression.sanitize_strings(group)
clean_annotation = DifferentialExpression.sanitize_strings(annotation)
DifferentialExpression.de_logger.info(f"Writing DE output for {group}")
rank = sc.get.rank_genes_groups_df(adata, key=rank_key, group=group)
if DifferentialExpression.delimiter_in_gene_name(rank):
DifferentialExpression.extract_gene_id_for_out_file(rank)

out_file = f'{cluster_name}--{clean_annotation}--{clean_group}--{annot_scope}--{method}.tsv'
# Round numbers to 4 significant digits while respecting fixed point
# and scientific notation (note: trailing zeros are removed)
Expand All @@ -408,10 +410,23 @@ def run_scanpy_de(

DifferentialExpression.de_logger.info("DE processing complete")

@staticmethod
def sanitize_strings(input_string):
"""
Replace '+' with 'pos', then replace non-alphanumerics with underscore
this allows distinct sanitization for "CD16+ monocyte" vs "CD16- monocyte"
"""
plus_converted_string = re.sub('\+', 'pos', input_string)
return re.sub(r'\W', '_', plus_converted_string)

@staticmethod
def string_for_output_match(arguments):
    """
    Build the wildcard pattern that matches DE output files for one
    cluster/annotation pair.

    Both names are sanitized with DifferentialExpression.sanitize_strings so
    the pattern lines up with the names used when the output files are written.

    :param arguments: mapping with "cluster_name" and "annotation_name" keys
    :return: string of the form "<cluster>--<annotation>*.tsv"
    """
    cleaned_cluster_name = DifferentialExpression.sanitize_strings(
        arguments["cluster_name"]
    )
    cleaned_annotation_name = DifferentialExpression.sanitize_strings(
        arguments["annotation_name"]
    )
    files_to_match = f"{cleaned_cluster_name}--{cleaned_annotation_name}*.tsv"
    return files_to_match

Expand Down
37 changes: 37 additions & 0 deletions ingest/h5ad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
try:
from ingest_files import IngestFiles
from monitor import log_exception
except ImportError:
# Used when importing as external package, e.g. imports in single_cell_portal code
from .ingest_files import IngestFiles
from .monitor import log_exception


class H5adIngestor(IngestFiles):
    """Ingestor for h5ad files; currently limited to checking the file opens."""

    # MIME type(s) accepted by IngestFiles for this ingestor
    ALLOWED_FILE_TYPES = ['application/x-hdf5']

    def __init__(self, file_path, study_file_id, study_id, **kwargs):
        """
        :param file_path: path to the h5ad file
        :param study_file_id: accepted but not used by this class yet
        :param study_id: accepted but not used by this class yet
        :param kwargs: accepted but not used by this class yet
        """
        IngestFiles.__init__(
            self, file_path, allowed_file_types=self.ALLOWED_FILE_TYPES
        )

    def obtain_adata(self):
        """
        Open the file and keep the resulting object on self.adata.

        :return: the object stored on self.adata
        :raises ValueError: if open_file raises ValueError
        """
        try:
            # open_file returns a tuple; the first element is the opened data
            self.adata = self.open_file(self.file_path)[0]
            print(self.adata)
            IngestFiles.dev_logger.info(str(self.adata))
        except ValueError as e:
            raise ValueError(e) from e
        return self.adata

    def validate(self):
        """
        Currently, file passes "basic validation" if file
        can be opened by scanpy

        :return: True if the file could be opened, False otherwise
        """
        try:
            # obtain_adata sets self.adata itself; do not reassign it here,
            # otherwise a None return value would wipe the loaded object
            self.obtain_adata()
            return True
        except ValueError:
            return False

30 changes: 26 additions & 4 deletions ingest/ingest_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from dataclasses import dataclass
from typing import Dict, Generator, List, Tuple, Union # noqa: F401
import warnings
import scanpy as sc


import pandas as pd # NOqa: F821
Expand All @@ -21,9 +22,9 @@
# import google.cloud.logging

try:
from monitor import setup_logger
from monitor import setup_logger, log_exception
except ImportError:
from .monitor import setup_logger
from .monitor import setup_logger, log_exception


@dataclass
Expand Down Expand Up @@ -75,13 +76,16 @@ class IngestFiles:
# General logger for class
# Logger provides more details
dev_logger = setup_logger(__name__, "log.txt", format="support_configs")
user_logger = setup_logger(__name__ + ".user_logger", "user_log.txt")
# Filter out warnings about using end user credentials when running ingest_pipeline as dev
warnings.filterwarnings(
"ignore", "Your application has authenticated using end user credentials"
)

def __init__(self, file_path, allowed_file_types):
self.file_path = file_path
# define filetype for h5ad file extension
mimetypes.add_type('application/x-hdf5', '.h5ad')
# File is remote (in GCS bucket) when running via PAPI,
# and typically local when developing
self.is_remote_file = IngestFiles.is_remote_file(file_path)
Expand Down Expand Up @@ -195,6 +199,7 @@ def open_file(self, file_path, open_as=None, start_point: int = 0, **kwargs):
"text/plain": self.open_txt,
"text/tab-separated-values": self.open_tsv,
"dataframe": self.open_pandas,
"application/x-hdf5": self.open_h5ad,
}

if start_point != 0:
Expand All @@ -214,6 +219,11 @@ def open_file(self, file_path, open_as=None, start_point: int = 0, **kwargs):
file_connections.get(file_type)(open_file, file_type, **kwargs),
open_file,
)
elif file_type == "application/x-hdf5":
return (
file_connections.get(file_type)(file_path, **kwargs),
open_file,
)
else:
return (
file_connections.get(file_type)(open_file, **kwargs),
Expand All @@ -227,9 +237,12 @@ def open_file(self, file_path, open_as=None, start_point: int = 0, **kwargs):
open_file,
)
else:
raise ValueError(
f"Unsupported file format. Allowed file types are: {' '.join(self.allowed_file_types)}"
msg = (
f"Unsupported file format. Allowed file MIME types are: "
f"{' '.join(self.allowed_file_types)}"
)
log_exception(IngestFiles.dev_logger, IngestFiles.user_logger, msg)
raise ValueError(msg)

# Inherited function
def extract(self):
Expand Down Expand Up @@ -298,6 +311,15 @@ def open_pandas(self, file_path, file_type, **kwargs):
else:
raise ValueError("File must be tab or comma delimited")

def open_h5ad(self, file_path, **kwargs):
    """Opens file as AnnData object

    :param file_path: path handed to scanpy's read_h5ad
    :param kwargs: accepted for signature parity with the other openers;
        not forwarded to scanpy
    :return: object returned by sc.read_h5ad
    :raises ValueError: if scanpy fails with OSError while reading the file
    """
    try:
        # backed='r' requests read-only backed mode from scanpy
        return sc.read_h5ad(file_path, backed='r')
    except OSError as e:
        msg = f"Scanpy cannot read file, \"{file_path}\"."
        log_exception(IngestFiles.dev_logger, IngestFiles.user_logger, msg)
        # chain explicitly so the underlying OSError stays in the traceback
        raise ValueError(msg) from e

def open_csv(self, opened_file_object, **kwargs):
"""Opens csv file"""
csv.register_dialect(
Expand Down
27 changes: 25 additions & 2 deletions ingest/ingest_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
# Ingest dense file
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_expression --taxon-name 'Homo sapiens' --taxon-common-name human --ncbi-taxid 9606 --matrix-file ../tests/data/dense_matrix_19_genes_1000_cells.txt --matrix-file-type dense

# Ingest loom file
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_expression --matrix-file ../tests/data/test_loom.loom --matrix-file-type loom --taxon-name 'Homo Sapiens' --taxon-common-name human
# Ingest h5ad file
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_h5ad --h5ad-file ../tests/data/test.h5ad

# Subsample cluster and metadata file
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_subsample --cluster-file ../tests/data/test_1k_cluster_Data.csv --name custer1 --cell-metadata-file ../tests/data/test_1k_metadata_Data.csv --subsample
Expand Down Expand Up @@ -82,6 +82,7 @@
from clusters import Clusters
from expression_files.mtx import MTXIngestor
from expression_files.dense_ingestor import DenseIngestor
from h5ad import H5adIngestor
from monitor import setup_logger, log_exception
from de import DifferentialExpression

Expand All @@ -102,6 +103,7 @@
from .clusters import Clusters
from .expression_files.dense_ingestor import DenseIngestor
from .expression_files.mtx import MTXIngestor
from .h5ad import H5adIngestor
from .cli_parser import create_parser, validate_arguments
from .de import DifferentialExpression

Expand All @@ -125,6 +127,7 @@ def __init__(
matrix_file_type: str = None,
cell_metadata_file: str = None,
cluster_file: str = None,
h5ad_file: str = None,
subsample=False,
ingest_cell_metadata=False,
ingest_cluster=False,
Expand All @@ -144,6 +147,7 @@ def __init__(
else:
self.db = None
self.cluster_file = cluster_file
self.h5ad_file = h5ad_file
self.kwargs = kwargs
self.cell_metadata_file = cell_metadata_file
self.props = {}
Expand Down Expand Up @@ -474,6 +478,20 @@ def subsample(self):
return 1
return 0

@custom_metric(config.get_metric_properties)
def ingest_h5ad(self):
    """Ingests h5ad files.

    :return: 0 if the h5ad file passes validation, 1 otherwise
    """
    # Pass the ids by keyword: H5adIngestor.__init__ declares
    # (file_path, study_file_id, study_id), so the previous positional
    # call handed the two ids over in swapped order
    self.h5ad = H5adIngestor(
        self.h5ad_file,
        study_file_id=self.study_file_id,
        study_id=self.study_id,
        **self.kwargs,
    )
    if self.h5ad.validate():
        self.report_validation("success")
        return 0
    # scanpy unable to open h5ad file
    self.report_validation("failure")
    return 1

def calculate_de(self):
""" Run differential expression analysis """
try:
Expand Down Expand Up @@ -523,6 +541,11 @@ def run_ingest(ingest, arguments, parsed_args):
config.set_parent_event_name("ingest-pipeline:subsample:ingest")
status_subsample = ingest.subsample()
status.append(status_subsample)
elif "ingest_h5ad" in arguments:
if arguments["ingest_h5ad"]:
config.set_parent_event_name("ingest-pipeline:h5ad:ingest")
status_h5ad = ingest.ingest_h5ad()
status.append(status_h5ad)
elif "differential_expression" in arguments:
config.set_parent_event_name("ingest-pipeline:differential-expression")
status_de = ingest.calculate_de()
Expand Down
1 change: 1 addition & 0 deletions tests/data/h5ad/bad.h5
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
non-empty
Binary file added tests/data/h5ad/bad.h5ad
Binary file not shown.
Binary file added tests/data/h5ad/test.h5ad
Binary file not shown.
26 changes: 17 additions & 9 deletions tests/test_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ def find_expected_files(labels, cluster_name, annotation, scope, method):
""" Check that files were created for all expected annotation labels
"""
found = []
sanitized_cluster_name = re.sub(r'\W', '_', cluster_name)
sanitized_annotation = re.sub(r'\W', '_', annotation)
sanitized_cluster_name = DifferentialExpression.sanitize_strings(cluster_name)
sanitized_annotation = DifferentialExpression.sanitize_strings(annotation)
for label in labels:
sanitized_label = re.sub(r'\W', '_', label)
sanitized_label = DifferentialExpression.sanitize_strings(label)
expected_file = f"{sanitized_cluster_name}--{sanitized_annotation}--{sanitized_label}--{scope}--{method}.tsv"
assert os.path.exists(expected_file)
found.append(expected_file)
Expand Down Expand Up @@ -185,16 +185,26 @@ def test_delimiter_in_gene_name(self):
def test_filename_sanitation(self):
    """ Bugfix (SCP-4459) so sanitization does not collapse adjacent non-alphanumeric characters to
    single underscores, see also SCP-4455 for manual fix

    Bugfix (SCP-4533) convert '+' to 'pos' so labels differing in only +/-
    do not clobber and cause display of incorrect results for one of the labels.
    """
    # '+' becomes 'pos'; every other non-word character becomes its own '_'
    test_string = "foo++)"
    plus_converted_result = DifferentialExpression.sanitize_strings(test_string)
    self.assertEqual(
        plus_converted_result,
        "foopospos_",
        "unexpected result from sanitation sanitize_strings function",
    )

    # single "cluster_name" entry, with '+' so the 'pos' conversion is exercised
    arguments = {
        "cluster_name": "UMAP+, pre-QC all cells (complexity greater than or equal to 1000)",
        "annotation_name": "cell..type",
    }
    files_to_match = DifferentialExpression.string_for_output_match(arguments)
    print(files_to_match)
    self.assertEqual(
        files_to_match,
        "UMAPpos__pre_QC_all_cells__complexity_greater_than_or_equal_to_1000_--cell__type*.tsv",
        "unexpected result from sanitation function",
    )

Expand Down Expand Up @@ -472,9 +482,7 @@ def test_de_process_sanitize(self):
f"expected five annotation labels for {test_annotation}",
)

expected_file = (
"UMAP__pre_QC--misc__cellaneous--cholinergic__neuron_--study--wilcoxon.tsv"
)
expected_file = "UMAP__pre_QC--miscposposcellaneous--cholinergic__neuron_--study--wilcoxon.tsv"

# confirm expected results filename was generated in found result files
self.assertIn(
Expand Down
Loading