Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions ingest/cli_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,14 @@ def validate_arguments(parsed_args):
raise ValueError(
f" Invalid argument: unable to connect to a BigQuery table called {parsed_args.bq_table}."
)
if (
"differential_expression" in parsed_args
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need this in addition to the --differential-expression kwarg? They seem redundant.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I only check for the annotation_type kwarg, that argument isn't set for other types of ingest pipeline runs and all other non-DE ingest jobs would fail with:
AttributeError: 'Namespace' object has no attribute 'annotation_type'

Checking for "differential_expression" in parsed_args, ensures I'm only doing this validation for DE jobs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I didn't make myself clear - apologies. If we look at the example command line:

python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cluster_annot --annotation-type group --annotation-scope cluster --matrix-file-path ../tests/data/differential_expression/de_integration.tsv --matrix-file-type dense --annotation-file ../tests/data/differential_expression/de_integration_annotated_cluster.txt --cluster-file ../tests/data/differential_expression/de_integration_annotated_cluster.txt --cluster-name de_integration --study-accession SCPdev --differential-expression

There is the positional argument of differential_expression that you call out here, but then also at the end, there is the --differential-expression kwarg. Why do we need both? Looking in cli_parser.py, I see this:

# Differential expression subparsers
parser_differential_expression = subparsers.add_parser(
    "differential_expression",
    help="Indicates differential expression analysis processing",
)

parser_differential_expression.add_argument(
    "--differential-expression",
    required=True,
    action="store_true",
    help="Indicates that differential expression analysis should be invoked",
)

They look to me to be functionally equivalent.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for chatting, Jon. As we discussed, that paradigm was set up for cluster and cell metadata ingest. I'm following that pattern in DE jobs for consistency and in case there's a quirk about subparsers that I don't understand.

and parsed_args.annotation_type != "group"
):
raise ValueError(
"Differential expression analysis restricted to group-type annotations,"
f" cannot run on data of type \"{parsed_args.annotation_type}\"."
)


def create_parser():
Expand Down Expand Up @@ -268,15 +276,23 @@ def create_parser():
)

parser_differential_expression.add_argument(
"--annotation", required=True, help="Name of annotation for DE analysis"
"--annotation-name", required=True, help="Name of annotation for DE analysis"
)

parser_differential_expression.add_argument(
"--annotation-type", required=True, help="Type of annotation for DE analysis"
)

parser_differential_expression.add_argument(
"--annotation-scope", required=True, help="Scope of annotation for DE analysis"
)

parser_differential_expression.add_argument(
"--method", default="wilcoxon", help="method for DE"
)

parser_differential_expression.add_argument(
"--name", required=True, help="study owner-specified cluster anem"
"--cluster-name", required=True, help="study owner-specified cluster name"
)

parser_differential_expression.add_argument(
Expand All @@ -286,9 +302,9 @@ def create_parser():
)

parser_differential_expression.add_argument(
"--cell-metadata-file",
"--annotation-file",
required=True,
help="Absolute or relative path to cell metadata file.",
help="Absolute or relative path to cell metadata or cluster file of annotations.",
)

parser_differential_expression.add_argument(
Expand Down
41 changes: 27 additions & 14 deletions ingest/de.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from email.headerregistry import Group
import logging
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -36,19 +37,20 @@ def __init__(
cell_metadata,
matrix_file_path,
matrix_file_type,
annotation,
annotation_name,
**kwargs,
):
DifferentialExpression.de_logger.info(
"Initializing DifferentialExpression instance"
)
self.cluster = cluster
self.metadata = cell_metadata
self.annotation = annotation
self.annotation = annotation_name
self.matrix_file_path = matrix_file_path
self.matrix_file_type = matrix_file_type
self.kwargs = kwargs
self.accession = self.kwargs["study_accession"]
self.annot_scope = self.kwargs["annotation_scope"]
# only used in output filename, replacing non-alphanumeric with underscores
self.cluster_name = re.sub(r'\W+', '_', self.kwargs["name"])
self.method = self.kwargs["method"]
Expand Down Expand Up @@ -175,6 +177,7 @@ def subset_adata(adata, de_cells):
return adata

def execute_de(self):
print(f'dev_info: Starting DE for {self.accession}')
try:
if self.matrix_file_type == "mtx":
DifferentialExpression.de_logger.info("preparing DE on sparse matrix")
Expand All @@ -184,6 +187,7 @@ def execute_de(self):
self.matrix_file_path,
self.matrix_file_type,
self.annotation,
self.annot_scope,
self.accession,
self.cluster_name,
self.method,
Expand All @@ -198,6 +202,7 @@ def execute_de(self):
self.matrix_file_path,
self.matrix_file_type,
self.annotation,
self.annot_scope,
self.accession,
self.cluster_name,
self.method,
Expand All @@ -222,18 +227,17 @@ def get_genes(genes_path):
"""
genes_df = pd.read_csv(genes_path, sep="\t", header=None)
if len(genes_df.columns) > 1:
# unclear if falling back to gene_id is useful (SCP-4283)
# print so we're aware of dups during dev testing
if genes_df[1].count() == genes_df[1].nunique():
return genes_df[1].tolist()
elif genes_df[0].count() == genes_df[0].nunique():
return genes_df[0].tolist()
msg = "dev_info: Features file contains duplicate identifiers (col 2)"
print(msg)
return genes_df[1].tolist()
else:
msg = "Features file contains duplicate identifiers"
print(msg)
log_exception(
DifferentialExpression.dev_logger, DifferentialExpression.de_logger, msg
)
raise ValueError(msg)
return genes
if genes_df[0].count() == genes_df[0].nunique():
msg = "dev_info: Features file contains duplicate identifiers (col 1)"
print(msg)
return genes_df[0].tolist()

@staticmethod
def get_barcodes(barcodes_path):
Expand Down Expand Up @@ -264,6 +268,7 @@ def run_scanpy_de(
matrix_file_path,
matrix_file_type,
annotation,
annot_scope,
study_accession,
cluster_name,
method,
Expand Down Expand Up @@ -315,15 +320,23 @@ def run_scanpy_de(
DifferentialExpression.dev_logger, DifferentialExpression.de_logger, msg
)
raise KeyError(msg)
# ToDo - detection and handling of annotations with only one sample (SCP-4282)
except ValueError as e:
print(e)
log_exception(
DifferentialExpression.dev_logger, DifferentialExpression.de_logger, e
)
raise KeyError(e)

DifferentialExpression.de_logger.info("Gathering DE annotation labels")
groups = np.unique(adata.obs[annotation]).tolist()
for group in groups:
group_filename = re.sub(r'\W+', '_', group)
clean_group = re.sub(r'\W+', '_', group)
clean_annotation = re.sub(r'\W+', '_', annotation)
DifferentialExpression.de_logger.info(f"Writing DE output for {group}")
rank = sc.get.rank_genes_groups_df(adata, key=rank_key, group=group)

out_file = f'{cluster_name}--{annotation}--{group_filename}--{method}.tsv'
out_file = f'{cluster_name}--{clean_annotation}--{clean_group}--{annot_scope}--{method}.tsv'
# Round numbers to 4 significant digits while respecting fixed point
# and scientific notation (note: trailing zeros are removed)
rank.to_csv(out_file, sep='\t', float_format='%.4g')
Expand Down
6 changes: 6 additions & 0 deletions ingest/ingest_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,12 @@ def main() -> None:
parsed_args = create_parser().parse_args()
validate_arguments(parsed_args)
arguments = vars(parsed_args)
if "differential_expression" in arguments:
# DE may use metadata or cluster file for annots BUT
# IngestPipeline initialization assumes a "cell_metadata_file"
arguments["cell_metadata_file"] = arguments["annotation_file"]
# IngestPipeline initialization expects "name" and not "cluster_name"
arguments["name"] = arguments["cluster_name"]
# Initialize global variables for current ingest job
config.init(
arguments["study_id"],
Expand Down
Loading