Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/minify_ontologies.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ on:
jobs:
build:
runs-on: ubuntu-latest
permissions: write-all

steps:
- name: Checkout code
Expand Down
4 changes: 1 addition & 3 deletions ingest/cell_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
from validation.validate_metadata import (
report_issues,
validate_input_metadata,
write_metadata_to_bq,
)
except ImportError:
# Used when importing as external package, e.g. imports in single_cell_portal code
Expand Down Expand Up @@ -180,8 +179,7 @@ def conforms_to_metadata_convention(self):
json_file = convention_file_object.open_file(self.JSON_CONVENTION)
convention = json.load(json_file)

import_to_bq = self.kwargs["bq_dataset"] and self.kwargs["bq_table"]
validate_input_metadata(self, convention, bq_json=import_to_bq)
validate_input_metadata(self, convention)

json_file.close()
return not report_issues(self)
Expand Down
52 changes: 0 additions & 52 deletions ingest/cli_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,39 +3,12 @@
import argparse
import ast

from google.cloud import bigquery
from google.cloud.exceptions import NotFound


# Ingest file types
EXPRESSION_FILE_TYPES = ["dense", "mtx", "h5ad"]


def bq_dataset_exists(dataset):
bigquery_client = bigquery.Client()
dataset_ref = bigquery_client.dataset(dataset)
exists = False
try:
bigquery_client.get_dataset(dataset_ref)
exists = True
except NotFound:
print(f"Dataset {dataset} not found")
return exists


def bq_table_exists(dataset, table):
bigquery_client = bigquery.Client()
dataset_ref = bigquery_client.dataset(dataset)
table_ref = dataset_ref.table(table)
exists = False
try:
bigquery_client.get_table(table_ref)
exists = True
except NotFound:
print(f"Table {table} not found")
return exists


def validate_arguments(parsed_args):
"""Verify parsed input arguments

Expand All @@ -54,25 +27,6 @@ def validate_arguments(parsed_args):
"must include .genes.tsv, and .barcodes.tsv files. See --help for "
"more information"
)
if "ingest_cell_metadata" in parsed_args:
if (parsed_args.bq_dataset is not None and parsed_args.bq_table is None) or (
parsed_args.bq_dataset is None and parsed_args.bq_table is not None
):
raise ValueError(
"Missing argument: --bq_dataset and --bq_table are both required for BigQuery upload."
)
if parsed_args.bq_dataset is not None and not bq_dataset_exists(
parsed_args.bq_dataset
):
raise ValueError(
f" Invalid argument: unable to connect to a BigQuery dataset called {parsed_args.bq_dataset}."
)
if parsed_args.bq_table is not None and not bq_table_exists(
parsed_args.bq_dataset, parsed_args.bq_table
):
raise ValueError(
f" Invalid argument: unable to connect to a BigQuery table called {parsed_args.bq_table}."
)
if (
"differential_expression" in parsed_args
and parsed_args.annotation_type != "group"
Expand Down Expand Up @@ -191,12 +145,6 @@ def create_parser():
required=True,
help="Single study accession associated with ingest files.",
)
parser_cell_metadata.add_argument(
"--bq-dataset", help="BigQuery dataset identifier for ingest job."
)
parser_cell_metadata.add_argument(
"--bq-table", help="BigQuery table identifier for ingest job."
)
parser_cell_metadata.add_argument(
"--ingest-cell-metadata",
required=True,
Expand Down
35 changes: 3 additions & 32 deletions ingest/ingest_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,10 @@

# Ingest Cell Metadata file against convention
!! Please note that you must have a pre-configured BigQuery table available
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_cell_metadata --cell-metadata-file ../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.txt --study-accession SCP123 --ingest-cell-metadata --validate-convention --bq-dataset cell_metadata --bq-table alexandria_convention
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_cell_metadata --cell-metadata-file ../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.txt --study-accession SCP123 --ingest-cell-metadata --validate-convention

# Ingest Cell Metadata file against convention AND booleanize has_<modality> metadata for BigQuery
#### BQ schema must be updated for each has_<modality>
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 ingest_cell_metadata --cell-metadata-file ../tests/data/annotation/metadata/convention/brain_rf1/patchseq_classic_metadata_has_modality_10.tsv --study-accession SCPPR344 --ingest-cell-metadata --validate-convention --has-modality "['electrophysiology', 'morphology']" --bq-dataset cell_metadata_development --bq-table alexandria_convention
# Ingest Cell Metadata file against convention with has_<modality> metadata
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 ingest_cell_metadata --cell-metadata-file ../tests/data/annotation/metadata/convention/brain_rf1/patchseq_classic_metadata_has_modality_10.tsv --study-accession SCPPR344 --ingest-cell-metadata --validate-convention --has-modality "['electrophysiology', 'morphology']"

# Ingest dense file
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_expression --taxon-name 'Homo sapiens' --taxon-common-name human --ncbi-taxid 9606 --matrix-file ../tests/data/dense_matrix_19_genes_1000_cells.txt --matrix-file-type dense
Expand Down Expand Up @@ -113,7 +112,6 @@
from validation.validate_metadata import (
report_issues,
validate_input_metadata,
write_metadata_to_bq,
)
from cell_metadata import CellMetadata
from cli_parser import create_parser, validate_arguments
Expand Down Expand Up @@ -347,27 +345,6 @@ def get_cluster_query(self):

return query

def upload_metadata_to_bq(self):
"""Uploads metadata to BigQuery"""
if self.kwargs["validate_convention"] is not None:
if (
self.kwargs["validate_convention"]
and self.kwargs["bq_dataset"]
and self.kwargs["bq_table"]
):
write_status = write_metadata_to_bq(
self.cell_metadata,
self.kwargs["bq_dataset"],
self.kwargs["bq_table"],
)
return write_status
else:
IngestPipeline.dev_logger.error(
"Erroneous call to upload_metadata_to_bq"
)
return 1
return 0

@custom_metric(config.get_metric_properties)
def ingest_expression(self) -> int:
"""
Expand Down Expand Up @@ -681,12 +658,6 @@ def run_ingest(ingest, arguments, parsed_args):
config.set_parent_event_name("ingest-pipeline:cell_metadata:ingest")
status_cell_metadata_validation = ingest.validate_cell_metadata()
status.append(status_cell_metadata_validation)
if (
parsed_args.bq_table is not None
and status_cell_metadata_validation == 0
):
status_metadata_bq = ingest.upload_metadata_to_bq()
status.append(status_metadata_bq)
if status_cell_metadata_validation == 0:
if ingest.kwargs['has_modality'] is not None:
ingest.cell_metadata.file = CellMetadata.restore_modality_metadata(
Expand Down
36 changes: 2 additions & 34 deletions ingest/validation/metadata_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@
# generate an issues.json file to compare with reference test files
$ python3 metadata_validation.py --issues-json ../../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.tsv

# generate a BigQuery upload file to compare with reference test files
$ python3 metadata_validation.py --bq-json ../../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.tsv

# use a different metadata convention for validation
$ python3 metadata_validation.py --convention <path to convention json> ../../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.tsv

Expand All @@ -37,12 +34,10 @@
import copy
import itertools
import math
import pandas as pd

import colorama
from colorama import Fore
import jsonschema
from google.cloud import bigquery

sys.path.append("..")
try:
Expand All @@ -51,7 +46,6 @@
from validation.validate_metadata import (
report_issues,
validate_input_metadata,
write_metadata_to_bq,
serialize_issues,
exit_if_errors,
)
Expand Down Expand Up @@ -84,12 +78,9 @@ def create_parser():
# to generate reference output for tests
parser.add_argument("--issues-json", action="store_true")
# helper param to create JSON representation of convention metadata
# to generate json for bigquery testing
parser.add_argument("--bq-json", action="store_true")
# overwrite existing output
parser.add_argument("--force", action="store_true")
# test BigQuery upload functions
parser.add_argument("--upload", action="store_true")

# validate_metadata.py CLI only for dev, bogus defaults below shouldn't propagate
# make bogus defaults obviously artificial for ease of detection
parser.add_argument(
Expand All @@ -105,12 +96,6 @@ def create_parser():
parser.add_argument(
"--study-accession", help="SCP study accession", default="SCPtest"
)
parser.add_argument(
"--bq-dataset", help="BigQuery dataset identifier", default="cell_metadata"
)
parser.add_argument(
"--bq-table", help="BigQuery table identifier", default="alexandria_convention"
)
parser.add_argument(
"--convention",
help="Metadata convention JSON file",
Expand All @@ -120,24 +105,9 @@ def create_parser():
return parser


def check_if_old_output():
"""Exit if old output files found"""
output_files = ["bq.json"]

old_output = False
for file in output_files:
if os.path.exists(file):
print(f"{file} already exists, please delete file and try again")
old_output = True
if old_output:
exit(1)


if __name__ == "__main__":
args = create_parser().parse_args()
arguments = vars(args)
if not args.force:
check_if_old_output()

with open(args.convention, "r") as f:
convention = json.load(f)
Expand All @@ -150,10 +120,8 @@ def check_if_old_output():
metadata.preprocess(True)
print("Validating", args.input_metadata)

validate_input_metadata(metadata, convention, args.bq_json)
validate_input_metadata(metadata, convention)
if args.issues_json:
serialize_issues(metadata)
report_issues(metadata)
if args.upload:
write_metadata_to_bq(metadata, args.bq_dataset, args.bq_table)
exit_if_errors(metadata)
Binary file modified ingest/validation/ontologies/efo.min.tsv.gz
Binary file not shown.
Binary file modified ingest/validation/ontologies/mondo.min.tsv.gz
Binary file not shown.
Binary file modified ingest/validation/ontologies/pato.min.tsv.gz
Binary file not shown.
Binary file modified ingest/validation/ontologies/uberon.min.tsv.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion ingest/validation/ontologies/version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1744734811 # validation cache key
1749563342 # validation cache key
Loading