fix bactopia search querying with missing fields
rpetit3 committed May 31, 2023
1 parent 90c83c4 commit 4b48572
Showing 5 changed files with 80 additions and 282 deletions.
.gitpod.yml: 2 changes (1 addition, 1 deletion)
@@ -12,6 +12,7 @@ tasks:
       mkdir -p .vscode
       echo '{"python.pythonPath": "/home/gitpod/.conda/envs/bactopia-py/bin/python"}' > .vscode/settings.json
       pre-commit install --install-hooks
+      just install
     command: |
@@ -26,5 +27,4 @@ vscode:
     - streetsidesoftware.code-spell-checker # Spelling checker for source code
     - ms-python.black-formatter # Support for Python Black formatter
     - njpwerner.autodocstring # Use type hints to auto create doc strings
-    - github.copilot # Load up Copilot
     - ms-python.python # Syntax and linting
CHANGELOG.md: 5 changes (5 additions, 0 deletions)
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+## 1.0.3
+
+- Fixed `bactopia-search` failing when columns were missing from query results
+- Dropped the `pysradb` dependency
+
 ## 1.0.2
 
 - Added `bactopia-datasets` to download optional datasets outside of Nextflow
bactopia/cli/search.py: 190 changes (72 additions, 118 deletions)
@@ -5,12 +5,12 @@
 import sys
 from pathlib import Path
 
+import pandas as pd
 import requests
 import rich
 import rich.console
 import rich.traceback
 import rich_click as click
-from pysradb import SRAweb
 from rich.logging import RichHandler
 
 import bactopia
@@ -30,6 +30,7 @@
             "--limit",
             "--accession-limit",
             "--biosample-subset",
+            "--include-empty",
         ],
     },
     {
@@ -55,91 +56,29 @@
         },
     ]
 }
+ENA_URL = "https://www.ebi.ac.uk/ena/portal/api/search"
 
 
-ENA_URL = "https://www.ebi.ac.uk/ena/portal/api/search"
-FIELDS = [
-    "study_accession",
-    "secondary_study_accession",
-    "sample_accession",
-    "secondary_sample_accession",
-    "experiment_accession",
-    "run_accession",
-    "submission_accession",
-    "tax_id",
-    "scientific_name",
-    "instrument_platform",
-    "instrument_model",
-    "library_name",
-    "library_layout",
-    "nominal_length",
-    "library_strategy",
-    "library_source",
-    "library_selection",
-    "read_count",
-    "base_count",
-    "center_name",
-    "first_public",
-    "last_updated",
-    "experiment_title",
-    "study_title",
-    "study_alias",
-    "experiment_alias",
-    "run_alias",
-    "fastq_bytes",
-    "fastq_md5",
-    "fastq_ftp",
-    "fastq_aspera",
-    "fastq_galaxy",
-    "submitted_bytes",
-    "submitted_md5",
-    "submitted_ftp",
-    "submitted_aspera",
-    "submitted_galaxy",
-    "submitted_format",
-    "sra_bytes",
-    "sra_md5",
-    "sra_ftp",
-    "sra_aspera",
-    "sra_galaxy",
-    "cram_index_ftp",
-    "cram_index_aspera",
-    "cram_index_galaxy",
-    "sample_alias",
-    "broker_name",
-    "sample_title",
-    "first_created",
-]
-
-
-def get_sra_metadata(query: str) -> list:
-    """Fetch metadata from SRA.
+def get_ena_metadata(query: str, is_accession: bool, limit: int):
+    """Fetch metadata from ENA.
+    https://docs.google.com/document/d/1CwoY84MuZ3SdKYocqssumghBF88PWxUZ/edit#heading=h.ag0eqy2wfin5
     Args:
-        query (str): The accession to search for.
+        query (str): The query to search for.
+        is_accession (bool): If the query is an accession or not.
+        limit (int): The maximum number of records to return.
     Returns:
         list: Records associated with the accession.
     """
-    #
-    db = SRAweb()
-    df = db.search_sra(
-        query, detailed=True, sample_attribute=True, expand_sample_attributes=True
-    )
-    if df is None:
-        return [False, []]
-    return [True, df.to_dict(orient="records")]
-
-
-def get_ena_metadata(query, is_accession, limit=1000000):
-    """USE ENA's API to retrieve the latest results."""
-    # ENA browser info: http://www.ebi.ac.uk/ena/about/browser
     data = {
         "dataPortal": "ena",
         "dccDataOnly": "false",
         "download": "false",
         "result": "read_run",
        "format": "tsv",
         "limit": limit,
-        "fields": ",".join(FIELDS),
+        "fields": "all",
     }
 
     if is_accession:
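
For reference, the request this function builds can be reproduced standalone. A minimal sketch, assuming the "query" parameter of the ENA Portal API for the non-accession branch (the parameter values are taken from the diff above; tax_tree(1280), i.e. Staphylococcus aureus, and the limit of 10 are illustrative):

import requests

ENA_URL = "https://www.ebi.ac.uk/ena/portal/api/search"
data = {
    "dataPortal": "ena",
    "dccDataOnly": "false",
    "download": "false",
    "result": "read_run",
    "format": "tsv",
    "limit": 10,
    "fields": "all",  # was ",".join(FIELDS); "all" avoids requesting columns by name
    "query": "tax_tree(1280)",  # illustrative: all runs under the S. aureus taxon
}
headers = {"accept": "*/*", "Content-type": "application/x-www-form-urlencoded"}
r = requests.post(ENA_URL, headers=headers, data=data)
if r.status_code == requests.codes.ok:
    print(r.text.splitlines()[0])  # TSV header row
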
@@ -154,6 +93,7 @@ def get_ena_metadata(query, is_accession, limit=1000000):
         )
 
     headers = {"accept": "*/*", "Content-type": "application/x-www-form-urlencoded"}
+
     r = requests.post(ENA_URL, headers=headers, data=data)
     if r.status_code == requests.codes.ok:
         data = []
@@ -170,37 +110,33 @@ def get_ena_metadata(query, is_accession, limit=1000000):
     return [False, [r.status_code, r.text]]
 
 
-def get_metadata(
-    query: str, ena_query: str, is_accession: bool, limit: int = 1000000
+def get_run_info(
+    sra_query: str, ena_query: str, is_accession: bool, limit: int = 1000000
 ) -> tuple:
     """Retrieve a list of samples available from ENA.
     The first attempt will be against ENA, and if that fails, SRA will be queried. This should
     capture those samples not yet synced between ENA and SRA.
     Args:
-        query (str): The original query.
+        sra_query (str): A formatted query for SRA searches.
         ena_query (str): A formatted query for ENA searches.
         is_accession (bool): If the query is an accession or not.
         limit (int): The maximum number of records to return.
     Returns:
         tuple: Records associated with the accession.
     """
 
     logging.debug("Querying ENA for metadata...")
-    success, ena_data = get_ena_metadata(ena_query, is_accession, limit)
+    success, ena_data = get_ena_metadata(ena_query, is_accession, limit=limit)
     if success:
-        return ena_data
+        return success, ena_data
     else:
-        logging.debug("Failed to get metadata from ENA. Trying SRA...")
-        success, sra_data = get_sra_metadata(query)
-        if not success:
-            logging.error("There was an issue querying ENA and SRA, exiting...")
-            logging.error(f"STATUS: {ena_data[0]}")
-            logging.error(f"TEXT: {ena_data[1]}")
-            sys.exit(1)
-        else:
-            return sra_data
+        logging.error("There was an issue querying ENA, exiting...")
+        logging.error(f"STATUS: {ena_data[0]}")
+        logging.error(f"TEXT: {ena_data[1]}")
+        sys.exit(1)
 
 
 def parse_accessions(
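
A quick usage sketch of the renamed helper, under the assumption that callers unpack the success flag now returned alongside the records (the query strings are illustrative):

# parse_query() produces (query_type, ena_query, sra_query) triples that
# feed get_run_info(); the values shown here are hypothetical.
success, records = get_run_info(
    sra_query="txid1280[Organism:exp]",
    ena_query="tax_tree(1280)",
    is_accession=False,
    limit=100,
)
if success:
    print(f"Found {len(records)} runs")
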
@@ -231,14 +167,17 @@ def parse_accessions(
         "filtered": [],
     }
     for result in results:
+        instrument_key = (
+            "instrument_platform"
+            if "instrument_platform" in result
+            else "instrument_model_desc"
+        )
         if (
-            result["instrument_platform"] == "ILLUMINA"
-            or result["instrument_platform"] == "OXFORD_NANOPORE"
+            result[instrument_key] == "ILLUMINA"
+            or result[instrument_key] == "OXFORD_NANOPORE"
         ):
             technology = (
-                "ont"
-                if result["instrument_platform"] == "OXFORD_NANOPORE"
-                else "illumina"
+                "ont" if result[instrument_key] == "OXFORD_NANOPORE" else "illumina"
             )
             passes = True
             reason = []
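
The instrument_key fallback above is the heart of this fix: the old code indexed result["instrument_platform"] directly, which raises a KeyError when that column is absent. A small self-contained sketch of the new behavior (the records are hypothetical):

# With "fields=all", a record may lack "instrument_platform"; fall back to
# "instrument_model_desc" so the technology check still works.
records = [
    {"run_accession": "SRR0000001", "instrument_platform": "ILLUMINA"},
    {"run_accession": "SRR0000002", "instrument_model_desc": "OXFORD_NANOPORE"},
]
for result in records:
    instrument_key = (
        "instrument_platform"
        if "instrument_platform" in result
        else "instrument_model_desc"
    )
    technology = "ont" if result[instrument_key] == "OXFORD_NANOPORE" else "illumina"
    print(result["run_accession"], technology)
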
@@ -329,47 +268,51 @@ def parse_query(q, accession_limit, exact_taxon=False):
     else:
         queries.append(q)
     results = []
+    bioproject_accessions = []
+    biosample_accessions = []
     experiment_accessions = []
     run_accessions = []
 
     for query in queries:
         try:
             taxon_id = int(query)
             if exact_taxon:
-                results.append(["taxon", f"tax_eq({taxon_id})"])
+                results.append(
+                    ["taxon", f"tax_eq({taxon_id})", f"txid{taxon_id}[Organism:noexp]"]
+                )
             else:
-                results.append(["taxon_tree", f"tax_tree({taxon_id})"])
+                results.append(
+                    [
+                        "taxon_tree",
+                        f"tax_tree({taxon_id})",
+                        f"txid{taxon_id}[Organism:exp]",
+                    ]
+                )
         except ValueError:
             # It is an accession or scientific name
             # Test Accession
             # Thanks! https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html#accession-numbers
             if re.match(r"^PRJ[EDN][A-Z][0-9]+$|^[EDS]RP[0-9]{6,}$", query):
-                results.append(
-                    [
-                        "bioproject",
-                        f"(study_accession={query} OR secondary_study_accession={query})",
-                    ]
-                )
+                bioproject_accessions.append(query)
             elif re.match(r"^SAM[EDN][A-Z]?[0-9]+$|^[EDS]RS[0-9]{6,}$", query):
-                results.append(
-                    [
-                        "biosample",
-                        f"(sample_accession={query} OR secondary_sample_accession={query})",
-                    ]
-                )
+                biosample_accessions.append(query)
             elif re.match(r"^[EDS]RX[0-9]{6,}$", query):
                 experiment_accessions.append(query)
             elif re.match(r"^[EDS]RR[0-9]{6,}$", query):
                 run_accessions.append(query)
             else:
                 # Assuming it is a scientific name
-                results.append(["taxon_name", f'tax_name("{query}")'])
+                results.append(["taxon_name", f'tax_name("{query}")', f"'{query}'"])
 
     # Split the accessions into sets of at most accession_limit
+    for chunk in chunks(bioproject_accessions, accession_limit):
+        results.append(["bioproject_accession", ",".join(chunk), " OR ".join(chunk)])
+    for chunk in chunks(biosample_accessions, accession_limit):
+        results.append(["biosample_accession", ",".join(chunk), " OR ".join(chunk)])
     for chunk in chunks(experiment_accessions, accession_limit):
-        results.append(["experiment_accession", ",".join(chunk)])
+        results.append(["experiment_accession", ",".join(chunk), " OR ".join(chunk)])
     for chunk in chunks(run_accessions, accession_limit):
-        results.append(["run_accession", ",".join(chunk)])
+        results.append(["run_accession", ",".join(chunk), " OR ".join(chunk)])
 
     return results
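
The accession patterns above come from the ENA accession-number guide linked in the comment. A standalone sketch of how queries are classified (the example accessions are illustrative):

import re

tests = {
    "PRJNA480016": r"^PRJ[EDN][A-Z][0-9]+$|^[EDS]RP[0-9]{6,}$",    # BioProject/study
    "SAMN05743522": r"^SAM[EDN][A-Z]?[0-9]+$|^[EDS]RS[0-9]{6,}$",  # BioSample/sample
    "SRX4563689": r"^[EDS]RX[0-9]{6,}$",                           # experiment
    "SRR7706354": r"^[EDS]RR[0-9]{6,}$",                           # run
}
for accession, pattern in tests.items():
    print(accession, bool(re.match(pattern, accession)))  # all True

Matched accessions are batched with chunks() and emitted as (query_type, ena_query, sra_query) triples, the ENA form comma-joined and the SRA form OR-joined, which is what the reworked loop in search() unpacks.
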

@@ -441,6 +384,11 @@ def parse_query(q, accession_limit, exact_taxon=False):
     show_default=True,
     help="Genome size to be used for all samples, and for calculating min coverage",
 )
+@click.option(
+    "--include-empty",
+    is_flag=True,
+    help="Include metadata columns that are empty for all rows",
+)
 @click.option("--force", is_flag=True, help="Overwrite existing reports")
 @click.option("--verbose", is_flag=True, help="Increase the verbosity of output")
 @click.option("--silent", is_flag=True, help="Only critical errors will be printed")
@@ -456,6 +404,7 @@ def search(
     min_read_length,
     min_coverage,
     genome_size,
+    include_empty,
     force,
     verbose,
     silent,
@@ -478,23 +427,23 @@
 
     if min_coverage and genome_size:
         if min_base_count:
-            print(
+            logging.error(
                 "--min_base_count cannot be used with --coverage/--genome_size. Exiting...",
                 file=sys.stderr,
             )
             sys.exit(1)
         else:
             min_base_count = min_coverage * genome_size
     elif min_coverage or genome_size:
-        print(
+        logging.error(
             "--coverage and --genome_size must be used together. Exiting...",
             file=sys.stderr,
         )
         sys.exit(1)
 
     if biosample_subset > 0:
         if not is_biosample(query):
-            print(
+            logging.error(
                 "--biosample_subset requires a single BioSample. Input query: {query} is not a BioSample. Exiting...",
                 file=sys.stderr,
             )
@@ -519,10 +468,12 @@
     filtered_file = f"{outdir}/{prefix}-filtered.txt".replace("//", "/")
     summary_file = f"{outdir}/{prefix}-search.txt".replace("//", "/")
     genome_sizes = get_ncbi_genome_size()
-    for query_type, query in queries:
+    for query_type, ena_query, sra_query in queries:
         logging.info(f"Submitting query (type - {query_type})")
         is_accession = True if query_type.endswith("accession") else False
-        success, query_results = get_ena_metadata(query, is_accession, limit=limit)
+        success, query_results = get_run_info(
+            sra_query, ena_query, is_accession, limit=limit
+        )
         results += query_results
         if success:
             query_accessions, query_filtered = parse_accessions(
@@ -602,10 +553,13 @@ def search(
     # Output the results
     logging.info(f"Writing results to {metadata_file}")
     with open(metadata_file, "w") as output_fh:
-        output_fh.write(f"{results[0].keys()}\n")
-        for result in results:
-            if result:
-                output_fh.write(f"{result}\n")
+        df = pd.DataFrame.from_dict(results)
+        if not include_empty:
+            logging.debug(f"Removing empty columns from {metadata_file}")
+            df.replace("", float("NaN"), inplace=True)
+            df.dropna(inplace=True, how="all", axis=1)
+            df.replace(float("NaN"), "", inplace=True)
+        df.to_csv(output_fh, sep="\t", index=False)
 
     logging.info(f"Writing accessions to {accessions_file}")
     with open(accessions_file, "w") as output_fh:
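
The new pandas-based writer drops columns that are empty across all rows unless --include-empty is given. A minimal sketch of that round-trip (the records are hypothetical):

import pandas as pd

df = pd.DataFrame([
    {"run_accession": "SRR0000001", "broker_name": "", "read_count": "100"},
    {"run_accession": "SRR0000002", "broker_name": "", "read_count": "200"},
])
df.replace("", float("NaN"), inplace=True)
df.dropna(inplace=True, how="all", axis=1)  # drops the all-empty "broker_name"
df.replace(float("NaN"), "", inplace=True)
print(df.columns.tolist())  # ['run_accession', 'read_count']
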