Skip to content

Commit

Permalink
feat(3.0.0): Docs / Refactor / NCBI / CLI
Browse files Browse the repository at this point in the history
BREAKING CHANGE: Refactored util.io -> util.disk and util.web
  • Loading branch information
aaronmussig committed May 10, 2022
1 parent 56739fe commit 71df0e5
Show file tree
Hide file tree
Showing 25 changed files with 298 additions and 57 deletions.
5 changes: 5 additions & 0 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,8 @@ sphinx-rtd-theme ~= 1.0.0
sphinx ~= 4.4.0
sphinx-autodoc-typehints ~= 1.12.0
myst-parser ~= 0.16.1
attrs ~= 21.4.0
tqdm ~= 4.64.0
sphinx-click ~= 4.0.3
typer ~= 0.4.1
biopython ~= 1.79
8 changes: 8 additions & 0 deletions docs/source/cli/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
***
CLI
***


.. click:: magna.__main__:typer_click_object
:prog: magna
:nested: full
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
# ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon',
'sphinx_rtd_theme', 'sphinx_autodoc_typehints',
'myst_parser']
'myst_parser', 'sphinx_click']

# Napoleon settings
napoleon_google_docstring = True
Expand Down
4 changes: 2 additions & 2 deletions docs/source/hmmer/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ HMMER
TopHit file
-----------

.. autoclass:: magna.hmmer.Hit
.. autoclass:: magna.hmmer.tophit.Hit
:members:

.. autoclass:: magna.hmmer.TopHitFile
.. autoclass:: magna.hmmer.tophit.TopHitFile
:members:
27 changes: 26 additions & 1 deletion docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@ Magna is a collection of bioinformatic datasets and utilities I use in my everyd

This has been written with the intention of personal use, but feel free to use/contribute.

.. toctree::
:caption: CLI
:titlesonly:

cli/index



.. toctree::
:caption: GTDB
:titlesonly:
Expand All @@ -19,6 +27,7 @@ This has been written with the intention of personal use, but feel free to use/c
gtdb/markers
gtdb/enums


.. toctree::
:caption: GUNC
:titlesonly:
Expand All @@ -45,11 +54,27 @@ This has been written with the intention of personal use, but feel free to use/c
tigrfam/index


.. toctree::
:caption: NCBI
:titlesonly:

ncbi/accession
ncbi/web


.. toctree::
:caption: HMMER
:titlesonly:

hmmer/index


.. toctree::
:caption: Utility
:titlesonly:

util/io
util/disk
util/web
util/accession
util/tree
util/pandas
Expand Down
6 changes: 6 additions & 0 deletions docs/source/ncbi/accession.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
*********
Accession
*********


.. autofunction:: magna.ncbi.accession.is_valid_ncbi_gid
14 changes: 14 additions & 0 deletions docs/source/ncbi/web.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
***
Web
***



.. autofunction:: magna.ncbi.web.get_ncbi_assembly_id

.. autofunction:: magna.ncbi.web.get_ncbi_ftp_root

.. autofunction:: magna.ncbi.web.get_md5checksums

.. autofunction:: magna.ncbi.web.download_ncbi_assembly_file_to_disk

2 changes: 1 addition & 1 deletion docs/source/pfam/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ PFAM
TopHit file
-----------

.. autoclass:: magna.pfam.TopHitPfamFile
.. autoclass:: magna.pfam.tophit.TopHitPfamFile
:members:


Expand Down
2 changes: 1 addition & 1 deletion docs/source/tigrfam/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ TIGRFAM
TopHit file
-----------

.. autoclass:: magna.tigrfam.TopHitTigrFile
.. autoclass:: magna.tigrfam.tophit.TopHitTigrFile
:members:


Expand Down
17 changes: 17 additions & 0 deletions docs/source/util/disk.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
****
Disk
****

.. autofunction:: magna.util.disk.untar


.. autofunction:: magna.util.disk.md5sum


.. autofunction:: magna.util.disk.cache_file


.. autofunction:: magna.util.disk.copy_file


.. autofunction:: magna.util.disk.move_file
20 changes: 0 additions & 20 deletions docs/source/util/io.rst

This file was deleted.

7 changes: 7 additions & 0 deletions docs/source/util/web.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
***
Web
***


.. autofunction:: magna.util.web.download_file

14 changes: 14 additions & 0 deletions magna/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import typer

from magna.cli.ncbi import app as ncbi_app

# Root command-line application; sub-command groups are attached below.
app = typer.Typer()

# Expose the NCBI commands under the ``ncbi`` sub-command group.
app.add_typer(ncbi_app, name='ncbi')

# Purely for documentation: a Click command object built from the Typer app.
# It must be created after every sub-application has been registered above.
typer_click_object = typer.main.get_command(app)


if __name__ == "__main__":
    # Run the CLI only when this module is executed as a script.
    app()
Empty file added magna/cli/__init__.py
Empty file.
10 changes: 10 additions & 0 deletions magna/cli/ncbi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import typer

from magna.ncbi.web import download_ncbi_assembly_file_to_disk, NcbiAssemblyFileType

# Typer application holding the NCBI-related commands.
app = typer.Typer()


@app.command()
def download(gid: str, target: str, file: NcbiAssemblyFileType = NcbiAssemblyFileType.fna):
    """Download an NCBI assembly file for the accession GID to the path TARGET."""
    # The docstring doubles as the command's help text, so it is kept to a
    # single line. Arguments are forwarded by keyword so that they cannot be
    # silently transposed if the callee's signature is ever reordered.
    download_ncbi_assembly_file_to_disk(gid=gid, target=target, file=file)
2 changes: 1 addition & 1 deletion magna/gtdb/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from Bio.SeqRecord import SeqRecord

from magna.gtdb.enums import GtdbRelease
from magna.util.io import cache_file
from magna.util.disk import cache_file


class Genome:
Expand Down
3 changes: 2 additions & 1 deletion magna/gtdb/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import pandas as pd

from magna.config import MAGNA_DIR
from magna.util.io import download_file, md5sum, untar
from magna.util.disk import md5sum, untar
from magna.util.web import download_file


class GtdbMetadata:
Expand Down
3 changes: 2 additions & 1 deletion magna/gtdb/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import dendropy

from magna.config import MAGNA_DIR
from magna.util.io import download_file, md5sum
from magna.util.disk import md5sum
from magna.util.web import download_file


class GtdbTree:
Expand Down
2 changes: 1 addition & 1 deletion magna/gunc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pandas as pd

from magna.config import MAGNA_DIR
from magna.util.io import download_file
from magna.util.web import download_file


def read_contig_assignments_tsv(path: str) -> pd.DataFrame:
Expand Down
Empty file added magna/ncbi/__init__.py
Empty file.
22 changes: 22 additions & 0 deletions magna/ncbi/accession.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import re

# An NCBI assembly accession: "GCA_" or "GCF_", nine digits, a dot and a
# version made of one or more digits (e.g. GCF_000001405.40).
RE_NCBI_ACCESSION = re.compile(r'(GC[AF]_\d{9}\.\d+)')


def is_valid_ncbi_gid(gid: str) -> bool:
    """Check if the NCBI accession matches the expected format.

    The whole string must be an accession: a valid prefix followed by any
    other characters is rejected, and multi-digit versions are accepted.

    Args:
        gid: The NCBI accession to check.

    Returns:
        True if the accession is valid, False otherwise.

    Examples:
        >>> is_valid_ncbi_gid('GC_000001.1')
        False
        >>> is_valid_ncbi_gid('GCA_123456789.1')
        True
        >>> is_valid_ncbi_gid('GCA_123456789.1_trailing')
        False
    """
    # fullmatch (not match) so that trailing garbage is not silently accepted.
    return RE_NCBI_ACCESSION.fullmatch(gid) is not None
122 changes: 122 additions & 0 deletions magna/ncbi/web.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import os
import re
import tempfile
import urllib
import urllib.request
from enum import Enum
from typing import Dict, Tuple
from urllib.parse import urljoin

from magna.ncbi.accession import is_valid_ncbi_gid
from magna.util.disk import move_file
from magna.util.web import download_file

# Captures assembly directory names (an accession followed by further
# characters) from the anchor tags of a directory listing page.
RE_NCBI_ASSEMBLY = re.compile(r'<a href="(GC[AF]_\d{9}\.\d.+?)\/">')


def get_ncbi_assembly_id(gid: str) -> str:
    """Return the assembly ID for a given NCBI accession.

    The directory listing for the accession is fetched over HTTPS and the
    single assembly directory name found in it is returned.

    Args:
        gid: The NCBI accession.

    Returns:
        The NCBI assembly.

    Raises:
        ValueError: If ``gid`` is not a valid NCBI accession.
        Exception: If the listing contains no assembly directory.
        NotImplementedError: If the listing contains more than one assembly
            directory (choosing between them is not supported).

    Examples:
        >>> get_ncbi_assembly_id('GCA_003138775.1')
        'GCA_003138775.1_20110800_S2D'
    """
    if not is_valid_ncbi_gid(gid):
        raise ValueError(f'Invalid NCBI accession: {gid}')
    base = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/'
    # e.g. GCA_003138775.1 -> GCA/003/138/775
    url = urljoin(base, f'{gid[0:3]}/{gid[4:7]}/{gid[7:10]}/{gid[10:13]}')

    # Use a context manager so the HTTP response is always closed.
    with urllib.request.urlopen(url) as response:
        listing = response.read().decode('utf-8')

    hits = RE_NCBI_ASSEMBLY.findall(listing)
    if not hits:
        raise Exception(f'No hits found: {url}')
    if len(hits) > 1:
        # NotImplemented is a constant, not an exception; calling it would
        # raise a confusing TypeError instead of reporting the real problem.
        raise NotImplementedError(f'Found multiple hits: {hits}')
    return hits[0]


def get_ncbi_ftp_root(gid: str) -> Tuple[str, str]:
    """Resolve the assembly directory URL and assembly ID of an accession.

    Args:
        gid: The NCBI accession.

    Returns:
        A ``(url, assembly)`` tuple: the URL of the assembly directory
        (with a trailing slash) and the assembly ID.

    Examples:
        >>> get_ncbi_ftp_root('GCA_003138775.1')
        ('https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003/138/775/GCA_003138775.1_20110800_S2D/', 'GCA_003138775.1_20110800_S2D')
    """
    assembly_id = get_ncbi_assembly_id(gid)
    # Path pieces: accession prefix, three digit triplets, the assembly
    # directory, and an empty tail so the joined path ends with a slash.
    segments = (gid[0:3], gid[4:7], gid[7:10], gid[10:13], assembly_id, '')
    root_url = urljoin('https://ftp.ncbi.nlm.nih.gov/genomes/all/', '/'.join(segments))
    return root_url, assembly_id


def get_md5checksums(url: str) -> Dict[str, str]:
    """Retrieve the md5checksums.txt file and parse the content.

    Each non-blank line is expected to hold a checksum followed by a file
    name, separated by whitespace. A leading ``./`` on the file name is
    removed. Blank lines are skipped.

    Args:
        url: The URL to the md5checksums.txt file.

    Returns:
        A dictionary mapping file name to md5 checksum.

    Raises:
        ValueError: If a non-blank line does not contain both a checksum and
            a file name.

    Examples:
        >>> get_md5checksums('https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003/138/775/GCA_003138775.1_20110800_S2D/md5checksums.txt')
        {'GCA_003138775.1_20110800_S2D.fna.gz': 'f9f8f8f8f8f8f8f8f8f8f8f8f8f8f8', ...}
    """
    # Use a context manager so the response is always closed.
    with urllib.request.urlopen(url) as response:
        content = response.read().decode('utf-8')

    out = {}
    for line in content.splitlines():
        line = line.strip()
        if not line:
            continue
        # Split only once so file names containing whitespace stay intact.
        parts = line.split(maxsplit=1)
        if len(parts) != 2:
            raise ValueError(f'Malformed md5checksums line: {line!r}')
        md5, file = parts
        if file.startswith('./'):
            file = file[2:]
        out[file] = md5
    return out


class NcbiAssemblyFileType(str, Enum):
    """File types that can be requested from an NCBI assembly directory."""

    # Selects the ``<assembly>_genomic.fna.gz`` file when downloading.
    fna = 'fna'


def download_ncbi_assembly_file_to_disk(gid: str, target: str, file: NcbiAssemblyFileType):
    """Download a file from the NCBI assembly directory to disk.

    If ``target`` already exists as a file, nothing is done. Otherwise the
    file is downloaded into a temporary directory and then moved to
    ``target``.

    Args:
        gid: The NCBI accession.
        target: The target path. May be a bare file name, in which case the
            file is written relative to the current working directory.
        file: The file type to download.

    Raises:
        NotImplementedError: If ``file`` is not a supported file type.
        KeyError: If the requested file is not listed in md5checksums.txt.

    Examples:
        >>> download_ncbi_assembly_file_to_disk('GCA_003138775.1', '/tmp/foo.fna.gz', NcbiAssemblyFileType.fna)
    """
    # Nothing to do if the target has already been downloaded.
    if os.path.isfile(target):
        return

    # Create the parent directory if it doesn't exist. A bare file name has
    # no directory component, and os.makedirs('') would raise.
    target_dir = os.path.dirname(target)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Generate the paths
    root, assembly = get_ncbi_ftp_root(gid)
    md5s = get_md5checksums(urljoin(root, 'md5checksums.txt'))
    if file == NcbiAssemblyFileType.fna:
        name = f'{assembly}_genomic.fna.gz'
        url = urljoin(root, name)
        md5 = md5s[name]
    else:
        raise NotImplementedError(f'File type not implemented: {file}')

    # Download to a temporary location and verify the md5
    with tempfile.TemporaryDirectory() as tmpdir:
        target_tmp = os.path.join(tmpdir, name)
        download_file(url, target_tmp, md5)
        move_file(target_tmp, target, checksum=True)

0 comments on commit 71df0e5

Please sign in to comment.