-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(3.0.0): Docs / Refactor / NCBI / CLI
BREAKING CHANGE: Refactored util.io -> util.web
- Loading branch information
1 parent
56739fe
commit 71df0e5
Showing
25 changed files
with
298 additions
and
57 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
*** | ||
CLI | ||
*** | ||
|
||
|
||
.. click:: magna.__main__:typer_click_object
   :prog: magna
   :nested: full
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
********* | ||
Accession | ||
********* | ||
|
||
|
||
.. autofunction:: magna.ncbi.accession.is_valid_ncbi_gid |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
*** | ||
Web | ||
*** | ||
|
||
|
||
|
||
.. autofunction:: magna.ncbi.web.get_ncbi_assembly_id | ||
|
||
.. autofunction:: magna.ncbi.web.get_ncbi_ftp_root | ||
|
||
.. autofunction:: magna.ncbi.web.get_md5checksums | ||
|
||
.. autofunction:: magna.ncbi.web.download_ncbi_assembly_file_to_disk | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
************ | ||
Input/Output | ||
************ | ||
|
||
.. autofunction:: magna.util.disk.untar | ||
|
||
|
||
.. autofunction:: magna.util.disk.md5sum | ||
|
||
|
||
.. autofunction:: magna.util.disk.cache_file | ||
|
||
|
||
.. autofunction:: magna.util.disk.copy_file | ||
|
||
|
||
.. autofunction:: magna.util.disk.move_file |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
*** | ||
Web | ||
*** | ||
|
||
|
||
.. autofunction:: magna.util.web.download_file | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
import typer | ||
|
||
from magna.cli.ncbi import app as ncbi_app | ||
|
||
# Root Typer application; command groups are attached to it below.
app = typer.Typer()

# Mount the NCBI command group so its commands are reached as ``ncbi <command>``.
app.add_typer(ncbi_app, name='ncbi')

# Purely for documentation
# Built at import time from the Typer app above.
# NOTE(review): presumably consumed by a Click-based docs directive -- confirm.
typer_click_object = typer.main.get_command(app)


if __name__ == "__main__":
    # Only start the CLI when this module is executed directly.
    app()
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
import typer | ||
|
||
from magna.ncbi.web import download_ncbi_assembly_file_to_disk, NcbiAssemblyFileType | ||
|
||
app = typer.Typer()


@app.command()
def download(gid: str, target: str, file: NcbiAssemblyFileType = NcbiAssemblyFileType.fna):
    # Thin CLI wrapper: forward the parsed arguments, unchanged, to the
    # library routine that performs the download.
    download_ncbi_assembly_file_to_disk(gid=gid, target=target, file=file)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
import re | ||
|
||
# Accession layout: "GCA_" or "GCF_", nine digits, a dot, then the version
# number. The version may have more than one digit (e.g. ".15"), hence ``\d+``.
RE_NCBI_ACCESSION = re.compile(r'(GC[AF]_\d{9}\.\d+)')


def is_valid_ncbi_gid(gid: str) -> bool:
    """Check if the NCBI accession matches the expected format.

    The whole string must be an accession: leading or trailing characters
    (for example an assembly-name suffix) make the value invalid.

    Args:
        gid: The NCBI accession to check.

    Returns:
        True if the accession is valid, False otherwise.

    Examples:
        >>> is_valid_ncbi_gid('GC_000001.1')
        False
        >>> is_valid_ncbi_gid('GCA_123456789.1')
        True
        >>> is_valid_ncbi_gid('GCA_123456789.1_extra')
        False
    """
    # fullmatch (rather than match) so that trailing text is rejected.
    return RE_NCBI_ACCESSION.fullmatch(gid) is not None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
import os | ||
import re | ||
import tempfile | ||
import urllib | ||
import urllib.request | ||
from enum import Enum | ||
from typing import Dict, Tuple | ||
from urllib.parse import urljoin | ||
|
||
from magna.ncbi.accession import is_valid_ncbi_gid | ||
from magna.util.disk import move_file | ||
from magna.util.web import download_file | ||
|
||
# Captures the target of an HTML anchor whose href starts with an accession
# (GCA_/GCF_, nine digits, dot, digit) and ends with a trailing slash.
RE_NCBI_ASSEMBLY = re.compile(r'<a href="(GC[AF]_\d{9}\.\d.+?)\/">')


def get_ncbi_assembly_id(gid: str) -> str:
    """Return the assembly ID for a given NCBI accession.

    The accession is turned into a directory URL under the NCBI genomes
    tree, the page at that URL is fetched, and the single assembly link
    found in it is returned.

    Args:
        gid: The NCBI accession.

    Returns:
        The NCBI assembly.

    Raises:
        ValueError: If ``gid`` is not a valid NCBI accession.
        Exception: If the fetched page contains no assembly link.
        NotImplementedError: If the fetched page contains more than one
            assembly link.

    Examples:
        >>> get_ncbi_assembly_id('GCA_003138775.1')
        'GCA_003138775.1_20110800_S2D'
    """
    if not is_valid_ncbi_gid(gid):
        raise ValueError(f'Invalid NCBI accession: {gid}')
    base = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/'
    # <prefix>/<digits 1-3>/<digits 4-6>/<digits 7-9>; index 3 is the underscore.
    url = urljoin(base, f'{gid[0:3]}/{gid[4:7]}/{gid[7:10]}/{gid[10:13]}')

    # Close the HTTP response as soon as the page has been read.
    with urllib.request.urlopen(url) as response:
        listing = response.read().decode('utf-8')

    # NOTE(review): the URL does not include the accession version, so the
    # page presumably lists every version of this accession -- confirm whether
    # hits should be filtered by ``gid`` instead of rejecting multiple hits.
    hits = RE_NCBI_ASSEMBLY.findall(listing)
    if not hits:
        raise Exception(f'No hits found: {url}')
    if len(hits) > 1:
        # NotImplemented is a non-callable constant; the exception class is
        # NotImplementedError.
        raise NotImplementedError(f'Found multiple hits: {hits}')
    return hits[0]
|
||
|
||
def get_ncbi_ftp_root(gid: str) -> Tuple[str, str]:
    """Build the assembly directory URL for an NCBI accession.

    Args:
        gid: The NCBI accession.

    Returns:
        A ``(url, assembly)`` pair: the URL of the assembly directory
        (with a trailing slash) and the assembly ID.

    Examples:
        >>> get_ncbi_ftp_root('GCA_003138775.1')
        ('https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003/138/775/GCA_003138775.1_20110800_S2D/', 'GCA_003138775.1_20110800_S2D')
    """
    assembly_id = get_ncbi_assembly_id(gid)
    # Directory layout: <prefix>/<digits 1-3>/<digits 4-6>/<digits 7-9>/<assembly>/
    segments = (gid[0:3], gid[4:7], gid[7:10], gid[10:13], assembly_id)
    relative_path = '/'.join(segments) + '/'
    root_url = urljoin('https://ftp.ncbi.nlm.nih.gov/genomes/all/', relative_path)
    return root_url, assembly_id
|
||
|
||
def get_md5checksums(url: str) -> Dict[str, str]:
    """Retrieve the md5checksums.txt file and parse the content.

    Each non-blank line is expected to hold a checksum followed by a file
    name, separated by whitespace. A leading ``./`` on the file name is
    removed. Blank lines are ignored.

    Args:
        url: The URL to the md5checksums.txt file.

    Returns:
        A dictionary mapping file name to md5 checksum.

    Raises:
        ValueError: If a non-blank line does not contain both a checksum
            and a file name.

    Examples:
        >>> get_md5checksums('https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003/138/775/GCA_003138775.1_20110800_S2D/md5checksums.txt')
        {'GCA_003138775.1_20110800_S2D.fna.gz': 'f9f8f8f8f8f8f8f8f8f8f8f8f8f8f8', ...}
    """
    # Close the response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        content = response.read().decode('utf-8')

    checksums = {}
    for line in content.splitlines():
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing empty line).
            continue
        # Split once only, so a file name containing spaces stays intact.
        md5, name = line.split(maxsplit=1)
        if name.startswith('./'):
            name = name[2:]
        checksums[name] = md5
    return checksums
|
||
|
||
class NcbiAssemblyFileType(str, Enum):
    """File types that can be requested from an NCBI assembly directory.

    Members are also ``str`` instances, so they compare equal to their
    plain string value.
    """

    # Selects the ``<assembly>_genomic.fna.gz`` file of an assembly.
    fna = 'fna'
|
||
|
||
def download_ncbi_assembly_file_to_disk(gid: str, target: str, file: NcbiAssemblyFileType):
    """Download a file from the NCBI assembly directory to disk.

    If ``target`` already exists as a file, nothing is downloaded. Otherwise
    the file is downloaded into a temporary directory and then moved to
    ``target``.

    Args:
        gid: The NCBI accession.
        target: The target path. May be a bare file name, in which case the
            file is written relative to the current working directory.
        file: The file type to download.

    Raises:
        NotImplementedError: If ``file`` is not a supported file type.
        KeyError: If the requested file is not listed in md5checksums.txt.

    Examples:
        >>> download_ncbi_assembly_file_to_disk('GCA_003138775.1', '/tmp/foo.fna.gz', NcbiAssemblyFileType.fna)
    """
    # Nothing to do if the target file is already present
    if os.path.isfile(target):
        return

    # Create the directory if it doesn't exist. A bare file name has an empty
    # dirname, and os.makedirs('') raises FileNotFoundError, so skip it then.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Generate the paths
    root, assembly = get_ncbi_ftp_root(gid)
    md5s = get_md5checksums(urljoin(root, 'md5checksums.txt'))
    if file == NcbiAssemblyFileType.fna:
        name = f'{assembly}_genomic.fna.gz'
        url = urljoin(root, name)
        md5 = md5s[name]
    else:
        raise NotImplementedError(f'File type not implemented: {file}')

    # Download to a temporary location and verify the md5
    with tempfile.TemporaryDirectory() as tmpdir:
        target_tmp = os.path.join(tmpdir, name)
        download_file(url, target_tmp, md5)
        # NOTE(review): checksum=True presumably verifies the moved copy -- confirm
        # against magna.util.disk.move_file.
        move_file(target_tmp, target, checksum=True)
Oops, something went wrong.