Skip to content

Commit

Permalink
feat(GTDB): Add R202 and Genome.
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronmussig committed Feb 16, 2022
1 parent 9136640 commit 31e5bc3
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 3 deletions.
5 changes: 5 additions & 0 deletions magna/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
import os
import tempfile
from pathlib import Path

# Persistent cache: a ".magna" directory inside the current user's home
# directory (as returned by Path.home()).
MAGNA_DIR = os.path.join(Path.home(), '.magna')

# Temporary cache: a "magna" directory inside the system temporary directory
# (as returned by tempfile.gettempdir()).
CACHE_DIR = os.path.join(tempfile.gettempdir(), 'magna')
File renamed without changes.
10 changes: 10 additions & 0 deletions magna/gtdb/enums.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from enum import Enum


class GtdbRelease(Enum):
    """GTDB releases known to this package.

    Each member's value is the release number as a string (e.g. ``'202'``),
    suitable for interpolating into release-specific names and paths.
    """
    R80 = '80'
    R83 = '83'
    R86 = '86'
    R89 = '89'
    R95 = '95'
    R202 = '202'
60 changes: 60 additions & 0 deletions magna/gtdb/genome.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import os
from typing import Dict, Tuple

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

from magna.gtdb.enums import GtdbRelease
from magna.io import cache_file


class Genome:
    """A genome identified by accession, backed by a genome directory on disk.

    The FASTA files are expected to live directly inside ``root`` and to be
    prefixed with the name of that directory:

    * ``<root>/<dirname>_cds_from_genomic.fna`` (``cds_path``)
    * ``<root>/<dirname>_genomic.fna`` (``fna_path``)

    Args:
        accession: The genome accession; also used as the ``repr``.
        root: Path to the genome directory. A trailing path separator is
            tolerated.
    """

    def __init__(self, accession: str, root: str):
        self.accession: str = accession
        self.root: str = root

        # Generate paths. os.path.basename() returns '' for a path that ends
        # with a separator (e.g. '/a/b/'), which would produce file names such
        # as '_genomic.fna', so strip trailing separators before taking it.
        base = os.path.basename(self.root.rstrip(os.sep))
        self.cds_path: str = os.path.join(self.root, f'{base}_cds_from_genomic.fna')
        self.fna_path: str = os.path.join(self.root, f'{base}_genomic.fna')

    def __repr__(self):
        return str(self.accession)

    @staticmethod
    def _read_fasta(path: str) -> Tuple[SeqRecord, ...]:
        """Parse the FASTA file at ``path`` and return all of its records."""
        with open(path, 'r') as f:
            # Materialise the records before the file handle is closed.
            return tuple(SeqIO.parse(f, 'fasta'))

    def cds_seqio(self) -> Tuple[SeqRecord, ...]:
        """Return the records of the CDS file (``cds_path``).

        Raises:
            OSError: If the file cannot be opened.
        """
        return self._read_fasta(self.cds_path)

    def fna_seqio(self) -> Tuple[SeqRecord, ...]:
        """Return the records of the genomic FNA file (``fna_path``).

        Raises:
            OSError: If the file cannot be opened.
        """
        return self._read_fasta(self.fna_path)


class GenomeDirs:
    """Lookup from genome accession to genome directory for one GTDB release.

    On construction the release's ``genome_dirs.tsv`` is copied from the
    server path into the local cache (via ``cache_file``) and loaded into
    memory.

    Args:
        release: The GTDB release whose genome directory index is loaded.
    """

    def __init__(self, release: GtdbRelease):
        self.release = release

        # Create the paths
        srv_path = f'/srv/db/gtdb/genomes/ncbi/release{release.value}/genome_dirs.tsv'
        cache_path = cache_file(srv_path, f'genome_dirs_{self.release.value}.tsv')

        # Read the data
        self._data = self.read(cache_path)

    @staticmethod
    def read(path: str) -> Dict[str, str]:
        """Parse a genome directory index into an ``{accession: root}`` dict.

        Each non-blank line must contain exactly three tab-separated columns;
        the first is used as the key and the second as the value. The third
        column is validated for presence but otherwise ignored (it may be
        empty). Blank lines are skipped.

        Args:
            path: Path to the tab-separated file.

        Returns:
            Mapping of the first column to the second column.

        Raises:
            ValueError: If a non-blank line does not have three columns.
            OSError: If the file cannot be opened.
        """
        out = {}
        with open(path, 'r') as f:
            for line_no, line in enumerate(f, start=1):
                if not line.strip():
                    continue
                # Only drop the line terminator before splitting: stripping
                # all whitespace would also eat a trailing tab and hide an
                # empty last column.
                fields = line.rstrip('\r\n').split('\t')
                if len(fields) != 3:
                    raise ValueError(
                        f'{path}:{line_no}: expected 3 tab-separated columns, '
                        f'found {len(fields)}'
                    )
                short, root, _canonical = (field.strip() for field in fields)
                out[short] = root
        return out

    def get(self, accession: str) -> Genome:
        """Return the ``Genome`` for ``accession``.

        Raises:
            KeyError: If the accession is not present in the index.
        """
        return Genome(accession=accession, root=self._data[accession])
36 changes: 33 additions & 3 deletions magna/dataset/gtdb/metadata.py → magna/gtdb/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from magna.io import download_file, md5sum, untar


class _GtdbMetadataR95:
class _GtdbMetadata:

def __init__(self, source: str, path: str, md5: str):
self.source = source
Expand Down Expand Up @@ -43,7 +43,9 @@ def _download(self):
df.to_feather(path=self.path, compression='lz4')


class GtdbMetadataR95Arc(_GtdbMetadataR95):
# ----------------------------------------------------------------------------------------------------------------------

class GtdbMetadataR95Arc(_GtdbMetadata):
source = 'https://data.gtdb.ecogenomic.org/releases/release95/95.0/ar122_metadata_r95.tar.gz'
path = os.path.join(MAGNA_DIR, 'dataset', 'gtdb', 'metadata', 'ar122_metadata_r95.feather')
md5 = '110ad5daa2dbed2ee904b10c295da5dc'
Expand All @@ -52,7 +54,7 @@ def __init__(self):
super().__init__(self.source, self.path, self.md5)


class GtdbMetadataR95Bac(_GtdbMetadataR95):
class GtdbMetadataR95Bac(_GtdbMetadata):
source = 'https://data.gtdb.ecogenomic.org/releases/release95/95.0/bac120_metadata_r95.tar.gz'
path = os.path.join(MAGNA_DIR, 'dataset', 'gtdb', 'metadata', 'bac120_metadata_r95.feather')
md5 = '223ada02ffca4d1a2dda6edb9a164dcd'
Expand All @@ -65,3 +67,31 @@ class GtdbMetadataR95:

def __init__(self):
self.df = pd.concat([GtdbMetadataR95Arc().df, GtdbMetadataR95Bac().df])


# ----------------------------------------------------------------------------------------------------------------------

class GtdbMetadataR202Arc(_GtdbMetadata):
    """GTDB R202 ``ar122`` metadata.

    Holds the download URL, the local feather path under ``MAGNA_DIR`` and
    the expected MD5, and hands them to ``_GtdbMetadata`` on construction.
    """
    source = 'https://data.gtdb.ecogenomic.org/releases/release202/202.0/ar122_metadata_r202.tar.gz'
    path = os.path.join(MAGNA_DIR, 'dataset', 'gtdb', 'metadata', 'ar122_metadata_r202.feather')
    md5 = '0607728ae1f56bdb1a7cc24d238185c3'

    def __init__(self):
        super().__init__(source=self.source, path=self.path, md5=self.md5)


class GtdbMetadataR202Bac(_GtdbMetadata):
    """GTDB R202 ``bac120`` metadata.

    Holds the download URL, the local feather path under ``MAGNA_DIR`` and
    the expected MD5, and hands them to ``_GtdbMetadata`` on construction.
    """
    source = 'https://data.gtdb.ecogenomic.org/releases/release202/202.0/bac120_metadata_r202.tar.gz'
    path = os.path.join(MAGNA_DIR, 'dataset', 'gtdb', 'metadata', 'bac120_metadata_r202.feather')
    md5 = '68fed11eb688982edb6f4669476c2a10'

    def __init__(self):
        super().__init__(source=self.source, path=self.path, md5=self.md5)


class GtdbMetadataR202:
    """GTDB R202 metadata with the ``ar122`` and ``bac120`` tables concatenated.

    The combined table is exposed as ``df``.
    """

    def __init__(self):
        # The ar122 rows come first, followed by the bac120 rows.
        arc_df = GtdbMetadataR202Arc().df
        bac_df = GtdbMetadataR202Bac().df
        self.df = pd.concat([arc_df, bac_df])

# ----------------------------------------------------------------------------------------------------------------------
14 changes: 14 additions & 0 deletions magna/io.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import hashlib
import os
import shutil
import tarfile
import urllib
import urllib.request
from typing import Optional

from tqdm import tqdm

from magna.config import CACHE_DIR


def untar(file_path, dir_name):
"""
Expand Down Expand Up @@ -52,3 +56,13 @@ def download_file(url: str, path: str, md5: Optional[str] = None):

if md5 and md5 != md5sum(path):
raise ValueError('Hash mismatch')


def cache_file(srv_path: str, local_name: str) -> str:
    """Copy a file into the local cache directory and return the cached path.

    The file is copied only if ``local_name`` is not already present in
    ``CACHE_DIR``; an existing cached copy is returned as-is and is never
    refreshed.

    Args:
        srv_path: Path of the source file to copy.
        local_name: File name to store the copy under inside ``CACHE_DIR``.

    Returns:
        The path of the cached file.

    Raises:
        OSError: If the source cannot be read or the cache cannot be written.
    """
    # exist_ok avoids the check-then-create race when several processes
    # populate the cache at the same time.
    os.makedirs(CACHE_DIR, exist_ok=True)
    local_path = os.path.join(CACHE_DIR, local_name)
    if os.path.isfile(local_path):
        return local_path

    # Copy to a process-specific temporary name and rename into place, so an
    # interrupted copy can never be mistaken for a complete cache entry.
    tmp_path = f'{local_path}.{os.getpid()}.tmp'
    try:
        shutil.copyfile(srv_path, tmp_path)
        os.replace(tmp_path, local_path)
    finally:
        # Only present if the copy or rename failed part-way through.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return local_path

0 comments on commit 31e5bc3

Please sign in to comment.