Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

parse repophlan demo #1 #15

Merged
merged 4 commits into from
May 24, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion genomesubsampler/genomeSubsampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

@click.command()
def _main():
""" Main front-end of the genome-subsampler
"""Main front-end of the genome-subsampler.
"""
click.echo('Task completed.')

Expand Down
60 changes: 60 additions & 0 deletions genomesubsampler/parseRepophlan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env python

# ----------------------------------------------------------------------------
# Copyright (c) 2017--, genome-subsampler development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

#
# parser for RepoPhlAn-downloaded genomes
#

import click
import pandas as pd


def parse_repophlan(repophlan_wscores_fp):
    """Compute basic statistics of RepoPhlAn-downloaded genomes.

    Parameters
    ----------
    repophlan_wscores_fp : str
        File path to RepoPhlAn summary table with scores.

    Returns
    -------
    list of str
        Human-readable report of basic statistics of genomes, one statistic
        per element, in this order: total genomes, RefSeq genomes, and the
        number of genomes having fna, faa, ffn and frn entries.

    Raises
    ------
    KeyError
        If the table lacks one of the columns read here
        (``assembly_accession``, ``fna_lname``, ``faa_lname``,
        ``ffn_lname``, ``frn_lname``).

    Notes
    -----
    A genome is counted as RefSeq when its ``assembly_accession`` starts with
    ``GCF_``. Rows with a missing accession are counted as non-RefSeq.
    """
    # Tab-delimited table; the first column holds the genome identifiers.
    df = pd.read_table(repophlan_wscores_fp, index_col=0, header=0)

    # Cast to str before using the .str accessor: a column without any value
    # is parsed as float64 (all NaN), on which .str raises AttributeError.
    # `na=False` makes missing accessions count as "not RefSeq" explicitly.
    accessions = df['assembly_accession'].astype(str)
    n_refseq = accessions.str.startswith('GCF_', na=False).sum()

    out = ['Total number of genomes: %s.' % df.shape[0],
           'Number of RefSeq genomes: %s.' % n_refseq]

    # Series.count() ignores missing values, i.e., it yields the number of
    # genomes for which the corresponding file name is recorded.
    per_file_type = (
        ('With genome sequences (fna): %s.', 'fna_lname'),
        ('With protein sequences (faa): %s.', 'faa_lname'),
        ('With protein-coding DNA sequences (ffn): %s.', 'ffn_lname'),
        ('With RNA-coding DNA sequences (frn): %s.', 'frn_lname'),
    )
    for template, column in per_file_type:
        out.append(template % df[column].count())
    return out


@click.command()
@click.option('--repophlan-wscores-fp', required=True,
              type=click.Path(resolve_path=True, readable=True, exists=True,
                              file_okay=True, dir_okay=False),
              help='RepoPhlAn summary table with scores')
def _main(repophlan_wscores_fp):
    """Parser for RepoPhlAn-downloaded genomes.
    """
    # NOTE: click uses the docstring above as the command's --help text, so
    # implementation notes are kept in comments instead.
    #
    # `dir_okay=False` makes click reject a directory path up front with a
    # usage error, rather than letting it through to the table parser.
    out = parse_repophlan(repophlan_wscores_fp)
    # Print the report (one statistic per line), then a completion marker.
    click.echo('\n'.join(out))
    click.echo('Task completed.')


if __name__ == "__main__":
    _main()
11 changes: 11 additions & 0 deletions genomesubsampler/tests/data/repophlan_microbes_wscores.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#genome all_coding_data all_data asm_name ass_id assembly_accession assembly_level bioproject biosample dwlf faa_lname ffn_lname fna_lname frn_lname ftp_path gbrs_paired_asm genome genome_rep infraspecific_name isolate organism_name outdir paired_asm_comp refseq_category release_type score_faa score_fna score_rrna score_trna seq_rel_date species_taxid submitter taxid taxonomy version_status
G000007525 Y Y ASM752v1 G000007525 GCF_000007525.1 Complete Genome PRJNA57939 SAMN02603675 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/007/525/GCF_000007525.1_ASM752v1 microbes/faa/G000007525.faa.bz2 microbes/ffn/G000007525.ffn.bz2 microbes/fna/G000007525.fna.bz2 microbes/frn/G000007525.frn.bz2 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/007/525/GCF_000007525.1_ASM752v1 GCA_000007525.1 G000007525 Full strain=NCC2705 Bifidobacterium longum NCC2705 microbes identical reference genome Major 1 0.996 0.9 1 2005/01/21 216816 Nestle Research Center, Switzerland 206672 k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium|s__Bifidobacterium_longum|t__Bifidobacterium_longum_NCC2705 latest
G000010305 Y Y ASM1030v1 G000010305 GCF_000010305.1 Complete Genome PRJNA224116 SAMD00060909 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/010/305/GCF_000010305.1_ASM1030v1 microbes/faa/G000010305.faa.bz2 microbes/ffn/G000010305.ffn.bz2 microbes/fna/G000010305.fna.bz2 microbes/frn/G000010305.frn.bz2 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/010/305/GCF_000010305.1_ASM1030v1 GCA_000010305.1 G000010305 Full strain=T-27 Gemmatimonas aurantiaca T-27 microbes identical representative genome Major 1 1 1 1 2009/04/01 173480 National Institute of Technology and Evaluation 379066 k__Bacteria|p__Gemmatimonadetes|c__Gemmatimonadetes|o__Gemmatimonadales|f__Gemmatimonadaceae|g__Gemmatimonas|s__Gemmatimonas_aurantiaca|t__Gemmatimonas_aurantiaca_T_27 latest
G000010365 Y Y ASM1036v1 G000010365 GCA_000010365.1 Complete Genome PRJNA17977 SAMD00061085 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/010/365/GCA_000010365.1_ASM1036v1 microbes/faa/G000010365.faa.bz2 microbes/ffn/G000010365.ffn.bz2 microbes/fna/G000010365.fna.bz2 microbes/frn/G000010365.frn.bz2 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/010/365/GCA_000010365.1_ASM1036v1 GCF_000010365.1 G000010365 Full strain=PV Candidatus Carsonella ruddii PV microbes identical na Major 0.72 1 0.9 1 2006/10/16 114186 Kitasato Institute for Life Sciences 387662 k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Oceanospirillales|f__Halomonadaceae|g__Candidatus_Carsonella|s__Candidatus_Carsonella_ruddii|t__Candidatus_Carsonella_ruddii_PV latest
G000018865 Y Y ASM1886v1 G000018865 GCF_000018865.1 Complete Genome PRJNA57657 SAMN02598539 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/018/865/GCF_000018865.1_ASM1886v1 microbes/faa/G000018865.faa.bz2 microbes/ffn/G000018865.ffn.bz2 microbes/fna/G000018865.fna.bz2 microbes/frn/G000018865.frn.bz2 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/018/865/GCF_000018865.1_ASM1886v1 GCA_000018865.1 G000018865 Full strain=J-10-fl Chloroflexus aurantiacus J-10-fl microbes identical reference genome Major 1 1 1 1 2007/12/20 1108 US DOE Joint Genome Institute 324602 k__Bacteria|p__Chloroflexi|c__Chloroflexia|o__Chloroflexales|f__Chloroflexaceae|g__Chloroflexus|s__Chloroflexus_aurantiacus|t__Chloroflexus_aurantiacus_J_10_fl latest
G000441575 Y Y ASM44157v1 G000441575 GCA_000441575.1 Complete Genome PRJNA39871 SAMN02603759 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/441/575/GCA_000441575.1_ASM44157v1 microbes/faa/G000441575.faa.bz2 microbes/ffn/G000441575.ffn.bz2 microbes/fna/G000441575.fna.bz2 microbes/frn/G000441575.frn.bz2 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/441/575/GCA_000441575.1_ASM44157v1 GCF_000441575.1 G000441575 Full strain=DC Candidatus Carsonella ruddii DC microbes identical na Major 0.73 1 0.9 1 2013/07/26 114186 RIKEN 667013 k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Oceanospirillales|f__Halomonadaceae|g__Candidatus_Carsonella|s__Candidatus_Carsonella_ruddii|t__Candidatus_Carsonella_ruddii_DC latest
G000025185 Y Y ASM2518v1 G000025185 GCF_000025185.1 Complete Genome PRJNA224116 SAMN00120219 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/025/185/GCF_000025185.1_ASM2518v1 microbes/faa/G000025185.faa.bz2 microbes/ffn/G000025185.ffn.bz2 microbes/fna/G000025185.fna.bz2 microbes/frn/G000025185.frn.bz2 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/025/185/GCF_000025185.1_ASM2518v1 GCA_000025185.1 G000025185 Full strain=DSM 6068 Pirellula staleyi DSM 6068 microbes identical representative genome Major 1 1 0.9 1 2010/01/07 125 US DOE Joint Genome Institute (JGI-PGF) 530564 k__Bacteria|p__Planctomycetes|c__Planctomycetia|o__Planctomycetales|f__Planctomycetaceae|g__Pirellula|s__Pirellula_staleyi|t__Pirellula_staleyi_DSM_6068 latest
G000158275 Y Y ASM15827v2 G000158275 GCF_000158275.2 Complete Genome PRJNA224116 SAMN02463711 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/158/275/GCF_000158275.2_ASM15827v2 microbes/faa/G000158275.faa.bz2 microbes/ffn/G000158275.ffn.bz2 microbes/fna/G000158275.fna.bz2 microbes/frn/G000158275.frn.bz2 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/158/275/GCF_000158275.2_ASM15827v2 GCA_000158275.2 G000158275 Full strain=7_1 Fusobacterium nucleatum subsp. animalis 7_1 microbes identical representative genome Major 1 0.996 1 1 2014/02/03 851 Broad Institute 457405 k__Bacteria|p__Fusobacteria|c__Fusobacteriia|o__Fusobacteriales|f__Fusobacteriaceae|g__Fusobacterium|s__Fusobacterium_nucleatum|t__Fusobacterium_nucleatum_subsp_animalis_7_1 latest
G000011545 Y Y ASM1154v1 G000011545 GCF_000011545.1 Complete Genome PRJNA57733 SAMEA1705938 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/011/545/GCF_000011545.1_ASM1154v1 microbes/faa/G000011545.faa.bz2 microbes/ffn/G000011545.ffn.bz2 microbes/fna/G000011545.fna.bz2 microbes/frn/G000011545.frn.bz2 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/011/545/GCF_000011545.1_ASM1154v1 GCA_000011545.1 G000011545 Full strain=K96243 Burkholderia pseudomallei K96243 microbes identical reference genome Major 1 0.999 0.9 1 2004/09/16 28450 Sanger Institute 272560 k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Burkholderiaceae|g__Burkholderia|s__Burkholderia_pseudomallei|t__Burkholderia_pseudomallei_K96243 latest
G000011705 Y Y ASM1170v1 G000011705 GCF_000011705.1 Complete Genome PRJNA57725 SAMN02603987 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/011/705/GCF_000011705.1_ASM1170v1 microbes/faa/G000011705.faa.bz2 microbes/ffn/G000011705.ffn.bz2 microbes/fna/G000011705.fna.bz2 microbes/frn/G000011705.frn.bz2 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/011/705/GCF_000011705.1_ASM1170v1 GCA_000011705.1 G000011705 Full strain=ATCC 23344 Burkholderia mallei ATCC 23344 microbes identical reference genome Major 0.96 0.998 0.9 1 2005/06/06 13373 TIGR 243160 k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Burkholderiaceae|g__Burkholderia|s__Burkholderia_mallei|t__Burkholderia_mallei_ATCC_23344 latest

8 changes: 4 additions & 4 deletions genomesubsampler/tests/test_genomeSubsampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,20 @@


class GenomeSubsamplerTests(TestCase):
""" Tests for genomeSubsampler.py """
"""Tests for genomeSubsampler.py."""

def setUp(self):
""" Create working directory and test files
"""Create working directory and test files.
"""
self.wkdir = mkdtemp()

def tearDown(self):
""" Delete working directory and test files
"""Delete working directory and test files.
"""
rmtree(self.wkdir)

def test__main(self):
""" Test for the main process following Click
"""Test for the main process following Click.
"""
params = []
res = CliRunner().invoke(_main, params)
Expand Down
58 changes: 58 additions & 0 deletions genomesubsampler/tests/test_parseRepophlan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2017--, genome-subsampler development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from unittest import TestCase, main
from click.testing import CliRunner
from shutil import rmtree
from tempfile import mkdtemp
from skbio.util import get_data_path

from genomesubsampler.parseRepophlan import (parse_repophlan,
_main)


class ParseRepophlanTests(TestCase):
    """Unit tests covering parseRepophlan.py."""

    def setUp(self):
        """Make a scratch directory and locate the demo summary table.
        """
        self.wkdir = mkdtemp()
        self.repophlan_fp = get_data_path('repophlan_microbes_wscores.txt')

    def tearDown(self):
        """Remove the scratch directory made in setUp.
        """
        rmtree(self.wkdir)

    def test_parse_repophlan(self):
        """Check the report lines returned by parse_repophlan.
        """
        expected_lines = BASIC_STATS.split('\n')
        observed_lines = parse_repophlan(self.repophlan_fp)
        self.assertListEqual(observed_lines, expected_lines)

    def test__main(self):
        """Check the Click command's exit code and printed output.
        """
        runner = CliRunner()
        result = runner.invoke(
            _main, ['--repophlan-wscores-fp', self.repophlan_fp])
        self.assertEqual(result.exit_code, 0)
        # Both the statistics report and the completion marker are printed.
        for fragment in (BASIC_STATS, 'Task completed.'):
            self.assertIn(fragment, result.output)


# Report expected for the demo table 'repophlan_microbes_wscores.txt',
# one statistic per line and without a trailing newline.
BASIC_STATS = '\n'.join([
    'Total number of genomes: 9.',
    'Number of RefSeq genomes: 7.',
    'With genome sequences (fna): 9.',
    'With protein sequences (faa): 9.',
    'With protein-coding DNA sequences (ffn): 9.',
    'With RNA-coding DNA sequences (frn): 9.',
])

if __name__ == '__main__':
    main()