biocore · serenejiang · May 24, 2017 · May 22, 2017 · May 22, 2017 · May 22, 2017
diff --git a/genomesubsampler/parseRepophlan.py b/genomesubsampler/parseRepophlan.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+# ----------------------------------------------------------------------------
+# Copyright (c) 2017--, genome-subsampler development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+#
+# parser for RepoPhlAn-downloaded genomes
+#
+
+import click
+import pandas as pd
+
+
+def parse_repophlan(repophlan_wscores_fp):
+    """ Summarize basic statistics of a RepoPhlAn genome table.
+
+    Parameters
+    ----------
+    repophlan_wscores_fp: string
+        file path to RepoPhlAn summary table with scores
+
+    Returns
+    -------
+    list of string
+        Human-readable report of basic statistics of genomes
+    """
+    df = pd.read_table(repophlan_wscores_fp, index_col=0, header=0)
+    # prefix match; a missing accession (NaN) counts as non-RefSeq
+    is_refseq = df['assembly_accession'].str.startswith('GCF_', na=False)
+    out = ['Total number of genomes: %s.' % df.shape[0],
+           'Number of RefSeq genomes: %s.' % is_refseq.sum()]
+    # count() skips missing values, i.e. genomes without that file
+    for label, ext in (('genome', 'fna'), ('protein', 'faa'),
+                       ('protein-coding DNA', 'ffn'),
+                       ('RNA-coding DNA', 'frn')):
+        out.append('With %s sequences (%s): %s.'
+                   % (label, ext, df['%s_lname' % ext].count()))
+    return out
+
+
+@click.command()
+@click.option('--repophlan-wscores-fp', required=True,
+              type=click.Path(resolve_path=True, readable=True, exists=True,
+                              file_okay=True, dir_okay=False),  # file only
+              help='RepoPhlAn summary table with scores')
+def _main(repophlan_wscores_fp):
+    """ Parser for RepoPhlAn-downloaded genomes
+    """
+    # NOTE: click shows the docstring above as the --help text
+    click.echo('\n'.join(parse_repophlan(repophlan_wscores_fp)))
+    click.echo('Task completed.')
+
+
+if __name__ == "__main__":
+    _main()
diff --git a/genomesubsampler/tests/data/repophlan_microbes_wscores.txt b/genomesubsampler/tests/data/repophlan_microbes_wscores.txt
@@ -0,0 +1,11 @@
+#genome	all_coding_data	all_data	asm_name	ass_id	assembly_accession	assembly_level	bioproject	biosample	dwlf	faa_lname	ffn_lname	fna_lname	frn_lname	ftp_path	gbrs_paired_asm	genome	genome_rep	infraspecific_name	isolate	organism_name	outdir	paired_asm_comp	refseq_category	release_type	score_faa	score_fna	score_rrna	score_trna	seq_rel_date	species_taxid	submitter	taxid	taxonomy	version_status
+G000007525	Y	Y	ASM752v1	G000007525	GCF_000007525.1	Complete Genome	PRJNA57939	SAMN02603675	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/007/525/GCF_000007525.1_ASM752v1	microbes/faa/G000007525.faa.bz2	microbes/ffn/G000007525.ffn.bz2	microbes/fna/G000007525.fna.bz2	microbes/frn/G000007525.frn.bz2	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/007/525/GCF_000007525.1_ASM752v1	GCA_000007525.1	G000007525	Full	strain=NCC2705		Bifidobacterium longum NCC2705	microbes	identical	reference genome	Major	1	0.996	0.9	1	2005/01/21	216816	Nestle Research Center, Switzerland	206672	k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium|s__Bifidobacterium_longum|t__Bifidobacterium_longum_NCC2705	latest
+G000010305	Y	Y	ASM1030v1	G000010305	GCF_000010305.1	Complete Genome	PRJNA224116	SAMD00060909	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/010/305/GCF_000010305.1_ASM1030v1	microbes/faa/G000010305.faa.bz2	microbes/ffn/G000010305.ffn.bz2	microbes/fna/G000010305.fna.bz2	microbes/frn/G000010305.frn.bz2	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/010/305/GCF_000010305.1_ASM1030v1	GCA_000010305.1	G000010305	Full	strain=T-27		Gemmatimonas aurantiaca T-27	microbes	identical	representative genome	Major	1	1	1	1	2009/04/01	173480	National Institute of Technology and Evaluation	379066	k__Bacteria|p__Gemmatimonadetes|c__Gemmatimonadetes|o__Gemmatimonadales|f__Gemmatimonadaceae|g__Gemmatimonas|s__Gemmatimonas_aurantiaca|t__Gemmatimonas_aurantiaca_T_27	latest
+G000010365	Y	Y	ASM1036v1	G000010365	GCA_000010365.1	Complete Genome	PRJNA17977	SAMD00061085	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/010/365/GCA_000010365.1_ASM1036v1	microbes/faa/G000010365.faa.bz2	microbes/ffn/G000010365.ffn.bz2	microbes/fna/G000010365.fna.bz2	microbes/frn/G000010365.frn.bz2	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/010/365/GCA_000010365.1_ASM1036v1	GCF_000010365.1	G000010365	Full	strain=PV		Candidatus Carsonella ruddii PV	microbes	identical	na	Major	0.72	1	0.9	1	2006/10/16	114186	Kitasato Institute for Life Sciences	387662	k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Oceanospirillales|f__Halomonadaceae|g__Candidatus_Carsonella|s__Candidatus_Carsonella_ruddii|t__Candidatus_Carsonella_ruddii_PV	latest
+G000018865	Y	Y	ASM1886v1	G000018865	GCF_000018865.1	Complete Genome	PRJNA57657	SAMN02598539	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/018/865/GCF_000018865.1_ASM1886v1	microbes/faa/G000018865.faa.bz2	microbes/ffn/G000018865.ffn.bz2	microbes/fna/G000018865.fna.bz2	microbes/frn/G000018865.frn.bz2	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/018/865/GCF_000018865.1_ASM1886v1	GCA_000018865.1	G000018865	Full	strain=J-10-fl		Chloroflexus aurantiacus J-10-fl	microbes	identical	reference genome	Major	1	1	1	1	2007/12/20	1108	US DOE Joint Genome Institute	324602	k__Bacteria|p__Chloroflexi|c__Chloroflexia|o__Chloroflexales|f__Chloroflexaceae|g__Chloroflexus|s__Chloroflexus_aurantiacus|t__Chloroflexus_aurantiacus_J_10_fl	latest
+G000441575	Y	Y	ASM44157v1	G000441575	GCA_000441575.1	Complete Genome	PRJNA39871	SAMN02603759	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/441/575/GCA_000441575.1_ASM44157v1	microbes/faa/G000441575.faa.bz2	microbes/ffn/G000441575.ffn.bz2	microbes/fna/G000441575.fna.bz2	microbes/frn/G000441575.frn.bz2	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/441/575/GCA_000441575.1_ASM44157v1	GCF_000441575.1	G000441575	Full	strain=DC		Candidatus Carsonella ruddii DC	microbes	identical	na	Major	0.73	1	0.9	1	2013/07/26	114186	RIKEN	667013	k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Oceanospirillales|f__Halomonadaceae|g__Candidatus_Carsonella|s__Candidatus_Carsonella_ruddii|t__Candidatus_Carsonella_ruddii_DC	latest
+G000025185	Y	Y	ASM2518v1	G000025185	GCF_000025185.1	Complete Genome	PRJNA224116	SAMN00120219	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/025/185/GCF_000025185.1_ASM2518v1	microbes/faa/G000025185.faa.bz2	microbes/ffn/G000025185.ffn.bz2	microbes/fna/G000025185.fna.bz2	microbes/frn/G000025185.frn.bz2	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/025/185/GCF_000025185.1_ASM2518v1	GCA_000025185.1	G000025185	Full	strain=DSM 6068		Pirellula staleyi DSM 6068	microbes	identical	representative genome	Major	1	1	0.9	1	2010/01/07	125	US DOE Joint Genome Institute (JGI-PGF)	530564	k__Bacteria|p__Planctomycetes|c__Planctomycetia|o__Planctomycetales|f__Planctomycetaceae|g__Pirellula|s__Pirellula_staleyi|t__Pirellula_staleyi_DSM_6068	latest
+G000158275	Y	Y	ASM15827v2	G000158275	GCF_000158275.2	Complete Genome	PRJNA224116	SAMN02463711	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/158/275/GCF_000158275.2_ASM15827v2	microbes/faa/G000158275.faa.bz2	microbes/ffn/G000158275.ffn.bz2	microbes/fna/G000158275.fna.bz2	microbes/frn/G000158275.frn.bz2	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/158/275/GCF_000158275.2_ASM15827v2	GCA_000158275.2	G000158275	Full	strain=7_1		Fusobacterium nucleatum subsp. animalis 7_1	microbes	identical	representative genome	Major	1	0.996	1	1	2014/02/03	851	Broad Institute	457405	k__Bacteria|p__Fusobacteria|c__Fusobacteriia|o__Fusobacteriales|f__Fusobacteriaceae|g__Fusobacterium|s__Fusobacterium_nucleatum|t__Fusobacterium_nucleatum_subsp_animalis_7_1	latest
+G000011545	Y	Y	ASM1154v1	G000011545	GCF_000011545.1	Complete Genome	PRJNA57733	SAMEA1705938	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/011/545/GCF_000011545.1_ASM1154v1	microbes/faa/G000011545.faa.bz2	microbes/ffn/G000011545.ffn.bz2	microbes/fna/G000011545.fna.bz2	microbes/frn/G000011545.frn.bz2	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/011/545/GCF_000011545.1_ASM1154v1	GCA_000011545.1	G000011545	Full	strain=K96243		Burkholderia pseudomallei K96243	microbes	identical	reference genome	Major	1	0.999	0.9	1	2004/09/16	28450	Sanger Institute	272560	k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Burkholderiaceae|g__Burkholderia|s__Burkholderia_pseudomallei|t__Burkholderia_pseudomallei_K96243	latest
+G000011705	Y	Y	ASM1170v1	G000011705	GCF_000011705.1	Complete Genome	PRJNA57725	SAMN02603987	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/011/705/GCF_000011705.1_ASM1170v1	microbes/faa/G000011705.faa.bz2	microbes/ffn/G000011705.ffn.bz2	microbes/fna/G000011705.fna.bz2	microbes/frn/G000011705.frn.bz2	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/011/705/GCF_000011705.1_ASM1170v1	GCA_000011705.1	G000011705	Full	strain=ATCC 23344		Burkholderia mallei ATCC 23344	microbes	identical	reference genome	Major	0.96	0.998	0.9	1	2005/06/06	13373	TIGR	243160	k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Burkholderiaceae|g__Burkholderia|s__Burkholderia_mallei|t__Burkholderia_mallei_ATCC_23344	latest
+
diff --git a/genomesubsampler/tests/test_parseRepophlan.py b/genomesubsampler/tests/test_parseRepophlan.py
@@ -0,0 +1,58 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2017--, genome-subsampler development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from unittest import TestCase, main
+from click.testing import CliRunner
+from shutil import rmtree
+from tempfile import mkdtemp
+from skbio.util import get_data_path
+
+from genomesubsampler.parseRepophlan import (parse_repophlan,
+                                             _main)
+
+
+class ParseRepophlanTests(TestCase):
+    """ Unit tests covering parseRepophlan.py """
+
+    def setUp(self):
+        """ Locate the sample table and make a scratch directory
+        """
+        self.repophlan_fp = get_data_path('repophlan_microbes_wscores.txt')
+        self.wkdir = mkdtemp()
+
+    def tearDown(self):
+        """ Remove the scratch directory made in setUp
+        """
+        rmtree(self.wkdir)
+
+    def test_parse_repophlan(self):
+        """ parse_repophlan yields the expected report lines
+        """
+        expected = str_basic_stats.split('\n')
+        observed = parse_repophlan(self.repophlan_fp)
+        self.assertListEqual(observed, expected)
+
+    def test__main(self):
+        """ The Click command exits with 0 and prints the full report
+        """
+        argv = ['--repophlan-wscores-fp', self.repophlan_fp]
+        result = CliRunner().invoke(_main, argv)
+        self.assertEqual(result.exit_code, 0)
+        for expected in (str_basic_stats, 'Task completed.'):
+            self.assertIn(expected, result.output)
+
+
+# expected report for the repophlan_microbes_wscores.txt test table
+str_basic_stats = '\n'.join((
+    'Total number of genomes: 9.', 'Number of RefSeq genomes: 7.',
+    'With genome sequences (fna): 9.', 'With protein sequences (faa): 9.',
+    'With protein-coding DNA sequences (ffn): 9.',
+    'With RNA-coding DNA sequences (frn): 9.'))
+
+if __name__ == '__main__':
+    main()