Allow multiple rdmcl runs to be combined in group_by_cluster

This required the `mode` positional to be converted to a flagged option
biologyguy · Mar 2, 2018 · 14c8111 · 14c8111
1 parent 2e1192f
commit 14c8111
Show file tree

Hide file tree

Showing 2 changed files with 96 additions and 83 deletions.
diff --git a/rdmcl/group_by_cluster.py b/rdmcl/group_by_cluster.py
@@ -74,12 +74,12 @@ def fmt(prog):
     # Positional
     positional = parser.add_argument_group(title="\033[1mPositional argument\033[m")
 
-    positional.add_argument("rdmcl_dir", action="store", help="Path to RD-MCL output directory")
-    positional.add_argument("mode", action="store", nargs="?", default="list",
-                            help="Choose the output type [list, seqs, sequences, aln, alignment, con, consensus]")
+    positional.add_argument("rdmcl_dir", action="append", nargs="+", help="Path(s) to RD-MCL output directory(ies)")
 
     # Optional commands
     parser_flags = parser.add_argument_group(title="\033[1mAvailable commands\033[m")
+    positional.add_argument("--mode", "-m", action="store", default="list",
+                            help="Choose the output type [list, seqs, sequences, aln, alignment, con, consensus]")
     parser_flags.add_argument("--sequence_file", "-s", action="store", help="Path to sequence file")
     parser_flags.add_argument("--aligner", "-a", action="store", default="clustalo", metavar="",
                               help="Specify a multiple sequence alignment program")
@@ -121,30 +121,42 @@ def main():
         Sb.br._stderr('Unrecognized mode, please select from ["seqs", "aln", "con", "list"].\n')
         sys.exit()
 
-    rdmcl_dir = os.path.abspath(in_args.rdmcl_dir)
+    rdmcl_dirs = {}
+    sequences = []
 
-    if not os.path.isdir(rdmcl_dir):
-        sys.stderr.write("Error: The provided RD-MCL output directory does not exist.\n")
-        sys.exit()
+    for rdmcl_dir in in_args.rdmcl_dir[0]:
+        rdmcl_dir = os.path.abspath(rdmcl_dir)
 
-    if not os.path.isfile(join(rdmcl_dir, "final_clusters.txt")):
-        sys.stderr.write("Error: The provided RD-MCL output directory does not "
-                         "contain the necessary file 'final_clusters.txt'.\n")
-        sys.exit()
+        if not os.path.isdir(rdmcl_dir):
+            sys.stderr.write("Error: The provided RD-MCL output directory does not exist.\n")
+            sys.exit()
 
-    cluster_file = hlp.prepare_clusters(join(rdmcl_dir, "final_clusters.txt"), hierarchy=True)
-    sequence_file = join(rdmcl_dir, "input_seqs.fa") if not in_args.sequence_file else in_args.sequence_file
-    if not os.path.isfile(sequence_file):
-        sys.stderr.write("Error: Unable to find sequence file.\n")
-        sys.exit()
+        if not os.path.isfile(join(rdmcl_dir, "final_clusters.txt")):
+            sys.stderr.write("Error: The provided RD-MCL output directory does not "
+                             "contain the necessary file 'final_clusters.txt'.\n")
+            sys.exit()
+
+        rdmcl_dirs[rdmcl_dir] = hlp.prepare_clusters(join(rdmcl_dir, "final_clusters.txt"), hierarchy=True)
+
+        if not in_args.sequence_file:
+            sequences.append(Sb.SeqBuddy(join(rdmcl_dir, "input_seqs.fa")))
+
+    if in_args.sequence_file:
+        if not os.path.isfile(in_args.sequence_file):
+            sys.stderr.write("Error: Unable to find sequence file at %s\n" % in_args.sequence_file)
+            sys.exit()
+        else:
+            seqbuddy = Sb.SeqBuddy(in_args.sequence_file)
+    else:
+        seqbuddy = Sb.SeqBuddy([rec for sb in sequences for rec in sb.records], out_format=sequences[0].out_format)
 
-    seqbuddy = Sb.SeqBuddy(sequence_file)
+    all_clusters = [clust for rdmcl_dir, clusts in rdmcl_dirs.items() for clust in clusts]
     output = OrderedDict()
 
     if in_args.groups:
         groups = []
         for group in in_args.groups[0]:
-            if group not in cluster_file:
+            if group not in all_clusters:
                 sys.stderr.write("%sWARNING%s: '%s' not present in clusters.\n" % (hlp.RED, hlp.END, group))
             else:
                 groups.append(group)
@@ -162,64 +174,65 @@ def main():
             paralogs = re.search("# .*?(?:_0)\n(.*)\n", paralogs)
             paralogs = {} if not paralogs else json.loads(paralogs.group(1))
 
-    for rank, node in cluster_file.items():
-        rank = rank.split()[0]
-        if in_args.groups:
-            if not re.search(in_args.groups, rank):
-                continue
-
-        if in_args.min_size:
-            if len(node) < in_args.min_size:
-                continue
-
-        if in_args.max_size:
-            if len(node) > in_args.max_size:
-                continue
-
-        if paralogs:
-            for rec_id, paralog_list in paralogs.items():
-                if rec_id in node:
-                    for p in paralog_list:
-                        del node[node.index(p)]
-
-        if in_args.strip_taxa:
-            node = [re.sub("^.*?\-", "", x) for x in node]
-
-        ids = "^%s$" % "$|^".join(node)
-        subset = Sb.pull_recs(Sb.make_copy(seqbuddy), ids)
-        subset = Sb.order_ids(subset)
-
-        if in_args.include_count:
-            rank += "(%s)" % len(subset)
-        rank_output = ""
-        if mode == "list":
-            rank_output += rank
-            for rec in subset.records:
-                rec.description = re.sub("^%s" % rec.id, "", rec.description)
-                rank_output += "\n%s %s" % (rec.id, rec.description)
-            rank_output += "\n"
-
-        elif mode == "seqs":
-            for rec in subset.records:
-                rec.description = "%s %s" % (rank, rec.description)
-            rank_output += str(subset)
-
-        elif mode in ["aln", "con"]:
-            try:
-                rank_output = make_msa(subset, in_args.aligner, in_args.trimal)
-            except (SystemError, AttributeError) as err:
-                print(err)
-                sys.exit()
-            rank_output.out_format = "phylip-relaxed"
-
-        if mode == "con":
-            rec = Alb.consensus_sequence(rank_output).records()[0]
-            rec.id = rank
-            rec.name = rank
-            rec.description = ""
-            rank_output.out_format = "fasta"
-
-        output[rank] = str(rank_output)
+    for rdmcl_dir, cluster_file in rdmcl_dirs.items():
+        for rank, node in cluster_file.items():
+            rank = rank.split()[0]
+            if in_args.groups:
+                if not re.search(in_args.groups, rank):
+                    continue
+
+            if in_args.min_size:
+                if len(node) < in_args.min_size:
+                    continue
+
+            if in_args.max_size:
+                if len(node) > in_args.max_size:
+                    continue
+
+            if paralogs:
+                for rec_id, paralog_list in paralogs.items():
+                    if rec_id in node:
+                        for p in paralog_list:
+                            del node[node.index(p)]
+
+            if in_args.strip_taxa:
+                node = [re.sub("^.*?-", "", x) for x in node]
+
+            ids = "^%s$" % "$|^".join(node)
+            subset = Sb.pull_recs(Sb.make_copy(seqbuddy), ids)
+            subset = Sb.order_ids(subset)
+
+            if in_args.include_count:
+                rank += "(%s)" % len(subset)
+            rank_output = ""
+            if mode == "list":
+                rank_output += rank
+                for rec in subset.records:
+                    rec.description = re.sub("^%s" % rec.id, "", rec.description)
+                    rank_output += "\n%s %s" % (rec.id, rec.description)
+                rank_output += "\n"
+
+            elif mode == "seqs":
+                for rec in subset.records:
+                    rec.description = "%s %s" % (rank, rec.description)
+                rank_output += str(subset)
+
+            elif mode in ["aln", "con"]:
+                try:
+                    rank_output = make_msa(subset, in_args.aligner, in_args.trimal)
+                except (SystemError, AttributeError) as err:
+                    print(err)
+                    sys.exit()
+                rank_output.out_format = "phylip-relaxed"
+
+            if mode == "con":
+                rec = Alb.consensus_sequence(rank_output).records()[0]
+                rec.id = rank
+                rec.name = rank
+                rec.description = ""
+                rank_output.out_format = "fasta"
+
+            output[rank] = str(rank_output)
 
     if not in_args.write:
         print("\n".join(data for rank, data in output.items()).strip())

diff --git a/rdmcl/tests/test_group_by_cluster.py b/rdmcl/tests/test_group_by_cluster.py
@@ -111,7 +111,7 @@ def test_argparse_init(monkeypatch, hf):
 
 
 def test_main_list(monkeypatch, hf, capsys):
-    argv = ['rdmcl.py', hf.resource_path, "list"]
+    argv = ['rdmcl.py', hf.resource_path, "-m", "list"]
     monkeypatch.setattr(sys, "argv", argv)
     group_by_cluster.main()
     out, err = capsys.readouterr()
@@ -128,23 +128,23 @@ def test_sequence_file(monkeypatch, hf, capsys):
 
 
 def test_main_seqs(monkeypatch, hf, capsys):
-    argv = ['rdmcl.py', hf.resource_path, "seqs"]
+    argv = ['rdmcl.py', hf.resource_path, "-m", "seqs"]
     monkeypatch.setattr(sys, "argv", argv)
     group_by_cluster.main()
     out, err = capsys.readouterr()
     assert hf.string2hash(out) == "732ff95ca03b8d56f94318faaf746b05", print(out)
 
 
 def test_main_aln(monkeypatch, hf, capsys):
-    argv = ['rdmcl.py', hf.resource_path, "aln"]
+    argv = ['rdmcl.py', hf.resource_path, "-m", "aln"]
     monkeypatch.setattr(sys, "argv", argv)
     group_by_cluster.main()
     out, err = capsys.readouterr()
     assert hf.string2hash(out) == "b4b7dae5b4f81db7e2ea26417f85a054", print(out)
 
 
 def test_main_cons(monkeypatch, hf, capsys):
-    argv = ['rdmcl.py', hf.resource_path, "cons"]
+    argv = ['rdmcl.py', hf.resource_path, "-m", "cons"]
     monkeypatch.setattr(sys, "argv", argv)
     group_by_cluster.main()
     out, err = capsys.readouterr()
@@ -197,7 +197,7 @@ def test_main_include_counts(monkeypatch, hf, capsys):
     out, err = capsys.readouterr()
     assert hf.string2hash(out) == "c22972e858d902a12e662e0899a3395a", print(out)
 
-    argv = ['rdmcl.py', hf.resource_path, "cons", "-ic"]
+    argv = ['rdmcl.py', hf.resource_path, "-m", "cons", "-ic"]
     monkeypatch.setattr(sys, "argv", argv)
     group_by_cluster.main()
     out, err = capsys.readouterr()
@@ -218,14 +218,14 @@ def test_main_write(monkeypatch, hf, capsys):
 
 
 def test_main_errors(monkeypatch, hf, capsys):
-    argv = ['rdmcl.py', hf.resource_path, "FOOBAR"]
+    argv = ['rdmcl.py', hf.resource_path, "-m", "FOOBAR"]
     monkeypatch.setattr(sys, "argv", argv)
     with pytest.raises(SystemExit):
         group_by_cluster.main()
     out, err = capsys.readouterr()
     assert 'Unrecognized mode, please select from ["seqs", "aln", "con", "list"].' in err
 
-    argv = ['rdmcl.py', hf.resource_path, "aln"]
+    argv = ['rdmcl.py', hf.resource_path, "-m", "aln"]
     monkeypatch.setattr(sys, "argv", argv)
 
     def kill1(*_):