From 70881cb4ffdd038c9803b586ae42e734f084b614 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 17 Nov 2015 20:53:11 -0500 Subject: [PATCH] add function and argparse wrapper for tabulating pairwise alignment summary numbers add function and argparse wrapper for tabulating pairwise alignment summary numbers, including: 1. how many positions have non-ambiguous (A,C,T,G) alleles on both assemblies (unambig_both) 2. how many positions have the same non-ambiguous allele on both assemblies (same_unambig) 3. how many unambiguous positions have a SNP between the two assemblies (snp_unambig) 4. how many unambiguous positions have an indel between the two assemblies (indel_unambig) 5. how many positions have an ambiguity in input 1 but not in input 2 (ambig_one) 6. how many positions have an ambiguity in input 2 but not in input 1 (ambig_two) The function can optionally print the counts to stdout (--printCounts), and can optionally write them to a TSV file (--outfileName). --- reports.py | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/reports.py b/reports.py index babbcf5bd..1b5beebcd 100755 --- a/reports.py +++ b/reports.py @@ -10,13 +10,20 @@ import glob import os import time +from collections import OrderedDict +import csv + import pysam import Bio.SeqIO +import Bio.AlignIO +from Bio.Alphabet.IUPAC import IUPACUnambiguousDNA + import util.cmd import util.file import util.misc import tools.samtools import assembly +import interhost from util.stats import mean, median log = logging.getLogger(__name__) @@ -152,6 +159,118 @@ def parser_assembly_stats(parser=argparse.ArgumentParser()): def get_refalign_stats(sample): pass +def alignment_summary(inFastaFileOne, inFastaFileTwo, outfileName=None, printCounts=False): + gap = '-' + ambiguous = 'N' + aligner = tools.muscle.MuscleTool()#.install_and_get_path() + + per_chr_fastas = interhost.transposeChromosomeFiles([inFastaFileOne, inFastaFileTwo]) + + results = OrderedDict() + results["same_unambig"] = 0 + results["snp_unambig"] = 0 + results["indel_unambig"] = 0 + results["indel_ambig"] = 0 + results["ambig_one"] = 0 + results["ambig_two"] = 0 + results["ambig_both"] = 0 + results["unambig_both"] = 0 + + for chr_fasta in per_chr_fastas: + same_unambig = 0 + snp_unambig = 0 + indel_unambig = 0 + indel_ambig = 0 + ambig_one = 0 + ambig_two = 0 + ambig_both = 0 + unambig_both = 0 + + alignOutFileName = util.file.mkstempfname('.fasta') + aligner.execute(chr_fasta, alignOutFileName, fmt="clw") + + with open(alignOutFileName, "r") as f: + alignment = Bio.AlignIO.read(f, "clustal") + + for col_idx in range(0, alignment.get_alignment_length()): + col = alignment[:, col_idx] + c1 = col[0] + c2 = col[1] + + if (c1 in ambiguous + and c2 in ambiguous): + ambig_both +=1 + elif c1 in ambiguous: + ambig_one += 1 + elif c2 in ambiguous: + ambig_two += 1 + + if (c1 in IUPACUnambiguousDNA().letters + and c2 in IUPACUnambiguousDNA().letters): + unambig_both += 1 + if c1 == c2: + same_unambig += 1 + else: + snp_unambig += 1 + + if ((c1 == gap and + c2 in IUPACUnambiguousDNA().letters) or + (c2 == gap and + c1 in IUPACUnambiguousDNA().letters)): + indel_unambig += 1 + + if ((c1 == gap and + c2 in ambiguous) or + (c2 == gap and + c1 in ambiguous)): + indel_ambig += 1 + + if printCounts: + print("Counts for this segment/chromosome:") + print("same_unambig ", same_unambig) + print("snp_unambig ", snp_unambig) + print("indel_unambig", indel_unambig) + print("indel_ambig ", indel_ambig) + print("ambig_one ", ambig_one) + print("ambig_two ", ambig_two) + print("ambig_both ", ambig_both) + print("unambig_both ", unambig_both) + + results["same_unambig"] += same_unambig + results["snp_unambig"] += snp_unambig + results["indel_unambig"] += indel_unambig + results["indel_ambig"] += indel_ambig + results["ambig_one"] += ambig_one + results["ambig_two"] += ambig_two + results["ambig_both"] += ambig_both + results["unambig_both"] += unambig_both + + if printCounts: + print("\nCounts for this sample:") + print("same_unambig ", results["same_unambig"]) + print("snp_unambig ", results["snp_unambig"]) + print("indel_unambig", results["indel_unambig"]) + print("indel_ambig ", results["indel_ambig"]) + print("ambig_one ", results["ambig_one"]) + print("ambig_two ", results["ambig_two"]) + print("ambig_both ", results["ambig_both"]) + print("unambig_both ", results["unambig_both"]) + + if outfileName: + with open(outfileName, "wt") as of: + csvout = csv.writer(of, delimiter='\t') + csvout.writerow(list(results.keys())) + csvout.writerow(list(results.values())) + +def parser_alignment_summary(parser=argparse.ArgumentParser()): + parser.add_argument('inFastaFileOne', help='First fasta file for an alignment') + parser.add_argument('inFastaFileTwo', help='First fasta file for an alignment') + parser.add_argument('--outfileName', help='Output file for counts in TSV format') + parser.add_argument('--printCounts', help='', action='store_true') + util.cmd.attach_main(parser, alignment_summary, split_args=True) + return parser +__commands__.append(('alignment_summary', parser_alignment_summary)) + def consolidate_fastqc(inDirs, outFile): '''Consolidate multiple FASTQC reports into one.'''