Skip to content

Commit

Permalink
add function and argparse wrapper for tabulating pairwise alignment s…
Browse files Browse the repository at this point in the history
…ummary numbers

add function and argparse wrapper for tabulating pairwise alignment
summary numbers, including:

 1. how many positions have non-ambiguous (A,C,T,G) alleles on both
assemblies (unambig_both)
 2. how many positions have the same non-ambiguous allele on both
assemblies (same_unambig)
 3. how many unambiguous positions have a SNP between the two
assemblies (snp_unambig)
 4. how many unambiguous positions have an indel between the two
assemblies (indel_unambig)
 5. how many positions have an ambiguity in input 1 but not in input 2
(ambig_one)
 6. how many positions have an ambiguity in input 2 but not in input 1
(ambig_two)

The function can optionally print the counts to stdout (--printCounts),
and can optionally write them to a TSV file (--outfileName).
  • Loading branch information
tomkinsc committed Nov 18, 2015
1 parent db079e3 commit 70881cb
Showing 1 changed file with 119 additions and 0 deletions.
119 changes: 119 additions & 0 deletions reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,20 @@
import glob
import os
import time
from collections import OrderedDict
import csv

import pysam
import Bio.SeqIO
import Bio.AlignIO
from Bio.Alphabet.IUPAC import IUPACUnambiguousDNA

import util.cmd
import util.file
import util.misc
import tools.samtools
import assembly
import interhost
from util.stats import mean, median

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -152,6 +159,118 @@ def parser_assembly_stats(parser=argparse.ArgumentParser()):
def get_refalign_stats(sample):
pass

def alignment_summary(inFastaFileOne, inFastaFileTwo, outfileName=None, printCounts=False):
gap = '-'
ambiguous = 'N'
aligner = tools.muscle.MuscleTool()#.install_and_get_path()

per_chr_fastas = interhost.transposeChromosomeFiles([inFastaFileOne, inFastaFileTwo])

results = OrderedDict()
results["same_unambig"] = 0
results["snp_unambig"] = 0
results["indel_unambig"] = 0
results["indel_ambig"] = 0
results["ambig_one"] = 0
results["ambig_two"] = 0
results["ambig_both"] = 0
results["unambig_both"] = 0

for chr_fasta in per_chr_fastas:
same_unambig = 0
snp_unambig = 0
indel_unambig = 0
indel_ambig = 0
ambig_one = 0
ambig_two = 0
ambig_both = 0
unambig_both = 0

alignOutFileName = util.file.mkstempfname('.fasta')
aligner.execute(chr_fasta, alignOutFileName, fmt="clw")

with open(alignOutFileName, "r") as f:
alignment = Bio.AlignIO.read(f, "clustal")

for col_idx in range(0, alignment.get_alignment_length()):
col = alignment[:, col_idx]
c1 = col[0]
c2 = col[1]

if (c1 in ambiguous
and c2 in ambiguous):
ambig_both +=1
elif c1 in ambiguous:
ambig_one += 1
elif c2 in ambiguous:
ambig_two += 1

if (c1 in IUPACUnambiguousDNA().letters
and c2 in IUPACUnambiguousDNA().letters):
unambig_both += 1
if c1 == c2:
same_unambig += 1
else:
snp_unambig += 1

if ((c1 == gap and
c2 in IUPACUnambiguousDNA().letters) or
(c2 == gap and
c1 in IUPACUnambiguousDNA().letters)):
indel_unambig += 1

if ((c1 == gap and
c2 in ambiguous) or
(c2 == gap and
c1 in ambiguous)):
indel_ambig += 1

if printCounts:
print("Counts for this segment/chromosome:")
print("same_unambig ", same_unambig)
print("snp_unambig ", snp_unambig)
print("indel_unambig", indel_unambig)
print("indel_ambig ", indel_ambig)
print("ambig_one ", ambig_one)
print("ambig_two ", ambig_two)
print("ambig_both ", ambig_both)
print("unambig_both ", unambig_both)

results["same_unambig"] += same_unambig
results["snp_unambig"] += snp_unambig
results["indel_unambig"] += indel_unambig
results["indel_ambig"] += indel_ambig
results["ambig_one"] += ambig_one
results["ambig_two"] += ambig_two
results["ambig_both"] += ambig_both
results["unambig_both"] += unambig_both

if printCounts:
print("\nCounts for this sample:")
print("same_unambig ", results["same_unambig"])
print("snp_unambig ", results["snp_unambig"])
print("indel_unambig", results["indel_unambig"])
print("indel_ambig ", results["indel_ambig"])
print("ambig_one ", results["ambig_one"])
print("ambig_two ", results["ambig_two"])
print("ambig_both ", results["ambig_both"])
print("unambig_both ", results["unambig_both"])

if outfileName:
with open(outfileName, "wt") as of:
csvout = csv.writer(of, delimiter='\t')
csvout.writerow(list(results.keys()))
csvout.writerow(list(results.values()))

def parser_alignment_summary(parser=argparse.ArgumentParser()):
parser.add_argument('inFastaFileOne', help='First fasta file for an alignment')
parser.add_argument('inFastaFileTwo', help='First fasta file for an alignment')
parser.add_argument('--outfileName', help='Output file for counts in TSV format')
parser.add_argument('--printCounts', help='', action='store_true')
util.cmd.attach_main(parser, alignment_summary, split_args=True)
return parser
__commands__.append(('alignment_summary', parser_alignment_summary))


def consolidate_fastqc(inDirs, outFile):
'''Consolidate multiple FASTQC reports into one.'''
Expand Down

0 comments on commit 70881cb

Please sign in to comment.