Fixing up the indels script.
benedictpaten committed Aug 1, 2014
1 parent 2fa9c1c commit faa165f
Showing 3 changed files with 68 additions and 14 deletions.
2 changes: 1 addition & 1 deletion LICENSE.txt
@@ -1,4 +1,4 @@
Copyright (C) 2014 by Benedict Paten (benedictpaten@gmail.com), Miten Jain, Karen Miga
Copyright (C) 2014 by Benedict Paten (benedictpaten@gmail.com), Miten Jain, Ian Fiddes, Karen Miga

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
78 changes: 66 additions & 12 deletions nanopore/analyses/indels.py
@@ -7,16 +7,18 @@
from jobTree.src.bioio import reverseComplement, prettyXml, system

class IndelCounter():
def __init__(self, readSeqName, refSeqName):
def __init__(self, refSeqName, refSeq, readSeqName, readSeq, alignedRead):
self.readInsertionLengths = []
self.readDeletionLengths = []
self.blockLengths = []
self.readSeqName = readSeqName
self.readSeq = readSeq
self.refSeqName = refSeqName
self.refSeq = refSeq

def addReadAlignment(self, alignedRead, refSeq, readSeq):
#Now add the aligned read
blockLength = 0
for aP in AlignedPair.iterator(alignedRead, refSeq, readSeq):
for aP in AlignedPair.iterator(alignedRead, self.refSeq, self.readSeq):
if aP.getPrecedingReadInsertionLength() > 0:
self.readInsertionLengths.append(aP.getPrecedingReadInsertionLength())
if aP.getPrecedingReadDeletionLength() > 0:
@@ -30,16 +32,55 @@ def addReadAlignment(self, alignedRead, refSeq, readSeq):

def getXML(self):
return ET.Element("indels", { "refSeqName":self.refSeqName,
"refSeqLength":str(len(self.refSeq)),
"readSeqName":self.readSeqName,
"totalReadInsertions":str(len(self.readInsertionLengths)),
"totalReadDeletions":str(len(self.readDeletionLengths)),
"readSeqLength":str(len(self.readSeq)),
"numberReadInsertions":str(len(self.readInsertionLengths)),
"numberReadDeletions":str(len(self.readDeletionLengths)),
"avgReadInsertionLength":str(numpy.average(self.readInsertionLengths)),
"avgReadDeletionLength":str(numpy.average(self.readDeletionLengths)),
"medianReadInsertionLength":str(numpy.median(self.readInsertionLengths)),
"medianReadDeletionLength":str(numpy.median(self.readDeletionLengths)),
"readInsertionLengths":" ".join([ str(i) for i in self.readInsertionLengths ]),
"readDeletionLengths":" ".join([ str(i) for i in self.readDeletionLengths ]) })

def getAggregateIndelStats(indelCounters):
"""Calculates aggregate stats across a set of read alignments.
"""

readInsertionLengths = reduce(lambda x, y : x + y, map(lambda ic : ic.readInsertionLengths, indelCounters))
readDeletionLengths = reduce(lambda x, y : x + y, map(lambda ic : ic.readDeletionLengths, indelCounters))

attribs = { "numberOfReadAlignments":str(len(indelCounters)),
"readInsertionLengths":" ".join(map(str, readInsertionLengths)),
"readDeletionLengths":" ".join(map(str, readDeletionLengths)) }

readSequenceLengths = map(lambda ic : len(ic.readSeq), indelCounters)

numberReadInsertions = map(lambda ic : len(ic.readInsertionLengths), indelCounters)
numberReadDeletions = map(lambda ic : len(ic.readDeletionLengths), indelCounters)

medianReadInsertionLengths = map(lambda ic : numpy.median(ic.readInsertionLengths), indelCounters)
medianReadDeletionLengths = map(lambda ic : numpy.median(ic.readDeletionLengths), indelCounters)

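#Summarises a distribution as (min, average, median, max, space-separated values)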
def stats(distribution):
distribution = distribution[:]
distribution.sort()
return distribution[0], numpy.average(distribution), numpy.median(distribution), distribution[-1], " ".join(map(str, distribution))

for name, distribution in [("ReadSequenceLengths", readSequenceLengths),
("NumberReadInsertions", numberReadInsertions),
("NumberReadDeletions", numberReadDeletions),
("MedianReadInsertionLengths", medianReadInsertionLengths),
("MedianReadDeletionLengths", medianReadDeletionLengths) ]:
for attribName, value in zip([ "min" + name, "avg" + name, "median" + name, "max" + name, "distribution" + name ], list(stats(distribution))):
attribs[attribName] = str(value)

parentNode = ET.Element("indels", attribs)
for indelCounter in indelCounters:
parentNode.append(indelCounter.getXML())
return parentNode

class Indels(AbstractAnalysis):
"""Calculates stats on indels.
"""
@@ -48,14 +89,25 @@ def run(self):
refSequences = getFastaDictionary(self.referenceFastaFile) #Hash of names to sequences
readSequences = getFastqDictionary(self.readFastqFile) #Hash of names to sequences
sam = pysam.Samfile(self.samFile, "r" )
overallIndelCounter = IndelCounter("overall", "overall")
for aR in samIterator(sam): #Iterate on the sam lines
refSeq = refSequences[sam.getrname(aR.rname)]
readSeq = readSequences[aR.qname]
overallIndelCounter.addReadAlignment(aR, refSeq, readSeq)
indelCounters = map(lambda aR : IndelCounter(sam.getrname(aR.rname), refSequences[sam.getrname(aR.rname)], aR.qname, readSequences[aR.qname], aR), samIterator(sam)) #Iterate on the sam lines
sam.close()
#Write out the indel info
open(os.path.join(self.outputDir, "indels.xml"), "w").write(prettyXml(overallIndelCounter.getXML()))
indelXML = getAggregateIndelStats(indelCounters)
open(os.path.join(self.outputDir, "indels.xml"), "w").write(prettyXml(indelXML))

#Plots:

##Read insertion lengths plot: x-axis: insertion length, y-axis: frequency (see map(int, indelXML.attrib["readInsertionLengths"].split()))

##Read deletion lengths plot: x-axis: deletion length, y-axis: frequency (see map(int, indelXML.attrib["readDeletionLengths"].split()))

##Number of read insertions vs. read length (map(int, indelXML.attrib["distributionReadSequenceLengths"].split()) vs. map(int, indelXML.attrib["distributionNumberReadInsertions"].split()))

##Number of read deletions vs. read length

##Distribution of median insertion lengths (map(float, indelXML.attrib["distributionMedianReadInsertionLengths"].split()))

"""
stats = dict(ET.parse(os.path.join(self.outputDir, "indels.xml")).findall(".")[0].items())
outf = open(os.path.join(self.outputDir, "stats.tsv"), "w")
for key in stats:
@@ -67,4 +119,6 @@
outf.write("{}\t{}\n".format(key, str(stats[key])))
outf.close()
system("Rscript nanopore/analyses/indel_plot.R {} {} {} {}".format(os.path.join(self.outputDir, "stats.tsv"), os.path.join(self.getLocalTempDir(), "r_insert.txt"), os.path.join(self.getLocalTempDir(), "r_delete.txt"), os.path.join(self.outputDir, "indel_hist.pdf")))
self.finish()
"""

self.finish() #Indicates the batch is done
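The plot TODOs in run() above only name the XML attributes they would read. Below is a minimal sketch of how the first few plots could be generated from the aggregate indels.xml, assuming matplotlib is available and using an illustrative output path; neither the dependency nor the path is part of this commit, and the attribute names follow the TODO comments.

import xml.etree.ElementTree as ET
import matplotlib
matplotlib.use("Agg") #Render to files, no display needed
import matplotlib.pyplot as plt

indelXML = ET.parse("output/indels.xml").getroot() #Illustrative path to the file written by Indels.run()

#Read insertion/deletion length histograms (x-axis: length, y-axis: frequency)
insertionLengths = [ int(i) for i in indelXML.attrib["readInsertionLengths"].split() ]
deletionLengths = [ int(i) for i in indelXML.attrib["readDeletionLengths"].split() ]
plt.hist(insertionLengths, bins=50)
plt.xlabel("insertion length")
plt.ylabel("frequency")
plt.savefig("readInsertionLengths.pdf")
plt.close()
plt.hist(deletionLengths, bins=50)
plt.xlabel("deletion length")
plt.ylabel("frequency")
plt.savefig("readDeletionLengths.pdf")
plt.close()

#Number of read insertions vs. read length (per-alignment distributions)
readLengths = [ int(i) for i in indelXML.attrib["distributionReadSequenceLengths"].split() ]
numberInsertions = [ int(i) for i in indelXML.attrib["distributionNumberReadInsertions"].split() ]
plt.scatter(readLengths, numberInsertions)
plt.xlabel("read length")
plt.ylabel("number of read insertions")
plt.savefig("readInsertionsVsReadLength.pdf")
plt.close()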
2 changes: 1 addition & 1 deletion nanopore/pipeline.py
@@ -28,7 +28,7 @@

from nanopore.metaAnalyses.coverageSummary import CoverageSummary

mappers = [ Lastz, LastzChain, LastzRealign, Bwa, BwaChain, BwaRealign, BwaParams, BwaParamsChain, BwaParamsRealign, Last, LastChain, LastRealign, LastParams, LastParamsChain, LastParamsRealign ] #, Blasr, BlasrChain, BlasrRealign, BlasrParams, BlasrParamsChain, BlasrParamsRealign ] #LastChain, LastzChain, BwaChain ] #, #Lastz, Bwa, Last ] #Blasr ] #Blasr not yet working
mappers = [ Lastz, LastzChain, LastzRealign ] #, Bwa, BwaChain, BwaRealign, BwaParams, BwaParamsChain, BwaParamsRealign, Last, LastChain, LastRealign, LastParams, LastParamsChain, LastParamsRealign ] #, Blasr, BlasrChain, BlasrRealign, BlasrParams, BlasrParamsChain, BlasrParamsRealign ]
analyses = [ LocalCoverage, GlobalCoverage, Substitutions, Indels, AlignmentUncertainty, KmerAnalysis, FastQC, QualiMap, Consensus ]
metaAnalyses = [ CoverageSummary ]

