Skip to content

Commit

Permalink
splitting favr_rare into two programs, this one for nonfamily filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
bjpop committed Nov 2, 2011
1 parent c84784c commit 98268bd
Showing 1 changed file with 323 additions and 0 deletions.
323 changes: 323 additions & 0 deletions favr_nonfamily_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,323 @@
#!/bin/env python

'''
Variant filter script.
Authors: Bernie Pope, Danny Park, Fabrice Odefrey, Tu Ng.
Reads a list of variants from a CSV file for a sample and compares
them to the sequence reads in BAM files for other samples.
Revision history:
2 May 2011. Initial incomplete version. Can read CSV file and BAMS and
generate a table of which variants from the original sample
are found in the other samples.
30 May 2011. Filter out reads which are considered 'is_del'. These appear
to come from deletions, and don't belong in the pileup for
a given position.
31 May 2011. Change the supported input format from a custom CSV file to
a SIFT (csv) file.
25 Aug 2011. Changed the binning rule to bin 35-length only reads, in
addition to the previous binning rule.
29 Aug 2011. Changed back to not bin 35-length only reads. This is now
done in another tool.
19 Sep 2011. Added classification for family samples.
20 Sep 2011. Allowed the input to be tab separated, with coordinates comma separated.
'''

import os
import pysam
import sys
import csv
import yaml
import getopt

# print a usage message
def usage():
print("""Usage: %s
[-h | --help]
--family=[True|False]
--variants=<variant list as CSV file>
reads1.bam reads2.bam ...""") % sys.argv[0]

longOptionsFlags = ["help", "family=", "variants="]
shortOptionsFlags = "h"

# A place to store command line arguments.
class Options(object):
def __init__(self):
self.family = None
self.variants = None
def check(self):
return self.family != None and self.variants != None

def main():
try:
opts, args = getopt.getopt(sys.argv[1:], shortOptionsFlags, longOptionsFlags)
except getopt.GetoptError, err:
print str(err)
usage()
sys.exit(2)
options = Options()
for o, a in opts:
if o == "--family" and a in ['True','False']:
options.family = eval(a)
elif o == "--variants":
options.variants = a
elif o in ('-h', '--help'):
usage()
sys.exit(0)
if not options.check():
print('Incorrect arguments')
usage()
exit(2)
bamFilenames = args
# Read the rows of the variants CSV file into a list.
with open(options.variants) as variants:
variantList = list(csv.reader(variants, delimiter='\t', quotechar='|'))
# Initialise all the evidence to zero.
# The evidence records the number of samples which contain a particular variant.
evidence = initEvidence(variantList)
# Iterate over sample BAM files.
for bamFile in bamFilenames:
with pysam.Samfile(bamFile, "rb") as bam:
# Count how many samples have each particular variant.
countVariants(evidence, variantList, bam)
# either annotate the variants or filter them, depending on whether
# we are dealing with familial samples or not.
if options.family:
annotate(evidence)
else:
filter(evidence)

def annotate(evidence):
'''Annotate variants which appear in a sample of a family member'''
annotateFilename = makeSafeFilename('annotated_variants')
with open(annotateFilename,'w') as annotateFile:
inFamily = []
notInFamily = []
# sort the variants by coordinate
for key,info in sortByCoord(evidence):
for readCount,depth in info.counts:
if readCount > 0:
inFamily.append(info.inputRow)
else:
notInFamily.append(info.inputRow)
csvWriter = csv.writer(annotateFile, delimiter='\t', quotechar='|')
for row in inFamily:
csvWriter.writerow(row + ['IN RELATIVE'])
for row in notInFamily:
csvWriter.writerow(row + ['NOT IN RELATIVE'])
print('Annotated reads saved into file: %s' % annotateFilename)

def filter(evidence):
'''Decide which variants to keep and which to bin.'''
binFilename = makeSafeFilename('rare_binfile')
keepFilename = makeSafeFilename('rare_keepfile')
logFilename = makeSafeFilename('rare_logfile')
with open (logFilename,'w') as logFile:
with open(binFilename,'w') as binFile:
with open(keepFilename,'wb') as keepFile:
csvWriter = csv.writer(keepFile, delimiter=',', quotechar='|')
# sort the variants by coordinate
for key,info in sortByCoord(evidence):
classification = classify(info.counts)
# record the classification of this variant in the logfile
logFile.write("%s: %s: %s\n" % (key, classification.action, classification.reason))
if classification.action == 'bin':
# bin the variant
binFile.write('%s\n' % key)
for readCount,depth in info.counts:
binFile.write(' <vars/coverage: %d/%d>\n' % (readCount,depth))
elif classification.action == 'keep':
# keep the variant
csvWriter.writerow(info.inputRow)
print('Kept reads saved into file: %s' % keepFilename)
print('Binned reads saved into file: %s' % binFilename)
print('Log record saved into file: %s' % logFilename)

# sort the variants by coordinate.
def sortByCoord(evidence):
return sorted(evidence.items(), cmp=compareCoord)

# compare two chromosome coordinates for ordering.
def compareCoord(coord1, coord2):
chr1,pos1 = coord1[0].split(':')
chr2,pos2 = coord2[0].split(':')
if chr1 == chr2:
return cmp(int(pos1), int(pos2))
else:
return compareChrCode(chr1[3:], chr2[3:])

def compareChrCode(code1, code2):
isNumerical1 = code1.isdigit()
isNumerical2 = code2.isdigit()
if isNumerical1:
if isNumerical2:
return cmp(int(code1), int(code2))
else:
return -1
else:
return cmp(code1, code2)

class EvidenceInfo(object):
def __init__(self, inputRow, counts):
self.inputRow = inputRow
self.counts = counts

def initEvidence(variantList):
'''Initialise the frequency counter for each variant to be zero.'''
evidence = {}
for variant in variantList:
info = parseVariantRow(variant)
if info:
evidence[info.id] = EvidenceInfo(inputRow = info.inputRow, counts = [])
return evidence

def showEvidence(evidence):
'''Print out the frequency counter for each variant.'''
for chrPos,info in evidence.items():
print("%s %s" % (info.inputRow,str(info.counts)))

def countVariants(evidence, variantList, bam):
'''For each variant in the list, check if it is evident in this particular sample BAM.'''
for variant in variantList:
info = parseVariantRow(variant)
# only process valid rows in the variant CSV file
if info:
# get the pileup information for this particular variant coordinates
# the pileup tells us what base was called in each of the reads at
# this particular coordinate
pileupCol = lookupPileup(bam, info.chromosome, info.position)
sameAsVariant = 0
coverage = 0
unknown_size_reads = 0 # number of variants on a read of unknown size
if pileupCol:
pos,coverage,reads = pileupCol
# Count the number of reads that are the same as the variant
for pileupread in reads:
readBase = pileupread.alignment.seq[pileupread.qpos]
# check if the sample base (at the same position) is the same as the variant base
# but skip reads which are marked as "is_del", deletions
if not pileupread.is_del and readBase == info.variantBase:
sameAsVariant += 1
evidence[info.id].counts.append((sameAsVariant,coverage))

class VariantInfo(object):
'''Interesting information about a particular variant.'''
def __init__(self, id, chromosome, position, refBase, variantBase, inputRow):
self.id = id
self.chromosome = chromosome
self.position = position
self.refBase = refBase
self.variantBase = variantBase
self.inputRow = inputRow

def parseVariantRow(row):
'''Extract the interesting information about a variant from a SIFT CSV row.'''
if len(row) >= 1:
coordinates = row[0].split(',')
if len(coordinates) >= 4:
fromTo = parsePolymorphism(coordinates[3])
if fromTo:
chrName = "chr" + coordinates[0]
return VariantInfo(id = "%s:%s" % (chrName,coordinates[1]),
chromosome = chrName,
position = int(coordinates[1]), # XXX should really check it is all digits
refBase = fromTo[0],
variantBase = fromTo[1],
inputRow = row)
return None

def parsePolymorphism(s):
'''Parse the X/Y polymorphism notation, extracting X and Y.'''
fromTo = s.split('/')
if len(fromTo) == 2:
fromBase = fromTo[0]
toBase = fromTo[1]
if validBase(fromBase) and validBase(toBase):
return fromBase,toBase
return None

def validBase(s):
'''Check that a letter is a vaid code for a base.'''
return s in ['G', 'A', 'T', 'C']

def lookupPileup(bam, chr, col):
'''Retrieve the pileup for a particular chromosome:position coordinate.'''
# assume argument is in 1-based numbering.
# samtools gives back a range of pilups that cover the requested coordinates,
# (no idea why) so we have to search for the particular one we want
for pileupcolumn in bam.pileup(chr, col-1, col):
if pileupcolumn.pos == col-1:
# XXX can't return pileupcolumn here because: segfault!
# this appears to be a bug in either pysam or samtools (or both)
return (pileupcolumn.pos , pileupcolumn.n, pileupcolumn.pileups)
return None

def makeSafeFilename(name):
if not os.path.exists(name):
return name
else:
count = 2
while (count < sys.maxint):
newName = name + str(count)
if not os.path.exists(newName):
return newName
count += 1
# a safety condition in case we can't return a safe name
return None

# The classify function decides if a particular variant from the
# variant list should be kept or binned (discarded). It returns
# a classification of the variant which can be used to
# determine if it should be kept or discarded and a reason
# why.
#
# The decision to bin or keep is based on how many times
# we see the same variant in other samples.
#
# If you want to use a different binning criteria, then
# rewrite the classify function. The one provided below is just
# an example.

readCountThreshold = 1 # how many reads of the variant do we need to see in a single sample?
samplesPercent = 30 # what percentage of the total samples need to pass the above threashold?

class Classify(object):
def __init__(self, action, reason):
self.action = action # 'bin' or 'keep'
self.reason = reason # some text explaining why

def classify(variantInfo):
totalSamples = 0 # number of sample files
binableSamples = 0 # number of samples which are considered binable
totalSameAsVariants = 0 # total number of reads which had a base the same as the variant in the same position
# we ignore the depth in this particular example
for readCount,_depth in variantInfo:
totalSamples += 1
if readCount >= readCountThreshold:
binableSamples += 1
totalSameAsVariants += readCount
if totalSamples > 0:
if (binableSamples * 100 / totalSamples) >= samplesPercent:
return Classify('bin', binThresholdMessage % (binableSamples, totalSamples, samplesPercent))
else:
return Classify('keep', keepMessage % (binableSamples, totalSamples, samplesPercent))
else:
return Classify('bin', binZeroSamplesMessage)

binThresholdMessage = '(binableSamples(=%d) * 100 / totalSamples(=%d)) >= samplesPercent(=%d)'
binZeroSamplesMessage = 'there were zero samples to compare with'
keepMessage = '(binableSamples(=%d) * 100 / totalSamples(=%d)) < samplesPercent(=%d)'

if __name__ == '__main__':
main()

0 comments on commit 98268bd

Please sign in to comment.