prepro

commit d97bdc08a8ea0f21146df23a40c4746b84ab762c 1 parent f4789e9
@brantp authored
LSF.py (31 changed lines)
@@ -177,7 +177,7 @@ def lsf_jobs_submit(cmds,outfile,queue='short_serial',bsub_flags='',jobname_base
except OSError:
pass
- open(sh_name,'w').write('#!/usr/bin/env sh\n'+execstr.replace('\\"','"'))
+ open(sh_name,'w').write('#!/usr/bin/env sh\nset -e\n'+execstr.replace('\\"','"'))
os.system('chmod +x '+sh_name)
execstr = sh_name
@@ -283,6 +283,31 @@ def lsf_wait_for_jobs(jobids,restart_outfile=None,restart_queue='normal_serial',
sys.stderr.flush()
print >> sys.stderr, '\ncompleted iteration in',str(datetime.timedelta(seconds=int(time.time() - t)))
+def lsf_run_until_done(to_run_dict,logfile,queue,bsub_flags,jobname_base,num_batches,MAX_RETRY):
+ from run_safe import unfinished_cmds
+ cmds = unfinished_cmds(to_run_dict)
+
+ retries = 0
+ last_cmds = []
+ while len(cmds) > 0:
+ print >> sys.stderr, '%s: %s cmds to run in %s batches on queue %s, logs in %s' % (jobname_base,len(cmds),num_batches,queue,logfile)
+ #code to halt execution on recurrent errors
+ if set(last_cmds) == set(cmds):
+ if retries > MAX_RETRY:
+ errstr = 'maximum number of retry attempts (%s) exceeded with identical jobs lists. Check logs (%s) for recurrent errors' % (MAX_RETRY,logfile)
+ raise IOError, errstr
+ else:
+ retries += 1
+ last_cmds = cmds
+
+ jobids,namedict = lsf_jobs_submit(cmds,logfile,queue,bsub_flags,jobname_base=jobname_base,num_batches=num_batches)
+ time.sleep(20)
+ lsf_wait_for_jobs(jobids,logfile,namedict=namedict)
+
+ cmds = unfinished_cmds(to_run_dict)
+ print >> sys.stderr, 'DONE\n'
+
+
def lsf_kill_jobs(jobids):
if isinstance(jobids,dict):
kills = jobids.keys()
@@ -322,8 +347,8 @@ def lsf_restart_jobs(jobids,keep_prereqs=True,return_kills=False):
restartnames.update(ndict)
except KeyError:
kills_skip.append(j)
-
- print >> sys.stderr, 'no restart data available for %s; not killed' % kills_skip
+ if kills_skip:
+ print >> sys.stderr, 'no restart data available for %s; not killed' % kills_skip
lsf_kill_jobs(kills_finish)
if return_kills:
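A note on the new lsf_run_until_done: it wraps submit / wait / resubmit into one call and stops with an IOError once the same unfinished-command list has come back MAX_RETRY times in a row. A minimal usage sketch, assuming an LSF cluster is available; the worker command, output bases, log path, and job name below are made up, while the queue name normal_serial is the repo's existing default:

    import LSF, run_safe

    # one entry per job: key is the .done-flag base, value is the run_safe wrapper command
    to_run_dict = {}
    for i in range(4):
        outbase = 'job%02d' % i                      # hypothetical output/flag base
        cmd = 'my_worker.py --part %d' % i           # hypothetical command line
        to_run_dict[outbase] = run_safe.safe_script(cmd, outbase)

    # 2 batches on queue normal_serial, no extra bsub flags; give up after 3 identical retry rounds
    LSF.lsf_run_until_done(to_run_dict, 'job-log', 'normal_serial', '', 'myjobs', 2, 3)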
mcl_id_triples_by_blat.py (22 changed lines)
@@ -7,12 +7,14 @@
'''
-import os, sys, re
+import os, sys, re, gzip
from config import SCRATCH as scratch
+fail_to_local = False #must set to "True" to allow use of working directory for blat output
+ #note that this can result in large .blat files on shared volumes!
keep_blat = False
pct_id_cut = 0.8
-min_space_on_scratch = 100000 #in mb
+min_space_on_scratch = 100000 #in mb; fail if less than this amount of free space
def splitpath_rec(path, maxdepth=20):
( head, tail ) = os.path.split(path)
@@ -43,10 +45,18 @@ def space_free_on_volume(vol,unit='M',verbose=False):
scratch_space = space_free_on_volume(splitpath(scratch)[0],verbose=True)
if keep_blat:
- outf = outbase+'.blat'
+ if fail_to_local:
+ outf = outbase+'.blat'
+ else:
+ print >> sys.stderr, 'working directory failover not allowed'
+ raise OSError, 'no space for blat output'
elif scratch_space < min_space_on_scratch:
print >> sys.stderr, 'available space on scratch (%s MB) is less than specified minimum (%s MB); use working directory' % ( scratch_space,min_space_on_scratch)
- outf = outbase+'.blat'
+ if fail_to_local:
+ outf = outbase+'.blat'
+ else:
+ print >> sys.stderr, 'working directory failover not allowed'
+ raise OSError, 'no space for blat output'
else:
try:
os.makedirs(scratch)
@@ -59,7 +69,7 @@ def space_free_on_volume(vol,unit='M',verbose=False):
else:
os.unlink(outf)
- matf = outbase+'.label'
+ matf = outbase+'.label.gz'
if os.path.exists(matf):
print >> sys.stderr, 'output %s already present' % matf
@@ -72,7 +82,7 @@ def space_free_on_volume(vol,unit='M',verbose=False):
raise OSError, 'blat failed; exit'
print >> sys.stderr, 'process blat output %s (%s bytes)' % (outf,os.path.getsize(outf))
- matfh = open(matf,'w')
+ matfh = gzip.open(matf,'w')
for l in open(outf):
fields = l.strip().split()
try:
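Because the label output is now gzip-compressed, anything that reads it directly must open it with gzip. A reading sketch with a hypothetical file name; the rows are assumed to be the id / id / score triples that mcxload -abc consumes later in rtd_run.py:

    import gzip

    # hypothetical name following the <outbase>.label.gz pattern above
    for line in gzip.open('sample_blat-subj00.label.gz'):
        fields = line.strip().split()   # assumed id / id / score row, as mcxload -abc expects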
preprocess_radtag_lane.py (102 changed lines)
@@ -129,23 +129,43 @@ def get_spreadsheet_key(target_sheet,gd_client=None):
return key,gd_client
-def get_table_as_dict(target_sheet,sq=None,gd_client=None,suppress_fc_check=False):
+def get_worksheet_key(target_worksheet,sheet_key,gd_client):
+ feed = gd_client.GetWorksheetsFeed(sheet_key)
+ key = [entry.id.text.rsplit('/', 1)[1] for entry in feed.entry if entry.title.text == target_worksheet][0]
+ return key,gd_client
+
+def get_all_worksheet_names(sheet_key,gd_client):
+ feed = gd_client.GetWorksheetsFeed(sheet_key)
+ names = [entry.title.text for entry in feed.entry]
+ return names,gd_client
+
+def get_table_as_dict(target_sheet,sq=None,gd_client=None,suppress_fc_check=False,target_worksheet=None):
key,gd_client = get_spreadsheet_key(target_sheet,gd_client)
+ if target_worksheet is None:
+ names = get_all_worksheet_names(key,gd_client)[0]
+ recs = []
+ for sheet_name in names:
+ print >> sys.stderr, 'add %s' % sheet_name
+ recs.extend(get_table_as_dict(target_sheet,sq=sq,gd_client=gd_client,suppress_fc_check=suppress_fc_check,target_worksheet=sheet_name))
+ return recs
+ else:
+ wskey,gd_client = get_worksheet_key(target_worksheet,key,gd_client)
if sq is not None:
q = gdata.spreadsheet.service.ListQuery()
q.sq = sq
- feed = gd_client.GetListFeed(key,query=q)
+ feed = gd_client.GetListFeed(key,wskey,query=q)
else:
- feed = gd_client.GetListFeed(key)
-
+ feed = gd_client.GetListFeed(key,wskey)
recs = []
for entry in feed.entry:
#d = []
#for el in entry.content.text.split(','):
-
+ #print entry.content.ToString()
try:
- recs.append(dict(re.findall('(.+?):\s(.+?)(?:(?:,\s)|$)',entry.content.text)))
+ #recs.append(dict(re.findall('([\d\w_-]+?):\s(.+?)(?:(?:,\s)|$)',entry.content.text)))
+ l = [m.strip(' ,:') for m in re.split('([^\s]+?:\s)',entry.content.text) if m]
+ recs.append(dict(zip(l[::2],l[1::2])))
if not suppress_fc_check and not all([k in recs[-1].keys() for k in ['flowcell','lane','pool']]):
print >> sys.stderr, 'missing keys:', dict(re.findall('(.+?):\s(.+?)(?:(?:,\s)|$)',entry.content.text))
print >> sys.stderr, 'line was:\n',entry.content.text
@@ -189,37 +209,47 @@ def get_individual_data_for_lane(filename=None,idxlookup=None,fc=None,lane=None,
lane = os.path.basename(filename)[2]
#print >> sys.stderr, fc,lane,idxlookup
fbase = os.path.basename(filename)
-
- key,gd_client = get_spreadsheet_key(LIBRARY_DATA)
-
- q = gdata.spreadsheet.service.ListQuery()
+ #
+ #key,gd_client = get_spreadsheet_key(LIBRARY_DATA)
+ #
+ #q = gdata.spreadsheet.service.ListQuery()
+ #
if 'index' in fbase:
- q.sq = 'flowcell="%s" and lane="%s" and index="%s"' % (fc,lane,fbase.split('index')[-1].split('.')[0])
+ # q.sq = 'flowcell="%s" and lane="%s" and index="%s"' % (fc,lane,fbase.split('index')[-1].split('.')[0])
+ sq = 'flowcell="%s" and lane="%s" and index="%s"' % (fc,lane,fbase.split('index')[-1].split('.')[0])
else:
- q.sq = 'flowcell="%s" and lane="%s"' % (fc,lane)
+ # q.sq = 'flowcell="%s" and lane="%s"' % (fc,lane)
+ sq = 'flowcell="%s" and lane="%s"' % (fc,lane)
else:
- key,gd_client = get_spreadsheet_key(LIBRARY_DATA)
-
- q = gdata.spreadsheet.service.ListQuery()
+ #key,gd_client = get_spreadsheet_key(LIBRARY_DATA)
+ #
+ #q = gdata.spreadsheet.service.ListQuery()
if index is not None:
- q.sq = 'flowcell="%s" and lane="%s" and index="%s"' % (fc,lane,index)
+ # q.sq = 'flowcell="%s" and lane="%s" and index="%s"' % (fc,lane,index)
+ sq = 'flowcell="%s" and lane="%s" and index="%s"' % (fc,lane,index)
else:
- q.sq = 'flowcell="%s" and lane="%s"' % (fc,lane)
-
+ # q.sq = 'flowcell="%s" and lane="%s"' % (fc,lane)
+ sq = 'flowcell="%s" and lane="%s"' % (fc,lane)
- feed = gd_client.GetListFeed(key,query=q)
- recs = []
- for entry in feed.entry:
- tl = [[st.strip() for st in el.split(':')] for el in entry.content.text.split(',')]
- try:
- recs.append(dict(tl))
- except:
- print >> sys.stderr, 'tuple list did not parse:\n\t%s' % tl
+ #feed = gd_client.GetListFeed(key,query=q)
+ #
+ #recs = []
+ #
+ #for entry in feed.entry:
+ # tl = [[st.strip() for st in el.split(':')] for el in entry.content.text.split(',')]
+ # try:
+ # recs.append(dict(tl))
+ # except:
+ # print >> sys.stderr, 'tuple list did not parse:\n\t%s' % tl
#recs = [dict([[st.strip() for st in el.split(':')] for el in entry.content.text.split(',')]) for entry in feed.entry]
- print >> sys.stderr, "%s records found for %s" % (len(recs), q.sq)
+
+ #try with table_as_dict
+ recs = get_table_as_dict(LIBRARY_DATA,sq=sq)
+
+ print >> sys.stderr, "%s records found for %s" % (len(recs), sq)
adaptersversions = set([r['adaptersversion'] for r in recs])
print >> sys.stderr, "adapters used: %s" % adaptersversions
idxs = reduce(lambda x,y: x+y, [idxlookup[adver].values() for adver in adaptersversions])
@@ -442,6 +472,24 @@ def assign_read_to_indiv(line,indiv_data,mismatch_allowed=1, \
qual = numpy.array(qual,dtype=int)
return indiv,read,qual
+def get_fastq_properties(fq):
+ fh = smartopen(fq)
+ chr1 = fh.readline()[0]
+ if chr1 == '@':
+ lnum = 4
+ print >> sys.stderr, 'using 4-line fastq'
+ else:
+ lnum = 1
+ print >> sys.stderr, 'using 1-line fastq'
+
+ baseQ = None
+ qfh = smartopen(fq)
+ while baseQ is None:
+ baseQ = get_baseQ(next_read_from_fh(qfh,lnum)[2])
+ qfh.close()
+ print >> sys.stderr, 'using quality encoding base-%s' % baseQ
+ return lnum,baseQ
+
def next_read_from_fh(fh,lnum=None):
if lnum is None:
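The replacement parser in get_table_as_dict splits on the 'key: ' tokens themselves instead of guessing where values end, so a comma inside a value no longer breaks the record. The same logic, stand-alone, on a made-up spreadsheet row:

    import re

    # made-up row content, including a comma inside the last value
    text = 'flowcell: ABC123, lane: 1, pool: pool2, notes: low yield, repeat'
    l = [m.strip(' ,:') for m in re.split('([^\s]+?:\s)', text) if m]
    rec = dict(zip(l[::2], l[1::2]))
    # rec == {'flowcell': 'ABC123', 'lane': '1', 'pool': 'pool2', 'notes': 'low yield, repeat'}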
rtd_run.py (105 changed lines)
@@ -39,9 +39,19 @@
from glob import glob
from copy import deepcopy
-import os, sys, re, numpy
+import os, sys, re, numpy, gzip
import gdata.spreadsheet.service
+def cat(filelist,targetfile):
+ '''cats an arbitrarily large filelist to targetfile'''
+ fh = smartopen(targetfile,'w')
+ print >> sys.stderr, '\n'
+ for i,f in enumerate(filelist):
+ print >> sys.stderr, '\r%s / %s' % (i,len(filelist)),
+ for l in open(f):
+ fh.write(l)
+ fh.close()
+
def load_uniqued(all_quality,uniqued,readlen=None,nticks=20,baseQ=None):
'''given a .uniqued file produced by preprocess_radtag_lane.py
@@ -214,42 +224,50 @@ def make_similarity_calc_inputs(unifiles,set_mincycles,nticks,cutsite,mIDfile,su
del(all_quality)
-def run_lsf_blat(subjects,queries,blattile,blatargstr='',num_batches=100):
+def run_lsf_blat(subjects,queries,blattile,blatargstr='',num_batches=100,queue='normal_serial'):
'''submits mcl_id_triples_by_blat.py jobs to LSF
intended as an example of parallelization over a compute grid;
uses a module LSF.py for interaction with scheduler
'''
- import LSF
+ import LSF,run_safe
blatargstr += ' -tileSize=%s' % blattile
blatargstr += ' -stepSize=%s' % (int(blattile)/2)
- cmds = []
+ #cmds = []
labf = []
+ to_run_dict = {}
for q in queries:
for subject in subjects:
subjname = os.path.basename(subject).rstrip('.fa').rstrip('_subj')
outbase = q.rstrip('.fa').rstrip('_query')+'_blat'+'-subj'+subjname+blatargstr.replace('=','').replace(' ','')
- labf.append(outbase+'.label')
- cmds.append('%smcl_id_triples_by_blat.py %s %s \\"%s\\" %s' % (radtag_denovo,subject,q,blatargstr,outbase))
-
- logfile = os.path.join(os.path.dirname(subjects[0]),'blat-log')
- logfiles = glob(logfile+'*.lsflog')
- for lf in logfiles:
- try:
- os.unlink(lf)
- except:
- pass
+ labf.append(outbase+'.label.gz')
+ # ESCAPES UNNECESSARY WITH safe_script
+ #cmds.append('%smcl_id_triples_by_blat.py %s %s \\"%s\\" %s' % (radtag_denovo,subject,q,blatargstr,outbase))
+ cmd = '%smcl_id_triples_by_blat.py %s %s "%s" %s' % (radtag_denovo,subject,q,blatargstr,outbase)
+ to_run_dict[outbase] = run_safe.safe_script(cmd,outbase)
+
+
+ logfile = os.path.join(os.path.dirname(subjects[0]),'blat-log/blat-log')
+ LSF.lsf_run_until_done(to_run_dict, logfile, queue, '-R "select[mem > 20000]"', 'blat2mat', num_batches, 3)
+
+ # REPLACED BY lsf_run_until_done ABOVE
+ #logfiles = glob(logfile+'*.lsflog')
+ #for lf in logfiles:
+ # try:
+ # os.unlink(lf)
+ # except:
+ # pass
#print >> sys.stderr, 'LSF %s\nlog: %s' % (cmds,logfile)
- import time
- while len(cmds) > 0:
- jobids,namedict = LSF.lsf_jobs_submit(cmds,logfile,'normal_serial',bsub_flags='-R "select[mem > 20000]"',jobname_base='blat2mat',num_batches=num_batches)
- time.sleep(20)
- LSF.lsf_wait_for_jobs(jobids,logfile,namedict=namedict)
- logfiles = glob(logfile+'*.lsflog')
- cmds = reduce(lambda x,y:x+y, [LSF.lsf_no_success_from_log(lf) for lf in logfiles])
+ #import time
+ #while len(cmds) > 0:
+ # jobids,namedict = LSF.lsf_jobs_submit(cmds,logfile,'normal_serial',bsub_flags='-R "select[mem > 20000]"',jobname_base='blat2mat',num_batches=num_batches)
+ # time.sleep(20)
+ # LSF.lsf_wait_for_jobs(jobids,logfile,namedict=namedict)
+ # logfiles = glob(logfile+'*.lsflog')
+ # cmds = reduce(lambda x,y:x+y, [LSF.lsf_no_success_from_log(lf) for lf in logfiles])
if not all([os.path.exists(f) for f in labf]):
raise OSError, 'blat failed'
@@ -271,8 +289,9 @@ def run_local_blat(subjects,queries,blattile,blatargstr='',num_cores=1):
for subject in subjects:
subjname = os.path.basename(subject).rstrip('.fa').rstrip('_subj')
outbase = q.rstrip('.fa').rstrip('_query')+'_blat'+'-subj'+subjname+blatargstr.replace('=','').replace(' ','')
- labf.append(outbase+'.label')
- cmds.append('%smcl_id_triples_by_blat.py %s %s "%s" %s' % (radtag_denovo,subject,q,blatargstr,outbase))
+ labf.append(outbase+'.label.gz')
+ cmd = '%smcl_id_triples_by_blat.py %s %s "%s" %s' % (radtag_denovo,subject,q,blatargstr,outbase)
+ cmds.append(run_safe.safe_script(cmd,outbase))
shscr = os.path.join(os.path.dirname(subjects[0]) , 'runblat.sh')
smartopen(shscr, 'w').writelines([cmd+';\n' for cmd in cmds])
@@ -297,8 +316,9 @@ def run_parallel_blat(subjects,queries,blattile,blatargstr='',num_cores='+0'):
for subject in subjects:
subjname = os.path.basename(subject).rstrip('.fa').rstrip('_subj')
outbase = q.rstrip('.fa').rstrip('_query')+'_blat'+'-subj'+subjname+blatargstr.replace('=','').replace(' ','')
- labf.append(outbase+'.label')
- cmds.append('%smcl_id_triples_by_blat.py %s %s "%s" %s' % (radtag_denovo,subject,q,blatargstr,outbase))
+ labf.append(outbase+'.label.gz')
+ cmd = '%smcl_id_triples_by_blat.py %s %s "%s" %s' % (radtag_denovo,subject,q,blatargstr,outbase)
+ cmds.append(run_safe.safe_script(cmd,outbase))
shscr = os.path.join(os.path.dirname(subjects[0]) , 'runblat.sh')
smartopen(shscr, 'w').writelines([cmd+';\n' for cmd in cmds])
@@ -427,9 +447,11 @@ def get_uniqued_error(infiles,cdest_searchbase):
outprefix += '_l%s' % minlen
labelfile = outprefix + '_all.label'
+ labdonefile = labelfile + '.done'
matfile = outprefix + '.mat'
tabfile = outprefix + '.tab'
+ matdonefile = matfile + '.done'
outprefix += '_l%s_mcl_I%0.1f' % (minlen,opts.mclradius)
@@ -473,8 +495,9 @@ def get_uniqued_error(infiles,cdest_searchbase):
# skip loads if relevant files already exist
if not os.path.exists(clunifile):
if not os.path.exists(grfile):
- if not (os.path.exists(matfile) and os.path.exists(tabfile)):
- if not os.path.exists(labelfile):
+ if not (os.path.exists(matfile) and os.path.exists(tabfile) and os.path.exists(matdonefile)):
+ if not (os.path.exists(labelfile) and os.path.exists(labdonefile)): #back to concat; testing 20130103
+ #if 1: # concatenated label file no longer generated; hardcoded for now
if not (os.path.exists(mIDfile) and all([os.path.exists(subj) for subj in subjects]) and all([os.path.exists(query) for query in queries])):
# make similarity calc inputs
make_similarity_calc_inputs(infiles,set_mincycles,nticks,cutsite,mIDfile,subjects,queries)
@@ -484,20 +507,34 @@ def get_uniqued_error(infiles,cdest_searchbase):
print >> sys.stderr, 'generate labels %s by %s' % (labelfile,match_engine)
labf = run_match(opts.match_engine,opts.parallel_engine,opts.ncores,subjects,queries,minlen,blatargstr)
+ # BACK TO CONCAT LABELS # concatenate replaced by
print >> sys.stderr, 'match complete, concatenate...',
- catcmd = 'cat %s > %s' % (' '.join(labf), labelfile)
- ret = os.system(catcmd)
- if ret != 0:
- raise OSError, 'cat failed with code %s\n%s' % (ret,catcmd)
-
- print >> sys.stderr, 'done'
+ cat(labf,labelfile)
+
+ print >> sys.stderr, 'labels done'
+ os.system('touch %s' % labdonefile)
else:
print >> sys.stderr, 'using labels %s' % labelfile
# make matrix
print >> sys.stderr, 'Write matrix for clustering'
- ret = os.system('mcxload -abc %s -o %s -write-tab %s' % (labelfile,matfile,tabfile))
+
+ # BACK TO CONCAT LABELS # no longer using concatenated label file
+ ret = os.system('mcxload -abc %s -o %s -write-tab %s' % (labelfile,matfile,tabfile))
+
+ # BACK TO CONCAT LABELS 20120103
+ #mcxload_ps = Popen('mcxload -abc - -o %s -write-tab %s' % (matfile,tabfile),shell=True,bufsize=1,stdin=PIPE)
+ #for lf in labf:
+ # lfh = gzip.open(lf)
+ # for l in lfh:
+ # mcxload_ps.stdin.write(l)
+ #mcxload_ps.communicate()
+ #ret = mcxload_ps.returncode
+ # /NO LABELS
+
if ret != 0:
raise OSError, 'mcxload failed with code %s' % ret
+
+ ret = os.system('touch %s' % matdonefile)
else:
print >> sys.stderr, '%s and %s present, using' % (matfile,tabfile)
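Two related changes here: the shell 'cat a b ... > out' becomes an in-Python loop (no argument-length limit for long file lists), and the new .done flag files keep a partially written labelfile or matfile from being reused after a crash. A stand-alone sketch of that guard with made-up file names:

    import os

    labf = ['part%02d.label' % i for i in range(3)]   # hypothetical per-batch label files
    labelfile = 'run_all.label'                       # hypothetical concatenated target
    labdonefile = labelfile + '.done'

    # rebuild only if the target or its completion flag is missing
    if not (os.path.exists(labelfile) and os.path.exists(labdonefile)):
        fh = open(labelfile, 'w')
        for f in labf:
            for l in open(f):
                fh.write(l)
        fh.close()
        os.system('touch %s' % labdonefile)   # flag written only after the whole concatenation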
run_safe.py (39 changed lines)
@@ -2,11 +2,34 @@
import os,sys
-cmd,done = sys.argv[1:]
-
-if os.path.exists(done):
- print >> sys.stderr, 'completion flag exists: %s' % done
-else:
- ret = os.system(cmd)
- if ret == 0:
- os.system('touch %s' % done)
+def safe_script(cmd,donefile,donesuffix='.done',scriptfile=None,force_write=False):
+ if scriptfile is None:
+ scriptfile = donefile+'.sh'
+ if os.path.exists(scriptfile) and not force_write:
+ #file present; use
+ pass
+ else:
+ open(scriptfile,'w').write('#!/usr/bin/env sh\nset -e\n%s\n' % (cmd))
+ ret = os.system('chmod +x %s' % scriptfile)
+ if ret != 0:
+ raise OSError, 'cannot set execute for %s' % scriptfile
+ return 'run_safe.py \"%s\" %s' % (scriptfile,donefile+donesuffix)
+
+def unfinished_cmds(to_run_dict,finished_ext='.done'):
+ cmds = []
+ for finished_base,cmd in to_run_dict.items():
+ if not os.path.exists(finished_base+finished_ext):
+ cmds.append(cmd)
+ return cmds
+
+if __name__ == "__main__":
+ cmd,done = sys.argv[1:]
+
+ if os.path.exists(done):
+ print >> sys.stderr, 'completion flag exists: %s' % done
+ else:
+ ret = os.system(cmd)
+ if ret == 0:
+ os.system('touch %s' % done)
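Taken together, safe_script and unfinished_cmds give an idempotent re-run loop: each command gets a set -e wrapper script plus a .done flag, and only commands whose flag is missing come back on the next pass. A small local sketch with made-up step names; it assumes run_safe.py is executable and on PATH (on a cluster the wrapped commands would go to LSF.lsf_run_until_done instead):

    import os, sys
    import run_safe

    to_run_dict = {}
    for i in range(3):
        base = 'step%02d' % i                       # hypothetical flag base; wrapper becomes step%02d.sh
        cmd = 'echo running step %d' % i            # hypothetical command
        to_run_dict[base] = run_safe.safe_script(cmd, base)

    # only commands without a <base>.done flag are returned
    for wrapped in run_safe.unfinished_cmds(to_run_dict):
        os.system(wrapped)                          # run_safe.py touches <base>.done on success

    print >> sys.stderr, run_safe.unfinished_cmds(to_run_dict)   # [] once every flag exists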
+
+
vcf_to_rqtl.py (44 changed lines)
@@ -180,29 +180,21 @@ def load_vcf(vcf,cutoff_fn=None,ding_on=100000,store_only=None,indiv_gt_phred_cu
if i % ding_on == 0: print >> sys.stderr, 'reading',i
i += 1
- #skip bogus lines -- probably want to warn on this somehow...
- if '\x00' in line:
- print >> sys.stderr, 'Null byte in line %s, skipping' % i
- continue
-
- if line.startswith('#'):
+ sd = load_vcf_line(vcfh,headers,exp_elements,FORMAT,indiv_gt_phred_cut,store_indiv_prefix,drop_indiv,biallelic_sites,skip_noGQ)
+ if sd is None:
continue
- else:
- sd = load_vcf_line(vcfh,headers,exp_elements,FORMAT,indiv_gt_phred_cut,store_indiv_prefix,drop_indiv,biallelic_sites,skip_noGQ)
- if sd is None:
- continue
- elif sd == 'EOF':
- break
- key = (sd['CHROM'],sd['POS'])
- if cutoff_fn is None or cutoff_fn(sd):
- if store_only is not None:
- keep_sd = {}
- for k in sd:
- if k in store_only:
- keep_sd[k] = sd[k]
- sd = keep_sd
- if len(sd) > 0:
- vcf_data[key] = sd
+ elif sd == 'EOF':
+ break
+ key = (sd['CHROM'],sd['POS'])
+ if cutoff_fn is None or cutoff_fn(sd):
+ if store_only is not None:
+ keep_sd = {}
+ for k in sd:
+ if k in store_only:
+ keep_sd[k] = sd[k]
+ sd = keep_sd
+ if len(sd) > 0:
+ vcf_data[key] = sd
return vcf_data
@@ -337,9 +329,9 @@ def sortkey(x):
#parent_str = 'Ep,Ti'
#qd = 6
#gq = 20
- min_indiv = 50
+ min_indiv = 2
fh = 0.7
- site_before = 45 #polymorphism must occur before this base in a fragment
+ site_before = numpy.inf #polymorphism must occur before this base in a fragment
#chi2crit = 30
#vcfn,qd,gq,chi2crit = sys.argv[1:]
@@ -353,9 +345,11 @@ def sortkey(x):
print >> sys.stderr, 'loading vcf',vcfn
vcf = load_vcf(vcfn,cutoff_fn=cut_fn,indiv_gt_phred_cut=float(gq))
+ print >> sys.stderr, '%s sites loaded' % len(vcf)
print >> sys.stderr, 'convert to pm/gt matrices'
- pm,gt = genotypes_from_vcf_obj(vcf)
+ pm,gt = genotypes_from_vcf_obj(vcf,min_indiv=min_indiv)
+ print >> sys.stderr, 'length pm: %s length gt: %s' % (len(pm),len(gt))
parents_prefixes = dict(zip(['A', 'B'],parent_str.split(',')))
parents = dict([(l,[k for k in gt.keys() if k.startswith(p)]) for l,p in parents_prefixes.items()])
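For reference, load_vcf's filtering hooks as used above: cutoff_fn decides whether a site is kept and store_only trims each stored site dict to the listed keys. A calling sketch with a made-up file name and chromosome; treating 'indiv_gt' as the per-individual genotype key is an assumption carried over from the companion template-map script:

    import sys

    # keep only chr2 sites (made-up criterion); trim each stored site dict to three keys
    cut_fn = lambda sd: sd['CHROM'] == 'chr2'
    vcf = load_vcf('example.vcf', cutoff_fn=cut_fn,
                   store_only=['CHROM', 'POS', 'indiv_gt'],   # 'indiv_gt' key assumed
                   indiv_gt_phred_cut=20.0)
    print >> sys.stderr, '%s sites loaded' % len(vcf)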
vcf_to_rqtl_from_template_map.py (7 changed lines)
@@ -10,6 +10,7 @@
from short_read_analysis import variant_detection
from short_read_analysis import extract_genotypes_from_mclgr
+import preprocess_radtag_lane
def load_vcf(vcf,allele_map,indiv_gt_phred_cut=None,ding_on=100000,return_map=False):
'''processes a vcf file, adding genotypes satisfying GQ cutoff indiv_gt_phred_cut to a returned cross genotype object
@@ -22,7 +23,7 @@ def load_vcf(vcf,allele_map,indiv_gt_phred_cut=None,ding_on=100000,return_map=Fa
vcf_data = {}
i = 0
- for line in open(vcf):
+ for line in preprocess_radtag_lane.smartopen(vcf):
if i % ding_on == 0: print >> sys.stderr, 'reading',i
i += 1
@@ -67,9 +68,9 @@ def load_vcf(vcf,allele_map,indiv_gt_phred_cut=None,ding_on=100000,return_map=Fa
#populate individual genotype metrics provided each GQ >= indiv_gt_phred_cut if defined
sd['indiv_gt'] = {}
for ind,gt in zip(headers[FORMAT+1:],fields[FORMAT+1:]):
- if ':' in gt:
+ if not gt.startswith('./.') and ':' in gt:
this_gt = dict(zip(fields[FORMAT].split(':'),gt.split(':')))
- if indiv_gt_phred_cut is None or float(this_gt['GQ']) >= indiv_gt_phred_cut:
+ if indiv_gt_phred_cut is None or float(this_gt['GQ'] != '.' and this_gt['GQ'] or '0') >= indiv_gt_phred_cut:
sd['indiv_gt'][ind] = this_gt
if return_map:
new_map[ind].update({loc:''.join([allele_map[loc][n] for n in sd['indiv_gt'][ind]['GT'].split('/')])})
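The new GQ test relies on the old and/or ternary so a missing GQ ('.') is scored as 0 instead of crashing float('.'). The same check written out on a made-up genotype entry:

    this_gt = {'GT': '0/1', 'GQ': '.'}   # hypothetical genotype entry with an uncalled GQ
    indiv_gt_phred_cut = 20.0

    gq = this_gt['GQ'] != '.' and this_gt['GQ'] or '0'   # '.' falls through to '0'
    keep = indiv_gt_phred_cut is None or float(gq) >= indiv_gt_phred_cut
    # keep is False: a missing GQ scores 0 and never passes a positive cutoff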