Permalink
Browse files

iterative_rtd updates

  • Loading branch information...
brantp committed Sep 24, 2013
1 parent 30456d0 commit ac9e29cd45f55562e7f29903eb95a665b3e12ede
Showing with 52 additions and 21 deletions.
  1. +23 −10 iterative_rtd.py
  2. +29 −11 preprocess_radtag_lane.py
View
@@ -3,7 +3,7 @@
iterative mapping-assembly by rtd over fragment size
'''
-import os,sys
+import os,sys,re
import run_safe
from subprocess import Popen,PIPE
from preprocess_radtag_lane import smartopen,get_read_count
@@ -170,6 +170,14 @@ def write_uniqued_by_size(all_quality,outbase,baseQ=33):
return ofbysize
def get_uniqued_by_size(outbase):
    '''Recover the {fragment_size: uniqued_path} mapping from files on disk.

    Scans for files previously written by write_uniqued_by_size (named
    "<outbase>-<size>.uniqued.gz") and rebuilds the dict that function
    returns, so a completed by-size pass can be reused without reloading
    all uniqued input.

    Returns:
        dict mapping int fragment size -> path to the .uniqued.gz file.
    '''
    ofbysize = {}
    for u in glob(outbase + '-*.uniqued.gz'):
        # raw string: '\d' in a plain literal is an invalid escape in newer Pythons
        m = re.search(r'-(\d+)\.uniqued\.gz', u)
        # skip any stray file the glob matched but the size pattern did not,
        # instead of crashing on m.group() of None
        if m:
            ofbysize[int(m.group(1))] = u
    return ofbysize
+
def filter_uniqued(uniqued,outfile,lines_to_write):
ofh = smartopen(outfile,'w')
for i,l in enumerate(smartopen(uniqued)):
@@ -199,20 +207,25 @@ def ref_len(ref):
uniqueds = sys.argv[3:]
bysize_dir = os.path.join(outdir,'by_size/uni_len')
+ bysize_done = os.path.join(outdir,'by_size.done')
denovo_ref = os.path.join(outdir,'denovo.fa')
if os.path.exists(denovo_ref):
print >> sys.stderr, 'REMOVE REF: %s' % denovo_ref
os.unlink(denovo_ref)
- all_quality = defaultdict(dict)
-
- for uniqued in uniqueds:
- load_uniqued(all_quality,uniqued,count_by_ind=True)
-
- print >> sys.stderr, 'LOAD COMPLETE. WRITE BY-SIZE.'
- ofbysize = write_uniqued_by_size(all_quality,bysize_dir)
- del all_quality
+ if os.path.exists(bysize_done):
+ ofbysize = get_uniqued_by_size(bysize_dir)
+ else:
+ all_quality = defaultdict(dict)
+
+ for uniqued in uniqueds:
+ load_uniqued(all_quality,uniqued,count_by_ind=True)
+
+ print >> sys.stderr, 'LOAD COMPLETE. WRITE BY-SIZE.'
+ ofbysize = write_uniqued_by_size(all_quality,bysize_dir)
+ del all_quality
+ ret = os.system('touch %s' % bysize_done)
sizes = sorted(ofbysize.keys(),reverse=True)
@@ -240,7 +253,7 @@ def ref_len(ref):
cmd = "rtd_run.py -pe parallel -np 8 -nc 8 -s CATG -cd 1 -mi 1 --cleanup -te 8 %s %s" % (outdir,funi)
ret = os.system(cmd)
if ret != 0:
- raise OSError
+ raise OSError,cmd
refadd = glob(outdir+'/*.fa')[0]
append_to_ref(denovo_ref,refadd,i)
View
@@ -255,6 +255,15 @@ def get_table_as_dict(target_sheet,sq=None,gd_client=None,suppress_fc_check=Fals
return recs
def no_net_get_table_as_dict(target_sheet,host='heroint4'):
    '''Fetch a spreadsheet table by proxying through an ssh host with network access.

    Runs get_table_as_dict remotely (with suppress_fc_check=True), captures the
    repr() of the result from the remote python's stdout (last line only; remote
    stderr is discarded), and parses it back into records locally.

    Arguments:
        target_sheet -- spreadsheet name passed to get_table_as_dict on the remote end
        host         -- ssh host used as the network proxy (default 'heroint4')

    Returns the parsed table records (same structure get_table_as_dict returns).
    '''
    print >> sys.stderr, 'retrieve %s via %s ...' % (target_sheet,host),
    from subprocess import Popen,PIPE
    import ast
    cmd = 'ssh %s "python -c \\"from rtd.preprocess_radtag_lane import get_table_as_dict; td=get_table_as_dict(\'%s\',suppress_fc_check=True); print td.__repr__()\\"" 2> /dev/null | tail -n 1' % (host,target_sheet)
    td_str = Popen(cmd,shell=True,stdout=PIPE).stdout.read()
    # literal_eval instead of eval: the remote output is untrusted text, and the
    # repr of plain record dicts/lists parses fine without arbitrary code execution
    td = ast.literal_eval(td_str)
    print >> sys.stderr, '%s records' % len(td)
    return td
+
def get_adapter_index_lookup(verbose=True):
'''returns dict of dicts:
{ <adaptersversion> : { <well> : <index_seq> } }
@@ -278,7 +287,7 @@ def get_adapter_index_lookup(verbose=True):
print >> sys.stderr, 'loaded adapter lookup from %s lines in %s' % (len(d),ADAPTER_DATA)
return idxlookup
-def get_individual_data_for_lane(filename=None,idxlookup=None,fc=None,lane=None,index=None):
+def get_individual_data_for_lane(filename=None,idxlookup=None,fc=None,lane=None,index=None,transtable=None,get_transtable_from_mouseDB=False):
'''given a fastq file, treats the directory immediately above as the flowcell ID, returns dict:
{ <sequence_index_tag> : **ROW_FROM_LIBRARY_DATA }
'''
@@ -355,9 +364,19 @@ def get_individual_data_for_lane(filename=None,idxlookup=None,fc=None,lane=None,
if len(set(wells)) != len(wells):
raise ValueError, '%s wells, %s unique' % (len(wells),len(set(wells)))
+ if transtable is None and get_transtable_from_mouseDB:
+ transtable,failures = get_legacy_to_DB_lookup(recs)
+
indiv_data = {}
for r in recs:
- indiv_data[idxlookup[r['adaptersversion']][r['adapter']]] = r
+ if transtable is not None:
+ if transtable.has_key(r['sampleid']):
+ r['sampleid'] = transtable[r['sampleid']]
+ indiv_data[idxlookup[r['adaptersversion']][r['adapter']]] = r
+ else:
+ print >> sys.stderr, '%s missing from transtable; check database lookup' % r['sampleid']
+ else:
+ indiv_data[idxlookup[r['adaptersversion']][r['adapter']]] = r
return indiv_data
@@ -662,10 +681,12 @@ def write_uniqued(all_quality,outfile,baseQ):
parser.add_argument('-ol','--output_lnum',default='4',choices=['1','4'],type=int,help='number of lines per record in fastq output if -w is specified (either older 1-line, or newer 4-line)'+ds)
parser.add_argument('-fc','--flowcell',default=None,type=str,help='flowcell name (if None, derive from sequence infile path)'+ds)
parser.add_argument('-l','--lane',default=None,type=str,help='lane (if None, derive from sequence infile name)'+ds)
- parser.add_argument('-idx','--index',default=None,type=str,help='multiplex index (if None, derive from sequence infile name)'+ds)
+ parser.add_argument('-idx','--index',default=None,type=str,help='multiplex index (if None, no index applied)'+ds)
parser.add_argument('-suf','--suffix',default=None,type=str,help='suffix for .uniqued file (permits processing split files)'+ds)
+ parser.add_argument('--force_db_id',action='store_true',help='force mouse database ids for individuals \n(replacing legacy sampleid from DB_library_data)'+ds)
+
parser.add_argument('-e','--estimate_error',action='store_true',help='invokes clustering to estimate error rate after completion of preprocessing'+ds)
parser.add_argument('-ee','--est_err_engine',default='local',choices=['local','parallel','lsf'],help='use this engine for parallelizing error estimate (rtd_run -pe argument)'+ds)
parser.add_argument('-ec','--est_err_cores',default=1,type=int,help='parallelize error estimate run over this number of cores (serial if less than 2) REQUIRES GNU PARALLEL'+ds)
@@ -733,15 +754,12 @@ def write_uniqued(all_quality,outfile,baseQ):
nreads = get_read_count(r1,lnum)
#index info append
- if opts.index is not None:
- index = opts.index
- idxstr = '_index%s' % index
- elif 'index' in os.path.basename(r1):
- index = os.path.basename(r1).split('index')[-1].split('.')[0]
- idxstr = '_index%s' % index
- else:
+ if opts.index is None or opts.index == 'None':
index = None
idxstr = ''
+ else:
+ index = opts.index
+ idxstr = '_index%s' % index
if opts.suffix is not None:
idxstr = idxstr+'_'+opts.suffix
@@ -751,7 +769,7 @@ def write_uniqued(all_quality,outfile,baseQ):
if nreads != nreads2:
raise ValueError, '%s read count: %s; %s read count: %s' % (r1, nreads, r2, nreads2)
- indiv_data = get_individual_data_for_lane(idxlookup=idxlookup,fc=fc,lane=lane,index=index)
+ indiv_data = get_individual_data_for_lane(idxlookup=idxlookup,fc=fc,lane=lane,index=index,get_transtable_from_mouseDB=opts.force_db_id)
adaptersversions = set([r['adaptersversion'] for r in indiv_data.values()])
idxs = reduce(lambda x,y: x+y, [idxlookup[adver].values() for adver in adaptersversions])

0 comments on commit ac9e29c

Please sign in to comment.