Permalink
Browse files

DB_index_by_well.csv added

1 parent 2750d8f commit cf6aeda3eb8473e5dcd519a77e1be5b77cdb7262 @brantp committed Jun 27, 2012
Showing with 161 additions and 10 deletions.
  1. +109 −0 DB_index_by_well.csv
  2. +33 −7 preprocess_radtag_lane.py
  3. +3 −2 rtd_run.py
  4. +16 −1 sam_from_clust_uniqued.py
View
@@ -0,0 +1,109 @@
+,adaptersversion,well,idx
+,rad48,A1,GCATG
+,rad48,B1,AACCA
+,rad48,C1,CGATC
+,rad48,D1,TCGAT
+,rad48,E1,TGCAT
+,rad48,F1,CAACC
+,rad48,G1,GGTTG
+,rad48,H1,AAGGA
+,rad48,A2,AGCTA
+,rad48,B2,ACACA
+,rad48,C2,AATTA
+,rad48,D2,ACGGT
+,rad48,E2,ACTGG
+,rad48,F2,ACTTC
+,rad48,G2,ATACG
+,rad48,H2,ATGAG
+,rad48,A3,ATTAC
+,rad48,B3,CATAT
+,rad48,C3,CGAAT
+,rad48,D3,CGGCT
+,rad48,E3,CGGTA
+,rad48,F3,CGTAC
+,rad48,G3,CGTCG
+,rad48,H3,CTGAT
+,rad48,A4,CTGCG
+,rad48,B4,CTGTC
+,rad48,C4,CTTGG
+,rad48,D4,GACAC
+,rad48,E4,GAGAT
+,rad48,F4,GAGTC
+,rad48,G4,GCCGT
+,rad48,H4,GCTGA
+,rad48,A5,GGATA
+,rad48,B5,GGCCA
+,rad48,C5,GGCTC
+,rad48,D5,GTAGT
+,rad48,E5,GTCCG
+,rad48,F5,GTCGA
+,rad48,G5,TACCG
+,rad48,H5,TACGT
+,rad48,A6,TAGTA
+,rad48,B6,TATAC
+,rad48,C6,TCACG
+,rad48,D6,TCAGT
+,rad48,E6,TCCGG
+,rad48,F6,TCTGC
+,rad48,G6,TGGAA
+,rad48,H6,TTACC
+,craig12,1,GCATG
+,craig12,2,AACCA
+,craig12,3,CCCCC
+,craig12,4,CGATC
+,craig12,5,TCGAT
+,craig12,6,TGCAT
+,craig12,7,CAACC
+,craig12,8,GGTTG
+,craig12,9,AAGGA
+,craig12,10,AGCTA
+,craig12,11,ACACA
+,craig12,12,AATTA
+,flex48,1,GCATG
+,flex48,2,AACCA
+,flex48,3,CGATC
+,flex48,4,TCGAT
+,flex48,5,TGCAT
+,flex48,6,CAACC
+,flex48,7,GGTTG
+,flex48,8,AAGGA
+,flex48,9,AGCTA
+,flex48,10,ACACA
+,flex48,11,AATTA
+,flex48,12,ACGGT
+,flex48,13,ACTGG
+,flex48,14,ACTTC
+,flex48,15,ATACG
+,flex48,16,ATGAG
+,flex48,17,ATTAC
+,flex48,18,CATAT
+,flex48,19,CGAAT
+,flex48,20,CGGCT
+,flex48,21,CGGTA
+,flex48,22,CGTAC
+,flex48,23,CGTCG
+,flex48,24,CTGAT
+,flex48,25,CTGCG
+,flex48,26,CTGTC
+,flex48,27,CTTGG
+,flex48,28,GACAC
+,flex48,29,GAGAT
+,flex48,30,GAGTC
+,flex48,31,GCCGT
+,flex48,32,GCTGA
+,flex48,33,GGATA
+,flex48,34,GGCCA
+,flex48,35,GGCTC
+,flex48,36,GTAGT
+,flex48,37,GTCCG
+,flex48,38,GTCGA
+,flex48,39,TACCG
+,flex48,40,TACGT
+,flex48,41,TAGTA
+,flex48,42,TATAC
+,flex48,43,TCACG
+,flex48,44,TCAGT
+,flex48,45,TCCGG
+,flex48,46,TCTGC
+,flex48,47,TGGAA
+,flex48,48,TTACC
View
@@ -55,7 +55,17 @@ def smartopen(filename,*args,**kwargs):
return open(filename,*args,**kwargs)
-def get_read_count(filename,lnum=None):
+def get_read_count(filename,lnum=None,use_cache=True):
+
+ if use_cache:
+ rcc = filename+'.rc.cache'
+ try:
+ filesize,rc = open(rcc).readline().strip().split()
+ if float(filesize) == os.path.getsize(filename):
+ print >> sys.stderr, 'read count from cached value: %s' % rc
+ return int(rc)
+ except:
+ pass
if lnum is None:
if smartopen(filename).read(1) == '@':
@@ -67,12 +77,15 @@ def get_read_count(filename,lnum=None):
print >> sys.stderr, 'getting read count for compressed file',filename,'...',
rc = int(Popen('gunzip -c %s | wc -l' % filename,shell=True,stdout=PIPE).stdout.read().split()[0]) / lnum
print >> sys.stderr, rc
- return rc
else:
print >> sys.stderr, 'getting read count for file',filename,'...',
rc = int(Popen('wc -l %s' % filename,shell=True,stdout=PIPE).stdout.read().split()[0]) / lnum
print >> sys.stderr, rc
- return rc
+
+ if use_cache:
+ open(rcc,'w').write('%s\t%s\n' % (os.path.getsize(filename),rc))
+
+ return rc
def get_baseQ(qstr):
q = [ord(c) for c in qstr]
@@ -214,7 +227,16 @@ def get_individual_data_for_lane(filename=None,idxlookup=None,fc=None,lane=None,
if len(idxlens) != 1:
raise ValueError, 'non-uniform index lengths %s for %s' % (idxlens,filename)
- sampleids = [r['sampleid'] for r in recs]
+ try:
+ sampleids = [r['sampleid'] for r in recs]
+ except KeyError:
+ try: #permit backup sample ID use
+ sampleids = [r['sampleid2'] for r in recs]
+ except KeyError:
+ print >> sys.stderr, 'not all samples have ID:'
+ for d in recs:
+ print >> sys.stderr, d.get('sampleid','MISSING'),d
+ raise
wells = [r['adapter'] for r in recs]
if len(set(sampleids)) != len(sampleids):
@@ -334,8 +356,12 @@ def assign_read_to_indiv(line,indiv_data,mismatch_allowed=1, \
ts = [s[:idxlen] for s in ss]
tqs = [qstr[:idxlen] for qstr in qstrs]
tagdists = [sorted([(distance(t_this,t),t_this) for t_this in indiv_data.keys()]) for t in ts]
- indiv_cand = [indiv_data[tagdist[0][1]]['sampleid'] for tagdist in tagdists \
- if tagdist[0][0] <= mismatch_allowed and tagdist[1][0] > mismatch_allowed]
+ try:
+ indiv_cand = [indiv_data[tagdist[0][1]]['sampleid'] for tagdist in tagdists \
+ if tagdist[0][0] <= mismatch_allowed and tagdist[1][0] > mismatch_allowed]
+ except:
+ indiv_cand = [indiv_data[tagdist[0][1]]['sampleid2'] for tagdist in tagdists \
+ if tagdist[0][0] <= mismatch_allowed and tagdist[1][0] > mismatch_allowed]
if len(set(indiv_cand)) == 1:
indiv = indiv_cand[0]
read = [s[idxlen:] for s in ss]
@@ -360,7 +386,7 @@ def assign_read_to_indiv(line,indiv_data,mismatch_allowed=1, \
else:
if indiv_reads_out_pattern is not None:
for h,t,tq,s,q,rn,pat in zip(heads,ts,tqs,read,qual,[1,2],indiv_reads_out_pattern):
- newhead = '%s:%s:%s' % (h,t,tq)
+ newhead = '%s %s:%s' % (h,t,tq)
try:
fhdict[(indiv,rn)].write(as_fq_line(newhead,s,q,baseQ_out,output_lnum))
except KeyError:
View
@@ -483,9 +483,10 @@ def get_uniqued_error(infiles,cdest_searchbase):
labf = run_match(opts.match_engine,opts.parallel_engine,opts.ncores,subjects,queries,minlen,blatargstr)
print >> sys.stderr, 'match complete, concatenate...',
- ret = os.system('cat %s > %s' % (' '.join(labf), labelfile))
+ catcmd = 'cat %s > %s' % (' '.join(labf), labelfile)
+ ret = os.system(catcmd)
if ret != 0:
- raise OSError, 'cat failed with code %s' % ret
+ raise OSError, 'cat failed with code %s\n%s' % (ret,catcmd)
print >> sys.stderr, 'done'
else:
View
@@ -12,6 +12,21 @@
from collections import defaultdict
from config import RTDROOT
+def next_cluster_lines(fh):
+ this_cl = None
+ cl_lines = []
+ for l in fh:
+ if l.split()[0] != this_cl:
+ if this_cl is not None:
+ return cl_lines
+ else:
+ this_cl = l.split()[0]
+ cl_lines.append(l)
+ else:
+ cl_lines.append(l)
+
+
+
def samline_from_alnpair(rname,raln,qname,qaln,qqual):
if set(qqual) == set(['#']):
return None
@@ -169,7 +184,7 @@ def aln_from_clust(clname,cl_lines,keep_seqs=None,seq_len=0,break_on_error=True)
[zip( l[5].split(','), [int(i) for i in l[6].split(',')] ) for l in cl_lines] ) , \
key=lambda x: (len(x[1].replace('-','').replace('N','')),len(x[3]),len(x[2].replace('#',''))),reverse=True)
except:
- print >> sys.stderr, 'alignment failed for cluster %s (%s lines)' % (cl_name,len(cl_lines))
+ print >> sys.stderr, 'alignment failed for cluster %s (%s lines)' % (clname,len(cl_lines))
if break_on_error:
raise
else:

0 comments on commit cf6aeda

Please sign in to comment.