Permalink
Browse files

20120208 radtag_denovo code added

  • Loading branch information...
brantp committed Feb 8, 2012
1 parent 78bf432 commit 8e93363cb18c05358a58a10a1eb4d4d432d95d8e
View
@@ -0,0 +1 @@
+
View
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+
+import os,sys
+from radtag_denovo import preprocess_radtag_lane
+from subprocess import Popen,PIPE
+
+bam,lane,outroot = sys.argv[1:]
+
+idx_len = 6
+idx_table = 'DB_multiplex_indices'
+idx_field = 'RT'
+
+
+print >> sys.stderr, 'get index lookup ...',
+idx_d = dict([(d['seq'],d['idx']) for d in preprocess_radtag_lane.get_table_as_dict(idx_table)])
+print >> sys.stderr, '%s indices' % len(idx_d)
+
+bhandle = Popen('samtools view %s' % bam,shell=True,stdout=PIPE).stdout
+
+try:
+ os.makedirs(outroot)
+except:
+ pass
+
+outfhs = {}
+for idx in idx_d.values():
+ outf = os.path.join(outroot,'s_%s_sequence_index%s.txt' % (lane,idx))
+ outfhs[idx] = open(outf,'w')
+
+outfhs['pass'] = open(os.path.join(outroot,'s_%s_sequence_index%s.txt' % (lane,'PASS')),'w')
+
+print >> sys.stderr, 'process bam'
+for samline in bhandle:
+ fqstr,idx = preprocess_radtag_lane.sam_line_to_fastq(samline,idx_field,idx_d,idx_len)
+ if idx is None:
+ outfhs['pass'].write(fqstr)
+ else:
+ outfhs[idx].write(fqstr)
+
+for outfh in outfhs.values():
+ outfh.close()
View
@@ -0,0 +1,14 @@
+# login credentials for google docs account containing library and adapter data
+EMAIL = "" # google docs login
+PASS = "" # google docs password
+SOURCE = 'RADbase' # docuument source (leave as-is)
+LIBRARY_DATA = 'DB_library_data'
+ADAPTER_DATA = 'DB_index_by_well'
+
+# scratch must be set to a folder to which you have write permission:
+SCRATCH = ""
+
+# radtag_denovo must be the full path of the folder in which
+# this and its accessory scripts reside
+RTDROOT = ""
+
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+'''
+given a uniqued file and set of new reads, passes only those reads that perfectly match a uniqued entry present at or above some copy number
+'''
+
+import os,sys
+import numpy
+from editdist import distance
+from preprocess_radtag_lane import next_read_from_fh, smartopen, get_read_count
+
+idx_bp = 5
+cut_bp = 5
+
+lnum = 4
+min_seqs = 7
+
+uniqued, fastq = sys.argv[1:]
+
+readlen = len(next_read_from_fh(smartopen(fastq),4)[1])
+
+print >> sys.stderr, 'readlen: %s' % readlen
+
+num_reads = get_read_count(fastq,4)
+tickon = num_reads/200
+
+useqs = []
+for l in open(uniqued):
+ s,cntstr = l.strip().split()[0], l.strip().split()[4]
+ cnt = numpy.mean([int(i) for i in cntstr.split(',')])
+ if cnt >= min_seqs:
+ useqs.append(s[cut_bp:readlen-idx_bp])
+
+useqs = list(set(useqs))
+print >> sys.stderr, '%s unique %sbp sequences in uniqued file' % (len(useqs),len(s[cut_bp:readlen-idx_bp]))
+
+fh = smartopen(fastq)
+
+for i in range(num_reads):
+ if i%tickon==0: print >> sys.stderr, '%s / %s' % (i,num_reads)
+ line = next_read_from_fh(fh)
+ found = False
+ for seq in useqs:
+ if line[1][idx_bp+cut_bp:] == seq:
+ found = True
+ break
+
+ if found:
+ print '\n'.join(['@'+line[0],line[1],'+',line[2]])
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+
+import numpy,os,sys,re
+from preprocess_radtag_lane import get_baseQ
+
+
+def load_cluster_data(gr,tab,mID=None):
+ '''given an mcl graph output file, an mcl tab file of sequence labels, and an mID file of individuals per label,
+
+ returns (clusters,labels,mID_by_label) dicts
+ '''
+
+ #extract clusters from mcl output
+ print >>sys.stderr, 'load graph...',
+ fc = open(gr).read()
+ print >> sys.stderr, 'done\nparse graph...',
+ body = re.search('begin(.+?)\)',fc.replace('\n',' ')).groups()[0]
+ clusters = dict([(s.strip().split()[0],s.strip().split()[1:]) for s in body.strip().split('$') if len(s.strip().split()) > 1])
+ print >> sys.stderr, 'done\nload labels...',
+
+ #cluster labels
+ labels = dict([l.strip().split() for l in open(tab).readlines()])
+
+ print >> sys.stderr, 'done\nload mIDs...',
+ #individuals by labels
+ if mID is None:
+ mID_by_label = None
+ else:
+ mID_by_label = dict([(l.split()[0],l.split()[1:]) for l in open(mID).readlines()])
+ print >> sys.stderr, 'done'
+
+ return clusters,labels,mID_by_label
+
+def load_lines_from_uniqued(source_uniques,rv_sort = True, sort_key = lambda x: (len(x[0]),int(x[1])), keep_source_id = False):
+ '''
+ if keep_source_id is True
+ returns list of 2-tuples uniqued_id (eg 100617_lane6_PE for "data/100617/100617_lane6_PE.uniqued")
+ tuples are (parsed_lines,uniqued_id)
+
+ else list of lines.
+ '''
+ uniquedlines = []
+ for f in source_uniques:
+ lines = []
+
+ print >> sys.stderr, 'load %s ...' % f,
+ lines = tuple([l.strip().split() for l in open(f).readlines()])
+ print >> sys.stderr, '%s lines' % len(lines)
+
+ #get qual base
+ baseQ = None
+ for l in lines:
+ baseQ = get_baseQ(l[2])
+ if baseQ is not None:
+ break
+ print >> sys.stderr, 'qual base: %s' % baseQ
+
+ if baseQ == 64:
+ print >> sys.stderr, 'Translate quality encoding to base 33 ...',
+ for l in lines:
+ l[2] = ''.join([chr(ord(c)-64+33) for c in l[2]])
+ print >> sys.stderr, 'done'
+
+ if keep_source_id:
+ uniqued_id = os.path.basename(os.path.splitext(f)[0])
+ uniquedlines.extend( zip( lines,[uniqued_id]*len(lines) ) )
+ else:
+ uniquedlines.extend(lines)
+
+ print >> sys.stderr, 'sort',
+ if keep_source_id:
+ uniquedlines.sort(reverse = rv_sort,key = lambda x: sort_key(x[0]))
+ else:
+ uniquedlines.sort(reverse = rv_sort,key = sort_key)
+ print >> sys.stderr, 'done'
+ return uniquedlines
+
+
+if __name__ == '__main__':
+
+ gr,tab = sys.argv[1:3]
+ source_uniques = sys.argv[3:]
+
+ nticks = 100
+
+ clusters,labels,mID_by_label = load_cluster_data(gr,tab)
+
+ clid_by_lid = {}
+ for clid in clusters.keys():
+ for lblid in clusters[clid]:
+ clid_by_lid[lblid] = clid
+
+ lid_by_seq = {}
+ for k in labels.keys():
+ lid_by_seq[labels[k].split('.')[2]] = k
+
+ try_lens = sorted(list(set([len(k) for k in lid_by_seq])),reverse=True)
+
+ seqlen = try_lens[0]
+
+ uniquedlines = load_lines_from_uniqued(source_uniques, rv_sort = False, sort_key = lambda x: x[0][:seqlen],keep_source_id=True)
+
+ tickon = len(uniquedlines) / nticks
+
+ print >> sys.stderr, 'process clusters'
+
+ for i,(l,uniqued_id) in enumerate(uniquedlines):
+ if i % tickon == 0: print >> sys.stderr, '%s / %s lines' % (i,len(uniquedlines))
+ lid = None
+ try:
+ lid = lid_by_seq[l[0][:seqlen]]
+ except:
+ for sl in try_lens:
+ try:
+ lid = lid_by_seq[l[0][:sl]]
+ break
+ except:
+ pass
+ if lid is None:
+ pass
+ else:
+ print '\t'.join([clid_by_lid[lid],'%s.%s' % (lid,uniqued_id)]+l)
+
View
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+
+'''initializes an empty sample database table given data in config.py
+uses example index data in "DB_index_by_well.csv" in this directory
+'''
+
+import preprocess_radtag_lane
+import config
+import sys,os
+
+adapt_data = os.path.join(config.RTDROOT,'DB_index_by_well.csv')
+
+try:
+ key, gd_client = preprocess_radtag_lane.get_spreadsheet_key(config.LIBRARY_DATA)
+ print >> sys.stderr, 'table %s (%s) exists, skip' % (config.LIBRARY_DATA, key)
+except:
+ headers = ['sampleID','pool','adapter','adaptersversion','flowcell','lane','index']
+ preprocess_radtag_lane.create_empty_table(config.LIBRARY_DATA)
+ key, gd_client = preprocess_radtag_lane.get_spreadsheet_key(config.LIBRARY_DATA)
+ for i,col in enumerate(headers):
+ entry = gd_client.UpdateCell(1,2+i,col,key)
+
+
+try:
+ key, gd_client = preprocess_radtag_lane.get_spreadsheet_key(config.ADAPTER_DATA)
+ print >> sys.stderr, 'table %s (%s) exists, skip' % (config.ADAPTER_DATA, key)
+except:
+ preprocess_radtag_lane.create_empty_table(config.ADAPTER_DATA)
+ key, gd_client = preprocess_radtag_lane.get_spreadsheet_key(config.ADAPTER_DATA)
+ fh = open(adapt_data)
+ headers = fh.readline().strip().split(',')[1:]
+ for i,col in enumerate(headers):
+ entry = gd_client.UpdateCell(1,2+i,col,key)
+ for l in fh:
+ d = dict(zip(headers,l.strip().split(',')[1:]))
+ entry = gd_client.InsertRow(d,key)
View
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+
+'''
+given a list of query files and a single subject, generates a sparse id matrix in the label input format specified by mcl
+
+ALL SEQUENCE HEADERS MUST BEGIN WITH A UNIQUE INTEGER ID FOLLOWED BY A PERIOD
+
+'''
+
+import os, sys, re
+from config import SCRATCH as scratch
+
+keep_blat = False
+pct_id_cut = 0.9
+
+if __name__ == '__main__':
+ s,q,args,outbase = sys.argv[1:]
+
+ if keep_blat:
+ outf = outbase+'.blat'
+ else:
+ try:
+ os.makedirs(scratch)
+ except:
+ pass
+ outf = os.path.join(scratch,'%s-%s.blat' % (os.path.basename(s),os.path.basename(q)))
+ ret = os.system('touch %s' % outf)
+ if not os.path.exists(outf):
+ outf = outbase+'.blat'
+ else:
+ os.unlink(outf)
+
+ matf = outbase+'.label'
+
+ if os.path.exists(matf):
+ print >> sys.stderr, 'output %s already present' % matf
+ sys.exit(0)
+
+ exitcode = os.system('blat %s %s %s %s' % (s,q,args,outf))
+ if not exitcode == 0:
+ if not keep_blat:
+ os.unlink(outf)
+ raise OSError, 'blat failed; exit'
+
+ print >> sys.stderr, 'process blat output %s (%s bytes)' % (outf,os.path.getsize(outf))
+ matfh = open(matf,'w')
+ for l in open(outf):
+ fields = l.strip().split()
+ try:
+ m = int(fields[0])
+ topstrand = (fields[8] == '+')
+ except:
+ m = None
+ if m and topstrand:
+ s1 = fields[9]
+ s2 = fields[13]
+ l1 = len(s1.split('.')[2])
+ l2 = len(s2.split('.')[2])
+ if s1 != s2 and m/float(min((l1,l2))) >= pct_id_cut:
+ matfh.write('%s\t%s\t%s\n' % (fields[9],fields[13],m))
+ matfh.close()
+
+ if not keep_blat:
+ os.unlink(outf)
+
View
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+
+from subprocess import Popen,PIPE
+import sys,re
+
+def muscle(seqs,nohead=False):
+ if nohead:
+ label_seqs = list(enumerate(seqs))
+ else:
+ label_seqs = zip(seqs[:-1:2],seqs[1::2])
+
+ mh = Popen(['muscle'],stdin=PIPE,stderr=PIPE,stdout=PIPE)
+ mh.stdin.write('\n'.join(['>%s\n%s' % (l,s) for l,s in label_seqs]))
+ alnstr = mh.communicate()[0]
+
+ csep = re.sub(r'>(.+?)\n',r',\1,',alnstr).replace('\n','')[1:]
+
+ if nohead:
+ seqli = csep.split(',')
+ return [seq for idx,seq in sorted(zip([int(i) for i in seqli[::2]],seqli[1::2]))]
+ else:
+ return csep
+
+ #return ','.join(alnstr.strip().replace('\n','').split('>'))
+
+
+if __name__ == '__main__':
+ seqs = sys.argv[1].split(',')
+
+ print muscle(seqs)
Oops, something went wrong.

0 comments on commit 8e93363

Please sign in to comment.