
Merge branch 'master' of github.com:brantp/rtd

Conflicts:
	mcl_id_triples_by_blat.py
commit d582187c0309d682e848ec3c9c2f203b453b5be5 (parents d97bdc0 + 9746a9d), authored by @brantp
101013_lane7_sample_data.csv (48 changed lines)
@@ -0,0 +1,48 @@
+,BC110,3,A1,rad48,101013,7
+,BC538,3,A2,rad48,101013,7
+,BC621,2,A3,rad48,101013,7
+,BC777,2,A4,rad48,101013,7
+,BC846,4,A5,rad48,101013,7
+,BC1006,4,A6,rad48,101013,7
+,BC244,2,B1,rad48,101013,7
+,BC540,1,B2,rad48,101013,7
+,BC673,2,B3,rad48,101013,7
+,BC779,1,B4,rad48,101013,7
+,BC885,1,B5,rad48,101013,7
+,BC1010,4,B6,rad48,101013,7
+,BC254,3,C1,rad48,101013,7
+,BC567,3,C2,rad48,101013,7
+,BC675,3,C3,rad48,101013,7
+,BC790,3,C4,rad48,101013,7
+,BC954,3,C5,rad48,101013,7
+,BC1029,4,C6,rad48,101013,7
+,BC301,1,D1,rad48,101013,7
+,BC606,2,D2,rad48,101013,7
+,BC689,2,D3,rad48,101013,7
+,BC793,3,D4,rad48,101013,7
+,BC958,4,D5,rad48,101013,7
+,BC1032,4,D6,rad48,101013,7
+,BC306,2,E1,rad48,101013,7
+,BC608,1,E2,rad48,101013,7
+,BC709,3,E3,rad48,101013,7
+,BC796,4,E4,rad48,101013,7
+,BC973,3,E5,rad48,101013,7
+,BC1054,4,E6,rad48,101013,7
+,BC442,3,F1,rad48,101013,7
+,BC613,2,F2,rad48,101013,7
+,BC744,1,F3,rad48,101013,7
+,BC826,4,F4,rad48,101013,7
+,BC974,1,F5,rad48,101013,7
+,BC1058,2,F6,rad48,101013,7
+,BC497,1,G1,rad48,101013,7
+,BC616,2,G2,rad48,101013,7
+,BC749,1,G3,rad48,101013,7
+,BC828,4,G4,rad48,101013,7
+,BC980,1,G5,rad48,101013,7
+,BC1059,1,G6,rad48,101013,7
+,BC527,2,H1,rad48,101013,7
+,BC620,1,H2,rad48,101013,7
+,BC750,2,H3,rad48,101013,7
+,BC834,4,H4,rad48,101013,7
+,BC985,4,H5,rad48,101013,7
+,BC1023B,3,H6,rad48,101013,7
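
The new CSV is headerless per-sample metadata for lane 7 of run 101013. A minimal sketch of reading it with the stdlib, assuming a column layout of unused, barcode, group, well, library, run date, and lane (these names are guesses inferred from the values; the commit does not document the layout):

import csv

# Hypothetical column names -- the CSV ships without a header row, so this
# layout is an assumption, not documented anywhere in the repo.
FIELDS = ['unused', 'barcode', 'group', 'well', 'library', 'run_date', 'lane']

with open('101013_lane7_sample_data.csv') as fh:
    for row in csv.reader(fh):
        rec = dict(zip(FIELDS, row))
        print('%s -> well %s' % (rec['barcode'], rec['well']))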
USAGE_NOTES (5 changed lines)
@@ -64,6 +64,9 @@ Finally, note that the first sample had no index; this is fine, this
just means we didn't do illumina-style "third read" multiplexing for
that sample, whereas for the second example we did (using index #11).
+Example data for DB_library_data can be found at:
+https://docs.google.com/spreadsheet/ccc?key=0AnHEwF1NpAxDdFlXRXliTElXQzF6dkswZk5HenlTclE
+
- RUN preprocess_radtag_lane.py
we start with the s_X_[1|2]_sequence[_indexZ].txt[.gz]
@@ -78,4 +81,4 @@ $ preprocess_radtag_lane.py -h
- RUN rtd_run.py
using the .uniqued.gz files generated for each lane/index, invoke rtd_run.py per help:
-$ rtd_run.py -h
+$ rtd_run.py -h
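
The notes name the raw inputs s_X_[1|2]_sequence[_indexZ].txt[.gz] (X = lane, 1|2 = read number, Z = index). A small sketch of picking such names apart, assuming Z is alphanumeric (the exact character set is not specified):

import re

# s_X_[1|2]_sequence[_indexZ].txt[.gz]; characters allowed in Z are assumed
LANE_RE = re.compile(r'^s_(\d+)_([12])_sequence(?:_index(\w+))?\.txt(?:\.gz)?$')

for name in ('s_7_1_sequence.txt.gz', 's_7_2_sequence_index11.txt'):
    m = LANE_RE.match(name)
    if m:
        lane, read, index = m.group(1), m.group(2), m.group(3)
        print('lane %s, read %s, index %s' % (lane, read, index))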
mcl_id_triples_by_blat.py (10 changed lines)
@@ -7,7 +7,11 @@
'''
+<<<<<<< HEAD
import os, sys, re, gzip
+=======
+import os, sys, re, numpy
+>>>>>>> 9746a9d04d8803a838c0229517a1dacbdb12cb1b
from config import SCRATCH as scratch
fail_to_local = False #must set to "True" to allow use of working directory for blat output
@@ -35,7 +39,11 @@ def space_free_on_volume(vol,unit='M',verbose=False):
from subprocess import Popen,PIPE
if verbose:
print >> sys.stderr, 'checking free space on volume %s ...' % vol,
- free = int(Popen('df -P --sync -B %s %s' % (unit,vol), shell=True, stdout=PIPE).stdout.readlines()[-1].strip().split()[3].rstrip(unit))
+ try:
+ free = int(Popen('df -P --sync -B %s %s' % (unit,vol), shell=True, stdout=PIPE).stdout.readlines()[-1].strip().split()[3].rstrip(unit))
+ except:
+ print >> sys.stderr, 'free space check failed; proceeding. MONITOR AVAILABLE SPACE ON %s' % vol
+ free = numpy.inf
if verbose:
print >> sys.stderr, '%s %sB' % (free,unit)
return free
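
For reference, the fallback added here as a self-contained sketch: parse the "Available" column of GNU df's portable output, and treat any failure as unlimited space. float('inf') stands in for numpy.inf so the sketch needs no third-party import; the behavior is otherwise the same as the patch.

import sys
from subprocess import Popen, PIPE

def space_free_on_volume(vol, unit='M'):
    # `df -P --sync -B <unit> <vol>`: with -P (POSIX format) the last line's
    # fourth column is "Available", suffixed with the block-size unit.
    try:
        p = Popen(['df', '-P', '--sync', '-B', unit, vol],
                  stdout=PIPE, universal_newlines=True)
        last = p.stdout.readlines()[-1]
        return int(last.split()[3].rstrip(unit))
    except Exception:
        sys.stderr.write('free space check failed; monitor space on %s\n' % vol)
        return float('inf')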
s_7_sequence-1M.txt.gz (binary file not shown)
vcf_to_rqtl.py (376 changed lines)
@@ -323,6 +323,382 @@ def sortkey(x):
return out_geno,mID_lookup
+#other output methods
+def write_structure_genotypes(vcf_data, outfile, keys_to_write = None, indiv_to_write = None):
+ '''given a parsed vcf data structure per load_vcf and an outfile name
+
+ prints a structure-formatted genotype file.
+
+ if keys_to_write is supplied, only vcf_data items corresponding to those keys will be written
+ '''
+
+ if keys_to_write is None:
+ keys_to_write = vcf_data.keys()
+
+ keys_to_write.sort(key = lambda x: (x[0],int(x[1])))
+
+ if indiv_to_write is None:
+ indiv_to_write = set()
+ for k in keys_to_write:
+ v = vcf_data[k]
+ indiv_to_write = indiv_to_write.union(set(v['indiv_gt'].keys()))
+ indiv_to_write = sorted(list(indiv_to_write))
+
+ ofh = open(outfile,'w')
+ ofh.write('\t'.join(['%s.%s' % (c,p) for c,p in keys_to_write]))
+ ofh.write('\n')
+
+ for ind in indiv_to_write:
+ ofh.write(ind)
+ for k in keys_to_write:
+ try:
+ ofh.write('\t'+(vcf_data[k]['indiv_gt'][ind]['GT'].replace('/',' ')))
+ except KeyError:
+ ofh.write('\t-9 -9')
+ ofh.write('\n')
+
+ ofh.close()
+
+
+def write_spagedi_genotypes(vcf_data, outfile, keys_to_write = None, indiv_to_write = None):
+ '''generates output intended for SPAGeDi
+
+ currently treats all individuals as originating from a single population;
+ this will need to be elaborated upon
+ '''
+
+ from short_read_analysis import preprocess_radtag_lane
+ lookup = dict([(l['sampleid'],l['population']) for l in preprocess_radtag_lane.get_table_as_dict('DB_library_data') if l.has_key('population')])
+
+ if keys_to_write is None:
+ keys_to_write = vcf_data.keys()
+
+ keys_to_write.sort(key = lambda x: (x[0],int(x[1])))
+
+ if indiv_to_write is None:
+ indiv_to_write = set()
+ for k in keys_to_write:
+ v = vcf_data[k]
+ indiv_to_write = indiv_to_write.union(set(v['indiv_gt'].keys()))
+ indiv_to_write = sorted(list(indiv_to_write))
+
+ ofh = open(outfile,'w')
+ #write header
+ ofh.write('%s\t1\t0\t%s\t1\t2\n0\nInd\tPop\t%s\n' % \
+ (len(indiv_to_write),len(keys_to_write), '\t'.join(['%s.%s' % (c,p) for c,p in keys_to_write])))
+
+ #write genotypes
+ for ind in indiv_to_write:
+ ofh.write('%s\t%s' % (ind,lookup.get(ind,'pop1')))
+ for k in keys_to_write:
+ try:
+ gt = '/'.join([str(int(i)+1) for i in vcf_data[k]['indiv_gt'][ind]['GT'].split('/')])
+ ofh.write('\t'+gt)
+ except KeyError:
+ ofh.write('\t0/0')
+ ofh.write('\n')
+
+ ofh.write('END\n')
+ ofh.close()
+
+
+def write_tassel_genotypes(vcf_data, outfile, keys_to_write = None, indiv_to_write = None):
+ '''given vcf_data per load_vcf above and an outfile name
+
+ writes tassel input format'''
+
+ xd = { '0':'REF','1':'ALT' }
+
+ if keys_to_write is None:
+ keys_to_write = vcf_data.keys()
+
+ keys_to_write.sort(key = lambda x: (x[0],int(x[1])))
+
+ if indiv_to_write is None:
+ indiv_to_write = set()
+ for k in keys_to_write:
+ v = vcf_data[k]
+ indiv_to_write = indiv_to_write.union(set(v['indiv_gt'].keys()))
+ indiv_to_write = sorted(list(indiv_to_write))
+
+ ofh = open(outfile,'w')
+
+ ofh.write('%s\t%s:%s\n' % (len(indiv_to_write),len(keys_to_write),'2'))
+ if len(set([c for c,p in keys_to_write])) == 1:
+ ofh.write('%s\n' % '\t'.join([p for c,p in keys_to_write]))
+ else:
+ ofh.write('%s\n' % '\t'.join(['%s.%s' % (c,p) for c,p in keys_to_write]))
+
+ for ind in indiv_to_write:
+ ofh.write('%s' % ind)
+ for k in keys_to_write:
+ try:
+ gt = ':'.join([vcf_data[k][xd[i]] for i in vcf_data[k]['indiv_gt'][ind]['GT'].split('/')])
+ except:
+ gt = '?:?'
+ ofh.write('\t' + gt)
+ ofh.write('\n')
+
+ ofh.close()
+
+def write_plink_genotypes(vcf_data, outfile, keys_to_write = None, indiv_to_write = None):
+
+ if keys_to_write is None:
+ keys_to_write = vcf_data.keys()
+
+ keys_to_write.sort(key = lambda x: (x[0],int(x[1])))
+
+ if indiv_to_write is None:
+ indiv_to_write = set()
+ for k in keys_to_write:
+ v = vcf_data[k]
+ indiv_to_write = indiv_to_write.union(set(v['indiv_gt'].keys()))
+ indiv_to_write = sorted(list(indiv_to_write))
+
+ ofh = open(outfile,'w')
+
+ idx = 0
+ for ind in indiv_to_write:
+ idx += 1
+ ofh.write('FAM%s\t%s\t0\t0\t1\t0' % (idx,ind))
+ for k in keys_to_write:
+ try:
+ gt = ' '.join([str(int(i)+1) for i in vcf_data[k]['indiv_gt'][ind]['GT'].split('/')])
+ except:
+ gt = '0 0'
+ ofh.write('\t' + gt)
+ ofh.write('\n')
+
+ ofh.close()
+
+ mapout = os.path.splitext(outfile)[0] + '.map'
+ ofh = open(mapout,'w')
+ infout = open(outfile + '.info','w')
+
+ chrom_translation = dict([(k,i+1) for i,k in enumerate(set([k[0] for k in keys_to_write]))])
+ open(mapout+'.xlat','w').write('\n'.join(['%s\t%s' % (k,v) for k, v in chrom_translation.items()]))
+
+ for k in keys_to_write:
+ ofh.write('%s\t%s.%s\t%s\t%s\n' % (chrom_translation[k[0]], k[0], k[1], 0, k[1]))
+ infout.write('%s\t%s\n' % (k[1],k[1]))
+
+ ofh.close()
+ infout.close()
+
+
+def write_flapjack_genotypes(vcf_data, outfile, keys_to_write = None, indiv_to_write = None):
+
+ xd = { '0':'REF','1':'ALT' }
+
+
+ if keys_to_write is None:
+ keys_to_write = vcf_data.keys()
+
+ keys_to_write.sort(key = lambda x: (x[0],int(x[1])))
+
+ if indiv_to_write is None:
+ indiv_to_write = set()
+ for k in keys_to_write:
+ v = vcf_data[k]
+ indiv_to_write = indiv_to_write.union(set(v['indiv_gt'].keys()))
+ indiv_to_write = sorted(list(indiv_to_write))
+
+ ofh = open(outfile,'w')
+
+ if len(set([c for c,p in keys_to_write])) == 1:
+ ofh.write('\t%s\n' % '\t'.join([p for c,p in keys_to_write]))
+ else:
+ #ofh.write('\t%s\n' % '\t'.join(['%s.%s' % (c,p) for c,p in keys_to_write]))
+ raise NotImplementedError, 'currently only supports single-chromosome output'
+
+ for ind in indiv_to_write:
+ ofh.write('%s' % ind)
+ for k in keys_to_write:
+ try:
+ gt = '/'.join([vcf_data[k][xd[i]] for i in vcf_data[k]['indiv_gt'][ind]['GT'].split('/')])
+ except:
+ gt = '?/?'
+ ofh.write('\t' + gt)
+ ofh.write('\n')
+
+ ofh.close()
+
+ mapout = os.path.splitext(outfile)[0] + '.fj.map'
+ ofh = open(mapout,'w')
+
+ for k in keys_to_write:
+ ofh.write('%s\t%s\t%s\n' % (k[1], 1, k[1]))
+
+ ofh.close()
+
+def write_peas_genotypes(vcf_data, outfile, keys_to_write = None, indiv_to_write = None):
+ '''given vcf_data per load_vcf above and an outfile name
+
+ writes peas input format'''
+
+ xd = { '0':'REF','1':'ALT' }
+
+ if keys_to_write is None:
+ keys_to_write = vcf_data.keys()
+
+ keys_to_write.sort(key = lambda x: (x[0],int(x[1])))
+
+ if indiv_to_write is None:
+ indiv_to_write = set()
+ for k in keys_to_write:
+ v = vcf_data[k]
+ indiv_to_write = indiv_to_write.union(set(v['indiv_gt'].keys()))
+ indiv_to_write = sorted(list(indiv_to_write))
+
+ ofh = open(outfile,'w')
+
+ ofh.write('SNPID\t%s\n' % '\t'.join('%s.%s' % (k[0],k[1]) for k in keys_to_write))
+ ofh.write('Chrom\t%s\n' % '\t'.join([k[0] for k in keys_to_write]))
+ ofh.write('Position\t%s\n' % '\t'.join([k[1] for k in keys_to_write]))
+ ofh.write('AlleleState\t%s\n' % '\t'.join(['%s/%s' % (vcf_data[k]['REF'],vcf_data[k]['ALT']) for k in keys_to_write]))
+ ofh.write('Strand\t%s\n' % '\t'.join(['+'] * len(keys_to_write)))
+
+ for ind in indiv_to_write:
+ ofh.write('%s' % ind)
+ for k in keys_to_write:
+ try:
+ gt = ''.join([vcf_data[k][xd[i]] for i in vcf_data[k]['indiv_gt'][ind]['GT'].split('/')])
+ except:
+ gt = '??'
+ ofh.write('\t' + gt)
+ ofh.write('\n')
+
+ ofh.close()
+
+def write_bimbam_genotypes(vcf_data, outfile, keys_to_write = None, indiv_to_write = None):
+    '''does not support phased or multiple chromosome input vcfs
+ '''
+
+ xd = { '0':'REF','1':'ALT' }
+
+
+ if keys_to_write is None:
+ keys_to_write = vcf_data.keys()
+
+ keys_to_write.sort(key = lambda x: (x[0],int(x[1])))
+
+ if indiv_to_write is None:
+ indiv_to_write = set()
+ for k in keys_to_write:
+ v = vcf_data[k]
+ indiv_to_write = indiv_to_write.union(set(v['indiv_gt'].keys()))
+ indiv_to_write = sorted(list(indiv_to_write))
+
+ ofh = open(outfile,'w')
+
+ if len(set([c for c,p in keys_to_write])) == 1:
+ ofh.write('%s\n%s\n' % (len(indiv_to_write),len(keys_to_write)))
+ ofh.write('IND,%s\n' % ','.join([ind for ind in indiv_to_write]))
+ else:
+ #ofh.write('\t%s\n' % '\t'.join(['%s.%s' % (c,p) for c,p in keys_to_write]))
+ raise NotImplementedError, 'currently only supports single-chromosome output'
+
+ for k in keys_to_write:
+ ofh.write('%s' % k[1])
+ for ind in indiv_to_write:
+ try:
+ gt = ''.join([vcf_data[k][xd[i]] for i in vcf_data[k]['indiv_gt'][ind]['GT'].split('/')])
+ except:
+ gt = '??'
+ ofh.write(',' + gt)
+ ofh.write('\n')
+
+ ofh.close()
+
+ mapout = os.path.splitext(outfile)[0] + '.bb.map.txt'
+ ofh = open(mapout,'w')
+
+ for k in keys_to_write:
+ ofh.write('%s,%s\n' % (k[1], k[1]))
+
+ ofh.close()
+
+def write_fastPHASE_genotypes(vcf_data,outbase, keys_to_write = None, indiv_to_write = None):
+ '''
+    given vcf per load_vcf, writes one output file PER CHROM, named:
+ <outbase>_<CHROM>.inp
+
+ it is recommended that outbase include path information to a directory in which these will be written, i.e.:
+
+ outbase="beachmouse_fastPHASE/beaches_l55_k4_QD8_GQ12"
+
+ will produce files:
+
+ beachmouse_fastPHASE/beaches_l55_k4_QD8_GQ12_agouti_bac.inp
+ beachmouse_fastPHASE/beaches_l55_k4_QD8_GQ12_mc1r_bac.inp
+ ...etc
+
+ directories will be created as necessary.
+ '''
+
+
+ xd = { '0':'REF','1':'ALT' }
+
+
+ if keys_to_write is None:
+ keys_to_write = vcf_data.keys()
+
+ keys_to_write.sort(key = lambda x: (x[0],int(x[1])))
+
+ if indiv_to_write is None:
+ indiv_to_write = set()
+ for k in keys_to_write:
+ v = vcf_data[k]
+ indiv_to_write = indiv_to_write.union(set(v['indiv_gt'].keys()))
+ indiv_to_write = sorted(list(indiv_to_write))
+
+ outroot = os.path.dirname(outbase)
+
+ try:
+ os.makedirs(outroot)
+ except:
+ pass
+
+
+ this_chrom = None
+ this_len = 0
+ poslist = []
+ for k in keys_to_write:
+ if k[0] != this_chrom:
+ if this_chrom is not None:
+ outfile = '%s_%s.inp' % (outbase,this_chrom)
+ posfile = '%s_%s.pos' % (outbase,this_chrom)
+ open(posfile,'w').write(('\t'.join(poslist)) + '\n')
+ ofh = open(outfile,'w')
+ ofh.write('%s\n%s\n' % (len(indiv_to_write), this_len))
+ for ind in indiv_to_write:
+ ofh.write('# %s\n' % ind)
+ ofh.write('%s\n%s\n' % tuple([''.join(li) for li in Util.dezip(chrom_data[ind])]))
+ ofh.close()
+ chrom_data = defaultdict(list)
+ this_chrom = k[0]
+ this_len = 0
+ poslist = []
+ for ind in indiv_to_write:
+ try:
+ chrom_data[ind].append(vcf_data[k]['indiv_gt'][ind]['GT'].split('/'))
+ except:
+ chrom_data[ind].append(['?','?'])
+ this_len += 1
+ poslist.append(k[1])
+
+ #flush:
+ outfile = '%s_%s.inp' % (outbase,this_chrom)
+ posfile = '%s_%s.pos' % (outbase,this_chrom)
+ open(posfile,'w').write(('\t'.join(poslist)) + '\n')
+ ofh = open(outfile,'w')
+ ofh.write('%s\n%s\n' % (len(indiv_to_write), this_len))
+ for ind in indiv_to_write:
+ ofh.write('# %s\n' % ind)
+ ofh.write('%s\n%s\n' % tuple([''.join(li) for li in Util.dezip(chrom_data[ind])]))
+ ofh.close()
+
+
if __name__ == "__main__":
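
All of the new writers walk the same parsed-VCF structure: a dict keyed by (chrom, pos) whose values carry 'REF', 'ALT', and an 'indiv_gt' dict of per-individual calls. A minimal fixture (field set inferred from the functions above; load_vcf may attach more keys) and a call, under the repo's Python 2:

# vcf_data[(chrom, pos)] -> {'REF': ..., 'ALT': ..., 'indiv_gt': {ind: {'GT': 'a/b'}}}
vcf_data = {
    ('chr1', '100'): {'REF': 'A', 'ALT': 'G',
                      'indiv_gt': {'ind1': {'GT': '0/1'},
                                   'ind2': {'GT': '1/1'}}},
    ('chr1', '250'): {'REF': 'C', 'ALT': 'T',
                      'indiv_gt': {'ind1': {'GT': '0/0'}}},
}

write_structure_genotypes(vcf_data, 'test.str')
# header: chr1.100<TAB>chr1.250
# ind1:   0 1       0 0
# ind2:   1 1       -9 -9   (no call at chr1.250 -> Structure missing code)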
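
Several of the writers share one recoding step via xd = { '0':'REF','1':'ALT' }: each allele index in the GT string is mapped to the site's actual base, and only the joiner differs (':' for tassel, '/' for flapjack, '' for peas and bimbam). In isolation:

xd = {'0': 'REF', '1': 'ALT'}
site = {'REF': 'A', 'ALT': 'G'}
bases = [site[xd[i]] for i in '0/1'.split('/')]
print(':'.join(bases))   # A:G  (tassel)
print('/'.join(bases))   # A/G  (flapjack)
print(''.join(bases))    # AG   (peas, bimbam)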
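
write_fastPHASE_genotypes accumulates one [allele1, allele2] pair per site for each individual, then emits two haplotype lines per individual when a chromosome is flushed. Util.dezip is this repo's helper; assuming it is a transpose (that assumption is not confirmed by the diff), the stdlib equivalent is:

# three sites for one individual, the last one uncalled
pairs = [['0', '1'], ['1', '1'], ['?', '?']]
hap1, hap2 = [''.join(alleles) for alleles in zip(*pairs)]
print(hap1)  # 01?
print(hap2)  # 11?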