-
Notifications
You must be signed in to change notification settings - Fork 1
/
indelope.nim
608 lines (534 loc) · 22.5 KB
/
indelope.nim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
import osproc
import contig
import sequtils
import strutils
import sets
import os
import times
import random
import math
import algorithm
import strutils
import docopt
import hts/bam
import hts/fai
import ./ksw2/ksw2
import kmer
import genotyper
type
event = tuple[start: int, stop: int, len: int]
roi = tuple[start: int, stop: int, reads:seq[Record]]
proc trim(sequence: var string, base_qualities: seq[uint8], min_quality:int=15): int =
var a = 0
while a < base_qualities.high and base_qualities[a] < uint8(min_quality):
a += 1
if a == base_qualities.high:
sequence.set_len(0)
return a
var b = base_qualities.high
while b > a and base_qualities[b] < uint8(min_quality):
b -= 1
if a != 0 or b != base_qualities.high:
sequence = sequence[a..b]
return a
proc skippable(r: Record, allow_unmapped:bool=false): bool {.inline.} =
if r.chrom == "hs37d5": return true
if r.chrom.startswith("GL"): return true
var f = r.flag
if f.dup or f.qcfail: return true
if not allow_unmapped and f.unmapped: return true
if f.supplementary or f.secondary: return true
return false
type
Variant* = ref object of RootObj
chrom*: string
start*: int
filter*: string
qual*: float64
reference*: string
alternate*: string
genotype*: Genotype
ref_kmer*: string
alt_kmer*: string
info_str*: string
AD*: array[2, int]
proc info(v:Variant): string =
result = ("AD=" & $v.AD[0] & "," & $v.AD[1] &
";ref_kmer=" & $v.ref_kmer &
";alt_kmer=" & $v.alt_kmer)
if v.info_str.len != 0:
result &= ";" & v.info_str
proc info_add(v:Variant, kv:string) =
## add a new field to the info
if v.info_str == nil or v.info_str.len == 0:
v.info_str = kv
else:
v.info_str &= ';' & kv
const header = """##fileformat=VCFv4.2
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##INFO=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=DP,Number=1,Type=Integer,Description="total reads covering this site">
##INFO=<ID=AL,Number=0,Type=Flag,Description="this was genotyped with alignment, no k-mer counting">
##INFO=<ID=AMQ,Number=1,Type=Integer,Description="median mapping quality of alts">
##INFO=<ID=RMQ,Number=1,Type=Integer,Description="median mapping quality of refs">
##INFO=<ID=BS,Number=1,Type=Integer,Description="number of times there was support for both ref and alt k-mer in a single read">
##INFO=<ID=MF,Number=1,Type=Integer,Description="minimum matching bases around this event when BS > 0. Higher gives more confidence">
##INFO=<ID=CF,Number=1,Type=Integer,Description="minimum flank of the event from either end of the contig. higher is better.">
##INFO=<ID=NC,Number=1,Type=Integer,Description="number of contigs at the site of this variant.">
##INFO=<ID=CC,Number=1,Type=String,Description="contig cigar from alignment to reference">
##INFO=<ID=LO,Number=0,Type=Flag,Description="low-offset: the event occurred near at the start of the contig so we may not have the full variant">
##INFO=<ID=AKE,Number=1,Type=Float,Description="mean alt-kmer distance from end of read">
##INFO=<ID=RKE,Number=1,Type=Float,Description="mean ref-kmer distance from end of read">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="supporting k-mer depth">
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GL,Number=G,Type=Float,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=ref_kmer,Number=1,Type=String,Description="reference kmer used for genotyping">
##INFO=<ID=alt_kmer,Number=1,Type=String,Description="alternate kmer used for genotyping">
$1
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT $2"""
proc `$`*(v:Variant): string =
if v.filter == nil: v.filter = ""
var filter = v.filter
if filter == "": filter = "PASS"
return format("$chrom\t$pos\t$id\t$ref\t$alt\t$qual\t$filter\t$info\tGT:GQ:GL\t$gt" %
["chrom", v.chrom, "pos", $v.start, "id", ".",
"ref", v.reference, "alt", v.alternate,
"qual", formatFloat(v.qual, precision=2, format=ffDecimal),
"filter", filter, "info", v.info(), "gt", $(v.genotype)])
proc same(a:Variant, b:Variant): bool =
if a == nil or b == nil: return false
return (a.start == b.start) and (a.chrom == b.chrom) and (a.reference == b.reference) and (a.alternate == b.alternate)
const init_len = high(int)
proc get_min_flank(e:ksw2.event, ez:Ez): int =
result = init_len
var found_event = false
for c in ez.cigar:
if c.op == 0:
if found_event:
result = min(c.length.int, result)
else:
result = c.length.int
if found_event: return
elif c.op != 0 and int(c.op) - 1 == int(e.event_type) and c.length.int == e.len.int:
if init_len == result: result = 0
found_event = true
return 0
proc show(ez:Ez, r:roi, ctg:Contig, reference:string) =
# debug ez alignment
var s = ""
echo ez.cigar_string(s)
discard ez.cigar_string(s, full=true)
echo "roi:", r.start, "-", r.stop
echo "ctg range:", ctg.start, " ", ctg.start + ctg.len
echo "lens:", ctg.len, " ", reference.len
echo "full:", s
echo "part:", ez.cigar_string(s, full=true)
echo ez.draw(ctg.sequence, reference)
proc mean(a:seq[int]): float64 =
result = 0'f64
for v in a:
result += v.float64
return result / float64(a.len)
proc median(a: seq[uint8]): int =
var b = a
sort(b, proc(a, b: uint8): int = return int(a) - int(b))
return int(b[int(len(b)/2)])
proc assemble(r: roi, n_contigs:ptr int, min_qual:uint8=uint8(20), min_overlap_pct:float64=0.88): seq[Contig] =
## assemble the reads into contigs
var contigs = new_seq[Contig]()
var read_seq = ""
var base_q = new_seq[uint8](300)
# assemble the alt contig
for read in r.reads:
if read.qual < min_qual: continue
if read.skippable(allow_unmapped=false): continue
discard read.sequence(read_seq)
discard read.base_qualities(base_q)
var o = trim(read_seq, base_q)
contigs.insert(read_seq, read.start + o, min_overlap=int(min_overlap_pct * float64(read_seq.len)))
n_contigs[] = len(contigs)
when not defined(release):
echo "n contigs:", n_contigs[]
for c in contigs:
echo "start:", c.start, " ", c.nreads
contigs = contigs.combine(min_support=3)
var n_contigs_a = len(contigs)
when not defined(release):
echo "n contigs after:", n_contigs_a
for c in contigs:
echo "start:", c.start, " ", c.nreads
echo c.sequence
return contigs
proc count_flanked_cigar(ez:Ez): int =
var matched = false
var n = 0
var last_op: int
for e in ez.cigar:
if not matched:
if e.op == 0:
n += 1
matched = true
else:
n += 1
last_op = int(e.op)
if last_op != 0:
n -= 1
return n
iterator callsemble(r: roi, fai: Fai, ez: Ez, min_ctg_len:int=74, min_reads:int=4, min_event_len:int=4, K:int=27): Variant =
var n_contigs:int
var contigs = r.assemble(n_contigs.addr)
var read_seq = ""
var chrom = r.reads[0].chrom
for ctg in contigs:
if n_contigs > 20: continue
#echo "ctg:", ctg.nreads, " len:", ctg.len
if ctg.nreads < min_reads or ctg.len < min_ctg_len: continue
var max_stop = ctg.start
for read in r.reads:
if read.qual <= 5: continue
max_stop = max(max_stop, read.stop)
var width = int((K + 1) / 2 - 1)
# TODO: for an SV, for the alignment, we need both ends.
var reference = fai.get(chrom, ctg.start, max_stop + width + 50)
ctg.sequence.align_to(reference, ez, bw=50, z=400)
var qlocs = toSeq(ez.query_locations())
when not defined(release):
ez.show(r, ctg, reference)
echo "\nqlocs:", qlocs
echo "tlocs:", toSeq(ez.target_locations(ctg.start))
if len(qlocs) == 0 or len(qlocs) > 4: continue
var ii = -1
for tloc in ez.target_locations(ctg.start):
ii += 1
if tloc.len.int < min_event_len: continue
# TODO: relative positions of ref and alt should match
var tstart = max(0, tloc.start - ctg.start - width)
if tstart + K > reference.len:
tstart = reference.len - K
var ref_kmer = reference[tstart..<(tstart + K)]
var qloc = qlocs[ii]
var offset = min(qloc.start, ctg.len - qloc.stop - 1)
var qstart = max(qloc.start - width, 0)
if qstart + K > ctg.len:
qstart = ctg.len - K
var alt_kmer = ctg.sequence[qstart..<(qstart + K)]
# this can happen given a long homopolymer in the insert
# so we put as much of the insertion into the kmer as we can an hope that makes it unique
# TODO: this part has to be dynamic and iterative
# 1. maximize counts
# 2. minimize bs
# 3. vary offset and k-mer length.
if alt_kmer == ref_kmer:
# move to the left to find varied sequence
qstart = max(qloc.start - 3, 0)
if qstart + K > ctg.sequence.len:
var qend = min(qloc.stop + 4, ctg.len)
alt_kmer = ctg.sequence[qend - K..<(qend)]
else:
alt_kmer = ctg.sequence[qstart..<(qstart + K)]
if ref_kmer == alt_kmer and (qloc.start == 0 or alt_kmer.toSet.len == 1): continue
# simple repeats are hard so require at least 2 unique bases.
if ref_kmer.toSet.len < 3: continue
if ref_kmer == alt_kmer:
stderr.write_line("bug!!! ref and alt kmers are same!! chrom:" & chrom & " " & $qloc & " alt:" & $tloc)
stderr.write_line(">>>>>> contig length:" & $ctg.len & " ctg start:" & $ctg.start & " ctg nreads:" & $ctg.nreads )
stderr.write_line(">>>>>> kmer:" & ref_kmer)
var d = ez.draw(ctg.sequence, reference).split("\n")
stderr.write_line ">>>>>> " & d[0]
stderr.write_line ">>>>>> " & d[1]
continue
if len(ref_kmer) != len(alt_kmer) or len(ref_kmer) != K:
stderr.write_line("bug!!! kmer lengths should be equal, ref:" & $ref_kmer.len & " alt:" & $alt_kmer.len)
stderr.write_line("bug!!! " & $qloc & " alt:" & $tloc)
stderr.write_line("bug!!! tloc start:" & $(tloc.start - ctg.start - width) & " ctg len:" & $ctg.len & " genomic position:" & $chrom & ":" & $tloc.start)
#quit(2)
var refe = ref_kmer.mincode()
var alte = alt_kmer.mincode()
var alt_support = 0
var ref_support = 0
var adists = new_seq_of_cap[int](len(r.reads))
var rdists = new_seq_of_cap[int](len(r.reads))
var amapqs = new_seq_of_cap[uint8](len(r.reads))
var rmapqs = new_seq_of_cap[uint8](len(r.reads))
var both_found = 0
for read in r.reads:
if read.qual < 10: continue
#var e2 = new_ez()
#read_seq.align_to(reference, e2)
#echo toSeq(items(read.cigar)), ".. ", e2.cigar_string(read_seq)
var ref_found = false
var alt_found = false
for d, e in read.sequence(read_seq).dists(K):
if not ref_found and e == refe:
ref_support += 1; ref_found = true
rdists.add(d)
rmapqs.add(read.qual)
if not alt_found and e == alte:
alt_support += 1; alt_found = true
adists.add(d)
amapqs.add(read.qual)
if ref_found and alt_found:
both_found += 1
var aligned = false
if both_found > 0:
var missed = 0
# if we didnt' get uniq k-mers, we resort to alignment
both_found = 0
var ez_ref = new_ez(mismatch=int8(-2), gap_open=int8(5), gap_ext=int8(1))
var ez_alt = new_ez(mismatch=int8(-2), gap_open=int8(5), gap_ext=int8(1))
#var ez_ref = new_ez()
#var ez_alt = new_ez()
ref_support = 0
alt_support = 0
var xa = ""
var xr = ""
#stderr.write_line "##############################################"
var base_q = new_seq[uint8](200)
for read in r.reads:
if read.qual < 10: continue
discard read.sequence(read_seq)
discard read.base_qualities(base_q)
var rs = read.start + trim(read_seq, base_q)
if rs > tloc.stop: continue
var L = 0
if tloc.event_type == Insertion:
L = tloc.len.int
if rs + read_seq.len + L < tloc.start: continue
var ref_sub, ctg_sub: string
var start = max(rs, ctg.start) - ctg.start
ref_sub = reference[start..<reference.len]
ctg_sub = ctg.sequence[start..<ctg.len]
read_seq.align_to(ref_sub, ez_ref)
read_seq.align_to(ctg_sub, ez_alt)
var rn = count_flanked_cigar(ez_ref)
var an = count_flanked_cigar(ez_alt)
when defined(debug):
stderr.write_line("####")
stderr.write_line "ref:" & $ez_ref.cigar_string(xr, full=false) & "(" & $ez_ref.score & ")" & " alt: " & $ez_alt.cigar_string(xa, full=false) & "(" & $ez_alt.score & ")"
stderr.write_line "ref_len:" & $ez_ref.n_cigar & " alt_len: " & $ez_alt.n_cigar
stderr.write_line "ref_rn:" & $rn & " alt_rn: " & $an
if rn == 1 and an > 1:# or ez_ref.score > ez_alt.score + 1:
ref_support += 1
elif an == 1 and rn > 1:# or ez_alt.score > ez_ref.score + 1:
alt_support += 1
#[
var rc = ref_sub.contains(read_seq)
var ac = ctg_sub.contains(read_seq)
if ac and rc:
both_found += 1
elif ac:
alt_support += 1
elif rc:
ref_support += 1
else:
missed += 1
]#
#stderr.write_line "ref_support:" & $ref_support & " alt_support:" & $alt_support & " at: " & chrom & ":" & $tloc
aligned = true
if alt_support < min_reads: continue
#if float64(alt_support) / float64(alt_support + ref_support) < 0.1: continue
if float64(alt_support) / float64(r.reads.len) < 0.1: continue
var gt = genotype(ref_support, alt_support, 1e-3)
if gt.GT == GT.HOM_REF: continue
var v = Variant(chrom: chrom, start: tloc.start, genotype: gt, ref_kmer: ref_kmer,
qual: gt.qual, alt_kmer: alt_kmer, AD: [ref_support, alt_support])
# this line works extremeley well to remove false positives with no loss of sensitivity.
if offset == 0 and both_found >= int(0.75 * float64(min(ref_support, alt_support))): continue
v.info_add("DP=" & $r.reads.len)
if offset < 5:
v.info_add("LO")
v.qual /= 2'f64
if both_found > 0:
v.info_add("BS=" & $both_found)
v.qual /= 1.5
else:
v.qual *= 2
v.info_add("CC=" & ez.cigar_string(read_seq))
if aligned:
v.info_add("AL")
var min_flank = qloc.get_min_flank(ez)
# if we have a big event and a small flank, bail
if (min_flank - 1) < max(tloc.stop - tloc.start, qloc.stop - qloc.start): continue
v.info_add("MF=" & $min_flank)
v.info_add("CF=" & $offset)
v.info_add("NC=" & $n_contigs)
if offset == 0:
v.qual /= 4'f64
v.info_add("AKE=" & formatFloat(mean(adists), precision=2, format=ffDecimal))
v.info_add("RKE=" & formatFloat(mean(rdists), precision=2, format=ffDecimal))
if len(amapqs) > 0:
v.info_add("AMQ=" & $median(amapqs))
if len(rmapqs) > 0:
v.info_add("RMQ=" & $median(rmapqs))
if mean(adists) < 5: continue
if tloc.event_type == Deletion:
v.reference = fai.get(chrom, tloc.start - 1, tloc.stop - 1)
v.alternate = v.reference[0..<1]
# di-nucleotide repeats.
#if (alt_support - both_found) < 5 and v.reference[1..<v.reference.len].toSet.len == 2:
# if fai.get(chrom, tloc.start, tloc.stop + 12 * (v.reference.len - 1)).count(v.reference[1..<v.reference.len]) > 11:
# continue
else:
v.reference = fai.get(chrom, tloc.start - 1, tloc.start - 1)
v.alternate = ctg.sequence[(qloc.start-1)..<qloc.stop]
v.start = tloc.start
var vset = v.alternate[1..v.alternate.high].toSet
if vset.len == 1 and alt_kmer[alt_kmer.high-10..alt_kmer.high].toSet.len == 1 and
ref_kmer[ref_kmer.high-10..ref_kmer.high].toSet.len == 1:
continue
yield v
iterator event_locations(r: Record, max_tags:int=1): event {.inline.} =
## the genomic start-end of the location of the event
## TODO: SA tags
var off: int
for c in r.cigar:
var cons = c.consumes.reference
if c.op != CigarOp.match:
if cons:
yield (r.start + off, r.start + off + c.len, c.len)
else:
yield (r.start + off, r.start + off + 1, c.len)
if cons:
off += c.len
#var tags = r.aux("SA")
#if tags != nil and tags.count(';') <= max_tags:
# tags = tags[0..<tags.high] # strip(';')
# for tag in tags:
proc overlaps(r: Record, start:int, stop:int): bool {.inline.} =
if r.start > stop: return false
if r.stop < start: return false
return true
proc single_roi(b:Bam, region:string): roi =
var reads = new_seq_of_cap[Record](16)
for r in b.querys(region):
reads.add(r.copy())
var se = region.split(":")[1].split("-")
return (parse_int(se[0]), parse_int(se[1]), reads)
iterator gen_roi_internal(evidence: seq[uint8], cache:seq[Record], min_evidence:uint8, min_reads:int, max_reads:int, cache_start:int, cache_end:int): roi {.inline.} =
## given the counts in evidence and a region to look in (cache_start .. cache_end)
## yield regions where the values in evidence are >= min_evidence along with reads from that region.
## further filter to those regions that have > min_reads that overlap the putative event
var in_roi = false
var roi_start = 0
var roi_end = 0
for i in cache_start..<cache_end:
var ev = evidence[i]
if ev >= min_evidence:
if not in_roi:
in_roi = true
roi_start = i
roi_end = i
continue
# ending a region, yield the roi
if in_roi:
var reads = new_seq_of_cap[Record](16)
for r in cache:
if r.overlaps(roi_start, roi_end):
reads.add(r)
if len(reads) > max_reads: break
if r.start > roi_end: break
if len(reads) >= min_reads and len(reads) <= max_reads:
yield (roi_start, roi_end, reads)
in_roi = false
if in_roi:
var reads = new_seq_of_cap[Record](16)
for r in cache:
if r.overlaps(roi_start, roi_end):
reads.add(r)
if len(reads) > max_reads: break
if r.start > roi_end: break
if len(reads) >= min_reads and len(reads) <= max_reads:
yield (roi_start, roi_end, reads)
# store the max stop we've seen.
type cache_t = tuple[records: seq[Record], stop:int]
proc add(c:var cache_t, r:Record) =
c.stop = max(c.stop, r.stop)
c.records.add(r)
proc clear(c:var cache_t) =
c.records.set_len(0)
c.stop = 0
proc len(c:cache_t): int {.inline.} =
return c.records.len
iterator gen_roi(b:Bam, t:Target, min_event_support:uint8=4, min_read_coverage:int=4, max_read_coverage:int=600): roi =
# we iterate over the bam an increment an evidence counter in an genomic array for positions
# that appear to have an event (any non match)
# whenever we have a gap in coverage where start > last_end, we check the evidence
# array for any regions with >= min_event_support that indicate an event.
# we yield the bounds of the event and the reads that overlapped it.
var evidence = new_seq[uint8](t.length + 1)
var cache: cache_t = (new_seq_of_cap[Record](100000), 0)
# we use last_start to make sure we dont keep iterating over the same chunk of evidence.
var last_start = 0
for r in b.querys(t.name):
#echo "r.start:", r.start
if cache.len > 0 and r.start > cache.stop:
for roi in gen_roi_internal(evidence, cache.records, min_event_support, min_read_coverage, max_read_coverage, last_start, r.start):
yield roi
# reset
last_start = int(r.start)
cache.clear()
if r.skippable: continue
cache.add(r.copy())
for e in r.event_locations:
for i in e.start..<e.stop:
evidence[i] += 1
# reset after avoid overflow
if evidence[i] == 0:
evidence[i] = 255
for roi in gen_roi_internal(evidence, cache.records, min_event_support, min_read_coverage, max_read_coverage, last_start, evidence.len):
yield roi
## make proper vcf header.
proc make_contig_header(t:Target): string =
return "##contig=<ID=$1,length=$2>" % [t.name, $t.length]
proc contig_header(b: Bam): string =
return join(map(b.hdr.targets, make_contig_header), "\n")
when isMainModule:
let version = "indelope 0.0.1"
let doc = format("""
$version
Usage: indelope [options] <reference> <BAM-or-CRAM>
Arguments:
<reference> reference fasta file.
<BAM-or-CRAM> call variants in this file.
Options:
-m --min-reads <INT> minimum number of reads to send for alignment [default: 3]
-c --min-contig-len <INT> minimum contig length to send for alignment [default: 73]
-e --min-event-len <INT> minimum size of indel to report [default: 4]
-t --threads <INT> number of cram/bam decompression threads [default: 1]
-h --help show help
""" % ["version", version])
var b:Bam
var ez = new_ez()
if len(commandLineParams()) > 0 and commandLineParams()[0] == "single-site":
# single-site region fai bam
open(b, commandLineParams()[3], index=true)
var fai = open_fai(commandLineParams()[2])
var r = single_roi(b, commandLineParams()[1])
echo "got ", len(r.reads), " reads"
for v in callsemble(r, fai, ez, min_event_len=4):
echo v
quit(0)
let
args = docopt(doc, version = version)
min_reads = parse_int($args["--min-reads"])
min_ctg_len = parse_int($args["--min-contig-len"])
min_event_len = parse_int($args["--min-event-len"])
bam_path = $args["<BAM-or-CRAM>"]
faidx = open_fai($args["<reference>"])
open(b, bam_path, index=true, threads=parse_int($args["--threads"]))
var targets = b.hdr.targets
var last_var: Variant
var last_var2: Variant
echo header % [b.contig_header, "sample"]
for target in targets:
for r in gen_roi(b, target, min_read_coverage=min_reads, min_event_support=max(3, min_reads-2).uint8):
for v in callsemble(r, faidx, ez, min_ctg_len=min_ctg_len, min_reads=min_reads, min_event_len=min_event_len):
if v.same(last_var): continue
if v.same(last_var2): continue
echo v
last_var2 = last_var
last_var = v