-
Notifications
You must be signed in to change notification settings - Fork 1
/
coordinates.d
553 lines (468 loc) · 16.3 KB
/
coordinates.d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
/** Coordinates and Coordinate Systems
STATUS: Experimental
Interval include `start` and `end`, but no reference sequence id (chr, contig, etc.)
The `Coordinate` type is templated on `CoordSystem` enum, so that the actual coordinate
system in use -- and by this, we mean zero- or one-based, and half-open vs. closed --
is encoded within the type itself. e.g. `Coordinate!(CoordSystem.zbho)(0, 100)`
In this way, dhtslib functions that take type `Coordinate` can enforce safety checks,
or optionally, interconversions, to avoid off-by-one errors when dealing with different
systems, as is common in different HTS/NGS file formats.
In general, zero-based half-open (ZBHO) coordinates are easy to compute on, whereas
many data sources intended to be read and understood by humans (including NCBI) are
one-based closed systems. You may enjoy reading the following articles:
https://www.biostars.org/p/84686/
https://www.biostars.org/p/6373/
http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/
http://genomewiki.ucsc.edu/index.php/Coordinate_Transforms
Zero-based, half open: BED, BAM
Zero-based, closed: HTSlib function faidx_fetch_seq , strangely
One-based, half open: ?
One-based, closed: GFF3, SAM (text; not BAM), VCF (text; not BCF)
*/
/*
Translation table for my reference:
zbho zbc obho obc
zbho - 0,-1 +1,+1 +1, 0
zbc 0,+1 - +1,+2 +1,+1
obho -1,-1 -1,-2 - 0,-1
obc -1,0 -1,-1 0,+1 -
*/
module dhtslib.coordinates;
import std.traits : isIntegral;
import std.conv : to;
import std.string : toStringz;
import std.typecons : tuple;
import dhtslib : VCFHeader, SAMHeader;
import htslib.sam : bam_name2id;
import htslib.vcf : bcf_hdr_id2name;
import htslib.hts : hts_parse_region, hts_parse_reg64, hts_parse_decimal, hts_name2id_f, HTS_PARSE_FLAGS;
/// Represents 0-based vs 1-based coordinate types
enum Basis
{
zero = 0,
one
}
/// Represents whether a coordinate set's end
/// coordinate is open or closed
enum End
{
open = 0,
closed
}
/// Coordinate type for a single position with in a Coordinate set,
/// where the type itself encodes the coordinate system details
/// (zero or one-based)
struct Coordinate(Basis bs)
{
/// Coordinate value
long pos;
alias pos this;
invariant
{
assert(pos >= 0);
// in one based systems, pos cannot be zero
static if(bs == Basis.one)
{
assert(pos != 0);
}
}
/// Convert coordinate to another based system
auto to(Basis tobs)()
{
// return same Base type
static if (bs == tobs) return Coordinate!bs(pos);
// one to zero base, decrement
else static if (tobs == Basis.zero) return Coordinate!bs(pos - 1);
// zero to one base, increment
else static if (tobs == Basis.one) return Coordinate!bs(pos + 1);
else static assert(0, "Coordinate Type error");
}
/// Convert coordinate to another based system using shortcuts
auto to(T: ZB)()
{
return this.to!(Basis.zero);
}
auto to(T: OB)()
{
return this.to!(Basis.one);
}
/// make a new coordinate with a value of this.pos + off
/// this.pos + off
auto offset(T)(T off)
if(isIntegral!T)
{
return Coordinatse!(bs)(cast(long)(this.pos + off));
}
}
alias ZB = Coordinate!(Basis.zero);
alias OB = Coordinate!(Basis.one);
alias ZeroBased = Coordinate!(Basis.zero);
alias OneBased = Coordinate!(Basis.one);
/// template to convert Basis, End enum combination to
/// respective CoordinateSystem enum
template getCoordinateSystem(Basis bs, End es){
static if(bs == Basis.zero){
static if(es == End.open)
enum CoordSystem getCoordinateSystem = CoordSystem.zbho;
else static if(es == End.closed)
enum CoordSystem getCoordinateSystem = CoordSystem.zbc;
}
else static if(bs == Basis.one){
static if(es == End.open)
enum CoordSystem getCoordinateSystem = CoordSystem.obho;
else static if(es == End.closed)
enum CoordSystem getCoordinateSystem = CoordSystem.obc;
}
}
// just sanity checking
static assert(getCoordinateSystem!(Basis.zero, End.open) == CoordSystem.zbho);
static assert(getCoordinateSystem!(Basis.zero, End.closed) == CoordSystem.zbc);
static assert(getCoordinateSystem!(Basis.one, End.open) == CoordSystem.obho);
static assert(getCoordinateSystem!(Basis.one, End.closed) == CoordSystem.obc);
/// template to convert CoordinateSystem enum to
/// respective Basis enum
template coordinateSystemToBasis(CoordSystem cs){
static if(cs == CoordSystem.zbho){
enum Basis coordinateSystemToBasis = Basis.zero;
}
else static if(cs == CoordSystem.zbc){
enum Basis coordinateSystemToBasis = Basis.zero;
}
else static if(cs == CoordSystem.obho){
enum Basis coordinateSystemToBasis = Basis.one;
}
else static if(cs == CoordSystem.obc){
enum Basis coordinateSystemToBasis = Basis.one;
}
}
// just sanity checking
static assert(coordinateSystemToBasis!(CoordSystem.zbho) == Basis.zero);
static assert(coordinateSystemToBasis!(CoordSystem.zbc) == Basis.zero);
static assert(coordinateSystemToBasis!(CoordSystem.obho) == Basis.one);
static assert(coordinateSystemToBasis!(CoordSystem.obc) == Basis.one);
/// template to convert CoordinateSystem enum to
/// respective End enum
template coordinateSystemToEnd(CoordSystem cs){
static if(cs == CoordSystem.zbho){
enum End coordinateSystemToEnd = End.open;
}
else static if(cs == CoordSystem.zbc){
enum End coordinateSystemToEnd = End.closed;
}
else static if(cs == CoordSystem.obho){
enum End coordinateSystemToEnd = End.open;
}
else static if(cs == CoordSystem.obc){
enum End coordinateSystemToEnd = End.closed;
}
}
// just sanity checking
static assert(coordinateSystemToEnd!(CoordSystem.zbho) == End.open);
static assert(coordinateSystemToEnd!(CoordSystem.zbc) == End.closed);
static assert(coordinateSystemToEnd!(CoordSystem.obho) == End.open);
static assert(coordinateSystemToEnd!(CoordSystem.obc) == End.closed);
/// Coordsytem types
enum CoordSystem
{
zbho = 0, /// zero-based, half-open (BED, BAM)
zbc, /// zero-based, closed (HTSlib.faidx_fetch_seq)
obho, /// one-based, half-open
obc /// one-based, closed (GFF3, VCF [not BCF], SAM [not BAM])
}
/// Labels for each CoordSystem Type
static immutable CoordSystemLabels = ["zbho", "zbc", "obho", "obc"];
/// The (start, end) coordinates within a coordinate system,
/// where the type itself encodes the coordinate system details
/// (zero or one-based; half-open vs. closed)
struct Interval(CoordSystem cs)
{
/// alias Basis and End enums for this Coordsystem type
alias basetype = coordinateSystemToBasis!cs;
alias endtype = coordinateSystemToEnd!cs;
/// Starting coordinate
Coordinate!basetype start;
/// Ending coordinate
Coordinate!basetype end;
/// long constructor
this(long start, long end)
{
this.start.pos = start;
this.end.pos = end;
}
invariant
{
// In half-open systems, ensure start strictly less than end,
// unless the zero-length range is truly desired, in which case use (0,0)
static if (cs == CoordSystem.zbho || cs == CoordSystem.obho)
assert(this.start < this.end || (this.start == 0 && this.end == 0));
else
assert(this.start <= this.end);
}
/// Return the size of the interval spanned by start and end
long size()
{
static if (cs == CoordSystem.zbho || cs == CoordSystem.obho)
return end - start;
else static if (cs == CoordSystem.zbc || cs == CoordSystem.obc)
return end - start + 1;
else
static assert(0, "Coordinate Type error");
}
/// Convert coordinates to another coordinate system
auto to(CoordSystem tocs)()
{
/// alias Basis and End enums for the tocs Coordsystem type
alias tobasetype = coordinateSystemToBasis!tocs;
alias toendtype = coordinateSystemToEnd!tocs;
// new coordinates
auto newStart = this.start;
auto newEnd = this.end;
// return same CoordinateSystem type
static if (cs == tocs)
return Interval!(tocs)(newStart, newEnd);
else{
// convert coordinates to new base type
newStart = newStart.to!tobasetype;
newEnd = newEnd.to!tobasetype;
// if going between end types
static if (endtype != toendtype){
// open to closed end, decrement end
// closed to open end, increment end
static if(toendtype == End.closed){
newEnd--;
}else{
newEnd++;
}
}
return Interval!(tocs)(newStart, newEnd);
}
}
/// Convert coordinate to another based system using shortcuts
auto to(T: ZBHO)()
{
return this.to!(CoordSystem.zbho);
}
/// Convert coordinate to another based system using shortcuts
auto to(T: OBHO)()
{
return this.to!(CoordSystem.obho);
}
/// Convert coordinate to another based system using shortcuts
auto to(T: ZBC)()
{
return this.to!(CoordSystem.zbc);
}
/// Convert coordinate to another based system using shortcuts
auto to(T: OBC)()
{
return this.to!(CoordSystem.obc);
}
/// make a new coordinate pair with a value of
/// this.start + off and this.end + off
auto offset(T)(T off)
if(isIntegral!T)
{
return Interval!(cs)(cast(long)(this.start + off), cast(long)(this.end + off));
}
/// intersection of two regions
auto intersectImpl(Interval!cs other)
{
if(!this.isOverlap(other)){
return Interval!(cs).init;
}
return Interval!(cs)(
this.getMaxStart(other),
this.getMinEnd(other)
);
}
/// union of two regions
auto unionImpl(Interval!cs other)
{
if(!this.isOverlap(other)){
return Interval!(cs).init;
}
return Interval!(cs)(
this.getMinStart(other),
this.getMaxEnd(other)
);
}
auto isOverlap(Interval!cs other)
{
static if(endtype == End.closed){
return this.getMaxStart(other) <= this.getMinEnd(other);
}else{
return this.getMaxStart(other) < this.getMinEnd(other);
}
}
auto getMinStart(Interval!cs other)
{
return this.start < other.start ? this.start : other.start;
}
auto getMaxStart(Interval!cs other)
{
return this.start > other.start ? this.start : other.start;
}
auto getMinEnd(Interval!cs other)
{
return this.end < other.end ? this.end : other.end;
}
auto getMaxEnd(Interval!cs other)
{
return this.end > other.end ? this.end : other.end;
}
/// set operators for intersect, union, and difference
auto opBinary(string op)(Interval!cs other)
{
static if(op == "|") return unionImpl(other);
else static if(op == "&") return intersectImpl(other);
else static assert(0,"Operator "~op~" not implemented");
}
/// Get string representation for printing
string toString() const{
return "[" ~ CoordSystemLabels[cs] ~ "] " ~ this.start.pos.to!string ~ "-" ~ this.end.pos.to!string;
}
}
/// Returns tuple of String and ZBHO coordinates
/// representing input string. Supports htslib coordinate strings.
/// i.e chr1:1-10
auto getIntervalFromString(string region){
ZBHO coords;
auto regStr = toStringz(region);
auto ptr = hts_parse_reg64(regStr,&coords.start.pos,&coords.end.pos);
if(!ptr){
throw new Exception("Region string could not be parsed");
}
auto contig = region[0..ptr - regStr];
return tuple!("contig","interval")(contig,coords);
}
/// TODO: complete getIntervalFromString with the ability to check headers
// auto getIntervalFromString(Header)(string region, Header h)
// if(is(Header == SAMHeader) || is(Header == VCFHeader))
// {
// ZBHO coords;
// int tid;
// auto regStr = toStringz(region);
// auto flag = HTS_PARSE_FLAGS.HTS_PARSE_THOUSANDS_SEP;
// static if(is(Header == SAMHeader)){
// auto ptr = hts_parse_region(regStr,&tid, &coords.start.pos, &coords.end.pos, cast(hts_name2id_f * )&bam_name2id, h.h, flag);
// } else static if(is(Header == VCFHeader)){
// auto ptr = hts_parse_region(regStr,&tid, &coords.start.pos, &coords.end.pos, &bcf_hdr_id2name, h.hdr, flag);
// }
// if(!ptr){
// throw new Exception("Region string could not be parsed");
// }
// return tuple!("tid","interval")(tid, coords);
// }
alias ZBHO = Interval!(CoordSystem.zbho);
alias OBHO = Interval!(CoordSystem.obho);
alias ZBC = Interval!(CoordSystem.zbc);
alias OBC = Interval!(CoordSystem.obc);
alias ZeroBasedHalfOpen = Interval!(CoordSystem.zbho);
alias OneBasedHalfOpen = Interval!(CoordSystem.obho);
alias ZeroBasedClosed = Interval!(CoordSystem.zbc);
alias OneBasedClosed = Interval!(CoordSystem.obc);
debug(dhtslib_unittest) unittest
{
auto c0 = Interval!(CoordSystem.zbho)(0, 100);
assert(c0.size == 100);
auto c1 = c0.to!(CoordSystem.zbc);
auto c2 = c0.to!(CoordSystem.obc);
auto c3 = c0.to!(CoordSystem.obho);
auto c4 = c0.to!(CoordSystem.zbho);
assert(c1 == ZBC(0, 99));
assert(c2 == OBC(1, 100));
assert(c3 == OBHO(1, 101));
assert(c4 == ZBHO(0, 100));
// ...
}
debug(dhtslib_unittest) unittest
{
auto reg = getIntervalFromString("chrom1:0-100");
assert(reg.contig == "chrom1");
auto c0 = reg.interval;
assert(c0 == Interval!(CoordSystem.zbho)(0, 100));
}
debug(dhtslib_unittest) unittest
{
auto c0 = Interval!(CoordSystem.zbho)(0, 100);
assert(c0.size == 100);
auto c1 = c0.to!(CoordSystem.zbc);
auto c2 = c0.to!(CoordSystem.obc);
auto c3 = c0.to!(CoordSystem.obho);
auto c4 = c0.to!(CoordSystem.zbho);
assert(c1 == ZBC(0, 99));
assert(c2 == OBC(1, 100));
assert(c3 == OBHO(1, 101));
assert(c4 == ZBHO(0, 100));
}
debug(dhtslib_unittest) unittest
{
auto c0 = ZBHO(0, 100);
assert(c0.size == 100);
auto c1 = c0.to!ZBC;
auto c2 = c0.to!OBC;
auto c3 = c0.to!OBHO;
auto c4 = c0.to!ZBHO;
assert(c1 == ZBC(0, 99));
assert(c2 == OBC(1, 100));
assert(c3 == OBHO(1, 101));
assert(c4 == ZBHO(0, 100));
}
debug(dhtslib_unittest) unittest
{
auto c0 = Interval!(CoordSystem.obho)(1, 101);
assert(c0.size == 100);
auto c1 = c0.to!(CoordSystem.zbc);
auto c2 = c0.to!(CoordSystem.obc);
auto c3 = c0.to!(CoordSystem.obho);
auto c4 = c0.to!(CoordSystem.zbho);
assert(c1 == ZBC(0, 99));
assert(c2 == OBC(1, 100));
assert(c3 == OBHO(1, 101));
assert(c4 == ZBHO(0, 100));
// ...
}
debug(dhtslib_unittest) unittest
{
ZBHO c0 = ZBHO(0, 100);
assert(c0.size == 100);
auto c1 = c0.offset(50);
assert(c1 == ZBHO(50, 150));
assert((c0 & c1) == ZBHO(50, 100));
assert((c0 | c1) == ZBHO(0, 150));
c1 = c0.offset(99);
assert(c1 == ZBHO(99, 199));
assert((c0 & c1) == ZBHO(99, 100));
assert((c0 | c1) == ZBHO(0, 199));
}
debug(dhtslib_unittest) unittest
{
OBC c0 = OBC(1, 100);
assert(c0.size == 100);
auto c1 = c0.offset(50);
assert(c1 == OBC(51, 150));
assert((c0 & c1) == OBC(51, 100));
assert((c0 | c1) == OBC(1, 150));
c1 = c0.offset(99);
assert(c1 == OBC(100, 199));
import std.stdio;
writeln((c0 & c1));
assert((c0 & c1) == OBC(100, 100));
assert((c0 | c1) == OBC(1, 199));
}
///
// debug(dhtslib_unittest) unittest
// {
// import dhtslib.sam;
// import htslib.hts_log;
// import std.path : buildPath, dirName;
// import std.string : fromStringz;
// import std.array : array;
// hts_set_log_level(htsLogLevel.HTS_LOG_WARNING);
// hts_log_info(__FUNCTION__, "Testing SAMFile & SAMRecord");
// hts_log_info(__FUNCTION__, "Loading test file");
// auto sam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","auxf#values.sam"), 0);
// auto reg = getIntervalFromString("sheila:1-10",sam.header);
// assert(reg.tid == 0);
// assert(reg.interval == ZBHO(0,11));
// }