-
Notifications
You must be signed in to change notification settings - Fork 66
/
GenomeIndex.cpp
2271 lines (1931 loc) · 93 KB
/
GenomeIndex.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*++
Module Name:
GenomeIndex.cpp
Abstract:
Index (hash table) builder for the SNAP sequencer
Authors:
Bill Bolosky, August, 2011
Environment:
User mode service.
Revision History:
Adapted from Matei Zaharia's Scala implementation.
--*/
#include "stdafx.h"
#include "ApproximateCounter.h"
#include "BigAlloc.h"
#include "Compat.h"
#include "FASTA.h"
#include "FixedSizeSet.h"
#include "FixedSizeVector.h"
#include "GenericFile.h"
#include "GenericFile_stdio.h"
#include "Genome.h"
#include "GenomeIndex.h"
#include "HashTable.h"
#include "Seed.h"
#include "exit.h"
#include "Error.h"
#include "directions.h"
#include "DataReader.h"
using namespace std;
static const int DEFAULT_SEED_SIZE = 24;
static const double DEFAULT_SLACK = 0.3;
static const unsigned DEFAULT_PADDING = 2000;
const char *GenomeIndexFileName = "GenomeIndex";
const char *OverflowTableFileName = "OverflowTable";
const char *GenomeIndexHashFileName = "GenomeIndexHash";
const char *GenomeFileName = "Genome";
static void usage()
{
WriteErrorMessage(
"Usage: %s index <input.fa> <output-dir> [<options>]\n"
"Options:\n"
" -s Seed size (default: %d)\n"
" -h Hash table slack (default: %.1f)\n"
" -t Specify the maximum number of threads to use. Default is the number of cores. Do not leave a space after the -t, e.g., -t16\n"
" -B<chars> Specify characters to use as chromosome name terminators in the FASTA header line; these characters and anything after are\n"
" not part of the chromosome name. You must specify all characters on a single -B switch. So, for example, with -B_|,\n"
" the FASTA header line '>chr1|Chromosome 1' would generate a chromosome named 'chr1'. There's a separate flag for\n"
" indicating that a space is a terminator.\n"
" -bSpace Indicates that the space and tab characters are terminators for chromosome names (see -B above). This may be used in addition\n"
" to other terminators specified by -B. -B and -bSpace are case sensitive. This is the default.\n"
" -bSpace- Indicates that space and tab characters should be included in chromosome names.\n"
" -p Specify the number of Ns to put as padding between chromosomes. This must be as large as the largest\n"
" edit distance you'll ever use, and there's a performance advantage to have it be bigger than any\n"
" read you'll process or gap between paired-end reads. Default is %d. Specify the amount of padding directly after -p without a space.\n"
" -H Build a histogram of seed popularity. This is just for information, it's not used by SNAP.\n"
" Specify the histogram file name directly after -H without leaving a space.\n"
" -exact Compute hash table sizes exactly. This will slow down index build, but usually will result in smaller indices.\n"
" -keysize The number of bytes to use for the hash table key. Larger values increase SNAP's memory footprint, but allow larger seeds.\n"
" By default it's autoselected based on the seed size.\n"
" -large Build a larger index that's a little faster, particularly for runs with quick/inaccurate parameters. Increases index size by\n"
" about 30%%, depending on the other index parameters and the contents of the reference genome\n"
" -locationSize The size of the genome locations stored in the index. This can be from 4 to 8 bytes. The locations need to be big enough\n"
" not only to index the genome, but also to allow some space for representing seeds that occur multiple times. For the\n"
" human genome, it will fit with four byte locations if the seed size is 20 or larger, but needs 5 (or more) for smaller\n"
" seeds. Making the location size bigger than necessary will just waste (lots of) space, so unless you're doing something\n"
" quite unusual, the right answer is 4 or 5. Default is based on seed size: 4 if it's 20 or greater, 5 otherwise.\n"
" -sm Use a temp file to work better in smaller memory. This only helps a little, but can be the difference if you're close.\n"
" In particular, this will generally use less memory than the index will use once it's built, so if this doesn't work you\n"
" won't be able to use the index anyway. However, if you've got sufficient memory to begin with, this option will just\n"
" slow down the index build by doing extra, useless IO.\n"
" -AutoAlt- Don't automatically mark ALT contigs. Otherwise, any contig whose name ends in '_alt' (regardless of captialization) or starts\n"
" with HLA- will be marked ALT. Others will not.\n"
" -maxAltContigSize Specify a size at or below which all contigs are automatically marked ALT, unless overridden by name using the args below\n"
" -altContigName Specify the (case independent) name of an alt to mark a contig. You can supply this parameter as often as you'd like\n"
" -altContigFile Specify the name of a file with a list of alt contig names, one per line. You may specify this as often as you'd like\n"
" -nonAltContigName Specify the name of a contig that's not an alt, regardless of its size\n"
" -nonAltContigFile Specify the name of a file that contains a list of contigs (one per line) that will not be marked ALT regardless of size\n"
" -altLiftoverFile Specify the file containing ALT-to-REF mappings (SAM format). e.g., hs38DH.fa.alt from bwa-kit\n"
,
BINARY_NAME,
DEFAULT_SEED_SIZE,
DEFAULT_SLACK,
DEFAULT_PADDING);
soft_exit_no_print(0); // Don't use soft-exit, it's confusing people to get an error message after the usage
}
// Copies the input string, reallocates the list and adds it to the end.
void
addToCountedListOfStrings(
const char *newString,
int *length,
char ***list)
{
char *newStringCopy = new char[strlen(newString) + 1];
strcpy(newStringCopy, newString);
char **newList = new char*[*length + 1];
memcpy(newList, *list, sizeof(char *) * *length);
delete[] *list;
*list = newList;
(*list)[*length] = newStringCopy;
(*length)++;
}
void
GenomeIndex::runIndexer(
int argc,
const char **argv)
{
if (argc < 2) {
usage();
}
const char *fastaFile = argv[0];
const char *outputDir = argv[1];
unsigned maxThreads = GetNumberOfProcessors();
int seedLen = DEFAULT_SEED_SIZE;
double slack = DEFAULT_SLACK;
const char *pieceNameTerminatorCharacters = NULL;
bool spaceIsAPieceNameTerminator = true;
const char *histogramFileName = NULL;
unsigned chromosomePadding = DEFAULT_PADDING;
bool forceExact = false;
unsigned keySizeInBytes = 0; // If it's not set by the user, it gets set based on the seed size later
bool large = false;
unsigned locationSize = 0; // If it's not set by the user, it gets set based on the seed size later
bool smallMemory = false;
GenomeDistance maxSizeForAutomaticALT = -1;
int nAltOptIn = 0;
char **altOptInList = NULL;
int nAltOptOut = 0;
char **altOptOutList = NULL;
bool autoALT = true;
int nAltLiftover = 0;
char **altLiftoverLines = NULL;
char **altLiftoverContigNames = NULL;
unsigned *altLiftoverContigFlags = NULL;
char **altLiftoverProjContigNames = NULL;
unsigned *altLiftoverProjContigOffsets = NULL;
char **altLiftoverProjCigar = NULL;
DataSupplier::ExpansionFactor = 50; // This is for the decompression of a gzipped FASTA/ALT file. FASTAs compress really well so we need a lot.
for (int n = 2; n < argc; n++) {
if (strcmp(argv[n], "-s") == 0) {
if (n + 1 < argc) {
seedLen = atoi(argv[n+1]);
n++;
} else {
usage();
}
} else if (strcmp(argv[n], "-h") == 0) {
if (n + 1 < argc) {
slack = atof(argv[n+1]);
n++;
} else {
usage();
}
} else if (strcmp(argv[n], "-exact") == 0) {
forceExact = true;
} else if (strcmp(argv[n], "-hg19") == 0) {
WriteErrorMessage("The -hg19 flag is deprecated, ignoring it.\n");
} else if (_stricmp(argv[n], "-locationSize") == 0) {
if (n + 1 < argc) {
locationSize = atoi(argv[n+1]);
if (locationSize < 4 || locationSize > 8) {
WriteErrorMessage("Location size must be between 4 and 8 inclusive\n");
soft_exit(1);
}
n++;
} else {
usage();
}
} else if (strcmp(argv[n], "-large") == 0) {
large = true;
} else if (argv[n][0] == '-' && argv[n][1] == 'H') {
histogramFileName = argv[n] + 2;
} else if (argv[n][0] == '-' && argv[n][1] == 'O') {
// Deprecated, ignored parameter
} else if (argv[n][0] == '-' && argv[n][1] == 't') {
maxThreads = atoi(argv[n]+2);
if (maxThreads < 1 || maxThreads > 100) {
WriteErrorMessage("maxThreads must be between 1 and 100 inclusive (and you need not to leave a space after '-t')\n");
soft_exit(1);
}
} else if (argv[n][0] == '-' && argv[n][1] == 'p') {
chromosomePadding = atoi(argv[n] + 2);
if (0 == chromosomePadding) {
WriteErrorMessage("Invalid chromosome padding specified, must be at least one (and in practice as large as any max edit distance you might use).\n");
soft_exit(1);
}
} else if (argv[n][0] == '-' && argv[n][1] == 's' && argv[n][2] == 'm') {
smallMemory = true;
}
else if (_stricmp(argv[n], "-keysize") == 0) {
if (n + 1 < argc) {
keySizeInBytes = atoi(argv[n+1]);
if (keySizeInBytes < 2 || keySizeInBytes > 8) {
WriteErrorMessage("Key size must be between 2 and 8 inclusive\n");
soft_exit(1);
}
n++;
} else {
usage();
}
} else if (argv[n][0] == '-' && argv[n][1] == 'B') {
pieceNameTerminatorCharacters = argv[n] + 2;
} else if (!strcmp(argv[n], "-bSpace")) {
spaceIsAPieceNameTerminator = true;
} else if (!strcmp(argv[n], "-bSpace-")) {
spaceIsAPieceNameTerminator = false;
} else if (!_stricmp(argv[n], "-AutoAlt-")) {
autoALT = false;
} else if (!strcmp(argv[n], "-maxAltContigSize")) {
if (n + 1 < argc) {
maxSizeForAutomaticALT = atoll(argv[n + 1]);
} else {
usage();
}
n++;
} else if (!strcmp(argv[n], "-altContigName")) {
if (n + 1 < argc) {
addToCountedListOfStrings(argv[n + 1], &nAltOptIn, &altOptInList);
} else {
usage();
}
n++;
} else if (!strcmp(argv[n], "-nonAltContigName")) {
if (n + 1 < argc) {
addToCountedListOfStrings(argv[n + 1], &nAltOptOut, &altOptOutList);
} else {
usage();
}
n++;
} else if (!strcmp(argv[n], "-altContigFile")) {
if (n + 1 < argc) {
DataReader* inputFile = getDefaultOrGzipDataReader(argv[n + 1]);
if (NULL == inputFile) {
WriteErrorMessage("Unable to open alt contig list file %s\n", argv[n + 1]);
soft_exit(1);
}
char *contigNameBuffer = NULL;
int contigNameBufferSize = 0;
while (NULL != reallocatingFgets(&contigNameBuffer, &contigNameBufferSize, inputFile)) {
if (NULL != strchr(contigNameBuffer, '\n')) {
*strchr(contigNameBuffer, '\n') = '\0';
}
if (NULL != strchr(contigNameBuffer, '\r')) {
*strchr(contigNameBuffer, '\r') = '\0';
}
addToCountedListOfStrings(contigNameBuffer, &nAltOptIn, &altOptInList);
} // while we have an input string.
delete inputFile;
delete[] contigNameBuffer;
} else {
usage();
}
n++;
} else if (!strcmp(argv[n], "-nonAltContigFile")) {
if (n + 1 < argc) {
DataReader *inputFile = getDefaultOrGzipDataReader(argv[n + 1]);
if (NULL == inputFile) {
WriteErrorMessage("Unable to open non-alt contig list file %s\n", argv[n + 1]);
soft_exit(1);
}
char *contigNameBuffer = NULL;
int contigNameBufferSize = 0;
while (NULL != reallocatingFgets(&contigNameBuffer, &contigNameBufferSize, inputFile)) {
if (NULL != strchr(contigNameBuffer, '\n')) {
*strchr(contigNameBuffer, '\n') = '\0';
}
if (NULL != strchr(contigNameBuffer, '\r')) {
*strchr(contigNameBuffer, '\r') = '\0';
}
addToCountedListOfStrings(contigNameBuffer, &nAltOptOut, &altOptOutList);
} // while we have an input string.
delete inputFile;
delete[] contigNameBuffer;
}
else {
usage();
}
n++;
} else if (!strcmp(argv[n], "-altLiftoverFile")) {
if (n + 1 < argc) {
DataReader* inputFile = getDefaultOrGzipDataReader(argv[n + 1]);
if (NULL == inputFile) {
WriteErrorMessage("Unable to open ALT liftover file %s\n", argv[n + 1]);
soft_exit(1);
}
char* altLiftoverBuffer = NULL;
int altLiftoverBufferSize = 0;
while (NULL != reallocatingFgets(&altLiftoverBuffer, &altLiftoverBufferSize, inputFile)) {
if (altLiftoverBuffer[0] == '@') {
continue;
}
if (NULL != strchr(altLiftoverBuffer, '\n')) {
*strchr(altLiftoverBuffer, '\n') = '\0';
}
if (NULL != strchr(altLiftoverBuffer, '\r')) {
*strchr(altLiftoverBuffer, '\r') = '\0';
}
addToCountedListOfStrings(altLiftoverBuffer, &nAltLiftover, &altLiftoverLines);
} // while we have an input string.
altLiftoverContigNames = new char*[nAltLiftover];
altLiftoverContigFlags = new unsigned [nAltLiftover];
altLiftoverProjContigNames = new char*[nAltLiftover];
altLiftoverProjContigOffsets = new unsigned [nAltLiftover];
altLiftoverProjCigar = new char*[nAltLiftover];
for (int i = 0; i < nAltLiftover; i++) {
// get contig name
char* contigNameStart = altLiftoverLines[i];
char* contigNameEnd = strchr(contigNameStart, '\t');
if (NULL == contigNameEnd) {
WriteErrorMessage("Invalid format for ALT liftover file %s. Not tab separated\n", argv[n + 1]);
soft_exit(1);
}
// get contig flags
char* contigFlagsEnd = strchr(contigNameEnd + 1, '\t');
if (1 != sscanf(contigNameEnd + 1, "%u", &altLiftoverContigFlags[i]) || NULL == contigFlagsEnd) {
WriteErrorMessage("Invalid format for ALT liftover file %s. Not tab separated\n", argv[n + 1]);
soft_exit(1);
}
// get projected contig name
char* projContigNameStart = contigFlagsEnd + 1;
char* projContigNameEnd = strchr(projContigNameStart, '\t');
if (NULL == projContigNameEnd) {
WriteErrorMessage("Invalid format for ALT liftover file %s. Not tab separated\n", argv[n + 1]);
soft_exit(1);
}
// get projected contig offsets
char* projContigOffsetEnd = strchr(projContigNameEnd + 1, '\t');
if (1 != sscanf(projContigNameEnd + 1, "%u", &altLiftoverProjContigOffsets[i]) || NULL == projContigOffsetEnd) {
WriteErrorMessage("Invalid format for ALT liftover file %s. Not tab separated\n", argv[n + 1]);
soft_exit(1);
}
// skip next field (mapping quality)
char* tmp = strchr(projContigOffsetEnd + 1, '\t');
if (NULL == tmp) {
WriteErrorMessage("Invalid format for ALT liftover file %s. Not tab separated\n", argv[n + 1]);
soft_exit(1);
}
// get projected cigar
char* projCigarStart = tmp + 1;
char* projCigarEnd = strchr(projCigarStart, '\t');
if (NULL == projCigarEnd) {
WriteErrorMessage("Invalid format for ALT liftover file %s. Not tab separated\n", argv[n + 1]);
soft_exit(1);
}
// skip contigs that do not have a mapping to the primary reference
if (*projContigNameStart == '*') {
altLiftoverContigNames[i] = NULL;
altLiftoverProjContigNames[i] = NULL;
altLiftoverProjCigar[i] = NULL;
continue;
}
size_t contigNameLength = contigNameEnd - contigNameStart;
altLiftoverContigNames[i] = new char[contigNameLength + 1];
strncpy(altLiftoverContigNames[i], contigNameStart, contigNameLength);
altLiftoverContigNames[i][contigNameLength] = '\0';
size_t projContigNameLength = projContigNameEnd - projContigNameStart;
altLiftoverProjContigNames[i] = new char[projContigNameLength + 1];
strncpy(altLiftoverProjContigNames[i], projContigNameStart, projContigNameLength);
altLiftoverProjContigNames[i][projContigNameLength] = '\0';
size_t projCigarLength = projCigarEnd - projCigarStart;
altLiftoverProjCigar[i] = new char[projCigarLength + 1];
strncpy(altLiftoverProjCigar[i], projCigarStart, projCigarLength);
altLiftoverProjCigar[i][projCigarLength] = '\0';
}
delete inputFile;
delete[] altLiftoverBuffer;
}
else {
usage();
}
n++;
} else {
WriteErrorMessage("Invalid argument: %s\n\n", argv[n]);
usage();
}
} // for each arg
if (seedLen < 8 || seedLen > 32) {
// Seeds are stored in 64 bits, so they can't be larger than 32 bases for now.
WriteErrorMessage("Seed length must be between 8 and 32, inclusive\n");
soft_exit(1);
}
if (keySizeInBytes == 0) {
//
// Auto set they key size. This computation should result in between 16 & 1024 hash tables.
//
keySizeInBytes = __max(2,(seedLen + 2) / 4 - 1);
WriteStatusMessage("Auto-set key size to %d\n", keySizeInBytes);
}
if (locationSize == 0) { // Numbers that work for the human genome
if (seedLen >= 20) {
locationSize = 4;
} else {
locationSize = 5;
}
WriteStatusMessage("Auto-set locationSize to %d\n", locationSize);
}
if ((unsigned)seedLen * 2 < keySizeInBytes * 8) {
WriteErrorMessage("You must specify a smaller keysize or a larger seed size. The seed must be big enough to fill the key\n"
"and takes two bits per base of seed.\n");
soft_exit(1);
}
if (seedLen * 2 - keySizeInBytes * 8 > 16) {
WriteErrorMessage("You must specify a biger keysize or smaller seed len. SNAP restricts the number of hash tables to 4^8,\n"
"and needs 4^{excess seed len} hash tables, where excess seed len is the seed size minus the four times the key size.\n");
soft_exit(1);
}
WriteStatusMessage("Hash table slack %lf\nLoading FASTA file '%s' into memory...", slack, fastaFile);
BigAllocUseHugePages = false;
_int64 start = timeInMillis();
const Genome *genome = ReadFASTAGenome(fastaFile, pieceNameTerminatorCharacters, spaceIsAPieceNameTerminator, chromosomePadding, altOptInList, nAltOptIn, altOptOutList, nAltOptOut, maxSizeForAutomaticALT, autoALT,
altLiftoverContigNames, altLiftoverContigFlags, altLiftoverProjContigNames, altLiftoverProjContigOffsets, altLiftoverProjCigar, nAltLiftover);
if (NULL == genome) {
WriteErrorMessage("Unable to read FASTA file\n");
soft_exit(1);
}
WriteStatusMessage("%llds\n", (timeInMillis() + 500 - start) / 1000);
WriteStatusMessage("Genome has %d contigs, of which %d are ALTs\n", genome->getNumContigs(), genome->getNumALTContigs());
GenomeDistance nBases = genome->getCountOfBases();
if (!GenomeIndex::BuildIndexToDirectory(genome, seedLen, slack, outputDir, maxThreads, chromosomePadding, forceExact, keySizeInBytes,
large, histogramFileName, locationSize, smallMemory)) {
WriteErrorMessage("Genome index build failed\n");
soft_exit(1);
}
genome = NULL; // It's deleted by BuildIndexToDirectory.
_int64 end = timeInMillis();
WriteStatusMessage("Index build and save took %llds (%lld bases/s)\n",
(end - start) / 1000, nBases / max((end - start) / 1000, (_int64) 1));
}
//
// Compute the value of InvalidGenomeLoctaion based on the number of bytes we're using in the hash table to
// store genome locations.
//
void
SetInvalidGenomeLocation(unsigned locationSize)
{
if (locationSize == 8) {
InvalidGenomeLocation = 0xffffffffffffffff;
} else {
InvalidGenomeLocation = ((_int64) 1 << (locationSize * 8)) - 1;
}
ContigForInvalidGenomeLocation.beginningLocation = InvalidGenomeLocation;
ContigForInvalidGenomeLocation.isALT = false;
ContigForInvalidGenomeLocation.length = 1;
ContigForInvalidGenomeLocation.name = (char *)"InvalidGenomeLocation"; // This is never going to be written in to, but the field is char * because other contigs have changes made to their names.
ContigForInvalidGenomeLocation.nameLength = (int)strlen(ContigForInvalidGenomeLocation.name);
}
bool
GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double slack, const char *directoryName,
unsigned maxThreads, unsigned chromosomePaddingSize, bool forceExact, unsigned hashTableKeySize,
bool large, const char *histogramFileName, unsigned locationSize, bool smallMemory)
{
PreventMachineHibernationWhileThisThreadIsAlive();
SetInvalidGenomeLocation(locationSize);
bool buildHistogram = (histogramFileName != NULL);
FILE *histogramFile;
if (buildHistogram) {
histogramFile = fopen(histogramFileName, "w");
if (NULL == histogramFile) {
WriteErrorMessage("Unable to open histogram file '%s', skipping it.\n", histogramFileName);
buildHistogram = false;
}
}
if (mkdir(directoryName, 0777) != 0 && errno != EEXIST) {
WriteErrorMessage("BuildIndex: failed to create directory %s\n",directoryName);
return false;
}
int filenameBufferSize = (int)(strlen(directoryName) + 1 + __max(strlen(GenomeIndexFileName), __max(strlen(OverflowTableFileName), __max(strlen(GenomeIndexHashFileName), strlen(GenomeFileName)))) + 1);
char *filenameBuffer = new char[filenameBufferSize];
fprintf(stderr,"Saving genome...");
_int64 start = timeInMillis();
snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName);
if (!genome->saveToFile(filenameBuffer)) {
WriteErrorMessage("GenomeIndex::saveToDirectory: Failed to save the genome itself\n");
delete[] filenameBuffer;
return false;
}
fprintf(stderr,"%llds\n", (timeInMillis() + 500 - start) / 1000);
GenomeIndex *index = new GenomeIndex();
index->genome = NULL; // We always delete the index when we're done, but we delete the genome first to save space during the overflow table build.
GenomeDistance countOfBases = genome->getCountOfBases();
if (locationSize != 8 && countOfBases > ((_int64) 1 << (locationSize*8)) - 16) {
WriteErrorMessage("Genome is too big for %d byte genome locations. Specify a larger location size with -locationSize\n", locationSize);
soft_exit(1);
}
// Compute bias table sizes
double *biasTable = NULL;
unsigned nHashTables = 1 << ((max((unsigned)seedLen, hashTableKeySize * 4) - hashTableKeySize * 4) * 2);
biasTable = new double[nHashTables];
ComputeBiasTable(genome, seedLen, biasTable, maxThreads, forceExact, hashTableKeySize, large);
WriteStatusMessage("Allocating memory for hash tables...");
start = timeInMillis();
SNAPHashTable** hashTables = index->hashTables =
allocateHashTables(&nHashTables, countOfBases, slack, seedLen, hashTableKeySize, large, locationSize, biasTable);
index->nHashTables = nHashTables;
//
// Set up the hash tables. Each table has a key value of the lower 32 bits of the seed, and data
// of two integers. There is one integer each for the seed and its reverse complement (i.e., what you'd
// get from the complementary DNA strand, A<->T and G<->C with the string order reversed).
// The first integer is always for the version of the seed with "lower" value, using an arbitrary
// total order that we define in Seed.h. Some seeds are their own reverse complements (e.g.,
// AGCT), in which case only the first integer is used.
//
OverflowBackpointerAnchor *overflowAnchor = new OverflowBackpointerAnchor(__min(((locationSize == 8) ? (_int64)0x8effffffffffffff : GenomeLocationAsInt64(InvalidGenomeLocation)) - countOfBases, countOfBases)); // i.e., as much as the address space will allow.
WriteStatusMessage("%llds\nBuilding hash tables.\n", (timeInMillis() + 500 - start) / 1000);
start = timeInMillis();
volatile _int64 nextOverflowBackpointer = 0;
volatile _int64 nonSeeds = 0;
volatile _int64 seedsWithMultipleOccurrences = 0;
volatile _int64 genomeLocationsInOverflowTable = 0; // Number of extra hits on duplicate indices. This should come out once we implement the overflow table.
volatile _int64 bothComplementsUsed = 0; // Number of hash buckets where both complements are used
volatile _int64 noBaseAvailable = 0; // Number of places where getSubstring returned null.
volatile _int64 nBasesProcessed = 0;
volatile int runningThreadCount;
SingleWaiterObject doneObject;
CreateSingleWaiterObject(&doneObject);
unsigned nThreads = __min(GetNumberOfProcessors(), maxThreads);
BuildHashTablesThreadContext *threadContexts = new BuildHashTablesThreadContext[nThreads];
ExclusiveLock *hashTableLocks = new ExclusiveLock[nHashTables];
for (unsigned i = 0; i < nHashTables; i++) {
InitializeExclusiveLock(&hashTableLocks[i]);
}
runningThreadCount = nThreads;
GenomeDistance nextChunkToProcess = 0;
_int64 * lastBackpointerIndexUsedByThread = NULL;
ExclusiveLock backpointerSpillLock;
FILE *backpointerSpillFile = NULL;
char *backpointerSpillFileName = NULL;
InitializeExclusiveLock(&backpointerSpillLock);
if (smallMemory) {
lastBackpointerIndexUsedByThread = new _int64[nThreads];
for (unsigned i = 0; i < nThreads; i++) {
lastBackpointerIndexUsedByThread[i] = 0;
}
#define BACKPOINTER_TABLE_SPILL_FILE_NAME "BackpointerTableSpillFile"
backpointerSpillFileName = new char[strlen(directoryName) + 1 + strlen(BACKPOINTER_TABLE_SPILL_FILE_NAME) + 1];
sprintf(backpointerSpillFileName, "%s%c%s", directoryName, PATH_SEP, BACKPOINTER_TABLE_SPILL_FILE_NAME);
backpointerSpillFile = fopen(backpointerSpillFileName, "w+b");
if (NULL == backpointerSpillFile) {
WriteErrorMessage("Unable to create spill file '%s' for -sm\n", backpointerSpillFileName);
soft_exit(1);
}
}
for (unsigned i = 0; i < nThreads; i++) {
threadContexts[i].whichThread = i;
threadContexts[i].nThreads = nThreads;
threadContexts[i].doneObject = &doneObject;
threadContexts[i].genome = genome;
threadContexts[i].genomeChunkStart = nextChunkToProcess;
if (i == nThreads - 1) {
nextChunkToProcess = countOfBases - seedLen - 1;
} else {
nextChunkToProcess += (countOfBases - seedLen) / nThreads;
}
threadContexts[i].genomeChunkEnd = nextChunkToProcess;
threadContexts[i].nBasesProcessed = &nBasesProcessed;
threadContexts[i].index = index;
threadContexts[i].runningThreadCount = &runningThreadCount;
threadContexts[i].seedLen = seedLen;
threadContexts[i].noBaseAvailable = &noBaseAvailable;
threadContexts[i].nonSeeds = &nonSeeds;
threadContexts[i].seedsWithMultipleOccurrences = &seedsWithMultipleOccurrences;
threadContexts[i].genomeLocationsInOverflowTable = &genomeLocationsInOverflowTable;
threadContexts[i].bothComplementsUsed = &bothComplementsUsed;
threadContexts[i].overflowAnchor = overflowAnchor;
threadContexts[i].nextOverflowBackpointer = &nextOverflowBackpointer;
threadContexts[i].hashTableLocks = hashTableLocks;
threadContexts[i].hashTableKeySize = hashTableKeySize;
threadContexts[i].large = large;
threadContexts[i].locationSize = locationSize;
threadContexts[i].backpointerSpillLock = &backpointerSpillLock;
threadContexts[i].lastBackpointerIndexUsedByThread = lastBackpointerIndexUsedByThread;
threadContexts[i].backpointerSpillFile = backpointerSpillFile;
StartNewThread(BuildHashTablesWorkerThreadMain, &threadContexts[i]);
}
WaitForSingleWaiterObject(&doneObject);
DestroySingleWaiterObject(&doneObject);
DestroyExclusiveLock(&backpointerSpillLock);
delete[] lastBackpointerIndexUsedByThread;
if (locationSize != 8 && seedsWithMultipleOccurrences + genomeLocationsInOverflowTable + (_int64)genome->getCountOfBases() > ((_int64)1 << (8 * locationSize)) - 15) { // Only really need -1 for InvalidGenomeLocation, the rest is just spare
WriteErrorMessage("Ran out of overflow table namespace. This genome cannot be indexed with this seed and location size. Increase at least one.\n");
exit(1);
}
size_t totalUsedHashTableElements = 0;
for (unsigned j = 0; j < index->nHashTables; j++) {
totalUsedHashTableElements += hashTables[j]->GetUsedElementCount();
// printf("HashTable[%d] has %lld used elements, loading %lld%%\n",j,(_int64)hashTables[j]->GetUsedElementCount(),
// (_int64)hashTables[j]->GetUsedElementCount() * 100 / (_int64)hashTables[j]->GetTableSize());
}
WriteStatusMessage("%lld(%lld%%) seeds occur more than once, total of %lld(%lld%%) genome locations are not unique, %lld(%lld%%) bad seeds, %lld both complements used %lld no string\n",
seedsWithMultipleOccurrences,
(seedsWithMultipleOccurrences * 100) / countOfBases,
genomeLocationsInOverflowTable,
genomeLocationsInOverflowTable * 100 / countOfBases,
nonSeeds,
(nonSeeds * 100) / countOfBases,
bothComplementsUsed,
noBaseAvailable);
WriteStatusMessage("Hash table build took %llds\n",(timeInMillis() + 500 - start) / 1000);
//
// We're done with the raw genome. Delete it to save some memory.
//
delete genome;
genome = NULL;
char *halfBuiltHashTableSpillFileName = NULL;
if (smallMemory) {
//
// In the hash table build, we use the backpointer table sequentially, and the hash tables randomly. In the
// overflow table build, it's the opposite. So, we spill out the half-built hash tables (except for #0, which
// we need immediately anyway), and then load back in the backpointer table.
//
_int64 startSpill = timeInMillis();
WriteStatusMessage("Spilling half-built hash tables to disk..");
#define HALF_BUILT_HASH_TABLE_SPILL_FILE_NAME "HalfBuiltHashTables"
halfBuiltHashTableSpillFileName = new char[strlen(directoryName) + 1 + strlen(HALF_BUILT_HASH_TABLE_SPILL_FILE_NAME) + 20]; // +20 is for the number and trailing null
for (unsigned i = 1; i < nHashTables; i++) {
sprintf(halfBuiltHashTableSpillFileName, "%s%c%s.%d", directoryName, PATH_SEP, HALF_BUILT_HASH_TABLE_SPILL_FILE_NAME, i);
size_t bytesWritten;
hashTables[i]->saveToFile(halfBuiltHashTableSpillFileName, &bytesWritten);
delete hashTables[i];
hashTables[i] = NULL;
}
_int64 spillDone = timeInMillis();
WriteStatusMessage("%llds\nReloading backpointer table from disk...", (spillDone - startSpill + 500) / 1000);
overflowAnchor->loadFromFile(backpointerSpillFile);
fclose(backpointerSpillFile);
DeleteSingleFile(backpointerSpillFileName);
delete[] backpointerSpillFileName;
WriteStatusMessage("%llds\n", (timeInMillis() - spillDone + 500) / 1000);
}
WriteStatusMessage("Building overflow table.\n");
start = timeInMillis();
fflush(stdout);
//
// Now build the real overflow table and simultaneously fixup the hash table entries.
// If locationSize == 4, then it's built from 32 bit entries, otherwise from 64.
// Its format is one entry of the number of genome locations matching the
// particular seed, followed by that many genome locations, reverse sorted
// (the reverse part is for historical reasons, but it's necessary for correct functioning).
// For each seed with multiple occurrences in the genome, there is one count.
// For each genome location that's not unique, there is one list entry. So, the size
// of the overflow table is the number of non-unique seeds plus the number of non-unique
// genome locations.
//
index->overflowTableSize = seedsWithMultipleOccurrences + genomeLocationsInOverflowTable;
if (locationSize > 4) {
index->overflowTable64 = (_int64 *)BigAlloc(index->overflowTableSize * sizeof(*index->overflowTable64));
} else {
index->overflowTable32 = (unsigned *)BigAlloc(index->overflowTableSize * sizeof(*index->overflowTable32));
}
if ((_int64)index->overflowTableSize + countOfBases >= GenomeLocationAsInt64(InvalidGenomeLocation) - 15) {
WriteErrorMessage("Not enough address space to index this genome with this seed size. Try a larger seed or location size.\n");
soft_exit(1);
}
_uint64 nBackpointersProcessed = 0;
_int64 lastPrintTime = timeInMillis();
const unsigned maxHistogramEntry = 500000;
_uint64 countOfTooBigForHistogram = 0;
_uint64 sumOfTooBigForHistogram = 0;
_uint64 largestSeed = 0;
unsigned *histogram = NULL;
if (buildHistogram) {
histogram = new unsigned[maxHistogramEntry+1];
for (unsigned i = 0; i <= maxHistogramEntry; i++) {
histogram[i] = 0;
}
}
//
// Build the overflow table by walking each of the hash tables and looking for elements to fix up.
// Write the hash tables as we go so that we can free their memory on the fly.
//
snprintf(filenameBuffer,filenameBufferSize,"%s%c%s", directoryName, PATH_SEP, GenomeIndexHashFileName);
FILE *tablesFile = fopen(filenameBuffer, "wb");
if (NULL == tablesFile) {
WriteErrorMessage("Unable to open hash table file '%s'\n", filenameBuffer);
soft_exit(1);
}
size_t totalBytesWritten = 0;
_uint64 overflowTableIndex = 0;
_uint64 duplicateSeedsProcessed = 0;
for (unsigned whichHashTable = 0; whichHashTable < nHashTables; whichHashTable++) {
if (NULL == hashTables[whichHashTable]) {
_ASSERT(smallMemory);
sprintf(halfBuiltHashTableSpillFileName, "%s%c%s.%d", directoryName, PATH_SEP, HALF_BUILT_HASH_TABLE_SPILL_FILE_NAME, whichHashTable);
GenericFile_stdio *file = GenericFile_stdio::open(halfBuiltHashTableSpillFileName);
if (NULL == file) {
WriteErrorMessage("Unable to open file '%s' to reload spilled hash table.\n", halfBuiltHashTableSpillFileName);
soft_exit(1);
}
hashTables[whichHashTable] = SNAPHashTable::loadFromGenericFile(file);
file->close();
DeleteSingleFile(halfBuiltHashTableSpillFileName);
}
for (_uint64 whichEntry = 0; whichEntry < hashTables[whichHashTable]->GetTableSize(); whichEntry++) {
unsigned *values32 = (unsigned *)hashTables[whichHashTable]->getEntryValues(whichEntry);
char *values64 = (char *)values32; // char * because it's variable sized
for (int i = 0; i < (large ? NUM_DIRECTIONS : 1); i++) {
_int64 value;
if (locationSize > 4) {
value = 0;
memcpy((char *)&value, values64 + (_int64)locationSize * i, locationSize); // assumes little endian
} else {
value = values32[i];
}
if (value >= countOfBases && value != GenomeLocationAsInt64(InvalidGenomeLocation) && value != GenomeLocationAsInt64(InvalidGenomeLocation) - 1) {
//
// This is an overflow pointer. Fix it up. Count the number of occurrences of this
// seed by walking the overflow chain.
//
duplicateSeedsProcessed++;
_uint64 nOccurrences = 0;
_int64 backpointerIndex = value - countOfBases;
while (backpointerIndex != -1) {
nOccurrences++;
OverflowBackpointer *backpointer = overflowAnchor->getBackpointer(backpointerIndex);
_ASSERT(overflowTableIndex + nOccurrences < index->overflowTableSize);
if (locationSize > 4) {
index->overflowTable64[overflowTableIndex + nOccurrences] = GenomeLocationAsInt64(backpointer->genomeLocation);
} else {
index->overflowTable32[overflowTableIndex + nOccurrences] = GenomeLocationAsInt32(backpointer->genomeLocation);
}
backpointerIndex = backpointer->nextIndex;
}
_ASSERT(nOccurrences > 1);
//
// Fill the count in as the first thing in the overflow table
// and patch the value into the hash table.
//
_ASSERT(overflowTableIndex < index->overflowTableSize);
if (locationSize > 4) {
index->overflowTable64[overflowTableIndex] = nOccurrences;
_int64 newValue = overflowTableIndex + countOfBases;
memcpy(values64 + (_int64)locationSize * i, &newValue, locationSize); // Assumes little endian
} else {
index->overflowTable32[overflowTableIndex] = (unsigned)nOccurrences;
values32[i] = (unsigned)(overflowTableIndex + countOfBases);
}
overflowTableIndex += 1 + nOccurrences;
_ASSERT(overflowTableIndex <= index->overflowTableSize);
nBackpointersProcessed += nOccurrences;
//
// Sort the overflow table entries, because the paired-end aligner relies on this. Sort them backwards, because that's
// what it expects. For those who are desparately curious, this is because it was originally built this way by accident
// before there was any concept of doing binary search over a seed's hits. When the binary search was built, it relied
// on this. Then, when the index build was parallelized it was easier just to preserve the old order than to change the
// code in the aligner. So now you know.
//
if (locationSize > 4) {
qsort(&index->overflowTable64[overflowTableIndex -nOccurrences], nOccurrences, sizeof(index->overflowTable64[0]), BackwardsInt64Compare);
} else {
qsort(&index->overflowTable32[overflowTableIndex -nOccurrences], nOccurrences, sizeof(index->overflowTable32[0]), BackwardsUnsignedCompare);
}
if (timeInMillis() - lastPrintTime > 60 * 1000) {
WriteStatusMessage("%lld/%lld duplicate seeds, %lld/%lld backpointers, %d/%d hash tables processed\n",
duplicateSeedsProcessed, seedsWithMultipleOccurrences, nBackpointersProcessed, genomeLocationsInOverflowTable,
whichHashTable, nHashTables);
lastPrintTime = timeInMillis();
}
//
// If we're building a histogram, update it.
//
if (buildHistogram) {
if (nOccurrences > maxHistogramEntry) {
countOfTooBigForHistogram++;
sumOfTooBigForHistogram += nOccurrences;
} else {
histogram[nOccurrences]++;
}
largestSeed = __max(largestSeed, nOccurrences);
}
} // If this entry needs patching
} // forward and RC if large table
} // for each entry in the hash table
//
// We're done with this hash table, free it to releive memory pressure.
//
size_t bytesWrittenThisHashTable;
if (!hashTables[whichHashTable]->saveToFile(tablesFile, &bytesWrittenThisHashTable)) {
WriteErrorMessage("GenomeIndex::saveToDirectory: Failed to save hash table %d\n", whichHashTable);
delete[] filenameBuffer;
return false;
}
totalBytesWritten += bytesWrittenThisHashTable;
delete hashTables[whichHashTable];
hashTables[whichHashTable] = NULL;
} // for each hash table
fclose(tablesFile);
_ASSERT(overflowTableIndex == index->overflowTableSize); // We used exactly what we expected to use.
delete overflowAnchor;
overflowAnchor = NULL;
if (buildHistogram) {
histogram[1] = (unsigned)(totalUsedHashTableElements - seedsWithMultipleOccurrences);
for (unsigned i = 0; i <= maxHistogramEntry; i++) {
if (histogram[i] != 0) {
fprintf(histogramFile,"%d\t%d\n", i, histogram[i]);
}
}
fprintf(histogramFile, "%lld larger than %d with %lld total genome locations, largest seed %lld\n", countOfTooBigForHistogram, maxHistogramEntry, sumOfTooBigForHistogram, largestSeed);
fclose(histogramFile);
delete [] histogram;
}
//
// Now save out the part of the index that's independent of the genome itself.
//
WriteStatusMessage("Overflow table build and hash table save took %llds\nSaving overflow table...", (timeInMillis() + 500 - start)/1000);
start = timeInMillis();
snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, OverflowTableFileName);
FILE* fOverflowTable = fopen(filenameBuffer, "wb");
if (fOverflowTable == NULL) {
WriteErrorMessage("Unable to open overflow table file, '%s', %d\n", filenameBuffer, errno);
delete[] filenameBuffer;
return false;
}
const unsigned writeSize = 32 * 1024 * 1024;
unsigned overflowElementSize = (locationSize > 4) ? sizeof(*index->overflowTable64) : sizeof(*index->overflowTable32);
char *tableToWriteAsChar = (locationSize > 4) ? (char *)index->overflowTable64 : (char *)index->overflowTable32;
for (size_t writeOffset = 0; writeOffset < index->overflowTableSize * overflowElementSize; ) {
unsigned amountToWrite = (unsigned)__min((size_t)writeSize,(size_t)index->overflowTableSize * overflowElementSize - writeOffset);
size_t amountWritten = fwrite(tableToWriteAsChar + writeOffset, 1, amountToWrite, fOverflowTable);
if (amountWritten < amountToWrite) {
WriteErrorMessage("GenomeIndex::saveToDirectory: fwrite failed, %d\n",errno);
fclose(fOverflowTable);
delete[] filenameBuffer;
return false;
}
writeOffset += amountWritten;
}
fclose(fOverflowTable);
fOverflowTable = NULL;
//
// The save format is:
// file 'GenomeIndex' contains in order major version, minor version, nHashTables, overflowTableSize, seedLen, chromosomePaddingSize.
// File 'overflowTable' overflowTableSize bytes of the overflow table.
// Each hash table is saved in file base name 'GenomeIndexHash%d' where %d is the
// table number.
// And the genome itself is already saved in the same directory in its own format.
//
snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeIndexFileName);
FILE *indexFile = fopen(filenameBuffer,"w");
if (indexFile == NULL) {
WriteErrorMessage("Unable to open file '%s' for write.\n", filenameBuffer);
delete[] filenameBuffer;
return false;
}
fprintf(indexFile,"%d %d %d %lld %d %d %d %lld %d %d", GenomeIndexFormatMajorVersion, GenomeIndexFormatMinorVersion, index->nHashTables,
index->overflowTableSize, seedLen, chromosomePaddingSize, hashTableKeySize, totalBytesWritten, large ? 0 : 1, locationSize);
fclose(indexFile);
delete index;
if (biasTable != NULL) {
delete[] biasTable;
}
WriteStatusMessage("%llds\n", (timeInMillis() + 500 - start) / 1000);
delete[] filenameBuffer;
return true;
}
SNAPHashTable** GenomeIndex::allocateHashTables(
unsigned* o_nTables,
GenomeDistance countOfBases,
double slack,
int seedLen,
unsigned hashTableKeySize,
bool large,
unsigned locationSize,
double* biasTable)
{
_ASSERT(NULL != biasTable);
BigAllocUseHugePages = false; // Huge pages just slow down allocation and don't help much for hash table build, so don't use them.
if (slack <= 0) {
WriteErrorMessage("allocateHashTables: must have positive slack for the hash table to work. 0.3 is probably OK, 0.1 is minimal, less will wreak havoc with perf.\n");
soft_exit(1);
}