-
Notifications
You must be signed in to change notification settings - Fork 66
/
GenomeIndex.h
307 lines (236 loc) · 12.2 KB
/
GenomeIndex.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
/*++
Module Name:
GenomeIndex.h
Abstract:
Headers for the index builder for the SNAP sequencer
Authors:
Bill Bolosky, August, 2011
Environment:
User mode service.
Revision History:
Adapted from Matei Zaharia's Scala implementation.
--*/
#pragma once
#include "HashTable.h"
#include "Seed.h"
#include "Genome.h"
#include "ApproximateCounter.h"
#include "GenericFile_map.h"
class GenomeIndex {
public:
const Genome *getGenome() {return genome;}
//
// This looks up a seed and its reverse complement, and returns the number and list of hits for each.
// It guarantees that if the lookup succeeds that hits[-1] and rcHits[-1] are valid memory with
// arbirtary values. The -32 version is used for indices with 32 bit genome offsets; using the version
// that doesn't match the genome index GenomeLocation size is an error. Check the index type with
// doesGenomeIndexHave64BitLocations();
//
// The 64 bit version requires the called to supply a special location for storing a single forward or
// reverse hit. This is because in the hash table (but not overflow table), these may be stored in
// 5-7 bytes in order to save space. This means that there's no address in the hash table that can
// be pointed to as a return value. When only a single hit is returned, *hits == singleHit, so there's
// no need to check on the caller's side.
//
void lookupSeed(Seed seed, _int64 *nHits, const GenomeLocation **hits, _int64 *nRCHits, const GenomeLocation **rcHits, GenomeLocation *singleHit, GenomeLocation *singleRCHit);
void lookupSeed32(Seed seed, _int64 *nHits, const unsigned **hits, _int64 *nRCHits, const unsigned **rcHits);
bool doesGenomeIndexHave64BitLocations() const {return locationSize > 4;}
//
// Looks up a seed and its reverse complement, restricting the search to a given range of locations,
// and returns the number and list of hits for each.
//
// virtual void lookupSeed(Seed seed, unsigned minLocation, unsigned maxLocation,
// unsigned *nHits, const unsigned **hits, unsigned *nRCHits, const unsigned **rcHits) = 0;
//
// This issues a compiler prefetch for the genome data.
//
inline void prefetchGenomeData(GenomeLocation genomeLocation) const {
genome->prefetchData(genomeLocation);
}
inline int getSeedLength() const { return seedLen; }
void dropIndex(); // This frees the memory for the index, but leaves the genome itself around.
virtual ~GenomeIndex();
//
// run the indexer from command line arguments
//
static void runIndexer(int argc, const char **argv);
static GenomeIndex *loadFromDirectory(char *directoryName, bool map, bool prefetch);
int getMajorVersion();
int getMinorVersion();
protected:
int seedLen;
unsigned hashTableKeySize;
unsigned nHashTables;
const Genome *genome;
int majorVersion;
int minorVersion;
bool largeHashTable;
unsigned locationSize;
//
// The overflow table is indexed by numbers > than the number of bases in the genome.
// The hash table(s) point into the overflow table when they have a seed that's got more
// than one instance in the genome. For locationSize <= 4, the table is made of 32
// bit entries (and pointed to by overflowTable32), otherwise it's 64 bit entries.
//
_uint64 overflowTableSize;
unsigned *overflowTable32;
_int64 *overflowTable64;
GenericFile_map *mappedOverflowTable;
void *tablesBlob; // All of the hash tables in one giant blob
GenericFile_map *mappedTables;
//
// We have to build the overflow table in two stages. While we're walking the genome, we first
// assign tentative overflow table locations, and build up a list of places where each repeat
// occurs. Once we've read the whole thing (and so know the exact number of instances of each
// repeated seed) we build the actual overflow table and go back and update the entries in the
// hash table.
//
// The list of repeats works as a singly linked list, headed by the hash table entry. The entries
// use the index in the overflow table as links, rather than using real pointers, in order to save
// space. So that we can dynamically allocate overflow entries while still using indices to
// find them, they're built in a two level table.
//
struct OverflowBackpointer {
_int64 nextIndex;
GenomeLocation genomeLocation;
};
class OverflowBackpointerAnchor {
public:
OverflowBackpointerAnchor(_int64 maxOverflowEntries_);
~OverflowBackpointerAnchor();
OverflowBackpointer *getBackpointer(_int64 index);
void trimTo(_int64 trimToIndex, FILE *trimFile);
void loadFromFile(FILE *tripFile);
private:
ExclusiveLock lock;
static const unsigned batchSize;
_int64 maxOverflowEntries;
OverflowBackpointer **table;
static OverflowBackpointer spilledTableSlot; // This value is used to indicate that the table slot in question has been spilled
};
//
// Build a genome index and write it to a directory. If you don't already have a saved index
// the only way to get one is to build it into a directory and then load it from the directory.
// NB: This deletes the Genome that's passed into it.
//
static bool BuildIndexToDirectory(const Genome *genome, int seedLen, double slack,
const char *directory,
unsigned maxThreads, unsigned chromosomePaddingSize, bool forceExact,
unsigned hashTableKeySize, bool large, const char *histogramFileName,
unsigned locationSize, bool smallMemory);
//
// Allocate set of hash tables indexed by seeds with bias
//
static SNAPHashTable** allocateHashTables(unsigned* o_nTables, GenomeDistance countOfBases, double slack,
int seedLen, unsigned hashTableKeySize, bool large, unsigned locationSize, double* biasTable = NULL);
static const unsigned GenomeIndexFormatMajorVersion = 7;
static const unsigned GenomeIndexFormatMinorVersion = 1; // Index version 5.0 has Ns in the FASTA stored in lower case in the index so that they won't match Ns in reads. 5.1 does away with that.
static const unsigned largestBiasTable = 32; // Can't be bigger than the biggest seed size, which is set in Seed.h. Bigger than 32 means a new Seed structure.
static const unsigned largestKeySize = 8;
static double *hg19_biasTables[largestKeySize+1][largestBiasTable+1];
static double *hg19_biasTables_large[largestKeySize+1][largestBiasTable+1];
static void ComputeBiasTable(const Genome* genome, int seedSize, double* table, unsigned maxThreads, bool forceExact, unsigned hashTableKeySize, bool large);
struct ComputeBiasTableThreadContext {
SingleWaiterObject *doneObject;
volatile int *runningThreadCount;
GenomeDistance genomeChunkStart;
GenomeDistance genomeChunkEnd;
unsigned nHashTables;
unsigned hashTableKeySize;
std::vector<ApproximateCounter> *approxCounters;
const Genome *genome;
volatile _int64 *nBasesProcessed;
unsigned seedLen;
volatile _int64 *validSeeds;
bool large;
ExclusiveLock *approximateCounterLocks;
};
static void ComputeBiasTableWorkerThreadMain(void *param);
struct OverflowBackpointer;
struct BuildHashTablesThreadContext {
unsigned nThreads;
unsigned whichThread;
SingleWaiterObject *doneObject;
volatile int *runningThreadCount;
GenomeLocation genomeChunkStart;
GenomeLocation genomeChunkEnd;
const Genome *genome;
volatile _int64 *nBasesProcessed;
unsigned seedLen;
volatile _int64 *noBaseAvailable;
volatile _int64 *nonSeeds;
volatile _int64 *seedsWithMultipleOccurrences;
volatile _int64 *bothComplementsUsed;
GenomeIndex *index;
OverflowBackpointerAnchor *overflowAnchor;
volatile _int64 *nextOverflowBackpointer;
volatile _int64 *genomeLocationsInOverflowTable;
unsigned hashTableKeySize;
bool large;
unsigned locationSize;
//
// The "small memory" option causes SNAP to write out the backpointer table as it's
// built in order to save the memory it uses, because it's accessed sequentially as
// the table is built. In order to be able to tell when it's safe to free some of the
// table, each thread occasionally records the largest backpointer index that it's done
// with. It's safe to free table chunks that are less than the least of these. The lock
// is used to keep more than one thread from trying to spill table at a time.
//
_int64 *lastBackpointerIndexUsedByThread;
ExclusiveLock *backpointerSpillLock;
FILE *backpointerSpillFile;
ExclusiveLock *hashTableLocks;
ExclusiveLock *overflowTableLock;
};
struct PerHashTableBatch {
PerHashTableBatch() : nUsed(0) {}
static const unsigned nSeedsPerBatch = 1000;
unsigned nUsed;
struct Entry {
bool usingComplement;
_uint64 lowBases;
GenomeLocation genomeLocation;
};
Entry entries[nSeedsPerBatch];
bool addSeed(GenomeLocation genomeLocation, _uint64 seedLowBases, bool seedUsingComplement) {
_ASSERT(nUsed < nSeedsPerBatch);
entries[nUsed].lowBases = seedLowBases;
entries[nUsed].usingComplement = seedUsingComplement;
entries[nUsed].genomeLocation = genomeLocation;
nUsed++;
return nUsed >= nSeedsPerBatch;
}
void clear()
{
nUsed = 0;
}
};
struct IndexBuildStats {
IndexBuildStats() : noBaseAvailable(0), nonSeeds(0), bothComplementsUsed(0), genomeLocationsInOverflowTable(0),
unrecordedSkippedSeeds(0), seedsWithMultipleOccurrences(0) {}
_int64 noBaseAvailable;
_int64 nonSeeds;
_int64 bothComplementsUsed;
_int64 genomeLocationsInOverflowTable;
_int64 seedsWithMultipleOccurrences;
_uint64 unrecordedSkippedSeeds;
};
static const _int64 printPeriod;
virtual void indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large);
virtual void completeIndexing(PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large);
static void BuildHashTablesWorkerThreadMain(void *param);
void BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context);
static void ApplyHashTableUpdate(BuildHashTablesThreadContext *context, _uint64 whichHashTable, GenomeLocation genomeLocation, _uint64 lowBases, bool usingComplement,
_int64 *bothComplementsUsed, _int64 *genomeLocationsInOverflowTable, _int64 *seedsWithMultipleOccurrences, bool large);
static int BackwardsUnsignedCompare(const void *, const void *);
static int BackwardsInt64Compare(const void *, const void *);
GenomeIndex();
SNAPHashTable **hashTables;
static _int64 AddOverflowBackpointer(
_int64 previousOverflowBackpointer,
BuildHashTablesThreadContext*context,
GenomeLocation genomeLocation);
void fillInLookedUpResults32(const unsigned *subEntry, _int64 *nHits, const unsigned **hits);
void fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation);
};
extern Genome::Contig ContigForInvalidGenomeLocation;