-
Notifications
You must be signed in to change notification settings - Fork 67
/
Read.h
351 lines (285 loc) · 12 KB
/
Read.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
/*++
Module Name:
Read.h
Abstract:
Headers for the Read class for the SNAP sequencer
Authors:
Bill Bolosky, August, 2011
Environment:
User mode service.
Revision History:
Adapted from Matei Zaharia's Scala implementation.
--*/
#pragma once
#include "Compat.h"
#include "Tables.h"
const int MaxReadLength = 20000;
class Read;
enum ReadClippingType {NoClipping, ClipFront, ClipBack, ClipFrontAndBack};
class ReadReader {
public:
virtual ~ReadReader() {}
virtual void readDoneWithBuffer(unsigned *referenceCount) = 0;
virtual bool getNextRead(Read *readToUpdate) = 0;
virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess) = 0;
};
class PairedReadReader {
public:
virtual ~PairedReadReader() {}
virtual bool getNextReadPair(Read *read1, Read *read2) = 0;
virtual ReadReader *getReaderToInitializeRead(int whichHalfOfPair) = 0;
virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess) = 0;
};
class Read {
public:
Read(ReadReader *i_reader = NULL) : reader(i_reader), id(NULL), data(NULL), quality(NULL),
localUnclippedDataBuffer(NULL), localUnclippedQualityBuffer(NULL), localBufferSize(0),
originalUnclippedDataBuffer(NULL), originalUnclippedQualityBuffer(NULL), clippingState(NoClipping)
{
referenceCounts[0] = referenceCounts[1] = NULL;
}
~Read()
{
delete [] localUnclippedDataBuffer;
delete [] localUnclippedQualityBuffer;
}
//
// Initialize the Read. Reads do NOT take ownership of the memory to which they
// point, and it's the caller's responsibility to make sure that it continues to
// exist as long as the Read does. This is so that the caller can read a bunch of
// read data into a buffer, and then carve Reads out of it without doing further
// memory allocations, which would slow down the sequencing.
//
void init(
const char *i_id,
unsigned i_idLength,
const char *i_data,
const char *i_quality,
unsigned i_dataLength,
unsigned **newReferenceCounts)
{
commonInit(i_id,i_idLength,i_data,i_quality,i_dataLength);
if (NULL != referenceCounts[0]) {
(*referenceCounts[0])--;
if (0 == *referenceCounts[0]) {
reader->readDoneWithBuffer(referenceCounts[0]);
}
}
if (NULL != referenceCounts[1]) {
(*referenceCounts[1])--;
if (0 == *referenceCounts[1]) {
reader->readDoneWithBuffer(referenceCounts[1]);
}
}
if (NULL != newReferenceCounts) {
referenceCounts[0] = newReferenceCounts[0];
referenceCounts[1] = newReferenceCounts[1];
}
}
void init(
const char *i_id,
unsigned i_idLength,
const char *i_data,
const char *i_quality,
unsigned i_dataLength)
{
commonInit(i_id,i_idLength,i_data,i_quality,i_dataLength);
referenceCounts[0] = referenceCounts[1] = NULL;
}
// For efficiency, this class holds id, data and quality pointers that are
// *NOT* guaranteed to be to null-terminated strings; use the the length fields
// to figure out how far to read into these strings.
inline const char *getId() const {return id;}
inline unsigned getIdLength() const {return idLength;}
inline const char *getData() const {return data;}
inline const char *getUnclippedData() const {return unclippedData;}
inline const char *getQuality() const {return quality;}
inline const char *getUnclippedQuality() const {return unclippedQuality;}
inline unsigned getDataLength() const {return dataLength;}
inline unsigned getUnclippedLength() const {return unclippedLength;}
inline unsigned getFrontClippedLength() const {return (unsigned)(data - unclippedData);} // number of bases clipped from the front of the read
inline void setUnclippedLength(unsigned length) {unclippedLength = length;}
inline ReadClippingType getClippingState() const {return clippingState;}
void clip(ReadClippingType clipping) {
if (clipping == clippingState) {
//
// Already in the right state.
//
return;
}
//
// Revert to unclipped, then clip to the correct state.
//
dataLength = unclippedLength;
frontClippedLength = 0;
data = unclippedData;
quality = unclippedQuality;
//
// First clip from the back.
//
if (ClipBack == clipping || ClipFrontAndBack == clipping) {
while (dataLength > 0 && quality[dataLength - 1] == '#') {
dataLength--;
}
}
//
// Then clip from the beginning.
//
if (ClipFront == clipping || ClipFrontAndBack == clipping) {
frontClippedLength = 0;
while (frontClippedLength < dataLength && quality[frontClippedLength] == '#') {
frontClippedLength++;
}
}
if (dataLength-frontClippedLength < 50) {
// There are a lot of quality-2 bases in the read so just use all of it (IS THIS A GOOD IDEA? -BB)
dataLength = unclippedLength;
frontClippedLength = 0;
} else {
dataLength -= frontClippedLength;
data += frontClippedLength;
quality += frontClippedLength;
}
clippingState = clipping;
};
unsigned countOfTrailing2sInQuality() const { // 2 here is represented in Phred+33, or ascii '#'
unsigned count = 0;
while (count < dataLength && quality[dataLength - 1 - count] == '#') {
count++;
}
return count;
}
unsigned countOfNs() const {
unsigned count = 0;
for (unsigned i = 0; i < dataLength; i++) {
count += IS_N[data[i]];
}
return count;
}
void computeReverseCompliment(char *outputBuffer) { // Caller guarantees that outputBuffer is at least getDataLength() bytes
for (unsigned i = 0; i < dataLength; i++) {
outputBuffer[i] = COMPLEMENT[data[dataLength - i - 1]];
}
}
void becomeRC()
{
if (NULL != originalUnclippedDataBuffer) {
//
// We've already RCed ourself. Switch back.
//
unclippedData = originalUnclippedDataBuffer;
unclippedQuality = originalUnclippedQualityBuffer;
//
// The clipping reverses as we go to/from RC.
//
frontClippedLength = unclippedLength - dataLength - frontClippedLength;
data = unclippedData + frontClippedLength;
quality = unclippedQuality + frontClippedLength;
originalUnclippedDataBuffer = NULL;
originalUnclippedQualityBuffer = NULL;
return;
}
//
// If we don't have local space to store the RC (and the quality RC) then allocate
// it. This preserves the buffer space (but not content) across calls to init().
//
if (unclippedLength > localBufferSize) {
delete [] localUnclippedDataBuffer;
delete [] localUnclippedQualityBuffer;
localUnclippedDataBuffer = new char[unclippedLength];
localUnclippedQualityBuffer =new char[unclippedLength];
localBufferSize = unclippedLength;
}
//
// We can't just call computeReverseCompliment() because we need to reverse the
// unclipped portion of the buffer.
//
for (unsigned i = 0; i < unclippedLength; i++) {
localUnclippedDataBuffer[i] = COMPLEMENT[unclippedData[unclippedLength - i - 1]];
localUnclippedQualityBuffer[unclippedLength-i-1] = unclippedQuality[i];
}
originalUnclippedDataBuffer = unclippedData;
originalUnclippedQualityBuffer = unclippedQuality;
unclippedData = localUnclippedDataBuffer;
unclippedQuality = localUnclippedQualityBuffer;
//
// The clipping reverses as we go to/from RC.
//
frontClippedLength = unclippedLength - dataLength - frontClippedLength;
data = localUnclippedDataBuffer + frontClippedLength;
quality = localUnclippedQualityBuffer + frontClippedLength;
}
private:
void commonInit(
const char *i_id,
unsigned i_idLength,
const char *i_data,
const char *i_quality,
unsigned i_dataLength)
{
id = i_id;
idLength = i_idLength;
data = unclippedData = i_data;
quality = unclippedQuality = i_quality;
dataLength = i_dataLength;
unclippedLength = dataLength;
frontClippedLength = 0;
originalUnclippedDataBuffer = NULL;
originalUnclippedQualityBuffer = NULL;
clippingState = NoClipping;
}
const char *id;
const char *data;
const char *unclippedData;
const char *unclippedQuality;
const char *quality;
unsigned idLength;
unsigned dataLength;
unsigned unclippedLength;
unsigned frontClippedLength;
ReadClippingType clippingState;
ReadReader *reader;
unsigned *referenceCounts[2];
//
// Data stored in the read that's used if we RC ourself. These are always unclipped.
//
char *localUnclippedDataBuffer;
char *localUnclippedQualityBuffer;
unsigned localBufferSize;
//
// If we've switched to RC, then the original data and quality are pointed to by these.
// This allows us to switch back by just flipping pointers.
//
const char *originalUnclippedDataBuffer;
const char *originalUnclippedQualityBuffer;
};
//
// Reads that copy the memory for their strings. They're less efficient than the base
// Read class, but you can keep them around without holding references to the IO buffers
// and eventually stopping the IO.
//
class ReadWithOwnMemory : public Read {
public:
ReadWithOwnMemory(const Read &baseRead) {
idBuffer = new char[baseRead.getIdLength()+1];
dataBuffer = new char[baseRead.getUnclippedLength()+1];
qualityBuffer = new char[baseRead.getUnclippedLength() + 1];
memcpy(idBuffer,baseRead.getId(),baseRead.getIdLength());
idBuffer[baseRead.getIdLength()] = '\0'; // Even though it doesn't need to be null terminated, it seems like a good idea.
memcpy(dataBuffer,baseRead.getUnclippedData(),baseRead.getUnclippedLength());
dataBuffer[baseRead.getUnclippedLength()] = '\0';
memcpy(qualityBuffer,baseRead.getUnclippedQuality(),baseRead.getUnclippedLength());
qualityBuffer[baseRead.getUnclippedLength()] = '\0';
init(idBuffer,baseRead.getIdLength(),dataBuffer,qualityBuffer,baseRead.getUnclippedLength());
clip(baseRead.getClippingState());
}
~ReadWithOwnMemory() {
delete [] dataBuffer;
delete [] idBuffer;
delete [] qualityBuffer;
}
private:
char *idBuffer;
char *dataBuffer;
char *qualityBuffer;
};