-
Notifications
You must be signed in to change notification settings - Fork 286
/
InputStreamMgr.cpp
298 lines (276 loc) · 8.37 KB
/
InputStreamMgr.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
/*
* InputStreamMgr.cpp
*
* Created on: Mar 21, 2013
* Author: nek3d
*/
#include "InputStreamMgr.h"
#include <cstring> //for memset
#include "gzstream.h"
#include "CompressionTools.h"
// Path prefix that init() compares against the beginning of the file name
// (via strncmp); a file name starting with this prefix is flagged as
// stdin-like input (_isStdin = true) even though it is opened as a file.
const char *InputStreamMgr::FIFO_STRING_LITERAL = "/dev/fd";
// Creates a manager for the named input. Nothing is opened here: every owned
// pointer starts out NULL and every state flag starts out false. The actual
// stream is opened, and the scan buffer filled, by init().
//
// filename        - name of the input; init() treats "-" and "stdin" as
//                   standard input.
// buildScanBuffer - not used by this constructor.
InputStreamMgr::InputStreamMgr(const QuickString &filename, bool buildScanBuffer)
:
	_filename(filename),
	_pushBackStreamBuf(NULL),
	_inputFileStream(NULL),
	_infStreamBuf(NULL),
	_oldInputStream(NULL),
	_isStdin(false),
	_isGzipped(false),
	_isBam(false),
	_isBgzipped(false),
	_tmpZipBuf(NULL),
	_bamRuledOut(false),
	_streamFinished(false),
	_numBytesInBuffer(0),
	_bamReader(NULL),
	_bgStream(NULL)
{
	// _finalInputStream is otherwise first assigned in init(), but the
	// destructor tests and deletes it unconditionally. Null it here so that
	// destroying a manager whose init() never ran does not delete an
	// indeterminate pointer. It is assigned in the body rather than the
	// initializer list so the member declaration order does not matter.
	_finalInputStream = NULL;

	// Room for the first four bytes of the input, examined by detectBamOrBgzip().
	_possibleBamCode.resize(4, 0);
}
// Releases every object this manager owns and nulls each pointer afterwards.
// The deletion order of the original code is kept: the BAM reader and the
// BGZF stream (both opened on _finalInputStream) are destroyed before
// _finalInputStream itself.
InputStreamMgr::~InputStreamMgr() {
	if (_pushBackStreamBuf != NULL) {
		delete _pushBackStreamBuf;
		_pushBackStreamBuf = NULL;
	}
	if (_inputFileStream != NULL) {
		delete _inputFileStream;
		_inputFileStream = NULL;
	}
	if (_oldInputStream != NULL) {
		delete _oldInputStream;
		_oldInputStream = NULL;
	}
	if (_infStreamBuf != NULL) {
		delete _infStreamBuf;
		_infStreamBuf = NULL;
	}
	if (_bamReader != NULL) {
		delete _bamReader;
		// Null the pointer that was just deleted. The previous code nulled
		// _bgStream here instead, which left _bamReader dangling and would
		// have skipped the deletion of a live _bgStream just below.
		_bamReader = NULL;
	}
	if (_bgStream != NULL) {
		delete _bgStream;
		_bgStream = NULL;
	}
	if (_finalInputStream != NULL) {
		delete _finalInputStream;
		_finalInputStream = NULL;
	}
	if (_tmpZipBuf != NULL) {
		// Allocated with new[] in readZipChunk(), so it needs delete[].
		delete [] _tmpZipBuf;
		_tmpZipBuf = NULL;
	}
}
// Opens the input named by _filename, wraps it in a PushBackStreamBuf, builds
// _finalInputStream on top of that, and fills the scan buffer.
//
// "-" and "stdin" select standard input. Anything else is opened as a file;
// a name starting with FIFO_STRING_LITERAL is additionally flagged as stdin.
// If the first byte available is 0x1f the input is flagged as gzipped.
//
// Returns true. If the file cannot be opened, an error is printed to cerr and
// the process exits with status 1.
bool InputStreamMgr::init()
{
	const bool useStandardInput = (_filename == "-" || _filename == "stdin");

	if (!useStandardInput) {
		// Named input. Paths under the FIFO prefix are still treated as stdin.
		const size_t fifoPrefixLen = strlen(FIFO_STRING_LITERAL);
		if (strncmp(_filename.c_str(), FIFO_STRING_LITERAL, fifoPrefixLen) == 0) {
			_isStdin = true;
		}

		_inputFileStream = new ifstream(_filename.c_str());
		if (_inputFileStream->fail()) {
			cerr << "Error: Unable to open file " << _filename << ". Exiting." << endl;
			delete _inputFileStream;
			_inputFileStream = NULL;
			exit(1);
		}

		// Peek at the first byte of the file to see whether it is gzipped.
		const unsigned char leadByte = static_cast<unsigned char>(_inputFileStream->peek());
		if (leadByte == 0x1f) {
			_isGzipped = true;
		}
		_pushBackStreamBuf = new PushBackStreamBuf(_inputFileStream->rdbuf());
	} else {
		_isStdin = true;

		// Peek at the first byte of stdin to see whether it is gzipped.
		const unsigned char leadByte = static_cast<unsigned char>(cin.peek());
		if (leadByte == 0x1f) {
			_isGzipped = true;
		}
		_pushBackStreamBuf = new PushBackStreamBuf(cin.rdbuf());
	}

	// All later reads go through a stream layered on the push-back buffer.
	_finalInputStream = new istream(_pushBackStreamBuf);
	populateScanBuffer();
	return true;
}
int InputStreamMgr::read(char *data, size_t dataSize)
{
size_t origRead = 0;
if (!_saveDataStr.empty()) {
//must first copy contents of savedData into requested data read buffer.
if (dataSize >= _saveDataStr.size()) {
//They asked for the same amount of data or more than we saved. Give them all the saved data,
//then decrement the requested data size accordingly.
origRead = _saveDataStr.size();
memcpy(data, _saveDataStr.c_str(), origRead);
data += origRead;
dataSize -= origRead;
_saveDataStr.clear();
} else {
//This part is tricky. They want less data than we saved. Give them what they
//requested, then delete from the front of the saveDataStr by using it's substr method.
memcpy(data, _saveDataStr.c_str(), dataSize);
QuickString newDataStr;
_saveDataStr.substr(newDataStr, dataSize, _saveDataStr.size() - dataSize);
_saveDataStr = newDataStr;
return dataSize;
}
}
if (_streamFinished) {
return origRead;
}
if (_isBgzipped) {
return (int)(origRead + _bgStream->Read(data, dataSize));
}
_finalInputStream->read(data, dataSize);
return origRead + _finalInputStream->gcount();
}
// Fills the scan buffer with the first bytes of the input and, while doing
// so, classifies gzipped input as BAM, bgzip, or plain gzip.
//
// Bytes are pulled one at a time from _pushBackStreamBuf into _scanBuffer.
// For gzipped input each byte is also handed to detectBamOrBgzip(), which may
//  - return true (input is BAM): this function returns immediately, or
//  - rule BAM out, push the consumed bytes back and reset numChars to 0: the
//    next loop iteration then reads one chunk through readZipChunk() and
//    returns.
// For other input, reading stops at EOF or at the first newline seen once at
// least MIN_SCAN_BUFFER_SIZE bytes have been read; the collected bytes are
// then transferred into _saveDataStr.
void InputStreamMgr::populateScanBuffer()
{
_scanBuffer.clear();
_saveDataStr.clear();
int numChars=0;
int currChar = 0;
while (1) {
// Gzipped but known not to be BAM: the stream has been re-wired by
// detectBamOrBgzip(), so read a chunk through read() instead of byte-wise.
if (_isGzipped && _bamRuledOut) {
readZipChunk();
return;
}
currChar = _pushBackStreamBuf->sbumpc();
//Stop when EOF hit.
if (currChar == EOF) {
break;
}
numChars++;
_scanBuffer.push_back(currChar);
if (_isGzipped) {
// detectBamOrBgzip takes numChars by reference and may reset it to 0.
if (!_bamRuledOut && detectBamOrBgzip(numChars, currChar)) {
return;
}
if (numChars == 0) {
continue; //this will only happen when we've just discovered that this
//is definitely not BAM, and want to start over.
}
}
//Stop if we have the minimum number of bytes and newline is hit.
//For gzip, stop at SCAN_BUFFER_SIZE.
if (currChar == '\n' && numChars >= MIN_SCAN_BUFFER_SIZE ){
break;
}
}
_numBytesInBuffer = _scanBuffer.size();
//append it to the savedDataStr.
// NOTE(review): the meaning of the second argument (true) is defined by the
// scan buffer type, which is not visible here; presumably "append" - confirm.
_scanBuffer.toStr(_saveDataStr, true);
}
// Called by populateScanBuffer() for each of the first bytes of gzipped input
// to decide whether the input is BAM, bgzip, or ordinary gzip.
//
// numChars - number of bytes read so far (1-based index of currChar); taken
//            by reference and reset to 0 when the caller has to start over.
// currChar - the byte just read.
//
// Returns true only when the input was successfully opened as BAM. Returns
// false while fewer than four bytes have been seen, and also after BAM has
// been ruled out; in the latter case _bamRuledOut is set, the consumed bytes
// are pushed back, and the stream members are re-wired for bgzip or gzip
// reading.
bool InputStreamMgr::detectBamOrBgzip(int &numChars, int currChar)
{
//Look for the BAM magic string "BAM\1" in the first four characters of the input stream.
//In compressed form, the first char is the gzip signifier, which was already found.
//The next three are the integers 139, 8, and 4.
//(In gzip header terms, RFC 1952: ID2 = 139, CM = 8 (deflate), FLG = 4 (FEXTRA).)
if (numChars < 5) {
// Record the byte; numChars is at least 1 here, so the index is 0..3.
_possibleBamCode[numChars -1] = currChar;
//special: test for BAM
if (numChars == 4 && _possibleBamCode[1] == 139 && _possibleBamCode[2] == 8 && _possibleBamCode[3] == 4) {
//BAM magic string detected.This is either a BAM or bgzip file. To find out which, we have to try and
//open the file as BAM, with a BAM reader, and see if the header and references are both non-empty.
//However, if they are empty, we will have had to save all bytes consumed in the attempt, meaning still
//fill the scanBuffer and push it back onto the pushBackStream as normal.
// Read ahead until BAM_SCAN_BUFFER_SIZE bytes are buffered or EOF is hit.
for (; numChars < BAM_SCAN_BUFFER_SIZE; numChars++) {
currChar = _pushBackStreamBuf->sbumpc();
//Stop when EOF hit.
if (currChar == EOF) {
break;
}
_scanBuffer.push_back(currChar);
}
_pushBackStreamBuf->pushBack(_scanBuffer);
//ok, now all the data read so far is saved in the scan buffer, and pushbackstream is reset.
//now we make a BamReader and try to open the file.
_bamReader = new BamTools::BamReader();
if (!_bamReader->OpenStream(_finalInputStream)) {
//This is NOT a bam file, but it is bgzipped.
_pushBackStreamBuf->clear();
//Push all bytes read so far back onto the push-back stream buffer, then reset
//everything so that we're effectively starting over.
_pushBackStreamBuf->pushBack(_scanBuffer);
_scanBuffer.clear();
numChars = 0;
_isBam = false;
_isBgzipped = true;
_bamRuledOut = true;
_numBytesInBuffer = 0;
delete _bamReader;
_bamReader = NULL;
//Open a BGZF stream on top of finalInputStream; read() will pull from it.
// NOTE(review): the return value of OpenStream is not checked here.
_bgStream = new BamTools::Internal::BgzfStream();
_bgStream->OpenStream(_finalInputStream, BamTools::IBamIODevice::ReadOnly);
return false;
}
//This is a BAM file.
_isBam = true;
_numBytesInBuffer = _scanBuffer.size();
return true;
} else if (numChars == 4) {
//This is a gzipped file, and it is not bgzipped or BAM.
// Push the four consumed bytes back so decompression starts at the header.
_pushBackStreamBuf->clear();
_pushBackStreamBuf->pushBack(_scanBuffer);
_scanBuffer.clear();
numChars = 0;
_isBam = false;
_isBgzipped = false;
_bamRuledOut = true;
_numBytesInBuffer = 0;
// Layer an inflating stream buffer over the current stream. The current
// stream is kept alive in _oldInputStream and _finalInputStream is replaced
// by a new stream that reads through the inflater.
_infStreamBuf = new InflateStreamBuf(_finalInputStream);
if (_oldInputStream != NULL) {
delete _oldInputStream;
}
_oldInputStream = _finalInputStream;
_finalInputStream = new istream(_infStreamBuf);
return false;
}
}
// Fewer than four bytes seen so far, or already past the detection window.
return false;
}
//void InputStreamMgr::decompressBuffer()
//{
// //allocate an array to hold uncompressed data.
// _saveDataStr.clear();
// uInt maxDecompressSize = 20 * _numBytesInBuffer;
// unsigned char *newScanBuffer = new unsigned char[maxDecompressSize];
// memset(newScanBuffer, 0, maxDecompressSize);
//
// unsigned int numDecompressChars = inflateGzippedArray(_scanBuffer, newScanBuffer, maxDecompressSize, _numBytesInBuffer);
//
// // newScanBuffer should now contain uncompressed data.
// //delete old buffer, point it at new buffer.
// _saveDataStr.append((char *)newScanBuffer, numDecompressChars);
//
// delete [] newScanBuffer;
//}
// Reads up to SCAN_BUFFER_SIZE bytes through read() and appends them to
// _saveDataStr, then updates _numBytesInBuffer to the size of _saveDataStr.
// A short read (fewer than SCAN_BUFFER_SIZE bytes) marks the stream as
// finished.
//
// The scratch buffer _tmpZipBuf is allocated on first use and released by the
// destructor.
void InputStreamMgr::readZipChunk()
{
	if (_tmpZipBuf == NULL) {
		_tmpZipBuf = new char[SCAN_BUFFER_SIZE + 1];
	}

	const int numCharsRead = read(_tmpZipBuf, (size_t)SCAN_BUFFER_SIZE);

	// Append exactly the bytes that were read. Appending the buffer as a
	// C string would stop at the first NUL byte in the data, dropping
	// everything after it and leaving _numBytesInBuffer out of step with the
	// bytes actually consumed from the stream. Because the length is explicit,
	// the buffer no longer needs to be zero-filled beforehand.
	if (numCharsRead > 0) {
		_saveDataStr.append(_tmpZipBuf, (size_t)numCharsRead);
	}
	_numBytesInBuffer = _saveDataStr.size();

	if (numCharsRead < SCAN_BUFFER_SIZE) {
		_streamFinished = true;
	}
}
// Discards any saved data and, when possible, rewinds the input by re-opening
// the file.
//
// Re-opening is only done for input that is a regular, uncompressed, non-BAM
// file. Returns true if the file was re-opened, false otherwise (the saved
// data is cleared in both cases).
bool InputStreamMgr::resetStream()
{
	_saveDataStr.clear();

	// BAM, stdin and gzipped inputs cannot be rewound this way.
	const bool cannotReopen = (_isBam || _isStdin || _isGzipped);
	if (cannotReopen) {
		return false;
	}

	// Plain file: replace the current stream with a freshly opened one.
	delete _finalInputStream;
	_finalInputStream = new ifstream(_filename.c_str());
	return true;
}