CsvTokenizer.cs
using System;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
namespace Cursively
{
/// <summary>
/// Tokenizes a byte stream into CSV fields. The processing follows the guidelines set out in
/// RFC 4180 unless and until the stream proves to be in an incompatible format, in which case a
/// set of additional rules kicks in so that every stream can still be processed.
/// <para>
/// The byte stream is tokenized according to the rules of the ASCII encoding. This makes it
/// compatible with any encoding that encodes 0x0A, 0x0D, 0x22, and the delimiter byte (0x2C by
/// default) the same way that ASCII encodes them. UTF-8 and Extended ASCII SBCS are notable
/// examples of acceptable encodings.
/// UTF-16 is a notable example of an unacceptable encoding; trying to use this class to process
/// text encoded in an unacceptable encoding will yield undesirable results without any errors.
/// </para>
/// <para>
/// All bytes that appear in the stream except 0x0A, 0x0D, 0x22, and the delimiter byte are
/// unconditionally treated as data and passed through as-is. It is the consumer's
/// responsibility to handle (or
/// not handle) NUL bytes, invalid UTF-8, leading UTF-8 BOM, or any other quirks that come with
/// the territory of text processing.
/// </para>
/// </summary>
/// <remarks>
/// <para>
/// Each instance of this class expects to process all data from one stream, represented as zero
/// or more calls to <see cref="ProcessNextChunk"/> followed by one call to <see cref="ProcessEndOfStream"/>,
/// before moving on to another stream. An instance may be reused after a stream has been fully
/// processed, but each instance is also <strong>very</strong> lightweight, so it is recommended
/// that callers simply create a new instance for each stream that needs to be processed.
/// </para>
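/// <para>
/// For illustration, a sketch of reusing one instance for two streams, where
/// <c>firstStreamBytes</c>, <c>secondStreamBytes</c>, and <c>visitor</c> are placeholders for
/// the caller's own data and visitor:
/// </para>
/// <para>
/// <example>
/// <code>
/// <![CDATA[
/// // reusing one instance for two streams (a sketch; creating a new instance per stream is
/// // just as good and is the recommended approach).
/// var tokenizer = new CsvTokenizer();
/// tokenizer.ProcessNextChunk(firstStreamBytes, visitor);
/// tokenizer.ProcessEndOfStream(visitor);
/// tokenizer.ProcessNextChunk(secondStreamBytes, visitor);
/// tokenizer.ProcessEndOfStream(visitor);
/// ]]>
/// </code>
/// </example>
/// </para>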
/// <para>
/// RFC 4180 leaves a lot of wiggle room for implementers. The following section explains how
/// this implementation resolves ambiguities in the spec, explains where and why we deviate from
/// it, and offers clarifying notes where the spec appears to have "gotchas", in the order that
/// the relevant items appear in the spec. The resolutions are modeled primarily on how Josh
/// Close's CsvHelper library handles the same situations:
/// </para>
/// <list type="bullet">
/// <item>
/// <description>
/// The spec says that separate lines are delimited by CRLF line breaks. This implementation
/// accepts line breaks of any format (CRLF, LF, CR).
/// </description>
/// </item>
/// <item>
/// <description>
/// The spec says that there may or may not be a line break at the end of the last record in the
/// stream. This implementation does not require a trailing line break, and adding one does no
/// harm either.
/// </description>
/// </item>
/// <item>
/// <description>
/// The spec refers to an optional header line at the beginning. This implementation does not
/// include any special treatment for the first line of fields; if they need to be treated as
/// headers, then the consumer needs to know that and respond accordingly.
/// </description>
/// </item>
/// <item>
/// <description>
/// The spec says each record may contain "one or more fields". This implementation interprets
/// that strictly: any run of consecutive line break characters is treated as a single line
/// break, so blank lines do not produce empty records (see the sketch after this list).
/// </description>
/// </item>
/// <item>
/// <description>
/// Many implementations allow the delimiter character to be configured to something other than
/// a comma. This implementation accepts any single delimiter byte except carriage return, line
/// feed, and the double quote; see <see cref="CsvTokenizer(byte)"/>.
/// </description>
/// </item>
/// <item>
/// <description>
/// Many implementations allow automatically trimming whitespace at the beginning and/or end of
/// each field (sometimes optionally). The spec expressly advises against doing that, and this
/// implementation follows suit. It is our opinion that consumers ought to be more than capable
/// of trimming spaces at the beginning or end as part of their processing if this is desired.
/// </description>
/// </item>
/// <item>
/// <description>
/// The spec says that the last field in a record must not be followed by a comma. This
/// implementation interprets that to mean that a comma followed immediately by a line ending
/// represents an empty field at the end of the record (see the sketch after this list).
/// </description>
/// </item>
/// </list>
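/// <para>
/// For illustration, here is a sketch of the net effect (not the exact sequence of visitor
/// calls, which depends on chunk boundaries) of tokenizing a small document with the default
/// comma delimiter:
/// </para>
/// <para>
/// <example>
/// <code>
/// <![CDATA[
/// // input bytes:       "a,b\r\nc,\n\nd"  (followed by ProcessEndOfStream)
/// // resulting records: ["a", "b"]
/// //                    ["c", ""]         (the trailing comma yields an empty last field)
/// //                    ["d"]             (the blank line yields no record, and no final
/// //                                       line break is required)
/// ]]>
/// </code>
/// </example>
/// </para>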
/// <para>
/// Finally, the spec has a lot to say about double quotes. This implementation follows the
/// rules that it expressly lays out, but there are some "gotchas" that follow from the spec
/// leaving it open-ended how implementations should deal with various streams that include
/// double quotes which do not completely enclose fields, resolved as follows:
/// </para>
/// <para>
/// If a double quote is encountered at the very beginning of a field, then all characters up
/// until the next unescaped double quote or the end of the stream (whichever comes first) are
/// considered to be part of the data for that field (we do translate escaped double quotes for
/// convenience). This includes line ending characters, even though Excel only seems to do that
/// when the field counts match up. If parsing stopped at an unescaped double
/// quote, but there are still more bytes after that double quote before the next delimiter,
/// then all those bytes will be treated verbatim as part of the field's data (double quotes are
/// no longer special at all for the remainder of the field).
/// </para>
/// <para>
/// Double quotes encountered at any other point are included verbatim as part of the field with
/// no special processing.
/// </para>
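/// <para>
/// For illustration, a sketch of the net effect of those double-quote rules with the default
/// comma delimiter (again, not the exact sequence of visitor calls):
/// </para>
/// <para>
/// <example>
/// <code>
/// <![CDATA[
/// // input bytes:      x,"a""b",y"z
/// // resulting fields: x
/// //                   a"b   (the field was fully quoted; the "" escape becomes a single ")
/// //                   y"z   (the quote appears mid-field, so it is kept verbatim and the
/// //                          visitor is told the field was nonstandard)
/// ]]>
/// </code>
/// </example>
/// </para>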
/// <para>
/// <example>
/// <code>
/// <![CDATA[
/// var visitor = new MyVisitorSubclass();
/// var tokenizer = new CsvTokenizer();
/// tokenizer.ProcessNextChunk(File.ReadAllBytes("..."), visitor);
/// tokenizer.ProcessEndOfStream(visitor);
/// ]]>
/// </code>
/// </example>
/// </para>
/// <para>
/// <example>
/// <code>
/// <![CDATA[
/// using (var stream = File.OpenRead("..."))
/// {
///     var visitor = new MyVisitorSubclass();
///     var tokenizer = new CsvTokenizer();
///     var buffer = new byte[81920];
///     int lastRead;
///     while ((lastRead = stream.Read(buffer, 0, buffer.Length)) != 0)
///     {
///         tokenizer.ProcessNextChunk(new ReadOnlySpan<byte>(buffer, 0, lastRead), visitor);
///     }
///
///     tokenizer.ProcessEndOfStream(visitor);
/// }
/// ]]>
/// </code>
/// </example>
/// </para>
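/// <para>
/// <c>MyVisitorSubclass</c> in the examples above is not part of this library. Below is a
/// minimal sketch of what such a visitor might look like, assuming that
/// <see cref="CsvReaderVisitorBase"/> exposes the four visit methods used by this tokenizer as
/// overridable methods with the signatures they are called with here, and assuming a target
/// framework where <c>MemoryStream</c> has a <c>Write(ReadOnlySpan&lt;byte&gt;)</c> overload
/// (.NET Core 2.1+ / .NET Standard 2.1):
/// </para>
/// <para>
/// <example>
/// <code>
/// <![CDATA[
/// // a sketch only: prints each field in brackets and each record on its own line.
/// // requires: using System; using System.IO; using System.Text;
/// public sealed class MyVisitorSubclass : CsvReaderVisitorBase
/// {
///     private readonly MemoryStream _fieldData = new MemoryStream();
///
///     public override void VisitPartialFieldContents(ReadOnlySpan<byte> chunk) =>
///         _fieldData.Write(chunk);
///
///     public override void VisitEndOfField(ReadOnlySpan<byte> chunk)
///     {
///         _fieldData.Write(chunk);
///         Console.Write($"[{Encoding.UTF8.GetString(_fieldData.ToArray())}] ");
///         _fieldData.SetLength(0);
///     }
///
///     public override void VisitEndOfRecord() => Console.WriteLine();
///
///     public override void VisitNonstandardQuotedField() { /* tolerate nonstandard quotes */ }
/// }
/// ]]>
/// </code>
/// </example>
/// </para>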
/// </remarks>
public class CsvTokenizer
{
private const byte CR = (byte)'\r';
private const byte LF = (byte)'\n';
private const byte QUOTE = (byte)'"';
private readonly byte _delimiter;
private ParserFlags _parserFlags;
/// <summary>
/// Initializes a new instance of the <see cref="CsvTokenizer"/> class.
/// </summary>
public CsvTokenizer()
: this((byte)',')
{
}
/// <summary>
/// Initializes a new instance of the <see cref="CsvTokenizer"/> class.
/// </summary>
/// <param name="delimiter">
/// The single byte to expect to see between fields of the same record. This may not be an
/// end-of-line or double-quote character, as those have special meanings.
/// </param>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="delimiter"/> is <code>0x0A</code>, <code>0x0D</code>, or
/// <code>0x22</code>.
/// </exception>
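/// <example>
/// For example, to tokenize tab-separated data instead of comma-separated data:
/// <code>
/// <![CDATA[
/// var tokenizer = new CsvTokenizer((byte)'\t');
/// ]]>
/// </code>
/// </example>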
public CsvTokenizer(byte delimiter)
{
if (!IsValidDelimiter(delimiter))
{
#pragma warning disable CA1303 // Do not pass literals as localized parameters
throw new ArgumentException("Must not be a carriage return, linefeed, or double-quote.", nameof(delimiter));
#pragma warning restore CA1303 // Do not pass literals as localized parameters
}
_delimiter = delimiter;
}
[Flags]
private enum ParserFlags : byte
{
// nothing has been read yet on the current line / in the current field.
None,
// at least one field (or part of one) has been seen on the current line, so a line ending
// with no further data still terminates a (possibly empty) final field.
ReadAnythingOnCurrentLine = 0b00000001,
// the current field was started in an earlier chunk and has not been finished yet.
ReadAnythingInCurrentField = 0b00000010,
// the current field began with a double quote, so we are reading quoted field data.
CurrentFieldStartedWithQuote = 0b00000100,
// the closing quote of a quoted field has been seen, but (nonstandard) data may still follow
// before the next delimiter or line ending.
QuotedFieldDataEnded = 0b00001000,
// the previous chunk ended exactly at a double quote inside quoted field data, so we do not
// yet know whether it escapes another quote or terminates the quoted data.
CutAtPotentiallyTerminalDoubleQuote = 0b00010000,
}
/// <summary>
/// Checks if a particular byte value is legal for <see cref="CsvTokenizer(byte)"/>, i.e.,
/// that it is not <code>0x0A</code>, <code>0x0D</code>, or <code>0x22</code>.
/// </summary>
/// <param name="delimiter">
/// The single byte to expect to see between fields of the same record. This may not be an
/// end-of-line or double-quote character, as those have special meanings.
/// </param>
/// <returns>
/// <see langword="true"/> if the delimiter is legal for <see cref="CsvTokenizer(byte)"/>,
/// <see langword="false"/> otherwise.
/// </returns>
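/// <example>
/// For example, a hypothetical guard around a delimiter byte that comes from configuration:
/// <code>
/// <![CDATA[
/// if (!CsvTokenizer.IsValidDelimiter(configuredDelimiter))
/// {
///     throw new InvalidOperationException("The delimiter may not be CR, LF, or a double quote.");
/// }
///
/// var tokenizer = new CsvTokenizer(configuredDelimiter);
/// ]]>
/// </code>
/// </example>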
public static bool IsValidDelimiter(byte delimiter)
{
switch (delimiter)
{
case CR:
case LF:
case QUOTE:
return false;
default:
return true;
}
}
/// <summary>
/// Accepts the next (or first) chunk of data in the CSV stream, and informs an instance of
/// <see cref="CsvReaderVisitorBase"/> what it contains.
/// </summary>
/// <param name="chunk">
/// A <see cref="ReadOnlySpan{T}"/> containing the next chunk of data.
/// </param>
/// <param name="visitor">
/// The <see cref="CsvReaderVisitorBase"/> to interact with, or <see langword="null"/> if we
/// should simply advance the parser state.
/// </param>
/// <remarks>
/// If <paramref name="chunk"/> is empty, this method will do nothing.
/// </remarks>
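/// <example>
/// A sketch showing that chunk boundaries may fall anywhere, even in the middle of a field or
/// a quoted section; the tokenizer carries its state from one call to the next
/// (<c>MyVisitorSubclass</c> is the hypothetical visitor from the class remarks):
/// <code>
/// <![CDATA[
/// byte[] data = Encoding.UTF8.GetBytes("hello,\"wor\r\nld\"\r\n");
/// var tokenizer = new CsvTokenizer();
/// var visitor = new MyVisitorSubclass();
///
/// // split at an arbitrary point inside the quoted field.
/// tokenizer.ProcessNextChunk(data.AsSpan(0, 9), visitor);
/// tokenizer.ProcessNextChunk(data.AsSpan(9), visitor);
/// tokenizer.ProcessEndOfStream(visitor);
///
/// // net effect: one record with the fields "hello" and "wor\r\nld".
/// ]]>
/// </code>
/// </example>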
[SuppressMessage("Microsoft.Design", "CA1062:ValidateArgumentsOfPublicMethods")] // Microsoft.CodeAnalysis.FxCopAnalyzers 2.9.3 has a false positive. Remove when fixed
public void ProcessNextChunk(ReadOnlySpan<byte> chunk, CsvReaderVisitorBase visitor)
{
// "null object" pattern.
visitor = visitor ?? CsvReaderVisitorBase.Null;
byte delimiter = _delimiter;
// we're going to consume the entire buffer that was handed to us.
while (!chunk.IsEmpty)
{
if ((_parserFlags & ParserFlags.ReadAnythingInCurrentField) != 0)
{
// most of the time, we should be able to fully process each field in the same
// loop iteration that we first start reading it. the most prominent exception
// is when we encounter a quoted field.
PickUpFromLastTime(ref chunk, visitor);
continue;
}
// loop one-by-one, instead of doing an IndexOfAny, greedily assuming that the most
// performance-sensitive applications will tend to have few enough bytes in each
// unquoted field that this manual inlining will benefit those applications **much**
// more than practically any IndexOfAny implementation would.
for (int idx = 0; idx < chunk.Length; idx++)
{
byte c = chunk[idx];
if (c == delimiter)
{
_parserFlags = ParserFlags.ReadAnythingOnCurrentLine;
visitor.VisitEndOfField(chunk.Slice(0, idx));
}
else if (c == CR || c == LF)
{
ProcessEndOfRecord(chunk.Slice(0, idx), visitor);
}
else if (c == QUOTE)
{
if (idx == 0)
{
_parserFlags = ParserFlags.CurrentFieldStartedWithQuote | ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine;
}
else
{
// RFC 4180 forbids quotes that show up anywhere but the beginning of a
// field, so it's up to us to decide what we want to do about this. We
// choose to treat all such quotes as just regular data.
_parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine;
visitor.VisitPartialFieldContents(chunk.Slice(0, idx + 1));
// let the visitor know that this was nonstandard.
visitor.VisitNonstandardQuotedField();
}
}
else
{
continue;
}
chunk = chunk.Slice(idx + 1);
goto nextLoop;
}
_parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine;
visitor.VisitPartialFieldContents(chunk);
break;
nextLoop:;
}
}
/// <summary>
/// Informs this tokenizer that the last chunk of data in the stream has been read, and so
/// we should make any final interactions with the <see cref="CsvReaderVisitorBase"/> and
/// reset our state to prepare for the next stream.
/// </summary>
/// <param name="visitor">
/// The <see cref="CsvReaderVisitorBase"/> to interact with, or <see langword="null"/> if we
/// should simply advance the parser state.
/// </param>
/// <remarks>
/// <para>
/// If <see cref="ProcessNextChunk"/> has never been called (or has not been called since
/// the last time that this method was called), then this method will do nothing.
/// </para>
/// </remarks>
[SuppressMessage("Microsoft.Design", "CA1062:ValidateArgumentsOfPublicMethods")] // Microsoft.CodeAnalysis.FxCopAnalyzers 2.9.3 has a false positive. Remove when fixed
public void ProcessEndOfStream(CsvReaderVisitorBase visitor)
{
// "null object" pattern.
visitor = visitor ?? CsvReaderVisitorBase.Null;
ProcessEndOfRecord(default, visitor);
}
private void PickUpFromLastTime(ref ReadOnlySpan<byte> readBuffer, CsvReaderVisitorBase visitor)
{
if ((_parserFlags & (ParserFlags.CurrentFieldStartedWithQuote | ParserFlags.QuotedFieldDataEnded | ParserFlags.CutAtPotentiallyTerminalDoubleQuote)) == ParserFlags.CurrentFieldStartedWithQuote)
{
int idx = readBuffer.IndexOf(QUOTE);
if (idx < 0)
{
visitor.VisitPartialFieldContents(readBuffer);
readBuffer = default;
return;
}
// the double quote we stopped at was either escaping a literal double quote, or it
// represented the end of a quoted field. we will usually have at least one more
// byte ready for us (except in contrived cases), and so it should almost always pay
// off to try to look ahead by one more byte to see if we can avoid a Partial call.
if (idx == readBuffer.Length - 1)
{
// in fact, it should pay off so well in so many cases that we can probably even
// get away with making the other case really suboptimal, which is what happens
// when we pick up where we left off after setting this flag.
_parserFlags |= ParserFlags.CutAtPotentiallyTerminalDoubleQuote;
visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx));
readBuffer = default;
return;
}
// we have at least one more byte, so let's see what the double quote actually means
byte b = readBuffer[idx + 1];
if (b == QUOTE)
{
// the double quote we stopped at was escaping a literal double quote, so we
// send everything up to and including the escaping quote.
visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx + 1));
}
else if (b == _delimiter)
{
// the double quote was the end of a quoted field, so send the entire data from
// the beginning of this quoted field data chunk up to the double quote that
// terminated it (excluding, of course, the double quote itself).
_parserFlags = ParserFlags.ReadAnythingOnCurrentLine;
visitor.VisitEndOfField(readBuffer.Slice(0, idx));
}
else if (b == CR || b == LF)
{
// same thing as the delimiter case, just the field ended at the end of a line
// instead of the end of a field on the current line.
ProcessEndOfRecord(readBuffer.Slice(0, idx), visitor);
}
else
{
// the double quote was the end of the quoted part of the field data, but then
// it continues on with more data; don't spend too much time optimizing this
// case since it's not RFC 4180, just do the parts we need to do in order to
// behave the way we said we would.
_parserFlags |= ParserFlags.QuotedFieldDataEnded;
visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx));
visitor.VisitPartialFieldContents(readBuffer.Slice(idx + 1, 1));
// let the visitor know that this was nonstandard.
visitor.VisitNonstandardQuotedField();
}
// slice off the data up to the quote and the next byte that we read.
readBuffer = readBuffer.Slice(idx + 2);
}
else if ((_parserFlags & ParserFlags.CutAtPotentiallyTerminalDoubleQuote) != 0)
{
HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref readBuffer, visitor);
}
else
{
// this is expected to be rare: either we were cut between field reads, or we're
// reading nonstandard field data where there's a quote that neither starts nor ends
// the field; by this point, we don't save enough state to remember which case we're
// in, so VisitNonstandardQuotedField **MUST** have been correctly called (or not)
// before entering this section.
for (int idx = 0; idx < readBuffer.Length; idx++)
{
byte b = readBuffer[idx];
if (b == _delimiter)
{
_parserFlags = ParserFlags.ReadAnythingOnCurrentLine;
visitor.VisitEndOfField(readBuffer.Slice(0, idx));
}
else if (b == CR || b == LF)
{
ProcessEndOfRecord(readBuffer.Slice(0, idx), visitor);
}
else
{
continue;
}
readBuffer = readBuffer.Slice(idx + 1);
return;
}
visitor.VisitPartialFieldContents(readBuffer);
readBuffer = default;
}
}
[MethodImpl(MethodImplOptions.NoInlining)]
private void HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref ReadOnlySpan<byte> readBuffer, CsvReaderVisitorBase visitor)
{
// this method is only called in the rare case where the very last character of the last
// read buffer was a stopping double quote while we were reading quoted field data, so
// this method is expected to be called so rarely in performance-sensitive cases that I
// don't think it will ever pay off to bother doing more processing here. so we just do
// the minimum amount that we need to do in order to clear this flag and get back into
// the normal swing of things.
_parserFlags &= ~ParserFlags.CutAtPotentiallyTerminalDoubleQuote;
byte c = readBuffer[0];
if (c == QUOTE)
{
// the previous double quote was actually there to escape this double quote. we
// didn't visit the double-quote last time because we weren't sure. well, we're
// sure now, so go ahead and do it.
visitor.VisitPartialFieldContents(readBuffer.Slice(0, 1));
// we processed the double quote, so main loop should resume at the next byte.
readBuffer = readBuffer.Slice(1);
}
else
{
// the previous double quote did in fact terminate the quoted part of the field
// data, and so all we need to do is set this flag; the main loop will re-process
// this buffer and go about its merry way.
_parserFlags |= ParserFlags.QuotedFieldDataEnded;
if (c != _delimiter && c != CR && c != LF)
{
// let the visitor know that this was nonstandard, since this is our last
// opportunity to do so before our state machine can no longer distinguish between
// the current state and the state for a standard field that spans chunks.
visitor.VisitNonstandardQuotedField();
}
}
}
private void ProcessEndOfRecord(ReadOnlySpan<byte> lastFieldDataChunk, CsvReaderVisitorBase visitor)
{
// even if the last field data chunk is empty, we still need to send it: we might be
// looking at a newline that immediately follows a delimiter, which is defined to mean
// an empty field at the end of a line.
bool notify = !lastFieldDataChunk.IsEmpty || (_parserFlags & ParserFlags.ReadAnythingOnCurrentLine) != 0;
_parserFlags = ParserFlags.None;
if (notify)
{
visitor.VisitEndOfField(lastFieldDataChunk);
visitor.VisitEndOfRecord();
}
}
}
}