-
Notifications
You must be signed in to change notification settings - Fork 51
/
Copy pathTextFormat.cpp
306 lines (279 loc) · 10.6 KB
/
TextFormat.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
/*------------------------------------
///\ Plywood C++ Framework
\\\/ https://plywood.arc80.com/
------------------------------------*/
#include <ply-runtime/Precomp.h>
#include <ply-runtime/io/text/TextFormat.h>
#include <ply-runtime/io/text/TextConverter.h>
#include <ply-runtime/io/text/NewLineFilter.h>
#include <ply-runtime/io/InStream.h>
#include <ply-runtime/io/OutStream.h>
#include <ply-runtime/io/StdIO.h>
namespace ply {
// Returns a default-constructed TextFormat, with the newline convention switched to CRLF when
// building for a Win32 target (PLY_TARGET_WIN32).
PLY_NO_INLINE TextFormat TextFormat::platformPreference() {
    TextFormat preferred;
#if PLY_TARGET_WIN32
    preferred.newLine = TextFormat::NewLine::CRLF;
#endif
    return preferred;
}
// Counters accumulated by scanTextFile() while decoding a stream with one candidate encoding.
// They feed the heuristics that pick an encoding and a newline convention.
struct TextFileStats {
    u32 numPoints = 0;       // every decoded point, valid or not
    u32 numValidPoints = 0;  // points whose decode status was Valid
    u32 totalPointValue = 0; // This value won't be accurate if byte encoding is detected
    u32 numLines = 0;        // number of '\n' points
    u32 numCRLF = 0;         // number of '\n' points immediately preceded by '\r'
    u32 numControl = 0;      // non-whitespace points < 32, including nulls
    u32 numNull = 0;         // points equal to 0
    u32 numPlainAscii = 0;   // includes whitespace, excludes control characters < 32
    u32 numWhitespace = 0;   // '\n', '\t' and ' '
    u32 numExtended = 0;     // points >= 65536
    float ooNumPoints = 0.f; // cached 1 / numPoints ("one over"); stays 0 if nothing was scanned

    // Number of points that failed to decode cleanly.
    PLY_INLINE u32 numInvalidPoints() const {
        return this->numPoints - this->numValidPoints;
    }

    // CRLF is reported only when at least one CRLF pair was seen and such pairs make up at least
    // half of all line endings; otherwise LF.
    PLY_INLINE TextFormat::NewLine getNewLineType() const {
        PLY_ASSERT(this->numCRLF <= this->numLines);
        const bool mostlyCRLF = (this->numCRLF != 0) && (this->numCRLF * 2 >= this->numLines);
        return mostlyCRLF ? TextFormat::NewLine::CRLF : TextFormat::NewLine::LF;
    }

    // Heuristic "looks like text" score, normalized by the number of points. Whitespace, plain
    // ASCII and extended points raise it; invalid and control points lower it sharply.
    PLY_INLINE float getScore() const {
        const float whitespaceBonus = 2.5f * this->numWhitespace;
        const float invalidPenalty = 100.f * this->numInvalidPoints();
        const float controlPenalty = 50.f * this->numControl;
        const float extendedBonus = 5.f * this->numExtended;
        // Terms are combined left to right in the same order as before.
        return (whitespaceBonus + this->numPlainAscii - invalidPenalty - controlPenalty +
                extendedBonus) *
               this->ooNumPoints;
    }
};
// Maps a TextFormat::Encoding enumerator to the corresponding TextEncoding object.
// An unrecognized enumerator trips PLY_ASSERT and is then handled as Bytes.
const TextEncoding* encodingFromEnum(TextFormat::Encoding enc) {
    using Enc = TextFormat::Encoding;
    if (enc == Enc::UTF8) {
        return TextEncoding::get<UTF8>();
    }
    if (enc == Enc::UTF16_be) {
        return TextEncoding::get<UTF16<true>>();
    }
    if (enc == Enc::UTF16_le) {
        return TextEncoding::get<UTF16<false>>();
    }
    if (enc != Enc::Bytes) {
        // Unknown enumerator: flag it, then fall back to the Bytes encoding.
        PLY_ASSERT(0);
    }
    return TextEncoding::get<Enc_Bytes>();
}
// Decodes points from `ins` using `encoding` until at least `maxBytes` bytes have been consumed
// or the decoder reports a truncated point, adding what it sees to `*stats`.
// The counters in `*stats` are incremented, not reset. `ooNumPoints` is refreshed at the end when
// at least one point has been counted.
// Returns the number of bytes consumed from `ins`.
PLY_NO_INLINE u32 scanTextFile(TextFileStats* stats, InStream* ins, const TextEncoding* encoding,
                               u32 maxBytes) {
    u32 bytesConsumed = 0;
    bool lastPointWasCR = false;

    while (bytesConsumed < maxBytes) {
        ins->tryMakeBytesAvailable(4); // returns < 4 on EOF/error *ONLY*
        DecodeResult result = encoding->decodePoint(ins->viewAvailable());
        if (result.status == DecodeResult::Status::Truncated) {
            break; // EOF/error
        }
        PLY_ASSERT(result.point >= 0 && result.numBytes > 0);

        // Advance past the decoded point and account for it.
        ins->curByte += result.numBytes;
        bytesConsumed += result.numBytes;
        stats->numPoints++;

        if (result.status == DecodeResult::Status::Valid) {
            stats->numValidPoints++;
            stats->totalPointValue += result.point;

            if (result.point >= 65536) {
                stats->numExtended++;
            } else if (result.point >= 127) {
                // Points in [127, 65535] are not classified any further.
            } else if (result.point >= 32) {
                stats->numPlainAscii++;
                if (result.point == ' ') {
                    stats->numWhitespace++;
                }
            } else {
                // Below 32: either one of the three recognized layout characters, or a control
                // character.
                switch (result.point) {
                    case '\n':
                        stats->numPlainAscii++;
                        stats->numLines++;
                        stats->numWhitespace++;
                        if (lastPointWasCR) {
                            stats->numCRLF++;
                        }
                        break;
                    case '\t':
                        stats->numPlainAscii++;
                        stats->numWhitespace++;
                        break;
                    case '\r':
                        stats->numPlainAscii++;
                        break;
                    default:
                        stats->numControl++;
                        if (result.point == 0) {
                            stats->numNull++;
                        }
                        break;
                }
            }
        }

        // Tracked for every point, valid or not, so the next '\n' can be recognized as CRLF.
        lastPointWasCR = (result.point == '\r');
    }

    if (stats->numPoints > 0) {
        stats->ooNumPoints = 1.f / stats->numPoints;
    }
    return bytesConsumed;
}
// Guesses the encoding and newline convention of a stream for which no BOM was found.
//
// The first TextFormat::NumBytesForAutodetect bytes are scanned as UTF-8, UTF-16 LE and UTF-16 BE
// (rewinding `ins` to its starting position after each pass), and the candidate with the best
// TextFileStats score wins. A UTF-8 pass with too many decode errors is treated as raw bytes
// instead. The returned format always has `bom == false`.
//
// On return `ins` has been rewound to where it was on entry (an empty stream is left as is).
PLY_NO_INLINE TextFormat guessFileEncoding(InStream* ins) {
    TextFileStats stats8;
    BlockList::Ref start = ins->getBlockRef();

    // Try UTF8 first:
    u32 numBytesRead =
        scanTextFile(&stats8, ins, TextEncoding::get<UTF8>(), TextFormat::NumBytesForAutodetect);
    if (numBytesRead == 0) {
        // Empty file
        return {TextFormat::Encoding::UTF8, TextFormat::NewLine::LF, false};
    }
    ins->rewind(start);
    if (stats8.numInvalidPoints() == 0 && stats8.numControl == 0) {
        // No UTF-8 encoding errors, and no weird control characters/nulls. Pick UTF-8.
        return {TextFormat::Encoding::UTF8, stats8.getNewLineType(), false};
    }

    // If more than 20% of the high bytes in UTF-8 are encoding errors, reinterpret UTF-8 as just
    // bytes.
    TextFormat::Encoding encoding8 = TextFormat::Encoding::UTF8;
    {
        u32 numHighBytes = numBytesRead - stats8.numPlainAscii - stats8.numControl;
        if (stats8.numInvalidPoints() >= numHighBytes * 0.2f) {
            // Too many UTF-8 errors. Consider it bytes.
            encoding8 = TextFormat::Encoding::Bytes;
            stats8.numPoints = numBytesRead;
            stats8.numValidPoints = numBytesRead;
            // Keep the cached reciprocal in step with the patched point count; otherwise
            // getScore() would still normalize by the UTF-8 point count.
            // numBytesRead is nonzero here (the empty case returned above).
            stats8.ooNumPoints = 1.f / numBytesRead;
        }
    }

    // Examine both UTF16 endianness:
    TextFileStats stats16_le;
    scanTextFile(&stats16_le, ins, TextEncoding::get<UTF16_LE>(),
                 TextFormat::NumBytesForAutodetect);
    ins->rewind(start);
    TextFileStats stats16_be;
    scanTextFile(&stats16_be, ins, TextEncoding::get<UTF16_BE>(),
                 TextFormat::NumBytesForAutodetect);
    ins->rewind(start);

    // Choose the better UTF16 candidate (little-endian wins ties):
    TextFileStats* stats = &stats16_le;
    TextFormat::Encoding encoding = TextFormat::Encoding::UTF16_le;
    if (stats16_be.getScore() > stats16_le.getScore()) {
        stats = &stats16_be;
        encoding = TextFormat::Encoding::UTF16_be;
    }

    // Choose between the UTF16 and 8-bit encoding (8-bit wins ties):
    if (stats8.getScore() >= stats->getScore()) {
        stats = &stats8;
        encoding = encoding8;
    }

    // Return best guess
    return {encoding, stats->getNewLineType(), false};
}
// Determines the text format of `ins`.
//
// The leading bytes are checked for a UTF-8 or UTF-16 byte order mark. When one is found, the
// encoding is taken from it and only the newline convention is measured from the contents.
// Otherwise the decision is delegated to guessFileEncoding(). In both cases the stream is rewound
// to its starting position before any scanning takes place.
PLY_NO_INLINE TextFormat TextFormat::autodetect(InStream* ins) {
    TextFormat detected;
    BlockList::Ref fileStart = ins->getBlockRef();

    // Look at the first two bytes; a third is read only for the UTF-8 BOM candidate.
    const u8 first = ins->readByte();
    const u8 second = ins->readByte();
    const u32 firstTwo = (static_cast<u32>(first) << 8) | second;
    switch (firstTwo) {
        case 0xefbb: {
            const u8 third = ins->readByte();
            if (third == 0xbf) {
                detected.encoding = TextFormat::Encoding::UTF8;
                detected.bom = true;
            }
            break;
        }
        case 0xfeff: {
            detected.encoding = TextFormat::Encoding::UTF16_be;
            detected.bom = true;
            break;
        }
        case 0xfffe: {
            detected.encoding = TextFormat::Encoding::UTF16_le;
            detected.bom = true;
            break;
        }
        default:
            break;
    }
    ins->rewind(fileStart);

    if (!detected.bom) {
        // No BOM: fall back to statistical guessing.
        return guessFileEncoding(ins);
    }

    // Detect LF or CRLF
    BlockList::Ref scanStart = ins->getBlockRef();
    TextFileStats stats;
    scanTextFile(&stats, ins, encodingFromEnum(detected.encoding), NumBytesForAutodetect);
    ins->rewind(scanStart);
    detected.newLine = stats.getNewLineType();
    return detected;
}
//-----------------------------------------------------------------------
// Wraps `ins` in the adapters needed to read text in this format.
//
// If this format declares a BOM, the expected BOM bytes are consumed from `ins`; when they are not
// actually present the stream is rewound so nothing is lost. Any encoding other than UTF-8 gets a
// text converter (built with the UTF-8 encoding and this format's encoding), and the result is
// always passed through the newline filter.
PLY_NO_INLINE Owned<InStream> TextFormat::createImporter(OptionallyOwned<InStream>&& ins) const {
    using Enc = TextFormat::Encoding;

    if (this->bom) {
        BlockList::Ref beforeBom = ins->getBlockRef();
        bool bomConsumed = false;
        switch (this->encoding) {
            case Enc::Bytes: {
                PLY_ASSERT(0); // Bytes format shouldn't have a BOM
                break;
            }
            case Enc::UTF8: {
                char header[3] = {0};
                if (ins->read({header, PLY_STATIC_ARRAY_SIZE(header)})) {
                    bomConsumed = (memcmp(header, "\xef\xbb\xbf", 3) == 0);
                }
                break;
            }
            case Enc::UTF16_be: {
                char header[2] = {0};
                if (ins->read({header, PLY_STATIC_ARRAY_SIZE(header)})) {
                    bomConsumed = (memcmp(header, "\xfe\xff", 2) == 0);
                }
                break;
            }
            case Enc::UTF16_le: {
                char header[2] = {0};
                if (ins->read({header, PLY_STATIC_ARRAY_SIZE(header)})) {
                    bomConsumed = (memcmp(header, "\xff\xfe", 2) == 0);
                }
                break;
            }
        }
        if (!bomConsumed) {
            // Expected a BOM, but didn't actually encounter one
            // FIXME: Some callers may want to know about this
            ins->rewind(beforeBom);
        }
    }

    // Install a text converter unless the data is already UTF-8
    OptionallyOwned<InStream> importer;
    if (this->encoding != Enc::UTF8) {
        importer = Owned<InStream>::create(Owned<InPipe_TextConverter>::create(
            std::move(ins), TextEncoding::get<UTF8>(), encodingFromEnum(this->encoding)));
    } else {
        importer = std::move(ins);
    }

    // Install newline filter (basically just eats \r)
    // FIXME: Some caller might want the LFs to be unchanged.
    return Owned<InStream>::create(createInNewLineFilter(std::move(importer)));
}
// Wraps `outs` in the adapters needed to write text in this format.
//
// If this format declares a BOM, the BOM bytes are written straight to the underlying stream
// first. UTF-16 output then gets a text converter, and a newline filter (CRLF or not, per
// `newLine`) is installed on top in every case.
PLY_NO_INLINE Owned<OutStream> TextFormat::createExporter(OptionallyOwned<OutStream>&& outs) const {
    using Enc = TextFormat::Encoding;
    OptionallyOwned<OutStream> sink = std::move(outs);
    const Enc enc = this->encoding;

    if (enc == Enc::UTF8) {
        if (this->bom) {
            sink->write({"\xef\xbb\xbf", 3});
        }
    } else if (enc == Enc::UTF16_be) {
        if (this->bom) {
            sink->write({"\xfe\xff", 2});
        }
        sink = Owned<OutStream>::create(Owned<OutPipe_TextConverter>::create(
            std::move(sink), TextEncoding::get<UTF16_BE>(), TextEncoding::get<UTF8>()));
    } else if (enc == Enc::UTF16_le) {
        if (this->bom) {
            sink->write({"\xff\xfe", 2});
        }
        sink = Owned<OutStream>::create(Owned<OutPipe_TextConverter>::create(
            std::move(sink), TextEncoding::get<UTF16_LE>(), TextEncoding::get<UTF8>()));
    }
    // Enc::Bytes: nothing is installed. FIXME: Bytes needs to be converted

    // Install newline filter
    const bool writeCRLF = (this->newLine == TextFormat::NewLine::CRLF);
    return Owned<OutStream>::create(createOutNewLineFilter(std::move(sink), writeCRLF));
}
} // namespace ply