/
OrcFile.java
1315 lines (1169 loc) · 41.1 KB
/
OrcFile.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.orc;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.orc.impl.HadoopShims;
import org.apache.orc.impl.HadoopShimsFactory;
import org.apache.orc.impl.KeyProvider;
import org.apache.orc.impl.MemoryManagerImpl;
import org.apache.orc.impl.OrcTail;
import org.apache.orc.impl.ReaderImpl;
import org.apache.orc.impl.WriterImpl;
import org.apache.orc.impl.WriterInternal;
import org.apache.orc.impl.writer.WriterImplV2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
/**
* Contains factory methods to read or write ORC files.
*/
public class OrcFile {
private static final Logger LOG = LoggerFactory.getLogger(OrcFile.class);
public static final String MAGIC = "ORC";
/**
 * A version number for the ORC file format, so that non-forward compatible
 * changes can be introduced in the future. To make the numbers easy for
 * users to understand, each value is named after the Hive release that first
 * wrote that version of ORC files.
 *
 * If you add new encodings or other non-forward compatible changes that
 * prevent an old reader from reading the new format, add a new value here
 * reflecting the next Hive release number. Non-forward compatible changes
 * must never be added in patch releases.
 *
 * Never make a change that breaks backwards compatibility, i.e. one that
 * would prevent the new reader from reading ORC files produced by any
 * released version of Hive.
 */
public enum Version {
  V_0_11("0.11", 0, 11),
  V_0_12("0.12", 0, 12),
  /**
   * Testing-only format. It is not compatible with other versions of the
   * software: while the ORC 2.0 format is being iterated on, incompatible
   * format changes are made under this version with no forward or backward
   * compatibility guarantees.
   *
   * This identifier will be removed entirely once 2.0 is released.
   */
  UNSTABLE_PRE_2_0("UNSTABLE-PRE-2.0", 1, 9999),
  /**
   * The generic identifier for all unknown versions.
   */
  FUTURE("future", Integer.MAX_VALUE, Integer.MAX_VALUE);

  /** The version written by default. */
  public static final Version CURRENT = V_0_12;

  private final String name;
  private final int major;
  private final int minor;

  Version(String name, int major, int minor) {
    this.name = name;
    this.major = major;
    this.minor = minor;
  }

  /**
   * Look a version up by its human readable name.
   * @param name the name to search for
   * @return the matching version
   * @throws IllegalArgumentException if no version has that name
   */
  public static Version byName(String name) {
    for (Version candidate : values()) {
      if (candidate.name.equals(name)) {
        return candidate;
      }
    }
    throw new IllegalArgumentException("Unknown ORC version " + name);
  }

  /**
   * Get the human readable name for the version.
   */
  public String getName() {
    return name;
  }

  /**
   * Get the major version number.
   */
  public int getMajor() {
    return major;
  }

  /**
   * Get the minor version number.
   */
  public int getMinor() {
    return minor;
  }
}
/**
 * The known ORC writer implementations, each with a stable numeric id.
 */
public enum WriterImplementation {
  ORC_JAVA(0), // the Java ORC writer
  ORC_CPP(1), // the C++ ORC writer
  PRESTO(2), // the Presto writer
  SCRITCHLEY_GO(3), // the Go writer from https://github.com/scritchley/orc
  TRINO(4), // the Trino writer
  UNKNOWN(Integer.MAX_VALUE);

  private final int id;

  WriterImplementation(int id) {
    this.id = id;
  }

  /** @return the numeric id of this implementation */
  public int getId() {
    return id;
  }

  /**
   * Map a numeric implementation id back to the enum value, with anything
   * unrecognized becoming UNKNOWN.
   * @param id the serialized implementation id
   * @return the matching implementation, or UNKNOWN
   */
  public static WriterImplementation from(int id) {
    WriterImplementation[] all = values();
    // The last slot is UNKNOWN itself, which no real id should map to,
    // hence the length - 1 bound.
    return (id >= 0 && id < all.length - 1) ? all[id] : UNKNOWN;
  }
}
/**
 * Records the version of the writer in terms of which bugs have been fixed.
 * When you fix bugs in the writer (or make substantial changes) that don't
 * change the file format, add a new version here instead of Version.
 *
 * The ids are assigned sequentially from 6 per a WriterImplementation so that
 * readers that predate ORC-202 treat the other writers correctly.
 */
public enum WriterVersion {
  // Java ORC writer
  ORIGINAL(WriterImplementation.ORC_JAVA, 0),
  // fixed stripe/file maximum statistics and string statistics to use utf8
  // for min/max
  HIVE_8732(WriterImplementation.ORC_JAVA, 1),
  HIVE_4243(WriterImplementation.ORC_JAVA, 2), // use real column names from Hive tables
  HIVE_12055(WriterImplementation.ORC_JAVA, 3), // vectorized writer
  HIVE_13083(WriterImplementation.ORC_JAVA, 4), // decimals write present stream correctly
  ORC_101(WriterImplementation.ORC_JAVA, 5), // bloom filters use utf8
  ORC_135(WriterImplementation.ORC_JAVA, 6), // timestamp stats use utc
  ORC_517(WriterImplementation.ORC_JAVA, 7), // decimal64 min/max are fixed
  ORC_203(WriterImplementation.ORC_JAVA, 8), // trim long strings & record they were trimmed
  ORC_14(WriterImplementation.ORC_JAVA, 9), // column encryption added
  // C++ ORC Writer
  ORC_CPP_ORIGINAL(WriterImplementation.ORC_CPP, 6),
  // Presto Writer
  PRESTO_ORIGINAL(WriterImplementation.PRESTO, 6),
  // Scritchley Go Writer
  SCRITCHLEY_GO_ORIGINAL(WriterImplementation.SCRITCHLEY_GO, 6),
  // Trino Writer
  TRINO_ORIGINAL(WriterImplementation.TRINO, 6),
  // Don't use any magic numbers here except for the below:
  FUTURE(WriterImplementation.UNKNOWN, Integer.MAX_VALUE); // a version from a future writer

  private final int id;
  private final WriterImplementation writer;

  WriterVersion(WriterImplementation writer, int id) {
    this.writer = writer;
    this.id = id;
  }

  /** @return the writer implementation this version belongs to */
  public WriterImplementation getWriterImplementation() {
    return writer;
  }

  /** @return the sequential version id within the implementation */
  public int getId() {
    return id;
  }

  // Lookup table indexed first by writer implementation id, then by version
  // id; built once from the declared constants.
  private static final WriterVersion[][] LOOKUP =
      new WriterVersion[WriterImplementation.values().length][];
  static {
    int versionCount = WriterVersion.values().length;
    for (WriterVersion version : WriterVersion.values()) {
      WriterImplementation impl = version.writer;
      if (impl == WriterImplementation.UNKNOWN) {
        continue; // FUTURE has no slot in the table
      }
      int writerId = impl.getId();
      if (LOOKUP[writerId] == null) {
        LOOKUP[writerId] = new WriterVersion[versionCount];
      }
      if (LOOKUP[writerId][version.id] != null) {
        throw new IllegalArgumentException("Duplicate WriterVersion id " + version);
      }
      LOOKUP[writerId][version.id] = version;
    }
  }

  /**
   * Convert the integer from OrcProto.PostScript.writerVersion
   * to the enumeration with unknown versions being mapped to FUTURE.
   * @param writer the writer implementation
   * @param val the serialized writer version
   * @return the corresponding enumeration value
   * @throws IllegalArgumentException for a non-Java writer with a version
   *     below 6 (those ids were never assigned — see ORC-202)
   */
  public static WriterVersion from(WriterImplementation writer, int val) {
    if (writer == WriterImplementation.UNKNOWN) {
      return FUTURE;
    }
    if (writer != WriterImplementation.ORC_JAVA && val < 6) {
      throw new IllegalArgumentException("ORC File with illegal version " +
          val + " for writer " + writer);
    }
    WriterVersion[] perWriter = LOOKUP[writer.getId()];
    if (val < 0 || val >= perWriter.length) {
      return FUTURE;
    }
    WriterVersion found = perWriter[val];
    return found == null ? FUTURE : found;
  }

  /**
   * Does this file include the given fix or come from a different writer?
   * @param fix the required fix
   * @return true if the required fix is present
   */
  public boolean includes(WriterVersion fix) {
    return writer != fix.writer || id >= fix.id;
  }
}
/**
 * The WriterVersion for this version of the software.
 */
public static final WriterVersion CURRENT_WRITER = WriterVersion.ORC_14;

/**
 * Whether encoding choices should favor SPEED or COMPRESSION.
 */
public enum EncodingStrategy {
  SPEED, COMPRESSION
}

/**
 * Whether the compression codec should favor SPEED or COMPRESSION.
 */
public enum CompressionStrategy {
  SPEED, COMPRESSION
}

// Not instantiated; this class only provides static factory methods.
// The protected constructor exists so subclasses can extend the factory.
protected OrcFile() {}
/**
 * Options for reading ORC files; a mutable builder consumed by
 * {@link OrcFile#createReader}.
 */
public static class ReaderOptions {
  private final Configuration conf;
  private FileSystem filesystem;
  // Treat the file as at most this long; defaults to the whole file.
  private long maxLength = Long.MAX_VALUE;
  // A previously-read file tail, so the reader can skip fetching it again.
  private OrcTail orcTail;
  // Overrides the default source of encryption keys when set.
  private KeyProvider keyProvider;
  // TODO: We can generalize FileMetadata interface. Make OrcTail implement FileMetadata interface
  // and remove this class altogether. Both footer caching and llap caching just needs OrcTail.
  // For now keeping this around to avoid complex surgery
  private FileMetadata fileMetadata;
  private boolean useUTCTimestamp;
  private boolean useProlepticGregorian;

  /**
   * Create reader options; the proleptic-Gregorian flag is seeded from the
   * configuration (OrcConf.PROLEPTIC_GREGORIAN).
   */
  public ReaderOptions(Configuration conf) {
    this.conf = conf;
    this.useProlepticGregorian = OrcConf.PROLEPTIC_GREGORIAN.getBoolean(conf);
  }

  /**
   * Provide the filesystem for the path, if the client already has it.
   * @return this
   */
  public ReaderOptions filesystem(FileSystem fs) {
    this.filesystem = fs;
    return this;
  }

  /**
   * Limit how many bytes of the file are considered when reading.
   * @return this
   */
  public ReaderOptions maxLength(long val) {
    maxLength = val;
    return this;
  }

  /**
   * Supply a cached file tail so the reader does not re-read it.
   * @return this
   */
  public ReaderOptions orcTail(OrcTail tail) {
    this.orcTail = tail;
    return this;
  }

  /**
   * Set the KeyProvider to override the default for getting keys.
   * @param provider the source of master key material
   * @return this
   */
  public ReaderOptions setKeyProvider(KeyProvider provider) {
    this.keyProvider = provider;
    return this;
  }

  /**
   * Should the reader convert dates and times to the proleptic Gregorian
   * calendar?
   * @param newValue should it use the proleptic Gregorian calendar?
   * @return this
   */
  public ReaderOptions convertToProlepticGregorian(boolean newValue) {
    this.useProlepticGregorian = newValue;
    return this;
  }

  public Configuration getConfiguration() {
    return conf;
  }

  public FileSystem getFilesystem() {
    return filesystem;
  }

  public long getMaxLength() {
    return maxLength;
  }

  public OrcTail getOrcTail() {
    return orcTail;
  }

  public KeyProvider getKeyProvider() {
    return keyProvider;
  }

  /**
   * @deprecated Use {@link #orcTail(OrcTail)} instead.
   */
  public ReaderOptions fileMetadata(final FileMetadata metadata) {
    fileMetadata = metadata;
    return this;
  }

  public FileMetadata getFileMetadata() {
    return fileMetadata;
  }

  /**
   * Set whether the reader should treat timestamps as UTC.
   * NOTE(review): the flag is only stored here; the exact semantics are
   * applied by the reader implementation — confirm there.
   * @return this
   */
  public ReaderOptions useUTCTimestamp(boolean value) {
    useUTCTimestamp = value;
    return this;
  }

  public boolean getUseUTCTimestamp() {
    return useUTCTimestamp;
  }

  public boolean getConvertToProlepticGregorian() {
    return useProlepticGregorian;
  }
}
/**
 * Create a set of reader options seeded from the given configuration.
 * @param conf the configuration to read default values from
 * @return a new, modifiable ReaderOptions
 */
public static ReaderOptions readerOptions(Configuration conf) {
  return new ReaderOptions(conf);
}
/**
 * Create an ORC file reader for the given path.
 * @param path the file to read
 * @param options the options controlling how the file is read
 * @return a new reader
 * @throws IOException if the reader cannot be created
 */
public static Reader createReader(Path path,
                                  ReaderOptions options) throws IOException {
  return new ReaderImpl(path, options);
}
/**
 * The context handed to a {@link WriterCallback} when it fires.
 */
public interface WriterContext {
  /** @return the writer the callback was registered on */
  Writer getWriter();
}
/**
 * A listener that is notified at key points in a writer's life cycle.
 */
public interface WriterCallback {
  /** Invoked before a stripe is written out. */
  void preStripeWrite(WriterContext context) throws IOException;
  /** Invoked before the file footer is written out. */
  void preFooterWrite(WriterContext context) throws IOException;
}
/**
 * Which generation(s) of bloom filter streams the writer should emit.
 */
public enum BloomFilterVersion {
  // Include both the BLOOM_FILTER and BLOOM_FILTER_UTF8 streams to support
  // both old and new readers.
  ORIGINAL("original"),
  // Only include the BLOOM_FILTER_UTF8 streams that consistently use UTF8.
  // See ORC-101
  UTF8("utf8");

  private final String id;

  BloomFilterVersion(String id) {
    this.id = id;
  }

  @Override
  public String toString() {
    return id;
  }

  /**
   * Parse the textual form (as produced by toString) back into the enum.
   * @param s the string to parse
   * @return the matching version
   * @throws IllegalArgumentException if the string matches no version
   */
  public static BloomFilterVersion fromString(String s) {
    for (BloomFilterVersion candidate : values()) {
      if (candidate.id.equals(s)) {
        return candidate;
      }
    }
    throw new IllegalArgumentException("Unknown BloomFilterVersion " + s);
  }
}
/**
 * Options for creating ORC file writers. A mutable builder consumed by the
 * writer factory; defaults come from the table properties and configuration.
 */
public static class WriterOptions implements Cloneable {
  private final Configuration configuration;
  private FileSystem fileSystemValue = null;
  private TypeDescription schema = null;
  private long stripeSizeValue;
  private long stripeRowCountValue;
  private long blockSizeValue;
  private boolean buildIndex;
  private int rowIndexStrideValue;
  private int bufferSizeValue;
  private boolean enforceBufferSize = false;
  private boolean blockPaddingValue;
  private CompressionKind compressValue;
  private MemoryManager memoryManagerValue;
  private Version versionValue;
  private WriterCallback callback;
  private EncodingStrategy encodingStrategy;
  private CompressionStrategy compressionStrategy;
  private double paddingTolerance;
  private String bloomFilterColumns;
  private double bloomFilterFpp;
  private BloomFilterVersion bloomFilterVersion;
  private PhysicalWriter physicalWriter;
  private WriterVersion writerVersion = CURRENT_WRITER;
  private boolean useUTCTimestamp;
  private boolean overwrite;
  private boolean writeVariableLengthBlocks;
  private HadoopShims shims;
  private String directEncodingColumns;
  private String encryption;
  private String masks;
  private KeyProvider provider;
  private boolean useProlepticGregorian;
  // Per-key (version, algorithm) overrides, keyed by key name; used when
  // merging files that must reuse the original key versions.
  private Map<String, HadoopShims.KeyMetadata> keyOverrides = new HashMap<>();

  /**
   * Seed every option from the table properties and configuration via
   * OrcConf; tableProperties may be null (see writerOptions(Configuration)).
   */
  protected WriterOptions(Properties tableProperties, Configuration conf) {
    configuration = conf;
    // The memory manager is shared across writers in this process.
    memoryManagerValue = getStaticMemoryManager(conf);
    overwrite = OrcConf.OVERWRITE_OUTPUT_FILE.getBoolean(tableProperties, conf);
    stripeSizeValue = OrcConf.STRIPE_SIZE.getLong(tableProperties, conf);
    stripeRowCountValue = OrcConf.STRIPE_ROW_COUNT.getLong(tableProperties, conf);
    blockSizeValue = OrcConf.BLOCK_SIZE.getLong(tableProperties, conf);
    buildIndex = OrcConf.ENABLE_INDEXES.getBoolean(tableProperties, conf);
    rowIndexStrideValue =
        (int) OrcConf.ROW_INDEX_STRIDE.getLong(tableProperties, conf);
    bufferSizeValue = (int) OrcConf.BUFFER_SIZE.getLong(tableProperties,
        conf);
    blockPaddingValue =
        OrcConf.BLOCK_PADDING.getBoolean(tableProperties, conf);
    // NOTE(review): toUpperCase() is locale-sensitive; consider
    // toUpperCase(Locale.ROOT) so config parsing is stable in all locales.
    compressValue =
        CompressionKind.valueOf(OrcConf.COMPRESS.getString(tableProperties,
            conf).toUpperCase());
    enforceBufferSize = OrcConf.ENFORCE_COMPRESSION_BUFFER_SIZE.getBoolean(tableProperties, conf);
    String versionName = OrcConf.WRITE_FORMAT.getString(tableProperties,
        conf);
    versionValue = Version.byName(versionName);
    String enString = OrcConf.ENCODING_STRATEGY.getString(tableProperties,
        conf);
    encodingStrategy = EncodingStrategy.valueOf(enString);
    String compString =
        OrcConf.COMPRESSION_STRATEGY.getString(tableProperties, conf);
    compressionStrategy = CompressionStrategy.valueOf(compString);
    paddingTolerance =
        OrcConf.BLOCK_PADDING_TOLERANCE.getDouble(tableProperties, conf);
    bloomFilterColumns = OrcConf.BLOOM_FILTER_COLUMNS.getString(tableProperties,
        conf);
    bloomFilterFpp = OrcConf.BLOOM_FILTER_FPP.getDouble(tableProperties,
        conf);
    bloomFilterVersion =
        BloomFilterVersion.fromString(
            OrcConf.BLOOM_FILTER_WRITE_VERSION.getString(tableProperties,
                conf));
    shims = HadoopShimsFactory.get();
    writeVariableLengthBlocks =
        OrcConf.WRITE_VARIABLE_LENGTH_BLOCKS.getBoolean(tableProperties,conf);
    directEncodingColumns = OrcConf.DIRECT_ENCODING_COLUMNS.getString(
        tableProperties, conf);
    useProlepticGregorian = OrcConf.PROLEPTIC_GREGORIAN.getBoolean(conf);
  }

  /**
   * @return a SHALLOW clone — mutable members such as keyOverrides are
   *     shared with the original
   */
  @Override
  public WriterOptions clone() {
    try {
      return (WriterOptions) super.clone();
    } catch (CloneNotSupportedException ex) {
      throw new AssertionError("Expected super.clone() to work");
    }
  }

  /**
   * Provide the filesystem for the path, if the client has it available.
   * If it is not provided, it will be found from the path.
   */
  public WriterOptions fileSystem(FileSystem value) {
    fileSystemValue = value;
    return this;
  }

  /**
   * If the output file already exists, should it be overwritten?
   * If it is not provided, write operation will fail if the file already exists.
   */
  public WriterOptions overwrite(boolean value) {
    overwrite = value;
    return this;
  }

  /**
   * Set the stripe size for the file. The writer stores the contents of the
   * stripe in memory until this memory limit is reached and the stripe
   * is flushed to the HDFS file and the next stripe started.
   */
  public WriterOptions stripeSize(long value) {
    stripeSizeValue = value;
    return this;
  }

  /**
   * Set the file system block size for the file. For optimal performance,
   * set the block size to be multiple factors of stripe size.
   */
  public WriterOptions blockSize(long value) {
    blockSizeValue = value;
    return this;
  }

  /**
   * Set the distance between entries in the row index. The minimum value is
   * 1000 to prevent the index from overwhelming the data. If the stride is
   * set to 0, no indexes will be included in the file.
   */
  public WriterOptions rowIndexStride(int value) {
    rowIndexStrideValue = value;
    return this;
  }

  /**
   * The size of the memory buffers used for compressing and storing the
   * stripe in memory. NOTE: ORC writer may choose to use smaller buffer
   * size based on stripe size and number of columns for efficient stripe
   * writing and memory utilization. To enforce writer to use the requested
   * buffer size use enforceBufferSize().
   */
  public WriterOptions bufferSize(int value) {
    bufferSizeValue = value;
    return this;
  }

  /**
   * Enforce writer to use requested buffer size instead of estimating
   * buffer size based on stripe size and number of columns.
   * See bufferSize() method for more info.
   * Default: false
   */
  public WriterOptions enforceBufferSize() {
    enforceBufferSize = true;
    return this;
  }

  /**
   * Sets whether the HDFS blocks are padded to prevent stripes from
   * straddling blocks. Padding improves locality and thus the speed of
   * reading, but costs space.
   */
  public WriterOptions blockPadding(boolean value) {
    blockPaddingValue = value;
    return this;
  }

  /**
   * Sets the encoding strategy that is used to encode the data.
   */
  public WriterOptions encodingStrategy(EncodingStrategy strategy) {
    encodingStrategy = strategy;
    return this;
  }

  /**
   * Sets the tolerance for block padding as a percentage of stripe size.
   */
  public WriterOptions paddingTolerance(double value) {
    paddingTolerance = value;
    return this;
  }

  /**
   * Comma separated values of column names for which bloom filter is to be created.
   */
  public WriterOptions bloomFilterColumns(String columns) {
    bloomFilterColumns = columns;
    return this;
  }

  /**
   * Specify the false positive probability for bloom filter.
   *
   * @param fpp - false positive probability
   * @return this
   */
  public WriterOptions bloomFilterFpp(double fpp) {
    bloomFilterFpp = fpp;
    return this;
  }

  /**
   * Sets the generic compression that is used to compress the data.
   */
  public WriterOptions compress(CompressionKind value) {
    compressValue = value;
    return this;
  }

  /**
   * Set the schema for the file. This is a required parameter.
   *
   * @param schema the schema for the file.
   * @return this
   */
  public WriterOptions setSchema(TypeDescription schema) {
    this.schema = schema;
    return this;
  }

  /**
   * Sets the version of the file that will be written.
   */
  public WriterOptions version(Version value) {
    versionValue = value;
    return this;
  }

  /**
   * Add a listener for when the stripe and file are about to be closed.
   *
   * @param callback the object to be called when the stripe is closed
   * @return this
   */
  public WriterOptions callback(WriterCallback callback) {
    this.callback = callback;
    return this;
  }

  /**
   * Set the version of the bloom filters to write.
   */
  public WriterOptions bloomFilterVersion(BloomFilterVersion version) {
    this.bloomFilterVersion = version;
    return this;
  }

  /**
   * Change the physical writer of the ORC file.
   * <p>
   * SHOULD ONLY BE USED BY LLAP.
   *
   * @param writer the writer to control the layout and persistence
   * @return this
   */
  public WriterOptions physicalWriter(PhysicalWriter writer) {
    this.physicalWriter = writer;
    return this;
  }

  /**
   * A public option to set the memory manager.
   */
  public WriterOptions memory(MemoryManager value) {
    memoryManagerValue = value;
    return this;
  }

  /**
   * Should the ORC file writer use HDFS variable length blocks, if they
   * are available?
   * @param value the new value
   * @return this
   */
  public WriterOptions writeVariableLengthBlocks(boolean value) {
    writeVariableLengthBlocks = value;
    return this;
  }

  /**
   * Set the HadoopShims to use.
   * This is only for testing.
   * @param value the new value
   * @return this
   */
  public WriterOptions setShims(HadoopShims value) {
    this.shims = value;
    return this;
  }

  /**
   * Manually set the writer version.
   * This is an internal API.
   *
   * @param version the version to write
   * @return this
   */
  protected WriterOptions writerVersion(WriterVersion version) {
    if (version == WriterVersion.FUTURE) {
      throw new IllegalArgumentException("Can't write a future version.");
    }
    this.writerVersion = version;
    return this;
  }

  /**
   * Manually set the time zone for the writer to utc.
   * If not defined, system time zone is assumed.
   */
  public WriterOptions useUTCTimestamp(boolean value) {
    useUTCTimestamp = value;
    return this;
  }

  /**
   * Set the comma-separated list of columns that should be direct encoded.
   * @param value the value to set
   * @return this
   */
  public WriterOptions directEncodingColumns(String value) {
    directEncodingColumns = value;
    return this;
  }

  /**
   * Encrypt a set of columns with a key.
   *
   * Format of the string is a key-list.
   * <ul>
   * <li>key-list = key (';' key-list)?</li>
   * <li>key = key-name ':' field-list</li>
   * <li>field-list = field-name ( ',' field-list )?</li>
   * <li>field-name = number | field-part ('.' field-name)?</li>
   * <li>field-part = quoted string | simple name</li>
   * </ul>
   *
   * @param value a key-list of which columns to encrypt
   * @return this
   */
  public WriterOptions encrypt(String value) {
    encryption = value;
    return this;
  }

  /**
   * Set the masks for the unencrypted data.
   *
   * Format of the string is a mask-list.
   * <ul>
   * <li>mask-list = mask (';' mask-list)?</li>
   * <li>mask = mask-name (',' parameter)* ':' field-list</li>
   * <li>field-list = field-name ( ',' field-list )?</li>
   * <li>field-name = number | field-part ('.' field-name)?</li>
   * <li>field-part = quoted string | simple name</li>
   * </ul>
   *
   * @param value a list of the masks and column names
   * @return this
   */
  public WriterOptions masks(String value) {
    masks = value;
    return this;
  }

  /**
   * For users that need to override the current version of a key, this
   * method allows them to define the version and algorithm for a given key.
   *
   * This will mostly be used for ORC file merging where the writer has to
   * use the same version of the key that the original files used.
   *
   * @param keyName the key name
   * @param version the version of the key to use
   * @param algorithm the algorithm for the given key version
   * @return this
   */
  public WriterOptions setKeyVersion(String keyName, int version,
                                     EncryptionAlgorithm algorithm) {
    HadoopShims.KeyMetadata meta = new HadoopShims.KeyMetadata(keyName,
        version, algorithm);
    keyOverrides.put(keyName, meta);
    return this;
  }

  /**
   * Set the key provider for column encryption.
   * @param provider the object that holds the master secrets
   * @return this
   */
  public WriterOptions setKeyProvider(KeyProvider provider) {
    this.provider = provider;
    return this;
  }

  /**
   * Should the writer use the proleptic Gregorian calendar for
   * times and dates.
   * @param newValue true if we should use the proleptic calendar
   * @return this
   */
  public WriterOptions setProlepticGregorian(boolean newValue) {
    this.useProlepticGregorian = newValue;
    return this;
  }

  // ---- accessors, used by the writer implementations ----

  public KeyProvider getKeyProvider() {
    return provider;
  }

  public boolean getBlockPadding() {
    return blockPaddingValue;
  }

  public long getBlockSize() {
    return blockSizeValue;
  }

  public String getBloomFilterColumns() {
    return bloomFilterColumns;
  }

  public boolean getOverwrite() {
    return overwrite;
  }

  public FileSystem getFileSystem() {
    return fileSystemValue;
  }

  public Configuration getConfiguration() {
    return configuration;
  }

  public TypeDescription getSchema() {
    return schema;
  }

  public long getStripeSize() {
    return stripeSizeValue;
  }

  public long getStripeRowCountValue() {
    return stripeRowCountValue;
  }

  public CompressionKind getCompress() {
    return compressValue;
  }

  public WriterCallback getCallback() {
    return callback;
  }

  public Version getVersion() {
    return versionValue;
  }

  public MemoryManager getMemoryManager() {
    return memoryManagerValue;
  }

  public int getBufferSize() {
    return bufferSizeValue;
  }

  public boolean isEnforceBufferSize() {
    return enforceBufferSize;
  }

  public int getRowIndexStride() {
    return rowIndexStrideValue;
  }

  public boolean isBuildIndex() {
    return buildIndex;
  }

  public CompressionStrategy getCompressionStrategy() {
    return compressionStrategy;
  }

  public EncodingStrategy getEncodingStrategy() {
    return encodingStrategy;
  }

  public double getPaddingTolerance() {
    return paddingTolerance;
  }

  public double getBloomFilterFpp() {
    return bloomFilterFpp;
  }

  public BloomFilterVersion getBloomFilterVersion() {
    return bloomFilterVersion;
  }

  public PhysicalWriter getPhysicalWriter() {
    return physicalWriter;
  }

  public WriterVersion getWriterVersion() {
    return writerVersion;
  }

  public boolean getWriteVariableLengthBlocks() {
    return writeVariableLengthBlocks;
  }

  public HadoopShims getHadoopShims() {
    return shims;
  }

  public boolean getUseUTCTimestamp() {
    return useUTCTimestamp;
  }

  public String getDirectEncodingColumns() {
    return directEncodingColumns;
  }

  public String getEncryption() {
    return encryption;
  }

  public String getMasks() {
    return masks;
  }

  // NOTE(review): returns the live, mutable map — callers can modify it.
  public Map<String, HadoopShims.KeyMetadata> getKeyOverrides() {
    return keyOverrides;
  }

  public boolean getProlepticGregorian() {
    return useProlepticGregorian;
  }
}
/**
 * Create a set of writer options based on a configuration, with no
 * table properties.
 * @param conf the configuration to use for values
 * @return A WriterOptions object that can be modified
 */
public static WriterOptions writerOptions(Configuration conf) {
  return new WriterOptions(null, conf);
}
/**
 * Create a set of write options based on a set of table properties and
 * configuration.
 * @param tableProperties the properties of the table
 * @param conf the configuration of the query
 * @return a WriterOptions object that can be modified
 */
public static WriterOptions writerOptions(Properties tableProperties,
                                          Configuration conf) {
  return new WriterOptions(tableProperties, conf);
}
private static MemoryManager memoryManager = null;
private static synchronized MemoryManager getStaticMemoryManager(Configuration conf) {
if (memoryManager == null) {
memoryManager = new MemoryManagerImpl(conf);