forked from fraugster/parquet-go
/
split.go
117 lines (101 loc) · 2.93 KB
/
split.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
package cmds
import (
"fmt"
"io"
"log"
"os"
"path/filepath"
"strings"
goparquet "github.com/fraugster/parquet-go"
"github.com/fraugster/parquet-go/parquet"
"github.com/spf13/cobra"
)
// Flag value pointers for the split command; bound to the command's
// persistent flags in init() below.
var (
	partSize *string          // target size of each output part file (e.g. "100MB"); approximate, not exact
	targetFolder *string      // destination folder; empty means the source file's folder (per flag help)
	rowGroupSize *string      // uncompressed row group size (e.g. "128MB")
	compressionMethod *string // compression codec name: Snappy, Gzip, or None
)
// init wires the split command's flags and registers it on rootCmd.
// Sizes are human-readable strings ("100MB") parsed later by humanToByte.
func init() {
	partSize = splitFile.PersistentFlags().StringP("file-size", "s", "100MB", "The target size of parquet files, it is not the *exact* size on the output")
	targetFolder = splitFile.PersistentFlags().StringP("target-folder", "t", "", "Target folder to write the files, use the source file folder if it's empty")
	rowGroupSize = splitFile.PersistentFlags().StringP("row-group-size", "r", "128MB", "Uncompressed row group size")
	compressionMethod = splitFile.PersistentFlags().StringP("compression", "c", "Snappy", "Compression method, valid values are Snappy, Gzip, None")
	rootCmd.AddCommand(splitFile)
}
// splitFile splits a single parquet file into multiple smaller parquet
// files named part_<n>.parquet, each capped near the requested file size,
// preserving the source schema and applying the chosen compression codec.
var splitFile = &cobra.Command{
	Use:   "split file-name.parquet",
	Short: "Split the parquet file into multiple parquet files",
	Run: func(cmd *cobra.Command, args []string) {
		if len(args) != 1 {
			_ = cmd.Usage()
			os.Exit(1)
		}
		rgSize, err := humanToByte(*rowGroupSize)
		if err != nil {
			log.Fatalf("Invalid row group size: %q", *rowGroupSize)
		}
		pSize, err := humanToByte(*partSize)
		if err != nil {
			log.Fatalf("Invalid file size: %q", *partSize)
		}
		comp := parquet.CompressionCodec_UNCOMPRESSED
		switch strings.ToUpper(*compressionMethod) {
		case "SNAPPY":
			comp = parquet.CompressionCodec_SNAPPY
		case "GZIP":
			comp = parquet.CompressionCodec_GZIP
		case "NONE":
			comp = parquet.CompressionCodec_UNCOMPRESSED
		default:
			// Fix: report the offending compression value, not the row group size.
			log.Fatalf("Invalid compression codec: %q", *compressionMethod)
		}
		fl, err := os.Open(args[0])
		if err != nil {
			log.Fatalf("Can not open the file: %q", err)
		}
		defer fl.Close()
		reader, err := goparquet.NewFileReader(fl)
		if err != nil {
			log.Fatalf("could not create parquet reader: %q", err)
		}
		// Honor the documented flag default: an empty target folder means
		// "write next to the source file", not the current working directory.
		outDir := *targetFolder
		if outDir == "" {
			outDir = filepath.Dir(args[0])
		}
		opts := []goparquet.FileWriterOption{
			goparquet.WithSchemaDefinition(reader.GetSchemaDefinition()),
			goparquet.WithCompressionCodec(comp),
			goparquet.WithMaxRowGroupSize(rgSize),
		}
		// Write sequentially numbered parts until copyData reports EOF.
		for i := 1; ; i++ {
			path := filepath.Join(outDir, fmt.Sprintf("part_%d.parquet", i))
			done, err := copyData(reader, path, pSize, opts...)
			if err != nil {
				log.Fatalf("Writing part failed: %q", err)
			}
			if done {
				break
			}
		}
	},
}
// copyData streams rows from reader into a new parquet file at path until
// either the reader is exhausted or the written file reaches size bytes.
// It returns true when the reader hit EOF (this was the final part) and
// false when more rows remain for a subsequent part. Any read, write, or
// close error is returned.
func copyData(reader *goparquet.FileReader, path string, size int64, opts ...goparquet.FileWriterOption) (done bool, err error) {
	fl, err := os.Create(path)
	if err != nil {
		return false, err
	}
	// Fix: propagate the file close error instead of discarding it with a
	// bare defer — a failed close can hide a flush error and silently
	// leave a truncated/corrupt part file while reporting success.
	defer func() {
		if cerr := fl.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()
	writer := goparquet.NewFileWriter(fl, opts...)
	for {
		row, rerr := reader.NextRow()
		if rerr == io.EOF {
			// Source exhausted: finalize the parquet footer and signal completion.
			return true, writer.Close()
		}
		if rerr != nil {
			return false, rerr
		}
		if werr := writer.AddData(row); werr != nil {
			return false, werr
		}
		if writer.CurrentFileSize() >= size {
			// Part reached its target size; close it and ask for another part.
			return false, writer.Close()
		}
	}
}